In [1]:
import sys
import datetime

sys.path.append("..")

from utils.get_settings import parse

settings = parse("../utils")

settings

{'train': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv',
  'pickle_path': '../temp_result/train_data.pkl',
  'pickle_path_augument': '../temp_result/train_data_a.pkl'},
 'test': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv',
  'pickle_path': '../temp_result/test_data.pkl',
  'pickle_path_augument': '../temp_result/test_data_a.pkl'},
 'valid': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/valid_data.pkl',
  'pickle_path_augument': '../temp_result/valid_data_a.pkl'},
 'stage2': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/stage2_data.pkl'}}

In [2]:
model_name = "BIDAF_1024_argument_neither"

In [3]:
import pandas as pd
import numpy as np
from joblib import dump, load

np.random.seed(seed=0)

import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f8072cd6f10>

In [4]:
train_data = pd.read_pickle(settings["train"]["pickle_path"])
valid_data = pd.read_pickle(settings["valid"]["pickle_path"])

train_data = pd.concat([train_data,valid_data],axis = 0)

test_data = pd.read_pickle(settings["test"]["pickle_path"])

In [5]:
train_data_a = pd.read_pickle(settings["train"]["pickle_path_augument"])
valid_data_a = pd.read_pickle(settings["valid"]["pickle_path_augument"])

train_data_a = pd.concat([train_data_a,valid_data_a],axis = 0)

In [6]:
test_data_a = pd.read_pickle(settings["test"]["pickle_path_augument"])

In [7]:
train_data["hash_value"] = train_data.vector_bert_1024.map(lambda x:x[0,0])
test_data["hash_value"] = test_data.vector_bert_1024.map(lambda x:x[0,0])
train_data_a["hash_value"] = train_data_a.vector_bert_1024.map(lambda x:x[0,0])
test_data_a["hash_value"] = test_data_a.vector_bert_1024.map(lambda x:x[0,0])

In [8]:
columns = ['vector_bert_1024', 'pron_vector_bert_1024_mean','A_idx_bert','B_idx_bert','label','pron_idx_bert','neither_idx_bert','name_idx_bert','hash_value']

In [9]:
X_train = train_data[columns]
y_train = np.array(train_data.label)

X_train_a = train_data_a[columns]
y_train_a = np.array(train_data_a.label)

X_test = test_data[columns]
y_test = np.array(test_data.label)

X_test_a = test_data_a[columns]
y_test_a = np.array(test_data_a.label)

In [10]:
from sklearn.model_selection import KFold 
from bidaf_neither import BIDAF
from sklearn.metrics import log_loss

score = []
pred_bidaf_tr = np.zeros((X_train.shape[0],3))

kf = KFold(n_splits=5) 
for train_index, test_index in kf.split(X_train):
    X_tr, X_ts = X_train.iloc[train_index], X_train.iloc[test_index,:]
    y_tr, y_ts = y_train[train_index], y_train[test_index]
    
    X_tr = pd.concat([X_tr,X_train_a],axis = 0)
    y_tr = np.concatenate([y_tr,y_train_a], axis = 0)
    #print (X_tr.count())
    '''
    Important step for reduce data leaking
    '''
    X_tr["label"] = y_tr
    X_tr = X_tr[X_tr.duplicated(subset='hash_value', keep=False)]
    y_tr = np.array(X_tr.label)
    #X_tr = X_tr.drop(columns = ["label"])
    #print (X_tr.count())
    
    bidaf = BIDAF().fit(X_tr, y_tr)
    pred_ts = bidaf.predict_proba(X_ts)
    s = log_loss(y_ts,pred_ts)
    score.append(s)
    pred_bidaf_tr[test_index] = pred_ts

    

[BIDAF][BIDAF][BIDAF][BIDAF][BIDAF]

In [11]:
score

[0.55758300975478,
 0.619178276282243,
 0.6141326487445152,
 0.5419913023675746,
 0.6409061953073748]

In [12]:
np.array(score).mean()

0.5947582864912976

In [13]:
X_train = pd.concat([X_train,X_train_a],axis = 0)
y_train = np.concatenate([y_train,y_train_a])

In [14]:
bidaf = BIDAF().fit(X_train, y_train)

pred_bidaf = bidaf.predict_proba(X_test)

dump((pred_bidaf_tr,pred_bidaf), "../ensemble/"+ model_name +'+basic.joblib') 

[BIDAF]

['../ensemble/BIDAF_1024_argument_neither+basic.joblib']

In [15]:
log_loss(y_test,pred_bidaf)

0.5645404996477744

In [16]:
sub_df = pd.read_csv("../test_and_submit/sample_submission_stage_1.csv")
sub_df.loc[:, ['A','B','NEITHER']] = pred_bidaf
sub_df.to_csv("../test_and_submit/submission+model+"+model_name+"@"+str(datetime.datetime.now())+".csv", index=False)
sub_df.head()

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.453089,0.339345,0.207565
1,development-2,0.996608,0.001061,0.002331
2,development-3,0.006806,0.985793,0.007401
3,development-4,0.253139,0.284167,0.462694
4,development-5,0.011111,0.980966,0.007923


In [17]:
X_train = pd.concat([X_train,X_test,X_test_a],axis = 0)
y_train = np.concatenate([y_train,y_test,y_test_a],axis = 0)

bidaf = BIDAF().fit(X_train, y_train)

[BIDAF]

In [18]:
dump(bidaf, model_name +'.joblib') 

['BIDAF_1024_argument_neither.joblib']

--------------------------------------------------------------------------------------

In [1]:
import sys
import datetime

sys.path.append("..")

from utils.get_settings import parse

settings = parse("../utils")

settings

{'train': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv',
  'pickle_path': '../temp_result/train_data.pkl',
  'pickle_path_augument': '../temp_result/train_data_a.pkl'},
 'test': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv',
  'pickle_path': '../temp_result/test_data.pkl',
  'pickle_path_augument': '../temp_result/test_data_a.pkl'},
 'valid': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/valid_data.pkl',
  'pickle_path_augument': '../temp_result/valid_data_a.pkl'},
 'stage2': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/stage2_data.pkl'}}

In [2]:
model_name = "BIDAF_1024_argument_neither"
import joblib
import pandas as pd
import numpy as np

bidaf = joblib.load(model_name +'.joblib')

stage2_data = pd.read_pickle(settings["stage2"]["pickle_path"])

columns = ['vector_bert_1024', 'pron_vector_bert_1024_mean','A_idx_bert',
           'B_idx_bert','label','pron_idx_bert','neither_idx_bert','name_idx_bert']
columns = [col for col in columns if col!='label']

X_test = stage2_data[columns]
X_test["label"] = 0

pred_bidaf= bidaf.predict_proba(X_test)

joblib.dump(pred_bidaf, "../ensemble/"+ model_name +'+stage2.joblib')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


['../ensemble/BIDAF_1024_argument_neither+stage2.joblib']

## Viewed by Xingce BAO @4.11