In [1]:
import sys
import datetime

# Add the parent of the working directory to the import path so the local
# `utils` package can be imported. This is relative to the kernel's current
# directory, so the notebook must be started from its own folder.
sys.path.append("..")

from utils.get_settings import parse

# Load the project configuration. Per the displayed value it is a dict keyed
# by split ("train", "test", "valid"), each holding "file_path", "pickle_path"
# and "pickle_path_augument".
# NOTE(review): the "train" split points at gap-test.tsv and the "test" split
# at gap-development.tsv. Presumably intentional (the submission IDs are
# "development-*"), but confirm.
settings = parse("../utils")

# Display the parsed configuration.
settings

{'train': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv',
  'pickle_path': '../temp_result/train_data.pkl',
  'pickle_path_augument': '../temp_result/train_data_a.pkl'},
 'test': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv',
  'pickle_path': '../temp_result/test_data.pkl',
  'pickle_path_augument': '../temp_result/test_data_a.pkl'},
 'valid': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/valid_data.pkl',
  'pickle_path_augument': '../temp_result/valid_data_a.pkl'}}

In [2]:
# Identifier used in the submission file name and in the saved model's file name.
model_name = "ensemble_baseline"

In [3]:
import pandas as pd
import numpy as np

# Fix NumPy's legacy global RNG so any np.random draws are repeatable.
np.random.seed(0)

In [4]:
# Read the pickled frames for each split from the paths given in `settings`.
# NOTE: read_pickle unpickles arbitrary objects; only load pickles produced by
# this project.
train_part = pd.read_pickle(settings["train"]["pickle_path"])
valid_data = pd.read_pickle(settings["valid"]["pickle_path"])
test_data = pd.read_pickle(settings["test"]["pickle_path"])

# Fit on train + validation together: the validation rows are stacked under the
# training rows. No ignore_index is passed, so the original index labels are kept.
train_data = pd.concat([train_part, valid_data], axis=0)

In [5]:
# Columns of the data frames that are used as the base (non-stacked) features.
columns = [
    'A_dist_bert',
    'B_dist_bert',
    'A_pos_bert',
    'B_pos_bert',
    'pron_pos_bert',
    'topic_A',
    'topic_B',
]

In [6]:
def _stack_feature_columns(frame, names):
    """Build a 2-D feature matrix from the listed columns of `frame`.

    Each column is converted to an array, reshaped to one row per sample
    (whatever is left is flattened into that row), and the per-column pieces
    are joined side by side in the order given by `names`.
    """
    n_rows = frame.shape[0]
    pieces = []
    for name in names:
        pieces.append(np.array(list(frame[name])).reshape(n_rows, -1))
    return np.concatenate(pieces, axis=1)


# Base features and labels for the training (train + validation) rows.
X_train = _stack_feature_columns(train_data, columns)
y_train = np.array(train_data.label)

# Same construction for the held-out test rows.
X_test = _stack_feature_columns(test_data, columns)
y_test = np.array(test_data.label)

In [7]:
from joblib import dump, load

# First-level models whose saved outputs are appended as extra feature columns.
model_name_list = ["LR_256_baseline","SVM_256_baseline","MLP_1024_baseline","BIDAF_1024_baseline"
                   ,"BIDAF_1024_neither","BIDAF_1024_argument_neither","BIDAF_1024_argument_baseline"]

# Each "<model>+basic.joblib" file holds a (train_block, test_block) pair.
# NOTE(security): joblib.load is pickle-based and can execute arbitrary code;
# only load artifacts produced by this project.
# The blocks are collected first and concatenated once at the end, so a failed
# load cannot leave X_train widened while X_test is not, and the growing
# matrices are not re-copied on every iteration.
train_blocks, test_blocks = [], []
for model in model_name_list:
    print (model)
    tr,ts = load(model +'+basic.joblib')
    train_blocks.append(tr)
    test_blocks.append(ts)

# NOTE: this still rebinds X_train / X_test, so re-running the cell without
# re-running the feature-building cell first appends the columns again.
X_train = np.concatenate([X_train, *train_blocks],axis = 1)
X_test = np.concatenate([X_test, *test_blocks],axis = 1)

LR_256_baseline
SVM_256_baseline
MLP_1024_baseline
BIDAF_1024_baseline
BIDAF_1024_neither
BIDAF_1024_argument_neither
BIDAF_1024_argument_baseline


In [8]:
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# 5-fold cross-validation of the stacking classifier on the training matrix.
# KFold is used without shuffling, so folds are contiguous row ranges.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` — confirm
# against the installed version.
score = []
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X_train):
    lr = LogisticRegression(
        penalty="l2",
        solver="saga",
        random_state=0,
        multi_class='multinomial',
        C=0.375,
    )
    lr.fit(X_train[train_index], y_train[train_index])

    # Score the held-out fold by log loss on the predicted class probabilities.
    fold_proba = lr.predict_proba(X_train[test_index])
    score.append(log_loss(y_train[test_index], fold_proba))
    

    



In [9]:
# Per-fold log-loss values collected by the cross-validation loop.
score

[0.43135811455589085,
 0.45222982761443553,
 0.4724724994286566,
 0.4346791249213905,
 0.446285960873168]

In [10]:
# Mean log loss across the cross-validation folds.
np.array(score).mean()

0.4474051054787084

In [11]:
# Refit the same configuration on all training rows and predict class
# probabilities for the test rows.
lr = LogisticRegression(
    penalty="l2",
    solver="saga",
    random_state=0,
    multi_class='multinomial',
    C=0.375,
)
lr.fit(X_train, y_train)
pred_lr = lr.predict_proba(X_test)



In [12]:
# Log loss of the model's predicted probabilities against the test labels.
log_loss(y_test,pred_lr)

0.4379176191565179

In [13]:
# Fill the sample submission with the predicted probabilities and save it.
sub_df = pd.read_csv("../test_and_submit/sample_submission_stage_1.csv")

# predict_proba columns follow lr.classes_ order.
# NOTE(review): assumes that order corresponds to A, B, NEITHER — confirm
# against the label encoding.
sub_df.loc[:, ['A','B','NEITHER']] = pred_lr

# Use an explicit timestamp pattern: str(datetime.now()) contains a space and
# colons, and colons are not valid in Windows file names. Microseconds are kept
# so consecutive runs still get distinct files.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
sub_df.to_csv("../test_and_submit/submission+model+"+model_name+"@"+timestamp+".csv", index=False)
sub_df.head()

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.594229,0.307638,0.098133
1,development-2,0.97825,0.018788,0.002961
2,development-3,0.008513,0.984113,0.007374
3,development-4,0.070598,0.670438,0.258964
4,development-5,0.005892,0.990081,0.004027


In [14]:
# Final fit for the model that gets saved: train on every labelled row
# (training rows followed by test rows). New names are used instead of
# overwriting X_train / y_train, so re-running this cell does not append the
# test rows a second time.
X_all = np.concatenate([X_train,X_test],axis = 0)
y_all = np.concatenate([y_train,y_test],axis = 0)

# NOTE(review): this model uses an L1 penalty with C=0.038, while the model
# scored above uses L2 with C=0.375. None of the reported scores describe this
# configuration — confirm the choice is intentional.
lr = LogisticRegression(penalty = "l1",solver = "saga",random_state=0,multi_class='multinomial', C= 0.038).fit(X_all, y_all)



In [15]:
from joblib import dump, load
# Save the final fitted classifier as "<model_name>.joblib" in the working directory.
dump(lr, f"{model_name}.joblib")

['ensemble_baseline.joblib']