In [1]:
# Standard-library imports used by this notebook.
import sys
import datetime

# Extend the import search path with the parent directory. This has to run
# before the project import below (presumably so the `utils` package can be
# found when the notebook runs from a sub-directory — TODO confirm layout).
sys.path.append("..")

from utils.get_settings import parse

# Load the project settings via the project-local `parse` helper, which is
# given the path "../utils". The displayed result is a dict keyed by split
# name with 'file_path' / 'pickle_path' / 'pickle_path_augument' entries.
settings = parse("../utils")

# Bare expression: the notebook renders the parsed settings as the cell output.
settings

{'train': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv',
  'pickle_path': '../temp_result/train_data.pkl',
  'pickle_path_augument': '../temp_result/train_data_a.pkl'},
 'test': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv',
  'pickle_path': '../temp_result/test_data.pkl',
  'pickle_path_augument': '../temp_result/test_data_a.pkl'},
 'valid': {'file_path': 'https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv',
  'pickle_path': '../temp_result/valid_data.pkl',
  'pickle_path_augument': '../temp_result/valid_data_a.pkl'}}

In [2]:
# Identifier for this model run; later cells splice it into the names of the
# files they write (ensemble predictions, submission CSV, saved model).
model_name = "LR_256_baseline"

In [3]:
import pandas as pd
import numpy as np
from joblib import dump, load

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(0)

In [4]:
# Read the pickled frame of every split from the path recorded in `settings`.
# NOTE: read_pickle unpickles arbitrary objects; only load files you produced.
_frames = {
    split: pd.read_pickle(settings[split]["pickle_path"])
    for split in ("train", "valid", "test")
}

valid_data = _frames["valid"]
test_data = _frames["test"]

# The validation rows are appended below the training rows, so both are used
# for fitting; each frame keeps its own row index.
train_data = pd.concat([_frames["train"], valid_data], axis=0)

In [5]:
# Names of the data-frame columns used as model inputs. The order of this list
# is the order in which the per-column arrays are joined into a feature matrix.
columns = [
    "A_dist_bert",
    "B_dist_bert",
    "A_pos_bert",
    "B_pos_bert",
    "pron_pos_bert",
    "topic_A",
    "topic_B",
    "A_vector_bert_256_mean",
    "B_vector_bert_256_mean",
    "pron_vector_bert_256_mean",
    "product_vector_A_bert_256",
    "product_vector_B_bert_256",
]

In [6]:
def _stack_features(frame, feature_cols):
    """Build a 2-D feature matrix with one row per row of `frame`.

    Every column named in `feature_cols` is turned into an array, reshaped to
    (number of rows, -1), and the pieces are joined side by side in list order.
    """
    n_rows = frame.shape[0]
    pieces = []
    for name in feature_cols:
        values = np.array(list(frame[name]))
        pieces.append(values.reshape(n_rows, -1))
    return np.concatenate(pieces, axis=1)


# Training matrix and labels (train_data already includes the validation rows).
X_train = _stack_features(train_data, columns)
y_train = np.array(train_data["label"])

# Test matrix and labels, built the same way.
X_test = _stack_features(test_data, columns)
y_test = np.array(test_data["label"])

In [7]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# 5-fold cross-validation of the logistic-regression baseline.
#   score      -- log loss of each held-out fold, in fold order
#   pred_lr_tr -- out-of-fold class probabilities for every training row
score = []

# One probability column per class found in the training labels, instead of a
# hard-coded width.
n_classes = np.unique(y_train).size
pred_lr_tr = np.zeros((X_train.shape[0], n_classes))

# KFold is used with its defaults apart from the fold count (no shuffle argument).
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X_train):
    X_tr, X_ts = X_train[train_index], X_train[test_index]
    y_tr, y_ts = y_train[train_index], y_train[test_index]

    lr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', C=0.025).fit(X_tr, y_tr)
    pred_ts = lr.predict_proba(X_ts)

    # Give log_loss the fitted class list so the probability columns are
    # matched to the right labels even if this fold's held-out rows happen
    # to miss one of the classes.
    s = log_loss(y_ts, pred_ts, labels=lr.classes_)
    score.append(s)

    # Each row is held out exactly once, so this fills the whole array.
    pred_lr_tr[test_index] = pred_ts
    

    



In [8]:
# Display the log-loss value recorded for each cross-validation fold.
score

[0.5030328349398948,
 0.5586405146598432,
 0.5519591905306113,
 0.5395483889781854,
 0.548022718811565]

In [9]:
# Average log loss over the cross-validation folds.
np.mean(score)

0.5402407295840199

In [10]:
# Refit on the complete training matrix with the same hyper-parameters as in
# the cross-validation loop.
lr = LogisticRegression(
    C=0.025,
    solver='lbfgs',
    multi_class='multinomial',
    random_state=0,
)
lr.fit(X_train, y_train)

# Class probabilities for the test matrix.
pred_lr = lr.predict_proba(X_test)

# Store (out-of-fold training predictions, test predictions) together as one
# tuple. dump() is the last expression, so the cell shows what it returns.
dump((pred_lr_tr, pred_lr), f"../ensemble/{model_name}+basic.joblib")



['../ensemble/LR_256_baseline+basic.joblib']

In [11]:
# Log loss of the refit model's probabilities against the test labels.
log_loss(y_true=y_test, y_pred=pred_lr)

0.5117770696402661

In [12]:
# Fill the sample-submission template with the test-set probabilities and
# write it out under a time-stamped name.
sub_df = pd.read_csv("../test_and_submit/sample_submission_stage_1.csv")

# Fail early with a readable message if predictions and template rows differ.
assert len(sub_df) == len(pred_lr), "prediction count does not match the sample submission"

# NOTE(review): the probabilities are assigned by position, which assumes the
# column order of predict_proba (lr.classes_) corresponds to A, B, NEITHER —
# confirm against the label encoding.
sub_df.loc[:, ['A', 'B', 'NEITHER']] = pred_lr

# str(datetime.now()) contains spaces and colons, which are not valid in
# Windows file names and are awkward in shells; use an explicit safe format.
# Microseconds are kept so successive runs still get distinct names.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
sub_df.to_csv("../test_and_submit/submission+model+" + model_name + "@" + timestamp + ".csv", index=False)
sub_df.head()

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.737477,0.149122,0.113401
1,development-2,0.870883,0.098178,0.030939
2,development-3,0.018041,0.960333,0.021626
3,development-4,0.013279,0.726443,0.260278
4,development-5,0.001412,0.991504,0.007084


In [13]:
# Final model: refit on the training and test rows together.
# The combined arrays get their own names rather than overwriting X_train /
# y_train, so re-running this cell does not append the test rows a second time
# and earlier cells still see the original training data.
X_all = np.concatenate([X_train, X_test], axis=0)
y_all = np.concatenate([y_train, y_test], axis=0)

# NOTE: this model has seen the test labels, so scoring it on X_test would no
# longer be a held-out estimate.
lr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', C=0.025).fit(X_all, y_all)



In [14]:
# Save the fitted model in the current working directory, named after the run.
dump(lr, f"{model_name}.joblib")

['LR_256_baseline.joblib']