In [68]:
import numpy as np, pandas as pd
import ast 
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
data = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [3]:
data.shape

(87598, 14)

In [4]:
data.head(3)

Unnamed: 0,answer_start,context,question,text,sentences,quest_emb,target,sent_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first
0,515,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,"['Architecturally, the school has a Catholic c...",[[ 0.11010079 0.11422941 0.11560896 ... 0.0...,5,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.424736299052452, 0.36405004106069117, 0.347...","[14.563858, 15.262212, 17.398178, 14.272491, 1...",5,5,[5],5
1,188,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,"['Architecturally, the school has a Catholic c...",[[ 0.10951651 0.11030623 0.05210007 ... -0.0...,2,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.45407456884452513, 0.32262004808444933, 0.3...","[12.889506, 12.285219, 16.843704, 8.361172, 11...",3,3,[],0
2,279,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,"['Architecturally, the school has a Catholic c...",[[ 0.01195647 0.14930707 0.02660049 ... 0.0...,3,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[0.39585783692319865, 0.29170832145169434, 0.3...","[11.857297, 11.392319, 15.061656, 7.1847134, 8...",3,3,"[1, 2, 3, 4, 5, 6]",1


In [5]:
ast.literal_eval(data["sentences"][0])

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [6]:
data = data[data["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [7]:
def create_features(data):
    train = pd.DataFrame()
     
    for k in range(len(data["euclidean_dis"])):
        dis = ast.literal_eval(data["euclidean_dis"][k])
        for i in range(len(dis)):
            train.loc[k, "column_euc_"+"%s"%i] = dis[i]
    
    print("Finished")
    
    for k in range(len(data["cosine_sim"])):
        dis = ast.literal_eval(data["cosine_sim"][k].replace("nan","1"))
        for i in range(len(dis)):
            train.loc[k, "column_cos_"+"%s"%i] = dis[i]
            
    train["target"] = data["target"]
    return train

In [8]:
train = create_features(data)

Finished


In [9]:
del data

In [10]:
train.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,14.563858,15.262212,17.398178,14.272491,13.339654,9.336262,15.720997,,,,...,0.36405,0.347755,0.394242,0.371025,0.18569,0.351921,,,,5
1,12.889506,12.285219,16.843704,8.361172,11.918098,17.601221,14.929258,,,,...,0.32262,0.355004,0.271561,0.392342,0.384383,0.362597,,,,2
2,11.857297,11.392319,15.061656,7.184713,8.465475,13.927309,12.249868,,,,...,0.291708,0.309919,0.223061,0.265975,0.293025,0.288711,,,,3


In [11]:
# train.fillna(10000, inplace=True)

In [12]:
train.head(3).transpose()

Unnamed: 0,0,1,2
column_euc_0,14.563858,12.889506,11.857297
column_euc_1,15.262212,12.285219,11.392319
column_euc_2,17.398178,16.843704,15.061656
column_euc_3,14.272491,8.361172,7.184713
column_euc_4,13.339654,11.918098,8.465475
column_euc_5,9.336262,17.601221,13.927309
column_euc_6,15.720997,14.929258,12.249868
column_euc_7,,,
column_euc_8,,,
column_euc_9,,,


### Fitting Multinomial Logistic Regression

### Standardize

In [13]:
train.apply(max, axis = 0)

column_euc_0    64.051060
column_euc_1    70.236871
column_euc_2    57.691372
column_euc_3    56.178432
column_euc_4    58.976977
column_euc_5    50.716494
column_euc_6    60.221045
column_euc_7          NaN
column_euc_8          NaN
column_euc_9          NaN
column_cos_0     1.466755
column_cos_1     1.606138
column_cos_2     1.552746
column_cos_3     1.544334
column_cos_4     1.542615
column_cos_5     1.477041
column_cos_6     1.544371
column_cos_7          NaN
column_cos_8          NaN
column_cos_9          NaN
target           9.000000
dtype: float64

In [14]:
subset1 = train.iloc[:,:10].fillna(60)
subset2 = train.iloc[:,10:].fillna(1)

In [15]:
 subset1.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9
0,14.563858,15.262212,17.398178,14.272491,13.339654,9.336262,15.720997,60.0,60.0,60.0
1,12.889506,12.285219,16.843704,8.361172,11.918098,17.601221,14.929258,60.0,60.0,60.0
2,11.857297,11.392319,15.061656,7.184713,8.465475,13.927309,12.249868,60.0,60.0,60.0


In [16]:
 subset2.head(3)

Unnamed: 0,column_cos_0,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,0.424736,0.36405,0.347755,0.394242,0.371025,0.18569,0.351921,1.0,1.0,1.0,5
1,0.454075,0.32262,0.355004,0.271561,0.392342,0.384383,0.362597,1.0,1.0,1.0,2
2,0.395858,0.291708,0.309919,0.223061,0.265975,0.293025,0.288711,1.0,1.0,1.0,3


In [17]:
train2 = pd.concat([subset1, subset2],axis=1, join_axes=[subset1.index])

In [18]:
train2.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,14.563858,15.262212,17.398178,14.272491,13.339654,9.336262,15.720997,60.0,60.0,60.0,...,0.36405,0.347755,0.394242,0.371025,0.18569,0.351921,1.0,1.0,1.0,5
1,12.889506,12.285219,16.843704,8.361172,11.918098,17.601221,14.929258,60.0,60.0,60.0,...,0.32262,0.355004,0.271561,0.392342,0.384383,0.362597,1.0,1.0,1.0,2
2,11.857297,11.392319,15.061656,7.184713,8.465475,13.927309,12.249868,60.0,60.0,60.0,...,0.291708,0.309919,0.223061,0.265975,0.293025,0.288711,1.0,1.0,1.0,3


In [19]:
train2.apply(max, axis = 0)

column_euc_0    64.051060
column_euc_1    70.236871
column_euc_2    60.000000
column_euc_3    60.000000
column_euc_4    60.000000
column_euc_5    60.000000
column_euc_6    60.221045
column_euc_7    60.000000
column_euc_8    60.000000
column_euc_9    60.000000
column_cos_0     1.466755
column_cos_1     1.606138
column_cos_2     1.552746
column_cos_3     1.544334
column_cos_4     1.542615
column_cos_5     1.477041
column_cos_6     1.544371
column_cos_7     1.450005
column_cos_8     1.118746
column_cos_9     1.023689
target           9.000000
dtype: float64

In [20]:
scaler = MinMaxScaler()
X = scaler.fit_transform(train2.iloc[:,:-1])

In [21]:
X

array([[0.21673264, 0.20752197, 0.27581956, ..., 0.67938476, 0.88892501,
        0.97551015],
       [0.19023154, 0.16460763, 0.26639416, ..., 0.67938476, 0.88892501,
        0.97551015],
       [0.17389408, 0.15173618, 0.23610146, ..., 0.67938476, 0.88892501,
        0.97551015],
       ...,
       [0.19892814, 0.163572  , 0.26380138, ..., 0.67938476, 0.88892501,
        0.97551015],
       [0.17695844, 0.14612473, 0.25149443, ..., 0.67938476, 0.88892501,
        0.97551015],
       [0.27575241, 0.23371491, 0.39075045, ..., 0.67938476, 0.88892501,
        0.97551015]])

In [22]:
train_x, test_x, train_y, test_y = train_test_split(X,
train.iloc[:,-1], train_size=0.8, random_state = 5)

In [23]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))


Multinomial Logistic regression Train Accuracy :  0.6373448858212791
Multinomial Logistic regression Test Accuracy :  0.6398026315789473


### Logistic-Regression with Root Match feature

In [24]:
predicted = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [25]:
predicted = predicted[predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [26]:
predicted.shape

(85119, 14)

In [27]:
def get_columns_from_root(train):
    
    for i in range(train.shape[0]):
        if len(ast.literal_eval(train["root_match_idx"][i])) == 0: pass
        
        else:
            for item in ast.literal_eval(train["root_match_idx"][i]):
                train.loc[i, "column_root_"+"%s"%item] = 1
    return train

In [28]:
predicted = get_columns_from_root(predicted)

In [29]:
predicted.head(3).transpose()

Unnamed: 0,0,1,2
answer_start,515,188,279
context,"Architecturally, the school has a Catholic cha...","Architecturally, the school has a Catholic cha...","Architecturally, the school has a Catholic cha..."
question,To whom did the Virgin Mary allegedly appear i...,What is in front of the Notre Dame Main Building?,The Basilica of the Sacred heart at Notre Dame...
text,Saint Bernadette Soubirous,a copper statue of Christ,the Main Building
sentences,"['Architecturally, the school has a Catholic c...","['Architecturally, the school has a Catholic c...","['Architecturally, the school has a Catholic c..."
quest_emb,[[ 0.11010079 0.11422941 0.11560896 ... 0.0...,[[ 0.10951651 0.11030623 0.05210007 ... -0.0...,[[ 0.01195647 0.14930707 0.02660049 ... 0.0...
target,5,2,3
sent_emb,"[array([ 0.05519997, 0.05013141, 0.04787038,...","[array([ 0.05519997, 0.05013141, 0.04787038,...","[array([ 0.05519997, 0.05013141, 0.04787038,..."
cosine_sim,"[0.424736299052452, 0.36405004106069117, 0.347...","[0.45407456884452513, 0.32262004808444933, 0.3...","[0.39585783692319865, 0.29170832145169434, 0.3..."
euclidean_dis,"[14.563858, 15.262212, 17.398178, 14.272491, 1...","[12.889506, 12.285219, 16.843704, 8.361172, 11...","[11.857297, 11.392319, 15.061656, 7.1847134, 8..."


In [30]:
subset3 = predicted[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9"]]

In [31]:
subset3.fillna(0, inplace=True)

In [32]:
train3 = pd.concat([subset3, train2],axis=1, join_axes=[subset3.index])

In [33]:
train3.head(3).transpose()

Unnamed: 0,0,1,2
column_root_0,0.0,0.0,0.0
column_root_1,0.0,0.0,1.0
column_root_2,0.0,0.0,1.0
column_root_3,0.0,0.0,1.0
column_root_4,0.0,0.0,1.0
column_root_5,1.0,0.0,1.0
column_root_6,0.0,0.0,1.0
column_root_7,0.0,0.0,0.0
column_root_8,0.0,0.0,0.0
column_root_9,0.0,0.0,0.0


In [34]:
train3 = train3[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9", "column_cos_0","column_cos_1",\
           "column_cos_2","column_cos_3","column_cos_4","column_cos_5",\
             "column_cos_6","column_cos_7","column_cos_8","column_cos_9", "target"]]

In [35]:
train_x, test_x, train_y, test_y = train_test_split(train3.iloc[:,:-1],
train3.iloc[:,-1], train_size=0.8, random_state = 5)

In [36]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))


Multinomial Logistic regression Train Accuracy :  0.6421910566120861
Multinomial Logistic regression Test Accuracy :  0.6449718045112782


### Random Forest

In [64]:
rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=60)
rf.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.7216388868492547
Multinomial Logistic regression Test Accuracy :  0.670641447368421


### XgBoost

In [72]:
model = xgb.XGBClassifier()
param_dist = {"max_depth": [3,5,10],
              "min_child_weight" : [1,5,10],
              "learning_rate": [0.07, 0.1,0.2],
               }

# run randomized search
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=5, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.6720993745044489, total= 1.7min
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=5, score=0.6727601092414766, total= 1.7min
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.6748612212529739, total= 1.7min
[CV] learning_rate=0.07, max_depth=3, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.6715576118087685, total= 1.7min
[CV] learning_rate=0.07, max_depth=3, min_chil

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.9min


[CV]  learning_rate=0.07, max_depth=5, min_child_weight=1, score=0.6782662320500397, total= 2.3min
[CV] learning_rate=0.07, max_depth=5, min_child_weight=5 .............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=1, score=0.6802802008987576, total= 2.3min
[CV] learning_rate=0.07, max_depth=5, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=5, score=0.6796757994890318, total= 2.4min
[CV] learning_rate=0.07, max_depth=5, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=10, score=0.6782221830675711, total= 2.3min
[CV] learning_rate=0.07, max_depth=5, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=5, score=0.6802802008987576, total= 2.4min
[CV] learning_rate=0.07, max_depth=10, min_child_weight=1 ............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=5, score=0.67724168319013, total= 2.4min
[CV] learning_rate=0.07, max_depth=10, min_child_wei

[CV] learning_rate=0.2, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=5, score=0.6776823088786076, total= 1.5min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=1 ..............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.6774293013831381, total= 1.6min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=1 ..............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.6798836901929686, total= 1.5min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=1 ..............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.6771976206212822, total= 1.5min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=5 ..............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.6776054973130121, total= 2.1min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=5 ..............


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 40.3min


[CV]  learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.6790466120363028, total= 2.1min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=5 ..............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.6768891826393478, total= 2.1min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=5, score=0.6764161747863624, total= 2.1min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=5, score=0.6787822715657768, total= 2.3min
[CV] learning_rate=0.2, max_depth=5, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=5, score=0.6767569949328046, total= 2.3min
[CV] learning_rate=0.2, max_depth=10, min_child_weight=1 .............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=10, score=0.677165007488327, total= 2.3min
[CV] learning_rate=0.2, max_depth=10, min_child_weight=1 

[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 52.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 10], 'min_child_weight': [1, 5, 10], 'learning_rate': [0.07, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [73]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [74]:
xg = xgb.XGBClassifier(max_depth=5)
xg.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, xg.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, xg.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.7062926793450327
Multinomial Logistic regression Test Accuracy :  0.685561560150376


In [69]:
xgb.XGBClassifier??