In [2]:
import numpy as np, pandas as pd
import ast 
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')
import spacy
from nltk import Tree
en_nlp = spacy.load('en')
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
data = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [4]:
data.shape


(14480, 15)

In [5]:
data.head(3)

Unnamed: 0,answer_start,answers,context,question,text,sentences,target,sent_emb,quest_emb,cosine_sim,euclidean_dis,pred_idx_cos,pred_idx_euc,root_match_idx,root_match_idx_first
0,269,yes,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([0.03037658, 0.04433101, 0.08135635, .....",[[0.01491953 0.02197376 0.02136409 ... 0.01360...,"[0.1401391625404358, 0.11776834726333618, 0.09...","[2.8352642, 2.4563262, 1.5417788, 2.9730926]",2,2,[],-1
1,207,yes,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,1,"[array([0.03037658, 0.04433101, 0.08135635, .....",[[0.04444952 0.02800576 0.03035772 ... 0.02242...,"[0.12254136800765991, 0.08665323257446289, 0.0...","[2.396976, 1.7860672, 1.1152366, 2.3845947]",2,2,[],-1
2,526,yes,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,3,"[array([0.03037658, 0.04433101, 0.08135635, .....",[[0.03949683 0.04509903 0.01808935 ... 0.04610...,"[0.09432470798492432, 0.06841456890106201, 0.0...","[1.8688717, 1.4023948, 1.0954475, 1.7620108]",1,2,[],-1


In [6]:
ast.literal_eval(data["sentences"][0])

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.',
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.",
 'Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".']

In [7]:
data = data[data["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [8]:
def create_features(data):
    train = pd.DataFrame()
     
    for k in range(len(data["euclidean_dis"])):
        dis = ast.literal_eval(data["euclidean_dis"][k])
        for i in range(len(dis)):
            train.loc[k, "column_euc_"+"%s"%i] = dis[i]
    
    print("Finished euclidean_dis")
    
    for k in range(len(data["cosine_sim"])):
        dis = ast.literal_eval(data["cosine_sim"][k].replace("nan","1"))
        for i in range(len(dis)):
            train.loc[k, "column_cos_"+"%s"%i] = dis[i]
            
    train["target"] = data["target"]
    return train

In [9]:
train = create_features(data)

Finished enclidean_dis


In [10]:
del data

In [11]:
train.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,2.835264,2.456326,1.541779,2.973093,,,,,,,...,0.117768,0.099913,0.122374,,,,,,,1
1,2.396976,1.786067,1.115237,2.384595,,,,,,,...,0.086653,0.073723,0.101862,,,,,,,1
2,1.868872,1.402395,1.095447,1.762011,,,,,,,...,0.068415,0.072894,0.072626,,,,,,,3


In [12]:
train.head(3).transpose()

Unnamed: 0,0,1,2
column_euc_0,2.835264,2.396976,1.868872
column_euc_1,2.456326,1.786067,1.402395
column_euc_2,1.541779,1.115237,1.095447
column_euc_3,2.973093,2.384595,1.762011
column_euc_4,,,
column_euc_5,,,
column_euc_6,,,
column_euc_7,,,
column_euc_8,,,
column_euc_9,,,


## Fitting Multinomial Logistic Regression¶


### standardize

In [14]:
train.apply(max, axis = 0)

column_euc_0    8.966061
column_euc_1    8.182969
column_euc_2    8.952271
column_euc_3    8.043482
column_euc_4         NaN
column_euc_5         NaN
column_euc_6         NaN
column_euc_7         NaN
column_euc_8         NaN
column_euc_9         NaN
column_cos_0    1.000000
column_cos_1    1.000000
column_cos_2    1.000000
column_cos_3    1.000000
column_cos_4         NaN
column_cos_5         NaN
column_cos_6         NaN
column_cos_7         NaN
column_cos_8         NaN
column_cos_9         NaN
target          9.000000
dtype: float64

In [15]:
subset1 = train.iloc[:,:10].fillna(60)
subset2 = train.iloc[:,10:].fillna(1)

In [16]:
subset1.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9
0,2.835264,2.456326,1.541779,2.973093,60.0,60.0,60.0,60.0,60.0,60.0
1,2.396976,1.786067,1.115237,2.384595,60.0,60.0,60.0,60.0,60.0,60.0
2,1.868872,1.402395,1.095447,1.762011,60.0,60.0,60.0,60.0,60.0,60.0


In [17]:
subset2.head(3)

Unnamed: 0,column_cos_0,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,0.140139,0.117768,0.099913,0.122374,1.0,1.0,1.0,1.0,1.0,1.0,1
1,0.122541,0.086653,0.073723,0.101862,1.0,1.0,1.0,1.0,1.0,1.0,1
2,0.094325,0.068415,0.072894,0.072626,1.0,1.0,1.0,1.0,1.0,1.0,3


In [18]:
train2 = pd.concat([subset1, subset2],axis=1, join_axes=[subset1.index])

In [19]:
train2.head(3)

Unnamed: 0,column_euc_0,column_euc_1,column_euc_2,column_euc_3,column_euc_4,column_euc_5,column_euc_6,column_euc_7,column_euc_8,column_euc_9,...,column_cos_1,column_cos_2,column_cos_3,column_cos_4,column_cos_5,column_cos_6,column_cos_7,column_cos_8,column_cos_9,target
0,2.835264,2.456326,1.541779,2.973093,60.0,60.0,60.0,60.0,60.0,60.0,...,0.117768,0.099913,0.122374,1.0,1.0,1.0,1.0,1.0,1.0,1
1,2.396976,1.786067,1.115237,2.384595,60.0,60.0,60.0,60.0,60.0,60.0,...,0.086653,0.073723,0.101862,1.0,1.0,1.0,1.0,1.0,1.0,1
2,1.868872,1.402395,1.095447,1.762011,60.0,60.0,60.0,60.0,60.0,60.0,...,0.068415,0.072894,0.072626,1.0,1.0,1.0,1.0,1.0,1.0,3


In [20]:
train2.apply(max, axis = 0)

column_euc_0     8.966061
column_euc_1    60.000000
column_euc_2    60.000000
column_euc_3    60.000000
column_euc_4    60.000000
column_euc_5    60.000000
column_euc_6    60.000000
column_euc_7    60.000000
column_euc_8    60.000000
column_euc_9    60.000000
column_cos_0     1.000000
column_cos_1     1.000000
column_cos_2     1.000000
column_cos_3     1.000000
column_cos_4     1.000000
column_cos_5     1.000000
column_cos_6     1.000000
column_cos_7     1.000000
column_cos_8     1.000000
column_cos_9     1.000000
target           9.000000
dtype: float64

In [23]:
scaler  =  MinMaxScaler ()
X = scaler.fit_transform(train2.iloc[:,:-1])

In [24]:
X

array([[0.30728601, 0.03862689, 0.02339999, ..., 1.        , 1.        ,
        1.        ],
       [0.25776416, 0.02742898, 0.0162742 , ..., 1.        , 1.        ,
        1.        ],
       [0.19809406, 0.02101902, 0.0159436 , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.12737751, 0.01028485, 0.0166541 , ..., 1.        , 1.        ,
        1.        ],
       [0.16713035, 0.01570982, 0.01832059, ..., 1.        , 1.        ,
        1.        ],
       [0.05847764, 0.01426775, 0.02062567, ..., 1.        , 1.        ,
        1.        ]])

In [25]:
train_x, test_x, train_y, test_y = train_test_split(X,
train.iloc[:,-1], train_size=0.8, random_state = 5)

In [26]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.3087949035568926
Multinomial Logistic regression Test Accuracy :  0.2898089171974522


## Logistic-Regression with Root Match feature

In [27]:
predicted = pd.read_csv("train_detect_sent.csv").reset_index(drop=True)

In [28]:
predicted = predicted[predicted["sentences"].apply(lambda x: len(ast.literal_eval(x)))<11].reset_index(drop=True)

In [29]:
predicted.shape

(14128, 15)

In [30]:
def get_columns_from_root(train):
    
    for i in range(train.shape[0]):
        if len(ast.literal_eval(train["root_match_idx"][i])) == 0: pass
        
        else:
            for item in ast.literal_eval(train["root_match_idx"][i]):
                train.loc[i, "column_root_"+"%s"%item] = 1
    return train

In [31]:
predicted = get_columns_from_root(predicted)

In [32]:
predicted.head(3).transpose()

Unnamed: 0,0,1,2
answer_start,269,207,526
answers,yes,yes,yes
context,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
question,When did Beyonce start becoming popular?,What areas did Beyonce compete in when she was...,When did Beyonce leave Destiny's Child and bec...
text,in the late 1990s,singing and dancing,2003
sentences,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...,['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/...
target,1,1,3
sent_emb,"[array([0.03037658, 0.04433101, 0.08135635, .....","[array([0.03037658, 0.04433101, 0.08135635, .....","[array([0.03037658, 0.04433101, 0.08135635, ....."
quest_emb,[[0.01491953 0.02197376 0.02136409 ... 0.01360...,[[0.04444952 0.02800576 0.03035772 ... 0.02242...,[[0.03949683 0.04509903 0.01808935 ... 0.04610...
cosine_sim,"[0.1401391625404358, 0.11776834726333618, 0.09...","[0.12254136800765991, 0.08665323257446289, 0.0...","[0.09432470798492432, 0.06841456890106201, 0.0..."


In [33]:
subset3 = predicted[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9"]]

In [34]:
subset3.fillna(0, inplace=True)

In [35]:
train3 = pd.concat([subset3, train2],axis=1, join_axes=[subset3.index])

In [36]:
train3.head(3).transpose()

Unnamed: 0,0,1,2
column_root_0,0.0,0.0,0.0
column_root_1,0.0,0.0,0.0
column_root_2,0.0,0.0,0.0
column_root_3,0.0,0.0,0.0
column_root_4,0.0,0.0,0.0
column_root_5,0.0,0.0,0.0
column_root_6,0.0,0.0,0.0
column_root_7,0.0,0.0,0.0
column_root_8,0.0,0.0,0.0
column_root_9,0.0,0.0,0.0


In [37]:
train3 = train3[["column_root_0","column_root_1","column_root_2","column_root_3","column_root_4","column_root_5",\
             "column_root_6","column_root_7","column_root_8","column_root_9", "column_cos_0","column_cos_1",\
           "column_cos_2","column_cos_3","column_cos_4","column_cos_5",\
             "column_cos_6","column_cos_7","column_cos_8","column_cos_9", "target"]]

In [38]:
train_x, test_x, train_y, test_y = train_test_split(train3.iloc[:,:-1],
train3.iloc[:,-1], train_size=0.8, random_state = 5)

In [39]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.3577242965846753
Multinomial Logistic regression Test Accuracy :  0.3453644727530078


## XgBoost

In [40]:
model = xgb.XGBClassifier()
param_dist = {"max_depth": [3,5,10],
              "min_child_weight" : [1,5,10],
              "learning_rate": [0.07, 0.1,0.2],
               }

# run randomized search
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=5, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_depth=3, learning_rate=0.07, min_child_weight=1 .............
[CV] max_depth=3, learning_rate=0.07, min_child_weight=1 .............
[CV] max_depth=3, learning_rate=0.07, min_child_weight=1 .............
[CV] max_depth=3, learning_rate=0.07, min_child_weight=5 .............
[CV]  max_depth=3, learning_rate=0.07, min_child_weight=1, score=0.44630071599045346, total=  10.0s
[CV] max_depth=3, learning_rate=0.07, min_child_weight=5 .............
[CV]  max_depth=3, learning_rate=0.07, min_child_weight=1, score=0.4435290991230401, total=  10.3s
[CV] max_depth=3, learning_rate=0.07, min_child_weight=5 .............
[CV]  max_depth=3, learning_rate=0.07, min_child_weight=1, score=0.43657112526539277, total=  10.4s
[CV] max_depth=3, learning_rate=0.07, min_child_weight=10 ............
[CV]  max_depth=3, learning_rate=0.07, min_child_weight=5, score=0.44550517104216386, total=  11.0s
[CV] max_depth=3, learning_rate=0.07, min_c

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   37.6s


[CV]  max_depth=5, learning_rate=0.07, min_child_weight=5, score=0.4553168920710687, total=  14.7s
[CV] max_depth=5, learning_rate=0.07, min_child_weight=10 ............
[CV]  max_depth=5, learning_rate=0.07, min_child_weight=10, score=0.4465658976398833, total=  14.6s
[CV] max_depth=5, learning_rate=0.07, min_child_weight=10 ............
[CV]  max_depth=5, learning_rate=0.07, min_child_weight=5, score=0.44028662420382164, total=  15.1s
[CV] max_depth=10, learning_rate=0.07, min_child_weight=1 ............
[CV]  max_depth=5, learning_rate=0.07, min_child_weight=5, score=0.43874568163699174, total=  15.7s
[CV] max_depth=10, learning_rate=0.07, min_child_weight=1 ............
[CV]  max_depth=5, learning_rate=0.07, min_child_weight=10, score=0.4437367303609342, total=  14.6s
[CV] max_depth=10, learning_rate=0.07, min_child_weight=1 ............
[CV]  max_depth=5, learning_rate=0.07, min_child_weight=10, score=0.4363539728939676, total=  14.7s
[CV] max_depth=10, learning_rate=0.07, min_chi

[CV] max_depth=5, learning_rate=0.2, min_child_weight=1 ..............
[CV]  max_depth=3, learning_rate=0.2, min_child_weight=10, score=0.43842887473460723, total=   9.6s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=1 ..............
[CV]  max_depth=3, learning_rate=0.2, min_child_weight=10, score=0.44246611745947384, total=   9.3s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=5 ..............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=1, score=0.4386104481569875, total=  15.4s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=5 ..............


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  4.6min


[CV]  max_depth=5, learning_rate=0.2, min_child_weight=1, score=0.43365180467091297, total=  14.4s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=5 ..............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=1, score=0.4416688812117991, total=  15.0s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=10 .............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=5, score=0.44311853619729513, total=  14.6s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=10 .............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=5, score=0.4294055201698514, total=  15.6s
[CV] max_depth=5, learning_rate=0.2, min_child_weight=10 .............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=5, score=0.43794844538931704, total=  17.2s
[CV] max_depth=10, learning_rate=0.2, min_child_weight=1 .............
[CV]  max_depth=5, learning_rate=0.2, min_child_weight=10, score=0.44099708300185625, total=  16.7s
[CV] max_depth=10, learning_rate=0.2, min_child_weig

[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  6.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 10], 'min_child_weight': [1, 5, 10], 'learning_rate': [0.07, 0.1, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [41]:
xg = xgb.XGBClassifier(max_depth=5)
xg.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, xg.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, xg.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.5792780038931162
Multinomial Logistic regression Test Accuracy :  0.45046001415428166
