In [1]:
%load_ext autoreload
%autoreload 2
import argparse
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
import eli5



In [2]:
from final import *

In [3]:
a = 0.165 / 0.37
b = (1 - 0.165) / (1 - 0.37)

def fix_prob(x):
    return a * x / (a * x + b * (1 - x))

In [4]:
df_train = pd.read_csv('data/train_features.csv', encoding="ISO-8859-1")
X_train_ab = df_train.iloc[:, 2:]
# X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
# X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

In [5]:
X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

In [6]:
%%time
df_train = pd.read_csv('data/train.csv')
df_train = df_train.fillna(' ')

df_test = pd.read_csv('data/test.csv')
ques = pd.concat([df_train[['question1', 'question2']], \
                  df_test[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
    q_dict[ques.question1[i]].add(ques.question2[i])
    q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return (len(q_dict[row['question1']]))

def q2_freq(row):
    return (len(q_dict[row['question2']]))

def q1_q2_intersect(row):
    return (len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

test_leaky = df_test.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]
del df_test

train_leaky = df_train.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]

CPU times: user 5min 17s, sys: 1.4 s, total: 5min 18s
Wall time: 5min 19s


In [7]:
stops = set(stopwords.words("english"))

In [8]:
%%time
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())
# df_train['question1'] = df_train['question1'].map(text_to_wordlist)
# df_train['question2'] = df_train['question2'].map(text_to_wordlist)

train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

CPU times: user 3.65 s, sys: 164 ms, total: 3.82 s
Wall time: 3.81 s


In [9]:
X_train_devil = pd.read_csv('data/train_devil.csv')
X_train_pr = pd.read_csv('data/train_pagerank.csv')
X_train_lda = pd.read_csv("data/train_lda_feat.csv", encoding="ISO-8859-1")
X_train_lda = X_train_lda.iloc[:,6:]
X_train_meph = pd.read_csv('data/mephistopheies_b_train.csv')
X_train_lastwords = pd.read_csv('data/last_words_train.csv')
X_train_stemmeph = pd.read_csv('data/stem_mephi_train.csv')
X_train_stemwordmeph = pd.read_csv('data/stem_word_mephi_train.csv')
X_train_kcores = pd.read_csv('data/kcores_train.csv')
X_train_rest = pd.read_csv('data/mario_all_train.csv')
X_train_rest = X_train_rest.drop(['q1_freq', 'q2_freq', 'q1_q2_intersect', 'kcore_dif'], axis=1)

X_train_devil = X_train_devil.drop('devil_jaccard', axis=1)

Build features

In [10]:
%%time
X_feat = build_features(df_train, stops, weights)

  R = np.sum(shared_weights) / np.sum(total_weights)


CPU times: user 2min 59s, sys: 516 ms, total: 2min 59s
Wall time: 3min


In [11]:
X = pd.concat((X_feat, X_train_ab, train_leaky,
               X_train_devil, X_train_pr, X_train_lda, X_train_meph, X_train_lastwords,
               X_train_stemmeph, X_train_stemwordmeph, X_train_kcores, X_train_rest), axis=1)
y = df_train['is_duplicate'].values

In [12]:
del X_train_devil, X_train_pr, X_train_lda, X_train_meph, X_train_lastwords, X_train_stemmeph,\
    X_train_stemwordmeph, X_train_kcores, X_train_rest

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=17)

In [14]:
del X_train, X_valid, y_train, y_valid

Generate samples.

Build test features

In [15]:
%%time
df_test = pd.read_csv('data/test_features.csv', encoding="ISO-8859-1")
x_test_ab = df_test.iloc[:, 2:]
# x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
# x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

df_test = pd.read_csv('data/test.csv')
df_test = df_test.fillna(' ')

df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())
# df_test['question1'] = df_test['question1'].map(text_to_wordlist)
# df_test['question2'] = df_test['question2'].map(text_to_wordlist)

CPU times: user 26 s, sys: 1.57 s, total: 27.6 s
Wall time: 28.4 s


In [16]:
x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

In [17]:
%%time
x_test_feat = build_features(df_test, stops, weights)

  R = np.sum(shared_weights) / np.sum(total_weights)
  R = np.sum(shared_weights) / np.sum(total_weights)


CPU times: user 17min 23s, sys: 1.4 s, total: 17min 24s
Wall time: 17min 24s


In [18]:
x_test_devil = pd.read_csv('data/test_devil.csv')
x_test_pr = pd.read_csv('data/test_pagerank.csv')
x_test_lda = pd.read_csv("data/test_lda_feat.csv", encoding="ISO-8859-1")
x_test_lda = x_test_lda.iloc[:,3:]
x_test_meph = pd.read_csv('data/mephistopheies_b_test.csv')
x_test_lastwords = pd.read_csv('data/last_words_test.csv')
x_test_stemmeph = pd.read_csv('data/stem_mephi_test.csv')
x_test_stemwordmeph = pd.read_csv('data/stem_word_mephi_test.csv')
x_test_kcores = pd.read_csv('data/kcores_test.csv')
x_test_rest = pd.read_csv('data/mario_all_test.csv')
x_test_rest = x_test_rest.drop(['q1_freq', 'q2_freq', 'q1_q2_intersect', 'kcore_dif'], axis=1)

In [19]:
x_test = pd.concat((x_test_feat, x_test_ab, test_leaky,
                    x_test_devil, x_test_pr, x_test_lda, x_test_meph, x_test_lastwords,
                    x_test_stemmeph, x_test_stemwordmeph, x_test_kcores, x_test_rest), axis=1)

In [20]:
del x_test_devil, x_test_pr, x_test_lda, x_test_meph, x_test_lastwords, x_test_stemmeph,\
    x_test_stemwordmeph, x_test_kcores, x_test_rest

In [21]:
d_test = xgb.DMatrix(x_test)

In [35]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=18)

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.75
params['base_score'] = 0.2

params['colsample_bytree'] = 1
params['colsample_bylevel'] = 1
params['n_jobs'] = -1

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_tr, X_val = X.iloc[train_index], X.iloc[test_index]
    y_tr, y_val = y[train_index], y[test_index]
    
    d_train = xgb.DMatrix(X_tr, label=y_tr)
    d_valid = xgb.DMatrix(X_val, label=y_val)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=50, verbose_eval=200)
    val_loss = log_loss(y_val, bst.predict(d_valid))
    print(val_loss)
    
    p_test = bst.predict(d_test)
    sub = pd.DataFrame()
    sub['test_id'] = df_test['test_id']
    sub['is_duplicate'] = fix_prob(p_test)
    sub.to_csv('predictions/4fold_rs18_{}_{:.5f}.csv'.format(i, val_loss), index=False)

[0]	train-logloss:0.713267	valid-logloss:0.713306
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.232631	valid-logloss:0.239005
[400]	train-logloss:0.210479	valid-logloss:0.224151
[600]	train-logloss:0.196636	valid-logloss:0.218464
[800]	train-logloss:0.186463	valid-logloss:0.215533
[1000]	train-logloss:0.17812	valid-logloss:0.21378
[1200]	train-logloss:0.170811	valid-logloss:0.212577
[1400]	train-logloss:0.163896	valid-logloss:0.211703
[1600]	train-logloss:0.157735	valid-logloss:0.211067
[1800]	train-logloss:0.151867	valid-logloss:0.210523
[2000]	train-logloss:0.146313	valid-logloss:0.210143
[2200]	train-logloss:0.140885	valid-logloss:0.209799
[2400]	train-logloss:0.135637	valid-logloss:0.209502
[2600]	train-logloss:0.130748	valid-logloss:0.209337
[2800]	train-logloss:0.125902	valid-logloss:0.209118
[3000]	train-logloss:0.121293	valid-logloss:0.209008
[3200]	tra

# RF

In [22]:
X_rf = X.fillna(X.mean())
X_rf = X_rf.replace(np.inf, 1e30)

In [23]:
from sklearn.ensemble import RandomForestClassifier as RF

rf = RF(n_estimators=1500, criterion='gini',
        max_depth=None, min_samples_split=2, min_samples_leaf=1, min_impurity_split=1e-07,
        warm_start=True,
        n_jobs=-1, random_state=17, verbose=1)

rf.fit(X_rf, y);

[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 14.0min finished


In [27]:
x_test_rf = x_test.fillna(x_test.mean())
x_test_rf = x_test_rf.replace(np.inf, 1e30)

In [32]:
p_test = rf.predict_proba(x_test_rf)
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = fix_prob(p_test[:,1])
sub.to_csv('predictions/rf.csv', index=False)

In [33]:
del X_rf, x_test_rf, rf

---

Train model

In [291]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.75
params['base_score'] = 0.2

params['colsample_bytree'] = 1
params['colsample_bylevel'] = 1
params['n_jobs'] = -1

param['updater'] = 'grow_gpu' 

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=100, verbose_eval=100)
print(log_loss(y_valid, bst.predict(d_valid)))
# bst.save_model('xgb.mdl')

[0]	train-logloss:0.71398	valid-logloss:0.706504
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.271359	valid-logloss:0.27654
[200]	train-logloss:0.233244	valid-logloss:0.241401


KeyboardInterrupt: 

In [288]:
eli5.explain_weights_xgboost(bst, top=50)

Weight,Feature
0.2313,devil_cos
0.0987,q1_q2_intersect
0.0974,devil_jaccard
0.0943,word_match
0.0275,kcore_min
0.0174,intersection_count
0.0141,stem_chunigram_all_jaccard_max
0.0137,tfidf_wm
0.0126,max_degree
0.0100,min_degree


Test model

In [289]:
p_test = bst.predict(d_test)
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = fix_prob(p_test)
sub.to_csv('predictions/{:.5f}_py3_wo_sampling_fixed_5000iter_devilFixed_pr_ek2_lda_params_meph_lastwords_stemmeph_stemwordmeph_kcores_early100_rest.csv'.format(log_loss, index=False)