https://www.kaggle.com/ericxu10101/quora-question-pairs/xgboost-tfidf-logloss-0-3

In [44]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
# import seaborn as sns
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [6]:
df_train = pd.read_csv('../data/train.csv', nrows = 50000)
df_train = df_train.fillna(' ')
# df_test = pd.read_csv('../data/test.csv', nrows = 10000)
# df_test = df_test.fillna(' ')

# explore
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
# test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist()).astype(str)

stops = set(stopwords.words("english"))

In [22]:
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_diff_unique_stop(row):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def same_start_word(row):
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_diff_unique_stop(row):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))


In [8]:
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [9]:
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

In [125]:
# df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
# df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())

In [10]:
eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [23]:
def build_features(data):
    X = pd.DataFrame()
    X['word_match'] = data.apply(word_match_share, axis=1, raw=True)
    X['tfidf_wm'] = data.apply(tfidf_word_match_share, axis=1, raw=True)
    X['tfidf_wm_stops'] = data.apply(tfidf_word_match_share_stops, axis=1, raw=True)
    X['jaccard'] = data.apply(jaccard, axis=1, raw=True)
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True)
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True)
    X['wc_diff_unq_stop'] = data.apply(wc_diff_unique_stop, axis=1, raw=True)
    X['same_start'] = data.apply(same_start_word, axis=1, raw=True)
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True)
    X['char_diff_unq_stop'] = data.apply(char_diff_unique_stop, axis=1, raw=True)
    
    return X

In [24]:
# model
# x_train = pd.DataFrame()
# x_train['word_match'] = df_train.apply(word_match_share, axis=1, raw=True)
# x_train['tfidf_wm'] = df_train.apply(tfidf_word_match_share, axis=1, raw=True)
# x_train['tfidf_wm_stops'] = df_train.apply(tfidf_word_match_share_stops, axis=1, raw=True)
# x_train['jaccard'] = df_train.apply(jaccard, axis=1, raw=True)
# x_train['wc_diff'] = df_train.apply(wc_diff, axis=1, raw=True)
# x_train['wc_diff_unique'] = df_train.apply(wc_diff_unique, axis=1, raw=True)
# x_train['wc_diff_unq_stop'] = df_train.apply(wc_diff_unique_stop, axis=1, raw=True)
# x_train['same_start'] = df_train.apply(same_start_word, axis=1, raw=True)
# x_train['char_diff'] = df_train.apply(char_diff, axis=1, raw=True)

x_train = build_features(df_train)
y_train = df_train['is_duplicate'].values

# x_test = build_features(df_test)



In [25]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

In [26]:
# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165

In [27]:
print(len(pos_train))
print(len(neg_train))
print(len(pos_train)/(len(pos_train)+len(neg_train)))

18649
31351
0.37298


In [28]:
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

0.19091346498367168


In [29]:
x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
# del pos_train, neg_train

In [30]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [31]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4
params['subsample'] = 1.0

In [32]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

# d_test = xgb.DMatrix(x_test)
# p_test = bst.predict(d_test)
# sub = pd.DataFrame()
# sub['test_id'] = df_test['test_id']
# sub['is_duplicate'] = p_test

# print(sub.head())

[0]	train-logloss:0.682856	valid-logloss:0.683037
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.438653	valid-logloss:0.443481
[100]	train-logloss:0.375683	valid-logloss:0.382171
[150]	train-logloss:0.355211	valid-logloss:0.36247
[200]	train-logloss:0.347369	valid-logloss:0.355148
[250]	train-logloss:0.343703	valid-logloss:0.351883
[300]	train-logloss:0.341348	valid-logloss:0.350064
[350]	train-logloss:0.339555	valid-logloss:0.348832
[400]	train-logloss:0.337756	valid-logloss:0.347687
[450]	train-logloss:0.33587	valid-logloss:0.346489
[500]	train-logloss:0.334274	valid-logloss:0.345533
[550]	train-logloss:0.332916	valid-logloss:0.344741
[600]	train-logloss:0.331618	valid-logloss:0.343976
[650]	train-logloss:0.330384	valid-logloss:0.34324
[700]	train-logloss:0.329303	valid-logloss:0.342652
[750]	train-logloss:0.328239	valid-logloss:0.342101
[800]	train-logloss:0.3

In [33]:
bst.get_score()

{'char_diff': 995,
 'char_diff_unq_stop': 823,
 'jaccard': 1805,
 'same_start': 254,
 'tfidf_wm': 3450,
 'tfidf_wm_stops': 2544,
 'wc_diff': 593,
 'wc_diff_unique': 380,
 'wc_diff_unq_stop': 402,
 'word_match': 2298}

In [40]:
pred_y = bst.predict(d_valid)

In [41]:
(pred_y > 0.5).astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
log_loss(y_valid, pred_y)

0.33962351851029654

In [46]:
accuracy_score(y_valid, (pred_y > 0.5).astype(int))

0.81819112453293752

(50k, 1000 rounds)

* Original Rebalanced - .355
* O + jacc_nostpo - .348
* O + jacc + wc_diffs - .344
* O + jacc + wcd + tf_stp - .340
* O + jacc + wcd + tf_stp + char_sp - .340

(50k, 1000 rounds, 0.5 subsample)
* O + jacc + wc_diffs - .343

(50k, 1000 rounds, 1.0 subsample, max_depth 5)
* O + jacc + wc_diffs - .341

(50k, 1000 rounds, no rebalance)
* Original - .457
* O + jacc + wc_diffs - .441

In [80]:
log_loss([0]*83 + [1]*17, [0.2] * 100)

0.45881359270459127

In [83]:
log_loss([0]*65 + [1]*35, [0.35] * 100)

0.64744663903463229

In [238]:
params = {}
params['objective'] = ('binary:logistic',)
# params['eval_metric'] = ('logloss',)
params['n_estimators'] = (1000)
# params['early_stopping_rounds'] = (50,)
params['learning_rate'] = (0.02, 0.05)
params['max_depth'] = (3,4,5)
params['subsample'] = (0.4, 0.6, 0.8, 1.0)
params['silent'] = (False, )
# params['dtrain'] = (d_train,)

In [239]:
xb = XGBClassifier()
gs = GridSearchCV(xb, params, scoring='neg_log_loss', cv=2) #cv=5)
gs.fit(x_train, y_train)
# gs.fit()
results = gs.cv_results_
for p, t_v, v_v in zip(results['params'], results['mean_train_score'], results['mean_test_score']):
    print('Params: ', p, ' MeanTrainScore', t_v, ' MeanTestScore: ', v_v)

Params:  {'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 10, 'objective': 'binary:logistic', 'silent': False, 'subsample': 0.4}  MeanTrainScore -0.607365356421  MeanTestScore:  -0.607459703669
Params:  {'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 10, 'objective': 'binary:logistic', 'silent': False, 'subsample': 0.6}  MeanTrainScore -0.607327092703  MeanTestScore:  -0.607432086984
Params:  {'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 10, 'objective': 'binary:logistic', 'silent': False, 'subsample': 0.8}  MeanTrainScore -0.607255700523  MeanTestScore:  -0.607353256244
Params:  {'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 10, 'objective': 'binary:logistic', 'silent': False, 'subsample': 1.0}  MeanTrainScore -0.607249691268  MeanTestScore:  -0.607361253471
Params:  {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 10, 'objective': 'binary:logistic', 'silent': False, 'subsample': 0.4}  MeanTrainScore -0.606367039733  MeanTestScore:  -0.60650978