https://www.kaggle.com/ericxu10101/quora-question-pairs/xgboost-tfidf-logloss-0-3

In [1]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
# import seaborn as sns
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV



In [32]:
df_train = pd.read_csv('../data/train.csv', nrows = 20000)
df_train = df_train.fillna(' ')
# df_test = pd.read_csv('../data/test.csv', nrows = 10000)
# df_test = df_test.fillna(' ')

# explore
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
# test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist()).astype(str)

stops = set(stopwords.words("english"))

In [33]:
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1'])*1.0 
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    return l1 / l2

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    return l1 / l2

def wc_diff_unique_stop(row):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row):
    l1 = len([x for x in set(row['question1']) if x not in stops])*1.0 
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    return l1 / l2

def same_start_word(row):
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_diff_unique_stop(row):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])

In [34]:
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [35]:
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

In [125]:
# df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
# df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())

In [36]:
eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [37]:
def build_features(data):
    X = pd.DataFrame()
    X['word_match'] = data.apply(word_match_share, axis=1, raw=True)
    X['tfidf_wm'] = data.apply(tfidf_word_match_share, axis=1, raw=True)
    X['tfidf_wm_stops'] = data.apply(tfidf_word_match_share_stops, axis=1, raw=True)
    X['jaccard'] = data.apply(jaccard, axis=1, raw=True)
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True)
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True)
    X['wc_diff_unq_stop'] = data.apply(wc_diff_unique_stop, axis=1, raw=True)
    X['same_start'] = data.apply(same_start_word, axis=1, raw=True)
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True)
    X['char_diff_unq_stop'] = data.apply(char_diff_unique_stop, axis=1, raw=True)
    
    return X

In [38]:
# model
# x_train = pd.DataFrame()
# x_train['word_match'] = df_train.apply(word_match_share, axis=1, raw=True)
# x_train['tfidf_wm'] = df_train.apply(tfidf_word_match_share, axis=1, raw=True)
# x_train['tfidf_wm_stops'] = df_train.apply(tfidf_word_match_share_stops, axis=1, raw=True)
# x_train['jaccard'] = df_train.apply(jaccard, axis=1, raw=True)
# x_train['wc_diff'] = df_train.apply(wc_diff, axis=1, raw=True)
# x_train['wc_diff_unique'] = df_train.apply(wc_diff_unique, axis=1, raw=True)
# x_train['wc_diff_unq_stop'] = df_train.apply(wc_diff_unique_stop, axis=1, raw=True)
# x_train['same_start'] = df_train.apply(same_start_word, axis=1, raw=True)
# x_train['char_diff'] = df_train.apply(char_diff, axis=1, raw=True)

x_train = build_features(df_train)
y_train = df_train['is_duplicate'].values

# x_test = build_features(df_test)



In [39]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

In [40]:
# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165

In [41]:
print(len(pos_train))
print(len(neg_train))
print(len(pos_train)/(len(pos_train)+len(neg_train)))

7474
12526
0.3737


In [42]:
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

0.19085801838610827


pos_train = pos_train.reindex(np.random.permutation(pos_train.index)).iloc[:int(.5*len(pos_train))]

In [43]:
x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
# del pos_train, neg_train

In [44]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [45]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4
params['subsample'] = 1.0

In [46]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

# d_test = xgb.DMatrix(x_test)
# p_test = bst.predict(d_test)
# sub = pd.DataFrame()
# sub['test_id'] = df_test['test_id']
# sub['is_duplicate'] = p_test

# print(sub.head())

[0]	train-logloss:0.68289	valid-logloss:0.682909
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.4396	valid-logloss:0.440946
[100]	train-logloss:0.376179	valid-logloss:0.378458
[150]	train-logloss:0.35478	valid-logloss:0.357888
[200]	train-logloss:0.34699	valid-logloss:0.350819
[250]	train-logloss:0.343113	valid-logloss:0.347598
[300]	train-logloss:0.340237	valid-logloss:0.345626
[350]	train-logloss:0.337643	valid-logloss:0.344093
[400]	train-logloss:0.335286	valid-logloss:0.342916
[450]	train-logloss:0.332969	valid-logloss:0.341773
[500]	train-logloss:0.330955	valid-logloss:0.340916
[550]	train-logloss:0.328763	valid-logloss:0.339974
[600]	train-logloss:0.326743	valid-logloss:0.339091
[650]	train-logloss:0.324554	valid-logloss:0.33833
[700]	train-logloss:0.322445	valid-logloss:0.337568
[750]	train-logloss:0.320684	valid-logloss:0.336964
[800]	train-logloss:0.3190

In [48]:
bst.save_model('DELETE_ME.mdl')

In [49]:
bst.predict(x_train)

AttributeError: 'DataFrame' object has no attribute 'feature_names'

In [31]:
xgb.cv(params, d_train, 200, 3, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.682853+4.24447e-05	test-logloss:0.682872+8.17598e-05
[50]	train-logloss:0.438522+0.00097408	test-logloss:0.439266+0.0020817
[100]	train-logloss:0.375334+0.0013622	test-logloss:0.376773+0.00253524
[150]	train-logloss:0.354615+0.00170072	test-logloss:0.356769+0.00237781


Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.682872,0.000082,0.682853,0.000042
1,0.672997,0.000160,0.672963,0.000083
2,0.663501,0.000235,0.663451,0.000124
3,0.654368,0.000307,0.654302,0.000163
4,0.645573,0.000377,0.645492,0.000201
5,0.637102,0.000441,0.637005,0.000240
6,0.628937,0.000505,0.628823,0.000278
7,0.621064,0.000565,0.620938,0.000312
8,0.613467,0.000622,0.613326,0.000347
9,0.606135,0.000673,0.605978,0.000384


In [19]:
bst.get_score()

{'char_diff': 1003,
 'char_diff_unq_stop': 923,
 'jaccard': 1722,
 'same_start': 221,
 'tfidf_wm': 3455,
 'tfidf_wm_stops': 2427,
 'wc_diff': 372,
 'wc_diff_unique': 428,
 'wc_diff_unq_stop': 439,
 'word_match': 2122}

In [40]:
pred_y = bst.predict(d_valid)

(50k, 1000 rounds)

* Original Rebalanced - .355
* O + jacc_nostpo - .348
* O + jacc + wc_diffs - .344
* O + jacc + wcd + tf_stp - .340
* O + jacc + wcd + tf_stp + char_sp - .340

(50k, 1000 rounds, 0.5 subsample)
* O + jacc + wc_diffs - .343

(50k, 1000 rounds, 1.0 subsample, max_depth 5)
* O + jacc + wc_diffs - .341

(50k, 1000 rounds, no rebalance)
* Original - .457
* O + jacc + wc_diffs - .441

In [80]:
log_loss([0]*83 + [1]*17, [0.2] * 100)

0.45881359270459127

In [83]:
log_loss([0]*65 + [1]*35, [0.35] * 100)

0.64744663903463229

In [31]:
params = {}
params['objective'] = ('binary:logistic',)
params['n_estimators'] = (1000,)
# params['early_stopping_rounds'] = (50,)
params['learning_rate'] = (0.02, 0.05)
params['max_depth'] = (3,4,5)
params['subsample'] = (0.4, 0.6, 0.8, 1.0)
params['silent'] = (False, )
# params['dtrain'] = (d_train,)

In [32]:
xb = XGBClassifier()
gs = GridSearchCV(xb, params, scoring='neg_log_loss', cv=2) #cv=5)
gs.fit(x_train, y_train)
# gs.fit()
results = gs.cv_results_
for p, t_v, v_v in zip(results['params'], results['mean_train_score'], results['mean_test_score']):
    print('Params: ', p, ' MeanTrainScore', t_v, ' MeanTestScore: ', v_v)

KeyboardInterrupt: 