https://www.kaggle.com/ericxu10101/quora-question-pairs/xgboost-tfidf-logloss-0-3

In [19]:
import functools

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.cross_validation import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return (len(wic) / len(uw))

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1'])*1.0 
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops])*1.0 
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_ratio(row):
    l1 = len(''.join(row['question1'])) 
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))


In [3]:
def word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [4]:
def build_features(data, stops, weights):
    X = pd.DataFrame()
    f = functools.partial(word_match_share, stops=stops)
    X['word_match'] = data.apply(f, axis=1, raw=True) #1

    f = functools.partial(tfidf_word_match_share, weights=weights)
    X['tfidf_wm'] = data.apply(f, axis=1, raw=True) #2

    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    X['tfidf_wm_stops'] = data.apply(f, axis=1, raw=True) #3

    X['jaccard'] = data.apply(jaccard, axis=1, raw=True) #4
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True) #5
    X['wc_ratio'] = data.apply(wc_ratio, axis=1, raw=True) #6
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True) #7
    X['wc_ratio_unique'] = data.apply(wc_ratio_unique, axis=1, raw=True) #8

    f = functools.partial(wc_diff_unique_stop, stops=stops)    
    X['wc_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #9
    f = functools.partial(wc_ratio_unique_stop, stops=stops)    
    X['wc_ratio_unique_stop'] = data.apply(f, axis=1, raw=True) #10

    X['same_start'] = data.apply(same_start_word, axis=1, raw=True) #11
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True) #12

    f = functools.partial(char_diff_unique_stop, stops=stops) 
    X['char_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #13

#     X['common_words'] = data.apply(common_words, axis=1, raw=True)  #14
    X['total_unique_words'] = data.apply(total_unique_words, axis=1, raw=True)  #15

    f = functools.partial(total_unq_words_stop, stops=stops)
    X['total_unq_words_stop'] = data.apply(f, axis=1, raw=True)  #16
    
    X['char_ratio'] = data.apply(char_ratio, axis=1, raw=True) #17    

    return X

## RUN HERE

In [68]:
df_train = pd.read_csv('../data/train_features.csv', encoding = "ISO-8859-1", nrows=50000)

In [69]:
df_test = pd.read_csv('../data/test_features.csv', encoding = "ISO-8859-1", nrows=50000)

### NORMAL RUN

In [70]:
x_train_ab = df_train.iloc[:, 2:-1]
x_train_ab = x_train_ab.drop('euclidean_distance', axis=1)
x_train_ab = x_train_ab.drop('jaccard_distance', axis=1)

In [71]:
df_train = pd.read_csv('../data/train.csv', nrows=50000)
df_train = df_train.fillna(' ')
# df_test = pd.read_csv('../data/test.csv')
# df_test = df_test.fillna(' ')

### LEAKY FEATURES

In [72]:
ques = pd.concat([df_train[['question1', 'question2']], \
        df_test[['question1', 'question2']]], axis=0).reset_index(drop='index')
ques.shape

from collections import defaultdict
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

In [74]:
df_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]

In [76]:
stops = set(stopwords.words("english"))

In [77]:
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

# df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
# df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())

In [78]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.iloc[:,:-1], df_train['is_duplicate'], test_size=0.1, random_state=4242)

In [79]:
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]

In [80]:
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0])

In [81]:
np.mean(y_train)

0.19157762233761033

In [82]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())
# test_qs = pd.Series(df_test['question1'].tolist() + df_test['question2'].tolist())

In [83]:
eps = 5000 
words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

print('Building Features')
x_train = build_features(df_train, stops, weights)
y_train = df_train['is_duplicate'].values

Building Features




In [87]:
x_train = pd.concat((x_train, x_train_ab, df_leaky), axis=1)

In [47]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

In [48]:
pl = len(pos_train)
nl = len(neg_train)
tl = pl+nl

In [49]:
print(pl,nl,tl)

18649 31351 50000


In [50]:
pl/tl

0.37298

In [51]:
ratio = pl/tl

18 / 100 = (18649 - x) / (tl-x)

* 18(tl) - 18x = 100(pl) - 100x
* x = (100pl - 18tl) / 82

In [52]:
int(pl - (100*pl - 18*tl)/82)

6881

In [53]:
int(pl - (pl - .18*tl)/.82)

6881

In [55]:
p = .18

In [56]:
int(pl - (pl - p*tl)/((1-p)))

6881

In [57]:
pos_train = pos_train.iloc[:6881]

In [58]:
len(pos_train) / (len(pos_train) + len(neg_train))

0.17998012136430216

In [59]:
ratio = len(pos_train) / (len(pos_train) + len(neg_train))
scale = (ratio-.2) / ratio
pos_train = pos_train.iloc[:int(scale*len(pos_train))]

In [60]:
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

0.08930553121897085


In [61]:
x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=4242)

In [62]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4
params['subsample'] = 1.0
params['base_score'] = 0.2

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

In [63]:
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [64]:
#Leaky Feature
bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.343169	valid-logloss:0.346046
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.251424	valid-logloss:0.257367
[100]	train-logloss:0.221908	valid-logloss:0.22971
[150]	train-logloss:0.209939	valid-logloss:0.219182
[200]	train-logloss:0.204776	valid-logloss:0.214933
[250]	train-logloss:0.201402	valid-logloss:0.212516
[300]	train-logloss:0.198748	valid-logloss:0.21075
[350]	train-logloss:0.196179	valid-logloss:0.20912
[400]	train-logloss:0.193965	valid-logloss:0.207758
[450]	train-logloss:0.191804	valid-logloss:0.206823
[500]	train-logloss:0.189967	valid-logloss:0.206119
[550]	train-logloss:0.188184	valid-logloss:0.205529
[600]	train-logloss:0.186579	valid-logloss:0.205105
[650]	train-logloss:0.18519	valid-logloss:0.20486
[700]	train-logloss:0.18364	valid-logloss:0.204358
[750]	train-logloss:0.182344	valid-logloss:0.203884
[800]	train-logloss:0.1811

In [120]:
bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.483366	valid-logloss:0.485115
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.382709	valid-logloss:0.38593
[100]	train-logloss:0.35588	valid-logloss:0.360264
[150]	train-logloss:0.345449	valid-logloss:0.350679
[200]	train-logloss:0.340131	valid-logloss:0.346118
[250]	train-logloss:0.336469	valid-logloss:0.34306
[300]	train-logloss:0.332899	valid-logloss:0.340495
[350]	train-logloss:0.329261	valid-logloss:0.337549
[400]	train-logloss:0.326207	valid-logloss:0.335198
[450]	train-logloss:0.323111	valid-logloss:0.332813
[500]	train-logloss:0.32091	valid-logloss:0.331251
[550]	train-logloss:0.318263	valid-logloss:0.329436
[600]	train-logloss:0.316027	valid-logloss:0.327864
[650]	train-logloss:0.31382	valid-logloss:0.326361
[700]	train-logloss:0.311839	valid-logloss:0.325062
[750]	train-logloss:0.310278	valid-logloss:0.323903
[800]	train-logloss:0.308

In [98]:
bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.682852	valid-logloss:0.683012
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.437888	valid-logloss:0.441379
[100]	train-logloss:0.373792	valid-logloss:0.378204
[150]	train-logloss:0.352147	valid-logloss:0.357048
[200]	train-logloss:0.343269	valid-logloss:0.34886
[250]	train-logloss:0.338612	valid-logloss:0.344828
[300]	train-logloss:0.335222	valid-logloss:0.342007
[350]	train-logloss:0.331766	valid-logloss:0.339435
[400]	train-logloss:0.328657	valid-logloss:0.33693
[450]	train-logloss:0.325714	valid-logloss:0.334532
[500]	train-logloss:0.323026	valid-logloss:0.332328
[550]	train-logloss:0.320594	valid-logloss:0.330683
[600]	train-logloss:0.318377	valid-logloss:0.329208
[650]	train-logloss:0.316131	valid-logloss:0.327679
[700]	train-logloss:0.314196	valid-logloss:0.326426
[750]	train-logloss:0.312286	valid-logloss:0.32499
[800]	train-logloss:0.3

In [124]:
scores = bst.get_score()

In [125]:
for x in x_train.columns:
    if x in scores:
        print(x, scores[x])
    else:
        print(x, None)

word_match 1238
tfidf_wm 1267
tfidf_wm_stops 907
jaccard 544
wc_diff 116
wc_ratio 156
wc_diff_unique 97
wc_ratio_unique 71
wc_diff_unq_stop 39
wc_ratio_unique_stop 164
same_start 166
char_diff 158
char_diff_unq_stop 226
total_unique_words 222
total_unq_words_stop 323
char_ratio 223
char_ratio2 None
len_q1 396
len_q2 294
diff_len 100
len_char_q1 324
len_char_q2 264
len_word_q1 167
len_word_q2 321
common_words 105
fuzz_qratio 505
fuzz_WRatio 96
fuzz_partial_ratio 420
fuzz_partial_token_set_ratio None
fuzz_partial_token_sort_ratio 303
fuzz_token_set_ratio 364
fuzz_token_sort_ratio 616
wmd 482
norm_wmd 712
cosine_distance 301
cityblock_distance 171
canberra_distance 304
minkowski_distance 202
braycurtis_distance 181
skew_q1vec 502
skew_q2vec 457
kur_q1vec 565


In [81]:
bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.683619	valid-logloss:0.683446
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.457441	valid-logloss:0.452386
[100]	train-logloss:0.398939	valid-logloss:0.391836
[150]	train-logloss:0.38042	valid-logloss:0.372674
[200]	train-logloss:0.373827	valid-logloss:0.365809
[250]	train-logloss:0.370998	valid-logloss:0.363437
[300]	train-logloss:0.369559	valid-logloss:0.362555
[350]	train-logloss:0.368674	valid-logloss:0.362182
[400]	train-logloss:0.367655	valid-logloss:0.361998
[450]	train-logloss:0.366358	valid-logloss:0.36189
Stopping. Best iteration:
[442]	train-logloss:0.366551	valid-logloss:0.361871



In [56]:
bst.get_score()

{'char_diff': 1708,
 'char_diff_unq_stop': 1431,
 'jaccard': 3887,
 'same_start': 327,
 'tfidf_wm': 5,
 'tfidf_wm_stops': 48,
 'wc_diff': 821,
 'wc_diff_unique': 771,
 'wc_diff_unq_stop': 561,
 'word_match': 3003}

In [121]:
pred = bst.predict(d_valid)

In [122]:
accuracy_score(y_valid, (pred > 0.5).astype(int))

0.83283857098986591

In [146]:
log_loss(y_valid, bst.predict(d_valid))

0.31904054899189865

(50k, 1000 rounds)

* Original Rebalanced - .355
* O + jacc_nostpo - .348
* O + jacc + wc_diffs - .344
* O + jacc + wcd + tf_stp - .340
* O + jacc + wcd + tf_stp + char_sp - .340

* 1-5, 7, 9, 11-13   - .3492
* 1-13 .3478
* 1-16 - .355    (TEST .336)

(50k, 1000 rounds, 0.5 subsample)
* O + jacc + wc_diffs - .343

(50k, 1000 rounds, 1.0 subsample, max_depth 5)
* O + jacc + wc_diffs - .341

(50k, 1000 rounds, no rebalance)
* Original - .457
* O + jacc + wc_diffs - .441

In [31]:
xgb.cv(params, d_train, 200, 3, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.682853+4.24447e-05	test-logloss:0.682872+8.17598e-05
[50]	train-logloss:0.438522+0.00097408	test-logloss:0.439266+0.0020817
[100]	train-logloss:0.375334+0.0013622	test-logloss:0.376773+0.00253524
[150]	train-logloss:0.354615+0.00170072	test-logloss:0.356769+0.00237781


Unnamed: 0,test-logloss-mean,test-logloss-std,train-logloss-mean,train-logloss-std
0,0.682872,0.000082,0.682853,0.000042
1,0.672997,0.000160,0.672963,0.000083
2,0.663501,0.000235,0.663451,0.000124
3,0.654368,0.000307,0.654302,0.000163
4,0.645573,0.000377,0.645492,0.000201
5,0.637102,0.000441,0.637005,0.000240
6,0.628937,0.000505,0.628823,0.000278
7,0.621064,0.000565,0.620938,0.000312
8,0.613467,0.000622,0.613326,0.000347
9,0.606135,0.000673,0.605978,0.000384


In [19]:
bst.get_score()

{'char_diff': 1003,
 'char_diff_unq_stop': 923,
 'jaccard': 1722,
 'same_start': 221,
 'tfidf_wm': 3455,
 'tfidf_wm_stops': 2427,
 'wc_diff': 372,
 'wc_diff_unique': 428,
 'wc_diff_unq_stop': 439,
 'word_match': 2122}

(50k, 1000 rounds)

* Original Rebalanced - .355
* O + jacc_nostpo - .348
* O + jacc + wc_diffs - .344
* O + jacc + wcd + tf_stp - .340
* O + jacc + wcd + tf_stp + char_sp - .340

(50k, 1000 rounds, 0.5 subsample)
* O + jacc + wc_diffs - .343

(50k, 1000 rounds, 1.0 subsample, max_depth 5)
* O + jacc + wc_diffs - .341

(50k, 1000 rounds, no rebalance)
* Original - .457
* O + jacc + wc_diffs - .441

In [80]:
log_loss([0]*83 + [1]*17, [0.2] * 100)

0.45881359270459127

In [83]:
log_loss([0]*65 + [1]*35, [0.35] * 100)

0.64744663903463229

In [31]:
params = {}
params['objective'] = ('binary:logistic',)
params['n_estimators'] = (1000,)
# params['early_stopping_rounds'] = (50,)
params['learning_rate'] = (0.02, 0.05)
params['max_depth'] = (3,4,5)
params['subsample'] = (0.4, 0.6, 0.8, 1.0)
params['silent'] = (False, )
# params['dtrain'] = (d_train,)

In [32]:
xb = XGBClassifier()
gs = GridSearchCV(xb, params, scoring='neg_log_loss', cv=2) #cv=5)
gs.fit(x_train, y_train)
# gs.fit()
results = gs.cv_results_
for p, t_v, v_v in zip(results['params'], results['mean_train_score'], results['mean_test_score']):
    print('Params: ', p, ' MeanTrainScore', t_v, ' MeanTestScore: ', v_v)

KeyboardInterrupt: 