In [1]:
import numpy as np
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [2]:
features_train_nlp = pd.read_csv('data/nlp_stemmed_features_train.csv')
features_train_non_nlp = pd.read_csv('data/non_nlp_features_train.csv')
features_test_nlp = pd.read_csv('data/nlp_stemmed_features_test.csv')
features_test_non_nlp = pd.read_csv('data/non_nlp_features_test.csv')

features_train = pd.concat([features_train_nlp, features_train_non_nlp], axis=1)
features_test = pd.concat([features_test_nlp, features_test_non_nlp], axis=1)

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [4]:
# Rebalancing the Data.

x_train = pd.DataFrame()

x_train.head()

x_test = pd.DataFrame()

old_y_train = df_train['is_duplicate'].values

pos_train = features_train[old_y_train == 1]
neg_train = features_train[old_y_train == 0]
pos_train.head()
neg_train.head()

# Balance the positive cases and negative cases
p = 0.165
scale = (float(len(pos_train) / float(len(pos_train) + len(neg_train)) ) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

In [5]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [None]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 10

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

In [None]:
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-logloss:0.678215	valid-logloss:0.67833
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.556353	valid-logloss:0.557549
[20]	train-logloss:0.469778	valid-logloss:0.471935
[30]	train-logloss:0.406068	valid-logloss:0.409102
[40]	train-logloss:0.358047	valid-logloss:0.361921


In [None]:
d_test = xgb.DMatrix(features_test)
p_test = bst.predict(d_test)

p_test_df = pd.DataFrame({"test_id":df_test["test_id"], "is_duplicate":p_test.ravel()})
p_test_df = p_test_df.reindex(columns=['test_id','is_duplicate'])
p_test_df.to_csv('predictions/xgb_preds.csv', index=False)