# Quora Duplicate Questions classification
###  Kaggle competition

## Model build

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from time import time

### Import files

In [2]:
X_train = pd.read_pickle('data/X_train.pkl')
Y_train = pd.read_pickle('data/Y_train.pkl')
X_test = pd.read_pickle('data/X_test.pkl')
Y_test = pd.read_pickle('data/Y_test.pkl')

### Perform Grid search for best parameters

In [25]:
models_to_test = {
    'random_forest': {
                    'estimator': RandomForestClassifier(n_estimators=100, n_jobs=-1)
                    ,'params': {'max_depth':[2,3,4], 'min_samples_leaf':[1,0.02,0.04], 'max_features':[None,'sqrt']}
                    }
    ,'gbm': {
                    'estimator': GradientBoostingClassifier(n_estimators=100)
                    ,'params': {'max_depth':[2, 3, 4 ], 'min_samples_leaf':[1,0.02,0.04], 'max_features':[None,'sqrt']}
                    }
    ,'lreg': {
                    'estimator': LogisticRegression()
                    ,'params': {'C':[1e-3,1e-2,1], 'penalty':['l1','l2']}
                    }
                }

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
res = []
t0 = time()
samp_size = X_train.shape[0] # 100000 # control the size on the input to control execution time
for key, val in models_to_test.items():
    print ('#'*20)
    clf = GridSearchCV(val['estimator'], param_grid = val['params'], cv = 3, scoring=make_scorer(log_loss, needs_proba=True))
    clf.fit(X_train.head(samp_size), Y_train.head(samp_size))
    print ('Model type: %s' % key)
    print (clf.best_params_)
    print (clf.best_score_)
    res.append(clf.cv_results_)
    print ('Time taken to search params for this model = %1.2f secs' % (time()-t0))
    t0 = time()

####################
Model type: gbm
{'max_features': 'sqrt', 'max_depth': 2, 'min_samples_leaf': 0.04}
0.504566370272
Time taken to search params for this model = 1066.97 secs
####################
Model type: lreg
{'penalty': 'l2', 'C': 0.001}
0.583792673481
Time taken to search params for this model = 41.70 secs
####################
Model type: random_forest
{'max_features': 'sqrt', 'max_depth': 2, 'min_samples_leaf': 0.02}
0.543954784866
Time taken to search params for this model = 717.59 secs


#### Best model chosen is the GBM model with the params stated above

In [30]:
%%time
clf = GradientBoostingClassifier(n_estimators=500, max_features='sqrt', max_depth = 2, min_samples_leaf = 0.04)
clf.fit(X_train, Y_train)

CPU times: user 1min 5s, sys: 3.07 s, total: 1min 8s
Wall time: 1min 8s


In [31]:
preds = clf.predict(X_test)
pred_proba = clf.predict_proba(X_test)
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
print('Accuracy = %1.5f' % accuracy_score(Y_test, preds))
print('ROC AUC = %1.5f' % roc_auc_score(Y_test, preds))
print('Log Loss = %1.5f' % log_loss(Y_test, pred_proba))

Accuracy = 0.71425
ROC AUC = 0.69328
Log Loss = 0.49555


### Generate predictions on the validation set

In [45]:
df_test = pd.read_pickle('data/df_test.pkl')
pred_proba = clf.predict_proba(df_test)

In [46]:
results = pd.DataFrame()
results['is_duplicate'] = pred_proba[:,1]
results.to_csv('submission.csv', index_label='test_id')