In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
events = pd.read_csv('features_events.csv').set_index('user_id')
sessions = pd.read_csv('features_session.csv')
sessions['user_id'] = sessions['id_']
sessions = sessions.drop(['id_'], axis=1).set_index('user_id')
labels14 = pd.read_csv('labels14.csv').set_index('user_id')
labels7 = pd.read_csv('labels7.csv').set_index('user_id')

In [4]:
users = pd.read_csv('user_dict.csv', header=None)
inv_user_dict = {row[1][1]:row[1][0] for row in users.iterrows()}

In [5]:
data14 = labels14.join(events).join(sessions)
data7 = labels7.join(events).join(sessions)
y14 = data14['label']
X14 = data14.drop('label', axis=1)
y7 = data7['label']
X7 = data7.drop('label', axis=1)

In [6]:
X14_train, X14_test, y14_train, y14_test = train_test_split(X14, y14, test_size=0.2, random_state=42)
X7_train, X7_test, y7_train, y7_test = train_test_split(X7, y7, test_size=0.2, random_state=42)

In [33]:
X14_train=X14_train.fillna(0) 
X14_test=X14_test.fillna(0)
y14_train=y14_train.fillna(0)
y14_test=y14_test.fillna(0) 
X7_train=X7_train.fillna(0) 
X7_test=X7_test.fillna(0)
y7_train=y7_train.fillna(0)
y7_test=y7_test.fillna(0) 

### model - logistic regression 

In [115]:
# 14 days

In [61]:
param_grid = {'C': [0.001, 0.1, 10, 1000],
              'penalty':['l1','l2'],
              'tol':[0.0001, 0.001, 0.01]}
lr = LogisticRegression()

# should change the name to gs_lr_14
gs_lr_14 = GridSearchCV(estimator=lr, param_grid=param_grid,       
                     scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
gs_lr_14.fit(X14_train, y14_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  6.9min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.001, 0.1, 10, 1000], 'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [44]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y14_test, gs_dt.predict_proba(X14_test)[:,1])
auc(false_positive_rate, true_positive_rate)

0.919110067117026

In [116]:
# 7 days

In [45]:
param_grid = {'C': [0.001, 0.1, 10, 1000],
              'penalty':['l1','l2'],
              'tol':[0.0001, 0.001, 0.01]}
lr = LogisticRegression()

gs_lr_7 = GridSearchCV(estimator=lr, param_grid=param_grid,
                     scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
gs_lr_7.fit(X7_train, y7_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  7.1min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.001, 0.1, 10, 1000], 'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [46]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y14_test, gs_lr_7.predict_proba(X14_test)[:,1])
auc(false_positive_rate, true_positive_rate)

0.8680078741432048

In [200]:
gs_lr_7.best_params_

{'C': 0.001, 'penalty': 'l1', 'tol': 0.001}

In [201]:
gs_lr_14.best_params_

{'C': 10, 'penalty': 'l2', 'tol': 0.0001}

### Model - XGBoost

In [None]:
# 14 day model

In [118]:
max_depth = 7
scale_pos_weight = 100    # helps with unbalanced data
# eval_set = [(X_train, y_train), (X_test, y_test)]
xgb14 = XGBClassifier(max_depth=max_depth, scale_pos_weight=scale_pos_weight)
xgb14.fit(X14_train, y14_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=100, seed=None,
       silent=True, subsample=1)

In [119]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y14_test, xgb14.predict_proba(X14_test)[:,1])

In [120]:
auc(false_positive_rate, true_positive_rate)

0.9572667149215248

In [117]:
# 7 day model

In [68]:
max_depth = 7
scale_pos_weight = 100    # this is supposed to help with unbalanced data
# eval_set = [(X_train, y_train), (X_test, y_test)]
xgb7 = XGBClassifier(max_depth=max_depth, scale_pos_weight=scale_pos_weight)
xgb7.fit(X7_train, y7_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=100, seed=None,
       silent=True, subsample=1)

In [69]:
false_positive_rate7, true_positive_rate7, thresholds7 = roc_curve(y7_test, xgb7.predict_proba(X7_test)[:,1])

In [70]:
auc(false_positive_rate7, true_positive_rate7)

0.967188572279333

### Model  - Random Forest

In [80]:
clf = RandomForestClassifier()
param_grid = { 
    'n_estimators': [10, 20, 50],
    'max_depth' : [1,5,10],
    'min_samples_leaf': [1, 2, 4, 6]}
gs_rf_7 = GridSearchCV(clf, param_grid=param_grid, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
gs_rf_7.fit(X7_train, y7_train)


Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 12.5min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 20, 50], 'max_depth': [1, 5, 10], 'min_samples_leaf': [1, 2, 4, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [82]:
false_positive_rate7, true_positive_rate7, thresholds7 = roc_curve(y7_test, gs_rf_7.predict_proba(X7_test)[:,1])
auc(false_positive_rate7, true_positive_rate7)

0.9732613017514721

In [81]:
gs_rf_14 = GridSearchCV(clf, param_grid=param_grid, scoring='accuracy', cv=10, verbose=1, n_jobs=-1)
gs_rf_14.fit(X14_train, y14_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 12.6min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 20, 50], 'max_depth': [1, 5, 10], 'min_samples_leaf': [1, 2, 4, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [195]:
false_positive_rate7, true_positive_rate7, thresholds7 = roc_curve(y14_test, gs_rf_14.predict_proba(X14_test)[:,1])
auc(false_positive_rate7, true_positive_rate7)

0.9636539245630716

In [198]:
gs_rf_7.best_params_

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 50}

In [199]:
gs_rf_7.best_params_

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 50}

### model- stacking 1

In [166]:
# 7days
# https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/

In [196]:
# base models: rf, lr, xgb
# meta model: lr

In [62]:
def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X=x_train,y=y_train)
        train_pred=np.append(train_pred,model.predict(x_val))
        test_pred=np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred

In [105]:
train_pred_lr7 = pd.DataFrame(gs_lr_7.predict_proba(X7_train)[:,1])
test_pred_lr7 = pd.DataFrame(gs_lr_7.predict_proba(X7_test)[:,1])

In [106]:
train_pred_xgb7 = pd.DataFrame(xgb7.predict_proba(X7_train)[:,1])
test_pred_xgb7 = pd.DataFrame(xgb7.predict_proba(X7_test)[:,1])

In [107]:
train_pred_rf7 = pd.DataFrame(gs_rf_7.predict_proba(X7_train)[:,1])
test_pred_rf7 = pd.DataFrame(gs_rf_7.predict_proba(X7_test)[:,1])

In [202]:
df = pd.concat([train_pred_lr7, train_pred_xgb7, train_pred_rf7], axis=1)
df_test = pd.concat([test_pred_lr7, test_pred_xgb7, test_pred_rf7], axis=1)

meta_model_7 = LogisticRegression()
meta_model_7.fit(df,y7_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y7_test, meta_model_7.predict_proba(df_test)[:,1])
auc(false_positive_rate, true_positive_rate)



0.9684099407491156

In [114]:
# 14 days

In [121]:
train_pred_lr14 = pd.DataFrame(gs_lr_14.predict_proba(X7_train)[:,1])
test_pred_lr14 = pd.DataFrame(gs_lr_14.predict_proba(X7_test)[:,1])

In [122]:
train_pred_xgb14 = pd.DataFrame(xgb14.predict_proba(X7_train)[:,1])
test_pred_xgb14 = pd.DataFrame(xgb14.predict_proba(X7_test)[:,1])

In [123]:
train_pred_rf14 = pd.DataFrame(gs_rf_14.predict_proba(X7_train)[:,1])
test_pred_rf14 = pd.DataFrame(gs_rf_14.predict_proba(X7_test)[:,1])

In [203]:
df = pd.concat([train_pred_lr14, train_pred_xgb14, train_pred_rf14], axis=1)
df_test = pd.concat([test_pred_lr14, test_pred_xgb14, test_pred_rf14], axis=1)

meta_model_14 = LogisticRegression()
meta_model_14.fit(df,y7_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y14_test, meta_model_14.predict_proba(df_test)[:,1])
auc(false_positive_rate, true_positive_rate)



0.9585330618824868

In [146]:
# create "X_full" for this situation 

In [149]:
full7_rf = pd.DataFrame(gs_rf_7.predict_proba(X_full)[:,1])
full7_xgb = pd.DataFrame(xgb7.predict_proba(X_full)[:,1])
full7_lr = pd.DataFrame(gs_lr_7.predict_proba(X_full)[:,1])
full7 = pd.concat([full7_rf, full7_xgb, full7_lr], axis=1)

full14_rf = pd.DataFrame(gs_rf_14.predict_proba(X_full)[:,1])
full14_xgb = pd.DataFrame(xgb14.predict_proba(X_full)[:,1])
full14_lr = pd.DataFrame(gs_lr_14.predict_proba(X_full)[:,1])
full14 = pd.concat([full14_rf, full14_xgb, full14_lr], axis=1)


### model stacking 2

In [174]:
# 7 days

In [None]:
# base model: xgb, rf
# meta model: lr

In [177]:
# another stacking   
df = pd.concat([train_pred_xgb7, train_pred_rf7], axis=1)
df_test = pd.concat([test_pred_xgb7, test_pred_rf7], axis=1)

meta_model_7 = LogisticRegression()
meta_model_7.fit(df,y7_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y7_test, meta_model_7.predict_proba(df_test)[:,1])
auc(false_positive_rate, true_positive_rate)



0.9684800089329956

In [178]:
# 14 days

In [179]:
# another stacking 
df = pd.concat([train_pred_xgb14, train_pred_rf14], axis=1)
df_test = pd.concat([test_pred_xgb14, test_pred_rf14], axis=1)

meta_model_14 = LogisticRegression()
meta_model_14.fit(df,y7_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y14_test, meta_model_14.predict_proba(df_test)[:,1])
auc(false_positive_rate, true_positive_rate)



0.9585370386234808

In [186]:
# create X_full for this situation 
full7_rf = pd.DataFrame(gs_rf_7.predict_proba(X_full)[:,1])
full7_xgb = pd.DataFrame(xgb7.predict_proba(X_full)[:,1])
full7 = pd.concat([full7_rf, full7_xgb], axis=1)

full14_rf = pd.DataFrame(gs_rf_14.predict_proba(X_full)[:,1])
full14_xgb = pd.DataFrame(xgb14.predict_proba(X_full)[:,1])
full14 = pd.concat([full14_rf, full14_xgb], axis=1)

# Predict on full data for kaggle submission

In [181]:
events_full = pd.read_csv('features_events_full.csv')\
                .set_index('user_id')
sessions_full = pd.read_csv('features_session_full.csv')
                    
sessions_full['user_id'] = sessions_full['id_']
sessions_full = sessions_full.drop(['id_'], axis=1)\
                .drop_duplicates()\
                .groupby('user_id', as_index=False)\
                .mean()\
                .set_index('user_id')
X_full = events_full.join(sessions_full, how='left')

In [182]:
X_full=X_full.fillna(0) 

In [None]:
y14_pred = meta_model_14.predict_proba(X_full)
y7_pred = meta_model_7.predict_proba(X_full)

In [188]:
# if it's stacking, use these
y14_pred = meta_model_14.predict_proba(full7)
y7_pred = meta_model_7.predict_proba(full14)

In [189]:
predictions = pd.DataFrame({'user_id_hash':X_full.index,
                            'user_purchase_binary_7_days':y7_pred[:,1],
                            'user_purchase_binary_14_days':y14_pred[:,1]})

In [190]:
predictions['user_id_hash'] = predictions['user_id_hash'].map(inv_user_dict)
predictions.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,dfa54ccdb64bddfc2ea6a1da90e3a908cd9250bddfd6a8...,0.000276,0.000174
1,c4d7c49762e7fdfac7eaba9975d26c4bc555cab68a3c94...,0.000354,0.000189
2,ffb2f6b4dba62a448604b775a14acce44cd7dd5af33ec7...,0.00034,0.000198
3,f9d94bf4c5e6f44ab3623f589dc406dd32eb9b712ddfad...,0.000316,0.000185
4,233442a2c9452f0301f9a7b280ef077064ab98b8c88623...,0.002251,0.000791


In [191]:
samp_submission = pd.read_csv('sample_submission_2.csv')
samp_submission = samp_submission.rename(index=str, columns={"user_purchase_binary_7_days": "samp7",
                                                             "user_purchase_binary_14_days": "samp14"})
samp_submission.head()

Unnamed: 0,user_id_hash,samp7,samp14
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02


In [192]:
submission = pd.merge(samp_submission, predictions, how='left', on='user_id_hash')

In [193]:
submission = submission.drop(columns=['samp7', 'samp14'])
submission.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.000347,0.0002
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.000379,0.000188
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.000303,0.000191
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.000363,0.000187
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.000273,0.000174


In [197]:
# save to csv

In [194]:
submission.to_csv(path_or_buf='stacking_2.csv', index=False)