In [1]:
import pandas as pd
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# Labels for the two prediction horizons, keyed by user_id.
labels7 = pd.read_csv('labels7.csv').set_index('user_id')
labels14 = pd.read_csv('labels14.csv').set_index('user_id')

# Event-level features, keyed by user_id.
events = pd.read_csv('features_events.csv').set_index('user_id')

# Session-level features: the id column is called `id_` in this file, so copy it
# to `user_id`, discard `id_` and `is_first_session`, and index by user.
# NOTE(review): unlike the full-data cell further down, rows are not
# de-duplicated/averaged per user here -- confirm features_session.csv already
# holds one row per user, otherwise the joins below will repeat label rows.
sessions = (
    pd.read_csv('features_session.csv')
    .assign(user_id=lambda frame: frame['id_'])
    .drop(columns=['id_', 'is_first_session'])
    .set_index('user_id')
)

In [3]:
# The file has no header row, so columns are addressed by position: the lookup
# built here is keyed by column 1 and returns the matching column 0 value.
# It is used later to translate the prediction index back into `user_id_hash`.
users = pd.read_csv('data/user_dict.csv', header=None)
# Column-wise zip builds the same mapping as a row-by-row `iterrows()` loop
# without constructing a Series per row (later duplicates still win).
inv_user_dict = dict(zip(users[1], users[0]))

In [4]:
# Attach event and session features to each label table (left joins on the
# user_id index), then separate the `label` target from the feature columns.
data7 = labels7.join(events).join(sessions)
data14 = labels14.join(events).join(sessions)

X7, y7 = data7.drop(columns='label'), data7['label']
X14, y14 = data14.drop(columns='label'), data14['label']

In [5]:
# Hold out 20% of each dataset for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
split_kwargs = dict(test_size=0.2, random_state=42)

X14_train, X14_test, y14_train, y14_test = train_test_split(X14, y14, **split_kwargs)
X7_train, X7_test, y7_train, y7_test = train_test_split(X7, y7, **split_kwargs)

### Model 1 - XGBoost

In [6]:
# 14-day purchase model.
# `scale_pos_weight` up-weights the positive class to help with unbalanced data.
max_depth, scale_pos_weight = 7, 100

xgb14 = XGBClassifier(
    max_depth=max_depth,
    scale_pos_weight=scale_pos_weight,
)
xgb14.fit(X14_train, y14_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=100, seed=None,
       silent=True, subsample=1)

In [7]:
# ROC curve of the 14-day model on the held-out split, scored with the
# predicted probability of the positive class (column 1 of predict_proba).
# `roc_curve` is imported in the notebook's top import cell.
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y14_test,
    xgb14.predict_proba(X14_test)[:, 1],
)

In [8]:
# Area under the 14-day model's ROC curve (x = FPR, y = TPR).
auc(x=false_positive_rate, y=true_positive_rate)

0.9566629924287051

In [9]:
# 7 day model
max_depth = 7
scale_pos_weight = 100    # this is supposed to help with unbalanced data
# eval_set = [(X_train, y_train), (X_test, y_test)]
xgb7 = XGBClassifier(max_depth=max_depth, scale_pos_weight=scale_pos_weight)
xgb7.fit(X7_train, y7_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=100, seed=None,
       silent=True, subsample=1)

In [10]:
# ROC curve of the 7-day model on the held-out split, scored with the
# predicted probability of the positive class (column 1 of predict_proba).
# `roc_curve` is imported in the notebook's top import cell.
false_positive_rate7, true_positive_rate7, thresholds7 = roc_curve(
    y7_test,
    xgb7.predict_proba(X7_test)[:, 1],
)

In [11]:
# Area under the 7-day model's ROC curve (x = FPR, y = TPR).
auc(x=false_positive_rate7, y=true_positive_rate7)

0.9669524015567605

### Model 2 - Random Forest

In [12]:
# from sklearn.ensemble import RandomForestClassifier
# rf14 = RandomForestClassifier(class_weight='balanced')
# rf14.fit(X14_train, y14_train)

### Predict on full data for kaggle submission

In [80]:
# Event features for the full user population, keyed by user_id.
events_full = pd.read_csv('features_events_full.csv').set_index('user_id')

# Session features for the full population: copy `id_` to `user_id`, discard
# `id_` and `is_first_session`, remove exact duplicate rows, then average the
# remaining rows per user so there is a single row for each user_id.
sessions_full = (
    pd.read_csv('features_session_full.csv')
    .assign(user_id=lambda frame: frame['id_'])
    .drop(['id_', 'is_first_session'], axis=1)
    .drop_duplicates()
    .groupby('user_id', as_index=False)
    .mean()
    .set_index('user_id')
)

# Left join keeps every user that has event features, even without sessions.
X_full = events_full.join(sessions_full, how='left')

In [65]:
# Score every user in the full feature matrix with both models.
# Select each model's own training columns, in training order, so that the
# scoring frame cannot silently differ from what the model was fit on; a column
# missing from X_full raises a KeyError here instead of reaching the model.
y14_pred = xgb14.predict_proba(X_full[X14_train.columns])
y7_pred = xgb7.predict_proba(X_full[X7_train.columns])

In [66]:
# One row per scored user; column 1 of each predict_proba result is the
# positive-class probability. `user_id_hash` holds the X_full index for now.
predictions = pd.DataFrame(
    dict(
        user_id_hash=X_full.index,
        user_purchase_binary_7_days=y7_pred[:, 1],
        user_purchase_binary_14_days=y14_pred[:, 1],
    )
)

In [67]:
# Translate the id held in `user_id_hash` through `inv_user_dict`; ids missing
# from the dictionary become NaN.
hashed_ids = predictions['user_id_hash'].map(inv_user_dict)
predictions = predictions.assign(user_id_hash=hashed_ids)
predictions.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,dfa54ccdb64bddfc2ea6a1da90e3a908cd9250bddfd6a8...,0.002602,0.012577
1,c4d7c49762e7fdfac7eaba9975d26c4bc555cab68a3c94...,0.019203,0.06968
2,ffb2f6b4dba62a448604b775a14acce44cd7dd5af33ec7...,0.043153,0.062002
3,f9d94bf4c5e6f44ab3623f589dc406dd32eb9b712ddfad...,0.028472,0.042903
4,233442a2c9452f0301f9a7b280ef077064ab98b8c88623...,0.243211,0.350147


In [68]:
# Load the sample submission and rename its two probability columns so they do
# not collide with the model prediction columns when the frames are merged.
sample_column_aliases = {
    "user_purchase_binary_7_days": "samp7",
    "user_purchase_binary_14_days": "samp14",
}
samp_submission = (
    pd.read_csv('data/sample_submission_2.csv')
    .rename(index=str, columns=sample_column_aliases)
)
samp_submission.head()

Unnamed: 0,user_id_hash,samp7,samp14
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02


In [69]:
# Left merge keeps every row (and the row order) of the sample submission and
# attaches the model predictions by user hash.
submission = samp_submission.merge(predictions, on='user_id_hash', how='left')

In [74]:
# Rows of the sample submission with no match in `predictions` leave the left
# merge with NaN probabilities. Fall back to the sample-submission baseline
# values (`samp7` / `samp14`) for those rows, then drop the helper columns so
# only the hash and the two probability columns remain.
submission = (
    submission
    .assign(
        user_purchase_binary_7_days=lambda frame: frame['user_purchase_binary_7_days'].fillna(frame['samp7']),
        user_purchase_binary_14_days=lambda frame: frame['user_purchase_binary_14_days'].fillna(frame['samp14']),
    )
    .drop(columns=['samp7', 'samp14'])
)
submission.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.022761,0.034179
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.026022,0.077285
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.02499,0.029626
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.017702,0.064583
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.002762,0.007739


In [77]:
# Write the submission file without the DataFrame index column.
submission.to_csv('orange_bears_submission1.csv', index=False)