In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('b_s_train.csv', index_col=['user_id', 'seller_id'])
test = pd.read_csv('b_s_test.csv', index_col=['user_id', 'seller_id'])

In [6]:
features = ['purchase', 'click', 'cart', 'click_buy_ratio', 'cart_buy_ratio',
            'purchase_last_m', 'click_last_m', 'cart_last_m',
            'click_recent', 'cart_recent', 'last_purchase', 'last_click', 'last_cart',
            'sentimentscore_pos',
            'outcome']
train = train[features]
test = test[features]

In [7]:
X_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values
X_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# random forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import RandomizedSearchCV

In [82]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [99]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv=2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


KeyboardInterrupt: 

In [None]:
rf_random.best_params_

In [114]:
rf = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=8, min_samples_split=10,
                            min_samples_leaf=4, max_features='auto')
rf.fit(X_train, y_train)
for threshold in np.linspace(0.04, 0.2, 15):
    predicted_proba = rf.predict_proba(X_test)
    y_pred = (predicted_proba [:,1] >= threshold).astype('int')
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print(threshold, f1+auc)

0.04 1.1549004685221784
0.05142857142857143 1.1738028450101843
0.06285714285714286 1.183420338741019
0.07428571428571429 1.1884960693288076
0.08571428571428572 1.1982976537799122
0.09714285714285714 1.200525748133727
0.10857142857142857 1.2065611542432149
0.12 1.217127607274374
0.13142857142857142 1.222161722972053
0.14285714285714285 1.2234449689018856
0.15428571428571428 1.220175074807921
0.16571428571428573 1.2202181830392775
0.17714285714285716 1.2201338055314026
0.18857142857142858 1.2206013223122723
0.2 1.2176669632478174


In [127]:
threshold = 0.11
predicted_proba = rf.predict_proba(X_test)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [128]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(acc, f1, auc)

0.9910846832737892 0.38097036058146116 0.8255457599279079


In [129]:
cm

array([[363506,   2756],
       [   523,   1009]])

# xgboost

In [10]:
import xgboost as xgb

In [14]:
random_grid = [
  {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': range(3, 10),
    'min_child_weight': range(11),
    'gamma': [i/10.0 for i in range(4)],
    'subsample': [i/10 for i in range(5, 9)],
    'colsample_bytree': [i/10.0 for i in range(1, 9)]
  }
]

In [15]:
xg = xgb.XGBClassifier(objective='binary:logistic')
xg_random = RandomizedSearchCV(estimator = xg, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
xg_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                           num_parallel_tree=None,
                            

[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=9, min_child_weight=10, n_estimators=100, subsample=0.7; total time=  22.1s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=  33.9s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150, subsample=0.7; total time=  13.9s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=8, n_estimators=100, subsample=0.5; total time=  18.5s
[CV] END colsample_bytree=0.1, gamma=0.0, learning_rate=0.01, max_depth=7, min_child_weight=4, n_estimators=50, subsample=0.6; total time=   3.4s
[CV] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.8; total time=  12.7s
[CV] END colsample_bytree=0.6, gamma=0.0, learning_rate=0.3, max_depth=9, min_child_weight=2, n_estimators=150, subsample=

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50, subsample=0.8; total time=   7.4s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=200, subsample=0.7; total time=  24.2s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=  34.0s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=8, n_estimators=100, subsample=0.5; total time=  18.2s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.5; total time=  14.1s
[CV] END colsample_bytree=0.1, gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.8; total time=  10.9s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=7, n_estimators=150, subsample=

[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=3, min_child_weight=6, n_estimators=200, subsample=0.5; total time=  20.9s
[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=  15.1s
[CV] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.01, max_depth=6, min_child_weight=9, n_estimators=100, subsample=0.7; total time=  10.7s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150, subsample=0.7; total time=  13.7s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.3, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8; total time=  16.9s
[CV] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50, subsample=0.7; total time=   5.2s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50, subsample=0.8; total time=   7.3s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=3, min_child_weight=6, n_estimators=200, subsample=0.5; total time=  20.9s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=  33.6s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.3, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8; total time=  16.9s
[CV] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50, subsample=0.7; total time=   5.0s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.5; total time=  13.8s
[CV] END colsample_bytree=0.1, gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.

In [16]:
xg_random.best_params_

{'subsample': 0.8,
 'n_estimators': 150,
 'min_child_weight': 4,
 'max_depth': 5,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.8}

In [136]:
xg = xgb.XGBClassifier(n_estimators=50, max_depth=5, learning_rate=0.2, 
                                gamma=0.1, colsample_bytree=0.2)
xg.fit(X_train, y_train)
for threshold in np.linspace(0.05, 0.2, 14):
    predicted_proba = xg.predict_proba(X_test)
    y_pred = (predicted_proba [:,1] >= threshold).astype('int')
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print(threshold, auc+f1)

0.05 1.1723890108387285
0.06153846153846154 1.181357815468047
0.07307692307692308 1.188692516930994
0.08461538461538462 1.193346236947984
0.09615384615384617 1.200894194798796
0.1076923076923077 1.2074285720835296
0.11923076923076925 1.2112733179346686
0.1307692307692308 1.2097857163585246
0.14230769230769233 1.2103577295919699
0.15384615384615385 1.2110761600277826
0.1653846153846154 1.2067247614367664
0.17692307692307696 1.2017539755289706
0.1884615384615385 1.202889484438249
0.2 1.2054177928076704


In [137]:
threshold = 0.1
predicted_proba = xg.predict_proba(X_test)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [138]:
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(acc, f1, auc)

0.9909677700016857 0.37766953915324086 0.8251620531653545


In [139]:
cm

array([[363464,   2798],
       [   524,   1008]])

In [106]:
threshold = 0.14
predicted_proba = xg.predict_proba(X_test)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [107]:
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(acc, f1, auc)

0.9921097135896725 0.3718614718614719 0.7773095767040196


In [108]:
cm

array([[364033,   2229],
       [   673,    859]])

In [1]:
import pandas as pd

In [3]:
log = pd.read_csv('log.csv')

In [5]:
len(set(log.user_id))

5567