# Model - Random Forest - combined user, product, user-product features

In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### Load training set

In [23]:
train_data = pd.read_csv('data/modelling/train_data_14_features.csv')
train_data.head()

Unnamed: 0,user_id,product_id,up_product_cnt,up_reorder_ratio,up_mean_cart_position,up_mean_days_between_orders,up_mean_order_dow,up_mean_order_hod,u_total_orders,u_mean_products,u_mean_days_between_orders,u_mean_order_dow,p_mean_cart_position,p_order_count,p_reorder_ratio,p_mean_days_between_orders,reordered
0,1,196,10,0.9,1.4,17.6,2.5,10.3,10,5.9,17.272727,2.636364,3.721774,35791,0.77648,11.195803,1.0
1,1,10258,9,0.888889,3.333333,19.555556,2.555556,10.555556,10,5.9,17.272727,2.636364,4.277492,1946,0.713772,11.099692,1.0
2,1,10326,1,0.0,5.0,28.0,4.0,15.0,10,5.9,17.272727,2.636364,4.191097,5526,0.652009,11.177705,0.0
3,1,12427,10,0.9,3.3,17.6,2.5,10.3,10,5.9,17.272727,2.636364,4.760037,6476,0.740735,9.955837,0.0
4,1,13032,3,0.666667,6.333333,21.666667,2.666667,8.0,10,5.9,17.272727,2.636364,5.622767,3751,0.657158,10.616636,1.0


### Establish Features for the model

In [24]:
# Set feature and target variables
X = train_data.drop(['reordered', 
                     'p_mean_days_between_orders', 
                     'up_mean_cart_position', 
                     'up_mean_order_hod', 'up_mean_order_dow'], axis=1)
y = train_data.reordered

# Create the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

### Train the model

In [26]:
# Instantiate the classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=123, class_weight='balanced')

# Fit the classifier to the train set
rf_clf.fit(X_train, y_train)

### Predict with Train set

In [27]:
y_train_pred = rf_clf.predict(X_train)

f1_score(y_train, y_train_pred)

0.999763424835886

#### Confusion Matrix

In [28]:
pd.crosstab(y_train, y_train_pred)

col_0,0.0,1.0
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5734481,0
1.0,294,621220


In [29]:
#ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize='all', cmap='GnBu')
pd.crosstab(y_train, y_train_pred, normalize='all').round(3)

col_0,0.0,1.0
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.902,0.0
1.0,0.0,0.098


#### Train Scores

In [30]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   5734481
         1.0       1.00      1.00      1.00    621514

    accuracy                           1.00   6355995
   macro avg       1.00      1.00      1.00   6355995
weighted avg       1.00      1.00      1.00   6355995



### Predict with Test set

In [31]:
y_test_pred = rf_clf.predict(X_test)

f1_score(y_test, y_test_pred)

0.1833224310792409

#### Confusion Matrix

In [32]:
pd.crosstab(y_test, y_test_pred)

col_0,0.0,1.0
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1897877,13479
1.0,185030,22280


In [33]:
# ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred, normalize='all', cmap='GnBu')
pd.crosstab(y_test, y_test_pred, normalize='all').round(3)

col_0,0.0,1.0
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.896,0.006
1.0,0.087,0.011


#### Confusion Matrix

In [34]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95   1911356
         1.0       0.62      0.11      0.18    207310

    accuracy                           0.91   2118666
   macro avg       0.77      0.55      0.57   2118666
weighted avg       0.88      0.91      0.88   2118666



### Cross-validation

In [37]:
from sklearn.model_selection import KFold, cross_val_score

kfold = KFold(n_splits=3, random_state=123, shuffle=True)

cv_results = cross_val_score(rf_clf, X, y, cv=kfold, scoring='f1', verbose=1)

print(cv_results.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.1817897789525047


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 68.5min finished
