In [11]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cycler import cycler

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import xgboost as xgb ## install xgboost if not installed

%matplotlib inline

# Set the color cycle for all plots from the qualitative "Set2" colormap.
# "Set2" defines 8 colors; an integer index past the last entry is clipped to
# the final color, so iterate over the colormap's own length (`N`) instead of a
# hard-coded count to avoid a duplicated color at the end of the cycle.
set2_cmap = plt.get_cmap("Set2")
plt.rcParams["axes.prop_cycle"] = cycler(
    color=[set2_cmap(i) for i in range(set2_cmap.N)]
)

# Silence FutureWarning messages so they do not clutter the cell outputs.
import warnings
warnings.simplefilter('ignore', FutureWarning)

## Overview

- Question
    - **Which features contribute to customers becoming repeaters?**
    - **Can we predict if a customer will be a repeater?**
    - This task is binary classification, using the `repeater` label in `df_train`.
 
- Current situation
    - Repeaters constitute only 3% of the total.
    - The dataset is imbalanced.
    - I plan to use `SMOTE` to address this issue.
 
- Models
    - Logistic regression
    - Random forest
    - XGBoost

- Possible reasons for customers not becoming repeaters (my hypothesis):
    - Dissatisfaction with the initial purchase experience (late product delivery, issues with the product or packaging based on customer reviews/score)
    - Customers used Olist because they had a promotion; without one, they do not come back. The question is why the promotion did not lead to a second purchase.
    - The time span between the first and second purchase for repeaters
    - Differences in the categories of products purchased (ex: repeaters buy daily necessities, while others buy gadgets?)

## 1. Load data

In [12]:
from utils import get_df_description

# Load the training table (path is relative to the notebook's working directory).
df = pd.read_csv('../datasets/df_train.csv')
print(df.shape)

# Load the column-description metadata. A context manager is used so the file
# handle is closed deterministically instead of being left open until garbage
# collection.
with open('df_train_description.json', 'r') as description_file:
    description_json = json.load(description_file)
description_json_df = pd.DataFrame(description_json)

# Build the per-column description table with the helper from the local `utils`
# module and display it as the cell's output.
df_description = get_df_description(df, description_json_df)
df_description

(96095, 25)


Unnamed: 0,column,dtype,missing_values,source,description
0,customer_unique_id,object,0,df_customers,PK
1,frequency,int64,0,Calculated,number of orders
2,repeater,int64,0,Calculated,"1:repeater, 0:non-repeater"
3,fo_order_id,object,0,df_orders,FK
4,fo_customer_id,object,0,df_orders,FK (first order's customer_id)
5,fo_order_status,object,0,df_orders,
6,fo_order_purchase_timestamp,object,0,df_orders,
7,fo_order_approved_at,object,0,df_orders,
8,fo_order_delivered_carrier_date,object,0,df_orders,
9,fo_order_delivered_customer_date,object,0,df_orders,


## 2. Prepare df_numeric

In [13]:
## Build the numeric-only modelling table.
## NOTE(review): dropna() discards every row that has a NaN in any numeric
## column. Per the recorded outputs this shrinks the data from 96095 to 3456
## rows, and every displayed row has fo_voucher_used == 1 -- TODO confirm that
## this filtering is intended.
df_numeric = (
    df.select_dtypes(include=['number'])
    .dropna()
    .drop(columns=['customer_zip_code_prefix'])  ## zip prefix is qualitative
)
print(df_numeric.shape)
df_numeric.head()

(3456, 12)


Unnamed: 0,frequency,repeater,recency,monetary,fo_payment_value,fo_is_daytime,fo_is_weekday,fo_delivery_delay_days,fo_review_score_mean,fo_voucher_payment_value,fo_voucher_used,customer_in_sao_paulo
19,1,0,114,78.42,78.42,1,1,-15.0,3.0,69.89,1,0
55,1,0,308,354.87,354.87,1,1,-6.0,3.0,293.54,1,0
73,1,0,482,109.78,109.78,1,1,-17.0,5.0,31.73,1,0
165,1,0,113,79.51,79.51,1,1,-21.0,5.0,79.51,1,0
232,1,0,289,66.91,66.91,1,0,-20.0,4.0,43.92,1,0


In [14]:
## Predictors keep only first-purchase information: the label and the
## RFM-style columns (frequency / recency / monetary) are removed from X.
X = df_numeric.drop(columns=['repeater', 'frequency', 'recency', 'monetary'])

## Target: the binary repeater flag.
y = df_numeric['repeater']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (3456, 8)
y shape: (3456,)


## Logistic Regression

In [15]:
# Stratified 5-fold splitter with shuffling; the fixed seed keeps folds reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE oversampling followed by logistic regression, assembled as a single
# imblearn pipeline so the resampler travels with the estimator.
pipeline = make_pipeline(
    SMOTE(random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
)

# Score the pipeline fold by fold.
# NOTE(review): accuracy can be misleading on an imbalanced target; consider
# also reporting a minority-class metric (e.g. recall / F1 for repeaters).
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='accuracy', cv=skf)

print("Logistic Regression Model\n")
print(f"Cross-Validation Accuracy Scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean()}")
print(f"Standard Deviation in CV Accuracy: {scores.std()}")


Logistic Regression Model

Cross-Validation Accuracy Scores: [0.64450867 0.71924747 0.56729378 0.63965268 0.74095514]
Mean CV Accuracy: 0.6623315459709059
Standard Deviation in CV Accuracy: 0.06210516039499559


**Interpretation**
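- **Mean CV Accuracy**: The average accuracy across the five folds is approximately 66.23%, indicating that, on average, the model correctly predicts whether a customer will be a repeater about two-thirds of the time. Note that because repeaters are a small minority (about 3% of customers overall), accuracy is dominated by the non-repeater class and does not by itself show how well repeaters are identified.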
- **Variability**: The standard deviation of the CV accuracy scores is about 6.21%, showing a moderate variation in the model's performance across different folds. This variation can be attributed to the diversity of data in each fold or intrinsic differences in the difficulty of predicting certain subsets of the data.
- **Range of Accuracy Scores**: The accuracy scores across folds range from approximately 56.73% to 74.10%, highlighting the inconsistent performance of the model across different segments of the dataset.

In [16]:
## Which features carry the largest logistic-regression coefficients?

# Refit the SMOTE + logistic-regression pipeline on all rows, then pull out the
# fitted classifier step.
pipeline.fit(X, y)
lr_model = pipeline.named_steps['logisticregression']

feature_names = X.columns
coefficients = lr_model.coef_[0]  # first row of the coefficient matrix

# NOTE(review): the pipeline has no scaling step, so coefficient magnitudes
# depend on each feature's units and are not directly comparable.
feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Coefficient': coefficients}
).sort_values(by='Coefficient', ascending=False)

print(feature_importance)

                    Feature  Coefficient
6           fo_voucher_used     0.366277
4      fo_review_score_mean     0.112921
5  fo_voucher_payment_value     0.003562
7     customer_in_sao_paulo     0.000000
3    fo_delivery_delay_days    -0.002865
0          fo_payment_value    -0.004496
2             fo_is_weekday    -0.428268
1             fo_is_daytime    -0.848081


**Interpretation**
- `fo_voucher_used`
  - The most influential positive feature, suggesting that customers who used a voucher in their first order are more likely to repeat.
- `fo_is_weekday`
  - Customers who make their first order on a weekday are less likely to repeat, with a significant negative influence.
- `fo_is_daytime`
  - Similar to fo_is_weekday, orders made during daytime have a notably negative effect on repeat probability.
- `customer_in_sao_paulo`
  - Shows no effect

**Next Steps**
- Although `fo_is_weekday` and `fo_is_daytime` are encoded as binary features, a more detailed encoding by hour of day or day of the week could improve the performance of the model.
- We can also drop `customer_in_sao_paulo` as it has no effect on the repeater probability.
- Add interaction terms

## Random Forest

In [17]:
# Same stratified, shuffled 5-fold splitter (seeded) as used for the other models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE oversampling followed by a 100-tree random forest; the step names are
# explicit because the fitted forest is later looked up by name.
pipeline_rf = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fold-by-fold accuracy.
# NOTE(review): accuracy can be misleading on an imbalanced target; consider
# also reporting a minority-class metric.
scores_rf = cross_val_score(estimator=pipeline_rf, X=X, y=y, scoring='accuracy', cv=skf)

# Report the per-fold scores with their mean and spread.
print("Random Forest\n")
print(f"Cross-Validation Accuracy Scores: {scores_rf}")
print(f"Mean CV Accuracy: {scores_rf.mean()}")
print(f"Standard Deviation in CV Accuracy: {scores_rf.std()}")

Random Forest

Cross-Validation Accuracy Scores: [0.90462428 0.91895803 0.9276411  0.91895803 0.91027496]
Mean CV Accuracy: 0.9160912809616623
Standard Deviation in CV Accuracy: 0.007939226059490222


**Interpretation**
- The mean accuracy (approximately 91.61%) is higher than Logistic Regression's (approximately 66.23%).
  - The main reason is likely that Random Forest is a non-linear model. We have some binary features, so it can perform better than Logistic Regression.

In [18]:
## Which features does the random forest rely on most?

# Refit the SMOTE + random-forest pipeline on all rows and grab the fitted forest.
pipeline_rf.fit(X, y)
rf_model = pipeline_rf.named_steps['random_forest']

importances = rf_model.feature_importances_

# Tabulate importance per feature, largest first, with a clean 0..n-1 index.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

print(feature_importances)

                    Feature  Importance
0          fo_payment_value    0.241995
1    fo_delivery_delay_days    0.240965
2  fo_voucher_payment_value    0.240364
3      fo_review_score_mean    0.230956
4             fo_is_daytime    0.024553
5             fo_is_weekday    0.021168
6           fo_voucher_used    0.000000
7     customer_in_sao_paulo    0.000000


**Interpretation**
- No single feature dominates the model: the top four features have similar importances (about 0.23–0.24 each).
- Again, customer_in_sao_paulo has no effect.

**Next Steps**
- Drop `customer_in_sao_paulo` and `fo_voucher_used`.

## XGBoost

In [19]:
# Same stratified, shuffled 5-fold splitter (seeded) as used for the other models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE oversampling followed by an XGBoost classifier; the step names are
# explicit because the fitted booster is later looked up by name.
# NOTE(review): `use_label_encoder` is presumably deprecated or ignored in
# recent xgboost releases -- confirm against the installed version.
pipeline_xgb = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgboost', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
])

# Fold-by-fold accuracy.
# NOTE(review): accuracy can be misleading on an imbalanced target; consider
# also reporting a minority-class metric.
scores_xgb = cross_val_score(estimator=pipeline_xgb, X=X, y=y, scoring='accuracy', cv=skf)

# Report the per-fold scores with their mean and spread.
print("XGBoost\n")
print(f"Cross-Validation Accuracy Scores: {scores_xgb}")
print(f"Mean CV Accuracy: {scores_xgb.mean()}")
print(f"Standard Deviation in CV Accuracy: {scores_xgb.std()}")

XGBoost

Cross-Validation Accuracy Scores: [0.94364162 0.93777135 0.94356006 0.93632417 0.94645441]
Mean CV Accuracy: 0.9415503208050658
Standard Deviation in CV Accuracy: 0.0038485328160467206


**Interpretation**
- The mean accuracy (approximately 94.16%) is higher than Random Forest's (approximately 91.61%).

In [20]:
## Which features does the XGBoost model rely on most?

# Refit the SMOTE + XGBoost pipeline on all rows and grab the fitted classifier.
pipeline_xgb.fit(X, y)
xgb_model = pipeline_xgb.named_steps['xgboost']

importances = xgb_model.feature_importances_

# Tabulate importance per feature, largest first, with a clean 0..n-1 index.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

print(feature_importances)

                    Feature  Importance
0      fo_review_score_mean    0.565927
1    fo_delivery_delay_days    0.164186
2             fo_is_daytime    0.079539
3             fo_is_weekday    0.068734
4          fo_payment_value    0.063925
5  fo_voucher_payment_value    0.057689
6           fo_voucher_used    0.000000
7     customer_in_sao_paulo    0.000000


**Interpretation**
- `fo_review_score_mean` is the most important feature.
- As with Random Forest, `customer_in_sao_paulo` and `fo_voucher_used` have zero importance.