In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
df = pd.read_csv('./result/df_cleaned.csv')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380580 entries, 0 to 380579
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   InvoiceNo               380580 non-null  int64  
 1   StockCode               380580 non-null  object 
 2   Description             380580 non-null  object 
 3   Quantity                380580 non-null  int64  
 4   InvoiceDate             380580 non-null  object 
 5   UnitPrice               380580 non-null  float64
 6   CustomerID              380580 non-null  float64
 7   Country                 380580 non-null  object 
 8   TotalSales              380580 non-null  float64
 9   Country_isUK            380580 non-null  object 
 10  InvoiceYearMonth        380580 non-null  object 
 11  date                    380580 non-null  object 
 12  first_transaction_date  380580 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 37.7+ MB
None


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalSales,Country_isUK,InvoiceYearMonth,date,first_transaction_date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3,UK,2010-12,2010-12-01,2010-12-01
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,UK,2010-12,2010-12-01,2010-12-01
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0,UK,2010-12,2010-12-01,2010-12-01
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,UK,2010-12,2010-12-01,2010-12-01
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,UK,2010-12,2010-12-01,2010-12-01


In [44]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['date'] = pd.to_datetime(df['date'])
# Time splitting
n_days = 90
max_date = df['date'].max()
cutoff = max_date - pd.to_timedelta(n_days, unit='d')

temp_in = df[df['InvoiceDate'] <= cutoff]
temp_out = df[df['InvoiceDate'] > cutoff]

In [45]:
# Feature engienering (RFM)

targets = temp_out\
            .groupby('CustomerID')\
            .agg({'TotalSales': sum})\
            .rename({'TotalSales': 'spend_90_days'}, axis=1)\
            .assign(spend_90_days_flag = 1)

targets.head()

Unnamed: 0_level_0,spend_90_days,spend_90_days_flag
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1
12347.0,1294.32,1
12348.0,310.0,1
12349.0,1757.55,1
12352.0,944.23,1
12356.0,58.35,1


In [46]:
# Recency
recency = temp_in[['CustomerID', 'date']]\
            .groupby('CustomerID')\
            .apply(lambda x: (max_date - x['date'].max()) / pd.to_timedelta(1, 'day'))\
            .to_frame()\
            .set_axis(['recency'], axis=1)

recency.head()

Unnamed: 0_level_0,recency
CustomerID,Unnamed: 1_level_1
12346.0,316.0
12347.0,120.0
12348.0,239.0
12350.0,301.0
12352.0,253.0


In [47]:
# Frequency
frequency = temp_in[['CustomerID', 'date']]\
            .groupby('CustomerID')\
            .count()\
            .set_axis(['frequency'], axis=1)

frequency.head()

Unnamed: 0_level_0,frequency
CustomerID,Unnamed: 1_level_1
12346.0,1
12347.0,124
12348.0,28
12350.0,17
12352.0,38


In [48]:
# Monetary

monetary = temp_in\
            .groupby('CustomerID')\
            .agg({'TotalSales': ['sum']})\
            .set_axis(['monetary_sum'], axis=1)

monetary.head()

Unnamed: 0_level_0,monetary_sum
CustomerID,Unnamed: 1_level_1
12346.0,77183.6
12347.0,2790.86
12348.0,1487.24
12350.0,334.4
12352.0,1561.81


In [49]:
# Create variables model

variables = pd.concat([recency, frequency, monetary], axis=1)\
            .merge(targets,
                  left_index=True,
                  right_index=True,
                  how='left')\
            .fillna(0)

variables.head()

Unnamed: 0_level_0,recency,frequency,monetary_sum,spend_90_days,spend_90_days_flag
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346.0,316.0,1,77183.6,0.0,0.0
12347.0,120.0,124,2790.86,1294.32,1.0
12348.0,239.0,28,1487.24,310.0,1.0
12350.0,301.0,17,334.4,0.0,0.0
12352.0,253.0,38,1561.81,944.23,1.0


In [50]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

targets = ['spend_90_days', 'spend_90_days_flag']

X = variables.drop(targets, axis=1)
# scaler = StandardScaler()
# X_columns = X.columns
# X = scaler.fit_transform(X)
# X = pd.DataFrame(X, columns=X_columns)
# Regression
y_spending = variables['spend_90_days']
# Classification
y_prob = variables['spend_90_days_flag']

# ML for Regression
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42)

xgb_reg_model = GridSearchCV(estimator=xgb_reg,
                            param_grid=dict(learning_rate=[0.01, 0.1, 0.3, 0.5]),
                            scoring='neg_mean_absolute_error',
                            refit=True,
                            cv=5)

xgb_reg_model.fit(X, y_spending)

In [51]:
print("Best score: ", xgb_reg_model.best_score_)
print("Best params: ", xgb_reg_model.best_params_)
print("Best estimator: \n", xgb_reg_model.best_estimator_)

predictions_reg = xgb_reg_model.predict(X)

Best score:  -608.0696756089767
Best params:  {'learning_rate': 0.01}
Best estimator: 
 XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=42, ...)


In [52]:
# ML for classification
xgb_clf = XGBClassifier(objective='binary:logistic', random_state=42)

xgb_clf_model = GridSearchCV(estimator=xgb_clf,
                            param_grid=dict(learning_rate=[0.01, 0.1, 0.3, 0.5]),
                            scoring='roc_auc',
                            refit=True,
                            cv=5)
xgb_clf_model.fit(X, y_prob)

In [53]:
print("Best score: ", xgb_clf_model.best_score_)
print("Best params: ", xgb_clf_model.best_params_)
print("Best estimator: \n", xgb_clf_model.best_estimator_)

predictions_clf = xgb_clf_model.predict_proba(X)

Best score:  0.7220740783041857
Best params:  {'learning_rate': 0.01}
Best estimator: 
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)


In [54]:
# Feature important (regression)

imp_feat_reg = xgb_reg_model\
                .best_estimator_\
                .get_booster()\
                .get_score(importance_type='gain')

imp_feat_reg

{'recency': 290098656.0, 'frequency': 23109968.0, 'monetary_sum': 1254267392.0}

In [55]:
# Feature important (classification)

imp_feat_clf = xgb_clf_model\
                .best_estimator_\
                .get_booster()\
                .get_score(importance_type='gain')

imp_feat_clf

{'recency': 5.642875671386719,
 'frequency': 3.2551660537719727,
 'monetary_sum': 13.57374095916748}

In [56]:
df_predictions = pd.concat(
    [
        pd.DataFrame(predictions_reg).set_axis(['pred_spend'], axis=1),
        pd.DataFrame(predictions_clf)[[1]].set_axis(['pred_prob'], axis=1),
        variables.reset_index()
    ],
    axis=1
)

df_predictions.head()

Unnamed: 0,pred_spend,pred_prob,CustomerID,recency,frequency,monetary_sum,spend_90_days,spend_90_days_flag
0,0.302355,0.272002,12346.0,316.0,1,77183.6,0.0,0.0
1,673.155579,0.778029,12347.0,120.0,124,2790.86,1294.32,1.0
2,440.110168,0.585282,12348.0,239.0,28,1487.24,310.0,1.0
3,136.135818,0.349632,12350.0,301.0,17,334.4,0.0,0.0
4,470.776581,0.541683,12352.0,253.0,38,1561.81,944.23,1.0
