In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../../data/marketing_campaign.csv', sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [3]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

In [4]:
df.isnull().sum() / df.shape[0]

ID                     0.000000
Year_Birth             0.000000
Education              0.000000
Marital_Status         0.000000
Income                 0.010714
Kidhome                0.000000
Teenhome               0.000000
Dt_Customer            0.000000
Recency                0.000000
MntWines               0.000000
MntFruits              0.000000
MntMeatProducts        0.000000
MntFishProducts        0.000000
MntSweetProducts       0.000000
MntGoldProds           0.000000
NumDealsPurchases      0.000000
NumWebPurchases        0.000000
NumCatalogPurchases    0.000000
NumStorePurchases      0.000000
NumWebVisitsMonth      0.000000
AcceptedCmp3           0.000000
AcceptedCmp4           0.000000
AcceptedCmp5           0.000000
AcceptedCmp1           0.000000
AcceptedCmp2           0.000000
Complain               0.000000
Z_CostContact          0.000000
Z_Revenue              0.000000
Response               0.000000
dtype: float64

In [5]:
df = df.dropna()
df.shape

(2216, 29)

# Data processing

In [6]:
df['Z_CostContact'].describe()

count    2216.0
mean        3.0
std         0.0
min         3.0
25%         3.0
50%         3.0
75%         3.0
max         3.0
Name: Z_CostContact, dtype: float64

In [7]:
df['Z_Revenue'].describe()

count    2216.0
mean       11.0
std         0.0
min        11.0
25%        11.0
50%        11.0
75%        11.0
max        11.0
Name: Z_Revenue, dtype: float64

In [8]:
for col in df.select_dtypes('object').columns:
    display(df[col].value_counts(normalize=True))

Graduation    0.503610
PhD           0.217058
Master        0.164711
2n Cycle      0.090253
Basic         0.024368
Name: Education, dtype: float64

Married     0.386733
Together    0.258574
Single      0.212545
Divorced    0.104693
Widow       0.034296
Alone       0.001354
Absurd      0.000903
YOLO        0.000903
Name: Marital_Status, dtype: float64

In [9]:
df['Education'] = df['Education'].map({
    'Basic': 0,
    '2n Cycle': 1,
    'Graduation': 2,
    'Master': 3,
    'PhD': 4
})

In [10]:
df_model = pd.get_dummies(df, columns=['Marital_Status'], drop_first=False)
df_model = df_model.drop(columns=[
    'Marital_Status_Alone',
    'Z_CostContact', 'Z_Revenue'])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_model['Dt_Customer'] = scaler.fit_transform(df_model['Dt_Customer'].astype(int).values.reshape(-1, 1))

In [11]:
# # Reconvert dates
# pd.to_datetime(
#     scaler.inverse_transform(
#         df_model['Dt_Customer'].astype(float).values.reshape(-1, 1)
#         )[:, 0].astype(int)
#     )

In [12]:
df_model

Unnamed: 0,ID,Year_Birth,Education,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,AcceptedCmp2,Complain,Response,Marital_Status_Absurd,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO
0,5524,1957,2,58138.0,0,0,0.051502,58,635,88,...,0,0,1,0,0,0,1,0,0,0
1,2174,1954,2,46344.0,1,1,0.838340,38,11,1,...,0,0,0,0,0,0,1,0,0,0
2,4141,1965,2,71613.0,0,0,0.553648,26,426,49,...,0,0,0,0,0,0,0,1,0,0
3,6182,1984,2,26646.0,1,0,0.801144,26,11,4,...,0,0,0,0,0,0,0,1,0,0
4,5324,1981,4,58293.0,1,0,0.769671,94,173,43,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,2,61223.0,0,1,0.454936,46,709,43,...,0,0,0,0,0,1,0,0,0,0
2236,4001,1946,4,64014.0,2,1,0.972818,56,406,0,...,0,0,0,0,0,0,0,1,0,0
2237,7270,1981,2,56981.0,0,0,0.778255,91,908,48,...,0,0,0,0,1,0,0,0,0,0
2238,8235,1956,3,69245.0,0,1,0.776824,8,428,30,...,0,0,0,0,0,0,0,1,0,0


In [13]:
df_model['Response'].value_counts(normalize=True)

0    0.849729
1    0.150271
Name: Response, dtype: float64

In [14]:
from sklearn.model_selection import train_test_split, StratifiedKFold

In [15]:
X = df_model.drop(columns=['Response'])
y = df_model['Response']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
y_train.value_counts(normalize=True)

0    0.849774
1    0.150226
Name: Response, dtype: float64

In [17]:
y_test.value_counts(normalize=True)

0    0.849624
1    0.150376
Name: Response, dtype: float64

In [18]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=42)

# X_train, y_train = smote.fit_resample(X_train, y_train)

## Model development

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, classification_report

In [20]:
class_weights = 'balanced'

models = (
    (KNeighborsClassifier(), 'KNN'),
    (RandomForestClassifier(random_state=42, class_weight=class_weights), 'Random Forest'),
    (BaggingClassifier(random_state=42), 'Bagging'),
    (ExtraTreesClassifier(random_state=42, class_weight=class_weights), 'Extra Trees'),
    (XGBClassifier(random_state=42), 'XGBoost'),
)

model_names = []
accuracies = []
precisions = []
recalls = []
f1s = []
aucs = []

for model in models:
    clf = model[0]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    model_names.append(model[1])
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred))

model_comp = pd.DataFrame(
    {'Model': model_names, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s, 'AUROC': aucs})

In [21]:
model_comp.sort_values('F1', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUROC
4,XGBoost,0.878195,0.633803,0.45,0.526316,0.701991
2,Bagging,0.885714,0.730769,0.38,0.5,0.677611
3,Extra Trees,0.887218,0.837838,0.31,0.452555,0.64969
1,Random Forest,0.885714,0.928571,0.26,0.40625,0.62823
0,KNN,0.839098,0.37037,0.1,0.15748,0.534956


In [29]:
from sklearn.model_selection import GridSearchCV

rfc = XGBClassifier(
    # n_estimators=50,
    # class_weight='balanced',
    random_state=42,
    n_jobs=-1)

param_grid = {
    'n_estimators' : [50, 100,200, 300, 400, 500, 600, 700, 800],
    'max_depth' : np.append(np.arange(1, 31, 1), None),
    
}

gs = GridSearchCV(rfc, param_grid=param_grid, n_jobs=-1, cv=skf, scoring='f1')
gs.fit(X_train, y_train)
print(gs.best_estimator_, gs.best_params_, gs.best_score_)
y_pred = gs.predict(X_test)
f1_score(y_test, y_pred)

import pickle
pickle.dump(gs.best_estimator_, open('XGBC_best_estimator.pkl', 'wb'))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=42, ...) {'max_depth': 4, 'n_estimators': 300} 0.6002749719416386


0.5212121212121211

In [35]:
XGBC_best_estimator = pickle.load(open('XGBC_best_estimator.pkl', 'rb'))

In [37]:
XGBC_best_estimator.predict(X_test)
f1_score(y_test, y_pred)

0.5212121212121211

# Conversion prediction : model development

In [24]:
# df['conversion'].value_counts(normalize=True)

In [25]:
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier, XGBRFClassifier
# from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, classification_report

In [26]:
# class_weights = 'balanced'

# models = (
#     # (LogisticRegression(random_state=42, class_weight=class_weights), 'Logistic Regression'),
#     # (LogisticRegressionCV(random_state=42, class_weight=class_weights), 'Logistic Regression CV'),
#     (SVC(random_state=42, class_weight=class_weights), 'SVC'),
#     (KNeighborsClassifier(), 'KNN'),
#     # (DecisionTreeClassifier(random_state=42, class_weight=class_weights), 'Decision Tree'),
#     (RandomForestClassifier(random_state=42, class_weight=class_weights), 'Random Forest'),
#     (BaggingClassifier(random_state=42), 'Bagging'),
#     (ExtraTreesClassifier(random_state=42, class_weight=class_weights), 'Extra Trees'),
#     # (GradientBoostingClassifier(random_state=42), 'Gradient Boosting'),
#     (XGBClassifier(random_state=42), 'XGBoost'),
#     # (XGBRFClassifier(random_state=42), 'XGBoost RF')
# )

# model_names = []
# accuracies = []
# precisions = []
# recalls = []
# f1s = []
# aucs = []

# for model in models:
#     clf = model[0]
#     clf.fit(X_smo, y_smo)
#     y_pred = clf.predict(X_test)
#     model_names.append(model[1])
#     accuracies.append(accuracy_score(y_test, y_pred))
#     precisions.append(precision_score(y_test, y_pred))
#     recalls.append(recall_score(y_test, y_pred))
#     f1s.append(f1_score(y_test, y_pred))
#     aucs.append(roc_auc_score(y_test, y_pred))

# model_comp = pd.DataFrame(
#     {'Model': model_names, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s, 'AUC': aucs})

In [27]:
# model_comp.sort_values('F1', ascending=False)

In [28]:
# from sklearn.model_selection import GridSearchCV

# rfc = RandomForestClassifier(
#     # n_estimators=50,
#     class_weight='balanced',
#     random_state=42,
#     n_jobs=-1)

# param_grid = {
#     'n_estimators' : [50, 100, 150, 200, 250, 300],
#     'max_depth' : [None, 5, 10]
# }

# gs = GridSearchCV(rfc, param_grid=param_grid, n_jobs=-1, cv=skf, scoring='f1')
# gs.fit(X_smo, y_smo)
# print(gs.best_estimator_, gs.best_params_, gs.best_score_)
# y_pred = gs.predict(X_test)
# f1_score(y_test, y_pred)