# 02_Exploratory Data Analysis & Baseline Performances

**Author:** Bilge Nur Karaca

*Feature names used in this project are either altered or invented and do not represent the original feature names. Code outputs are not provided due to confidentiality.*

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled training DataFrame. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open('OM_D1_train_data', 'rb') as f:
    df = pickle.load(f)

In [None]:
# Quick sanity checks: class counts, row count, shape and column names.
label_counts = df["label"].value_counts()
print(label_counts)
print(df.shape[0])
print(df.shape)
df.columns

In [None]:
# Numerical feature columns.
# The frequency column is spelled "purchase_freq" so that it matches the name
# the rest of the notebook uses when it reads and writes that column.
num_vars = ['num_of_transactions', 'avg_basket_size', 'num_of_total_items',
            'min_basket_size', 'max_basket_size', 'avg_discount_amount',
            'lifetime_discount_amount', 'num_of_purchases_w_discount',
            'avg_order_value', 'total_net_amount', 'isShippedToBilled_sum',
            'days_since_last_purchase', 'days_since_first_purchase', 'num_returns',
            'Category1', 'Category2', 'Category3', 'Category4', 'Axe1',
            'Axe2', 'Axe3', 'Axe4', 'Age', 'purchase_freq', 'offline_transactions']

# Categorical feature columns.
cat_vars = ['isContactable', 'City', 'last_coupon_type_used', 'Device']

### Draw histograms for all numerical features

In [None]:
# Plot one histogram per numerical feature; small tick labels keep the grid
# of subplots legible.

for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=6)
df[num_vars].hist(bins=30, figsize=[20, 15]);

### Null value adjustments

In [None]:
# Treat implausible ages as missing: values below 18 or above 100 become NaN.
# One vectorized mask replaces the two element-wise apply/np.where passes and
# keeps "Age" as a plain numeric column (missing ages stay missing).

df["Age"] = df["Age"].where(df["Age"].between(18, 100))

In [None]:
# A purchase frequency of 0 belongs to single-purchase customers, for whom a
# frequency makes no sense, so it is turned into a missing value.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update df (it has no effect under pandas Copy-on-Write).

df["purchase_freq"] = df["purchase_freq"].replace(0, np.nan)

### Remove outliers

In [None]:
colns = ["avg_order_value", 'Category1', 'Category2', 'Category3', 'Category4', 'Axe1',
         'Axe2', 'Axe3', 'Axe4', "offline_transactions", "num_returns",
         'isShippedToBilled_sum', "avg_discount_amount", "avg_basket_size"]

# Drop rows lying more than 6 standard deviations above the column mean, one
# column at a time (each column's statistics are computed on the rows that
# survived the previous columns).
for col in colns:
    # Compute the cutoff once per column; the previous per-element lambda
    # recomputed mean and std for every row, which is quadratic in the row count.
    upper_limit = df[col].mean() + 6 * df[col].std()
    is_outlier = df[col] > upper_limit
    n_outliers = is_outlier.sum()
    print(n_outliers, "outliers, 6 std far from the mean")
    print(n_outliers / df[col].count(), "of the column")
    print("Dropping...")
    df.drop(df[is_outlier].index, inplace=True)
    print("Dropped.")

### Recheck histograms after outlier removal

In [None]:
# Re-draw the per-feature histograms to inspect the effect of outlier removal.

for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=6)
df[num_vars].hist(bins=30, figsize=[20, 15]);

#### Add a new feature

In [None]:
# Add the inverse of frequency (i.e. period) as a new feature
# NOTE(review): as written, "purchase_period" receives the current
# "purchase_freq" values unchanged, and "purchase_freq" is then overwritten
# with its reciprocal - confirm that this naming is the intended one.

df["purchase_period"] = df["purchase_freq"]
df["purchase_freq"] = 1/df["purchase_freq"]

# Register the new column as a numerical feature.
num_vars = num_vars + ["purchase_period"]

In [None]:
# Histogram of the "purchase_period" column.
df['purchase_period'].hist()

In [None]:
# Drop rows whose "purchase_freq" lies more than 9 standard deviations above
# the column mean. The cutoff is computed once instead of once per row, and
# the printed message now states the 9-std threshold that is actually applied.
col = "purchase_freq"
upper_limit = df[col].mean() + 9 * df[col].std()
is_outlier = df[col] > upper_limit
n_outliers = is_outlier.sum()
print(n_outliers, "outliers, 9 std far from the mean")
print(n_outliers / df[col].count(), "of the column")
print("Dropping...")
df.drop(df[is_outlier].index, inplace=True)
print("Dropped.")

### Class imbalance

In [None]:
# Number of rows for each value of the "label" column.
print("# of samples per class:", df.label.value_counts())

In [None]:
# Class imbalance: share of label 1 among the rows labelled 0 or 1.
class_counts = df.label.value_counts()
class_counts[1] / (class_counts[1] + class_counts[0])

### Check correlation matrix by heatmap visualisation

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations (rounded to 2 decimals)
# between the numerical features and the label.
sns.set()
plt.figure(figsize=(25, 25))
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=20)
plt.rcParams.update({'font.size': 20})
corr_matrix = df[num_vars + ["label"]].corr().round(2)
sns.heatmap(corr_matrix, annot=True);

In [None]:
# Drop highly correlated variables (each has a correlation of 1 with another feature).

correlated_vars = ["Category2", "isShippedToBilled_sum", "num_of_purchases_w_discount"]
df.drop(columns=correlated_vars, inplace=True)

# Keep num_vars in sync with df: later cells select df[num_vars] and pass
# num_vars to the ColumnTransformer, and both fail on column names that no
# longer exist in the frame.
num_vars = [col for col in num_vars if col not in correlated_vars]

### Check categorical variables & visualize distributions with regard to churn

In [None]:
# Summary statistics of the object-typed columns, one row per column.

object_summary = df.describe(include='object')
object_summary.transpose()

In [None]:
# Stacked bar chart of label counts for each device type.

device_by_label = pd.crosstab(df["Device"], df['label'])
device_by_label.plot.bar(stacked=True);

In [None]:
# Visualize the value distribution of every categorical variable.

plt.figure(figsize=(20,20))

# Subplot positions are 1-based, hence start=1.
for idx, cat in enumerate(cat_vars, start=1):
    plt.subplot(3, 3, idx)
    # Pass the column as x=: seaborn >= 0.12 accepts the data vectors by
    # keyword only, so a positional Series is no longer read as the x variable.
    sns.countplot(x=df[cat], label="Count")

    plt.xticks(rotation=90)

    #plt.xlabel(cat, fontsize=20)
plt.show()

In [None]:
# Store the categorical features with the pandas "category" dtype.
for cat_col in ("isContactable", "City", "last_coupon_type_used", "Device"):
    df[cat_col] = df[cat_col].astype("category")

### Visualize numerical variable distributions with regard to churn (1 vs. 0)

In [None]:
# Box plot of every numerical feature, split by label value.

# Plot only the features that are still columns of df, and size the grid from
# the number of plots: a fixed 6x4 grid offers 24 slots and plt.subplot raises
# as soon as the feature list is longer than that.
plot_cols = [col for col in num_vars if col in df.columns]
n_cols = 4
n_rows = max(6, int(np.ceil(len(plot_cols) / n_cols)))  # never smaller than the original 6x4 layout

plt.figure(figsize=(20,40))
for i, col in enumerate(plot_cols):
    plt.subplot(n_rows, n_cols, i+1)
    sns.boxplot(x='label', y=col, data=df)
plt.show();

# Pipeline

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [None]:
# Feature matrix: everything except the customer identifier and the target.
X = df.drop(columns=["CustomerID", "label"])
# Target vector.
y = df["label"]

In [None]:
# Column dtypes and non-null counts of the feature matrix.
X.info()

In [None]:
# Number of target values.
len(y)

In [None]:
# Remove the categorical columns from the feature matrix (in place).
X.drop(columns=cat_vars, inplace=True)

In [None]:
# Number of samples per target value.
y.value_counts()

In [None]:
# Hold out 33% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): the split is not stratified (no stratify=y) although the
# cross-validation below uses StratifiedKFold - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Categorical branch, currently disabled:
# pipe_cat = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("one_hot", OneHotEncoder(handle_unknown="ignore"))
# ])

# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
pipe_num = Pipeline(numeric_steps)

# Apply the numeric branch to the num_vars columns; every other column is
# passed through unchanged.
transformers = [
    #("categorical_vars", pipe_cat, cat_vars),
    ("numeric_vars", pipe_num, num_vars),
]
ct = ColumnTransformer(transformers, remainder="passthrough")

In [None]:
# Column dtypes and non-null counts of the feature matrix after the categorical columns were removed.
X.info()

In [None]:
# Persist a copy of the cleaned training data as a pickle file. The context
# manager guarantees the file is flushed and closed even if dumping fails.
import pickle

clean_data = df.copy()
with open("OM_D2_Train_Data_Cleaned", 'wb') as fx:
    pickle.dump(clean_data, fx)

# Results Table

In [None]:
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score

In [None]:
# Create a data frame to store the results
def print_results(headline, true_value, pred, probs):
    """Collect the evaluation metrics of one model as a results-table row.

    Despite its name, the function prints nothing; it only returns the row.

    Parameters
    ----------
    headline : str
        Name stored in the 'method' column.
    true_value : array-like
        Ground-truth labels (assumed to be 0/1, with 1 as the positive class).
    pred : array-like
        Predicted class labels.
    probs : array-like
        Predicted scores for the positive class, used for the ROC and
        precision-recall curves.

    Returns
    -------
    list
        [method, accuracy, TP, FP, TN, FN, precision, recall, roc_auc,
        pr_auc, macro-averaged f1], in the order of ``score_names``.
    """
    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from both the truth and the predictions; without it the matrix
    # collapses to 1x1 and the cell lookups raise IndexError.
    tn, fp, fn, tp = confusion_matrix(true_value, pred, labels=[0, 1]).ravel()

    # Area under the precision-recall curve (recall on the x axis).
    precisions, recalls, _ = precision_recall_curve(true_value, probs)

    return [
        headline,
        accuracy_score(true_value, pred),                  # accuracy
        int(tp),                                           # TP
        int(fp),                                           # FP
        int(tn),                                           # TN
        int(fn),                                           # FN
        precision_score(true_value, pred),                 # precision
        recall_score(true_value, pred),                    # recall
        roc_auc_score(true_value, probs),                  # roc_auc
        auc(recalls, precisions),                          # pr_auc
        f1_score(true_value, pred, average="macro"),       # macro f1
    ]

# Column layout of the results table; one row is appended per evaluated model.
score_names = [
    'method', 'accuracy',
    'TP', 'FP', 'TN', 'FN',
    'precision', 'recall', 'roc_auc', 'pr_auc', 'f1',
]
# Start from an empty (0-row) frame with one column per score.
dfAcc = pd.DataFrame(np.zeros((0, len(score_names))), columns=score_names)

In [None]:
# (rows, columns) of the feature matrix.
X.shape

# Logistic Regression (LoR)

In [None]:
# Logistic-regression baseline: preprocessing and classifier in one pipeline.
log_reg = LogisticRegression(random_state=42, max_iter=100000)
pipe = Pipeline(steps=[('ct', ct), ('classifier', log_reg)])

# Regularisation strengths searched for both penalties.
c_values = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Two sub-grids, one per penalty, each listing its own set of solvers.
l1_grid = {
    'classifier__C': list(c_values),
    'classifier__penalty': ['l1'],
    'classifier__solver': ['liblinear', 'saga'],
}
l2_grid = {
    'classifier__C': list(c_values),
    'classifier__penalty': ['l2'],
    'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
}
params = [l1_grid, l2_grid]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Run the grid search 
# Fit on the training split, then print the best estimator, its parameters
# and the grid's score on the training and test splits.
grid.fit(X_train, y_train)
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
print('\nTraining F1_macro:', grid.score(X_train, y_train))
print('Test F1_macro :', grid.score(X_test, y_test))

In [None]:
# Evaluate the fitted grid on the test split and append the row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]  # column 1 of predict_proba
scores = print_results('D2_LoR_nt', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
count_cols = ['TP', 'FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

In [None]:
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# XGB

In [None]:
# XGB
# Gradient-boosted trees behind the shared preprocessing step. The search grid
# below is empty (all candidates commented out), so the grid search evaluates
# only the hyper-parameters fixed in the constructor.

from xgboost import XGBClassifier

# NOTE(review): eval_metric is a custom callable, but the fit calls in this
# notebook pass no eval_set, so presumably it is never evaluated - confirm.
pipe = Pipeline(steps=[('ct', ct),
                       ('classifier', XGBClassifier(objective='binary:logistic', 
                                                    eval_metric = f1_macro,
                                                    n_estimators=1000, 
                                                    eta=0.01, # default 0.3
                                                    max_depth=8, 
                                                    subsample=0.7, 
                                                    min_child_weight=55, 
                                                    gamma=1, 
                                                    reg_lambda=1, 
                                                    alpha=1, 
                                                    colsample_bytree=0.9, 
                                                    #colsample_bylevel=0.5,
                                                   # scale_pos_weight = 0.35
                                                   )
                                                   )
                            ]
                   )

# Candidate values tried earlier, kept for reference.
params = [{ 
            #'classifier__n_estimators':[750,1000,1500],
           #'classifier__eta': [0.01],
           # 'classifier__max_depth':[8,9,10],
          # 'classifier__min_child_weight': [50,55,60],
         #  'classifier__colsample_bytree':[0.7,0.8,0.9],
           # 'classifier__subsample' : [0.6,0.7,0.8],
         #  'classifier__alpha':[0.2,0.3,0.4], #  default 0. Increasing this value will make model more conservative.
        #    'classifier__gamma':[0.2,0.3,0.4], #  default 0. Increasing this value will make model more conservative.
          #  'classifier__reg_lambda':[0.5,1,1.5,2,2.5], #  def=1 .Increasing this value will make model more conservative.
          #  'classifier__scale_pos_weight' : [0.25,0.3,0.35]
          }
         ]

# train/validation with the same ratio of classes
kfold = StratifiedKFold(n_splits = 4, random_state = 42, shuffle=True) 

# 4-fold stratified search scored with macro-averaged F1, using all cores.
grid = GridSearchCV(pipe, 
                    param_grid = params, 
                    cv = kfold, 
                    scoring = 'f1_macro', 
                    verbose = 1,
                    n_jobs = -1)




In [None]:
%%time
# Run the grid search 
# Fit on the training split, then print the best estimator, its parameters
# and the grid's score on the training and test splits.
grid.fit(X_train, y_train)
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Evaluate the fitted grid on the test split and append the row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]  # column 1 of predict_proba
scores = print_results('D2_XGB_nt', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
count_cols = ['TP', 'FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# SGD

In [None]:
#SGD

from sklearn.linear_model import SGDClassifier

# Linear classifier with log loss trained by stochastic gradient descent,
# using an L1 penalty, balanced class weights and an adaptive learning rate.
sgd_clf = SGDClassifier(
    loss="log_loss",
    penalty="l1",
    alpha=0.05,
    learning_rate="adaptive",
    eta0=0.1,
    class_weight="balanced",
    warm_start=False,
    average=False,
    random_state=42,
)
pipe = Pipeline(steps=[('ct', ct), ('SGD', sgd_clf)])

# Empty grid: only the constructor settings are evaluated. Candidate values
# tried earlier are kept for reference.
param_grid = {
    #"SGD__alpha":[0.01,0.03,0.05], # The higher the value, the stronger the regularization.
    #"SGD__penalty": ["l1","l2", "elasticnet"],
    #"SGD__class_weight": ["balanced", 0.5,0.7,1,1.5],
    #"SGD__learning_rate" : ["constant", "optimal", "invscaling", "adaptive"],
    #"SGD__eta0" : [0.0001,0.001,0.01,0.1]
}

# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Run the grid search 
# Fit on the training split, then print the best estimator, its parameters
# and the grid's score on the training and test splits.
grid.fit(X_train, y_train)
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Evaluate the fitted grid on the test split and append the row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]  # column 1 of predict_proba
scores = print_results('D2_SGD_nt', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
count_cols = ['TP', 'FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# Random Forest

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# Shallow, class-balanced forest of 1000 trees behind the shared preprocessing step.
rf_clf = RandomForestClassifier(
    n_estimators=1000,
    criterion="gini",
    max_depth=7,
    min_samples_split=30,
    max_features='sqrt',
    #  min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
)
pipe = Pipeline(steps=[('ct', ct), ('RF', rf_clf)])

# Empty grid: only the constructor settings are evaluated. Candidate values
# tried earlier are kept for reference.
param_grid = {
    #"RF__n_estimators": [750,1000,1500],
    # "RF__max_depth": [5,6,7],
    #  "RF__min_samples_split": [45],
    # "RF__max_features": ["sqrt", "log2", 10],
    # "RF__criterion"   : ["gini","entropy","log_loss"],
}

# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Run the grid search 
# Fit on the training split, then print the best estimator, its parameters
# and the grid's score on the training and test splits.
grid.fit(X_train, y_train)
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Evaluate the fitted grid on the test split and append the row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]  # column 1 of predict_proba
scores = print_results('D2_RF_nt', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
count_cols = ['TP', 'FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

# LightGBM

In [None]:
# LIGHTGBM
# LightGBM binary classifier behind the shared preprocessing step. The search
# grid below is empty (all candidates commented out), so the grid search
# evaluates only the hyper-parameters fixed in the constructor.

import lightgbm as lgb

# NOTE(review): min_child_samples (50) and min_data_in_leaf (25) are, per the
# LightGBM parameter docs, presumably aliases of the same setting - confirm
# which value is intended.
# NOTE(review): subsample is set while subsample_freq stays commented out;
# verify that row subsampling is actually active with this configuration.
pipe = Pipeline(steps=[('ct', ct), 
                       ('LGBM',   lgb.LGBMClassifier(objective="binary",
                                                #     class_weight="balanced",                                                   
                                                     n_estimators=1500, 
                                                     learning_rate=0.01,
                                                     min_child_weight=25, 
                                                     max_depth=7, 
                                                     num_leaves=25, 
                                                     min_child_samples=50,                                                      
                                                     reg_alpha=0.8, 
                                                     reg_lambda=0.8,
                                                 #    subsample_freq=0, 
                                                     colsample_bytree=0.3, 
                                                     subsample=0.8,
                                                     min_split_gain = 25,
                                                     min_data_in_leaf = 25,
                                                     random_state=42))
                          ]
                   )

# Candidate values tried earlier, kept for reference.
param_grid = {
              #   'LGBM__n_estimators': [750,1000,1500], 
              #   'LGBM__learning_rate' : [0.0001,0.001,0.01],
              #    'LGBM__min_child_weight': [250], 
              #    'LGBM__max_depth' : [5], 
               #  'LGBM__num_leaves': [250], 
              #   'LGBM__min_child_samples': [100,150,200,250,300],                                                      
              #   'LGBM__reg_alpha' : [5,6,7], # default 0 
              #  'LGBM__min_data_in_leaf' : [100,150,200,250,300],
              #   'LGBM__reg_lambda' : [1,1.5,2,2.5,3], # default 0 
              #   'LGBM__subsample_freq' : [1,5,10,100,500], 
             #  'LGBM__subsample' : [0.5,0.6,0.7,0.8,0.9], 
              # 'LGBM__colsample_bytree' : [0.5,0.6,0.7,0.8,0.9], 
              #  'LGBM__min_split_gain' : [122], 
             } 


# train/validation with the same ratio of classes
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# 4-fold stratified search scored with macro-averaged F1, using all cores.
grid = GridSearchCV(pipe, 
                    param_grid = param_grid, 
                    cv = kfolds, 
                    scoring = 'f1_macro', 
                    verbose = 2,
                    n_jobs = -1)
# default lgbm = 0.57

In [None]:
%%time
# Run the grid search 
# Fit on the training split, then print the best estimator, its parameters
# and the grid's score on the training and test splits.
grid.fit(X_train, y_train)
print(grid.best_estimator_,'\n')
print('Best parameters  :', grid.best_params_)
print('\nTraining F1 score:', grid.score(X_train, y_train))
print("Test F1 score:", grid.score(X_test, y_test))

In [None]:
# Evaluate the fitted grid on the test split and append the row to the results table.
pred_probs = grid.predict_proba(X_test)[:,1]  # column 1 of predict_proba
scores = print_results('D2_LGBM_nt', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
count_cols = ['TP', 'FP', 'TN', 'FN']
dfAcc[count_cols] = dfAcc[count_cols].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
styler = dfAcc.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

In [None]:
# Persist a copy of the results table as a pickle file. The context manager
# guarantees the file is flushed and closed even if dumping fails.
import pickle

dfAcc_pck = dfAcc.copy()
with open("OM_D2_results_table", 'wb') as fd:
    pickle.dump(dfAcc_pck, fd)