<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#Modelling:-Random-Forest" data-toc-modified-id="Modelling:-Random-Forest-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling: Random Forest</a></span><ul class="toc-item"><li><span><a href="#default" data-toc-modified-id="default-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>default</a></span></li></ul></li><li><span><a href="#Hyperparameter-optimization" data-toc-modified-id="Hyperparameter-optimization-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Hyperparameter optimization</a></span><ul class="toc-item"><li><span><a href="#HPO-using-RandomizedSearchCV" data-toc-modified-id="HPO-using-RandomizedSearchCV-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>HPO using RandomizedSearchCV</a></span></li></ul></li><li><span><a href="#Result-Analysis" data-toc-modified-id="Result-Analysis-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Result Analysis</a></span></li></ul></div>

# Load the libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.plotting.backend = "matplotlib"
pd.set_option('max_columns',100)

import time,os,json
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import sklearn
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

import joblib

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

[(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,lgb]]

[('numpy', '1.18.4'),
 ('pandas', '1.1.0'),
 ('seaborn', '0.10.1'),
 ('sklearn', '0.23.1'),
 ('lightgbm', '2.3.1')]

In [3]:
def print_scores(ytest,ypreds):
    # for auc score we need to binarize
    labels = [0, 1, 2, 3]
    ytest_bin = label_binarize(ytest, classes=labels)
    ypreds_bin = label_binarize(ypreds, classes=labels)
    a = roc_auc_score(ytest_bin,ypreds_bin,
                      average='macro',multi_class='ovo')
    
    # precision recall
    p = precision_score(ytest,ypreds,average='macro')
    r = recall_score(ytest,ypreds,average='macro')
    f = f1_score(ytest,ypreds,average='macro')
    print(f'Precision: {p: .2f}')
    print(f'Recall   : {r: .2f}')
    print(f'F1-score : {f: .2f}')
    print(f'AUC      : {f: .2f}')

    c = classification_report(ytest, ypreds)
    print(c)

    cm = confusion_matrix(ytest,ypreds)
    names = list('ABCD')
    df_cm = pd.DataFrame(cm,index=names,columns=names)
    df_cm = df_cm.style.background_gradient()
    display(df_cm)

In [4]:
df_eval = pd.DataFrame({'Model': [],
                        'Description':[],
                        'Accuracy':[],
                        'Precision':[],
                        'Recall':[],
                        'F1':[],
                        'AUC': []
                    })

# Load the data

In [5]:
df_raw = pd.read_csv('../data/raw/train.csv')
print(df_raw.shape)

df_raw.head(2).append(df_raw.tail(2))

(8068, 11)


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
8066,467299,Female,No,27,Yes,Healthcare,1.0,Low,4.0,Cat_6,B
8067,461879,Male,Yes,37,Yes,Executive,0.0,Average,3.0,Cat_4,B


In [6]:
df = pd.read_csv('../data/processed/clean_data.csv')
print(df.shape)

df.head(2).append(df.tail(2))

(8068, 38)


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Segmentation,Ever_Married_NA,Graduated_NA,Profession_NA,Work_Experience_NA,Family_Size_NA,Var_1_NA,Age_cat,Family_Size_cat,Work_Experience_cat,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,gen_mar,gen_grad,gen_spend,grad_spend,grad_spend_gen
0,0,0,22,0,1.0,0,4.0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,1,38,1,3.0,1,3.0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,3,3,4,4,13
8066,1,0,27,1,1.0,0,4.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,3,1,1,10
8067,0,1,37,1,0.0,1,3.0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,2,3,4,4


# Train test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
target = 'Segmentation'

df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1), df[target],shuffle=True,
    test_size=0.2, random_state=SEED, stratify=df[target])

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

print(f"df             : {df.shape}")

print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xvalid      : {df_Xvalid.shape}")
print(f"ser_yvalid     : {ser_yvalid.shape}")

print(f"\ndf_Xtest       : {df_Xtest.shape}")
print(f"ser_ytest      : {ser_ytest.shape}")

df_Xtrain_orig.head(2)

df             : (8068, 38)

df_Xtrain_orig : (6454, 37)
ser_ytrain_orig: (6454,)

df_Xtrain      : (5163, 37)
ser_ytrain     : (5163,)

df_Xvalid      : (1291, 37)
ser_yvalid     : (1291,)

df_Xtest       : (1614, 37)
ser_ytest      : (1614,)


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Ever_Married_NA,Graduated_NA,Profession_NA,Work_Experience_NA,Family_Size_NA,Var_1_NA,Age_cat,Family_Size_cat,Work_Experience_cat,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,gen_mar,gen_grad,gen_spend,grad_spend,grad_spend_gen
3582,0,1,27,0,2.0,0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0
6827,0,0,41,1,3.0,0,1.0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,1,1


In [9]:
cols_all = df_Xtrain_orig.columns.to_list()
cols_no_na = [i for i in cols_all if not i.endswith('_NA')]


cross = ['gen_mar', 'gen_grad', 'gen_spend',
         'grad_spend', 'grad_spend_gen']
cols_no_na_no_cross = [i for i in cols_no_na if i not in cross]
cols_no_na_no_cross

['Gender',
 'Ever_Married',
 'Age',
 'Graduated',
 'Work_Experience',
 'Spending_Score',
 'Family_Size',
 'Age_cat',
 'Family_Size_cat',
 'Work_Experience_cat',
 'Profession_Artist',
 'Profession_Doctor',
 'Profession_Engineer',
 'Profession_Entertainment',
 'Profession_Executive',
 'Profession_Healthcare',
 'Profession_Homemaker',
 'Profession_Lawyer',
 'Profession_Marketing',
 'Var_1_Cat_1',
 'Var_1_Cat_2',
 'Var_1_Cat_3',
 'Var_1_Cat_4',
 'Var_1_Cat_5',
 'Var_1_Cat_6',
 'Var_1_Cat_7']

In [10]:
features = cols_no_na

Xtr = df_Xtrain_orig[features]
Xtx = df_Xtest[features]
Xvd = df_Xvalid[features]

ytr = ser_ytrain_orig.to_numpy().ravel()
ytx = ser_ytest.to_numpy().ravel()
yvd = ser_yvalid.to_numpy().ravel()

# Modelling: Random Forest

## default

In [11]:
# time
time_start = time.time()

model_name = 'Random Forest'
desc = 'default'

# model
model = RandomForestClassifier(random_state=SEED, n_jobs=-1)

# fit and save the model
model.fit(Xtr, ytr)


# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 4 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,Random Forest,default,0.477695,0.46712,0.469588,0.467799,0.647705


Precision:  0.47
Recall   :  0.47
F1-score :  0.47
AUC      :  0.47
              precision    recall  f1-score   support

           0       0.44      0.44      0.44       394
           1       0.39      0.36      0.37       372
           2       0.45      0.43      0.44       394
           3       0.59      0.65      0.62       454

    accuracy                           0.48      1614
   macro avg       0.47      0.47      0.47      1614
weighted avg       0.47      0.48      0.47      1614



Unnamed: 0,A,B,C,D
A,172,73,55,94
B,78,134,115,45
C,52,103,171,68
D,86,34,40,294


# Hyperparameter optimization

## HPO using RandomizedSearchCV

In [12]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import joblib

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X = np.array(Xtr)
y = np.array(ytr)

In [14]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(50,200,6)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 50, num=10)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2,3,5,8, 10, 15]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 6, 8,10]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


grid = RandomizedSearchCV(model,params,
                          cv=skf.split(X, y),
                          scoring='accuracy')

In [15]:
%%time
# grid.fit(X,y)
# best_model = grid.best_estimator_
# joblib.dump(best_model, '../outputs/rf_randomsearch_best_model.pkl')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs


In [16]:
model_grid_random = joblib.load('../outputs/rf_randomsearch_best_model.pkl')
model_grid_random

RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=8,
                       min_samples_split=5, n_estimators=200, n_jobs=-1,
                       random_state=100)

In [17]:
# time
time_start = time.time()

model_name = 'Random Forest'
desc = 'grid_randomsearch'

# model
model = model_grid_random

# fit and save the model
model.fit(Xtr, ytr)

# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 4 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,Random Forest,default,0.477695,0.46712,0.469588,0.467799,0.647705
1,Random Forest,grid_randomsearch,0.521066,0.507744,0.510599,0.505494,0.675283


Precision:  0.51
Recall   :  0.51
F1-score :  0.51
AUC      :  0.51
              precision    recall  f1-score   support

           0       0.44      0.48      0.46       394
           1       0.43      0.31      0.37       372
           2       0.54      0.53      0.53       394
           3       0.61      0.72      0.66       454

    accuracy                           0.52      1614
   macro avg       0.51      0.51      0.51      1614
weighted avg       0.51      0.52      0.51      1614



Unnamed: 0,A,B,C,D
A,189,63,52,90
B,99,117,112,44
C,47,67,208,72
D,91,22,14,327


# Result Analysis

In [32]:
df_out = pd.DataFrame({'True': ytx, 'Predicted': ypreds})
df_out = df_out.replace({0:'A',1:'B',2:'C',3:'D'})
df_out.head()

Unnamed: 0,True,Predicted
0,C,A
1,B,C
2,C,C
3,A,A
4,B,C


In [33]:
df_out['True'].value_counts().sort_index()

A    394
B    372
C    394
D    454
Name: True, dtype: int64

In [47]:
dfx = pd.crosstab(df_out['True'],df_out['Predicted'],margins=True)
dfx.style.set_caption('<h4>Confusion Matrix for customer segmentation</h4><br>Each rows are true labels.<br>Row sum is true sum of given label.')\
.background_gradient(axis=1,subset=(dfx.index[:-1],dfx.columns[:-1]))

Predicted,A,B,C,D,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,189,63,52,90,394
B,99,117,112,44,372
C,47,67,208,72,394
D,91,22,14,327,454
All,426,269,386,533,1614


In [43]:
pd.crosstab(df_out['True'],df_out['Predicted'],margins=False,normalize='index').mul(100).style.background_gradient(axis=1).format("{:.2f}%")

Predicted,A,B,C,D
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,47.97%,15.99%,13.20%,22.84%
B,26.61%,31.45%,30.11%,11.83%
C,11.93%,17.01%,52.79%,18.27%
D,20.04%,4.85%,3.08%,72.03%


In [31]:
"""
There are 394 true "A".
Out of 394, only 189 are predicted as "A". 189/394= 44.97%.

The classifier is the hard time classifying B.
Many of actual class "B" are classified as C.
""";