# CHURNLYTICAL_Model_Building

## 1- Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

# preprocessing tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

# metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# warnings
import warnings
warnings.filterwarnings('ignore')

# pickle model
import pickle


In [2]:
df = pd.read_csv('../DATA/Churnlytical_DataBase.csv')
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 - 12
1,Male,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25 - 36
2,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 - 12
3,Male,No,No,No,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37 - 48
4,Female,No,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 - 12


## 2- Data Pre-processing

In [3]:
def preprocess_inputs(df):
    df = df.copy()

    # Splitting the dataset into X and y

    X = df.drop('Churn', axis=1)
    y = df['Churn']

    # train test split
    
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=100)

    # categorical columns
    cat_cols = [col for col in df.columns if df[col].dtype == 'object' and col != 'Churn']

    # numerical columns
    num_cols = [col for col in df.columns if df[col].dtype != 'object']

    # Dividing the columns into 3 categories;
    # standard scaler for numerical columns

    ss = StandardScaler()
    x_train[num_cols] = ss.fit_transform(x_train[num_cols])
    x_test[num_cols] = ss.transform(x_test[num_cols])

    with open("./Model_Processing/StandardScaler", "wb") as f: 
        pickle.dump(ss, f)

    # label encoder for target variable

    le = LabelEncoder()
    y_train_encode = le.fit_transform(y_train)
    y_test_encode = le.transform(y_test)

    # one-hot encoding for categorical features

    ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
    x_train_ohe = ohe.fit_transform(x_train[cat_cols])
    x_test_ohe = ohe.transform(x_test[cat_cols])
    print("Numerical: ", ohe)
    print("Numerical train: ", x_train_ohe)

    with open("./Model_Processing/OneHotEncoder", "wb") as f: 
        pickle.dump(ohe, f)

    # assigning right column names to cat cols after one-hot encoding
    col_ohe = ohe.get_feature_names_out(cat_cols)
    print("Categorical: ", col_ohe)

    # create df for one hot encoded features
    x_train_ohe_df = pd.DataFrame(x_train_ohe, columns = col_ohe, index = x_train.index)
    x_test_ohe_df = pd.DataFrame(x_test_ohe, columns = col_ohe, index = x_test.index)

    # combine the numerical and encoded features
    x_train_encode = pd.concat([x_train.drop(columns=cat_cols), x_train_ohe_df], axis=1)
    x_test_encode = pd.concat([x_test.drop(columns=cat_cols), x_test_ohe_df], axis=1)

    return x_train_encode, x_test_encode, y_train_encode, y_test_encode

In [4]:
x_train_encode, x_test_encode, y_train_encode, y_test_encode = preprocess_inputs(df)

Numerical:  OneHotEncoder(drop='if_binary', sparse_output=False)
Numerical train:  [[0. 0. 1. ... 1. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 ...
 [1. 0. 1. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 1.]]
Categorical:  ['gender_Male' 'SeniorCitizen_Yes' 'Partner_Yes' 'Dependents_Yes'
 'PhoneService_Yes' 'MultipleLines_Yes' 'InternetService_DSL'
 'InternetService_Fiber optic' 'InternetService_No' 'OnlineSecurity_Yes'
 'OnlineBackup_Yes' 'DeviceProtection_Yes' 'TechSupport_Yes'
 'StreamingTV_Yes' 'StreamingMovies_Yes' 'Contract_Month-to-month'
 'Contract_One year' 'Contract_Two year' 'PaperlessBilling_Yes'
 'PaymentMethod_Bank transfer (automatic)'
 'PaymentMethod_Credit card (automatic)' 'PaymentMethod_Electronic check'
 'PaymentMethod_Mailed check' 'tenure_group_1 - 12' 'tenure_group_13 - 24'
 'tenure_group_25 - 36' 'tenure_group_37 - 48' 'tenure_group_49 - 60'
 'tenure_group_61 - 72']


## 3- Model Building

In [5]:
# decision tree classifier
model_dt = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [6]:
model_dt.fit(x_train_encode, y_train_encode)

In [7]:
y_pred = model_dt.predict(x_test_encode)

In [8]:
print(confusion_matrix(y_test_encode, y_pred))

[[901 132]
 [194 180]]


In [9]:
print(classification_report(y_test_encode, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1033
           1       0.58      0.48      0.52       374

    accuracy                           0.77      1407
   macro avg       0.70      0.68      0.69      1407
weighted avg       0.76      0.77      0.76      1407



In [10]:
y_train_encode.shape

(5625,)

In [11]:
print(np.bincount(y_train_encode))

[4130 1495]


- The accuracy is quite low also keeping in mind that the dataset is quite imbalanced. The accuracy should not be considered as the metrics as the accuracy is cursed in an imbalanced data.
- Hence, it is essential to check recall, precision & f1 score for the minority class, and it's quite evident these scores are low as well for the minority class i.e. Churned Customers.
- Hence, we shall to scale the classes of the dataset using SMOTE (Upsampling).
- But first let's try other as well classifiers before over-sampling.

In [12]:
def get_score(y_pred_list, y_test, plot=True, axis=0, cmap='Blues'):
    model_name = []
    accuracy = []
    precision = []
    recall = []
    f1 = []
    roc_auc = []

    for name, y_pred in y_pred_list.items():
        model_name.append(name)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        roc_auc.append(roc_auc_score(y_test, y_pred))

        score_list = {
          'model':model_name,
          'accuracy':accuracy,
          'precision':precision,
          'recall':recall,
          'f1_score':f1,
          'roc_auc':roc_auc
        }
    
    score_df = pd.DataFrame(score_list).set_index('model')

    if plot:
        display(score_df.style.background_gradient(axis=axis, cmap=cmap))

    return score_df

In [15]:
model_list ={
    #12
    'Logistic Regression' :LogisticRegression(max_iter=1000, random_state=100),
    'Ridge Classifier' :RidgeClassifier(random_state=100),
    'Decision Tree' :DecisionTreeClassifier(random_state=100),
    'Random Forest':RandomForestClassifier(random_state=100),
    'KNN' :KNeighborsClassifier(),
    'SVC' :SVC(random_state=100),
    'Neural Network' : MLPClassifier(max_iter=1000, random_state=100),
    'Gradient Boosting Classifier':GradientBoostingClassifier(random_state=100),
    'AdaBoost Classifier':AdaBoostClassifier(random_state=100),
    'CatBoost Classifier':CatBoostClassifier(random_state=100, verbose=False,train_dir="./Model_Processing/selected_model/catboost_info" ),
    'XGBoost':XGBClassifier(random_state=100, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM':LGBMClassifier(random_state=100)
}

In [16]:
y_pred_list = dict()

for name, model in model_list.items():
  model.fit(x_train_encode, y_train_encode)
  y_pred_list[name] = model.predict(x_test_encode)

score_smote = get_score(y_pred_list, y_test_encode)

[LightGBM] [Info] Number of positive: 1495, number of negative: 4130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 568
[LightGBM] [Info] Number of data points in the train set: 5625, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265778 -> initscore=-1.016151
[LightGBM] [Info] Start training from score -1.016151


Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.79602,0.654804,0.491979,0.561832,0.699039
Ridge Classifier,0.797441,0.673152,0.462567,0.548336,0.690625
Decision Tree,0.742004,0.515152,0.5,0.507463,0.664811
Random Forest,0.783227,0.628253,0.451872,0.525661,0.677533
KNN,0.769012,0.576803,0.491979,0.531025,0.680646
SVC,0.800995,0.680769,0.473262,0.55836,0.696457
Neural Network,0.743426,0.52,0.451872,0.483548,0.650428
Gradient Boosting Classifier,0.797441,0.644951,0.529412,0.581498,0.711947
AdaBoost Classifier,0.799574,0.639394,0.564171,0.599432,0.724486
CatBoost Classifier,0.789623,0.630872,0.502674,0.559524,0.698094


There is no drastic difference in the scores, now let's try over-sampling techniques to upscale the dataset.

## 4- Over-Sampling

In [17]:
print("Before OverSampling - counts of label '0': {}".format(sum(y_train_encode == 0)))
print("Before OverSampling - counts of label '1': {} \n".format(sum(y_train_encode == 1)))

sm = SMOTE(random_state=100)
x_train_smote, y_train_smote = sm.fit_resample(x_train_encode, y_train_encode.ravel())

print('After OverSampling with SMOTE - x_train: {}'.format(x_train_smote.shape))
print('After OverSampling with SMOTE - y_train: {} \n'.format(y_train_smote.shape))

print("After OverSampling with SMOTE - counts of label '0': {}".format(sum(y_train_smote == 0)))
print("After OverSampling with SMOTE - counts of label '1': {}".format(sum(y_train_smote == 1)))

Before OverSampling - counts of label '0': 4130
Before OverSampling - counts of label '1': 1495 

After OverSampling with SMOTE - x_train: (8260, 31)
After OverSampling with SMOTE - y_train: (8260,) 

After OverSampling with SMOTE - counts of label '0': 4130
After OverSampling with SMOTE - counts of label '1': 4130


Using over-sampling only on training set as there might be leakage of information from the test set to training set while training the model. So it is a good measure to not use any sampling techniques on the test set and it also helps in preventing overfitting issue.

After over-sampling, now let's train the model using Support Vector Machine classifier.

In [18]:
model_lr_smote = SVC()
model_lr_smote.fit(x_train_smote, y_train_smote)

In [19]:
y_pred = model_lr_smote.predict(x_test_encode)

In [20]:
print(accuracy_score(y_pred, y_test_encode))

0.7469793887704336


In [21]:
print(classification_report(y_test_encode, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81      1033
           1       0.52      0.73      0.61       374

    accuracy                           0.75      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.79      0.75      0.76      1407



- After upscaling, there is not a huge difference overall.
- The minority class has a slightly better score than the previous models but not much though, even after upsampling it seems like we need more samples of minority class for the model to perform better.<br>
Now let's try other classifiers as well and pick the best one.

In [22]:
y_pred_list = dict()

for name, model in model_list.items():
  model.fit(x_train_smote, y_train_smote)
  y_pred_list[name] = model.predict(x_test_encode)

score_smote = get_score(y_pred_list, y_test_encode)

[LightGBM] [Info] Number of positive: 4130, number of negative: 4130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3514
[LightGBM] [Info] Number of data points in the train set: 8260, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.754087,0.524561,0.799465,0.633475,0.768561
Ridge Classifier,0.752665,0.522109,0.820856,0.638254,0.774416
Decision Tree,0.721393,0.478365,0.532086,0.503797,0.661009
Random Forest,0.778252,0.593939,0.524064,0.556818,0.697172
KNN,0.700071,0.460396,0.745989,0.569388,0.714718
SVC,0.746979,0.517045,0.729947,0.605322,0.741546
Neural Network,0.725657,0.486364,0.572193,0.525799,0.676706
Gradient Boosting Classifier,0.779673,0.571749,0.681818,0.621951,0.74846
AdaBoost Classifier,0.764748,0.540952,0.759358,0.631813,0.763029
CatBoost Classifier,0.78607,0.598383,0.593583,0.595973,0.724671


In [23]:
for i in y_pred_list:
    print(i, ':', accuracy_score(y_pred_list[i], y_test_encode))

Logistic Regression : 0.7540867093105899
Ridge Classifier : 0.7526652452025586
Decision Tree : 0.7213930348258707
Random Forest : 0.7782515991471215
KNN : 0.7000710732054015
SVC : 0.7469793887704336
Neural Network : 0.7256574271499645
Gradient Boosting Classifier : 0.7796730632551528
AdaBoost Classifier : 0.7647476901208244
CatBoost Classifier : 0.7860696517412935
XGBoost : 0.7725657427149965
LightGBM : 0.7810945273631841


The above metrics and scores are not quite as convincing but for now, CatBoost Classifier performs the best comparing to other classifiers, so let's pick it as our model and perform hyperparameter tuning.

In [24]:
model_cb = CatBoostClassifier(random_state=100, verbose=False,train_dir="./Model_Processing/selected_model/catboost_info")
model_cb.fit(x_train_smote, y_train_smote)

<catboost.core.CatBoostClassifier at 0x2931a34f990>

In [25]:
y_pred = model_cb.predict(x_test_encode)
acc_cb = accuracy_score(y_test_encode, y_pred)
print(acc_cb)

0.7860696517412935


## 5- Hyperparameter Tuning

In [26]:
params = {'depth'   : [3, 4, 5, 6, 7, 8, 9, 10],
        'learning_rate' : [0.01, 0.02, 0.03, 0.04],
        'iterations'    : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
            }

In [27]:
cbc = CatBoostClassifier(train_dir="./Model_Processing/selected_model/catboost_info")
cbc_grid = GridSearchCV(estimator = cbc, param_grid = params, cv = 5, n_jobs = -1, scoring = 'accuracy', verbose = False)
cbc_grid.fit(x_train_smote, y_train_smote)

0:	learn: 0.6671976	total: 86.4ms	remaining: 8.55s
1:	learn: 0.6429468	total: 176ms	remaining: 8.61s
2:	learn: 0.6218054	total: 265ms	remaining: 8.57s
3:	learn: 0.6013454	total: 347ms	remaining: 8.32s
4:	learn: 0.5845069	total: 456ms	remaining: 8.67s
5:	learn: 0.5684412	total: 527ms	remaining: 8.26s
6:	learn: 0.5534491	total: 636ms	remaining: 8.45s
7:	learn: 0.5396022	total: 784ms	remaining: 9.01s
8:	learn: 0.5266381	total: 962ms	remaining: 9.73s
9:	learn: 0.5149189	total: 1.07s	remaining: 9.63s
10:	learn: 0.5039386	total: 1.21s	remaining: 9.76s
11:	learn: 0.4940658	total: 1.34s	remaining: 9.84s
12:	learn: 0.4849815	total: 1.44s	remaining: 9.62s
13:	learn: 0.4781763	total: 1.65s	remaining: 10.1s
14:	learn: 0.4700477	total: 1.74s	remaining: 9.85s
15:	learn: 0.4627553	total: 1.9s	remaining: 9.97s
16:	learn: 0.4560515	total: 2s	remaining: 9.76s
17:	learn: 0.4492482	total: 2.1s	remaining: 9.58s
18:	learn: 0.4426376	total: 2.2s	remaining: 9.38s
19:	learn: 0.4359882	total: 2.3s	remaining: 9.

In [28]:
print('After running Grid Search;')
print('The Best Parameters are:', cbc_grid.best_params_)
print('The Best Score is:', cbc_grid.best_score_)

After running Grid Search;
The Best Parameters are: {'depth': 10, 'iterations': 100, 'learning_rate': 0.04}
The Best Score is: 0.8426150121065377


The Accuracy score is slightly better after Hyperparameter Tuning, but not much. Even an increase of 0.5% in accuracy after Hyperparameter Tuning is quite good. Keeping in mind the imbalance of the dataset, more or less the model seems to be good fit. But before deployment, let's perform Principal Component Analysis (PCA) which will help in simplifying the complexity in high-dimensional data while retaining trends and patterns. It does this by transforming the data into fewer dimensions, which acts as summaries of features.

In [29]:
model_cbc = CatBoostClassifier(iterations = 100, learning_rate = 0.04, max_depth = 10, verbose = False,train_dir="./Model_Processing/selected_model/catboost_info")
model_cbc.fit(x_train_smote, y_train_smote)

<catboost.core.CatBoostClassifier at 0x2931a1cd910>

## 6- Performing PCA

In [30]:
pca = PCA()
x_train_pca = pca.fit_transform(x_train_smote)
x_test_pca = pca.transform(x_test_encode)

In [31]:
model_cbc_pca = CatBoostClassifier(iterations = 100, learning_rate = 0.04, max_depth = 10, verbose = False, train_dir="./Model_Processing/selected_model/catboost_info")
model_cbc_pca.fit(x_train_pca, y_train_smote)

<catboost.core.CatBoostClassifier at 0x2931a40efd0>

In [32]:
y_pred = model_cbc_pca.predict(x_train_pca)

In [33]:
model_score_pca = model_cbc_pca.score(x_test_pca, y_test_encode)
print(model_score_pca)

0.7633262260127932


Even after performing Principal Component Analysis (PCA) to reduce the noise and dimensionality, we couldn't see any better results.<br>
So, let's finalize the model created by CatBoost Classifier as it is the best performer.

## 7- Pickle the model

In [34]:
with open('Churnlytical_Model.sav', 'wb') as f:
    pickle.dump(model_cbc, f)

In [35]:
with open('Churnlytical_Model.sav', 'rb') as f:
    m = pickle.load(f)

#

#### Completed 'CHURNLYTICAL_Model_Building'!

In [36]:
# Save feature names after fitting the encoder
with open('./Model_Processing/features.pkl', 'wb') as f:
    pickle.dump({'num_cols': num_cols, 'cat_cols': cat_cols}, f)


NameError: name 'num_cols' is not defined