## Importing necessary libraries

In [3]:
# standard library
import pickle
import warnings

import numpy as np
import pandas as pd

# preprocessing tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the churn dataset and preview the first rows.
# NOTE(review): '/tel_churn.csv' is an absolute path at the filesystem root —
# confirm it exists in the environment this notebook is run in.
df = pd.read_csv('/tel_churn.csv')
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,No,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 - 12
1,Male,No,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25 - 36
2,Male,No,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 - 12
3,Male,No,No,No,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37 - 48
4,Female,No,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 - 12


## Data Pre-processing

In [5]:
def preprocess_inputs(df):
    """Split, scale and encode the churn dataset for modelling.

    The frame is split 80/20 (stratified on ``Churn``, ``random_state=100``)
    before any transformer is fitted, so the scaler and the encoders are
    fitted on training rows only and merely applied to the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing a ``Churn`` target column. Feature columns with
        ``object`` dtype are treated as categorical, all others as numerical.

    Returns
    -------
    tuple
        ``(x_train_encode, x_test_encode, y_train_encode, y_test_encode)``:
        two DataFrames (scaled numerical columns followed by the one-hot
        encoded columns) and two label-encoded target arrays.

    Side effects
    ------------
    Pickles the fitted transformers into the current working directory as
    ``StandardScaler``, ``Label-Encoder`` and ``One-Hot-Encoder``.
    """
    df = df.copy()

    # Splitting the dataset into X and y
    X = df.drop('Churn', axis=1)
    y = df['Churn']

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=100
    )
    # Work on explicit copies: the column assignments below must not write
    # into slices of X.
    x_train = x_train.copy()
    x_test = x_test.copy()

    # Column groups are derived from the feature frame only, so the target can
    # never end up among the columns to scale, whatever its dtype.
    cat_cols = [col for col in X.columns if X[col].dtype == 'object']
    num_cols = [col for col in X.columns if X[col].dtype != 'object']

    # standard scaler for numerical columns
    ss = StandardScaler()
    x_train[num_cols] = ss.fit_transform(x_train[num_cols])
    x_test[num_cols] = ss.transform(x_test[num_cols])

    with open("StandardScaler", "wb") as f:
        pickle.dump(ss, f)

    # label encoder for target variable
    le = LabelEncoder()
    y_train_encode = le.fit_transform(y_train)
    y_test_encode = le.transform(y_test)

    with open('Label-Encoder', 'wb') as f:
        pickle.dump(le, f)

    # one-hot encoding for categorical features
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`; older
    # releases only know `sparse` and reject the new keyword with a TypeError.
    try:
        ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
    except TypeError:
        ohe = OneHotEncoder(sparse=False, drop='if_binary')
    x_train_ohe = ohe.fit_transform(x_train[cat_cols])
    x_test_ohe = ohe.transform(x_test[cat_cols])

    with open("One-Hot-Encoder", "wb") as f:
        pickle.dump(ohe, f)

    # assigning right column names to cat cols after one-hot encoding
    # (`get_feature_names` was replaced by `get_feature_names_out`)
    if hasattr(ohe, 'get_feature_names_out'):
        col_ohe = ohe.get_feature_names_out(cat_cols)
    else:
        col_ohe = ohe.get_feature_names(cat_cols)

    # create df for one hot encoded features, aligned on the original row index
    x_train_ohe_df = pd.DataFrame(x_train_ohe, columns=col_ohe, index=x_train.index)
    x_test_ohe_df = pd.DataFrame(x_test_ohe, columns=col_ohe, index=x_test.index)

    # combine the numerical and encoded features
    x_train_encode = pd.concat([x_train.drop(columns=cat_cols), x_train_ohe_df], axis=1)
    x_test_encode = pd.concat([x_test.drop(columns=cat_cols), x_test_ohe_df], axis=1)

    return x_train_encode, x_test_encode, y_train_encode, y_test_encode

In [None]:
# Build the scaled / one-hot-encoded train and test splits
# (the call also writes the fitted transformers to disk).
x_train_encode, x_test_encode, y_train_encode, y_test_encode = preprocess_inputs(df)

## Model Building

In [7]:
# Decision tree baseline: Gini criterion, depth capped at 6,
# at least 8 samples per leaf, fixed seed for reproducibility.
model_dt = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [8]:
# Fit the decision tree on the encoded training split.
model_dt.fit(x_train_encode, y_train_encode)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [9]:
# Predict class labels for the held-out test split.
y_pred = model_dt.predict(x_test_encode)

In [10]:
# Confusion matrix of true vs. predicted test labels.
print(confusion_matrix(y_test_encode, y_pred))

[[901 132]
 [194 180]]


In [11]:
# Per-class precision / recall / F1 for the decision tree on the test split.
print(classification_report(y_test_encode, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1033
           1       0.58      0.48      0.52       374

    accuracy                           0.77      1407
   macro avg       0.70      0.68      0.69      1407
weighted avg       0.76      0.77      0.76      1407



In [12]:
# Number of training labels.
y_train_encode.shape

(5625,)

In [13]:
# Training samples per encoded class label (shows the class imbalance).
print(np.bincount(y_train_encode))

[4130 1495]


- The accuracy is fairly low, and because the dataset is quite imbalanced, accuracy is not a reliable metric here anyway: it is dominated by the majority class.
- Hence, it is essential to check recall, precision and F1 score for the minority class, and it is evident that these scores are low for the minority class, i.e. churned customers.
- Hence, we shall balance the classes of the training set using SMOTE (over-sampling).
- But first, let's try other classifiers as well before over-sampling.

In [14]:
def get_score(y_pred_list, y_test, plot=True, axis=0, cmap='Blues'):
    """Tabulate classification metrics for several models' predictions.

    Parameters
    ----------
    y_pred_list : dict
        Maps a model name to that model's predicted labels for the test set.
    y_test : array-like
        True labels that every prediction is scored against.
    plot : bool, default True
        When true, display the table with a background colour gradient.
    axis : int, default 0
        Axis forwarded to ``Styler.background_gradient``.
    cmap : str, default 'Blues'
        Colour map forwarded to ``Styler.background_gradient``.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by ``model``) with the columns
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and ``roc_auc``.
        An empty ``y_pred_list`` yields an empty frame with the same columns.
    """
    metric_fns = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
        # NOTE: computed from the predictions passed in (hard labels in this
        # notebook), not from predicted probabilities or decision scores.
        'roc_auc': roc_auc_score,
    }

    # Assemble the table once, after all predictions are known; building it
    # inside the per-model loop left it undefined for an empty input.
    score_list = {'model': list(y_pred_list)}
    for metric_name, metric_fn in metric_fns.items():
        score_list[metric_name] = [
            metric_fn(y_test, y_pred) for y_pred in y_pred_list.values()
        ]

    score_df = pd.DataFrame(score_list).set_index('model')

    if plot:
        display(score_df.style.background_gradient(axis=axis, cmap=cmap))

    return score_df

In [15]:
# Candidate classifiers keyed by display name; wherever a seed is passed it is 100.
model_list = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=100),
    'Ridge Classifier': RidgeClassifier(random_state=100),
    'Decision Tree': DecisionTreeClassifier(random_state=100),
    'Random Forest': RandomForestClassifier(random_state=100),
    'KNN': KNeighborsClassifier(),
    'SVC': SVC(random_state=100),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=100),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=100),
    'AdaBoost Classifier': AdaBoostClassifier(random_state=100),
    'CatBoost Classifier': CatBoostClassifier(random_state=100, verbose=False),
    'XGBoost': XGBClassifier(random_state=100, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=100),
}

In [16]:
# Baseline: fit every candidate on the original (imbalanced) training split
# and collect its predictions for the test split.
y_pred_list = dict()

for name, model in model_list.items():
    model.fit(x_train_encode, y_train_encode)
    y_pred_list[name] = model.predict(x_test_encode)

# Stored under its own name: these are the pre-SMOTE scores, and the SMOTE run
# further down assigns `score_smote`, which would otherwise overwrite them.
score_baseline = get_score(y_pred_list, y_test_encode)

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.79602,0.654804,0.491979,0.561832,0.699039
Ridge Classifier,0.797441,0.673152,0.462567,0.548336,0.690625
Decision Tree,0.742004,0.515152,0.5,0.507463,0.664811
Random Forest,0.783227,0.628253,0.451872,0.525661,0.677533
KNN,0.768301,0.575472,0.489305,0.528902,0.679309
SVC,0.800995,0.680769,0.473262,0.55836,0.696457
Neural Network,0.743426,0.52,0.451872,0.483548,0.650428
Gradient Boosting Classifier,0.797441,0.644951,0.529412,0.581498,0.711947
AdaBoost Classifier,0.799574,0.639394,0.564171,0.599432,0.724486
CatBoost Classifier,0.789623,0.630872,0.502674,0.559524,0.698094


There is no drastic difference in scores between the classifiers. Now let's try an over-sampling technique to balance the classes in the training set.

## Over-Sampling

In [17]:
# Class balance of the training labels before resampling
print("Before OverSampling - counts of label '0': {}".format(np.count_nonzero(y_train_encode == 0)))
print("Before OverSampling - counts of label '1': {} \n".format(np.count_nonzero(y_train_encode == 1)))

# Synthesize minority-class samples on the training split only
sm = SMOTE(random_state=100)
x_train_smote, y_train_smote = sm.fit_resample(x_train_encode, y_train_encode.ravel())

# Shapes and class balance after resampling
print('After OverSampling with SMOTE - x_train: {}'.format(x_train_smote.shape))
print('After OverSampling with SMOTE - y_train: {} \n'.format(y_train_smote.shape))

print("After OverSampling with SMOTE - counts of label '0': {}".format(np.count_nonzero(y_train_smote == 0)))
print("After OverSampling with SMOTE - counts of label '1': {}".format(np.count_nonzero(y_train_smote == 1)))

Before OverSampling - counts of label '0': 4130
Before OverSampling - counts of label '1': 1495 

After OverSampling with SMOTE - x_train: (8260, 31)
After OverSampling with SMOTE - y_train: (8260,) 

After OverSampling with SMOTE - counts of label '0': 4130
After OverSampling with SMOTE - counts of label '1': 4130


Over-sampling is applied only to the training set. Resampling the test set (or resampling before the split) would let synthetic points derived from test rows leak into training and would distort the evaluation, so the test set keeps its original class distribution and gives an honest estimate of how the model generalizes.

After over-sampling, now let's train the model using Support Vector Machine classifier.

In [18]:
# Support vector classifier (default settings) trained on the SMOTE-balanced data.
# NOTE(review): despite the `lr` in its name, this variable holds an SVC, not a
# logistic regression; it is reused by the prediction cell below.
model_lr_smote = SVC()
model_lr_smote.fit(x_train_smote, y_train_smote)

SVC()

In [19]:
# Predict on the untouched (non-resampled) test split.
y_pred = model_lr_smote.predict(x_test_encode)

In [20]:
# Test accuracy of the SMOTE-trained SVC. Arguments follow the sklearn
# (y_true, y_pred) order used elsewhere in this notebook; the value is unchanged
# because accuracy is symmetric in its two arguments.
print(accuracy_score(y_test_encode, y_pred))

0.746268656716418


In [21]:
# Per-class precision / recall / F1 for the SMOTE-trained SVC on the test split.
print(classification_report(y_test_encode, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      1033
           1       0.52      0.73      0.60       374

    accuracy                           0.75      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.79      0.75      0.76      1407



- After over-sampling, the overall scores do not change much.
- The minority class scores somewhat better than before (for SVC, recall rises from 0.47 to 0.73), but precision stays low; even after over-sampling, it seems we need more real samples of the minority class for the model to perform better.<br>
Now let's try the other classifiers as well and pick the best one.

In [22]:
# Refit every candidate on the SMOTE-balanced training data and
# score it on the untouched test split.
y_pred_list = {}

for name, model in model_list.items():
    model.fit(x_train_smote, y_train_smote)
    y_pred_list[name] = model.predict(x_test_encode)

score_smote = get_score(y_pred_list, y_test_encode)

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.753376,0.523643,0.799465,0.632804,0.768077
Ridge Classifier,0.752665,0.522109,0.820856,0.638254,0.774416
Decision Tree,0.71855,0.47343,0.524064,0.497462,0.656514
Random Forest,0.778252,0.591716,0.534759,0.561798,0.700584
KNN,0.700071,0.460396,0.745989,0.569388,0.714718
SVC,0.746269,0.516129,0.727273,0.603774,0.740209
Neural Network,0.734897,0.501188,0.564171,0.530818,0.68044
Gradient Boosting Classifier,0.781805,0.574279,0.692513,0.627879,0.753323
AdaBoost Classifier,0.764748,0.540952,0.759358,0.631813,0.763029
CatBoost Classifier,0.793177,0.61186,0.606952,0.609396,0.733776


In [23]:
# Test accuracy of each SMOTE-trained model. Arguments are passed in the
# sklearn (y_true, y_pred) order; a dedicated loop variable keeps the global
# `y_pred` from being overwritten.
for name, preds in y_pred_list.items():
    print(name, ':', accuracy_score(y_test_encode, preds))

Logistic Regression : 0.7533759772565742
Ridge Classifier : 0.7526652452025586
Decision Tree : 0.7185501066098081
Random Forest : 0.7782515991471215
KNN : 0.7000710732054015
SVC : 0.746268656716418
Neural Network : 0.7348969438521677
Gradient Boosting Classifier : 0.7818052594171997
AdaBoost Classifier : 0.7647476901208244
CatBoost Classifier : 0.7931769722814499
XGBoost : 0.775408670931059
LightGBM : 0.7768301350390903


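The above scores are not entirely convincing, but for now the CatBoost Classifier has the highest accuracy of all the classifiers (0.793), so let's pick it as our model. Note that several other models (e.g. Logistic Regression and the Ridge Classifier) achieve higher recall and F1 on the churn class.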

In [24]:
# Final candidate: a fresh CatBoost classifier (seeded, silent training)
# fitted on the SMOTE-balanced training data.
model_cb = CatBoostClassifier(random_state=100, verbose=False)
model_cb.fit(x_train_smote, y_train_smote)

<catboost.core.CatBoostClassifier at 0x16d4d93c580>

In [25]:
# Predict on the untouched test split with the CatBoost model.
y_pred = model_cb.predict(x_test_encode)

In [26]:
# Test accuracy of the CatBoost model.
print(accuracy_score(y_test_encode, y_pred))

0.7931769722814499


## Performing PCA

In [27]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.decomposition import PCA
# No n_components is given, so scikit-learn keeps all components: the data is
# rotated onto its principal axes but its dimensionality is not reduced.
pca = PCA()

In [28]:
# Fit PCA on the SMOTE-balanced training data only, then apply the same
# projection to the test split.
x_train_pca = pca.fit_transform(x_train_smote)
x_test_pca = pca.transform(x_test_encode)

In [29]:
# Same CatBoost configuration as before, trained on the PCA-projected features.
model_cb_pca = CatBoostClassifier(random_state=100, verbose=False)
model_cb_pca.fit(x_train_pca, y_train_smote)

<catboost.core.CatBoostClassifier at 0x16d4d9537c0>

In [30]:
# Predict on the PCA-projected *test* split, so that `y_pred` keeps holding
# test-set predictions as in every other section (it previously received
# predictions for the training data).
y_pred = model_cb_pca.predict(x_test_pca)

In [31]:
# Score the PCA-based model on the projected test split
# (presumably accuracy — confirm against CatBoost's `score` documentation).
model_score_pca = model_cb_pca.score(x_test_pca, y_test_encode)
print(model_score_pca)

0.767590618336887


PCA did not improve the results (test score 0.768 with PCA versus 0.793 without it).<br>
Hence, let's finalize the model created by CatBoost Classifier.

## Pickle the model

In [32]:
import pickle

# Persist the chosen CatBoost model (trained without PCA) to 'model.sav'.
with open('model.sav', 'wb') as f:
    pickle.dump(model_cb, f)

In [33]:
# Reload the saved model to confirm that the file can be read back.
# Unpickling executes code contained in the file, so only load pickles
# from a trusted source (here: the file written in the previous cell).
with open('model.sav', 'rb') as f:
    m = pickle.load(f)