In [1]:
import pandas as pd
import joblib
import numpy as np
import logging
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv("Data/Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
def remove_cols(df):
    '''
    Check and remove the unwanted columns
    input: df
    output: cleaned dataframe    
    '''
    Constant_Values = df.columns[df.eq(df.iloc[0]).all()].tolist()
    Duplicate_Columns = df.columns[df.T.duplicated(keep='first').T]  # Only report second column as duplicate
    
    df = df.drop(Constant_Values, axis=1)
    df = df.drop(Duplicate_Columns, axis=1)
    df = df.drop(['customerID','Dependents','PhoneService','MultipleLines', 'PaperlessBilling','PaymentMethod'], axis = 1)
    return df

df = remove_cols(df)

In [6]:
def missing_values_table(df):
    '''
    Check the missing values in the data columns
    input: df
    output: Dataframe of columns and their missing value percent    
    '''
    df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
    missing_val = df.isna().sum()
    missing_val_percent = 100 * missing_val / len(df)
    mis_val_table = pd.concat([missing_val, missing_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
    mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)

    return mis_val_table_ren_columns

missing_values = missing_values_table(df)
missing_values

Unnamed: 0,Missing Values,% of Total Values
TotalCharges,11,0.2


In [9]:
train_features = df.drop(columns=['Churn'])
label = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(train_features, label, test_size=0.20, random_state=42)

cat_cols = list(X_train.select_dtypes('object').columns)
num_cols = list(X_train.select_dtypes('number').columns)

In [10]:
print(cat_cols)
print(num_cols)

['gender', 'Partner', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'TotalCharges']
['SeniorCitizen', 'tenure', 'MonthlyCharges']


In [11]:
num_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                  ('scaler', StandardScaler())])

cat_transform = Pipeline(steps=[('onehotenc', OneHotEncoder(handle_unknown='ignore', sparse=False))
                                         ])
col_transformer = ColumnTransformer(transformers=[  ('num_transform',num_transform, num_cols),
                                                    ('cat_transform', cat_transform, num_cols)
                                                  ], remainder='drop')

In [12]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 700, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
min_samples_split = [2, 5, 10]
bootstrap = [True, False]
class_weight = ['balanced']
criterion =  ['gini', 'entropy'] 


In [13]:
def build_model():
    '''
    Machine Learning classification model function that executes following steps:
      1. Building Machine Learning pipeline
      2. Running GridSearchCV for Hyper-parameter tunning
      
      input: None
    output: RandomSearch best model.
    '''
    pipeline_clf = Pipeline([
                     ('transform_column', col_transformer),
                     ('clf', RandomForestClassifier(random_state=42))
                     ]) 

# find best model from random search

    param_grid = {'clf__n_estimators': n_estimators,               
                'clf__max_depth': max_depth,
                'clf__min_samples_split': min_samples_split,
                'clf__bootstrap': bootstrap,
                'clf__criterion': criterion,
                'clf__class_weight': class_weight }

    best_clf = RandomizedSearchCV(pipeline_clf, param_distributions= param_grid, verbose=5, n_jobs=-1, scoring= 'accuracy', cv = 3, refit=True, return_train_score=True)
    return best_clf

In [14]:
model = build_model()

In [15]:
model.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('transform_column',
                                              ColumnTransformer(transformers=[('num_transform',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['SeniorCitizen',
                                                                                'tenure',
                                                                                'MonthlyCharges']),
                                                                              ('ca

In [16]:
model.best_params_

{'clf__n_estimators': 500,
 'clf__min_samples_split': 2,
 'clf__max_depth': 110,
 'clf__criterion': 'entropy',
 'clf__class_weight': 'balanced',
 'clf__bootstrap': True}

In [19]:
model_1 = build_model()

In [21]:
model_1.fit(X_train, y_train)

Pipeline(steps=[('transform_column',
                 ColumnTransformer(transformers=[('num_transform',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['SeniorCitizen', 'tenure',
                                                   'MonthlyCharges']),
                                                 ('cat_transform',
                                                  Pipeline(steps=[('onehotenc',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['SeniorCitizen',

In [22]:
y_pred = model.predict(X_test)

In [23]:
y_pred_1 = model_1.predict(X_test)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred, labels=['No','Yes']))

              precision    recall  f1-score   support

          No       0.85      0.86      0.86      1036
         Yes       0.60      0.59      0.59       373

    accuracy                           0.79      1409
   macro avg       0.73      0.72      0.73      1409
weighted avg       0.79      0.79      0.79      1409



In [25]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_1, labels=['No','Yes']))

              precision    recall  f1-score   support

          No       0.85      0.86      0.86      1036
         Yes       0.60      0.59      0.59       373

    accuracy                           0.79      1409
   macro avg       0.73      0.72      0.73      1409
weighted avg       0.79      0.79      0.79      1409



In [26]:
import shap

In [None]:
shap.i