- Import the required libraries and modules that you would need.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

- Read that data into Python and call the dataframe churnData.

In [None]:
def load_data(path="Data/DATA_Customer-Churn.csv"):
    """Read the customer-churn CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default "Data/DATA_Customer-Churn.csv"
        Location of the CSV file. The default is the notebook-relative path
        that used to be hard-coded, so ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    churnData = pd.read_csv(path)
    return churnData
churnData = load_data()

In [None]:
churnData

- Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type. Convert this column into numeric type using pd.to_numeric function.

In [None]:
churnData.info()

In [None]:
def convert_to_numeric(churnData):
    """Return a copy of ``churnData`` with ``TotalCharges`` converted to numbers.

    Values that cannot be parsed become NaN (``errors='coerce'``), so they
    can be imputed in a later step.

    Parameters
    ----------
    churnData : pandas.DataFrame
        Frame containing a ``TotalCharges`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    numeric_charges = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
    # ``assign`` builds a new DataFrame, so the caller's object keeps its
    # original (unparsed) column instead of being changed as a side effect.
    return churnData.assign(TotalCharges=numeric_charges)
churnData = convert_to_numeric(churnData)

In [None]:
churnData.info()

- Check for null values in the dataframe. Replace the null values.

In [None]:
round(churnData.isna().sum()/len(churnData),4)*100

In [None]:
def replace_null_values(churnData):
    """Return a copy of ``churnData`` with missing ``TotalCharges`` imputed.

    Missing values are replaced by the mean of the non-missing
    ``TotalCharges`` values.

    Parameters
    ----------
    churnData : pandas.DataFrame
        Frame containing a numeric ``TotalCharges`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Series.mean() skips NaN by default, so the mean is computed from the
    # observed values only.
    mean_TotalCharges = churnData['TotalCharges'].mean()
    filled = churnData['TotalCharges'].fillna(mean_TotalCharges)
    # ``assign`` returns a new DataFrame rather than writing into the input.
    return churnData.assign(TotalCharges=filled)
churnData = replace_null_values(churnData)

In [None]:
round(churnData.isna().sum()/len(churnData),4)*100

- Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

In [None]:
def drop_columns(churnData):
    """Return ``churnData`` without the columns that are not used for modelling.

    The input frame is left as it is; a new frame is returned. A ``KeyError``
    is raised by pandas if any of the listed columns is absent.
    """
    unused_columns = [
        'gender', 'Partner', 'Dependents',
        'PhoneService', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV',
        'StreamingMovies', 'Contract',
    ]
    return churnData.drop(columns=unused_columns)
churnData_features = drop_columns(churnData)

In [None]:
churnData_features

In [None]:
def hot_coding_categorical_variables(churnData_features):
    """Append a dummy-encoded version of the ``Churn`` column to the frame.

    ``Churn`` is encoded with ``pd.get_dummies(..., drop_first=True)``; the
    resulting column(s) (``Churn_Yes`` for a Yes/No target) are concatenated
    to the right of the input columns. The original ``Churn`` column is kept.

    Parameters
    ----------
    churnData_features : pandas.DataFrame
        Frame containing a ``Churn`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded column(s) added.
    """
    # Printed for inspection only; the encoding below does not depend on it.
    categoricals_features = churnData_features.select_dtypes("object")
    print(categoricals_features)

    # Encode Churn straight from the input frame so the result does not
    # depend on which dtype pandas inferred for the column. dtype=int gives
    # plain 0/1 integers on every pandas version (the default is bool on
    # pandas >= 2.0), which is what the later `== 0` / `== 1` filters expect.
    churn_dummies = pd.get_dummies(
        churnData_features[['Churn']], drop_first=True, dtype=int
    )
    print(churn_dummies)

    #adding categorical features
    return pd.concat([churnData_features, churn_dummies], axis=1)
churnData_features = hot_coding_categorical_variables(churnData_features)

In [None]:
churnData_features

In [None]:
def define_x_y(churnData_features):
    """Split the frame into the feature matrix ``X`` and the target ``y``.

    ``y`` is the ``Churn_Yes`` column; ``X`` is every other column except the
    raw ``Churn`` label.
    """
    target_name = 'Churn_Yes'
    y = churnData_features[target_name]
    X = churnData_features.drop(columns=['Churn', target_name])
    return X, y
X,y = define_x_y(churnData_features)

- Split the data into a training set and a test set.

In [None]:
def data_splitting(X,y):
    """Split ``X`` and ``y`` into a 70 % training part and a 30 % test part.

    The split uses ``random_state=0``. Both feature parts are returned as
    DataFrames carrying the column names of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_names = X.columns
    parts = train_test_split(X, y, test_size=0.30, random_state=0)
    X_train, X_test, y_train, y_test = parts

    # Re-wrap the feature parts so they are DataFrames labelled with the
    # feature names.
    X_train = pd.DataFrame(X_train, columns=feature_names)
    X_test = pd.DataFrame(X_test, columns=feature_names)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = data_splitting(X,y)

- Scale the features using either a normalizer or a standard scaler.

- Fit a logistic Regression model on the training data.

In [None]:
def logistic_regression_model(X_train, X_test):
    """Fit a logistic regression on power-transformed features and report it.

    The transformer is fitted on ``X_train`` only and applied to both sets.
    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(performance_log, y_pred_train_log, y_pred_test_log)``.
    """
    # With its defaults PowerTransformer applies a Yeo-Johnson power transform
    # and then standardises the result, so it is more than a plain
    # StandardScaler.
    scaler = PowerTransformer().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    log_model = LogisticRegression().fit(train_scaled, y_train)

    y_pred_train_log = log_model.predict(train_scaled)
    y_pred_test_log = log_model.predict(test_scaled)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_log = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_log) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_log) for fn in metric_fns],
    })
    display(performance_log)

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # this cell needs an older release to run.
    reports = (
        ("Confusion matrix for the train set", train_scaled, y_train, y_pred_train_log),
        ("Confusion matrix for the test set", test_scaled, y_test, y_pred_test_log),
    )
    for heading, features, truth, predicted in reports:
        print()
        print(heading)
        print(confusion_matrix(truth, predicted))
        plot_confusion_matrix(log_model, features, truth, values_format='d')
        plt.show()

    return performance_log, y_pred_train_log, y_pred_test_log
performance_log, y_pred_train_log, y_pred_test_log = logistic_regression_model(X_train, X_test)

- Fit a Knn Classifier model on the training data

In [None]:
def knn_classifier_model (X_train, X_test):
    """Fit a default K-nearest-neighbours classifier on power-transformed features.

    The transformer is fitted on ``X_train`` only and applied to both sets.
    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(performance_knn, neigh)`` - the metrics table and the fitted model.
    """
    scaler = PowerTransformer().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    # KNeighborsClassifier with its default settings.
    neigh = KNeighborsClassifier().fit(train_scaled, y_train)

    y_pred_train_knn = neigh.predict(train_scaled)
    y_pred_test_knn = neigh.predict(test_scaled)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_knn = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_knn) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_knn) for fn in metric_fns],
    })
    display(performance_knn)

    reports = (
        ("Confusion matrix for the train set", train_scaled, y_train, y_pred_train_knn),
        ("Confusion matrix for the test set", test_scaled, y_test, y_pred_test_knn),
    )
    for heading, features, truth, predicted in reports:
        print()
        print(heading)
        print(confusion_matrix(truth, predicted))
        plot_confusion_matrix(neigh, features, truth, values_format='d')
        plt.show()

    return performance_knn, neigh
performance_knn, neigh= knn_classifier_model(X_train, X_test)

- Fit a Decision Tree Classifier on the training data.

- Check the accuracy on the test data.

In [None]:
def decision_tree_classifier_model (X_train, X_test):
    """Fit a depth-5 decision tree classifier on the raw features and report it.

    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(dt, performance_df, y_pred_train_dt, y_pred_test_dt)``.
    """
    # sklearn has separate estimators for trees: DecisionTreeClassifier is the
    # one for predicting a categorical target.
    dt = DecisionTreeClassifier(max_depth=5)

    dt.fit(X_train, y_train)

    y_pred_train_dt = dt.predict(X_train)
    y_pred_test_dt = dt.predict(X_test)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_df = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_dt) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_dt) for fn in metric_fns],
    })
    display(performance_df)

    # The matrices are printed in sklearn's native orientation (rows = true
    # class, columns = predicted class). They used to be transposed, which
    # made the printed numbers disagree with the plot drawn right below and
    # with the other model-report functions in this notebook.
    print("Confusion matrix for the train set")
    print(confusion_matrix(y_train, y_pred_train_dt))
    plot_confusion_matrix(dt, X_train, y_train, values_format='d')
    plt.show()

    print()

    print("Confusion matrix for the test set")
    print(confusion_matrix(y_test, y_pred_test_dt))
    plot_confusion_matrix(dt, X_test, y_test, values_format='d')
    plt.show()

    return dt, performance_df, y_pred_train_dt, y_pred_test_dt
dt, performance_df, y_pred_train_dt, y_pred_test_dt = decision_tree_classifier_model (X_train, X_test)   

In [None]:
# Draw the fitted decision tree on a single 34x20 inch figure.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(34, 20))

# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here; ax= ties the drawing to the axes created above.
plot_tree(dt, filled=True, rounded=True, feature_names=list(X.columns), ax=axes)
plt.show()

- Apply K-fold cross-validation to the models fitted above and check each model's score. Note: so far we have not balanced the data.

In [None]:
#Models Comparison
# Mean 5-fold cross-validated score of each model on the training split.
from sklearn.pipeline import make_pipeline

model1 = DecisionTreeClassifier(max_depth=5)

# Logistic regression and KNN were fitted on power-transformed features
# above, so they are cross-validated the same way. Putting the transformer in
# a pipeline re-fits it inside every fold, so no information from the
# validation fold leaks into the scaling.
model2 = make_pipeline(PowerTransformer(), LogisticRegression())

model3 = make_pipeline(PowerTransformer(), KNeighborsClassifier())

model_pipeline = [model1, model2, model3]
# model1 is a classifier, so it is labelled 'Decision Tree' rather than
# 'Regression Tree'.
model_names = ['Decision Tree', 'Logistic Regression', 'KNN']
scores = {}
for name, model in zip(model_names, model_pipeline):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[name] = mean_score
print(scores)

Managing imbalance in the dataset
Check for class imbalance. Use the resampling strategies covered in class (downsampling and upsampling) to balance the two classes. After each strategy, refit the models and compare their accuracy with the results on the unbalanced data.

# Downsampling

In [None]:
category_0 = churnData_features[churnData_features['Churn_Yes'] == 0]
category_1 = churnData_features[churnData_features['Churn_Yes'] == 1]

In [None]:
print(category_0.shape)
print(category_1.shape)

In [None]:
# Draw as many class-0 rows (without replacement) as there are class-1 rows.
# random_state makes the draw reproducible between notebook runs, matching
# the fixed seed used for the train/test split.
category_0_down = category_0.sample(len(category_1), random_state=0)
print(category_0_down.shape)
print(category_1.shape)

In [None]:
# Keep the downsampled data under its own name. Overwriting
# churnData_features here would make the upsampling section below start from
# the already-balanced frame instead of the full data set.
churnData_downsampled = pd.concat([category_0_down, category_1], axis=0)
#shuffling the data
churnData_downsampled = churnData_downsampled.sample(frac=1, random_state=0)
churnData_downsampled['Churn_Yes'].value_counts()

# Upsampling

In [None]:
category_0 = churnData_features[churnData_features['Churn_Yes'] == 0]
category_1 = churnData_features[churnData_features['Churn_Yes'] == 1]

In [None]:
# Draw class-1 rows with replacement until there are as many as class-0 rows.
# random_state makes the draw reproducible between notebook runs.
category_1_up = category_1.sample(len(category_0), replace=True, random_state=0)
print(category_1_up.shape)

In [None]:
category_1_up

In [None]:
category_1_up.shape

In [None]:
# Keep the upsampled data under its own name so churnData_features is not
# replaced by a resampled copy.
churnData_upsampled = pd.concat([category_0, category_1_up], axis=0)
#shuffling the data
churnData_upsampled = churnData_upsampled.sample(frac=1, random_state=0)
churnData_upsampled['Churn_Yes'].value_counts()

# Upsampling using SMOTE

In [None]:
# Fixed seed so the synthetic samples are the same on every run.
smote = SMOTE(random_state=0)

In [None]:
y.value_counts()

In [None]:
# Oversample the training split only: the test split must keep the real class
# distribution, and synthetic points must not be built from test rows.
X_sm, y_sm = smote.fit_resample(X_train, y_train)
# The model functions below are called with X_train and read the module-level
# y_train, so rebind both names to the balanced data. Without this the models
# would simply be re-fitted on the unbalanced training set.
X_train, y_train = X_sm, y_sm
y_sm.value_counts()

In [None]:
def logistic_regression_model(X_train, X_test):
    """Fit a logistic regression on power-transformed features and report it.

    The transformer is fitted on ``X_train`` only and applied to both sets.
    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(performance_log, y_pred_train_log, y_pred_test_log)``.
    """
    # With its defaults PowerTransformer applies a Yeo-Johnson power transform
    # and then standardises the result, so it is more than a plain
    # StandardScaler.
    scaler = PowerTransformer().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    log_model = LogisticRegression().fit(train_scaled, y_train)

    y_pred_train_log = log_model.predict(train_scaled)
    y_pred_test_log = log_model.predict(test_scaled)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_log = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_log) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_log) for fn in metric_fns],
    })
    display(performance_log)

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # this cell needs an older release to run.
    reports = (
        ("Confusion matrix for the train set", train_scaled, y_train, y_pred_train_log),
        ("Confusion matrix for the test set", test_scaled, y_test, y_pred_test_log),
    )
    for heading, features, truth, predicted in reports:
        print()
        print(heading)
        print(confusion_matrix(truth, predicted))
        plot_confusion_matrix(log_model, features, truth, values_format='d')
        plt.show()

    return performance_log, y_pred_train_log, y_pred_test_log
performance_log, y_pred_train_log, y_pred_test_log = logistic_regression_model(X_train, X_test)

In [None]:
def knn_classifier_model (X_train, X_test):
    """Fit a default K-nearest-neighbours classifier on power-transformed features.

    The transformer is fitted on ``X_train`` only and applied to both sets.
    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(performance_knn, neigh)`` - the metrics table and the fitted model.
    """
    scaler = PowerTransformer().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    # KNeighborsClassifier with its default settings.
    neigh = KNeighborsClassifier().fit(train_scaled, y_train)

    y_pred_train_knn = neigh.predict(train_scaled)
    y_pred_test_knn = neigh.predict(test_scaled)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_knn = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_knn) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_knn) for fn in metric_fns],
    })
    display(performance_knn)

    reports = (
        ("Confusion matrix for the train set", train_scaled, y_train, y_pred_train_knn),
        ("Confusion matrix for the test set", test_scaled, y_test, y_pred_test_knn),
    )
    for heading, features, truth, predicted in reports:
        print()
        print(heading)
        print(confusion_matrix(truth, predicted))
        plot_confusion_matrix(neigh, features, truth, values_format='d')
        plt.show()

    return performance_knn, neigh
performance_knn, neigh= knn_classifier_model(X_train, X_test)

In [None]:
def decision_tree_classifier_model (X_train, X_test):
    """Fit a depth-5 decision tree classifier on the raw features and report it.

    Accuracy, precision and recall are shown for train and test, followed by
    the printed and plotted confusion matrices.

    The targets are not parameters: the function reads the module-level
    ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(dt, performance_df, y_pred_train_dt, y_pred_test_dt)``.
    """
    # sklearn has separate estimators for trees: DecisionTreeClassifier is the
    # one for predicting a categorical target.
    dt = DecisionTreeClassifier(max_depth=5)

    dt.fit(X_train, y_train)

    y_pred_train_dt = dt.predict(X_train)
    y_pred_test_dt = dt.predict(X_test)

    metric_fns = (accuracy_score, precision_score, recall_score)
    performance_df = pd.DataFrame({
        'Error_metric': ['Accuracy', 'Precision', 'Recall'],
        'Train': [fn(y_train, y_pred_train_dt) for fn in metric_fns],
        'Test': [fn(y_test, y_pred_test_dt) for fn in metric_fns],
    })
    display(performance_df)

    # The matrices are printed in sklearn's native orientation (rows = true
    # class, columns = predicted class). They used to be transposed, which
    # made the printed numbers disagree with the plot drawn right below and
    # with the other model-report functions in this notebook.
    print("Confusion matrix for the train set")
    print(confusion_matrix(y_train, y_pred_train_dt))
    plot_confusion_matrix(dt, X_train, y_train, values_format='d')
    plt.show()

    print()

    print("Confusion matrix for the test set")
    print(confusion_matrix(y_test, y_pred_test_dt))
    plot_confusion_matrix(dt, X_test, y_test, values_format='d')
    plt.show()

    return dt, performance_df, y_pred_train_dt, y_pred_test_dt
dt, performance_df, y_pred_train_dt, y_pred_test_dt = decision_tree_classifier_model (X_train, X_test)   

In [None]:
# Draw the fitted decision tree on a single 34x20 inch figure.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(34, 20))

# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here; ax= ties the drawing to the axes created above.
plot_tree(dt, filled=True, rounded=True, feature_names=list(X.columns), ax=axes)
plt.show()

In [None]:
# Mean accuracy of the fitted tree on the test split.
dt.score(X_test, y_test)

- Check the accuracy on the test data.

In [None]:
y

In [None]:
set(y)

In [None]:
# 5-fold cross-validation of the tree on the training split. cross_val_score
# fits clones of `dt`, so the tree fitted above is not modified.
scores=cross_val_score(dt, X_train, y_train, cv=5)
scores

In [None]:
# Summarise the fold scores as their mean and standard deviation.
print("%0.9f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

In [None]:
y_test

In [None]:
# NOTE(review): cross_val_predict refits clones of `dt` on folds of the test
# split, so these are out-of-fold predictions from freshly trained trees, not
# predictions made by the tree that was fitted on the training data.
y_pred = cross_val_predict(dt, X_test, y_test, cv=5)
y_pred

In [None]:
#Models Comparison
# Mean 5-fold cross-validated score of each model on the training split.
from sklearn.pipeline import make_pipeline

model1 = DecisionTreeClassifier(max_depth=5)

# Logistic regression and KNN are fitted on power-transformed features in
# this notebook, so they are cross-validated the same way. Putting the
# transformer in a pipeline re-fits it inside every fold, so no information
# from the validation fold leaks into the scaling.
model2 = make_pipeline(PowerTransformer(), LogisticRegression())

model3 = make_pipeline(PowerTransformer(), KNeighborsClassifier())

model_pipeline = [model1, model2, model3]
# model1 is a classifier, so it is labelled 'Decision Tree' rather than
# 'Regression Tree'.
model_names = ['Decision Tree', 'Logistic Regression', 'KNN']
scores = {}
for name, model in zip(model_names, model_pipeline):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[name] = mean_score
print(scores)