In [1]:
# Standard library
import csv
import math
from datetime import datetime

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Third-party: scikit-learn
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the churn table; the path is relative, so the CSV must be in the
# kernel's working directory.
churn_csv_path = 'telco_churn_cleaned.csv'
customer_df = pd.read_csv(churn_csv_path)

In [3]:
# Bare expression: the notebook renders the frame as this cell's output.
customer_df

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,internet_service_dsl,internet_service_fiber_optic,internet_service_no,contract_month_to_month,contract_one_year,contract_two_year,payment_method_bank_transfer,payment_method_credit_card,payment_method_electronic_check,payment_method_mailed_check
0,0,1,0,0.013889,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0.472222,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0.027778,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0.625000,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0.027778,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,0.333333,1,1,0,1,1,1,...,1,0,0,0,1,0,0,0,0,1
7039,0,1,1,1.000000,1,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7040,0,1,1,0.152778,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
7041,1,1,0,0.055556,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1


## Normalise (Scale) Data, 1.5% Threshold and Split into Train/Test

In [4]:
# A single scaler object is reused; every fit_transform call refits it on the
# slice it receives, so after this cell it holds the fit of the last slice only.
scaler = MinMaxScaler()

# Positional column slices to rescale in place. Index 3 is `tenure` in the frame
# displayed earlier; 13:15 is presumably the two charge columns shown below
# (they are hidden in the truncated display) -- TODO confirm.
# NOTE(review): the scaler is fitted on the full table, before the train/test
# split that happens further down.
for column_block in (slice(3, 4), slice(13, 15)):
    customer_df.iloc[:, column_block] = scaler.fit_transform(customer_df.iloc[:, column_block])

# Display the three continuous columns after scaling.
customer_df[["tenure","TotalCharges","MonthlyCharges"]]

Unnamed: 0,tenure,TotalCharges,MonthlyCharges
0,0.013889,0.001275,0.115423
1,0.472222,0.215867,0.385075
2,0.027778,0.010310,0.354229
3,0.625000,0.210241,0.239303
4,0.027778,0.015330,0.521891
...,...,...,...
7038,0.333333,0.227521,0.662189
7039,1.000000,0.847461,0.845274
7040,0.152778,0.037809,0.112935
7041,0.055556,0.033210,0.558706


## 2.5% Threshold

In [5]:
# Columns removed to build the feature matrix: the target itself plus the
# features excluded for this run (presumably those under the threshold named in
# the section heading -- TODO confirm).
# Written as a list with one label per line so that every label appears exactly
# once and repeated entries are easy to spot.
dropped_columns = [
    'Churn',  # target; must not remain among the features
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'internet_service_dsl',
    'internet_service_no',
    'contract_one_year',
    'contract_two_year',
    'payment_method_bank_transfer',
    'payment_method_credit_card',
    'payment_method_mailed_check',
]

# X keeps every remaining column; y is the churn label.
X = customer_df.drop(columns=dropped_columns)
y = customer_df['Churn']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# two partitions are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=424,
)

## Decision Tree Base Classifier

In [7]:
# Baseline decision tree. random_state is pinned to the same seed used for the
# train/test split and the other seeded models: scikit-learn permutes the
# candidate features at every split, so an unseeded tree can score differently
# on each run.
dt_clf = DecisionTreeClassifier(criterion='gini', random_state=424)
dt_clf.fit(X_train, y_train)

# Predicted labels for the held-out rows.
dt_predictions = dt_clf.predict(X_test)

# Actual vs. predicted labels side by side; the sample is seeded so the same
# ten rows are displayed after Restart & Run All.
dt_results_df = pd.DataFrame({'Actual': y_test, 'Predicted': dt_predictions})
dt_results_df.sample(10, random_state=424)

Unnamed: 0,Actual,Predicted
2898,0.0,0.0
1875,0.0,0.0
5835,0.0,0.0
5240,0.0,0.0
238,1.0,1.0
97,1.0,0.0
1574,0.0,0.0
1348,1.0,1.0
1954,1.0,1.0
2186,0.0,0.0


In [8]:
# 2x2 confusion matrix for the decision tree. Flattening it row by row yields
# the cells [0][0], [0][1], [1][0], [1][1], i.e. tn, fp, fn, tp.
dt_conf_matrix = confusion_matrix(y_test, dt_predictions)
dt_tn, dt_fp, dt_fn, dt_tp = dt_conf_matrix.ravel()

# Printed one count per line, in the order tn, fn, tp, fp.
print(dt_tn, dt_fn, dt_tp, dt_fp, sep='\n')

850
176
170
213


In [9]:
# Metrics derived by hand from the confusion-matrix counts.
dt_predicted_positive = dt_tp + dt_fp   # everything the model labelled 1
dt_actual_positive = dt_tp + dt_fn      # everything that really is 1

dt_accuracy = (dt_tp + dt_tn) / (dt_tp + dt_tn + dt_fp + dt_fn)
dt_precision = dt_tp / dt_predicted_positive
dt_recall = dt_tp / dt_actual_positive
# F1 is the harmonic mean of precision and recall.
dt_f1_score = 2 * ((dt_precision * dt_recall) / (dt_precision + dt_recall))

print(f'Decision Tree Accuracy Score: {dt_accuracy}')
print(f'Decision Tree Precision Score: {dt_precision}')
print(f'Decision Tree Recall Score: {dt_recall}')
print(f'Decision Tree F1-Score: {dt_f1_score}')

Decision Tree Accuracy Score: 0.7239176721078779
Decision Tree Precision Score: 0.44386422976501305
Decision Tree Recall Score: 0.4913294797687861
Decision Tree F1-Score: 0.46639231824417005


## Random Forest Base Classifier

In [10]:
# Baseline random forest: 100 trees, seeded so the fit is repeatable.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=424,
)
# fit() is the cell's final expression, so the notebook echoes the estimator.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=424)

In [11]:
# Random forest predictions for the held-out rows.
rfc_predictions = clf.predict(X=X_test)

In [12]:
# Confusion matrix of actual vs. predicted labels for the random forest.
rfc_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rfc_predictions)

In [13]:
# Bare expression: the notebook echoes the 2x2 array as this cell's output.
rfc_conf_matrix

array([[916, 147],
       [181, 165]])

In [14]:
# Flattening the 2x2 matrix row by row yields the cells [0][0], [0][1], [1][0],
# [1][1], i.e. tn, fp, fn, tp.
rfc_tn, rfc_fp, rfc_fn, rfc_tp = rfc_conf_matrix.ravel()

# Printed one count per line, in the order tp, tn, fp, fn.
print(rfc_tp, rfc_tn, rfc_fp, rfc_fn, sep='\n')

165
916
147
181


In [15]:
# Metrics derived by hand from the confusion-matrix counts.
rfc_predicted_positive = rfc_tp + rfc_fp   # everything the model labelled 1
rfc_actual_positive = rfc_tp + rfc_fn      # everything that really is 1

rfc_accuracy = (rfc_tp + rfc_tn) / (rfc_tp + rfc_tn + rfc_fp + rfc_fn)
rfc_precision = rfc_tp / rfc_predicted_positive
rfc_recall = rfc_tp / rfc_actual_positive
# F1 is the harmonic mean of precision and recall.
rfc_f1_score = 2 * ((rfc_precision * rfc_recall) / (rfc_precision + rfc_recall))

print(f'Random Forest Accuracy Score: {rfc_accuracy}')
print(f'Random Forest Precision Score: {rfc_precision}')
print(f'Random Forest Recall Score: {rfc_recall}')
print(f'Random Forest F1-Score: {rfc_f1_score}')

Random Forest Accuracy Score: 0.7672107877927609
Random Forest Precision Score: 0.5288461538461539
Random Forest Recall Score: 0.476878612716763
Random Forest F1-Score: 0.5015197568389057


## Naive Bayes Base Classifier

In [16]:
# Gaussian Naive Bayes baseline. GaussianNB comes from the notebook's import
# cell, so it is not re-imported here.
nb = GaussianNB()

# Fit on the training features and labels.
nb.fit(X_train, y_train)

# Predict the held-out rows.
predictions_nb = nb.predict(X_test)

# Confusion matrix of actual vs. predicted labels.
cm = confusion_matrix(y_test, predictions_nb)
print(cm)

# Accuracy, precision, recall and F1 on the held-out rows.
print(f"Accuracy for Naive Bayes: {accuracy_score(y_test, predictions_nb)}")
print(f"Precision for Naive Bayes: {precision_score(y_test, predictions_nb)}")
print(f"Recall for Naive Bayes: {recall_score(y_test, predictions_nb)}")
print(f"F1-Score for Naive Bayes: {f1_score(y_test, predictions_nb)}")

[[834 229]
 [ 93 253]]
Accuracy for Naive Bayes: 0.7714691270404542
Precision for Naive Bayes: 0.524896265560166
Recall for Naive Bayes: 0.7312138728323699
F1-Score for Naive Bayes: 0.6111111111111112


## Logistic Regression Base Classifier

In [17]:
# Logistic regression baseline; fit() returns the estimator, so construction
# and fitting are chained.
logreg = LogisticRegression(random_state=424).fit(X_train, y_train)
predictions_logreg = logreg.predict(X_test)

# Confusion matrix is computed here but not displayed by this cell.
cm_logreg = confusion_matrix(y_test, predictions_logreg)

# Score the held-out predictions once, then report.
logreg_accuracy = accuracy_score(y_test, predictions_logreg)
logreg_precision = precision_score(y_test, predictions_logreg)
logreg_recall = recall_score(y_test, predictions_logreg)
logreg_f1 = f1_score(y_test, predictions_logreg)

print(f"Accuracy for Logistic Regression: {logreg_accuracy}")
print(f"Precision for Logistic Regression: {logreg_precision}")
print(f"Recall for Logistic Regression: {logreg_recall}")
print(f"F1-Score for Logistic Regression: {logreg_f1}")

Accuracy for Logistic Regression: 0.8097941802696949
Precision for Logistic Regression: 0.63
Recall for Logistic Regression: 0.546242774566474
F1-Score for Logistic Regression: 0.585139318885449


## SVM Base Classifier

In [18]:
# Wall-clock start; the elapsed time printed at the end of the cell covers
# model construction, fitting, prediction and metric reporting.
start = datetime.now()

# Linear SVM baseline, seeded with the same value as the other seeded models so
# the solver's random number generation is repeatable.
linearSVC_svm = LinearSVC(random_state=424)

# Fit on the training features and labels.
linearSVC_svm.fit(X_train, y_train)

# Predict the held-out rows.
linearSVC_svm_pred = linearSVC_svm.predict(X_test)

# Confusion matrix of actual vs. predicted labels.
print('Confusion Matrix:')
print(confusion_matrix(y_test, linearSVC_svm_pred))

# Accuracy, precision, recall and F1 on the held-out rows.
linearSVC_svm_accuracy = accuracy_score(y_test, linearSVC_svm_pred)
print('linearSVC Accuracy: ' + str(linearSVC_svm_accuracy))

linearSVC_svm_precision = precision_score(y_test, linearSVC_svm_pred)
print('linearSVC Precision: ' + str(linearSVC_svm_precision))

linearSVC_svm_recall = recall_score(y_test, linearSVC_svm_pred)
print('linearSVC Recall: ' + str(linearSVC_svm_recall))

linearSVC_svm_f1 = f1_score(y_test, linearSVC_svm_pred)
print('linearSVC F1: ' + str(linearSVC_svm_f1))

print('Time taken:')
print(datetime.now() - start)

Confusion Matrix:
[[953 110]
 [168 178]]
linearSVC Accuracy: 0.8026969481902059
linearSVC Precision: 0.6180555555555556
linearSVC Recall: 0.5144508670520231
linearSVC F1: 0.5615141955835962
Time taken:
0:00:00.037015
