In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as pgo

import warnings
warnings.filterwarnings('ignore')

# features
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# training and optimization
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV # Allows us to test parameters of classification algorithms and find the best one

# classification model metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score, log_loss
from sklearn.metrics import roc_curve, roc_auc_score

# 1. Load Data and initial check

In [None]:
# Location of the Telco customer churn CSV inside the Kaggle input directory.
TELCO_CSV_PATH = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Raw customer table; the working frame used by the later cells is derived from it.
df_customers = pd.read_csv(TELCO_CSV_PATH)

In [None]:
# Preview the first rows of the raw customer table.
df_customers.head()

In [None]:
# Show the dtype pandas inferred for each column when the CSV was read.
df_customers.dtypes

In [None]:
# Count missing (NaN) values per column.
# Blank strings such as ' ' are not NaN, so they are not counted here.
df_customers.isnull().sum()

In [None]:
# Print (rows, columns), then display the number of distinct customer IDs so it
# can be compared against the row count.
raw_shape = df_customers.shape
print(raw_shape)
df_customers['customerID'].nunique(dropna=False)

In [None]:
# Working copy of the raw frame without the customerID identifier column.
dfc = df_customers.drop(columns=['customerID'])

# deal with TotalCharges, and use it
# Some rows hold a single blank (' ') instead of a number. Turn those blanks
# into NaN and cast the column to float. The result is assigned back to the column
# rather than calling replace(..., inplace=True) on dfc['TotalCharges']: that chained
# in-place call works on a temporary Series and does not update dfc when pandas
# Copy-on-Write is active, which would make the float cast fail on ' '.
dfc['TotalCharges'] = dfc['TotalCharges'].replace(' ', np.nan).astype(float)

In [None]:
# Check the column dtypes of the working frame after the TotalCharges cast above.
dfc.dtypes

In [None]:
# 1 dimension distributions: one bar chart of value counts per column.
# Each entry of `dimensions` is one row of the subplot grid.
dimensions = {
    'row1':['gender','SeniorCitizen','Partner','Dependents', 'PhoneService','MultipleLines','InternetService'],
    'row2': ['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies'],
    'row3':['Contract','PaperlessBilling','PaymentMethod','Churn','tenure'],
}
# Size the grid from the data instead of hard-coding it: one row per group,
# as many columns as the longest group.
n_grid_rows = len(dimensions)
n_grid_cols = max(len(names) for names in dimensions.values())

fig = plt.figure(figsize=[30,12], tight_layout=True)
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.5)
# squeeze=False keeps `axes` two-dimensional whatever the grid size.
axes = fig.subplots(n_grid_rows, n_grid_cols, sharex=False, squeeze=False)

for row, row_columns in enumerate(dimensions.values()):
    for col, column_name in enumerate(row_columns):
        ax = axes[row, col]
        dfc[column_name].value_counts().plot(kind='bar', ax=ax, rot=0)
        ax.set_title(column_name)
    # Shorter rows leave grid cells unused; hide them so no empty frames are drawn.
    for unused_ax in axes[row, len(row_columns):]:
        unused_ax.set_visible(False)

In [None]:
# 2 dimensions distribution: churn counts for each column below, faceted by contract type.
# Other options considered for these histograms:
#   facet_row='InternetService', barnorm='percent', marginal='histogram'
churn_by_contract_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'PaperlessBilling', 'PaymentMethod',
]
for column_name in churn_by_contract_columns:
    contract_fig = px.histogram(
        dfc,
        x=column_name,
        color='Churn',
        facet_col='Contract',
        width=900,
        height=250,
        template="simple_white",
        barmode='group',
    )
    contract_fig.show()

In [None]:
# numerical variables
# SeniorCitizen churn counts, faceted first by contract type and then by paperless billing.
for facet_column in ('Contract', 'PaperlessBilling'):
    senior_fig = px.histogram(
        dfc, x='SeniorCitizen', color='Churn', facet_col=facet_column,
        width=900, height=250, template="simple_white", barmode='group',
    )
    senior_fig.show()

# Yes/No label derived from the numeric SeniorCitizen column, used as the box-plot x axis.
dfc['SeniorCitizenCat'] = np.where(dfc['SeniorCitizen'] > 0, 'Yes', 'No')

# Box plot of each numeric column by senior label, churn and contract type.
for numeric_column in ('TotalCharges', 'MonthlyCharges', 'tenure'):
    box_fig = px.box(
        dfc, x='SeniorCitizenCat', y=numeric_column, color='Churn', facet_col='Contract',
        width=900, height=300, template="simple_white",
    )
    box_fig.show()

In [None]:
# alternative: histograms of every numeric column of dfc, one subplot per column.
# DataFrame.hist lays out its own grid of subplots. Handing it a single Axes makes
# pandas clear that figure and discard the Axes (the warning about it is silenced
# by the warnings filter at the top of the notebook), so the figure size is passed
# to hist directly instead of pre-creating a figure and Axes.
dfc.hist(figsize=(8, 4))
fig = plt.gcf()  # the figure hist just created
fig.tight_layout()

# 2. Features

In [None]:
# Remove the helper label column that was added only for the box plots.
dfc = dfc.drop('SeniorCitizenCat', axis=1)

In [None]:
# Review the dtypes before encoding the categorical columns.
dfc.dtypes

In [None]:
# Encode the two-valued text columns as 0/1 codes.
# Series.map turns any value missing from the mapping into NaN.
gender_codes = {'Male': 1, 'Female': 0}  # 1: male, 0: female
yes_no_codes = {'Yes': 1, 'No': 0}

dfc['gender'] = dfc['gender'].map(gender_codes)
# Store SeniorCitizen with a categorical dtype instead of a plain numeric one.
dfc['SeniorCitizen'] = pd.Categorical(dfc['SeniorCitizen'])

columns = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in columns:
    dfc[col] = dfc[col].map(yes_no_codes)

In [None]:
# Preview the frame after the binary encoding.
dfc.head()

In [None]:
# Drop every row that still contains a NaN (e.g. the TotalCharges blanks converted above).
# reset_index gives the remaining rows a contiguous 0..n-1 index. Without it the gaps
# left by dropna survive in Y, while X is later rebuilt from a NumPy array with a fresh
# RangeIndex, so X and Y would no longer line up by label.
dfc = dfc.dropna().reset_index(drop=True)

In [None]:
# Confirm that no missing values remain after dropna.
dfc.isnull().sum()

In [None]:
# (rows, columns) left after dropping the incomplete rows.
dfc.shape

In [None]:
# One-hot encode the remaining categorical columns: get_dummies replaces each
# listed column with one indicator column per category value.
service_columns = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
]
account_columns = ['Contract', 'PaymentMethod']
categorical_variables = service_columns + account_columns

dfc_dummies = pd.get_dummies(dfc, columns=categorical_variables)

In [None]:
# Inspect the dtypes of the encoded frame, including the new indicator columns.
dfc_dummies.dtypes

In [None]:
# Correlation of "Churn" with every other variable, sorted from most positive to most negative.
# The frame is cast to float first so that the categorical SeniorCitizen column and the
# indicator columns all take part: DataFrame.corr works on numeric data only and, depending
# on the pandas version, silently leaves out columns with other dtypes.
churn_corr = dfc_dummies.astype(float).corr()['Churn'].sort_values(ascending=False)
plt.figure(figsize=(8,4))
churn_corr.plot(kind='bar')

In [None]:
# Separate the target (Y) from the predictors (X).
# Open question kept from the original cell: remove TotalCharges?
target_column = 'Churn'
Y = dfc_dummies[target_column]
X = dfc_dummies.drop(target_column, axis=1)

In [None]:
# MinMaxScaler class of sklearn.preprocessing is used for normalization of features.
# Normalization is about transforming the feature values to fall within the bounded intervals (min and max)
features = X.columns.values
scaler = MinMaxScaler(feature_range = (0,1))
# The scaler returns a bare NumPy array. Rebuild the frame with the original column
# names AND the original row index, so X keeps the same labels as Y instead of
# silently switching to a fresh 0..n-1 index.
X = pd.DataFrame(scaler.fit_transform(X), columns=features, index=X.index)
# NOTE(review): the scaler is fitted on all rows before the train/test split further
# down, so statistics of the test rows influence the training features. Consider
# fitting the scaler on X_train only.

In [None]:
# StandardScaler class of sklearn.preprocessing is used for standardization of features.
# Standardization is about transforming the feature values to fall around mean as 0 with standard deviation as 1
# NOTE(review): standardizing after min-max scaling yields the same values as standardizing
# the unscaled features (both are per-feature linear rescalings), so the min-max step
# has no effect on the final X.
features = X.columns.values
scaler = preprocessing.StandardScaler()
# Keep the column names and the row index of the incoming frame; the scaler itself
# returns a bare NumPy array.
X = pd.DataFrame(scaler.fit_transform(X), columns=features, index=X.index)

# 3. Modeling

In [None]:
def plot_confusion_matrix(y,y_predict):
    """Plot the confusion matrix of binary churn predictions as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        True labels, encoded as 0 (did not churn) and 1 (churn).
    y_predict : array-like
        Predicted labels, using the same encoding.

    Returns
    -------
    None
        The heatmap is drawn on a new matplotlib figure.
    """
    from sklearn.metrics import confusion_matrix

    class_names = ['did not churn', 'churn']
    # Fix the label order so the matrix is always 2x2, even when one class is absent
    # from both y and y_predict. Otherwise the matrix shrinks and no longer matches
    # the two tick labels set below.
    cm = confusion_matrix(y, y_predict, labels=[0, 1])
    # Always start a new figure; plt.subplot() may draw onto axes that already exist
    # in the current figure.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, ax=ax, fmt="d")  # annot=True writes the count in each cell
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [None]:
def plot_roc_curve (y, y_predict):
    """Plot the ROC curve for `y_predict` against the true labels `y` and show it.

    Both arguments are passed unchanged to sklearn's roc_curve and roc_auc_score;
    the resulting AUC is written into the plot title.
    """
    fpr, tpr, _thresholds = roc_curve(y, y_predict)
    auc_score = roc_auc_score(y, y_predict)

    plt.plot(fpr, tpr, marker='o')
    plt.title('AUC Score: {}'.format(auc_score))
    plt.xlabel('FPR: False positive rate')
    plt.ylabel('TPR: True positive rate')
    plt.grid()
    plt.show()
    

In [None]:
def print_model_measures(Y_test, yhat):
    """Print accuracy, precision, recall, F1 and Jaccard scores of `yhat` against `Y_test`."""
    # (output template, metric function) pairs, printed in this order.
    measures = (
        ('Accuracy Score:{}', accuracy_score),
        ('Precision Score:{}', precision_score),
        ('Recall Score:{}', recall_score),
        ('F1 Score:{}', f1_score),
        ('Jaccard Score:{}', jaccard_score),
    )
    for template, metric in measures:
        print(template.format(metric(Y_test, yhat)))

Split the dataset into training and testing sets.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

## 3.1 Logistic Regression

In [None]:
# Grid for logistic regression: three values of C with the l2 penalty and the lbfgs solver.
# (l1 = lasso, l2 = ridge)
parameters = dict(C=[0.01, 0.1, 1], penalty=['l2'], solver=['lbfgs'])
lr = LogisticRegression()
# 10-fold cross-validated search over the grid, fitted on the training split.
lr_cv = GridSearchCV(lr, parameters, cv=10)
lr_cv.fit(X_train, Y_train)

In [None]:
# Cross-validation result of the grid search.
print("Best hpyerparameters:{} with accuray of {}".format(lr_cv.best_params_, lr_cv.best_score_))
print("")
print("using best esitimator.........")
best_estimator = lr_cv.best_estimator_
# Accuracy of the refitted best model on the held-out test split.
lr_score = best_estimator.score(X_test, Y_test)
yhat=lr_cv.predict(X_test)
# Predicted probability of the positive class (second column = label 1).
# log loss and the ROC curve are defined on scores, not on hard 0/1 predictions:
# with hard labels log loss only counts errors at an extreme penalty and the
# ROC curve collapses to a single operating point.
lr_proba = lr_cv.predict_proba(X_test)[:, 1]
print("Accuracy :",lr_score)
# other measures
print_model_measures(Y_test, yhat)
print('Log Loss:{}'.format(log_loss(Y_test, lr_proba)))
plot_roc_curve(Y_test, lr_proba)
plot_confusion_matrix(Y_test,yhat)

In [None]:
# Coefficients of best_estimator (the logistic regression when the cells run in order),
# labelled with the feature names; plot the ten largest weights.
feature_names = X.columns.values
weights = pd.Series(best_estimator.coef_[0], index=feature_names)
weights.sort_values(ascending=False).head(10).plot(kind='bar')

In [None]:
# Plot the ten smallest weights (the tail of the descending sort).
weights.sort_values(ascending=False).tail(10).plot(kind='bar')

## 3.2 SVM

In [None]:
# Hyper-parameter grid for the SVM.
# - 'rbf' was listed twice in the original grid, which repeated every rbf fit.
# - gamma is the kernel coefficient of the rbf, poly and sigmoid kernels and is ignored
#   by the linear kernel, so it is searched only for the kernels that use it.
c_values = np.logspace(-3, 3, 5)
gamma_values = np.logspace(-3, 3, 5)
parameters = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
]
svm = SVC()
svm_cv = GridSearchCV(estimator=svm, param_grid=parameters, cv=10)
svm_cv.fit(X_train,Y_train)

print("tuned hpyerparameters :(best parameters) ",svm_cv.best_params_)
# First "accuracy" is the mean cross-validation score of the best grid point ...
print("accuracy :",svm_cv.best_score_)
best_estimator = svm_cv.best_estimator_
# ... the second one is the accuracy of the refitted model on the test split.
svm_score = best_estimator.score(X_test, Y_test)
print("accuracy :", svm_score)
yhat=svm_cv.predict(X_test)
# SVC() is built without probability estimates, so the signed distance to the
# decision boundary is used as the ranking score for the ROC curve instead of
# the hard 0/1 predictions.
svm_scores = svm_cv.decision_function(X_test)

# other metrics

print_model_measures(Y_test, yhat)
plot_roc_curve(Y_test, svm_scores)
plot_confusion_matrix(Y_test,yhat)

## 3.3 Decision Tree

In [None]:
# Hyper-parameter grid for the decision tree.
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],  # 2, 4, ..., 18
     # only 'sqrt' is searched; 'auto' was tried previously
     'max_features': ['sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}

# splitter='random' and max_features='sqrt' both draw random numbers while the tree is
# built. Fixing random_state makes the search and the selected tree reproducible
# from one run to the next.
tree = DecisionTreeClassifier(random_state=2)
tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=10)
tree_cv.fit(X_train, Y_train)

In [None]:
# Cross-validation result of the grid search.
print("Best hpyerparameters:{} with accuray of {}".format(tree_cv.best_params_, tree_cv.best_score_))
print("")
print("Using best esitimator.........")
best_estimator = tree_cv.best_estimator_
# Accuracy of the refitted best tree on the held-out test split.
tree_score = best_estimator.score(X_test, Y_test)
yhat=tree_cv.predict(X_test)
# Predicted probability of the positive class (second column = label 1); the ROC curve
# needs scores, hard 0/1 predictions reduce it to a single operating point.
tree_proba = tree_cv.predict_proba(X_test)[:, 1]
print("Accuracy :",tree_score)

# other measures
print_model_measures(Y_test, yhat)
plot_roc_curve(Y_test, tree_proba)
plot_confusion_matrix(Y_test,yhat)

## 3.4 K-Nearest Neighbor

In [None]:
# Hyper-parameter grid for k-nearest neighbours: k from 1 to 10, the four
# neighbour-search algorithms, and the Minkowski power p (1 or 2).
parameters = {
    'n_neighbors': list(range(1, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

KNN = KNeighborsClassifier()

In [None]:
# Use the same 10-fold cross-validation as the other models in this notebook.
# Leaving cv at its default gives a different fold count, so best_score_ would not
# be comparable across models.
knn_cv = GridSearchCV(estimator=KNN, param_grid=parameters, cv=10)
knn_cv.fit(X_train, Y_train)

In [None]:
# Cross-validation result of the grid search.
print("Best hpyerparameters:{} with accuray of {}".format(knn_cv.best_params_, knn_cv.best_score_))
print("")
print("Using best esitimator.........")
best_estimator = knn_cv.best_estimator_
# Accuracy of the refitted best model on the held-out test split.
knn_score = best_estimator.score(X_test, Y_test)
yhat=knn_cv.predict(X_test)
# Predicted probability of the positive class (second column = label 1); the ROC curve
# needs scores, hard 0/1 predictions reduce it to a single operating point.
knn_proba = knn_cv.predict_proba(X_test)[:, 1]
print("Accuracy :",knn_score)

# other measures
print_model_measures(Y_test, yhat)
plot_roc_curve(Y_test, knn_proba)
plot_confusion_matrix(Y_test,yhat)