# Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import seaborn as sns
import plotly.express as px
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
import joblib

# Import Dataset

In [2]:
df = pd.read_csv('../data/customer_churn_train_preprocessed.csv')
df_predict = pd.read_csv('../data/customer_churn_test.csv')
old_df = pd.read_csv('../data/customer_churn_train.csv')

# Data Split

In [3]:
Y = df['churn']
X = df.drop(['churn'],axis=1)

test_size = 0.2
seed = 123
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)

# ML Models

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

NaiveBayes = GaussianNB()
DecisionTree = DecisionTreeClassifier()
KNN = KNeighborsClassifier()
RForest = RandomForestClassifier()

# Training & Evaluation Score

In [5]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# declare variables
lst_model_name = ['Naive Bayes', 'Decision Tree', 'Random Forest', 'KNN']
lst_classifier = [NaiveBayes, DecisionTree, RForest, KNN]
lst_pred = []
lst_TP = []
lst_TN = []
lst_FP = []
lst_FN = []
lst_f1 = []
lst_acc = []

# generate Y_pred list
for clf in lst_classifier:
  clf.fit(X_train, Y_train)
  lst_pred.append(clf.predict(X_test))

# generate confusion matrix
for Y_pred in lst_pred:
  tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel() # source: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
  lst_TP.append(tp)
  lst_TN.append(tn)
  lst_FP.append(fp)
  lst_FN.append(fn)
  lst_f1.append(f1_score(Y_test, Y_pred))
  lst_acc.append(accuracy_score(Y_test, Y_pred))

df_cmatrix = pd.DataFrame(list(zip(lst_model_name, lst_TP, lst_TN, lst_FP, lst_FN, lst_acc, lst_f1)),
                                columns=['Classifier','TP','TN','FP','FN', 'Accuracy','F-1 Score'])
df_cmatrix.sort_values(by=['F-1 Score'],ascending=False,inplace=True)
df_cmatrix.reset_index(inplace=True, drop=True)
df_cmatrix

Unnamed: 0,Classifier,TP,TN,FP,FN,Accuracy,F-1 Score
0,Decision Tree,44,610,26,23,0.930299,0.642336
1,Random Forest,32,635,1,35,0.948791,0.64
2,Naive Bayes,42,418,218,25,0.654339,0.256881
3,KNN,0,631,5,67,0.897582,0.0


# Model Validation

In [6]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# declare list for k-fold score
lst_kfold_accuracy = []
lst_kfold_f1 = []

# generate cross validation score
k_fold = KFold(n_splits=5 , shuffle=True, random_state=0)
for clf in lst_classifier:
  lst_kfold_accuracy.append(cross_val_score(clf, X_train, Y_train, cv=k_fold, n_jobs=1, scoring='accuracy').mean())
  lst_kfold_f1.append(cross_val_score(clf, X_train, Y_train, cv=k_fold, n_jobs=1, scoring='f1').mean())

df_cv = pd.DataFrame(list(zip(lst_model_name, lst_kfold_accuracy, lst_kfold_f1)),
                     columns=['Classifier','Accuracy','F-1 Score'])
df_cv.sort_values(by=['F-1 Score'],ascending=False,inplace=True)
df_cv.reset_index(inplace=True, drop=True)
df_cv

Unnamed: 0,Classifier,Accuracy,F-1 Score
0,Random Forest,0.948793,0.7244
1,Decision Tree,0.92674,0.658081
2,Naive Bayes,0.522003,0.230039
3,KNN,0.891185,0.063854



Based on the results of cross-validation with k=5, the Random Forest model has the highest accuracy and F-1 Score.

# Predict

In [7]:
# data predict pre-processing

# remove prefix in column code_area
df_predict['area_code'] = df_predict['area_code'].str.split('_').str[2]

# feature encoding
for cat in ['international_plan','voice_mail_plan']:
  df_predict[cat] = df_predict[cat].astype(str).map({"no":0,"yes":1})

for cat in ['state','area_code']:
  one_hot_enc = pd.get_dummies(df_predict[cat], prefix=cat)
  df_predict = df_predict.join(one_hot_enc)

df_predict = df_predict.drop(['state','area_code'], axis=1)

# normalization
df_numerical = df_predict.select_dtypes(['int64','float64'])
for col in df_numerical.columns.tolist():
  df_predict[col] = MinMaxScaler().fit_transform(df_predict[col].values.reshape(len(df_predict),1))

In [8]:
prediction = RForest.predict(df_predict.loc[:,'account_length':])
df_submission = pd.DataFrame({"Customer ID": df_predict['id'],
                             "Churn":prediction})
print("Based on the predictions of the Machine Learning model, the Customer IDs that have churned are as follows:")
display(df_submission[df_submission['Churn']==1])

Based on the predictions of the Machine Learning model, the Customer IDs that have churned are as follows:


Unnamed: 0,Customer ID,Churn
37,0.049399,1
62,0.082777,1
72,0.096128,1
99,0.132176,1
104,0.138852,1
108,0.144192,1
149,0.198932,1
227,0.303071,1
250,0.333778,1
253,0.337784,1


# Export the model

In [9]:
lst_classifier

[GaussianNB(),
 DecisionTreeClassifier(),
 RandomForestClassifier(),
 KNeighborsClassifier()]

In [11]:
# Mengekspor RandomForestClassifier() ke dalam file
random_forest_model = lst_classifier[2]  # RandomForestClassifier() ada di indeks 2
joblib.dump(random_forest_model, '../model/random_forest_model.pkl')

['../model/random_forest_model.pkl']