In [74]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import math
import warnings
from imblearn.combine import SMOTEENN
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder, PowerTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
warnings.filterwarnings("ignore")

%matplotlib inline

In [75]:
data_df = pd.read_csv(f'{Path.cwd().parent}/data/processed/processed_data.csv')

In [76]:
data_df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,0,1,2,3,4,5,6,7,8,9
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU,9305-CDSKC,1452-KIOVK,6713-OKOMC,7892-POOKP,6388-TABGU
gender,1,0,0,0,1,1,0,1,1,0
SeniorCitizen,1,1,1,1,1,1,1,1,1,1
Partner,1,0,0,0,0,0,0,0,1,0
Dependents,1,1,1,1,1,1,0,1,1,0
tenure,1,34,2,45,2,8,22,10,28,62
PhoneService,1,0,0,1,0,0,0,1,0,0
MultipleLines,No phone service,No,No,No phone service,No,Yes,Yes,No phone service,Yes,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic,Fiber optic,Fiber optic,DSL,Fiber optic,DSL


In [77]:
X = data_df.drop(['customerID', 'Churn','Unnamed: 0'], axis = 1)
y = data_df['Churn']

In [78]:
X_train,  X_test, y_train,y_test = train_test_split(X,y, random_state = 42, test_size =0.2)

In [79]:
multi_class_var = ['MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaymentMethod']

In [101]:
def validation_monitor(pred,test):
    model_test_accuracy = accuracy_score(pred,test)
    model_test_precision = precision_score(pred,test)
    model_test_f1 = f1_score(pred,test)
    model_test_recall = recall_score(pred,test)
    model_test_rocauc_score = roc_auc_score(pred,test)

    return model_test_accuracy,model_test_precision,model_test_f1,model_test_recall,model_test_rocauc_score


In [102]:
preprocessor = ColumnTransformer([
    ('onehotencoding',OneHotEncoder(),multi_class_var)
])

In [103]:
model_output = {}

In [104]:
models = [LogisticRegression(),RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(),KNeighborsClassifier(),
          DecisionTreeClassifier()]

In [105]:
scalar = StandardScaler()

In [111]:
for model in models:
    
    pipeline = ImbPipeline([
        ('pre', preprocessor),
        ('smote', SMOTEENN(random_state=42)),
        ('scalar',scalar),
        ('model',model)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    model_test_accuracy, model_test_precision, model_test_f1, model_test_recall,model_test_rocauc_score = validation_monitor(y_pred,y_test)

    model_output[model] = {
        'accuracy':model_test_accuracy,
        'precision':model_test_precision,
        'f1':model_test_f1,
        'recall':model_test_recall,
        'roc_auc_score':model_test_rocauc_score
    }

    print(f'----------------{model} performance----------------')
    print(f'Accuracy : {round(model_output[model].get('accuracy'),2)}')
    print(f'Precision : {round(model_output[model].get('precision'),2)}')
    print(f'f1 : {round(model_output[model].get('f1'),2)}')
    print(f'recall : {round(model_output[model].get('recall'),2)}')
    print(f'roc_auc_score : {round(model_output[model].get('roc_auc_score'),2)}')



----------------LogisticRegression() performance----------------
Accuracy : 0.72
Precision : 0.69
f1 : 0.78
recall : 0.9
roc_auc_score : 0.69
----------------RandomForestClassifier() performance----------------
Accuracy : 0.73
Precision : 0.74
f1 : 0.8
recall : 0.88
roc_auc_score : 0.69
----------------AdaBoostClassifier() performance----------------
Accuracy : 0.71
Precision : 0.69
f1 : 0.78
recall : 0.9
roc_auc_score : 0.69
----------------GradientBoostingClassifier() performance----------------
Accuracy : 0.74
Precision : 0.75
f1 : 0.81
recall : 0.88
roc_auc_score : 0.69
----------------KNeighborsClassifier() performance----------------
Accuracy : 0.71
Precision : 0.72
f1 : 0.78
recall : 0.86
roc_auc_score : 0.67
----------------DecisionTreeClassifier() performance----------------
Accuracy : 0.71
Precision : 0.73
f1 : 0.79
recall : 0.86
roc_auc_score : 0.67
