In [77]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [78]:
# Load file
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')

In [79]:
# Pre-processing
# Drop the identifier column when present; the guard keeps this cell re-runnable.
if 'customerID' in data.columns:
    data.drop(columns = ['customerID'], inplace = True)

# Integer-encode every object-dtype column in place and keep the *fitted encoder*
# per column, so the original labels can be recovered with
# label_encoders[column].inverse_transform(...) or re-applied to new data.
# NOTE(review): any numeric column that pandas read as object (e.g. because of
# blank strings) is label-encoded as a category here too -- verify data.dtypes.
label_encoders = {}
for column in data.select_dtypes(include = ['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    # Fix: store the encoder itself; previously the encoded column was stored,
    # which made this dictionary useless for decoding.
    label_encoders[column] = le

# Features are all columns but the last; the last column is used as the target.
# NOTE(review): assumes the label column is the final column of the CSV -- confirm.
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=52)

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows so that no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [80]:
# Modelling and Cross-validation
# One seed for every stochastic component in this cell, so that re-running the
# search yields the same selected estimators.
RANDOM_STATE = 52

# Candidate estimators. Logistic regression (with the 'saga'/'liblinear' solvers
# searched below) and the decision tree use randomness internally, so they are
# seeded explicitly; the other estimators are left at their defaults.
models = {
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'SVM': SVC()
}

# Shuffled, stratified 5-fold splitter shared by all grid searches.
cv_strat = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Hyper-parameter grid per model, keyed by the same names as `models`.
# An empty grid means the estimator is evaluated with its current parameters only.
param_grids = {
    'KNN': {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']},
    'Naive Bayes': {},
    'Decision Tree': {'max_depth': [3, 5, 10], 'min_samples_split': [2, 10, 20]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
}

# Best estimator found for each model name, as selected by cross-validated accuracy
# on the training split.
best_models = {}

for name,model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=cv_strat, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_

print(best_models)

{'KNN': KNeighborsClassifier(n_neighbors=9), 'Logistic Regression': LogisticRegression(C=0.1, solver='saga'), 'Naive Bayes': GaussianNB(), 'Decision Tree': DecisionTreeClassifier(max_depth=5, min_samples_split=10), 'SVM': SVC(C=1, kernel='linear')}


In [81]:
# Predictions
# Score each tuned estimator on the held-out test split and report its accuracy.
accuracies = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracies[name] = acc = accuracy_score(y_test, y_pred)
    print(f'{name} Test Accuracy: {acc:.4f}')

# Highest test accuracy wins; on a tie the first model in insertion order is kept.
best_model, best_accuracy = max(accuracies.items(), key=lambda pair: pair[1])
print(f'\nBest Model: {best_model} with Accuracy Score {best_accuracy:.4f}')

KNN Test Accuracy: 0.7728
Logistic Regression Test Accuracy: 0.8140
Naive Bayes Test Accuracy: 0.7596
Decision Tree Test Accuracy: 0.7894
SVM Test Accuracy: 0.8135

Best Model: Logistic Regression with Accuracy Score 0.8140
