### Models for Classification Problems - Summary
In this exercise, I would compare and summarize the following models for a Classification Problem:
1. Logistic Regression
2. Decision Tree Classifier
3. K Nearest NEighbor CLassifier
3. SVM (Support Vector Machine) Classifier

We will aply this to Predicting Customer Churn classification problem

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
churn = pd.read_csv('data/telecom_churn.csv')
churn.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:

# # Nominal Values that need to be encoded
# State                   3333 non-null   object 
# International plan      3333 non-null   object 
# Voice mail plan         3333 non-null   object 

from sklearn.preprocessing import LabelEncoder

churn_copy = churn

columns = ["State","International plan", "Voice mail plan"]
for column in columns:
    le = LabelEncoder()
    churn_copy[column] = le.fit_transform(churn_copy[column])    
churn_copy
print (type(churn_copy))

<class 'pandas.core.frame.DataFrame'>


In [4]:
# Scale all numerical values
from sklearn.preprocessing import StandardScaler
print (type(churn_copy))

numerical_columns = (churn_copy.select_dtypes(include=['float64', 'int64'])).columns.to_list()
standard_scaler = StandardScaler()

churn_copy[numerical_columns] = standard_scaler.fit_transform(churn_copy[numerical_columns])
churn_copy.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,-0.678649,0.676489,-0.523603,-0.32758,1.617086,1.234883,1.566767,0.476643,1.567036,-0.07061,-0.05594,-0.070427,0.866743,-0.465494,0.866029,-0.085008,-0.601195,-0.08569,-0.427932,False
1,0.60317,0.149065,-0.523603,-0.32758,1.617086,1.307948,-0.333738,1.124503,-0.334013,-0.10808,0.144867,-0.107549,1.058571,0.147825,1.05939,1.240482,-0.601195,1.241169,-0.427932,False
2,0.333313,0.902529,-0.523603,-0.32758,-0.618396,-0.59176,1.168304,0.675985,1.168464,-1.573383,0.496279,-1.5739,-0.756869,0.198935,-0.755571,0.703121,0.211534,0.697156,-1.188218,False
3,0.60317,-0.42859,-0.688834,3.052685,-0.618396,-0.59176,2.196596,-1.466936,2.196759,-2.742865,-0.608159,-2.743268,-0.078551,-0.567714,-0.078806,-1.303026,1.024263,-1.306401,0.332354,False
4,0.670634,-0.654629,-0.523603,3.052685,-0.618396,-0.59176,-0.24009,0.626149,-0.240041,-1.038932,1.098699,-1.037939,-0.276311,1.067803,-0.276562,-0.049184,-0.601195,-0.045885,1.092641,False


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV

X = churn_copy.drop(columns=['Churn'], axis = 1)
y = churn_copy['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
import time

from sklearn.metrics import accuracy_score

results = []

param_dict = {'C': [0.1, 1, 10]}

logistic_regression = LogisticRegression(max_iter=1000)
grid = GridSearchCV(estimator=logistic_regression, param_grid=param_dict)

start_time = time.time()
grid.fit(X_train, y_train)
fit_time = (time.time() - start_time) / len(grid.cv_results_['mean_fit_time'])
    

best_model = grid.best_estimator_
print (best_model)

# Evaluate on training and test sets
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

results.append(["Logistic Regression", train_score, test_score, fit_time])

results

LogisticRegression(C=10, max_iter=1000)


[['Logistic Regression', 0.8598371195885126, 0.862, 0.10421005884806316]]

In [7]:
#Decision Trees
from sklearn.tree import DecisionTreeClassifier

y_pred = ''
param_dict = {'max_depth': [5, 10, 15]}

decision_tree = DecisionTreeClassifier()

grid = GridSearchCV(estimator=decision_tree, param_grid=param_dict)
start_time = time.time()
    
grid.fit(X_train, y_train)
fit_time = (time.time() - start_time) / len(grid.cv_results_['mean_fit_time'])
    

best_model = grid.best_estimator_

# Evaluate on training and test sets
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

results.append(["Decision Tree Classifier", train_score, test_score, fit_time])

results

[['Logistic Regression', 0.8598371195885126, 0.862, 0.10421005884806316],
 ['Decision Tree Classifier', 0.951993141877411, 0.937, 0.20195619265238443]]

In [8]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

param_dict = {'n_neighbors': [3, 5, 7]}
knn_classifier = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn_classifier, param_grid=param_dict)

start_time = time.time()
grid.fit(X_train, y_train)
fit_time = (time.time() - start_time) / len(grid.cv_results_['mean_fit_time'])
    
best_model = grid.best_estimator_

# Evaluate on training and test sets
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

results.append(["KNN Classifier", train_score, test_score, fit_time])

results

[['Logistic Regression', 0.8598371195885126, 0.862, 0.10421005884806316],
 ['Decision Tree Classifier', 0.951993141877411, 0.937, 0.20195619265238443],
 ['KNN Classifier', 0.9198456922417488, 0.881, 0.46997563044230145]]

In [9]:
#SVM
from sklearn.svm import SVC


support_vector_classifier = SVC(kernel='linear', degree=2)
support_vector_classifier.fit(X_train, y_train)

param_dict = {'C': [0.1, 1, 10]}

grid = GridSearchCV(estimator=support_vector_classifier, param_grid=param_dict)

start_time = time.time()
grid.fit(X_train, y_train)
fit_time = (time.time() - start_time) / len(grid.cv_results_['mean_fit_time'])
    

best_model = grid.best_estimator_

# Evaluate on training and test sets
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

results.append(["Support Vector Classifier", train_score, test_score, fit_time])

In [10]:
results
results_df = pd.DataFrame(results, columns=['Name','Training Score','Test Score','Fit Time (Seconds)'])

results_df

Unnamed: 0,Name,Training Score,Test Score,Fit Time (Seconds)
0,Logistic Regression,0.859837,0.862,0.10421
1,Decision Tree Classifier,0.951993,0.937,0.201956
2,KNN Classifier,0.919846,0.881,0.469976
3,Support Vector Classifier,0.854265,0.857,8.377808


In [None]:
results
plt.scatter(results_df, x='Training_score', y = 'Test_score')
plt.show()

KeyError: "None of [Index(['Training_score'], dtype='object')] are in the [columns]"