<a href="https://colab.research.google.com/github/aslyldrm/telco-churn-classification/blob/main/Telco_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
dataset = pd.read_csv('Telco-Customer-Churn.csv')
dataset.drop(columns=['customerID'], inplace=True)





In [None]:
dataset['TotalCharges'] = dataset['TotalCharges'].str.strip()
non_numeric_rows = dataset[~dataset['TotalCharges'].str.replace('.', '', regex=False).str.isnumeric()]
dataset['TotalCharges'] = dataset['TotalCharges'].replace({'\s+': '', ',': ''}, regex=True)
dataset['TotalCharges'] = pd.to_numeric(dataset['TotalCharges'], errors='coerce')


In [None]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

##Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
values = [0,2,3,5,6,8,9,10,11,12,13,15]

for i in values:
    X[:,i] = le.fit_transform(X[:,i])

y = le.fit_transform(y)

## Encoding the Independent Variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [7,14,16])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')
imputer.fit(X[:,:-1])
X = imputer.transform(X[:,:-1])

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(n_estimators = 20, criterion = 'entropy')
classifier_rf.fit(X_train, y_train)


In [None]:
y_pred_rf = classifier_rf.predict(X_test)
print(np.concatenate((y_pred_rf.reshape(len(y_pred_rf),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)
accuracy_score_rf = accuracy_score(y_test, y_pred_rf)
accuracy_score_rf

[[1152  146]
 [ 246  217]]


0.7773992049971608

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
classifier_log = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier_log.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_log = classifier_log.predict(X_test)
cm = confusion_matrix(y_test, y_pred_log)
print(cm)
accuracy_score_log = accuracy_score(y_test, y_pred_log)

[[1164  134]
 [ 218  245]]


**Applying Grid Search to find the best model and the best parameters**

In [50]:
parameters_log = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [100,1000,2500,5000]
}
]

In [51]:
from sklearn.model_selection import GridSearchCV
grid_search_log = GridSearchCV(estimator = classifier_log,
                           param_grid = parameters_log,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

In [52]:
grid_search_log.fit(X_train, y_train)

10400 fits failed out of a total of 16000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
800 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 67, in _check_solv

In [53]:
best_accuracy_log = grid_search_log.best_score_
best_parameters_log = grid_search_log.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy_log*100))
print("Best Parameters:", best_parameters_log)

Best Accuracy: 80.31 %
Best Parameters: {'C': 0.03359818286283781, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}


# XGBoost

In [None]:
from xgboost import XGBClassifier
xgb_c = XGBClassifier()
xgb_c.fit(X_train, y_train)

In [None]:
pred_xgb_c = xgb_c.predict(X_test)
cm = confusion_matrix(y_test, pred_xgb_c)
print(cm)
accuracy_score_xgb_c = accuracy_score(y_test, pred_xgb_c)
accuracy_score_xgb_c

[[1132  166]
 [ 231  232]]


0.7745599091425327

# Kernel SVM

In [None]:
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'rbf')
classifier_svc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_svc = classifier_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_svc)
print(cm)
accuracy_score_svm = accuracy_score(y_test, y_pred_svc)
accuracy_score_svm

[[1172  126]
 [ 244  219]]


0.7898921067575241

# K-Nearest Neighbors (K-NN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_knn = classifier_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)
accuracy_score_knn = accuracy_score(y_test, y_pred_knn)
accuracy_score_knn

[[1149  149]
 [ 253  210]]


0.7717206132879046

# ANN

In [None]:
import tensorflow as tf

In [None]:
ann = tf.keras.models.Sequential()

In [None]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [None]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 100)

Epoch 1/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6031 - loss: 0.6341
Epoch 2/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7588 - loss: 0.4793
Epoch 3/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7724 - loss: 0.4516
Epoch 4/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7725 - loss: 0.4535
Epoch 5/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7869 - loss: 0.4377
Epoch 6/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7873 - loss: 0.4315
Epoch 7/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7929 - loss: 0.4307
Epoch 8/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7930 - loss: 0.4295
Epoch 9/100
[1m166/166[0m [32

<keras.src.callbacks.history.History at 0x79017208e2c0>

In [None]:
y_pred_ann = ann.predict(X_test)
y_pred_ann = (y_pred_ann > 0.5)
print(np.concatenate((y_pred_ann.reshape(len(y_pred_ann),1), y_test.reshape(len(y_test),1)),1))

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [None]:

cm = confusion_matrix(y_test, y_pred_ann)
print(cm)
accuracy_score_ann = accuracy_score(y_test, y_pred_ann)

[[1143  155]
 [ 233  230]]


# Results

In [56]:
print("Random Forest Classification: ", str(accuracy_score_rf))
print("Logistic Regression: ", str(accuracy_score_log))
print("Kernel SVc", str(accuracy_score_svm))
print("K-Nearest Neighbors (K-NN): ", str(accuracy_score_knn))
print("XGBoost: ", str(accuracy_score_xgb_c))
print("ANN: ", str(accuracy_score_ann))

print("*******************************************")

print("Adding Grid Search For Logistic Regression")
print("accuracy : ", str(best_accuracy_log))

Random Forest Classification:  0.7773992049971608
Logistic Regression:  0.8001135718341851
Kernel SVc 0.7898921067575241
K-Nearest Neighbors (K-NN):  0.7717206132879046
XGBoost:  0.7745599091425327
ANN:  0.7796706416808632
*******************************************
Adding Grid Search For Logistic Regression
accuracy :  0.8031033396345306
