<a href="https://colab.research.google.com/github/aslyldrm/telco-churn-classification/blob/main/Telco_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Telco-Customer-Churn.csv')
dataset.drop(columns=['customerID'], inplace=True)





In [3]:
dataset['TotalCharges'] = dataset['TotalCharges'].str.strip()
non_numeric_rows = dataset[~dataset['TotalCharges'].str.replace('.', '', regex=False).str.isnumeric()]
dataset['TotalCharges'] = dataset['TotalCharges'].replace({'\s+': '', ',': ''}, regex=True)
dataset['TotalCharges'] = pd.to_numeric(dataset['TotalCharges'], errors='coerce')

In [4]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

##Label Encoder

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
values = [0,2,3,5,6,8,9,10,11,12,13,15]

for i in values:
    X[:,i] = le.fit_transform(X[:,i])

y = le.fit_transform(y)

## Encoding the Independent Variables

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [7,14,16])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

## Taking care of missing data

In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')
imputer.fit(X[:,:-1])
X = imputer.transform(X[:,:-1])

## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Random Forest Classification

In [10]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(n_estimators = 20, criterion = 'entropy')
classifier_rf.fit(X_train, y_train)


In [11]:
y_pred_rf = classifier_rf.predict(X_test)
print(np.concatenate((y_pred_rf.reshape(len(y_pred_rf),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)
accuracy_score_rf = accuracy_score(y_test, y_pred_rf)
accuracy_score_rf

[[1160  138]
 [ 254  209]]


0.7773992049971608

# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
classifier_log = LogisticRegression(solver='lbfgs', max_iter=1000)
classifier_log.fit(X_train, y_train)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_log = classifier_log.predict(X_test)
cm = confusion_matrix(y_test, y_pred_log)
print(cm)
accuracy_score_log = accuracy_score(y_test, y_pred_log)

[[1164  134]
 [ 218  245]]


# XGBoost

In [15]:
from xgboost import XGBClassifier
xgb_c = XGBClassifier()
xgb_c.fit(X_train, y_train)

In [16]:
pred_xgb_c = xgb_c.predict(X_test)
cm = confusion_matrix(y_test, pred_xgb_c)
print(cm)
accuracy_score_xgb_c = accuracy_score(y_test, pred_xgb_c)
accuracy_score_xgb_c

[[1132  166]
 [ 231  232]]


0.7745599091425327

# Kernel SVM

In [17]:
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'rbf')
classifier_svc.fit(X_train, y_train)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_svc = classifier_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_svc)
print(cm)
accuracy_score_svm = accuracy_score(y_test, y_pred_svc)
accuracy_score_svm

[[1172  126]
 [ 244  219]]


0.7898921067575241

# K-Nearest Neighbors (K-NN)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_knn = classifier_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)
accuracy_score_knn = accuracy_score(y_test, y_pred_knn)
accuracy_score_knn

[[1149  149]
 [ 253  210]]


0.7717206132879046

# ANN

In [21]:
import tensorflow as tf

In [22]:
ann = tf.keras.models.Sequential()

In [23]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [24]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [25]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [26]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [27]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 100)

Epoch 1/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7376 - loss: 0.6890
Epoch 2/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7551 - loss: 0.4988
Epoch 3/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7533 - loss: 0.4575
Epoch 4/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7788 - loss: 0.4392
Epoch 5/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7876 - loss: 0.4289
Epoch 6/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7888 - loss: 0.4330
Epoch 7/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8030 - loss: 0.4094
Epoch 8/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8037 - loss: 0.4126
Epoch 9/100
[1m166/166[0m [32

<keras.src.callbacks.history.History at 0x7b8bbc036380>

In [28]:
y_pred_ann = ann.predict(X_test)
y_pred_ann = (y_pred_ann > 0.5)
print(np.concatenate((y_pred_ann.reshape(len(y_pred_ann),1), y_test.reshape(len(y_test),1)),1))

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [29]:

cm = confusion_matrix(y_test, y_pred_ann)
print(cm)
accuracy_score_ann = accuracy_score(y_test, y_pred_ann)

[[1157  141]
 [ 225  238]]


# Results

In [30]:
print("Random Forest Classification: ", str(accuracy_score_rf))
print("Logistic Regression: ", str(accuracy_score_log))
print("Kernel SVc", str(accuracy_score_svm))
print("K-Nearest Neighbors (K-NN): ", str(accuracy_score_knn))
print("XGBoost: ", str(accuracy_score_xgb_c))
print("ANN: ", str(accuracy_score_ann))

Random Forest Classification:  0.7773992049971608
Logistic Regression:  0.8001135718341851
Kernel SVc 0.7898921067575241
K-Nearest Neighbors (K-NN):  0.7717206132879046
XGBoost:  0.7745599091425327
ANN:  0.7921635434412265
