## Loading the dataset 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.shape

(7043, 21)

In [4]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Preprocessing

In [7]:
df.replace(r'^\s*$', pd.NA, regex=True, inplace=True)

In [8]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [10]:
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [12]:
df["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [13]:
df["Churn"] = df["Churn"].replace({"No":0, "Yes": 1})

In [14]:
df["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [15]:
X = df.drop(columns = ["Churn", "customerID"])
y = df[["Churn"]]

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [17]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
               'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

## Feature Engineering

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train_numerical = scaler.fit_transform(X_train[numerical])
scaled_test_numerical = scaler.transform(X_test[numerical])

scaled_train_numerical_df = pd.DataFrame(scaled_train_numerical, columns=numerical, index=X_train.index)
scaled_test_numerical_df = pd.DataFrame(scaled_test_numerical, columns=numerical, index=X_test.index)

In [19]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output = False).set_output(transform = "pandas")
encoded_train_categorical = ohe.fit_transform(X_train[categorical])
encoded_test_categorical = ohe.fit_transform(X_test[categorical])

In [20]:
X_train_non_numerical_categorical = X_train.drop(columns=numerical + categorical)
X_test_non_numerical_categorical = X_test.drop(columns=numerical + categorical)

# Combine the scaled numerical and one-hot encoded categorical features with the rest of the DataFrames
X_train_processed = pd.concat([X_train_non_numerical_categorical, scaled_train_numerical_df, encoded_train_categorical], axis=1)
X_test_processed = pd.concat([X_test_non_numerical_categorical, scaled_test_numerical_df, encoded_test_categorical], axis=1)

## Model Training

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

rfc_model = RandomForestClassifier(random_state = 1)
rfc_model.fit(X_train_processed, y_train.values.ravel())
y_pred = rfc_model.predict(X_test_processed)

cnf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0,1])
accuracy = accuracy_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred) 
precision = precision_score(y_test, y_pred) 
recall  = recall_score(y_test, y_pred) 

print('Model performance for Test set (Random Forest Classifier)')
print("- Accuracy: {:.4f}".format(accuracy))
print('- F1 score: {:4f}'.format(f1))
print('- Precision: {:4f}'.format(precision))
print('- Recall: {:4f}'.format(recall))

Model performance for Test set (Random Forest Classifier)
- Accuracy: 0.7913
- F1 score: 0.558559
- Precision: 0.584906
- Recall: 0.534483


In [22]:
from sklearn.ensemble import ExtraTreesClassifier
etc_model = ExtraTreesClassifier(random_state = 1)
etc_model.fit(X_train_processed, y_train.values.ravel())
y_pred = etc_model.predict(X_test_processed)

cnf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0,1])
accuracy = accuracy_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred) 
precision = precision_score(y_test, y_pred) 
recall  = recall_score(y_test, y_pred) 

print('Model performance for Test set (Extra Trees Classifier)')
print("- Accuracy: {:.4f}".format(accuracy))
print('- F1 score: {:4f}'.format(f1))
print('- Precision: {:4f}'.format(precision))
print('- Recall: {:4f}'.format(recall))

Model performance for Test set (Extra Trees Classifier)
- Accuracy: 0.7672
- F1 score: 0.501520
- Precision: 0.532258
- Recall: 0.474138


In [23]:
import lightgbm as lgb
clf_model = lgb.LGBMClassifier(random_state = 1)
clf_model.fit(X_train_processed, y_train.values.ravel())
y_pred = etc_model.predict(X_test_processed)

cnf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0,1])
accuracy = accuracy_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred) 
precision = precision_score(y_test, y_pred) 
recall  = recall_score(y_test, y_pred) 

print('Model performance for Test set (Light Gradiebt Boosting Classifier)')
print("- Accuracy: {:.4f}".format(accuracy))
print('- F1 score: {:4f}'.format(f1))
print('- Precision: {:4f}'.format(precision))
print('- Recall: {:4f}'.format(recall))

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Model performance for Test set (Light Gradiebt Boosting Classifier)
- Accuracy: 0.7672
- F1 score: 0.501520
- Precision: 0.532258
- Recall: 0.474138


In [24]:
pip install --user xgboost

Note: you may need to restart the kernel to use updated packages.


In [25]:
import xgboost as xgb


xgb_classifier = xgb.XGBClassifier(random_state = 1)
xgb_classifier.fit(X_train_processed, y_train.values.ravel())
y_pred = etc_model.predict(X_test_processed)

cnf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0,1])
accuracy = accuracy_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred) 
precision = precision_score(y_test, y_pred) 
recall  = recall_score(y_test, y_pred) 

print('Model performance for Test set (Light Gradiebt Boosting Classifier)')
print("- Accuracy: {:.4f}".format(accuracy))
print('- F1 score: {:4f}'.format(f1))
print('- Precision: {:4f}'.format(precision))
print('- Recall: {:4f}'.format(recall))

Model performance for Test set (Light Gradiebt Boosting Classifier)
- Accuracy: 0.7672
- F1 score: 0.501520
- Precision: 0.532258
- Recall: 0.474138
