## Predict whether a customer is likely to stop using a company's services (i.e., churn), based on their usage patterns and demographics.

In [137]:
#!pip install kagglehub

### Exploring Data

In [139]:
#The data set we will be using is telco customer Churn from kaggle

import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
data = pd.read_csv(url)

#Check how many rows and columns
print("Shape of the data: ", data.shape)

#Check the top few rows
data.head(15)

Shape of the data:  (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [140]:
#Check columns, data types and more
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [141]:
#Check how many null values are present
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [142]:
#Checks how many uniquetexts are present eg How many customers churned (Yes) vs. stayed (No)
data['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

### PreProcessing Data

In [144]:
# We’ll go through these key steps:

#Understand data types

#Handle missing values

#Convert TotalCharges to numeric

#Drop unnecessary columns

#Encode categorical variables

#Split features and target

In [145]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [146]:
print("Misiing Vlaues : ", data.isnull().sum())

Misiing Vlaues :  customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [147]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Now check missing values again
print("Missing values after conversion:\n", data.isnull().sum())

# Drop rows with missing TotalCharges
data = data.dropna(subset = ['TotalCharges'])


Missing values after conversion:
 customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [148]:
# Step 5: Drop 'customerID' column (not useful for prediction)
data = data.drop('customerID', axis = 1)

In [149]:
# Step 6: Convert target column 'Churn' to binary (Yes=1, No=0)
data['Churn'] = data['Churn'].map({"Yes" : 1,"No":0})

In [150]:
# Step 7: One-hot encode categorical columns
#drop_first = True-->This reduces redundancy

data_encoded = pd.get_dummies(data, drop_first = True)

In [151]:
#Split X and Y

#axis=0 — Rows
#axis=1 — Columns

X = data_encoded.drop('Churn', axis = 1)
y = data_encoded['Churn']

#Print Final Shapes
print("X : ", X.shape)
print("y : ", y.shape)

X :  (7032, 30)
y :  (7032,)


### Model Training

In [153]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [154]:
from sklearn.linear_model import LogisticRegression

#Initialize model
model = LogisticRegression(max_iter = 1000)

model.fit(X,y)

In [155]:
y_pred = model.predict(X_test)

In [156]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Detailed classification report (precision, recall, f1-score)
print(classification_report(y_test, y_pred))

# Confusion matrix
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7917555081734187
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.72      1407
weighted avg       0.78      0.79      0.78      1407

[[920 113]
 [180 194]]


In [None]:
# Accuracy: Overall correctness of the model (how many predictions were right out of all)

# Precision: When model predicts churn, how often is it correct? (Focus on correctness of positive predictions)
# Recall: Of all actual churners, how many did the model find? (Focus on catching positives)
# F1-Score: Balance between precision and recall (harmonic mean of precision and recall)

# Metrics for class 0 (No churn):
# Precision: Of all predicted no churn, 84% were correct
# Recall: Of all actual no churn customers, 89% were correctly identified
# F1-Score: Balance of precision and recall for no churn (0.86)

# Metrics for class 1 (Churn):
# Precision: Of all predicted churn, 63% were correct
# Recall: Of all actual churn customers, only 52% were identified (missed 48%)
# F1-Score: Balance of precision and recall for churn (0.57)

# Confusion Matrix:
# TN (True Negative): Correctly predicted no churn
# FP (False Positive): Predicted churn but customer stayed (false alarm)
# FN (False Negative): Missed churners (predicted stayed but actually churned)
# TP (True Positive): Correctly predicted churn

# Summary:
# The model is good at identifying customers who will NOT churn but misses about half of the actual churners.
# Need to improve recall for churn class to better catch those likely to leave.

## Lets try to improve the accuracy

### SMOTE, RANDOM CLASSIFIER AND CROSS VALIDATION

In [None]:
# Install imblearn if needed:
!pip install imbalanced-learn

In [179]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# 1. Handle class imbalance with SMOTE
#SMOTE (Synthetic Minority Over-sampling Technique) creates new, synthetic examples of the minority class (e.g. churners) 
#so that both classes have equal size.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Original training set shape: {X_train.shape}, {y_train.shape}")
print(f"Resampled training set shape: {X_resampled.shape}, {y_resampled.shape}")

# 2. Try Different Model - Random Forest
#Random Forest is usually better than logistic regression in handling complex relationships and noisy data.

rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# 3. Cross-validation on resampled data
#Cross-validation splits the resampled training data into 5 parts, trains on 4 parts, tests on the 5th, and repeats this 5 times.

cv_scores = cross_val_score(rf, X_resampled, y_resampled, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")

# Evaluate on test set
y_pred = rf.predict(X_test)

print("Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Original training set shape: (5625, 30), (5625,)
Resampled training set shape: (8260, 30), (8260,)
Test Set Accuracy: 0.775408670931059

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85      1033
           1       0.58      0.59      0.58       374

    accuracy                           0.78      1407
   macro avg       0.71      0.71      0.71      1407
weighted avg       0.78      0.78      0.78      1407

Confusion Matrix:
 [[872 161]
 [155 219]]


### XGBOOST, SMOTE AND CROSS VALIDATION

In [None]:
# Define XGBoost model (handles imbalanced data well, usually high performance)
# Cross-validation checks model reliability across splits
# Fit on resampled (SMOTE) data to handle class imbalance
# Predict and evaluate on the test set

In [188]:
!pip install xgboost



In [193]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Define the XGBoost model
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

# Step 2: Cross-validation on resampled training data
xgb_cv_scores = cross_val_score(xgb_model, X_resampled, y_resampled, cv=5)
print("Cross-validation accuracy scores:", xgb_cv_scores)
print("Mean cross-validation accuracy:", xgb_cv_scores.mean())

# Step 3: Fit the model
xgb_model.fit(X_resampled, y_resampled)

# Step 4: Predict on test data
y_pred_xgb = xgb_model.predict(X_test)

# Step 5: Evaluation
print("\nTest Set Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


Cross-validation accuracy scores: [0.71791768 0.78026634 0.89588378 0.89588378 0.9031477 ]
Mean cross-validation accuracy: 0.8386198547215497

Test Set Accuracy: 0.7697228144989339

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1033
           1       0.56      0.59      0.58       374

    accuracy                           0.77      1407
   macro avg       0.71      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407

Confusion Matrix:
[[861 172]
 [152 222]]
