## Import the Required Libraries

In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

## Load the data file 

In [3]:
# Load the Telco Churn dataset from a CSV file and display the first 5 rows to get an overview of the data
df = pd.read_csv("Telco_Churn_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [4]:
# Drop the 'Unnamed: 0' column as it's likely an unnecessary index column from CSV export
df = df.drop('Unnamed: 0', axis=1)

# Create the feature matrix 'X' by dropping the target variable 'Churn' from the dataset
x = df.drop('Churn', axis=1)

# Display the features to verify
x.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.5,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.3,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.7,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False


In [5]:
# Extract the target variable 'Churn' into a separate Series 'y' for model training
y = df['Churn']
print(y)

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64


## Train Test Split

In [6]:
# Split the dataset into training and testing sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Decision Tree Classifier

In [7]:
# Initialize Decision Tree Classifier with Gini index, limited depth, and minimum leaf samples
model_dt = DecisionTreeClassifier(
    criterion="gini",          # Use Gini impurity for splitting
    random_state=100,          # Ensures reproducibility of results
    max_depth=6,               # Maximum depth of the tree to avoid overfitting
    min_samples_leaf=8         # Minimum number of samples required at a leaf node
)

In [8]:
# Train (fit) the Decision Tree model on the training dataset
model_dt.fit(x_train, y_train)

In [9]:
# Predict churn labels on the test dataset using the trained Decision Tree model
y_pred = model_dt.predict(x_test)

# Display the predicted values
y_pred

array([0, 0, 1, ..., 0, 0, 0], shape=(1407,))

In [10]:
# Evaluate the accuracy of the Decision Tree model on the test dataset
model_dt.score(x_test, y_test)

0.7668798862828714

In [11]:
# Display precision, recall, F1-score, and support for each class (0: No Churn, 1: Churn)
print(classification_report(y_test, y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1033
           1       0.56      0.59      0.57       374

    accuracy                           0.77      1407
   macro avg       0.70      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407



🚨 The model's accuracy is not sufficient due to class imbalance in the dataset.
In imbalanced datasets, Accuracy is not a reliable metric, as it can be misleading.
For example, if 90% of customers do NOT churn, predicting all as 'No Churn' gives 90% accuracy—but the model fails completely for churn prediction.

🔎 Hence, we focus on metrics like:
- Precision: How many predicted churns were actually churns.
- Recall: How many actual churns were captured.
- F1-Score: Harmonic mean of precision and recall.
🧪 Observation: Class 1 (churned customers) shows very low precision, recall, and F1-score — which means the model is underperforming for the minority class.

 🔄 To resolve this, we apply **SMOTEENN**:
- SMOTE (Synthetic Minority Oversampling Technique): Upsamples the minority class by generating synthetic samples.
- ENN (Edited Nearest Neighbours): Cleans overlapping or noisy data by removing ambiguous points.


In [12]:
# Applying SMOTEENN to handle class imbalance:
# SMOTE (Synthetic Minority Oversampling Technique) upsamples the minority class,
# and ENN (Edited Nearest Neighbours) removes ambiguous points from the majority class.
# This combination improves the quality of the training dataset.
sm = SMOTEENN()

# Fitting and transforming the original dataset to create a balanced version
X_resampled, y_resampled = sm.fit_resample(x, y)

In [13]:
# Splitting the resampled (balanced) dataset into training and testing sets
# This ensures the model trains on balanced data and is evaluated on unseen balanced data
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [14]:
# Initializing the Decision Tree Classifier for the resampled (SMOTEENN-balanced) dataset
# Using 'gini' as the splitting criterion, max depth of 6, and minimum 8 samples per leaf to avoid overfitting
model_dt_smote = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)

In [15]:
# Training the Decision Tree model on the resampled (balanced) training data
model_dt_smote.fit(xr_train, yr_train)

# Predicting churn labels on the resampled test data
yr_predict = model_dt_smote.predict(xr_test)

# Calculating the accuracy score of the model on the resampled test set
model_score_r = model_dt_smote.score(xr_test, yr_test)
print("Accuracy : ",model_score_r)  # Print the accuracy

# Displaying the detailed classification report (precision, recall, f1-score) on the resampled test data
print(metrics.classification_report(yr_test, yr_predict))

Accuracy :  0.9302127659574468
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       542
           1       0.93      0.94      0.94       633

    accuracy                           0.93      1175
   macro avg       0.93      0.93      0.93      1175
weighted avg       0.93      0.93      0.93      1175



In [16]:
# Printing the confusion matrix to understand the model's true/false positives and negatives
print(metrics.confusion_matrix(yr_test, yr_predict))

[[497  45]
 [ 37 596]]


## Random Forest Classifier

In [17]:
# Initializing the Random Forest Classifier with:
# - 100 decision trees (n_estimators)
# - Gini impurity as the splitting criterion
# - Fixed random state for reproducibility
# - Maximum depth of each tree limited to 6
# - Minimum samples required at a leaf node is 8
model_rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8
)

In [18]:
# Fitting (training) the Random Forest model on the training dataset
model_rf.fit(x_train, y_train)


In [19]:
# Using the trained Random Forest model to predict the labels on the test dataset
y_pred = model_rf.predict(x_test)

In [20]:
# Step 4: Evaluate model performance using accuracy score
model_rf.score(x_test, y_test)  # Returns the accuracy (i.e., correct predictions / total samples)

0.7789623312011372

In [21]:
# Print precision, recall, F1-score, and support for each class label (0 and 1)
print(classification_report(y_test, y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1033
           1       0.63      0.42      0.50       374

    accuracy                           0.78      1407
   macro avg       0.72      0.66      0.68      1407
weighted avg       0.76      0.78      0.76      1407



In [22]:
print(confusion_matrix(y_test, y_pred))

[[939  94]
 [217 157]]


In [23]:
# Import SMOTEENN for combined over-sampling (SMOTE) and under-sampling (ENN)
sm = SMOTEENN()

# Apply SMOTEENN on the original dataset to balance the class distribution
# SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic examples for the minority class
# ENN (Edited Nearest Neighbors) removes ambiguous examples from the majority class

X_resampled, y_resampled = sm.fit_resample(x, y)

In [24]:
# Splitting the resampled dataset into training and testing sets
# X_resampled: the features after SMOTEENN resampling
# y_resampled: the labels after resampling
# test_size=0.2: 20% of the data will be used for testing, and 80% for training
xr_train_1, xr_test_1, yr_train_1, yr_test_1 = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [25]:
# Creating a Random Forest classifier with SMOTE-resampled data
# - n_estimators=100: The forest consists of 100 decision trees
# - criterion='gini': Gini impurity is used to measure the quality of splits
# - random_state=100: Ensures reproducibility of results
# - max_depth=6: Limits the depth of each tree to prevent overfitting
# - min_samples_leaf=8: A leaf node must have at least 8 samples to reduce model variance
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini',
                                        random_state=100, max_depth=6, min_samples_leaf=8)

In [26]:
# Training the Random Forest model on the SMOTE+ENN resampled training data
model_rf_smote.fit(xr_train_1, yr_train_1)

In [27]:
# Predicting the labels for the resampled test set using the trained Random Forest model
yr_predict_1 = model_rf_smote.predict(xr_test_1)

In [28]:
# Calculating the accuracy score of the Random Forest model on the SMOTE+ENN resampled test set
model_score_r1 = model_rf_smote.score(xr_test_1, yr_test_1)

In [29]:
# Print the accuracy score of the Random Forest model on the resampled test data
print(model_score_r1)

# Print detailed classification metrics such as precision, recall, f1-score, and support
print(metrics.classification_report(yr_test_1, yr_predict_1))

0.939625850340136
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       511
           1       0.93      0.96      0.95       665

    accuracy                           0.94      1176
   macro avg       0.94      0.94      0.94      1176
weighted avg       0.94      0.94      0.94      1176



In [30]:
# Print the confusion matrix to visualize the number of correct and incorrect predictions for each class
print(metrics.confusion_matrix(yr_test_1, yr_predict_1))

[[466  45]
 [ 26 639]]


The Random Forest Classifier yielded better performance compared to the Decision Tree model,
indicating improved generalization and robustness due to ensemble learning.

While we explored only a few models here, you can further enhance this analysis
by experimenting with additional classifiers like Gradient Boosting, XGBoost, SVM, or Logistic Regression
to compare model performances — a great opportunity for further learning!

## Performing Principal Component Analysis (PCA)

In [31]:
# Apply PCA to reduce dimensionality while retaining 90% of the variance
# Fit PCA on the training set and transform both training and test sets accordingly
# `explained_variance` stores the proportion of variance explained by each principal component
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train_1)
xr_test_pca = pca.transform(xr_test_1)
explained_variance = pca.explained_variance_ratio_

In [32]:
# Initialize a Random Forest Classifier with the following parameters:
# - n_estimators=100: Use 100 decision trees in the forest
# - criterion='gini': Use Gini impurity to measure the quality of splits
# - random_state=100: Set seed for reproducibility
# - max_depth=6: Limit the depth of each tree to prevent overfitting
# - min_samples_leaf=8: A leaf must have at least 8 samples to reduce model complexity
model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [33]:
# Train the Random Forest model on the PCA-transformed training data
# xr_train_pca: reduced-dimension training features after applying PCA
# yr_train1: target labels for the training data
model.fit(xr_train_pca, yr_train_1)

In [34]:
# Predict the target labels for the PCA-transformed test data using the trained Random Forest model
yr_predict_pca = model.predict(xr_test_pca)

In [35]:
# Calculate the accuracy score of the Random Forest model on the PCA-transformed test set
model_score_r_pca = model.score(xr_test_pca, yr_test_1)

In [36]:
# Print the accuracy score of the model on the PCA-transformed test data
print(model_score_r_pca)

# Print the detailed classification report including precision, recall, f1-score, and support
# This helps in understanding model performance across different classes
print(metrics.classification_report(yr_test_1, yr_predict_pca))

0.7278911564625851
              precision    recall  f1-score   support

           0       0.69      0.67      0.68       511
           1       0.75      0.77      0.76       665

    accuracy                           0.73      1176
   macro avg       0.72      0.72      0.72      1176
weighted avg       0.73      0.73      0.73      1176



✅ Conclusion after PCA:

Even after applying Principal Component Analysis (PCA) for dimensionality reduction (retaining 90% of the variance), we didn’t observe any significant improvement in the model’s performance. In fact, the accuracy and classification metrics were slightly better without PCA.

🔍 Final Decision:

Since the Random Forest Classifier without PCA gave us the best performance, we’ll finalize that model for deployment.

## Pickling the model

In [37]:
import pickle  
# 📦 Importing the pickle module to enable saving and loading Python objects to/from disk.

filename = 'model.sav'  
# 🗂️ Defining the filename where the trained model will be saved.

pickle.dump(model_rf_smote, open(filename, 'wb'))  
# 💾 Saving (pickling) the trained Random Forest model (with SMOTE-applied data) into the file 'model.sav' in write-binary mode.

load_model = pickle.load(open(filename, 'rb'))  
# 📤 Loading (unpickling) the model from the 'model.sav' file in read-binary mode.

model_score_r1 = load_model.score(xr_test_1, yr_test_1)  
# 🧪 Evaluating the loaded model's performance on the test data using `.score()` which returns the accuracy.

model_score_r1  
# 📈 Displays the accuracy score of the loaded model. any other way to save model

0.939625850340136

In [42]:
# !python -m streamlit run webapp.py

✅ Our final model — a Random Forest Classifier trained on balanced data using SMOTEENN — has been successfully saved as 'model.sav'.

🚀 This model is now ready for deployment. We will build an interactive UI around it using Streamlit.

🖥️ This allows seamless integration between our machine learning model and a web-based user interface, enabling easy input and real-time predictions.
