In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [3]:
df = pd.read_csv('./customer_churn_data.csv')
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


In [8]:
df.drop(columns=['CustomerID','Gender'],index=1, inplace=True)

In [29]:
df.columns

Index(['Age', 'Tenure', 'MonthlyCharges', 'ContractType', 'InternetService',
       'TotalCharges', 'TechSupport', 'Churn'],
      dtype='object')

In [None]:
df.ContractType.value_counts()
df.InternetService.value_counts()
df.TechSupport.value_counts()
df.Churn.value_counts()

ContractType
Month-to-Month    510
One-Year          289
Two-Year          200
Name: count, dtype: int64

In [10]:
df.InternetService.value_counts()

InternetService
Fiber Optic    394
DSL            308
Name: count, dtype: int64

In [11]:
df.TechSupport.value_counts()

TechSupport
Yes    505
No     494
Name: count, dtype: int64

In [12]:
df.Churn.value_counts()

Churn
Yes    882
No     117
Name: count, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['ContractType', 'InternetService', 'TechSupport', 'Churn']

label_encoders = {}

# Apply Label Encoding
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Store the encoder for future use

# Display the updated DataFrame
(df.head())


Unnamed: 0,Age,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,49,4,88.35,0,1,353.4,1,1
2,51,2,63.79,0,1,127.58,0,1
3,60,8,102.34,1,0,818.72,1,1
4,42,32,69.01,0,2,2208.32,0,1
5,42,16,119.75,2,0,1916.0,1,1


In [15]:
df.Churn.value_counts()

Churn
1    882
0    117
Name: count, dtype: int64

In [16]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['Churn'] == 1]  # Majority class
df_minority = df[df['Churn'] == 0]  # Minority class

# Upsample minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,          # Sample with replacement
    n_samples=len(df_majority),  # Match the number of majority class samples
    random_state=42        # For reproducibility
)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Shuffle the dataset
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify class balance
print(df_upsampled['Churn'].value_counts())


Churn
0    882
1    882
Name: count, dtype: int64


In [17]:
df_upsampled.head()

Unnamed: 0,Age,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,40,53,49.81,2,1,2639.93,1,0
1,50,28,90.66,0,1,2538.48,1,1
2,53,16,81.1,1,0,1297.6,1,0
3,33,13,49.01,1,1,637.13,1,0
4,49,13,43.94,2,0,571.22,1,0


In [18]:
df = df_upsampled

In [19]:
df.shape

(1764, 8)

In [20]:
from sklearn.model_selection import train_test_split

# Features and target
X = df_upsampled.drop(columns=['Churn'])  # Drop unnecessary columns
y = df_upsampled['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1411, 7), (353, 7), (1411,), (353,))

In [22]:
X_train.columns

Index(['Age', 'Tenure', 'MonthlyCharges', 'ContractType', 'InternetService',
       'TotalCharges', 'TechSupport'],
      dtype='object')

In [23]:
scaler = StandardScaler()
numerical_cols = ['Age', 'Tenure', 'MonthlyCharges', 'ContractType', 'InternetService',
       'TotalCharges', 'TechSupport']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [32]:
for i in numerical_cols:
    print(i , df[i].min(),df[i].max())

Age 12 83
Tenure 0 122
MonthlyCharges 30.0 119.96
ContractType 0 2
InternetService 0 2
TotalCharges 0.0 12416.25
TechSupport 0 1


In [26]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=500, random_state=50),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(probability=True, random_state=1),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [27]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# Voting classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', models['Random Forest']),
    ('lr', models['Logistic Regression']),
    ('ab', models['AdaBoost'])
], voting='soft')

voting_clf.fit(X_train, y_train)
voting_y_pred = voting_clf.predict(X_test)
voting_acc = accuracy_score(y_test, voting_y_pred)
print("Voting Classifier Accuracy:", voting_acc)
print(classification_report(y_test, voting_y_pred))


Random Forest Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       184
           1       1.00      1.00      1.00       169

    accuracy                           1.00       353
   macro avg       1.00      1.00      1.00       353
weighted avg       1.00      1.00      1.00       353

AdaBoost Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       184
           1       1.00      1.00      1.00       169

    accuracy                           1.00       353
   macro avg       1.00      1.00      1.00       353
weighted avg       1.00      1.00      1.00       353





Gradient Boosting Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       184
           1       1.00      1.00      1.00       169

    accuracy                           1.00       353
   macro avg       1.00      1.00      1.00       353
weighted avg       1.00      1.00      1.00       353

SVM Accuracy: 0.9773
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       184
           1       1.00      0.95      0.98       169

    accuracy                           0.98       353
   macro avg       0.98      0.98      0.98       353
weighted avg       0.98      0.98      0.98       353

Logistic Regression Accuracy: 0.9008
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       184
           1       0.95      0.83      0.89       169

    accuracy                           0.90       353
   macro avg       0.91      0.90  



In [28]:
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)