TASK 2: BINARY CLASSIFICATION MODELS

In [5]:
# Import necessary libraries
import kagglehub
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           roc_auc_score, confusion_matrix, classification_report,
                           roc_curve, precision_recall_curve, auc)
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')
import joblib
import time

# Set random seed for reproducibility
np.random.seed(42)

In [2]:
print("="*70)
print("STEP 1: DATA LOADING AND PREPROCESSING")
print("="*70)

STEP 1: DATA LOADING AND PREPROCESSING


In [8]:
# Download dataset
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

# Exact filename
print(os.listdir(path))

# Load the CSV
df = pd.read_csv(os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv"))

Using Colab cache for faster access to the 'telco-customer-churn' dataset.
['WA_Fn-UseC_-Telco-Customer-Churn.csv']


In [9]:
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}\n")

# Data cleaning based on Task 1 findings
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Handle missing values (using non-churned median as identified in Task 1)
non_churned_median = df[df['Churn'] == 'No']['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(non_churned_median)

print(f"Missing values after imputation: {df['TotalCharges'].isnull().sum()}")

# Drop customerID as it's not useful for prediction
df = df.drop('customerID', axis=1)

Dataset shape: (7043, 21)
Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values after imputation: 0


In [10]:
print("\n\n" + "="*70)
print("STEP 2: FEATURE ENGINEERING")
print("="*70)



STEP 2: FEATURE ENGINEERING


In [11]:
# Create new features based on EDA insights
df['TenureGroup'] = pd.cut(df['tenure'],
                           bins=[0, 12, 24, 36, 48, 60, 72],
                           labels=['0-12m', '13-24m', '25-36m', '37-48m', '49-60m', '61-72m'])

# Calculate charge ratio
df['ChargeRatio'] = df['MonthlyCharges'] / (df['TotalCharges'] + 1)  # +1 to avoid division by zero

# Create total services feature
service_columns = ['PhoneService', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies']

df['TotalServices'] = 0
for col in service_columns:
    if col in df.columns:
        # Convert categorical to binary
        df['TotalServices'] += (df[col] != 'No').astype(int)

print("New features created:")
print(f"- TenureGroup: {df['TenureGroup'].unique()}")
print(f"- ChargeRatio: Range [{df['ChargeRatio'].min():.3f}, {df['ChargeRatio'].max():.3f}]")
print(f"- TotalServices: Range [{df['TotalServices'].min()}, {df['TotalServices'].max()}]")

New features created:
- TenureGroup: ['0-12m', '25-36m', '37-48m', '13-24m', '61-72m', '49-60m', NaN]
Categories (6, object): ['0-12m' < '13-24m' < '25-36m' < '37-48m' < '49-60m' < '61-72m']
- ChargeRatio: Range [0.012, 0.990]
- TotalServices: Range [2, 9]


In [12]:
print("\n\n" + "="*70)
print("STEP 3: DATA PREPARATION FOR MODELING")
print("="*70)



STEP 3: DATA PREPARATION FOR MODELING


In [13]:
# Separate features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Encode target variable
y = y.map({'No': 0, 'Yes': 1})

# Identify feature types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical features: {categorical_cols}")
print(f"Numerical features: {numerical_cols}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"Churn rate: {(y.sum() / len(y)) * 100:.2f}%")

# Split data with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"Train churn rate: {(y_train.sum() / len(y_train)) * 100:.2f}%")
print(f"Test churn rate: {(y_test.sum() / len(y_test)) * 100:.2f}%")

Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TenureGroup']
Numerical features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'ChargeRatio', 'TotalServices']
Target distribution:
Churn
0    5174
1    1869
Name: count, dtype: int64
Churn rate: 26.54%

Train set size: (5634, 22)
Test set size: (1409, 22)
Train churn rate: 26.54%
Test churn rate: 26.54%


In [14]:

print("\n\n" + "="*70)
print("STEP 4: PREPROCESSING PIPELINE")
print("="*70)

# Define preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle any remaining missing values
    ('scaler', StandardScaler())  # Standardize features
])

# Define preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encoding
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

print("Preprocessing pipeline created successfully")



STEP 4: PREPROCESSING PIPELINE
Preprocessing pipeline created successfully


In [15]:
print("\n\n" + "="*70)
print("STEP 5: DECISION TREE CLASSIFIER")
print("="*70)

# Create Decision Tree pipeline
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Define hyperparameter grid for Decision Tree
dt_param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [3, 5, 7, 10, 15, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__class_weight': ['balanced', None]
}

print("Performing Grid Search for Decision Tree...")

# Perform GridSearchCV with cross-validation
dt_grid_search = GridSearchCV(
    dt_pipeline,
    dt_param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

start_time = time.time()
dt_grid_search.fit(X_train, y_train)
dt_train_time = time.time() - start_time

print(f"Grid Search completed in {dt_train_time:.2f} seconds")

# Get best model and parameters
dt_best_model = dt_grid_search.best_estimator_
dt_best_params = dt_grid_search.best_params_

print("\nBest Decision Tree Parameters:")
for param, value in dt_best_params.items():
    print(f"  {param}: {value}")

# Make predictions
y_pred_dt = dt_best_model.predict(X_test)
y_pred_proba_dt = dt_best_model.predict_proba(X_test)[:, 1]




STEP 5: DECISION TREE CLASSIFIER
Performing Grid Search for Decision Tree...
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Grid Search completed in 288.12 seconds

Best Decision Tree Parameters:
  classifier__class_weight: balanced
  classifier__criterion: gini
  classifier__max_depth: 3
  classifier__max_features: None
  classifier__min_samples_leaf: 1
  classifier__min_samples_split: 2
