<a href="https://colab.research.google.com/github/avindumihisara0229-code/telco-churn-project/blob/main/telco_churn_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install keras-tuner imbalanced-learn

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset is read from
# /content/drive/MyDrive in the cells below.
drive.mount(mountpoint='/content/drive')

In [None]:
import pandas as pd

# Location of the Telco churn CSV on the mounted Drive
# (edit this path if the file was saved in a subfolder)
file_path = '/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw data and preview the first five records as the cell output
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Import libraries for Task 1 (EDA)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style: apply seaborn's "whitegrid" theme once, up front,
# so the figures drawn in the cells below share the same look
sns.set_theme(style="whitegrid")

In [None]:
# (Re)load the raw dataset so the cleaning cells below always start from the file on disk
# --- IMPORTANT: Update this path if your file is in a folder ---
file_path = '/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# 1. Preview the first few rows (rich table rendering via display)
first_rows = df.head(n=5)
print("--- First 5 Rows ---")
display(first_rows)

# 2. Column names, non-null counts and dtypes
print("\n--- Column Info (Data Types) ---")
df.info()

In [None]:
# 1. Drop the 'customerID' column (an identifier, not used as a feature).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='customerID', errors='ignore')

# 2. Fix the 'TotalCharges' column
# errors='coerce' turns any unparseable values (like empty strings) into NaN (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# 3. Check for any null values we just created (per-column missing-value counts)
print("--- Null Values After Cleaning ---")
print(df.isnull().sum())

In [None]:
# 4. Remove every row that still contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

# 5. Summary statistics of the numerical columns after cleaning
numeric_summary = df.describe()
print("\n--- Numerical Statistics (After Clean) ---")
display(numeric_summary)

In [None]:
# Class balance of the target variable 'Churn': bar chart of the counts ...
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', ax=ax)
ax.set_title('Churn Distribution (Target Variable)')
plt.show()

# ... followed by the exact class proportions
churn_share = df['Churn'].value_counts(normalize=True)
print(churn_share)

In [None]:
# Stacked histograms of each numerical feature, split by churn status.
# One panel per (column, title) pair, drawn left to right.
numeric_panels = [
    ('tenure', 'Tenure vs. Churn'),
    ('MonthlyCharges', 'Monthly Charges vs. Churn'),
    ('TotalCharges', 'Total Charges vs. Churn'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for panel_ax, (column, panel_title) in zip(axes, numeric_panels):
    sns.histplot(data=df, x=column, hue='Churn', multiple='stack', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Count plots of the key categorical features, split by churn status.
# One panel per (column, title) pair, drawn left to right.
categorical_panels = [
    ('Contract', 'Churn Rate by Contract Type'),
    ('InternetService', 'Churn Rate by Internet Service'),
    ('PaymentMethod', 'Churn Rate by Payment Method'),
]

fig, axes = plt.subplots(1, 3, figsize=(22, 7))
for panel_ax, (column, panel_title) in zip(axes, categorical_panels):
    sns.countplot(data=df, x=column, hue='Churn', ax=panel_ax)
    panel_ax.set_title(panel_title)

# Rotate the tick labels of the current (last) panel for readability
plt.xticks(rotation=15)

plt.tight_layout()
plt.show()

Task 2

In [None]:
# --- Imports for Task 2 ---

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Imbalance Handling
from imblearn.over_sampling import SMOTE

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [None]:
# 1. Encode the target variable 'Churn' as 1 (Yes) / 0 (No).
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# 0/1 column through the Yes/No dictionary would turn every value into NaN.
churn_codes = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map(churn_codes)

# Fail loudly here instead of letting NaN targets reach the split/model cells
if df['Churn'].isna().any():
    raise ValueError(
        "'Churn' could not be encoded to 0/1 (unexpected labels or a stale "
        "DataFrame); reload the data and re-run the cleaning cells."
    )

In [None]:
# 2. Separate the target column (y) from the remaining feature columns (X)
target_column = 'Churn'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# 3. Partition the feature columns by type
# The three continuous columns are listed explicitly ...
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# ... and every other column of X is treated as categorical (original column order kept)
categorical_features = X.drop(columns=numeric_features).columns.tolist()

print(f"Numerical Features: {numeric_features}")
print(f"Categorical Features: {categorical_features}")

In [None]:
# 4. Hold out a test set
split_options = dict(
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed -> same split on every run
    stratify=y,       # preserve the churn / no-churn ratio in both parts
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# 5. Per-type transformers: standardise the numeric columns, one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time)
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# 6. Route each column group to its transformer
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# 7-8. Fit the preprocessor on the training features only, then reuse the
# fitted transformers on the test features
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 9. Handle class imbalance with SMOTE (applied to the training data only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Report how the shape and the class proportions changed
original_distribution = y_train.value_counts(normalize=True)
resampled_distribution = pd.Series(y_train_resampled).value_counts(normalize=True)

print(f"Shape before SMOTE: {X_train_processed.shape}")
print(f"Shape after SMOTE: {X_train_resampled.shape}")
print(f"Original y_train distribution:\n{original_distribution}")
print(f"Resampled y_train distribution:\n{resampled_distribution}")

In [None]:
# --- Model 1: Decision Tree ---

# 1. Pipeline = preprocessing + classifier.
# Because the preprocessor lives inside the pipeline, it is re-fitted on the
# training part of every cross-validation fold.
tree_classifier = DecisionTreeClassifier(random_state=42)
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tree_classifier),
])

# 2. Hyperparameter grid (keys are prefixed with the pipeline step name)
param_grid_dt = dict(
    classifier__max_depth=[3, 5, 7, 10],
    classifier__min_samples_leaf=[5, 10, 20],
    classifier__criterion=['gini', 'entropy'],
)

# 3. Exhaustive grid search
grid_search_dt = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid_dt,
    scoring='roc_auc',  # ranking metric chosen because the classes are imbalanced
    cv=5,               # 5-fold cross-validation
    n_jobs=-1,          # use all available CPU cores
    verbose=1,
)

# 4. Fit on the *original* (unprocessed) training data; the pipeline does the preprocessing
print("Starting Decision Tree Hyperparameter Tuning...")
grid_search_dt.fit(X_train, y_train)

# 5. Keep the best pipeline found by the search
best_dt = grid_search_dt.best_estimator_

print("\n--- Decision Tree Tuning Complete ---")
print(f"Best DT Parameters: {grid_search_dt.best_params_}")
print(f"Best DT Cross-Validation ROC-AUC Score: {grid_search_dt.best_score_:.4f}")

In [None]:
# 1. Score the held-out test set with the tuned decision-tree pipeline
dt_class_probabilities = best_dt.predict_proba(X_test)
y_pred_proba_dt = dt_class_probabilities[:, 1]  # column 1 = probability of class 1 ('Yes')
y_pred_dt = best_dt.predict(X_test)

# 2. Compute the evaluation metrics, then report them
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt, target_names=['No Churn', 'Churn'])
dt_roc_auc = roc_auc_score(y_test, y_pred_proba_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print("--- Decision Tree Test Set Evaluation ---")
print("\nConfusion Matrix:")
print(dt_confusion)

print("\nClassification Report:")
print(dt_report)

print(f"\nTest Set ROC-AUC Score: {dt_roc_auc:.4f}")
print(f"Test Set Accuracy: {dt_accuracy:.4f}")

In [None]:
# 1. Width of the network's first layer: the number of feature columns in the
# preprocessed (and resampled) training matrix
input_shape = X_train_resampled.shape[-1]
print(f"Number of input features for NN: {input_shape}")

# 2. Define the model-building function for KerasTuner
def build_model(hp):
    """Build and compile one candidate churn classifier for KerasTuner.

    Architecture:
        Input(input_shape) -> Dense(units_1, relu) -> Dropout(dropout)
        -> Dense(units_1 // 2, relu) -> Dense(1, sigmoid)

    Args:
        hp: KerasTuner hyperparameter container. The function registers
            ``units_1`` (16 to 64, step 16), ``dropout`` (0.1 to 0.5, step 0.1)
            and ``learning_rate`` (1e-3 or 1e-4) on it.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss) that reports an AUC metric explicitly named ``'auc'``.
    """
    hp_units_1 = hp.Int('units_1', min_value=16, max_value=64, step=16)
    hp_dropout = hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)
    hp_learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4])

    model = Sequential()

    # Explicit Input layer instead of the deprecated `input_shape=` kwarg on Dense;
    # `input_shape` is the notebook-level feature count computed above
    model.add(keras.Input(shape=(input_shape,)))
    model.add(Dense(units=hp_units_1, activation='relu'))
    model.add(Dropout(rate=hp_dropout))

    # Second hidden layer is half as wide as the first
    model.add(Dense(units=hp_units_1 // 2, activation='relu'))

    # Single sigmoid unit -> probability of churn
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  # Pin the metric name: the tuner objective and EarlyStopping both
                  # look up 'val_auc', and an auto-generated name (e.g. 'auc_1' on
                  # later trials) would make that lookup fail.
                  metrics=[keras.metrics.AUC(name='auc')])

    return model

In [None]:
# Remove any tuner results left in ./my_dir by a previous run so the next
# search starts from scratch.
# Pure-Python replacement for `!rm -rf ./my_dir`: works on any OS, and
# ignore_errors=True means a missing folder is not an error.
import shutil
from pathlib import Path

shutil.rmtree(Path('./my_dir'), ignore_errors=True)

print("Cleared the tuner's cache. Ready to try again.")

In [None]:
# 1. Set up the KerasTuner (we'll use RandomSearch)
tuner = kt.RandomSearch(
    build_model,
    objective=kt.Objective("val_auc", direction="max"), # We want to maximize Validation AUC
    max_trials=10,        # How many different models to try
    executions_per_trial=2, # How many times to train each model
    seed=42,              # Reproducible hyperparameter sampling (same seed as the rest of the notebook)
    directory='my_dir',   # Folder to store results
    project_name='churn_hpo'
)

# 2. Set up an EarlyStopping callback
# This stops training if the validation AUC doesn't improve, saving time
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',  # AUC is a "higher is better" metric
    patience=10,
    restore_best_weights=True
)

# 3. Build the tuning train/validation sets
# Keras' `validation_split` takes the LAST fraction of the arrays (before any
# shuffling), and SMOTE appends its synthetic minority rows at the end of its
# output. Using validation_split on the resampled data would therefore validate
# on synthetic churners only (a single class), which makes val_auc meaningless.
# Instead: hold out 20% of the *real* training data first, then oversample only
# the remaining part.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train_processed, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train)

X_tune_train_res, y_tune_train_res = SMOTE(random_state=42).fit_resample(
    X_tune_train, y_tune_train)

# 4. Run the search
# Training uses the resampled data; validation uses untouched, real samples.
print("\nStarting Neural Network Hyperparameter Tuning...")
tuner.search(X_tune_train_res, np.asarray(y_tune_train_res),
             epochs=50,
             validation_data=(X_tune_val, np.asarray(y_tune_val)),
             callbacks=[early_stopping],
             verbose=1)

# 5. Get the best model
best_nn_model = tuner.get_best_models(num_models=1)[0]

print("\n--- Neural Network Tuning Complete ---")
tuner.results_summary()

In [None]:
# 1. Score the held-out test set with the best network.
# predict() returns one probability per row as a column; flatten it to 1-D.
nn_raw_output = best_nn_model.predict(X_test_processed)
y_pred_proba_nn = nn_raw_output.ravel()

# 2. Turn probabilities into 0/1 labels: strictly above 0.5 counts as churn
churn_mask = y_pred_proba_nn > 0.5
y_pred_nn = churn_mask.astype(int)

# 3. Compute the evaluation metrics, then report them
nn_confusion = confusion_matrix(y_test, y_pred_nn)
nn_report = classification_report(y_test, y_pred_nn, target_names=['No Churn', 'Churn'])
nn_roc_auc = roc_auc_score(y_test, y_pred_proba_nn)
nn_accuracy = accuracy_score(y_test, y_pred_nn)

print("\n--- Neural Network Test Set Evaluation ---")
print("\nConfusion Matrix:")
print(nn_confusion)

print("\nClassification Report:")
print(nn_report)

print(f"\nTest Set ROC-AUC Score: {nn_roc_auc:.4f}")
print(f"Test Set Accuracy: {nn_accuracy:.4f}")