<a href="https://colab.research.google.com/github/afeefzeed/telco-churn-cw/blob/main/telco_churn_cw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries (run once)
!pip install -q pandas numpy matplotlib seaborn scikit-learn tensorflow keras


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras

# Apply seaborn's white-grid theme to the figures drawn by this notebook,
# then confirm that the import cell ran to completion.
sns.set(style="whitegrid")
print("Libraries imported")

In [None]:
# ---------- CELL A: Choose & load dataset (Colab friendly) ----------
# Locate the CSV in one of three ways (browser upload, a file already in the
# working directory, or a path on a mounted Google Drive) and read it into
# `df`, the raw frame used by the later cells.
import os, io, pandas as pd, numpy as np

try:
    from google.colab import files
except ImportError:
    # Not running inside Colab: options 1 and 3 are unavailable, option 2 still works.
    files = None

print("Choose upload method:\n1) Upload file from computer\n2) Use file already in Colab working dir\n3) Mount Google Drive (enter path manually)")
choice = input("Enter 1/2/3 (press Enter for 1): ").strip() or "1"

# Fail fast on a typo instead of silently falling through to the Drive branch.
if choice not in ("1", "2", "3"):
    raise ValueError(f"Unknown choice {choice!r}; expected 1, 2 or 3")

if choice == "1":
    if files is None:
        raise RuntimeError("Browser upload needs Google Colab; choose option 2 instead")
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back (e.g. the dialog was dismissed); indexing the
        # first key below would otherwise raise a bare IndexError.
        raise FileNotFoundError("No file was uploaded")
    # take the first uploaded file
    fname = list(uploaded.keys())[0]
    path = fname
    print("Uploaded:", path)
elif choice == "2":
    path = input("Enter filename in working dir (e.g., Telco-Customer-Churn.csv): ").strip()
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found in working dir")
else:
    if files is None:
        raise RuntimeError("Mounting Google Drive needs Google Colab; choose option 2 instead")
    from google.colab import drive
    drive.mount('/content/drive')
    path = input("Enter full path to file in Drive (e.g., /content/drive/MyDrive/Colab Notebooks/Telco-Customer-Churn.csv): ").strip()
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found")

# Load
df = pd.read_csv(path)
print("Loaded:", path, " — shape:", df.shape)

# Preview the first rows and list the column names
display(df.head())
print("\nColumns:", df.columns.tolist())


In [None]:
# ---------- CELL B:  EDA (feature signal & actionable insights) ----------
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance

sns.set(style="whitegrid")

# 1) Frame size and the columns with the most missing values
print("Shape:", df.shape)
print("\nMissing values (top):")
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False).head(20))

# 2) TotalCharges quick check (Telco-specific): look for whitespace-only entries
if 'TotalCharges' in df.columns:
    print("\nTotalCharges dtype before fix:", df['TotalCharges'].dtype)
    # True where the value, read as text, is empty after stripping whitespace
    blank_mask = df['TotalCharges'].astype(str).str.strip() == ""
    spaces = blank_mask.sum()
    print("TotalCharges blank-strings:", spaces)
    # a handful of the affected rows
    display(df[blank_mask].head())

# 3) Target balance & class-wise numeric summary
# The target is 'Churn' when present, otherwise 'y'; anything else is an error.
target_col = next((name for name in ('Churn', 'y') if name in df.columns), None)
if target_col is None:
    raise ValueError("Cannot find target column named 'Churn' or 'y'")

print(f"\nTarget column: {target_col}")
display(df[target_col].value_counts())
print("\nPercentage:")
class_share = df[target_col].value_counts(normalize=True) * 100
display(class_share.round(2))

# Numeric columns and their per-class means (first ten columns shown)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("\nNumeric columns:", num_cols)
if num_cols:
    class_means = df.groupby(target_col)[num_cols].mean()
    display(class_means.T.head(10))

# 4) Mutual information (fast signal ranking)
# Work on a numeric copy of df: identifier columns are dropped, TotalCharges is
# parsed as a number, two-valued text columns become 0/1 and every remaining
# text column becomes integer category codes (fast, not the final encoding!).
prep = df.copy()

# Identifier columns carry no signal; left in, their category codes would be
# ranked (and split on by the tree used later) as if they were a real feature.
prep = prep.drop(columns=['customerID', 'CustomerID', 'customerId'], errors='ignore')

# Parse TotalCharges as numbers so it is not ranked by the alphabetical order
# of its string values; unparseable entries become NaN (filled with 0 below).
if 'TotalCharges' in prep.columns:
    prep['TotalCharges'] = pd.to_numeric(prep['TotalCharges'], errors='coerce')

# quick conversion for obvious binary columns
for c in prep.select_dtypes(include=['object']).columns:
    # Sorted non-null levels give a mapping that does not depend on row order
    # (e.g. 'No' -> 0, 'Yes' -> 1); missing values stay NaN instead of being
    # mistaken for one of the two levels.
    levels = sorted(prep[c].dropna().unique(), key=str)
    if len(levels) == 2:
        prep[c] = prep[c].map({levels[0]: 0, levels[1]: 1})

# for mutual info we need numeric features; convert remaining object columns to codes
for c in prep.select_dtypes(include=['object']).columns:
    prep[c] = prep[c].astype('category').cat.codes

if prep[target_col].isna().any():
    raise ValueError(f"Target column {target_col!r} contains missing values")

X_mi = prep.drop(columns=[target_col], errors='ignore').select_dtypes(include=[np.number])
y_mi = prep[target_col].astype(int)
mi = mutual_info_classif(X_mi.fillna(0), y_mi, discrete_features='auto', random_state=42)
mi_series = pd.Series(mi, index=X_mi.columns).sort_values(ascending=False)
print("\nTop mutual information features (fast):")
display(mi_series.head(15))

# 5) Permutation importance from a shallow decision tree (quick, exploratory)
print("\nPermutation importance (shallow DecisionTree, fast):")

# Cap the work at 5000 randomly drawn rows when the frame is larger than that
# (a plain random sample; nothing here balances the classes).
sample_idx = df.sample(5000, random_state=42).index if len(df) > 5000 else df.index
X_perm = X_mi.loc[sample_idx].fillna(0)
y_perm = y_mi.loc[sample_idx]

small_dt = DecisionTreeClassifier(max_depth=6, random_state=42)
small_dt.fit(X_perm, y_perm)

# Importances are measured on the same rows the tree was fitted on.
perm = permutation_importance(small_dt, X_perm, y_perm, n_repeats=8, n_jobs=1, random_state=42)
importance_means = pd.Series(perm.importances_mean, index=X_perm.columns)
perm_series = importance_means.sort_values(ascending=False)
display(perm_series.head(15))

# 6) PCA projection (2D) of top-K numeric features
top_feats = mi_series.head(8).index.tolist()
if len(top_feats) >= 2:
    pca_input = prep[top_feats].fillna(0).astype(float)
    # PCA centres its input but does not rescale it, so columns are
    # standardised first; otherwise the largest-valued features would
    # dominate both components.
    spread = pca_input.std(ddof=0).replace(0, 1.0)  # constant column: avoid 0/0
    pca_input = (pca_input - pca_input.mean()) / spread

    pca = PCA(n_components=2, random_state=42)
    proj = pca.fit_transform(pca_input)
    var_pc1, var_pc2 = pca.explained_variance_ratio_ * 100

    plt.figure(figsize=(8,6))
    sns.scatterplot(x=proj[:,0], y=proj[:,1], hue=prep[target_col].astype(str), alpha=0.6, s=40)
    plt.title('PCA (2D) projection on top features — colored by target')
    plt.xlabel(f'PC1 ({var_pc1:.1f}% of variance)')
    plt.ylabel(f'PC2 ({var_pc2:.1f}% of variance)')
    plt.show()

# 7) Compact actionable plots (contract vs churn, payment vs churn, tenure bins)
# One count plot per categorical driver that is present in the data.
for cat_feature, plot_title in (('Contract', 'Contract type vs Churn'),
                                ('PaymentMethod', 'PaymentMethod vs Churn')):
    if cat_feature in df.columns:
        plt.figure(figsize=(8,4))
        sns.countplot(data=df, x=cat_feature, hue=target_col)
        plt.title(plot_title); plt.xticks(rotation=45); plt.show()

# tenure bins
if 'tenure' in df.columns:
    # The helper column lives on a temporary frame, so df is never modified:
    # an interrupted run cannot leave 'tenure_bin' behind for the modelling
    # cells, and an existing column of that name is not dropped.
    # Tenure values outside (-1, 72] fall outside the bins and become NaN.
    tenure_view = df[[target_col]].assign(
        tenure_bin=pd.cut(df['tenure'], bins=[-1,0,12,24,48,72],
                          labels=['None','0-12','13-24','25-48','49-72'])
    )
    plt.figure(figsize=(8,4))
    sns.countplot(data=tenure_view, x='tenure_bin', hue=target_col)
    plt.title('Tenure bins vs Churn'); plt.show()

print("\nEDA done — use these signals to pick features & engineering steps.")


In [None]:
# --- Advanced Visualization Cell ---
# Four churn-focused plots drawn from the raw frame. The cell only reads df;
# the tenure grouping is built on a temporary frame.
if 'df' in globals():
    # 1) Target distribution
    plt.figure(figsize=(6,4))
    sns.countplot(x='Churn', data=df)
    plt.title('Churn Distribution')
    plt.show()

    # 2) Tenure bins vs churn
    # Binning on a temporary frame keeps 'tenure_bin' out of df, so a failed
    # or interrupted plot cannot leave the helper column in the dataset.
    # Tenure values outside (-1, 72] fall outside the bins and become NaN.
    tenure_groups = df[['Churn']].assign(
        tenure_bin=pd.cut(df['tenure'], bins=[-1,12,24,48,72],
                          labels=['0-12','13-24','25-48','49-72'])
    )
    plt.figure(figsize=(8,4))
    sns.countplot(data=tenure_groups, x='tenure_bin', hue='Churn')
    plt.title('Tenure Groups vs Churn')
    plt.show()

    # 3) MonthlyCharges distribution split by churn
    # common_norm=False normalises each churn group separately.
    plt.figure(figsize=(8,4))
    sns.kdeplot(data=df, x='MonthlyCharges', hue='Churn', fill=True, common_norm=False)
    plt.title('Monthly Charges Distribution by Churn')
    plt.show()

    # 4) Internet Service vs Churn
    if 'InternetService' in df.columns:
        plt.figure(figsize=(8,4))
        sns.countplot(data=df, x='InternetService', hue='Churn')
        plt.title('Internet Service vs Churn')
        plt.xticks(rotation=45)
        plt.show()
else:
    print("Load dataset first.")


In [None]:
# --- Improved Correlation Analysis Cell ---
def churn_correlations(frame, target='Churn'):
    """Correlate every numeric column of ``frame`` with the churn target.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the target column and any number of feature columns.
    target : str, default 'Churn'
        Name of the target column. 'Yes'/'No' labels are mapped to 1/0; a
        column that is already numeric is used unchanged.

    Returns
    -------
    pandas.Series or None
        Pearson correlation of each numeric feature with the target, indexed
        by feature name and sorted in descending order. ``None`` when the
        target column is absent, fewer than two labels can be read as
        numbers, or there is no numeric feature column.
    """
    if target not in frame.columns:
        return None
    labels = frame[target]
    if not pd.api.types.is_numeric_dtype(labels):
        # Text labels: anything other than 'Yes'/'No' becomes NaN
        labels = labels.map({'Yes': 1, 'No': 0})
    labels = labels.astype(float)
    features = frame.select_dtypes(include=[np.number]).drop(columns=[target], errors='ignore')
    if features.shape[1] == 0 or labels.notna().sum() < 2:
        return None
    return features.corrwith(labels).sort_values(ascending=False)


if 'df' in globals():

    # 1) Numeric Correlation Heatmap
    num = df.select_dtypes(include=[np.number])
    if num.shape[1] > 1:
        plt.figure(figsize=(12,8))
        sns.heatmap(num.corr(), annot=True, fmt='.2f', cmap='coolwarm')
        plt.title('Numeric Feature Correlation Heatmap')
        plt.show()
    else:
        print("Not enough numeric columns for numeric correlation.")

    # 2) Correlation of numeric features WITH Churn
    # The raw target is text, so it is encoded inside the helper; the earlier
    # lowercase 'churn' lookup never matched and this plot was always skipped.
    churn_corr = churn_correlations(df, target='Churn')
    if churn_corr is not None:
        plt.figure(figsize=(6,5))
        sns.barplot(x=churn_corr.values, y=churn_corr.index, palette="viridis")
        plt.title("Correlation of Numeric Features with Churn")
        plt.xlabel("Correlation Value")
        plt.show()
    else:
        print("Churn column missing or not usable; cannot compute correlation with target.")

else:
    print("Load the dataset first.")


In [None]:
# ================================
# FINAL CLEANING + PREPROCESSING PIPELINE
# ================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

# Banner for the preprocessing log
rule = "=" * 70
print("\n" + rule)
print("DATA PREPROCESSING PIPELINE")
print(rule)

# ---- 1. WORKING COPY ----
# All cleaning happens on df_clean; the raw df is only copied here.
df_clean = df.copy()
print(f"\n✓ Step 1: Created working copy")
print(f"  Original shape: {df_clean.shape}")

# ---- 2. FIX TotalCharges ----
# Parse the column as numbers; entries that cannot be parsed are replaced by
# MonthlyCharges * tenure.
if 'TotalCharges' in df_clean.columns:
    total_raw = df_clean['TotalCharges']
    blank_count = (total_raw.astype(str).str.strip() == "").sum()
    total_num = pd.to_numeric(total_raw.replace(" ", np.nan), errors="coerce")
    df_clean['TotalCharges'] = total_num.fillna(
        df_clean['MonthlyCharges'] * df_clean['tenure']
    )
    remaining_nulls = df_clean['TotalCharges'].isnull().sum()
    print(f"\n✓ Step 2: Fixed TotalCharges column")
    print(f"  - Found {blank_count} blank entries")
    print(f"  - Imputed using: MonthlyCharges × tenure")
    print(f"  - Remaining nulls: {remaining_nulls}")

# ---- 3. DROP NON-USEFUL ID COLUMNS ----
# Only the spellings that are actually present get dropped.
id_cols = ['customerID', 'CustomerID', 'customerId']
dropped_ids = [name for name in id_cols if name in df_clean.columns]
df_clean = df_clean.drop(columns=dropped_ids, errors='ignore')
print(f"\n✓ Step 3: Dropped ID columns")
print(f"  - Removed: {dropped_ids}" if dropped_ids else f"  - No ID columns found")

# ---- 4. ENCODE TARGET ----
# Keep the label counts from before encoding, then map Yes/No to 1/0.
original_target_dist = df_clean['Churn'].value_counts()
df_clean['Churn'] = df_clean['Churn'].map({'Yes':1, 'No':0}).astype(int)
encoded_counts = dict(df_clean['Churn'].value_counts())
churn_pct = (df_clean['Churn']==1).sum() / len(df_clean) * 100
print(f"\n✓ Step 4: Encoded target variable")
print(f"  - 'Yes' → 1 (Churn)")
print(f"  - 'No' → 0 (No Churn)")
print(f"  - Class distribution: {encoded_counts}")
print(f"  - Imbalance ratio: {churn_pct:.1f}% churn")

# ---- 5. FEATURE GROUPS ----
# The three groups must be disjoint. The Yes/No columns are still text at this
# point, so they are excluded from cat_cols explicitly; otherwise they would be
# converted to 0/1 in step 6 AND one-hot encoded in step 8.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
binary_cols  = [c for c in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
                if c in df_clean.columns]
cat_cols     = [c for c in df_clean.select_dtypes(include='object').columns
                if c != 'Churn' and c not in binary_cols]
print(f"\n✓ Step 5: Identified feature groups")
print(f"  - Numeric features ({len(numeric_cols)}): {numeric_cols}")
print(f"  - Binary features ({len(binary_cols)}): {binary_cols}")
print(f"  - Categorical features ({len(cat_cols)}): {cat_cols}")

# ---- 6. FIX BINARY YES/NO ----
# Values other than 'Yes'/'No' become NaN here and are filled in step 7.
binary_converted = []
for col in binary_cols:
    df_clean[col] = df_clean[col].map({'Yes':1, 'No':0}).astype('float')
    binary_converted.append(col)
print(f"\n✓ Step 6: Converted binary features to 0/1")
print(f"  - Converted: {binary_converted}")

# ---- 7. FILL MISSING CATEGORICAL VALUES ----
# Binary columns are included so they keep being mode-filled, as they were
# when they were part of cat_cols.
filled_cats = []
for col in cat_cols + binary_cols:
    missing_before = df_clean[col].isnull().sum()
    if missing_before > 0:
        df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
        filled_cats.append((col, missing_before))
print(f"\n✓ Step 7: Filled missing categorical values")
if filled_cats:
    for col, count in filled_cats:
        print(f"  - {col}: filled {count} missing values with mode")
else:
    print(f"  - No missing categorical values found")

# ---- 8. BUILD PREPROCESSOR ----
# Numeric columns are standardised, multi-level categoricals are one-hot
# encoded, and every other column (the 0/1 binary columns plus any remaining
# non-text column) reaches the model unchanged via remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_cols),
    ],
    remainder='passthrough'
)
print(f"\n✓ Step 8: Built ColumnTransformer")
print(f"  - StandardScaler for numeric features")
print(f"  - OneHotEncoder for categorical features (drop_first=True)")
print(f"  - Binary features passed through as-is")

# ---- 9. TRAIN-TEST SPLIT ----
# 80/20 split, stratified on the target, with a fixed random_state.
y = df_clean['Churn']
X = df_clean.drop(columns=['Churn'])
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)
train_class_counts = dict(Counter(y_train_raw))
test_class_counts = dict(Counter(y_test))
print(f"\n✓ Step 9: Train-test split (80/20, stratified)")
print(f"  - Training set: {X_train_raw.shape[0]} samples")
print(f"  - Test set: {X_test_raw.shape[0]} samples")
print(f"  - Train class distribution: {train_class_counts}")
print(f"  - Test class distribution: {test_class_counts}")

# ---- 10. FIT TRANSFORMER ----
# The preprocessor is fitted on the training split only and then applied to
# both splits.
preprocessor.fit(X_train_raw)
X_train = preprocessor.transform(X_train_raw)
X_test  = preprocessor.transform(X_test_raw)

# The transformed matrix is the source of truth for the feature count: adding
# up the column groups by hand double counts any column that sits in two
# groups and misses passthrough columns that are not listed in binary_cols.
total_features = X_train.shape[1]

try:
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_count = len(ohe.get_feature_names_out())
except (KeyError, AttributeError, ValueError):
    # Encoder unavailable or not fitted (presumably no categorical columns);
    # only the informational breakdown below is affected.
    ohe_feature_count = 0

print(f"\n✓ Step 10: Applied transformations")
print(f"  - Training features shape: {X_train.shape}")
print(f"  - Test features shape: {X_test.shape}")
print(f"  - One-hot encoded columns: {ohe_feature_count}")
print(f"  - Total features after encoding: {total_features}")

# ---- 11. SMOTE BALANCING ----
# Resample the training split only; the test split is left as it is.
counts_before = dict(Counter(y_train_raw))
print(f"\n✓ Step 11: Applying SMOTE for class balancing")
print(f"  - Before SMOTE: {counts_before}")
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train_raw)
counts_after = dict(Counter(y_train_bal))
rows_before, rows_after = X_train.shape[0], X_train_bal.shape[0]
print(f"  - After SMOTE: {counts_after}")
print(f"  - Training samples increased: {rows_before} → {rows_after}")
print(f"  - Class balance achieved: 50/50 split")

# ---- 12. SAVE PREPROCESSOR ----
# Persist the fitted ColumnTransformer to disk.
joblib.dump(preprocessor, "preprocessor.joblib")
print(f"\n✓ Step 12: Saved preprocessor")
print(f"  - File: preprocessor.joblib")

# ---- FINAL SUMMARY ----
closing_rule = "=" * 70
print("\n" + closing_rule)
print("PREPROCESSING COMPLETE ✓")
print(closing_rule)
print(f"Final Training Set: {X_train_bal.shape[0]} samples × {X_train_bal.shape[1]} features")
print(f"Final Test Set: {X_test.shape[0]} samples × {X_test.shape[1]} features")
print(f"Ready for model training!")
print(closing_rule + "\n")

# ---- 13. ASSIGN FINAL VARIABLES ----
# From here on X_train / y_train refer to the SMOTE-balanced training data.
X_train, y_train = X_train_bal, y_train_bal

print("\n".join([
    "Variable assignments:",
    "  X_train = SMOTE-balanced training features",
    "  y_train = SMOTE-balanced training labels",
    "  X_test = original test features (no SMOTE)",
    "  y_test = original test labels (no SMOTE)",
]))

In [None]:
# ---------- CELL D: DecisionTree + GridSearch (tuned) ----------
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

_needed = ('preprocessor', 'X_train_raw', 'y_train_raw', 'X_test', 'y_test')
if all(name in globals() for name in _needed):
    # Tune on the un-resampled training split with SMOTE as a pipeline step.
    # Each CV fold is then oversampled on its training part only and scored on
    # real rows. Searching on the already balanced X_train puts synthetic
    # neighbours of validation rows into the training folds and inflates the
    # cross-validated F1.
    X_train_unbalanced = preprocessor.transform(X_train_raw)
    dt = DecisionTreeClassifier(random_state=42)
    search_pipe = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('dt', dt),
    ])
    param_grid = {'dt__max_depth':[4,6,8,12,16], 'dt__criterion':['gini','entropy'], 'dt__min_samples_leaf':[1,5,10]}
    grid = GridSearchCV(search_pipe, param_grid, cv=4, scoring='f1', n_jobs=-1)
    grid.fit(X_train_unbalanced, y_train_raw)

    # GridSearchCV refits the winning pipeline (SMOTE + tree) on the whole
    # training split; keep the bare tree so the later cells can call
    # best_dt.predict / best_dt.predict_proba directly.
    best_dt = grid.best_estimator_.named_steps['dt']
    best_dt_params = {key.split('__', 1)[1]: value for key, value in grid.best_params_.items()}
    print("Best DT params:", best_dt_params)

    # Evaluate
    y_pred_dt = best_dt.predict(X_test)
    print("Decision Tree Test Accuracy:", accuracy_score(y_test, y_pred_dt))
    print(classification_report(y_test, y_pred_dt))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred_dt)
    sns.heatmap(cm, annot=True, fmt='d'); plt.title("Decision Tree Confusion Matrix"); plt.show()

    # Map feature importances back to names (best effort).
    # Local names are used so the notebook-level num_cols / cat_cols / ohe
    # variables set by earlier cells are not overwritten.
    try:
        scaled_names = list(preprocessor.transformers_[0][2])
        fitted_ohe = preprocessor.transformers_[1][1]
        encoded_source = list(preprocessor.transformers_[1][2])
        ohe_names = []
        if hasattr(fitted_ohe, 'get_feature_names_out'):
            ohe_names = list(fitted_ohe.get_feature_names_out(encoded_source))
        # Remaining columns are listed after the transformed ones, in their
        # original order.
        passthrough_names = [c for c in X_train_raw.columns if c not in scaled_names + encoded_source]
        feature_names = scaled_names + ohe_names + passthrough_names
        fi = pd.Series(best_dt.feature_importances_, index=feature_names).sort_values(ascending=False)
        display(fi.head(15))
    except Exception as e:
        print("Could not map feature names:", e)
else:
    print("Run preprocessing first.")


In [None]:
# ---------- CELL E: Neural Network (improved training + callbacks) ----------
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import set_random_seed
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

if 'X_train' in globals():
    # Seed the random number generators so repeated runs are comparable.
    set_random_seed(42)

    # Keras' validation_split slices off the LAST rows of the arrays before
    # any shuffling. X_train / y_train come out of SMOTE, whose output keeps
    # the original rows first and appends the synthetic minority rows, so that
    # tail would be a one-class, fully synthetic validation set driving
    # EarlyStopping and ReduceLROnPlateau. Carve out a shuffled, stratified
    # validation set explicitly instead.
    X_fit, X_val, y_fit, y_val = train_test_split(
        np.asarray(X_train), np.asarray(y_train),
        test_size=0.15, stratify=np.asarray(y_train), random_state=42
    )

    input_dim = X_train.shape[1]
    nn = Sequential([
        Input(shape=(input_dim,)),  # explicit input layer instead of input_shape= on Dense
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stop when validation loss stalls for 5 epochs (restoring the best
    # weights); halve the learning rate after 3 stalled epochs.
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    rl = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

    history = nn.fit(X_fit, y_fit, epochs=100, batch_size=64, validation_data=(X_val, y_val), callbacks=[es, rl], verbose=2)

    # plot history
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1); plt.plot(history.history['loss'], label='train'); plt.plot(history.history['val_loss'], label='val'); plt.legend(); plt.title('Loss')
    plt.subplot(1,2,2); plt.plot(history.history['accuracy'], label='train'); plt.plot(history.history['val_accuracy'], label='val'); plt.legend(); plt.title('Accuracy')
    plt.show()

    # Evaluate on the untouched test split
    loss, acc = nn.evaluate(X_test, y_test, verbose=0)
    print("NN Test Acc:", acc)

    # Probabilities are kept in y_prob_nn for the ROC cell; 0.5 is the decision threshold.
    y_prob_nn = nn.predict(X_test).ravel()
    y_pred_nn = (y_prob_nn >= 0.5).astype(int)
    print(classification_report(y_test, y_pred_nn))
    sns.heatmap(confusion_matrix(y_test, y_pred_nn), annot=True, fmt='d'); plt.title("NN Confusion Matrix"); plt.show()

    # save model
    nn.save('nn_model.h5')
    print("Neural net saved to nn_model.h5")
else:
    print("Run preprocessing first.")


In [None]:
# ---------- CELL F: ROC & AUC comparison ----------
from sklearn.metrics import roc_curve, auc
if 'best_dt' not in globals() or 'nn' not in globals():
    print("Train both models first.")
else:
    # Decision tree: score each test row with the second predict_proba column
    y_prob_dt = best_dt.predict_proba(X_test)[:,1]
    fpr_dt, tpr_dt, _thr_dt = roc_curve(y_test, y_prob_dt)
    auc_dt = auc(fpr_dt, tpr_dt)

    # Neural network: reuse the probabilities computed in the NN cell
    fpr_nn, tpr_nn, _thr_nn = roc_curve(y_test, y_prob_nn)
    auc_nn = auc(fpr_nn, tpr_nn)

    plt.figure(figsize=(8,6))
    for curve_name, fpr, tpr, area in (('DT', fpr_dt, tpr_dt, auc_dt),
                                       ('NN', fpr_nn, tpr_nn, auc_nn)):
        plt.plot(fpr, tpr, label=f'{curve_name} (AUC={area:.3f})')
    # Dashed diagonal from (0,0) to (1,1) for reference
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend()
    plt.title('ROC Curves')
    plt.show()


In [None]:
# ---------- FINAL MODEL COMPARISON ----------
from sklearn.metrics import precision_score, recall_score, f1_score


def _metric_row(y_true, y_pred, auc_value):
    """Collect the five headline metrics for one model into a dict."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'AUC': auc_value,
    }


# Same metrics for both models, computed from the test-set predictions and
# the AUC values produced by the earlier cells.
dt_metrics = _metric_row(y_test, y_pred_dt, auc_dt)
nn_metrics = _metric_row(y_test, y_pred_nn, auc_nn)

# One column per model, one row per metric
comparison_df = pd.DataFrame({
    'Decision Tree': dt_metrics,
    'Neural Network': nn_metrics
})

divider = "=" * 60
print("\n" + divider)
print("MODEL PERFORMANCE COMPARISON")
print(divider)
display(comparison_df.round(4))

# The network wins only on a strictly higher F1; a tie goes to the tree.
if nn_metrics['F1-Score'] > dt_metrics['F1-Score']:
    best_model = 'Neural Network'
else:
    best_model = 'Decision Tree'
print(f"\n✓ Best performing model: {best_model}")
print(f"  (Based on F1-Score: balances precision and recall)")
print(divider + "\n")

# Write the table to disk
comparison_df.to_csv('model_comparison.csv')
print("✓ Comparison saved to model_comparison.csv")

In [None]:
# One churn-split count plot per categorical driver; tick labels are rotated
# for every plot except Contract.
for feature, rotate_ticks in (('Contract', False),
                              ('PaymentMethod', True),
                              ('InternetService', True)):
    sns.countplot(data=df, x=feature, hue='Churn')
    plt.title(f"{feature} vs Churn")
    if rotate_ticks:
        plt.xticks(rotation=45)
    plt.show()
