# Telco Customer Churn — Complete Notebook (Hints applied)

This notebook follows the Task 04 hints and walks through a full EDA, preprocessing, and modeling workflow. To run it end to end, make sure the raw CSV `WA_Fn-UseC_-Telco-Customer-Churn.csv` is in the same folder as the notebook, or use the provided cleaned/model-ready CSVs.

In [None]:
# Imports
import os

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve, auc
import joblib
%matplotlib inline
pd.set_option('display.max_columns', 200)


In [None]:
# Load the data: prefer the cleaned CSV (written by the preprocessing cell) and
# fall back to the raw export when it does not exist yet.
# `os` comes from the imports cell at the top of the notebook.
CLEANED_CSV = 'telco_churn_cleaned.csv'
RAW_CSV = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(CLEANED_CSV if os.path.exists(CLEANED_CSV) else RAW_CSV)

# Standardize column names: trim, and turn spaces/hyphens into underscores.
df.columns = [c.strip().replace(' ', '_').replace('-', '_') for c in df.columns]

# TotalCharges: coerce to numeric (blank or non-numeric text becomes NaN, so no
# separate blank-replacement step is needed), then fill the gaps with the median.
if 'TotalCharges' in df.columns:
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = total_charges.fillna(total_charges.median())

# Create churn flag (1 = 'Yes', 0 = 'No').
if 'Churn' in df.columns:
    churn_flag = df['Churn'].map({'Yes': 1, 'No': 0})
    # Series.map leaves NaN for labels missing from the dict; fail loudly here
    # rather than carrying a silently-NaN target into the modeling cells.
    unmapped = df.loc[churn_flag.isna() & df['Churn'].notna(), 'Churn'].unique()
    if len(unmapped):
        raise ValueError(
            f"Unexpected 'Churn' labels (expected 'Yes'/'No'): {unmapped.tolist()}"
        )
    df['Churn_flag'] = churn_flag

print('Shape:', df.shape)
df.head()


In [None]:
# Basic EDA
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\nChurn distribution:')
print(df['Churn_flag'].value_counts(normalize=True))

# Numeric summary (transposed: one row per numeric column)
display(df.describe().T)

# Check missing: the 20 columns with the most null values
print('\nMissing values per column:')
print(df.isnull().sum().sort_values(ascending=False).head(20))


In [None]:
# Preprocessing
# 1) Trim whitespace in text columns first, so the label replacement below also
#    matches padded values. Only real strings are touched: missing values stay
#    missing (astype(str) would turn NaN into the literal text 'nan').
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# 2) Collapse the "no service" labels into a plain 'No'.
replace_no_service = ['MultipleLines','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
no_service_map = {'No internet service':'No','No phone service':'No'}
for c in replace_no_service:
    if c in df.columns:
        df[c] = df[c].replace(no_service_map)

# Save cleaned
df.to_csv('telco_churn_cleaned.csv', index=False)
print('Saved telco_churn_cleaned.csv')

# Binary-like columns: text columns with exactly two distinct non-null values.
binary_candidates = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if df[col].dropna().nunique() == 2
]

binary_candidates


In [None]:
# Prepare model dataframe
# Encoding is done on a copy so `df` keeps its cleaned, human-readable values and
# this cell can be re-run without changing its own input.
drop_cols = [c for c in ['customerID','CustomerID','CustomerId','Churn'] if c in df.columns]
df_model = df.drop(columns=drop_cols, errors='ignore').copy()

# Apply mapping for common binaries and gender
binary_map = {'Yes':1,'No':0,'Male':1,'Female':0}
for col in binary_candidates:
    if col not in df_model.columns:
        continue  # dropped above (e.g. 'Churn'; 'Churn_flag' carries the target)
    observed = set(df_model[col].dropna().unique())
    # Map only when every observed label is known. A partial mapping would leave
    # a mixed int/str column; such columns stay text and are one-hot encoded below.
    if observed.issubset(binary_map):
        df_model[col] = df_model[col].map(binary_map)

# One-hot encode remaining object columns
obj_cols = df_model.select_dtypes(include=['object']).columns.tolist()
print('One-hot encoding columns:', obj_cols)
df_model = pd.get_dummies(df_model, columns=obj_cols, drop_first=True)

# Ensure target at end: pop the column and re-append it as the last one.
if 'Churn_flag' in df_model.columns:
    df_model['Churn_flag'] = df_model.pop('Churn_flag')

# Save model-ready
df_model.to_csv('telco_churn_model_ready.csv', index=False)
print('Saved telco_churn_model_ready.csv; shape=', df_model.shape)


In [None]:
# Train/test split followed by feature scaling
m = pd.read_csv('telco_churn_model_ready.csv')
y = m['Churn_flag']
X = m.drop(columns=['Churn_flag'])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the int64/float64 columns only. The scaler learns its statistics
# from the training rows, and the same fitted transform is reused on the test rows.
numeric_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print(f'Train shape: {X_train.shape} Test shape: {X_test.shape}')


In [None]:
# Baseline models: logistic regression and random forest, scored on the test split.
# Class-1 probabilities and AUCs are computed once per model and reused for both
# the printed metrics and the ROC plot.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_probs = lr.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)
print('Logistic Regression AUC:', lr_auc)
print(classification_report(y_test, lr.predict(X_test)))

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_probs = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)
print('Random Forest AUC:', rf_auc)
print(classification_report(y_test, rf.predict(X_test)))

# ROC curve
# Only FPR/TPR are needed; the thresholds are sliced off rather than bound to `_`,
# which IPython uses for the previous cell output.
fpr_lr, tpr_lr = roc_curve(y_test, lr_probs)[:2]
fpr_rf, tpr_rf = roc_curve(y_test, rf_probs)[:2]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr_lr, tpr_lr, label=f'LR (AUC={lr_auc:.3f})')
ax.plot(fpr_rf, tpr_rf, label=f'RF (AUC={rf_auc:.3f})')
ax.plot([0, 1], [0, 1], '--', color='grey')  # chance-level diagonal
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend()
ax.set_title('ROC Curves')
plt.show()

# Feature importances: top 30, labelled with the columns the forest was fit on.
importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(30)
)
ax = importances.plot(kind='bar', figsize=(10, 4))
ax.set_title('Top feature importances (RF)')
plt.show()


In [None]:
# Save model and scaler
# The bundle also records the training column order: the forest was fit on exactly
# these columns, so new data has to be arranged the same way before it is scaled
# (on `numeric_cols`) and scored.
artifact = {
    'model': rf,
    'scaler': scaler,
    'numeric_cols': numeric_cols,
    'feature_columns': X_train.columns.tolist(),
}
joblib.dump(artifact, 'telco_churn_rf_model.joblib')
print('Saved telco_churn_rf_model.joblib')
