
# Task 4 — Telco Customer Churn: ML Pipeline (Notebook)
**Objective:** Predict customer churn using the provided Telco customer churn dataset.

**This notebook will:**  
- Load the CSV from the local file path listed below.  
- Perform EDA (missing values, target balance, basic visuals).  
- Clean and preprocess (types, `TotalCharges` conversion, missing handling).  
- Encode categorical features (LabelEncoder for binary, one-hot for multi-class).  
- Scale numeric features where appropriate.  
- Train baseline models (Logistic Regression, RandomForest).  
- Evaluate using accuracy, ROC-AUC, confusion matrix, classification report.  
- Show feature importances and save a best model with `joblib`.  
- Save predictions (CSV) ready for inspection.

> **File path used to load data:**  
`C:\Users\admin\OneDrive\Desktop\skillyt\telco\WA_Fn-UseC_-Telco-Customer-Churn.csv`


In [None]:

# Step 0 — Imports
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve, auc)
import joblib

# Notebook-wide display settings: allow up to 200 columns in DataFrame output
# and use seaborn's 'whitegrid' style for all figures.
pd.options.display.max_columns = 200
sns.set_style("whitegrid")


In [None]:

# Step 1 — Load data
# The CSV location can be overridden with the TELCO_CSV_PATH environment variable so the
# notebook is runnable on machines other than the author's. When the variable is unset,
# the original local path is used, so existing behavior is unchanged.
DEFAULT_CSV_PATH = "C:\\Users\\admin\\OneDrive\\Desktop\\skillyt\\telco\\WA_Fn-UseC_-Telco-Customer-Churn.csv"
csv_path = os.environ.get("TELCO_CSV_PATH", DEFAULT_CSV_PATH)
df = pd.read_csv(csv_path)

print('Shape:', df.shape)
df.head()


In [None]:

# Step 2 — Quick EDA: column names, target balance, and columns that contain nulls
print('Columns:', df.columns.tolist())

print('\nTarget distribution (Churn):')
churn_counts = df['Churn'].value_counts()
print(churn_counts)

ax = sns.countplot(x='Churn', data=df)
ax.set_title('Churn distribution')
plt.show()

print('\nMissing values:')
# Count nulls per column once, then show only the columns that actually have any
null_counts = df.isnull().sum()
display(null_counts[null_counts > 0])


In [None]:

# TotalCharges contains blank / whitespace-only strings in some rows, so it must be
# converted to a numeric column before it can be used as a feature.
def _is_blank_string(value):
    """Return True when `value` is a string that is empty or whitespace-only."""
    return isinstance(value, str) and value.strip() == ''


print('TotalCharges sample values that are non-numeric:')
mask = df['TotalCharges'].apply(_is_blank_string)
display(df[mask].head())

# errors='coerce' turns anything unparseable (including the blanks) into NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
missing_total = df['TotalCharges'].isna()
print('Number of NaNs in TotalCharges:', missing_total.sum())

# Approximate the missing totals as MonthlyCharges * tenure for the affected rows
df.loc[missing_total, 'TotalCharges'] = (
    df.loc[missing_total, 'MonthlyCharges'] * df.loc[missing_total, 'tenure']
)
print('After imputation, NaNs in TotalCharges:', df['TotalCharges'].isna().sum())


In [None]:

# Step 3 — Feature types and simple feature creation
# Make sure 'SeniorCitizen' is stored with an integer dtype
df['SeniorCitizen'] = df['SeniorCitizen'].astype(int)

# Derived features
# A tenure of 0 is replaced by 1 so the division below never divides by zero
tenure_divisor = df['tenure'].replace(0, 1)
df['AvgMonthlyCharge'] = df['TotalCharges'] / tenure_divisor
df['HasPhoneService'] = df['PhoneService'].eq('Yes').astype(int)
# Only 'Yes' maps to 1; both 'No' and 'No phone service' map to 0
df['MultipleLines_flag'] = df['MultipleLines'].eq('Yes').astype(int)

summary_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyCharge']
df[summary_cols].describe().T


In [None]:

# Step 4 — Encoding
# Encode the target as integers in a new 'Churn_flag' column (No=0, Yes=1)
le_target = LabelEncoder()
df['Churn_flag'] = le_target.fit_transform(df['Churn'])

# Object-typed columns are the categorical candidates; the ID column and the raw
# target are excluded because they are not model features.
excluded = {'customerID', 'Churn'}
cat_cols = [
    col for col in df.select_dtypes(include=['object']).columns.tolist()
    if col not in excluded
]
print('Categorical columns:', cat_cols)

# Split categoricals by cardinality: exactly two levels -> label encoding,
# more than two levels -> one-hot encoding.
level_counts = {col: df[col].nunique() for col in cat_cols}
bin_cols = [col for col in cat_cols if level_counts[col] == 2]
multi_cols = [col for col in cat_cols if level_counts[col] > 2]
print('Binary categorical cols:', bin_cols)
print('Multi-class categorical cols:', multi_cols)

# The same encoder object is re-fitted for every binary column, so after the loop
# it only remembers the mapping of the last column processed.
le = LabelEncoder()
for col in bin_cols:
    df[col] = le.fit_transform(df[col])

# One-hot encode the multi-level columns; drop_first removes one dummy per feature
# to avoid perfectly collinear columns.
df = pd.get_dummies(df, columns=multi_cols, drop_first=True)
print('Shape after encoding:', df.shape)


In [None]:

# Step 5 — Build the feature matrix / target vector and make a hold-out split
y = df['Churn_flag']
# The identifier and both forms of the target must not leak into the features;
# errors='ignore' tolerates any of them being absent already.
drop_cols = ['customerID', 'Churn', 'Churn_flag']
X = df.drop(columns=drop_cols, errors='ignore')

# 80/20 split, stratified on the target so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


In [None]:

# Step 6 — Standardize continuous features for the Logistic Regression model
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Only the continuous columns are scaled; encoded 0/1 columns are left untouched
candidate_cols = ('tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyCharge')
cont_cols = [name for name in candidate_cols if name in X.columns]

# Fit on the training split only so no test-set statistics leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train[cont_cols])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[cont_cols] = scaler.transform(X_train[cont_cols])
X_test_scaled[cont_cols] = scaler.transform(X_test[cont_cols])

print('Scaled continuous columns:', cont_cols)


In [None]:

# Step 7 — Baseline model: Logistic Regression (trained on the scaled features)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the positive class (label 1)
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

print('Accuracy:', accuracy_score(y_test, y_pred))
print('ROC-AUC:', roc_auc_score(y_test, y_proba))
print('\nClassification report:\n', classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Logistic Regression Confusion Matrix')
plt.show()


In [None]:

# Step 8 — Random Forest, trained on the unscaled features
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (label 1)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print('RF Accuracy:', accuracy_score(y_test, y_pred_rf))
print('RF ROC-AUC:', roc_auc_score(y_test, y_proba_rf))
print('\nRF Classification report:\n', classification_report(y_test, y_pred_rf))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_rf)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Random Forest Confusion Matrix')
plt.show()


In [None]:

# Step 9 — Feature importance (from Random Forest)
# Pair each importance score with its column name and rank from highest to lowest
importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
display(importances.head(30))

fig, ax = plt.subplots(figsize=(8, 10))
importances.head(20).plot(kind='barh', ax=ax)
# barh draws the first row at the bottom; invert so the top-ranked feature is on top
ax.invert_yaxis()
ax.set_title('Top 20 Feature Importances (RF)')
plt.show()


In [None]:

# Step 10 — ROC curve comparison
# ROC points and area under the curve for each model on the same test split
fpr1, tpr1, _ = roc_curve(y_test, y_proba)
fpr2, tpr2, _ = roc_curve(y_test, y_proba_rf)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr1, tpr1, label=f'LogReg (AUC = {roc_auc1:.3f})')
ax.plot(fpr2, tpr2, label=f'RandomForest (AUC = {roc_auc2:.3f})')
# Dashed diagonal = performance of a random classifier, for reference
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()


In [None]:

# Step 11 — Save best model (choose RF here) and create predictions file
joblib.dump(rf, 'telco_rf_model.joblib')
print('Saved Random Forest model to telco_rf_model.joblib')

# Predictions CSV with probabilities for inspection
preds_df = X_test.copy()
# The CSV is written with index=False and customerID was dropped from X, so without
# this column the exported rows could not be traced back to a customer. X_test keeps
# df's row index, so the lookup below aligns row-for-row.
if 'customerID' in df.columns:
    preds_df.insert(0, 'customerID', df.loc[X_test.index, 'customerID'])
preds_df['actual_churn'] = y_test.values
preds_df['pred_churn'] = y_pred_rf
preds_df['pred_proba'] = y_proba_rf
preds_df.to_csv('telco_churn_test_predictions.csv', index=False)
print('Saved telco_churn_test_predictions.csv (rows: {})'.format(preds_df.shape[0]))



---
## Next steps & interview prep
- Try hyperparameter tuning (GridSearchCV) for the Random Forest and Logistic Regression.  
- Handle class imbalance (if present) with class weights, resampling (SMOTE), or threshold tuning.  
- Explain why we scaled some features and not others.  
- Be ready to talk about feature engineering choices (e.g., TotalCharges imputation, AvgMonthlyCharge).

**Possible extensions:**
- Add GridSearchCV for the Random Forest.
- Create a simpler export with selected features for model deployment.
