# Task 2: End-to-End ML Pipeline for Customer Churn
Build a reusable ML pipeline using the scikit-learn `Pipeline` API.


In [1]:
# Cell 1: install required packages (run only if needed)
# Run this cell once in the notebook. Remove `-q` if you want verbose install output.
!pip install -q scikit-learn pandas joblib


In [2]:
# Cell 2: imports and load dataset (with safe fallback)
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Location of the Telco customer-churn CSV (IBM sample repository).
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

try:
    df = pd.read_csv(url)
    print("Loaded Telco dataset from URL. Shape:", df.shape)
except Exception as err:
    # Best-effort fallback: any failure while fetching/parsing the CSV is reported
    # and replaced by a tiny hand-written frame so the remaining cells still run.
    print("Could not download dataset (reason: {}). Creating a small synthetic fallback dataset.".format(err))
    fallback_columns = dict(
        customerID=["0001", "0002", "0003", "0004", "0005", "0006", "0007", "0008"],
        gender=["Male", "Female", "Female", "Male", "Female", "Male", "Male", "Female"],
        SeniorCitizen=[0, 0, 0, 1, 0, 1, 0, 0],
        Partner=["Yes", "No", "No", "Yes", "No", "Yes", "No", "Yes"],
        Dependents=["No", "No", "Yes", "No", "No", "No", "No", "Yes"],
        tenure=[1, 34, 5, 2, 45, 23, 8, 58],
        PhoneService=["Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes"],
        InternetService=["DSL", "Fiber optic", "DSL", "DSL", "Fiber optic", "DSL", "None", "DSL"],
        MonthlyCharges=[29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.75],
        TotalCharges=[29.85, 1889.5, 108.15, 84.8, 3194.2, 2300.0, 500.0, 173.7],
        Churn=["No", "No", "Yes", "No", "No", "Yes", "Yes", "No"],
    )
    df = pd.DataFrame(fallback_columns)
    print("Synthetic dataset shape:", df.shape)

# Show the first rows as the cell output.
df.head()


Loaded Telco dataset from URL. Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Cell 3: Cleaning and preparing feature matrix X and target y

# Drop the customerID identifier column if present (it is an ID, not a feature).
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

# Convert TotalCharges to numeric (it may be object due to blanks); blanks become NaN.
# The NaNs are deliberately NOT filled here: the numeric branch of the modelling
# pipeline median-imputes using statistics learned from the training data only.
# Filling with a median computed on the full dataset before the train/test split
# would leak information from the held-out rows into training.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Map target
if 'Churn' not in df.columns:
    raise ValueError("Dataset does not contain 'Churn' column.")
y = df['Churn'].map({'Yes': 1, 'No': 0})

# .map() turns every label other than 'Yes'/'No' into NaN; fail loudly instead of
# handing NaN targets to the estimator later on.
unmapped = y.isnull()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'Churn'].astype(str).unique())
    raise ValueError("Unexpected values in 'Churn' column: {}".format(unexpected))

X = df.drop(columns=['Churn'])

print("Features shape:", X.shape, "Target distribution:\n", y.value_counts(normalize=True))
X.dtypes


Features shape: (7043, 19) Target distribution:
 Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object

In [4]:
# Cell 4: recompute numeric and categorical columns safely
# Select by dtype family instead of exact dtype names. Listing only
# 'int64'/'float64' and 'object'/'category' misses columns of other dtypes
# (e.g. int32, float32, bool), and the ColumnTransformer is built with
# remainder='drop', so any column in neither list would be silently discarded.
num_cols = X.select_dtypes(include='number').columns.tolist()
# Every non-numeric column is treated as categorical, so the two lists
# together cover all columns of X.
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [5]:
# Cell 5: construct pipeline safely

# Numeric branch: fill gaps with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets the encoder accept categories not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='drop',
)

# Full pipeline. The 'clf' step is a placeholder that the grid search swaps out.
pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
])


In [6]:
# Cell 6: train/test split (we keep a holdout test set for final evaluation)
# 80/20 split, stratified on y so both parts keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

print("Train set:", X_train.shape, "Test set:", X_test.shape)


Train set: (5634, 19) Test set: (1409, 19)


In [7]:
# Cell 7: Grid search over two estimator families (small grid to keep run-time modest)

# Sub-grid 1: logistic regression -- 3 values of C x 2 class-weight settings.
logreg_grid = {
    'clf': [LogisticRegression(max_iter=1000, solver='liblinear')],
    'clf__C': [0.01, 0.1, 1.0],
    'clf__class_weight': [None, 'balanced'],
}

# Sub-grid 2: random forest -- 2 forest sizes x 2 depth limits x 2 class-weight settings.
forest_grid = {
    'clf': [RandomForestClassifier(random_state=42)],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10],
    'clf__class_weight': [None, 'balanced'],
}

# Each dict is searched independently; the 'clf' key replaces the pipeline's final step.
param_grid = [logreg_grid, forest_grid]

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',      # use F1 since churn can be imbalanced
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting GridSearchCV... (this can take a few minutes)")
grid.fit(X_train, y_train)
print("Grid search done.")
print("Best params:", grid.best_params_)
print("Best CV F1 score:", grid.best_score_)


Starting GridSearchCV... (this can take a few minutes)
Fitting 3 folds for each of 14 candidates, totalling 42 fits
Grid search done.
Best params: {'clf': LogisticRegression(max_iter=1000, solver='liblinear'), 'clf__C': 0.1, 'clf__class_weight': 'balanced'}
Best CV F1 score: 0.6333164633324696


In [8]:
# Cell 8: Evaluate best estimator on test set
# best_estimator_ is the winning pipeline from the grid search above.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Compute every metric first, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Test Accuracy:", test_accuracy)
print("Test F1:", test_f1)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_mat)


Test Accuracy: 0.7416607523066004
Test F1: 0.6176470588235294

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.73      0.80      1035
           1       0.51      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.76      1409


Confusion Matrix:
 [[751 284]
 [ 80 294]]


In [9]:
# Cell 9: save the full pipeline for production
# The whole fitted pipeline (preprocessing + classifier) is serialised, so the
# reloaded object accepts raw feature rows directly.
outfile = "churn_pipeline.joblib"
joblib.dump(best_model, outfile)
print(f"Saved pipeline to {outfile}")

# Round-trip check: reload from disk and predict on a single sample.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded = joblib.load(outfile)

# Double brackets keep the first test row as a one-row DataFrame (not a Series).
example = X_test.iloc[[0]]
pred = loaded.predict(example)

print("Example input (first test row):")
display(example)
print("Predicted churn (0=no,1=yes):", pred[0])


Saved pipeline to churn_pipeline.joblib
Example input (first test row):


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
437,Male,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),114.05,8468.2


Predicted churn (0=no,1=yes): 0
