..**Step	Description**
1	Load Telco CSV
2	Clean columns and target
3	Split into features and target
4	Build ColumnTransformer
5	Add model into Pipeline
6	Tune with GridSearchCV
7	Evaluate and save with joblib

In [21]:
#Task 2: End-to-End ML Pipeline with Scikit-learn Pipeline API
# Install required libraries (if using Colab)
!pip install pandas scikit-learn joblib

# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib




In [28]:
# Load data
df = pd.read_excel("/content/Telco_customer_churn.xlsx")

# Preview data
df.head()


Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [36]:
#Step 3: Basic Cleaning and Target Prep
# Drop customerID (non-informative)
#df.drop('CustomerID', axis=1, inplace=True)

# Convert TotalCharges to numeric
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

# Fill missing values in TotalCharges
df['Total Charges'].fillna(df['Total Charges'].median(), inplace=True)

# Encode target variable
df['Churn Label'] = df['Churn Label'].map({'Yes': 1, 'No': 0})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Total Charges'].fillna(df['Total Charges'].median(), inplace=True)


In [38]:
#Step 4: Split Features and Target
X = df.drop("Churn Label", axis=1)
y = df["Churn Label"]

In [39]:
#Step 5: Identify Numeric and Categorical Columns
# Separate column types
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()


In [40]:
#Step 6: Build Preprocessing Pipeline
# Preprocessing for numerical data
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Preprocessing for categorical data
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])


In [41]:
#Step 7: Create Full Pipeline with Classifier
#Let’s start with Logistic Regression:
# Create full pipeline
logreg_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])


In [42]:
#Step 8: Split into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [43]:
#Step 9: Hyperparameter Tuning with GridSearchCV
#python
#Copy
#Edit
# Grid for Logistic Regression
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10]
}

# Wrap GridSearchCV
grid = GridSearchCV(logreg_pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Best parameters and score
print("Best Params:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)



Best Params: {'classifier__C': 0.01}
Best CV Accuracy: 1.0


In [44]:
#Step 10: Evaluate on Test Set
y_pred = grid.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409

Confusion Matrix:
[[1035    0]
 [   0  374]]


In [45]:
#Step 11: Export Model using joblib
# Save entire pipeline
joblib.dump(grid.best_estimator_, "churn_pipeline.pkl")

# Load later like this:
# model = joblib.load("churn_pipeline.pkl")



['churn_pipeline.pkl']

In [46]:
#step 12..[BONUS] Try Random Forest Model Instead
rf_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Grid search for RF
rf_grid = GridSearchCV(rf_pipeline, {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None]
}, cv=5, scoring="accuracy")

rf_grid.fit(X_train, y_train)

# Evaluate
y_rf_pred = rf_grid.predict(X_test)
print("RF Accuracy:", rf_grid.best_score_)
print(classification_report(y_test, y_rf_pred))


RF Accuracy: 0.9971596577467963
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1035
           1       1.00      0.98      0.99       374

    accuracy                           1.00      1409
   macro avg       1.00      0.99      0.99      1409
weighted avg       1.00      1.00      1.00      1409

