<a href="https://colab.research.google.com/github/alixa2003/AI-ML-Internship-Tasks-Month2/blob/main/DHC_Internship_Part2_Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **End-to-End ML Pipeline with Scikit-learn Pipeline API**
**Objective**:
Build a reusable and production-ready machine learning pipeline for predicting customer churn.

**Dataset**:
Telco Churn Dataset

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the Telco customer-churn CSV from the Colab upload location.
data_path = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(data_path)

# Quick look at the data: first rows, dimensions, schema and target balance.
print(df.head())
print(df.shape)
# DataFrame.info() writes its report directly and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
df.info()
print(df['Churn'].value_counts())

   customerID  gender  SeniorCitizen  ... MonthlyCharges TotalCharges  Churn
0  7590-VHVEG  Female              0  ...          29.85        29.85     No
1  5575-GNVDE    Male              0  ...          56.95       1889.5     No
2  3668-QPYBK    Male              0  ...          53.85       108.15    Yes
3  7795-CFOCW    Male              0  ...          42.30      1840.75     No
4  9237-HQITU  Female              0  ...          70.70       151.65    Yes

[5 rows x 21 columns]
(7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService    

In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated(keep='first').sum()

np.int64(0)

In [None]:
# Drop the customer identifier if it is still present; errors='ignore' makes
# this a no-op when the cell is re-run after the column has been removed.
df = df.drop(columns=['customerID'], errors='ignore')

# Encode the target as 1 (Yes) / 0 (No). The dtype guard keeps the cell
# idempotent: mapping an already-encoded 0/1 column again would turn every
# value into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_encoded = df["Churn"].map({"Yes": 1, "No": 0})
    # A non-null label that failed to map means the data contains something
    # other than "Yes"/"No"; fail loudly instead of silently producing NaN.
    unmapped = df["Churn"].notna() & churn_encoded.isna()
    if unmapped.any():
        raise ValueError(
            "Unexpected 'Churn' labels: "
            f"{sorted(df.loc[unmapped, 'Churn'].astype(str).unique())}"
        )
    df["Churn"] = churn_encoded

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# because of errors='coerce'.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [None]:
# Separate the target column from the predictor columns.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Group predictor columns by dtype so each group can get its own preprocessing:
# object columns are treated as categorical, int64/float64 columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Show which columns ended up in each preprocessing group.
for label, cols in (
    ("Numerical Features: ", numerical_features),
    ("Categorical Features: ", categorical_features),
):
    print(label, list(cols))

Numerical Features:  ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical Features:  ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise.
# The imputer is needed because TotalCharges is converted with errors='coerce'
# earlier in the notebook, which can leave NaN values that scikit-learn
# estimators such as LogisticRegression do not accept.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode. handle_unknown='ignore' lets transform
# accept categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
from sklearn.impute import SimpleImputer

# Numeric branch: mean-impute missing values before standardising.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its matching transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a liblinear logistic regression.
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(solver='liblinear', max_iter=1000)),
])

# Train on the training split, then predict the held-out split.
y_pred_lr = log_reg_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("Logistic_regression Accuracy: ", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic_regression Accuracy:  0.8055358410220014
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [None]:
# Random-forest pipeline built on the same `preprocessor` used for the
# logistic-regression pipeline; the seed fixes the forest's randomness.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Train on the training split, then predict the held-out split.
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7792760823278921
              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1035
           1       0.61      0.48      0.53       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



In [None]:
# Same random-forest setup, but with class_weight='balanced' so the minority
# (churn) class is weighted more heavily during training.
balanced_forest = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
rf_balanced_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', balanced_forest),
])

# Train on the training split, then predict the held-out split.
y_pred_rf_bal = rf_balanced_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("Random Forest Accuracy (Class Balanced):", accuracy_score(y_test, y_pred_rf_bal))
print(classification_report(y_test, y_pred_rf_bal))

Random Forest Accuracy (Class Balanced): 0.7821149751596878
              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1035
           1       0.62      0.48      0.54       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.70      1409
weighted avg       0.77      0.78      0.77      1409



In [None]:
# Candidate hyper-parameters for the forest; the `model__` prefix addresses the
# 'model' step inside the pipeline.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
}

# 5-fold cross-validated search over the unweighted random-forest pipeline,
# ranked by F1 and run on all available cores.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best combination found and its mean cross-validated F1.
print("Best parameters:", grid_search.best_params_)
print("Best CV F1 score:", grid_search.best_score_)

Best parameters: {'model__max_depth': 10, 'model__n_estimators': 100}
Best CV F1 score: 0.5747513512648854


In [None]:
# Take the best pipeline found by the grid search and score it on the held-out
# test split.
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
final_confusion = confusion_matrix(y_test, y_pred_final)

print("Final Model Accuracy:", final_accuracy)
print("Confusion Matrix:\n", final_confusion)
print(classification_report(y_test, y_pred_final))

Final Model Accuracy: 0.8019872249822569
Confusion Matrix:
 [[933 102]
 [177 197]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0.79      0.80      0.79      1409

