1: Install necessary libraries

In [4]:
!pip install kagglehub pandas scikit-learn joblib



2: Import libraries

In [3]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import os

3: Load Dataset

In [5]:
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/telco-customer-churn


4: Load the dataset from the downloaded directory

In [6]:
csv_file = os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = pd.read_csv(csv_file)

5: Clean and preprocess raw data

In [7]:
df = df.replace(" ", np.nan)
df = df.dropna()
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"])
df = df.drop(columns=["customerID"])

6: Define features and target

In [8]:
X = df.drop("Churn", axis=1)
y = df["Churn"].map({"Yes": 1, "No": 0})  # Encode target variable



7: Train-test split

In [11]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


8: Set up preprocessing pipeline

In [12]:

numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = list(X.select_dtypes(include=["object"]).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


9: Build pipelines for both classifiers

In [13]:

logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


10: Define hyperparameter grids

In [14]:

param_grid_logreg = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__penalty": ["l2"],
    "classifier__solver": ["lbfgs"]
}

param_grid_rf = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 5]
}


11: Train models with GridSearchCV

In [15]:

grid_logreg = GridSearchCV(logreg_pipeline, param_grid_logreg, cv=5, scoring='accuracy', n_jobs=-1)
grid_logreg.fit(X_train, y_train)

grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)


In [16]:
print("Random Forest Best Parameters:")
print(grid_rf.best_estimator_)


Random Forest Best Parameters:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['tenure', 'MonthlyCharges',
                                                   'TotalCharges']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['gender', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultipleLines',
                                                   'InternetService',
                                                   'OnlineSecurity',
                                                   'OnlineBackup',
                                                   'DeviceProtection',
                                                   'TechSupport', 'S

12: Evaluate both models

In [17]:

print("Best Logistic Regression Model:")
print(grid_logreg.best_estimator_)
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, grid_logreg.predict(X_test)))

print("\nBest Random Forest Model:")
print(grid_rf.best_estimator_)
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, grid_rf.predict(X_test)))


Best Logistic Regression Model:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['tenure', 'MonthlyCharges',
                                                   'TotalCharges']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['gender', 'Partner',
                                                   'Dependents', 'PhoneService',
                                                   'MultipleLines',
                                                   'InternetService',
                                                   'OnlineSecurity',
                                                   'OnlineBackup',
                                                   'DeviceProtection',
                                                   'TechSupport', '

13: Save the best model pipeline

In [18]:

best_pipeline = grid_rf if grid_rf.best_score_ > grid_logreg.best_score_ else grid_logreg
joblib.dump(best_pipeline.best_estimator_, "telco_churn_pipeline.pkl")

print("✅ Best model pipeline saved as 'telco_churn_pipeline.pkl'")

✅ Best model pipeline saved as 'telco_churn_pipeline.pkl'


14: Downloading the pipeline

In [19]:
from google.colab import files

files.download('telco_churn_pipeline.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>