**Install Required Libraries:**

In [8]:
!pip install pandas scikit-learn joblib




**Load the Dataset**

In [28]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Drop customerID (not useful)
df.drop("customerID", axis=1, inplace=True)

# Replace spaces with NaNs and drop missing
df.replace(" ", pd.NA, inplace=True)
df.dropna(inplace=True)

# Convert target column to binary
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# No of rows and columns after performing EDA
df.shape


(7032, 20)

**Split Features and Target:**

In [11]:
x = df.drop("Churn", axis=1)
y = df["Churn"]

**Create Preprocessing Pipelines:**

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Separate categorical and numerical columns
categorical_cols = x.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = x.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Preprocessing for numerical data
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Preprocessing for categorical data
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine pipelines
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numerical_cols),
    ("cat", cat_pipeline, categorical_cols)
])


**Define and Train Models Using Pipeline:**

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Logistic Regression pipeline
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


**Hyperparameter Tuning with GridSearchCV:**

In [15]:
param_grid_logreg = {
    "classifier__C": [0.01, 0.1, 1, 10]
}

param_grid_rf = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None]
}

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Grid search for Logistic Regression
grid_logreg = GridSearchCV(logreg_pipeline, param_grid_logreg, cv=5, scoring='accuracy')
grid_logreg.fit(x_train, y_train)

# Grid search for Random Forest
grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(x_train, y_train)


**Evaluate Best Models:**

In [17]:
from sklearn.metrics import classification_report

print("Logistic Regression Best Parameters:", grid_logreg.best_params_)
print("Random Forest Best Parameters:", grid_rf.best_params_)

print("\nLogistic Regression Report:")
print(classification_report(y_test, grid_logreg.predict(x_test)))

print("\nRandom Forest Report:")
print(classification_report(y_test, grid_rf.predict(x_test)))


Logistic Regression Best Parameters: {'classifier__C': 1}
Random Forest Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1033
           1       0.64      0.53      0.58       374

    accuracy                           0.80      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407


Random Forest Report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1033
           1       0.62      0.45      0.53       374

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.69      1407
weighted avg       0.77      0.78      0.77      1407



**Save Final Model Using Joblib:**

In [18]:
import joblib

# Save the best Random Forest model
joblib.dump(grid_rf.best_estimator_, "telco_churn_pipeline_rf.joblib")


['telco_churn_pipeline_rf.joblib']

**Load and Use Saved Pipeline for Prediction:**

In [26]:
# Load the saved model
model = joblib.load("telco_churn_pipeline_rf.joblib")

# Predict on new data
sample = x_test.iloc[0:10]
prediction = model.predict(sample)
print("Prediction:", prediction)


Prediction: [0 0 0 0 0 0 0 1 0 0]
