In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import optuna

In [3]:
df = pd.read_csv('../data/telco_cleaned1.csv')
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
X = df.drop(columns=['Churn', 'customerID'])
y = df['Churn']

In [4]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [5]:
preprocessor = ColumnTransformer([
('num', StandardScaler(), numerical_cols),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'random_state': 42
    }

    clf = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(**params))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        clf.fit(X_train, y_train)
        preds = clf.predict(X_val)
        scores.append(f1_score(y_val, preds))
    return np.mean(scores)


In [7]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

[I 2025-06-03 13:02:45,932] A new study created in memory with name: no-name-f3e8e3ba-05b6-4abb-930e-f98e2a1e775a
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-06-03 13:02:48,728] Trial 0 finished with value: 0.5881117496786947 and parameters: {'n_estimators': 321, 'max_depth': 3, 'learning_rate': 0.023467013776153914, 'subsample': 0.9487614293096867, 'colsample_bytree': 0.6028846725340212, 'gamma': 1.7905632786050325, 'reg_alpha': 2.417198553373737, 'reg_lambda': 2.777631924746751}. Best is trial 0 with value: 0.5881117496786947.
Parameters: { "use_label_e

In [8]:
best_params = study.best_trial.params
best_params['use_label_encoder'] = False
best_params['eval_metric'] = 'logloss'
best_params['random_state'] = 42

final_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier',XGBClassifier(**best_params))
])
final_clf.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
y_pred = final_clf.predict(X)
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print("\nClassification Rpeort\n", classification_report(y, y_pred))

Confusion Matrix:
 [[4707  467]
 [ 808 1061]]

Classification Rpeort
               precision    recall  f1-score   support

           0       0.85      0.91      0.88      5174
           1       0.69      0.57      0.62      1869

    accuracy                           0.82      7043
   macro avg       0.77      0.74      0.75      7043
weighted avg       0.81      0.82      0.81      7043



In [10]:
from sklearn.pipeline import Pipeline

best_params.update({
    "n_estimators": 300,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 42
})

final_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(**best_params))
])

final_model.fit(X, y)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = final_model.predict(X)
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print("\nClassification Report:\n", classification_report(y, y_pred))

Confusion Matrix:
 [[4702  472]
 [ 808 1061]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88      5174
           1       0.69      0.57      0.62      1869

    accuracy                           0.82      7043
   macro avg       0.77      0.74      0.75      7043
weighted avg       0.81      0.82      0.81      7043



In [12]:
import joblib

# Save the pipeline
joblib.dump(final_model, "churn_xgb_pipeline.pkl")
print("✅ Model saved as churn_xgb_pipeline.pkl")

✅ Model saved as churn_xgb_pipeline.pkl
