Necessary Libraries

In [1]:
import warnings

import mlflow
import mlflow.sklearn
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from imblearn.over_sampling import SMOTE
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")


In [2]:
# Point the MLflow client at the tracking server on the loopback address and
# select the experiment that the runs in this notebook are recorded under.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "TelcoChurn_Experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='mlflow-artifacts:/162929546382421294', creation_time=1746097237042, experiment_id='162929546382421294', last_update_time=1746097237042, lifecycle_stage='active', name='TelcoChurn_Experiment', tags={}>

In [3]:
# Read the dataset CSV (path is relative to the notebook's working directory).
DATA_PATH = "../data/clean_telco_2.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,NEW_TENURE_YEAR_1-2 Year,NEW_TENURE_YEAR_2-3 Year,NEW_TENURE_YEAR_3-4 Year,NEW_TENURE_YEAR_4-5 Year,NEW_TENURE_YEAR_5-6 Year
0,7590-VHVEG,0,0,1,0,-1.277445,0,1,-1.160323,-0.992611,...,0,0,0,1,0,0,0,0,0,0
1,5575-GNVDE,1,0,0,0,0.066327,1,0,-0.259629,-0.172165,...,1,0,0,0,1,0,1,0,0,0
2,3668-QPYBK,1,0,0,0,-1.236724,1,1,-0.36266,-0.958066,...,0,0,0,0,1,0,0,0,0,0
3,7795-CFOCW,1,0,0,0,0.514251,0,0,-0.746535,-0.193672,...,1,0,0,0,0,0,0,1,0,0
4,9237-HQITU,0,0,0,0,-1.236724,1,1,0.197365,-0.938874,...,0,0,0,1,0,0,0,0,0,0


In [5]:
# Separate the target from the features. Besides the customer identifier, any
# "Unnamed: N" column is dropped as well: pandas gives that name to a CSV column
# without a header, which here is the saved row index (visible in the
# df.head() preview). It carries no customer information, and SMOTE would
# otherwise interpolate it like a real numeric feature.
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
X = df.drop(columns=["Churn", "customerID", *index_artifact_cols])
y = df["Churn"]

# Stratify on the target so the train and test splits keep the same churn rate
# as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample on the training split only; the test split is left untouched so
# that evaluation is done on the original class balance.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

Hyperopt Search Space

In [6]:
# One sub-space per candidate model. Each dict carries a fixed "model_type" tag
# that the objective function uses to decide which estimator to build.

# Logistic regression: C is sampled log-uniformly, i.e. exp(uniform(-4, 2)).
logistic_space = {
    "model_type": "logistic",
    "C": hp.loguniform("log_C", -4, 2),
    "max_iter": 1000,
}

# Random forest: small discrete grid over size, depth and split threshold.
random_forest_space = {
    "model_type": "random_forest",
    "n_estimators": hp.choice("rf_n_estimators", [100, 200]),
    "max_depth": hp.choice("rf_max_depth", [4, 6, 8]),
    "min_samples_split": hp.choice("min_samples_split", [2, 5]),
}

# XGBoost: small discrete grid over size, depth, learning rate and row subsampling.
xgboost_space = {
    "model_type": "xgboost",
    "n_estimators": hp.choice("xgb_n_estimators", [100, 200]),
    "max_depth": hp.choice("xgb_max_depth", [3, 6]),
    "learning_rate": hp.choice("learning_rate", [0.05, 0.1]),
    "subsample": hp.choice("subsample", [0.8, 1.0]),
}

# The position of each sub-space in this list is the index hyperopt records
# under the "classifier_type" label, so the order must stay stable.
space = hp.choice(
    "classifier_type",
    [logistic_space, random_forest_space, xgboost_space],
)


Objective Function

In [7]:
def objective(params):
    """Train and score one hyperopt candidate inside a nested MLflow run.

    The candidate is fitted on the SMOTE-resampled training data
    (``X_resampled`` / ``y_resampled``), the same data the final model is
    fitted on, so that the hyperparameters are tuned under the training
    regime that is actually deployed. Scoring uses the untouched
    ``X_test`` / ``y_test`` split.

    Parameters
    ----------
    params : dict
        One sample from the search space. Must contain ``"model_type"``
        (``"logistic"``, ``"random_forest"`` or ``"xgboost"``) plus the
        hyperparameters of that model.

    Returns
    -------
    dict
        ``{"loss": -roc_auc, "status": STATUS_OK}``. The AUC is negated
        because ``fmin`` minimises the loss.

    Raises
    ------
    ValueError
        If ``params["model_type"]`` is not one of the supported models.
    """
    model_type = params["model_type"]

    if model_type == "logistic":
        model = LogisticRegression(C=params["C"], max_iter=params["max_iter"])
    elif model_type == "random_forest":
        model = RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            min_samples_split=params["min_samples_split"],
            random_state=42
        )
    elif model_type == "xgboost":
        model = XGBClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            learning_rate=params["learning_rate"],
            subsample=params["subsample"],
            use_label_encoder=False,
            eval_metric='logloss'
        )
    else:
        # Fail with a clear message instead of a NameError on `model` below.
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    with mlflow.start_run(nested=True, run_name=f"{model_type}_trial"):
        # Fit on the resampled training data to match the final model's fit.
        model.fit(X_resampled, y_resampled)
        preds = model.predict(X_test)
        proba = model.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, proba)
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)

        # `params` already contains "model_type", so one call logs everything.
        mlflow.log_params(params)
        mlflow.log_metric("roc_auc", auc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)

        return {"loss": -auc, "status": STATUS_OK}

Hyperparameter Tuning Run

In [8]:
# Run hyperparameter tuning, then refit and register the winning configuration.
trials = Trials()
with mlflow.start_run(run_name="Hyperparameter Tuning") as outer_run:
    best_result = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials
    )

    best_trial = trials.best_trial
    # The objective returns -AUC as the loss, so flip the sign back.
    best_auc = -best_trial["result"]["loss"]

    # fmin reports hp.choice picks as list indices. space_eval maps them back
    # to the actual values using the search space itself, so nothing here has
    # to mirror the choice lists by hand.
    best_params = space_eval(space, best_result)
    model_type_name = best_params["model_type"]

    mlflow.log_param("best_model_type", model_type_name)
    # Score of the winning search trial (not of the refit model below).
    mlflow.log_metric("best_auc", best_auc)

    # Rebuild the winning estimator from the decoded hyperparameters.
    if model_type_name == "logistic":
        best_model = LogisticRegression(
            C=best_params["C"],
            max_iter=best_params["max_iter"]
        )
    elif model_type_name == "random_forest":
        best_model = RandomForestClassifier(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            min_samples_split=best_params["min_samples_split"],
            random_state=42
        )
    elif model_type_name == "xgboost":
        best_model = XGBClassifier(
            n_estimators=best_params["n_estimators"],
            max_depth=best_params["max_depth"],
            learning_rate=best_params["learning_rate"],
            subsample=best_params["subsample"],
            use_label_encoder=False,
            eval_metric='logloss'
        )
    else:
        raise ValueError(f"Unsupported model_type: {model_type_name!r}")

    # Refit on the SMOTE-resampled training data and evaluate on the test split.
    best_model.fit(X_resampled, y_resampled)
    preds = best_model.predict(X_test)
    proba = best_model.predict_proba(X_test)[:, 1]
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    # AUC of the model that is actually registered, logged separately from the
    # search-time "best_auc" so the two are never confused.
    mlflow.log_metric("final_roc_auc", roc_auc_score(y_test, proba))

    # A one-row example gives MLflow the input/output schema of the model.
    input_example = X_test.iloc[:1]
    pred_example = best_model.predict(input_example)
    signature = infer_signature(input_example, pred_example)

    # Log the best model to the MLflow Model Registry with its signature and input example
    mlflow.sklearn.log_model(
        best_model,
        artifact_path="best_model",
        registered_model_name="best_telco_churn_model",
        input_example=input_example,
        signature=signature
    )

🏃 View run xgboost_trial at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/abf99d769cdb457c8ce40218261673fc

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/162929546382421294

🏃 View run logistic_trial at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/f3300b78b7a94f89a29cf77fb62605e7

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/162929546382421294    

🏃 View run random_forest_trial at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/4350da8aecd0463dafc8ec07eeb36a2d

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/162929546382421294    

🏃 View run logistic_trial at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/8dfd1b53093346c5aaae92b941ca029e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/162929546382421294    

🏃 View run logistic_trial at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/5cdb393ba3264eb5b3ae0ebe67cc679f

🧪 View experiment at: http://127.0.0.1:5000/#/experi

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 437.77it/s] 
Registered model 'best_telco_churn_model' already exists. Creating a new version of this model...
2025/05/05 22:52:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_telco_churn_model, version 15


🏃 View run Hyperparameter Tuning at: http://127.0.0.1:5000/#/experiments/162929546382421294/runs/0d3b762f3abc4799820390f4f139e891
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/162929546382421294


Created version '15' of model 'best_telco_churn_model'.
