In [1]:
import pandas as pd
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost

from sklearn.metrics import roc_auc_score

from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope

import warnings
warnings.filterwarnings("ignore")

In [2]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("churn-experiment")

%load_ext lab_black

In [3]:
path = "../data/preprocessed.parquet"

In [4]:
df = pd.read_parquet(path)

In [5]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Experiment: Logistic Regression

In [6]:
mlflow.sklearn.autolog()

search_space = {
    "penalty": hp.choice("penalty", [None, "l2"]),
    "C": scope.int(hp.quniform("C", 1, 10, 1)),
    "max_iter": scope.int(hp.quniform("max_iter", 100, 1000, 1)),
}


def objective(params):
    with mlflow.start_run():

        mlflow.log_param("data-path", path)

        lr = LogisticRegression(**params)

        lr.fit(X_train, y_train)

        y_pred = lr.predict(X_test)
        auc_score = roc_auc_score(y_test, y_pred)

        mlflow.log_metric("roc-auc-score", auc_score)

    return {"loss": -1 * auc_score, "status": STATUS_OK}


best_result = fmin(
    fn=objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=Trials()
)

100%|██████████| 10/10 [00:16<00:00,  1.67s/trial, best loss: -0.7482519382653431]


### Experiment: Random Forest

In [7]:
mlflow.sklearn.autolog()

search_space = {
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1000, 1)),
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
}


def objective(params):
    with mlflow.start_run():

        mlflow.log_param("data-path", path)

        rf = RandomForestClassifier(**params)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)
        auc_score = roc_auc_score(y_test, y_pred)

        mlflow.log_metric("roc-auc-score", auc_score)

    return {"loss": -1 * auc_score, "status": STATUS_OK}


best_result = fmin(
    fn=objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=Trials()
)

100%|██████████| 10/10 [00:46<00:00,  4.63s/trial, best loss: -0.7169925574751312]


### Experiment: SVM

In [8]:
mlflow.sklearn.autolog()

search_space = {
    "C": scope.int(hp.quniform("C", 1, 10, 1)),
}


def objective(params):
    with mlflow.start_run():

        mlflow.log_param("data-path", path)

        svc = SVC(**params)
        svc.fit(X_train, y_train)

        y_pred = svc.predict(X_test)
        auc_score = roc_auc_score(y_test, y_pred)

        mlflow.log_metric("roc-auc-score", auc_score)

    return {"loss": -1 * auc_score, "status": STATUS_OK}


best_result = fmin(
    fn=objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=Trials()
)

100%|██████████| 10/10 [00:56<00:00,  5.66s/trial, best loss: -0.6168885277464365]


### Experiment: XGBoost

In [9]:
mlflow.xgboost.autolog()

search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3),
    "objective": "binary:logistic",
    "seed": 44,
}


def objective(params):
    with mlflow.start_run():
        mlflow.log_param("data-path", path)
        mlflow.log_params(params)

        train_data = xgboost.DMatrix(data=X_train, label=y_train)
        val_data = xgboost.DMatrix(data=X_test, label=y_test)

        bst = xgboost.train(
            params=params,
            dtrain=train_data,
            num_boost_round=500,
            evals=[(val_data, "validation")],
            early_stopping_rounds=50,
        )

        y_pred = bst.predict(val_data)
        auc_score = roc_auc_score(y_test, y_pred)
        mlflow.log_metric("roc-auc-score", auc_score)

    return {"loss": -1 * auc_score, "status": STATUS_OK}


best_result = fmin(
    fn=objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=Trials()
)



[0]	validation-logloss:0.55050                        
[1]	validation-logloss:0.48557                        
[2]	validation-logloss:0.45489                        
[3]	validation-logloss:0.44307                        
[4]	validation-logloss:0.44118                        
[5]	validation-logloss:0.44088                        
[6]	validation-logloss:0.44197                        
[7]	validation-logloss:0.44571                        
[8]	validation-logloss:0.44908                        
[9]	validation-logloss:0.45017                        
[10]	validation-logloss:0.45223                       
[11]	validation-logloss:0.45639                       
[12]	validation-logloss:0.46227                       
[13]	validation-logloss:0.46811                       
[14]	validation-logloss:0.47445                       
[15]	validation-logloss:0.47835                       
[16]	validation-logloss:0.48171                       
[17]	validation-logloss:0.48527                       
[18]	valid

### Prediction with Registered Model

In [10]:
client = mlflow.MlflowClient()
client.search_registered_models()

[<RegisteredModel: creation_timestamp=1675008596460, description='', last_updated_timestamp=1675008620745, latest_versions=[<ModelVersion: creation_timestamp=1675008596494, current_stage='Production', description='', last_updated_timestamp=1675008620745, name='xgboost-best-model', run_id='39d2c9fc6f184aa187a2bd09ade6a9a3', run_link='', source='./mlruns/1/39d2c9fc6f184aa187a2bd09ade6a9a3/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='xgboost-best-model', tags={}>]

In [11]:
model_name = "xgboost-best-model"
model = mlflow.pyfunc.load_model(f"models:/{model_name}/production")

y_pred = model.predict(X_test)
roc_auc_score(y_test, y_pred)

0.843672042398584