In [1]:
# Record the Python version this notebook was run with.
# NOTE(review): `!python` runs whichever `python` is first on the shell PATH,
# which is presumably (but not necessarily) the kernel's interpreter — confirm.
!python --version

Python 3.8.10


IMPORT LIBRARIES

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import xgboost as xgb
import pickle

DATA LOADING AND FEATURE VECTORIZATION

In [3]:
# Feature frames produced by the preprocessing step (train first, then validation).
X_train, X_val = (
    pd.read_parquet(features_path)
    for features_path in (
        '../processed_data/X_train.parquet',
        '../processed_data/X_val.parquet',
    )
)

# Matching label vectors, read as text and cast to integer class ids.
y_train, y_val = (
    np.loadtxt(labels_path).astype(int)
    for labels_path in (
        '../processed_data/y_train.txt',
        '../processed_data/y_val.txt',
    )
)

In [4]:
# Separate the columns into numeric and categorical groups by dtype.
numeric_frame = X_train.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_train.select_dtypes(include=['category', 'object'])

num_cols = numeric_frame.columns.tolist()
cat_cols = categorical_frame.columns.tolist()

In [5]:
print(cat_cols)
print(num_cols)
# cat_cols and num_cols are both derived from X_train.columns, so subtracting
# X_train.columns from them can never be non-empty. The informative check is
# the other direction: any column whose dtype matched neither selection would
# be silently left out of the model inputs.
print(set(X_train.columns) - set(cat_cols + num_cols))  # Should be empty

['AGE_GROUP', 'YEARS_EMPLOYED_GROUP', 'PHONE_CHANGE_GROUP']
['REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'FLOORSMAX_AVG']
set()


In [6]:
# One dict per row, restricted to the selected feature columns
# (categorical first, then numeric), ready for DictVectorizer.
feature_cols = cat_cols + num_cols

train_dicts = X_train[feature_cols].to_dict(orient='records')
val_dicts = X_val[feature_cols].to_dict(orient='records')

In [7]:
# Learn the feature mapping from the training records only, then apply the
# same fitted mapping to the validation records.
# NOTE: X_train / X_val are rebound here from the loaded DataFrames to the
# vectorizer's output. The earlier cells that call DataFrame methods on them
# (select_dtypes, column indexing) cannot be re-run after this cell without
# reloading the parquet files first.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

EXPERIMENT TRACKING

In [8]:
import mlflow

# Where runs are stored and which experiment they are grouped under.
TRACKING_URI = 'sqlite:///../cred_risk_sqlite_mlflow.db'
EXPERIMENT_NAME = 'credit_default_risk_experiment_tracking'

mlflow.set_tracking_uri(TRACKING_URI)
# Left as the last expression so the cell displays the resulting experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/08/10 10:25:22 INFO mlflow.tracking.fluent: Experiment with name 'credit_default_risk_experiment_tracking' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/mac/Projects/MLops_credit_default_risk_prediction/02-experiment-tracking/mlruns/1', creation_time=1754817922741, experiment_id='1', last_update_time=1754817922741, lifecycle_stage='active', name='credit_default_risk_experiment_tracking', tags={}>

In [9]:

with mlflow.start_run():
    # Who ran this and which model family the run belongs to.
    for tag_name, tag_value in (
        ("engineer", "adeakinwe"),
        ("model", "Logistic Regression"),
    ):
        mlflow.set_tag(tag_name, tag_value)

    # Data provenance for the run.
    for param_name, param_value in (
        ("train_data_path", "../processed_data/X_train.parquet"),
        ("val_data_path", "../processed_data/X_val.parquet"),
    ):
        mlflow.log_param(param_name, param_value)

    # Baseline linear model with class weighting set to 'balanced'.
    lr = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

    # Positive-class probabilities for AUC, hard labels for accuracy.
    y_proba = lr.predict_proba(X_val)[:, 1]
    y_pred = lr.predict(X_val)

    accuracy = round(accuracy_score(y_val, y_pred), 3)
    auc = round(roc_auc_score(y_val, y_proba), 3)

    for metric_name, metric_value in (("accuracy", accuracy), ("auc", auc)):
        mlflow.log_metric(metric_name, metric_value)

In [10]:
with mlflow.start_run():
    mlflow.set_tag("engineer", "adeakinwe")
    mlflow.set_tag("model", "XGBoost")

    mlflow.log_param("train_data_path", "../processed_data/X_train.parquet")
    mlflow.log_param("val_data_path", "../processed_data/X_val.parquet")

    # Prepare DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Weight the positive class by the negative/positive count ratio
    class_counts = pd.Series(y_train).value_counts()
    scale_pos_weight = class_counts[0] / class_counts[1]

    # XGBoost params
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "scale_pos_weight": scale_pos_weight,
    }

    mlflow.log_param("scale_pos_weight", scale_pos_weight)

    # Train with early stopping; the last entry in `evals` (the validation
    # set) is the one early stopping monitors.
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=100,
        evals=[(dtrain, "train"), (dval, "eval")],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predict and evaluate.
    # xgb.train hands back the booster from the last round, not the best one,
    # and the native predict() uses every tree by default. Restrict scoring to
    # the best iteration so the logged metrics are not taken from the rounds
    # that were trained after validation AUC stopped improving.
    best_rounds = (0, model.best_iteration + 1)
    y_pred_proba = model.predict(dval, iteration_range=best_rounds)
    y_pred = (y_pred_proba > 0.5).astype(int)

    accuracy = round(accuracy_score(y_val, y_pred), 3)
    auc = round(roc_auc_score(y_val, y_pred_proba), 3)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc", auc)

HYPER-PARAMETER TUNING

In [11]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope
import mlflow.xgboost

In [None]:
# Search space for hyperopt
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
# below are natural-log values; the approximate effective ranges are noted
# per line.
search_space = {
    # quniform yields floats; scope.int casts them to the integer depth XGBoost expects
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # ~0.05 .. 1.0
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),  # ~0.0067 .. 0.37
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),  # ~0.0025 .. 0.37
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),  # ~0.37 .. 20
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
}

# Objective function for hyperopt
def objective(params):
    """Train one XGBoost configuration and report its validation AUC to hyperopt.

    Each call opens a nested MLflow run, logs the configuration, trains on
    ``X_train``/``y_train`` with early stopping monitored on
    ``X_val``/``y_val``, and logs accuracy, AUC and the fitted booster.

    Parameters
    ----------
    params : dict
        One sample drawn from ``search_space``. It is copied before the static
        settings are added, so the caller's dict is left untouched.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}`` where ``auc`` is the
        *unrounded* validation AUC (hyperopt minimises ``loss``).
    """
    # Work on a copy so the dict handed in by hyperopt is not mutated.
    params = dict(params)

    # Calculate imbalance ratio (expects exactly the two labels 0 and 1)
    neg, pos = np.bincount(y_train)
    scale_pos_weight = neg / pos

    # Add required static params
    params['objective'] = 'binary:logistic'
    params['seed'] = 42
    params['eval_metric'] = 'auc'
    params['scale_pos_weight'] = scale_pos_weight

    run_name = f"xgb-md{params['max_depth']}-lr{params['learning_rate']:.3f}"

    with mlflow.start_run(nested=True, run_name=run_name):
        mlflow.set_tag("model", "XGBoost")
        mlflow.set_tag("engineer", "adeakinwe")

        mlflow.log_param("train_data", "../processed_data/X_train.parquet")
        mlflow.log_param("val_data", "../processed_data/X_val.parquet")

        # Floats are rounded only for logging; training uses the exact values.
        mlflow.log_params({k: round(v, 5) if isinstance(v, float) else v for k, v in params.items()})

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=100,
            evals=[(dval, 'eval')],
            early_stopping_rounds=50,
            verbose_eval=10
        )

        # xgb.train returns the booster from the last round, not the best one.
        # Keep only the trees up to the best iteration so the model that is
        # scored and logged is the one early stopping selected.
        model = model[: model.best_iteration + 1]

        y_pred_proba = model.predict(dval)
        y_pred = (y_pred_proba > 0.5).astype(int)

        accuracy = round(accuracy_score(y_val, y_pred), 3)
        raw_auc = float(roc_auc_score(y_val, y_pred_proba))
        auc = round(raw_auc, 3)

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("auc", auc)

        mlflow.xgboost.log_model(model, artifact_path="models")

        # Hand hyperopt the unrounded score: rounding to 3 decimals makes
        # distinct configurations look tied and blunts the TPE search.
        return {'loss': -raw_auc, 'status': STATUS_OK}

# Run outer MLflow parent run
from hyperopt import space_eval  # imported here because only this step needs it

with mlflow.start_run(run_name="xgboost-hyperopt"):
    trials = Trials()
    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials
    )

    # fmin returns the raw sampled values keyed by label (e.g. max_depth as a
    # float, before scope.int is applied). Map them back through the search
    # space so the logged values are the ones the objective actually received.
    best_hyperparams = space_eval(search_space, best_result)

    # Log best hyperparameters found
    mlflow.log_params({f"best_{k}": v for k, v in best_hyperparams.items()})

train and save best model

In [17]:
# Best params from hyper-parameter tuning (copied by hand from the tuning runs)
from pathlib import Path  # used only to make sure the output directory exists

best_params = {
    'max_depth': 4,
    'learning_rate': 0.13232,
    'reg_alpha': 0.02965,
    'reg_lambda': 0.1111,
    'min_child_weight': 3.19211,
    'subsample': 0.83768,
    'colsample_bytree': 0.81102,
    'objective': 'binary:logistic',
    'seed': 42,
    'eval_metric': 'auc',
    'scale_pos_weight': 11.38747
}

with mlflow.start_run(run_name="xgboost-final-auc"):

    mlflow.set_tag("model", "XGBoost")
    mlflow.set_tag("engineer", "adeakinwe")
    mlflow.log_param("train_data_path", "../processed_data/X_train.parquet")
    mlflow.log_param("val_data_path", "../processed_data/X_val.parquet")

    mlflow.log_params(best_params)

    # Prepare DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Train model; early stopping monitors AUC on the validation set
    model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=200,
        evals=[(dval, 'eval')],
        early_stopping_rounds=50,
        verbose_eval=10
    )

    # xgb.train returns the booster from the last round, i.e. including the
    # 50 patience rounds trained after the best validation score. Slice it
    # down to the best iteration so the model that is evaluated, logged and
    # pickled below is the one early stopping selected.
    model = model[: model.best_iteration + 1]

    # Evaluate
    y_proba = model.predict(dval)
    auc = round(roc_auc_score(y_val, y_proba), 3)
    mlflow.log_metric("auc", auc)

    # Log model (native)
    mlflow.xgboost.log_model(model, artifact_path="models/xgboost_model")

    # Dump model + vectorizer together so inference can rebuild the features
    model_bundle = {
        "model": model,
        "vectorizer": dv  # dict_vectorizer
    }

    bundle_path = "../models/xgb_credit_pred.bin"
    # open(..., "wb") does not create missing directories; make sure the
    # target folder exists so a fresh checkout can run this cell.
    Path(bundle_path).parent.mkdir(parents=True, exist_ok=True)
    with open(bundle_path, "wb") as f_out:
        pickle.dump(model_bundle, f_out)

    mlflow.log_artifact(bundle_path)

[0]	eval-auc:0.66864
[10]	eval-auc:0.72431
[20]	eval-auc:0.72766
[30]	eval-auc:0.72943
[40]	eval-auc:0.73074
[50]	eval-auc:0.73158
[60]	eval-auc:0.73207
[70]	eval-auc:0.73270
[80]	eval-auc:0.73318
[90]	eval-auc:0.73345
[100]	eval-auc:0.73360
[110]	eval-auc:0.73363
[120]	eval-auc:0.73343
[130]	eval-auc:0.73339
[140]	eval-auc:0.73308
[150]	eval-auc:0.73304
[160]	eval-auc:0.73277
[164]	eval-auc:0.73273




load model with mlflow run id and predict

In [18]:
# URI of the model artifact logged by a previous training run.
# NOTE(review): the run id is hard-coded, so this cell presumably only works
# against a tracking store that already contains that run — confirm, and
# update the id after re-training.
logged_model = 'runs:/b88184738ffd42ccbe32c94be8854a16/models/xgboost_model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models/xgboost_model
  flavor: mlflow.xgboost
  run_id: b88184738ffd42ccbe32c94be8854a16

In [19]:
# Load the same logged artifact through the XGBoost flavor to get the native
# model object back instead of the generic pyfunc wrapper.
xgboost_model = mlflow.xgboost.load_model(logged_model)
xgboost_model

<xgboost.core.Booster at 0x7fc4f5442bb0>

In [20]:
# Score the validation DMatrix with the reloaded model; relies on `dval`
# still being defined from an earlier cell.
# Despite the name, y_pred holds the model's raw float outputs, not
# thresholded class labels.
y_pred = xgboost_model.predict(dval)

In [25]:
# Preview only the first ten predictions rather than dumping the whole array.
y_pred[:10]

array([0.41663513, 0.33560494, 0.3576474 , 0.30611232, 0.7213146 ,
       0.53515816, 0.21677195, 0.55349094, 0.44458437, 0.2975627 ],
      dtype=float32)