In [13]:
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score



In [14]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay
)


In [15]:
# Feature matrices stay as DataFrames so the column names travel with them.
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")

# Target files are read as DataFrames first, then flattened to 1-D arrays.
y_train_frame = pd.read_csv("../data/processed/y_train.csv")
y_test_frame = pd.read_csv("../data/processed/y_test.csv")
y_train = y_train_frame.to_numpy().ravel()
y_test = y_test_frame.to_numpy().ravel()


In [18]:
import mlflow

# Select the MLflow experiment that runs started after this call are filed under.
experiment_name = "credit_risk_experiment"
mlflow.set_experiment(experiment_name)


2025/11/22 08:36:01 INFO mlflow.tracking.fluent: Experiment with name 'credit_risk_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/projects/Credit-Risk-Assessment-System/notebooks/mlruns/845896167360739608', creation_time=1763796961241, experiment_id='845896167360739608', last_update_time=1763796961241, lifecycle_stage='active', name='credit_risk_experiment', tags={}>

In [19]:
with mlflow.start_run(run_name="lightgbm_baseline"):

    # Weight the positive class by the negative/positive ratio of the training
    # labels to compensate for class imbalance.
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    scale_pos_weight = float(n_negative / n_positive)

    # Single source of truth for the hyper-parameters: the same dict builds the
    # model and is logged to MLflow, so the tracked values cannot drift from
    # what was actually trained.
    lgbm_params = {
        "n_estimators": 300,
        "max_depth": -1,
        "learning_rate": 0.05,
        "num_leaves": 31,
        "subsample": 0.8,
        # Row subsampling is only applied when subsample_freq > 0; with the
        # wrapper's default of 0 the `subsample` value above is ignored.
        "subsample_freq": 1,
        "colsample_bytree": 0.8,
        "objective": "binary",
        "scale_pos_weight": scale_pos_weight,
        "random_state": 42,
        "n_jobs": -1,
    }

    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(X_train, y_train)

    # Class labels feed the label-based metrics; positive-class probabilities
    # (column 1 of predict_proba) feed ROC AUC.
    y_pred = lgbm.predict(X_test)
    y_proba = lgbm.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Parameters: model family plus every constructor argument used above.
    mlflow.log_param("model", "LightGBM")
    mlflow.log_params(lgbm_params)

    # Metrics: the original "roc_auc" / "f1_score" keys are kept unchanged;
    # precision and recall are added alongside them.
    mlflow.log_metrics(
        {
            "roc_auc": roc_auc,
            "f1_score": f1,
            "precision": precision,
            "recall": recall,
        }
    )

    # Store the fitted model as a run artifact under "model".
    mlflow.lightgbm.log_model(lgbm, "model")


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8455
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 98
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482




In [20]:
import joblib
import os

# Destination of the pickled estimator; its parent folder is created on demand
# (no error if it already exists).
model_path = "../models/lightgbm_best.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the fitted LightGBM classifier; joblib.dump returns the list of
# written file names, which the notebook shows as the cell output.
joblib.dump(lgbm, model_path)


['../models/lightgbm_best.pkl']