In [9]:
import mlflow
import mlflow.lightgbm
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score



In [10]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay
)


In [11]:
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").values.ravel()
y_test  = pd.read_csv("../data/processed/y_test.csv").values.ravel()


In [None]:

with mlflow.start_run(run_name="lightgbm_baseline"):
    lgbm = LGBMClassifier(
        n_estimators=300,
        max_depth=-1,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary",
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        random_state=42,
        n_jobs=-1,
    )

    #  train
    lgbm.fit(X_train, y_train)

    #  predictions & metrics
    y_pred = lgbm.predict(X_test)
    y_proba = lgbm.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    #  log params & metrics
    mlflow.log_param("model", "LightGBM")
    mlflow.log_param("n_estimators", 300)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("num_leaves", 31)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)

    #  log model artifact
    mlflow.lightgbm.log_model(lgbm, artifact_path="model")

print("ROC AUC:", roc_auc)


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8455
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 98
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482




ROC AUC: 0.7596846644898383
