## Here we'll run several experiments with different models on the same dataset

In [202]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
import mlflow
warnings.filterwarnings('ignore')

In [203]:
# import mlflow
# 
# # End any currently active MLflow run
# if mlflow.active_run():
#     mlflow.end_run()
#     mlflow.start_run()

In [204]:
# Synthetic binary classification problem with a 90/10 class weighting.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=8,
    weights=[0.9, 0.1],
    flip_y=0,
    random_state=42,
)

X

array([[-2.33224581, -1.73015228, -4.05388142, ...,  0.85612326,
        -3.13555024,  1.22003574],
       [ 0.78663065,  0.72655494,  1.07774555, ..., -0.32878979,
         0.66469403, -0.32691647],
       [ 1.4709666 ,  1.32562619,  2.08216446, ..., -0.60558465,
         1.33361742, -0.63083993],
       ...,
       [-2.78244158, -1.9136032 , -5.14120195, ...,  0.97924357,
        -4.15435048,  1.54457183],
       [-0.43118285, -0.76179492,  0.14539007, ...,  0.2819945 ,
         0.6344439 , -0.0358334 ],
       [-1.41528041, -1.09134101, -2.37613076, ...,  0.53112078,
        -1.78892693,  0.71585107]])

In [205]:
# Display the label vector produced by make_classification.
y

array([0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [206]:
np.unique(y, return_counts=True) # per-class sample counts: as you can see, the dataset is imbalanced


(array([0, 1]), array([900, 100], dtype=int64))

In [207]:
# Hold out 30% of the samples for evaluation; stratifying on y keeps the
# class ratio the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# One classification_report dict per experiment, appended in execution order.
reports = list()


## Experiment1: Logistic Regression Classifier

In [208]:
# Baseline: logistic regression fitted on the original (imbalanced) training split.
log_reg = LogisticRegression(solver='liblinear', C=1).fit(x_train, y_train)
y_pred_log_reg = log_reg.predict(x_test)
print(classification_report(y_test, y_pred_log_reg))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       270
           1       0.60      0.50      0.55        30

    accuracy                           0.92       300
   macro avg       0.77      0.73      0.75       300
weighted avg       0.91      0.92      0.91       300



Acceptable on the first class, horribly bad on the second class, because the dataset is imbalanced.

In [209]:
# Keep the dict form of the C=1 logistic-regression report for later tracking.
report = classification_report(
    y_test,
    y_pred_log_reg,
    output_dict=True,
)
reports.append(report)

Now a logistic regression model with different parameters (C=0.5).

In [210]:
# Second logistic-regression variant: C=0.5, same solver and training data.
log_reg_sec = LogisticRegression(C=0.5, solver = 'liblinear')
log_reg_sec.fit(x_train, y_train)

# Store these predictions under their own name so the C=1 predictions in
# `y_pred_log_reg` are not overwritten; otherwise re-running the earlier report
# cell would append this model's metrics in place of the C=1 model's.
y_pred_log_reg_sec = log_reg_sec.predict(x_test)
print(classification_report(y_test, y_pred_log_reg_sec))

# Dict form of the same report, kept for later tracking.
report = classification_report(y_test, y_pred_log_reg_sec, output_dict=True)
reports.append(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       270
           1       0.58      0.50      0.54        30

    accuracy                           0.91       300
   macro avg       0.76      0.73      0.74       300
weighted avg       0.91      0.91      0.91       300



##  Experiment 2: Random Forest Classifier


In [211]:
# Shallow random forest on the original (imbalanced) training split.
# random_state is fixed so the forest, and therefore the reported metrics,
# are reproducible across kernel restarts, matching the seed used for the
# dataset and the train/test split.
rf = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       270
           1       0.95      0.67      0.78        30

    accuracy                           0.96       300
   macro avg       0.96      0.83      0.88       300
weighted avg       0.96      0.96      0.96       300



Performance on the second class has improved, but recall is still low.

In [212]:
# Keep the dict form of the random-forest report for later tracking.
report = classification_report(
    y_test,
    y_pred_rf,
    output_dict=True,
)
reports.append(report)

## Experiment 3: XGBoost

In [213]:
# Gradient-boosted trees on the original (imbalanced) training split.
xgb = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb.fit(x_train, y_train)

y_pred_xgb = xgb.predict(x_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       270
           1       0.96      0.80      0.87        30

    accuracy                           0.98       300
   macro avg       0.97      0.90      0.93       300
weighted avg       0.98      0.98      0.98       300



Performance improved, though this is not the best result we can get.

In [214]:
# Keep the dict form of the XGBoost (imbalanced) report for later tracking.
report = classification_report(
    y_test,
    y_pred_xgb,
    output_dict=True,
)
reports.append(report)

## Experiment 4: XGBoost after handling the class imbalance using SMOTETomek Technique

In [215]:
from imblearn.combine import SMOTETomek


# Resample the training split only; the test split is left untouched.
smt = SMOTETomek(random_state=42)
x_train_res, y_train_res = smt.fit_resample(
    x_train,
    y_train,
)

# Per-class counts after resampling.
np.unique(y_train_res, return_counts=True)


(array([0, 1]), array([619, 619], dtype=int64))

The classes are balanced now.

In [216]:
# Same XGBoost configuration, this time fitted on the resampled training data.
xgb_bal = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_bal.fit(x_train_res, y_train_res)

# NOTE: `y_pred_xgb` is reassigned here, replacing the predictions of the
# XGBoost model trained on the imbalanced data.
y_pred_xgb = xgb_bal.predict(x_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       270
           1       0.81      0.83      0.82        30

    accuracy                           0.96       300
   macro avg       0.89      0.91      0.90       300
weighted avg       0.96      0.96      0.96       300



Recall improved slightly (0.80 → 0.83), though precision dropped (0.96 → 0.81).

In [217]:
# Keep the dict form of the balanced-data XGBoost report for later tracking.
report = classification_report(
    y_test,
    y_pred_xgb,
    output_dict=True,
)
reports.append(report)

## Experiment 5: Logistic regression (C=1) on balanced data

In [218]:
# Logistic regression (C=1) fitted on the resampled training data.
log_reg_bal = LogisticRegression(solver='liblinear', C=1).fit(x_train_res, y_train_res)
y_pred_log_reg_balanced = log_reg_bal.predict(x_test)
print(classification_report(y_test, y_pred_log_reg_balanced))

              precision    recall  f1-score   support

           0       0.98      0.88      0.93       270
           1       0.45      0.87      0.59        30

    accuracy                           0.88       300
   macro avg       0.72      0.87      0.76       300
weighted avg       0.93      0.88      0.90       300



In [219]:
# Keep the dict form of the balanced-data logistic-regression report and display it.
report = classification_report(
    y_test,
    y_pred_log_reg_balanced,
    output_dict=True,
)
reports.append(report)
report

{'0': {'precision': 0.9834710743801653,
  'recall': 0.8814814814814815,
  'f1-score': 0.9296875,
  'support': 270.0},
 '1': {'precision': 0.4482758620689655,
  'recall': 0.8666666666666667,
  'f1-score': 0.5909090909090909,
  'support': 30.0},
 'accuracy': 0.88,
 'macro avg': {'precision': 0.7158734682245654,
  'recall': 0.8740740740740741,
  'f1-score': 0.7602982954545454,
  'support': 300.0},
 'weighted avg': {'precision': 0.9299515531490453,
  'recall': 0.88,
  'f1-score': 0.8958096590909091,
  'support': 300.0}}

### Random forest on balanced data

In [220]:
# Same shallow random forest, fitted on the resampled training data.
# random_state is fixed so the forest, and therefore the reported metrics,
# are reproducible across kernel restarts, matching the seed used for the
# dataset, the split and the resampler.
rf_bal = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=42)
rf_bal.fit(x_train_res, y_train_res)
y_pred_rf_balanced = rf_bal.predict(x_test)
print(classification_report(y_test, y_pred_rf_balanced))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       270
           1       0.54      0.90      0.68        30

    accuracy                           0.91       300
   macro avg       0.76      0.91      0.81       300
weighted avg       0.94      0.91      0.92       300



In [221]:
# Keep the dict form of the balanced-data random-forest report and display it.
report = classification_report(
    y_test,
    y_pred_rf_balanced,
    output_dict=True,
)
reports.append(report)
report

{'0': {'precision': 0.988,
  'recall': 0.9148148148148149,
  'f1-score': 0.95,
  'support': 270.0},
 '1': {'precision': 0.54, 'recall': 0.9, 'f1-score': 0.675, 'support': 30.0},
 'accuracy': 0.9133333333333333,
 'macro avg': {'precision': 0.764,
  'recall': 0.9074074074074074,
  'f1-score': 0.8125,
  'support': 300.0},
 'weighted avg': {'precision': 0.9431999999999999,
  'recall': 0.9133333333333333,
  'f1-score': 0.9225,
  'support': 300.0}}

## Tracking the experiments

In [222]:
# One row per tracked experiment: (run name, fitted estimator, params to log).
# The row order must match the order in which reports were appended to
# `reports`, because the tracking loop pairs the two lists by position.
_experiment_rows = (
    (
        "Logistic Regression (Imbalanced c = 1)",
        log_reg,
        {"name": "LR c=1 IM", "C": 1, "solver": "liblinear"},
    ),
    (
        "Logistic Regression (Imbalanced c = 0.5)",
        log_reg_sec,
        {"name": "LR c=0.5 IM", "C": 0.5, "solver": "liblinear"},
    ),
    (
        "Random Forest (Imbalanced)",
        rf,
        {"name": "RF IM", "n_estimators": 30, "max_depth": 3},
    ),
    (
        "XGBClassifier (Imbalanced)",
        xgb,
        {"name": "XGB", "use_label_encoder": False, "eval_metric": 'logloss'},
    ),
    (
        "XGBSMOTE",
        xgb_bal,
        {"name": "XGBSMOTE", "use_label_encoder": False, "eval_metric": 'logloss'},
    ),
    (
        "Logistic Regression (balanced c = 1)",
        log_reg_bal,
        {"name": "LR c=1 BAL", "C": 1, "solver": "liblinear"},
    ),
    (
        "Random Forest (balanced)",
        rf_bal,
        {"name": "RF BAL", "n_estimators": 30, "max_depth": 3},
    ),
)

models = [
    {"name": run_name, "model": estimator, "params": logged_params}
    for run_name, estimator, logged_params in _experiment_rows
]

In [223]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [224]:
# Point the client at the tracking server BEFORE selecting the experiment.
# set_experiment() looks up (or creates) the experiment against whichever
# tracking URI is active when it is called, so calling it first would bind the
# experiment to the default store rather than to this server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Logistic Regression vs Random Forest vs XGBoost vs Balanced XGBoost v4")

# `reports` is filled by append() calls in earlier cells and is paired with
# `models` purely by position. Re-running any of those cells adds extra entries,
# which would silently attach the wrong metrics to a run, so a count mismatch
# fails loudly instead.
if len(reports) != len(models):
    raise ValueError(
        f"Expected one report per model, got {len(reports)} reports for "
        f"{len(models)} models; restart the kernel and run all cells in order."
    )

# One MLflow run per experiment: params, headline metrics, and the fitted model.
for element, report in zip(models, reports):
    model_name = element['name']
    model = element['model']

    with mlflow.start_run(run_name=model_name):
        # Hand-recorded hyperparameters (plus the short display name).
        mlflow.log_params(element['params'])

        # Metrics taken from the classification_report dict for this model.
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        # Runs whose name contains "XGB" are logged with the xgboost flavor,
        # everything else with the scikit-learn flavor. The run name doubles as
        # the artifact path.
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, artifact_path=model_name)
        else:
            mlflow.sklearn.log_model(model, artifact_path=model_name)
        



🏃 View run Logistic Regression (Imbalanced c = 1) at: http://127.0.0.1:5000/#/experiments/6/runs/c5bf0b643ba549b28b79e05f44e7f849
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run Logistic Regression (Imbalanced c = 0.5) at: http://127.0.0.1:5000/#/experiments/6/runs/48f3db604bc74da784e7775cf1484c22
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run Random Forest (Imbalanced) at: http://127.0.0.1:5000/#/experiments/6/runs/b560e391d7a34ef09e53ab825c77e47e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run XGBClassifier (Imbalanced) at: http://127.0.0.1:5000/#/experiments/6/runs/824273afc53d4a068803ff85e7b1d70b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run XGBSMOTE at: http://127.0.0.1:5000/#/experiments/6/runs/2388619a4af949f898c9f34a311929a1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run Logistic Regression (balanced c = 1) at: http://127.0.0.1:5000/#/experiments/6/runs/b3d64b1c2b424a348621ed973c5afc23
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6




🏃 View run Random Forest (balanced) at: http://127.0.0.1:5000/#/experiments/6/runs/94cc708a241a4ed089bdeb3091428921
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6


### Registering the best model

In [234]:
# Register the XGBSMOTE run's model artifact under the registry name "XGB".
result = mlflow.register_model(
    model_uri="runs:/2388619a4af949f898c9f34a311929a1/XGBSMOTE",
    name="XGB",
)

Registered model 'XGB' already exists. Creating a new version of this model...
2025/01/18 22:25:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGB, version 2
Created version '2' of model 'XGB'.


In [263]:
from mlflow.tracking import MlflowClient

# Connect to the tracking server and print the name of every registered model.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
client = MlflowClient()

for registered in client.search_registered_models():
    print(registered.name)

Logistic Regressors
Production XGB Classifier
XGB
XGBSmote


In [252]:
from mlflow.tracking import MlflowClient

# Print version, stage and source URI for every version of one registered model.
client = MlflowClient()
# NOTE(review): the listing cell with the highest execution count in this
# notebook prints "XGBSmote", not "XGBSmotet" -- confirm this name still exists
# in the registry.
model_name = "XGBSmotet"

matching_versions = client.search_model_versions(f"name='{model_name}'")
for version_info in matching_versions:
    print(f"Version: {version_info.version}, Stage: {version_info.current_stage}, URI: {version_info.source}")


Version: 1, Stage: None, URI: file:///C:/Users/amine/PycharmProjects/ML_Flow_Project/mlruns/6/2388619a4af949f898c9f34a311929a1/artifacts/XGBSMOTE


In [239]:
from mlflow.xgboost import load_model

# Load the model through the registry entry created by the
# mlflow.register_model(...) call above (`result`) rather than through an
# absolute file:// path, which is machine-specific and goes stale whenever the
# runs are logged again.
model_uri = f"models:/{result.name}/{result.version}"

loaded_model = load_model(model_uri)

# Score the held-out split with the reloaded model and display the predictions.
y_predict = loaded_model.predict(x_test)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0])

### Transitioning from Dev Environment to Production

In [251]:
from mlflow.tracking import MlflowClient

# Connect to the tracking server and print the name of every registered model.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
client = MlflowClient()

for registered in client.search_registered_models():
    print(registered.name)

Logistic Regressors
XGB
XGBSmotet


In [None]:
# Register the XGBSMOTE run's model artifact under the production registry name.
run_id = "2388619a4af949f898c9f34a311929a1"
dev_model_uri = "runs:/{}/XGBSMOTE".format(run_id)

registered_model = mlflow.register_model(dev_model_uri, "Production XGB Classifier")

### The champion model can now be packaged and deployed (on AWS, for example)

In [None]:
from mlflow.xgboost import load_model

# Load the champion through the registry entry created by the
# mlflow.register_model(...) call above (`registered_model`) rather than through
# an absolute file:// path, which is machine-specific and goes stale whenever
# the runs are logged again.
model_uri = f"models:/{registered_model.name}/{registered_model.version}"

loaded_model = load_model(model_uri)

# Score the held-out split with the reloaded model and display the predictions.
y_predict = loaded_model.predict(x_test)
y_predict