In [None]:
# Imports
import pandas as pd
import numpy as np
import pickle
import os
from datetime import datetime
from pathlib import Path

# MLflow
import mlflow
import mlflow.sklearn
import dagshub
from dotenv import load_dotenv

# ML Models
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Imports termin√©s")

✅ Imports terminés


In [2]:
import getpass
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm.auto import tqdm

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import mlflow.catboost
import dagshub

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, precision_score, recall_score,
    confusion_matrix
)

In [None]:
# Read DagsHub credentials / repo details from the local .env file into the
# process environment.
import getpass
load_dotenv()

# --- Configuration ---
DAGSHUB_USERNAME = os.getenv('DAGSHUB_USERNAME')
DAGSHUB_TOKEN = os.getenv('DAGSHUB_TOKEN')
DAGSHUB_REPO = os.getenv('DAGSHUB_REPO_NAME')
EXPERIMENT_NAME = "Crime_MLOPS1"
DATA_PATH = '../processors/preprocessed_data.pkl'

# --- Interactive user input ---
# The OS login name is offered as the default; any non-blank answer replaces it.
# NOTE: input() blocks, so this cell cannot run unattended.
system_user = getpass.getuser()
print(f"\nSystem detected user: {system_user}")
custom_user = input(f"Enter username for MLflow tagging (Press Enter to use '{system_user}'): ")
user_name = custom_user.strip() or system_user
print(f"üë§ MLflow User set to: {user_name}")
USER_NAME = user_name

# --- Initialize DagsHub & MLflow ---
if not (DAGSHUB_USERNAME and DAGSHUB_TOKEN and DAGSHUB_REPO):
    # At least one credential is missing: leave MLflow on its default settings.
    print("‚ö†Ô∏è Missing .env variables. Running locally only.")
else:
    # Credentials must be in the environment BEFORE any MLflow call is made.
    os.environ.update({
        'MLFLOW_TRACKING_USERNAME': DAGSHUB_USERNAME,
        'MLFLOW_TRACKING_PASSWORD': DAGSHUB_TOKEN,
        'MLFLOW_ENABLE_LOGGED_MODEL_CREATION': 'false',
    })

    MLFLOW_TRACKING_URI = f"https://dagshub.com/{DAGSHUB_USERNAME}/{DAGSHUB_REPO}.mlflow"

    # 1. Let DagsHub perform its own MLflow auth setup for the repository.
    dagshub.init(repo_owner=DAGSHUB_USERNAME, repo_name=DAGSHUB_REPO, mlflow=True)

    # 2. Point MLflow explicitly at the DagsHub tracking server.
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

    # 3. Create the experiment when it is missing, then make it the active one.
    #    Failures are reported but do not abort the cell.
    try:
        experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
        if experiment is None:
            mlflow.create_experiment(EXPERIMENT_NAME)
        mlflow.set_experiment(EXPERIMENT_NAME)
    except Exception as e:
        print(f"‚ö†Ô∏è Error setting experiment: {e}")

    print(f"‚úÖ MLflow Configured via DagsHub")
    print(f"üìä Tracking URI: {MLFLOW_TRACKING_URI}")
    print(f"üß™ Experiment: {EXPERIMENT_NAME}")


System detected user: pc
üë§ MLflow User set to: imen benamar


‚úÖ MLflow Configured via DagsHub
üìä Tracking URI: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow
üß™ Experiment: Crime_MLOPS1


In [None]:
# Fail fast with an actionable message when the preprocessing output is absent.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file '{DATA_PATH}' not found. Run preprocessing first.")

# NOTE(security): pickle.load executes arbitrary code embedded in the file, so
# DATA_PATH must only ever point at a file produced by our own preprocessing.
with open(DATA_PATH, 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Pull the four train/test pieces out of the loaded mapping in one step.
X_train, X_test, y_train, y_test = (
    data[key] for key in ('X_train_scaled', 'X_test_scaled', 'y_train', 'y_test')
)

print(f"‚úÖ Data Loaded Successfully")
print(f"   Train Shape: {X_train.shape}")
print(f"   Test Shape:  {X_test.shape}")

‚úÖ Data Loaded Successfully
   Train Shape: (319110, 17)
   Test Shape:  (79778, 17)


In [5]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import time

# Global leaderboard: starts empty and receives one row per logged run
# (rows are appended by train_evaluate_log).
leaderboard = pd.DataFrame(
    columns=[
        "Stage",
        "Model",
        "Accuracy",
        "F1_Weighted",
        "ROC_AUC",
        "Time (s)",
    ]
)

def get_run_input(model_name, stage):
    """Interactively collect run metadata for one model.

    Prints a short header, then prompts for a dataset version and a free-text
    description. A blank (or whitespace-only) answer selects the default.

    Args:
        model_name: Name of the model about to be trained; shown in the header
            and used in the default description.
        stage: Name of the training stage; shown in the header and used in the
            default description.

    Returns:
        A ``(dataset_version, description)`` tuple of stripped strings.
    """
    fallback_version = "V1"
    fallback_description = f"{stage} training for {model_name}"

    print(f"\nüìù Configuring: {model_name} ({stage})")
    print("-" * 40)

    version_answer = input(f"   Dataset Version [Enter='{fallback_version}']: ").strip()
    description_answer = input(f"   Description     [Enter='{fallback_description}']: ").strip()

    # An empty string is falsy, so `or` falls back to the default.
    dataset_version = version_answer or fallback_version
    description = description_answer or fallback_description
    return dataset_version, description

def log_confusion_matrix(y_true, y_pred, model_name, stage):
    """Render the confusion matrix as a heatmap and log it to the active MLflow run.

    The image is written to a temporary directory (not the working directory)
    so that nothing is left behind if logging fails, and the figure is always
    closed.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Model name; used in the plot title and the artifact filename.
        stage: Training stage; used in the plot title and the artifact filename.

    Returns:
        None. The side effect is a ``cm_<model_name>_<stage>.png`` artifact.
    """
    import tempfile

    cm = confusion_matrix(y_true, y_pred)

    # Explicit figure/axes: avoids drawing onto whatever pyplot figure is current.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
        ax.set_title(f'Confusion Matrix - {model_name} ({stage})')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

        # The artifact keeps the file's base name, so only the directory changes.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, f"cm_{model_name}_{stage}.png")
            fig.savefig(filename)
            mlflow.log_artifact(filename)
    finally:
        # Release the figure even when plotting or logging raised.
        plt.close(fig)

def log_roc_curve(y_true, y_prob, model_name, stage, classes=None):
    """Plot one-vs-rest ROC curves and log the image to the active MLflow run.

    Args:
        y_true: Ground-truth labels.
        y_prob: Array of per-class scores with one column per class (as
            returned by ``predict_proba``), or ``None`` when the model exposes
            no probabilities, in which case nothing is logged.
        model_name: Model name; used in the plot title and the artifact filename.
        stage: Training stage; used in the plot title and the artifact filename.
        classes: Optional class labels in the column order of ``y_prob``
            (e.g. ``model.classes_``). Defaults to the sorted unique values of
            ``y_true``.

    Returns:
        None. The side effect is a ``roc_<model_name>_<stage>.png`` artifact.
        The plot is skipped with a printed warning when the class labels cannot
        be matched one-to-one with the columns of ``y_prob``.
    """
    if y_prob is None:
        return

    import tempfile

    y_prob = np.asarray(y_prob)
    n_classes = y_prob.shape[1]

    classes = np.unique(y_true) if classes is None else np.asarray(classes)
    if len(classes) != n_classes:
        # Without a one-to-one mapping the curves would be mislabeled or the
        # column lookup would overflow; skipping is safer than guessing.
        print(
            f"Skipping ROC curve for {model_name} ({stage}): "
            f"{len(classes)} class label(s) but {n_classes} probability column(s)."
        )
        return

    # Binarize labels for the one-vs-rest ROC computation.
    y_bin = label_binarize(y_true, classes=classes)
    if n_classes == 2 and y_bin.shape[1] == 1:
        # For two classes label_binarize yields a single positive-class column;
        # expand it so column i lines up with y_prob[:, i].
        y_bin = np.hstack([1 - y_bin, y_bin])

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_bin[:, i], y_prob[:, i])
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, lw=2, label=f'Class {classes[i]} (area = {roc_auc:.2f})')

        # Diagonal reference line (chance level).
        ax.plot([0, 1], [0, 1], 'k--', lw=2)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve - {model_name} ({stage})')
        ax.legend(loc="lower right")

        # Write to a temporary directory so no file is left behind on failure;
        # the artifact keeps the file's base name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, f"roc_{model_name}_{stage}.png")
            fig.savefig(filename)
            mlflow.log_artifact(filename)
    finally:
        plt.close(fig)

def train_evaluate_log(model_name, model, X_train, y_train, X_test, y_test, stage,
                       dataset_version, description, params=None, tune=False, param_dist=None, n_iter=6):
    """Train (optionally tune) a model, evaluate it, and log everything to MLflow.

    Opens one MLflow run named ``<model_name>_<stage>``, tags it, fits the model
    (or runs a ``RandomizedSearchCV`` when tuning), computes accuracy, weighted
    F1 and weighted one-vs-rest ROC AUC on the test split, logs the pickled
    model plus confusion-matrix and ROC plots as artifacts, and appends a row to
    the global ``leaderboard``.

    Args:
        model_name: Display name; used for the run name, tags and artifact names.
        model: Unfitted (or refittable) estimator with ``fit``/``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        stage: Stage label (e.g. "Baseline"); used for the run name and tags.
        dataset_version: Value stored in the ``dataset_version`` tag.
        description: MLflow run description.
        params: Optional parameters to log; replaced by the best parameters
            when a search is run.
        tune: When true and ``param_dist`` is non-empty, run a randomized search.
        param_dist: Parameter distributions for the randomized search.
        n_iter: Number of sampled candidates for the randomized search.

    Returns:
        The fitted estimator (the search's best estimator when tuning ran).
    """
    global leaderboard

    import tempfile

    with mlflow.start_run(run_name=f"{model_name}_{stage}", description=description):
        # Tags
        mlflow.set_tag("user", USER_NAME)
        mlflow.set_tag("dataset_version", dataset_version)
        mlflow.set_tag("model_name", model_name)
        mlflow.set_tag("stage", stage)

        # Tuning only happens when it is requested AND a search space exists.
        # Tracking that in one flag guarantees the model is fitted either way
        # (previously tune=True without param_dist left it unfitted).
        run_search = bool(tune and param_dist)
        tune_time = 0
        final_model = model
        if run_search:
            start_tune = time.time()
            search = RandomizedSearchCV(
                model, param_dist, n_iter=n_iter, scoring='f1_weighted',
                cv=3, verbose=0, n_jobs=-1, random_state=42,
            )
            search.fit(X_train, y_train)
            tune_time = time.time() - start_tune
            final_model = search.best_estimator_
            params = search.best_params_
            mlflow.log_metric("tuning_time", tune_time)

        if params:
            mlflow.log_params(params)

        # Training (the search already refit the best estimator when it ran).
        start_train = time.time()
        if not run_search:
            final_model.fit(X_train, y_train)
        train_time = time.time() - start_train

        # Prediction
        y_pred = final_model.predict(X_test)

        # Probabilities are optional: not every estimator provides them.
        y_prob = None
        if hasattr(final_model, "predict_proba"):
            try:
                y_prob = np.asarray(final_model.predict_proba(X_test))
            except Exception as exc:
                print(f"predict_proba failed for {model_name} ({stage}): {exc}")

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        roc_auc = 0  # kept as the fallback value when the score cannot be computed
        if y_prob is not None:
            try:
                # Binary targets need the positive-class column only; passing
                # the full two-column matrix makes roc_auc_score raise.
                y_score = y_prob[:, 1] if y_prob.ndim == 2 and y_prob.shape[1] == 2 else y_prob
                roc_auc = roc_auc_score(y_test, y_score, multi_class='ovr', average='weighted')
            except Exception as exc:
                print(f"ROC AUC unavailable for {model_name} ({stage}); logging 0: {exc}")

        # Log Metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_weighted", f1)
        mlflow.log_metric("roc_auc", roc_auc)
        if not run_search:
            mlflow.log_metric("training_time", train_time)

        # Save Model (best effort): pickle into a temporary directory so no
        # file is left in the working directory, and report any failure.
        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                fname = os.path.join(tmp_dir, f"{model_name}_{stage}.pkl")
                with open(fname, "wb") as f:
                    pickle.dump(final_model, f)
                mlflow.log_artifact(fname)
        except Exception as exc:
            print(f"Could not log pickled model for {model_name} ({stage}): {exc}")

        # --- LOG IMAGES --- (best effort: a plotting or upload problem must
        # not throw away a model that has already been trained)
        try:
            log_confusion_matrix(y_test, y_pred, model_name, stage)
        except Exception as exc:
            print(f"Could not log confusion matrix for {model_name} ({stage}): {exc}")
        try:
            log_roc_curve(y_test, y_prob, model_name, stage)
        except Exception as exc:
            print(f"Could not log ROC curve for {model_name} ({stage}): {exc}")

        # Update Leaderboard
        new_row = pd.DataFrame([{
            "Stage": stage, "Model": model_name,
            "Accuracy": acc, "F1_Weighted": f1,
            "ROC_AUC": roc_auc, "Time (s)": train_time + tune_time
        }])
        leaderboard = pd.concat([leaderboard, new_row], ignore_index=True)

        return final_model

In [6]:
# --- Baseline Configurations ---
# One ready-to-fit estimator per model family, keyed by display name.
# Every estimator is seeded with random_state=42.
base_models_config = {
    "RandomForest": RandomForestClassifier(
        n_estimators=150,
        max_depth=15,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=6,
        eval_metric='mlogloss',
        n_jobs=-1,
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=150,
        learning_rate=0.1,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
        verbose=-1,
    ),
    "CatBoost": CatBoostClassifier(
        iterations=150,
        learning_rate=0.1,
        depth=6,
        verbose=0,
        random_state=42,
        allow_writing_files=False,
    ),
}

# --- Hyperparameter Grids ---
# Candidate values for the randomized search; keys match base_models_config.
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 300, 600],
        'max_depth': [10, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 3, 6],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', 'balanced_subsample'],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5],
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [20, 31, 50, 100],
        'min_child_samples': [10, 20, 30],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'class_weight': ['balanced', None],
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'random_strength': [1, 2, 5],
        # 'None' is passed as a string here, exactly as in the original grid.
        'auto_class_weights': ['Balanced', 'None'],
    },
}

# Fitted estimators produced by the baseline and tuning phases, keyed by model name.
trained_baseline = {}
trained_tuned = {}

In [7]:
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Phase 1: fit every baseline configuration as-is.
# Metadata is requested interactively before each model starts training.
# ---------------------------------------------------------------------------
print("\n" + "="*50 + "\n‚ñ∂Ô∏è  Phase 1: Baseline Models\n" + "-"*50)

for name, model_inst in tqdm(base_models_config.items(), desc="Baseline Models"):
    ver, desc = get_run_input(name, "Baseline")

    trained_baseline[name] = train_evaluate_log(
        model_name=name,
        model=model_inst,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        stage="Baseline",
        dataset_version=ver,
        description=desc,
    )

# ---------------------------------------------------------------------------
# Phase 2: randomized hyperparameter search for every model that has a grid.
# ---------------------------------------------------------------------------
print("\n" + "-"*50 + "\n‚ñ∂Ô∏è  Phase 2: Hyperparameter Tuning\n" + "-"*50)

for name, model_inst in tqdm(base_models_config.items(), desc="Tuning Models"):
    if name not in param_grids:
        continue

    ver, desc = get_run_input(name, "Tuned")

    trained_tuned[name] = train_evaluate_log(
        model_name=name,
        model=model_inst,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        stage="Tuned",
        dataset_version=ver,
        description=desc,
        tune=True,
        param_dist=param_grids[name],
        n_iter=6,
    )

# ---------------------------------------------------------------------------
# Phase 3: stack the models from each earlier phase under a
# LogisticRegression meta-learner.
# ---------------------------------------------------------------------------
print("\n" + "-"*50 + "\n‚ñ∂Ô∏è  Phase 3: Stacking Ensembles\n" + "-"*50)

# Stack of the baseline models
ver, desc = get_run_input("StackingClassifier", "Ensemble_Baseline")
stack_base = StackingClassifier(
    estimators=list(trained_baseline.items()),
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
train_evaluate_log(
    model_name="StackingClassifier",
    model=stack_base,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    stage="Ensemble_Baseline",
    dataset_version=ver,
    description=desc,
)

# Stack of the tuned models
ver, desc = get_run_input("StackingClassifier", "Ensemble_Tuned")
stack_tuned = StackingClassifier(
    estimators=list(trained_tuned.items()),
    final_estimator=LogisticRegression(),
    n_jobs=-1,
)
train_evaluate_log(
    model_name="StackingClassifier",
    model=stack_tuned,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    stage="Ensemble_Tuned",
    dataset_version=ver,
    description=desc,
)

# Final leaderboard, best weighted F1 first, with that column shaded.
display(
    leaderboard
    .sort_values(by="F1_Weighted", ascending=False)
    .style
    .background_gradient(subset=["F1_Weighted"], cmap="Greens")
)


‚ñ∂Ô∏è  Phase 1: Baseline Models
--------------------------------------------------


Baseline Models:   0%|          | 0/4 [00:00<?, ?it/s]


üìù Configuring: RandomForest (Baseline)
----------------------------------------


üèÉ View run RandomForest_Baseline at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/962ebae059894035976279f8e324c362
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Baseline Models:  25%|‚ñà‚ñà‚ñå       | 1/4 [06:52<20:37, 412.63s/it]


üìù Configuring: XGBoost (Baseline)
----------------------------------------
üèÉ View run XGBoost_Baseline at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/9b4a85cfd0c040ff8fe1259737b1772d
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Baseline Models:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [10:49<10:17, 308.96s/it]


üìù Configuring: LightGBM (Baseline)
----------------------------------------
üèÉ View run LightGBM_Baseline at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/a75eecf0ff354ca3954d2e7d4086fc0d
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Baseline Models:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [11:57<03:19, 199.06s/it]


üìù Configuring: CatBoost (Baseline)
----------------------------------------
üèÉ View run CatBoost_Baseline at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/ed57771e298242b6babd54377765fa11
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Baseline Models: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [12:54<00:00, 193.70s/it]



--------------------------------------------------
‚ñ∂Ô∏è  Phase 2: Hyperparameter Tuning
--------------------------------------------------


Tuning Models:   0%|          | 0/4 [00:00<?, ?it/s]


üìù Configuring: RandomForest (Tuned)
----------------------------------------
üèÉ View run RandomForest_Tuned at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/0a280f5dfd5141ec92ac02b6272c5cc7
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Tuning Models:  25%|‚ñà‚ñà‚ñå       | 1/4 [22:05<1:06:15, 1325.19s/it]


üìù Configuring: XGBoost (Tuned)
----------------------------------------
üèÉ View run XGBoost_Tuned at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/6f4bb9bb6a154195bdf1df06359cc1a1
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Tuning Models:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [33:39<31:47, 953.91s/it]   


üìù Configuring: LightGBM (Tuned)
----------------------------------------
üèÉ View run LightGBM_Tuned at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/27827760550d41cc933a64dc4938226e
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Tuning Models:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [55:21<18:32, 1112.97s/it]


üìù Configuring: CatBoost (Tuned)
----------------------------------------
üèÉ View run CatBoost_Tuned at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/66a520c8050f4d85b93ea292275ad0d2
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


Tuning Models: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [1:10:47<00:00, 1062.00s/it]


--------------------------------------------------
‚ñ∂Ô∏è  Phase 3: Stacking Ensembles
--------------------------------------------------

üìù Configuring: StackingClassifier (Ensemble_Baseline)
----------------------------------------





üèÉ View run StackingClassifier_Ensemble_Baseline at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/d9f182f672f4447f955a88a8479e769a
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2

üìù Configuring: StackingClassifier (Ensemble_Tuned)
----------------------------------------
üèÉ View run StackingClassifier_Ensemble_Tuned at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2/runs/5a9f4ea8f69c4b2689b7d413c2b17b55
üß™ View experiment at: https://dagshub.com/YomnaJL/MLOPS_Project.mlflow/#/experiments/2


OSError: [WinError 87] Paramètre incorrect