# 02 - Model Training & MLflow (reproducible)
Purpose: train a simple model, track with MLflow, save model artifact and metadata.
Requirements: pip install mlflow scikit-learn joblib pandas numpy


In [None]:
"""
Imports and MLflow basic configuration.
"""

from pathlib import Path
import logging
import joblib
import json
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Project paths, all anchored at the current working directory.
ROOT = Path.cwd()
PROCESSED_DIR = ROOT / "data" / "processed"
MODEL_DIR = ROOT / "model_registry"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Local MLflow tracking URI (file-backed).
# Path.as_uri() produces a well-formed file URI on every platform (for example
# file:///C:/... on Windows) and percent-encodes characters such as spaces,
# which a hand-built "file://" + path string does not.
MLFLOW_TRACKING_URI = (ROOT / "mlruns").as_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
logging.basicConfig(level=logging.INFO)


## 2.1 — Load processed data (from notebook 01)
This cell expects `data/processed/train.csv` and `data/processed/val.csv` to be present, each containing a `target` column.


In [None]:
# Read the training and validation splits from the processed-data directory.
train, val = (
    pd.read_csv(PROCESSED_DIR / filename) for filename in ("train.csv", "val.csv")
)

# Every column except the label is used as a model input.
feature_cols = [name for name in train.columns if name != "target"]

# Split each frame into its feature matrix and its label column.
X_train, y_train = train[feature_cols], train["target"]
X_val, y_val = val[feature_cols], val["target"]


## 2.2 — Training function (SRP: single responsibility)
The function trains the model, logs parameters, metrics, and the model artifact to MLflow, and returns the saved model path, the run ID, and a metadata dictionary.


In [None]:
def train_and_log(run_name: str = "sklearn-logreg", C: float = 1.0, max_iter: int = 100):
    """
    Train a logistic regression model and record the run in MLflow.

    Fits on the notebook-level ``X_train``/``y_train``, scores accuracy on
    ``X_val``/``y_val``, dumps the fitted model into ``MODEL_DIR`` with joblib,
    logs that file to the MLflow run, and writes a JSON metadata file next to it.

    Args:
        run_name: Name given to the MLflow run.
        C: Value forwarded to ``LogisticRegression(C=...)`` and logged as a param.
        max_iter: Value forwarded to ``LogisticRegression(max_iter=...)`` and
            logged as a param.

    Returns:
        A tuple ``(model_path, run_id, meta)``: the saved model path as a string,
        the MLflow run id, and the metadata dict (keys ``run_id``,
        ``val_accuracy``, ``artifact``) that was written to disk.
    """
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_param("C", C)
        mlflow.log_param("max_iter", max_iter)

        # `multi_class` is deliberately not passed: "auto" was already the default,
        # and the parameter is deprecated in scikit-learn 1.5+ (passing it warns
        # and will eventually fail once the parameter is removed).
        model = LogisticRegression(C=C, max_iter=max_iter, solver="lbfgs")
        model.fit(X_train, y_train)

        preds = model.predict(X_val)
        # Cast to a plain float so the value is JSON-serializable below.
        acc = float(accuracy_score(y_val, preds))
        mlflow.log_metric("val_accuracy", acc)

        # Save model artifact to model_registry; the filename carries the first
        # 8 characters of the run id so each run gets its own file.
        run_id = run.info.run_id
        model_name = f"model_{run_id[:8]}.pkl"
        model_path = MODEL_DIR / model_name
        joblib.dump(model, model_path)

        # Log artifact into mlflow run as well
        mlflow.log_artifact(str(model_path), artifact_path="model")

        # Save metadata alongside the model (file is named "<model_name>.meta.json").
        meta = {"run_id": run_id, "val_accuracy": acc, "artifact": str(model_path)}
        (MODEL_DIR / f"{model_name}.meta.json").write_text(
            json.dumps(meta), encoding="utf-8"
        )

        logging.info("Trained model logged. run_id=%s acc=%s", run_id, acc)
        return str(model_path), run_id, meta

# Run one training pass, raising the iteration cap above the function default (200 vs 100).
model_path, run_id, meta = train_and_log(C=1.0, max_iter=200)
# Bare expression: as the last statement of the notebook cell it is shown as the cell output.
model_path, meta


## 2.3 — Minimal model loading util used by deployment
Define a tiny loader that the deployment service will call.


In [None]:
def load_model(path: str):
    """
    Deserialize a joblib-saved model and return it.

    Args:
        path: Location of the joblib file on disk.

    Returns:
        The object stored in the file.
    """
    # NOTE: joblib files are pickle-based; only load artifacts from trusted sources.
    restored = joblib.load(path)
    return restored

# Smoke test: reload the artifact written by the training cell and print its type.
m = load_model(model_path)
print(type(m))


End of notebook 02.
