In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

### Initializing MLflow

Let's set the tracking server URI, where the experiments and runs are going to be logged. Here it refers to a local path (`../mlruns`).

In [None]:
import mlflow


# Store experiments and runs in the local "../mlruns" directory.
mlflow.set_tracking_uri(uri="../mlruns")


In [None]:
from typing import List
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error


# First pipeline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from skrub import TableVectorizer
import seaborn as sns

# === Pipeline ===
# TableVectorizer is configured to use StandardScaler for numeric columns;
# its output feeds a plain linear regression. make_pipeline names the steps
# "tablevectorizer" and "linearregression", which evaluate_cv relies on.
pipe = make_pipeline(TableVectorizer(numeric=StandardScaler()), LinearRegression())

# === Fonctions ===
def evaluate_fold(X_train, y_train, X_test, y_test, pipeline):
    """Fit the pipeline on one training split and score both splits.

    The pipeline is fitted in place via ``pipeline.fit(X_train, y_train)``,
    so the object passed in is modified by this call.

    Returns:
        A ``(rmse_train, rmse_test)`` tuple: the root mean squared error of
        the fitted pipeline on the training split and on the test split.
    """
    pipeline.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    rmse_train, rmse_test = (
        root_mean_squared_error(target, pipeline.predict(features))
        for features, target in ((X_train, y_train), (X_test, y_test))
    )
    return rmse_train, rmse_test


def plot_coefficients_variability(coef_list: List[np.ndarray], feature_names: List[str]):
    """Plot how the fitted coefficients vary across cross-validation folds.

    The figure has two side-by-side panels:
    - left: one box per feature showing the spread of its coefficient
      across folds;
    - right: a heatmap of the correlation between the folds' coefficient
      vectors (``np.corrcoef`` treats each row, i.e. each fold, as a variable).

    Args:
        coef_list: One 1-D coefficient array per fold; they are stacked into
            an ``(n_folds, n_features)`` array.
        feature_names: Labels for the boxplot x-axis, one per feature.

    Returns:
        The matplotlib figure holding both panels.
    """
    stacked_coefs = np.vstack(coef_list)  # rows = folds, columns = features

    figure, (box_axis, heat_axis) = plt.subplots(1, 2, figsize=(16, 6))

    # ---- Left panel: per-feature dispersion ----
    box_style = dict(facecolor="#add8e6", color="#007acc")
    median_style = dict(color="red")
    box_axis.boxplot(
        stacked_coefs,
        vert=True,
        patch_artist=True,
        boxprops=box_style,
        medianprops=median_style,
    )
    box_axis.set_title("Variabilité des coefficients (Boxplot)")
    box_axis.set_xlabel("Features")
    box_axis.set_ylabel("Coefficient Value")
    # Boxplot positions are 1-based, so the ticks start at 1.
    tick_positions = range(1, len(feature_names) + 1)
    box_axis.set_xticks(tick_positions)
    box_axis.set_xticklabels(feature_names, rotation=45, ha="right", fontsize=9)

    # ---- Right panel: fold-to-fold correlation ----
    fold_correlation = np.corrcoef(stacked_coefs)
    sns.heatmap(fold_correlation, cmap="coolwarm", center=0, annot=False, ax=heat_axis)
    heat_axis.set_title("Corrélation entre les coefficients des folds")
    heat_axis.set_xlabel("Fold index")
    heat_axis.set_ylabel("Fold index")

    plt.tight_layout()
    return figure

def evaluate_cv(pipeline, X, y, n_splits=5, n_repeats=2, random_state=42, verbose=False):
    """Score the pipeline with repeated K-fold cross-validation.

    For every split produced by ``RepeatedKFold`` the pipeline is refitted
    (in place, through ``evaluate_fold``) and its train/test RMSE and linear
    regression coefficients are collected.

    Args:
        pipeline: Pipeline exposing ``named_steps`` with a "linearregression"
            step and a "tablevectorizer" step.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target, indexed positionally with ``.iloc``.
        n_splits: Number of folds per repetition.
        n_repeats: Number of repetitions of the K-fold split.
        random_state: Seed passed to ``RepeatedKFold``.
        verbose: When true, print the mean ± std of the train and test RMSE.

    Returns:
        A ``(results, fig)`` tuple where ``results`` maps "rmse_train_mean",
        "rmse_train_std", "rmse_test_mean" and "rmse_test_std" to their values
        and ``fig`` is the coefficient-variability figure.
    """
    splitter = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    train_scores = []
    test_scores = []
    fold_coefs = []

    for train_idx, test_idx in splitter.split(X):
        fold_train_rmse, fold_test_rmse = evaluate_fold(
            X.iloc[train_idx],
            y.iloc[train_idx],
            X.iloc[test_idx],
            y.iloc[test_idx],
            pipeline,
        )
        train_scores.append(fold_train_rmse)
        test_scores.append(fold_test_rmse)
        # Keep this fold's LinearRegression coefficients as a flat vector.
        regressor = pipeline.named_steps["linearregression"]
        fold_coefs.append(regressor.coef_.ravel())

    # Feature names are read from the vectorizer as fitted on the last fold.
    vectorizer = pipeline.named_steps["tablevectorizer"]
    feature_names = vectorizer.get_feature_names_out()

    results = {
        "rmse_train_mean": np.mean(train_scores),
        "rmse_train_std": np.std(train_scores),
        "rmse_test_mean": np.mean(test_scores),
        "rmse_test_std": np.std(test_scores),
    }

    fig = plot_coefficients_variability(fold_coefs, feature_names)

    if verbose:
        print(f"Train RMSE: {results['rmse_train_mean']:.4f} ± {results['rmse_train_std']:.4f}")
        print(f"Test RMSE:  {results['rmse_test_mean']:.4f} ± {results['rmse_test_std']:.4f}")

    return results, fig

def load_data():
    """Read the abalone CSV and split it into features and target.

    The file is read from the relative path "../data/abalone.csv".

    Returns:
        A ``(X, y)`` tuple: ``X`` is the table without the "Rings" column and
        ``y`` is a single-column DataFrame containing only "Rings".
    """
    abalone = pd.read_csv("../data/abalone.csv")
    target = abalone[["Rings"]]  # double brackets keep y as a DataFrame
    features = abalone.drop(columns=["Rings"])
    return features, target

# === Execution ===
# Load the dataset, cross-validate the pipeline, and display the figure.
X, y = load_data()
results, fig = evaluate_cv(pipeline=pipe, X=X, y=y, verbose=True)
plt.show()


In [None]:
import mlflow
from mlflow import sklearn

import warnings
warnings.filterwarnings("ignore")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# Set the experiment name
mlflow_experiment_path = "/mlflow/linear_reg_abalone"
mlflow.set_experiment(mlflow_experiment_path)

# Enable scikit-learn autologging, but do not serialize the model or the
# datasets automatically.
mlflow.sklearn.autolog(log_models=False, log_datasets=False)

# Start a run: tag it, record the dataset size, then log the CV metrics
# and the coefficient-variability figure.
with mlflow.start_run() as run:

    mlflow.set_tag("user", "Big_Bouzzz")

    X, y = load_data()
    mlflow.log_param("n_samples", X.shape[0])
    mlflow.log_param("n_features", X.shape[1])

    # Bind the metrics to `results`, the name the logging loop reads, so this
    # run logs its own scores rather than a stale global from an earlier cell.
    results, fig = evaluate_cv(pipe, X, y, verbose=True)
    for key, value in results.items():
        mlflow.log_metric(key, value)

    # Log the coefficient variability plot
    mlflow.log_figure(fig, "coefficient_variability.png")


The model is clearly not very good, but that is not the goal here.

We will improve it if time allows.

If the model is satisfactory, we stage it as production using the appropriate version. This will help us retrieve it for predictions.

In [None]:

# Evaluate the pipeline, then fit it on the full dataset and register it.
with mlflow.start_run() as run:
    X, y = load_data()

    # Log the cross-validated performance first: evaluate_cv refits `pipe`
    # in place on each fold's training subset.
    results, fig = evaluate_cv(pipe, X, y)
    for key, value in results.items():
        mlflow.log_metric(key, value)

    # Fit on the whole dataset AFTER cross-validation so that the model being
    # registered is trained on all the data, not on the last CV fold only.
    pipe.fit(X, y)

    # Log the model under a stable registered name
    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path="model",
        registered_model_name="LinearRegression_Abalone"
    )


In [None]:
from mlflow.client import MlflowClient

client = MlflowClient()
model_name = "LinearRegression_Abalone"

# Promote the most recently registered version rather than a hard-coded 1:
# each run that logs the model with `registered_model_name` adds a version,
# so a fixed number goes stale as soon as the model is logged again.
registered_versions = client.search_model_versions(f"name='{model_name}'")
if not registered_versions:
    raise RuntimeError(
        f"No registered version found for model '{model_name}'; "
        "log the model before promoting it."
    )
production_version = max(int(mv.version) for mv in registered_versions)

# NOTE(review): model stages are reported as deprecated in recent MLflow
# releases in favour of aliases — confirm against the installed version.
client.transition_model_version_stage(
    name=model_name, version=production_version, stage="Production"
)