In this notebook, you should implement a first working version of a machine learning model that predicts the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [None]:
# Put the parent of the current working directory at the front of sys.path
# so that modules living one level above this notebook can be imported.
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [None]:
import pickle as pkl
from pathlib import Path
from typing import Any, Optional, Tuple, Union

import mlflow
import pandas as pd
from mlflow import MlflowClient
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.modelling.config import NUMERICAL_COLS


In [None]:
# Create an MLflow client with its default configuration.
client = MlflowClient()
# Display available experiments (one line per experiment: ID and name)
experiments = client.search_experiments()
for exp in experiments:
    print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}")

In [None]:
# Load the raw abalone data; the path is relative to the current working directory.
df = pd.read_csv("../data/abalone.csv")
# The "Rings" target stays in `df` here; preprocess_data separates it from
# the features when called with with_target=True.

In [None]:
# Preprocessing functions for the abalone dataset.


def create_preprocessor() -> ColumnTransformer:
    """Build an unfitted preprocessor that encodes 'Sex' and scales numerical columns.

    The 'Sex' column is one-hot encoded (first category dropped, dense output),
    the columns listed in NUMERICAL_COLS are standardised, and every other
    column is passed through unchanged.

    Returns:
        ColumnTransformer: The unfitted preprocessor.
    """
    sex_encoder = OneHotEncoder(drop="first", sparse_output=False)
    numeric_scaler = StandardScaler()

    return ColumnTransformer(
        transformers=[
            ("onehot", sex_encoder, ["Sex"]),
            ("scaler", numeric_scaler, NUMERICAL_COLS),
        ],
        remainder="passthrough",
    )


def preprocess_data(
    df: pd.DataFrame,
    preprocessor: Optional[ColumnTransformer] = None,
    with_target: bool = True,
) -> Tuple[pd.DataFrame, Optional[pd.Series], ColumnTransformer]:
    """Preprocess the abalone dataset using a unified preprocessor.

    Spaces in column names are replaced by underscores, the target column
    'Rings' is optionally split off, and the remaining columns are run through
    the preprocessor. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The input dataframe with all columns.
        preprocessor (Optional[ColumnTransformer], optional): An already fitted preprocessor to
            apply. If None, a new preprocessor is created and fitted on `df`. Defaults to None.
        with_target (bool, optional): Whether to separate the target variable 'Rings' from the
            features. Defaults to True.

    Returns:
        Tuple[pd.DataFrame, Optional[pd.Series], ColumnTransformer]: The preprocessed features
            (indexed like the input), the target (None when with_target is False), and the
            preprocessor that was used.

    Raises:
        KeyError: If with_target is True and `df` has no 'Rings' column.
    """
    # rename() returns a new dataframe, so the caller's frame is left untouched.
    df = df.rename(columns=lambda x: x.replace(" ", "_"))

    if with_target:
        y = df["Rings"]
        X = df.drop("Rings", axis=1)
    else:
        y = None
        X = df

    if preprocessor is None:
        preprocessor = create_preprocessor()
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)

    # Rebuild the output column names in the order the transformer emits them:
    # each named transformer in turn, then any passed-through remainder columns.
    feature_names = []
    consumed = set()
    keep_remainder = False
    for name, transformer, columns in preprocessor.transformers_:
        if name == "remainder":
            # A remainder that is not dropped contributes columns to the output,
            # so its names must be included or the column count will not match.
            keep_remainder = not (
                isinstance(transformer, str) and transformer == "drop"
            )
            continue
        consumed.update(columns)
        if name == "onehot":
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            feature_names.extend(columns)

    if keep_remainder:
        # Remainder columns are the input columns no named transformer used,
        # kept in their original order.
        feature_names.extend(col for col in X.columns if col not in consumed)

    X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

    return X_processed_df, y, preprocessor


In [None]:
def train_model(x: pd.DataFrame, y: pd.Series) -> LinearRegression:
    """Fit a Linear Regression model on the given features and target.

    Args:
        x (pd.DataFrame): The input features.
        y (pd.Series): The target variable.

    Returns:
        LinearRegression: The fitted Linear Regression model.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(x, y)


def evaluate_model(model: LinearRegression, x: pd.DataFrame, y: pd.Series) -> float:
    """Compute the Mean Squared Error (MSE) of the model on the given data.

    Args:
        model (LinearRegression): The trained Linear Regression model.
        x (pd.DataFrame): The input features to predict on.
        y (pd.Series): The true target values.

    Returns:
        float: The Mean Squared Error between `y` and the model's predictions for `x`.
    """
    predictions = model.predict(x)
    return mean_squared_error(y, predictions)

In [None]:
def save_to_pickle(obj: Any, file_path: Union[str, Path]) -> None:
    """Save an object to a specified file path in pickle format.

    Missing parent directories are created. An existing file at the target
    path is overwritten.

    Args:
        obj (Any): The object to pickle.
        file_path (Union[str, Path]): The file path where the object will be saved.

    Returns:
        None
    """
    # Normalise once into a separate name so the parameter keeps its declared type.
    target = Path(file_path)
    # Create parent directories if they don't exist
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open("wb") as f:
        pkl.dump(obj, f)

In [None]:
mlflow.set_experiment("abalone-age-experiment")

# Split settings are defined once so the values used for the split and the
# values logged to MLflow cannot drift apart.
test_size = 0.2
random_state = 42

with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Split the raw rows first, so the held-out rows never influence the
    # statistics learned by the scaler and the encoder (no data leakage).
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Fit the preprocessor on the training rows only, then reuse it for the test rows.
    x_train, y_train, preprocessor = preprocess_data(
        df_train, preprocessor=None, with_target=True
    )
    x_test, y_test, preprocessor = preprocess_data(
        df_test, preprocessor=preprocessor, with_target=True
    )

    model = train_model(x_train, y_train)

    mse = evaluate_model(model, x_test, y_test)
    print(f"MSE: {mse}")

    # Keep local copies of the fitted model and preprocessor next to the notebook.
    artifacts_path = Path("../models/")
    save_to_pickle(model, artifacts_path / "model.pkl")
    save_to_pickle(preprocessor, artifacts_path / "preprocessor.pkl")

    # Track the run: split settings, metric and the model itself.
    mlflow.log_param("test_size", test_size)
    mlflow.log_param("random_state", random_state)
    mlflow.log_metric("mse", mse)
    mlflow.sklearn.log_model(model, "model")
    model_uri = f"runs:/{run_id}/model"

    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
    registered_model = mlflow.register_model(model_uri, "abalone_regression_model")

In [None]:
# !mlflow ui --host 0.0.0.0 --port 5002