In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [None]:
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle as pkl
from sklearn.preprocessing import StandardScaler
import os
from typing import Tuple, Optional
import mlflow
from mlflow import MlflowClient
import mlflow.sklearn


In [None]:
# Connect to the MLflow tracking backend through the client API.
client = MlflowClient()
# Print the ID and name of every experiment the client can find.
experiments = client.search_experiments()
for exp in experiments:
    summary = f"Experiment ID: {exp.experiment_id}, Name: {exp.name}"
    print(summary)

In [None]:
# Load the raw abalone dataset (relative path, resolved against the current working directory).
df = pd.read_csv("../data/abalone.csv")
# The 'Rings' target is intentionally left in df; it is split off from the
# features later in the pipeline.

In [None]:
def onehot(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the categorical 'Sex' column with one-hot indicator columns.

    A new encoder is fitted on the given dataframe on every call, and the
    indicator column of the first category is dropped. The input dataframe
    is not modified.

    Args:
        df (pd.DataFrame): The input dataframe; must contain a 'Sex' column.

    Returns:
        pd.DataFrame: The remaining columns followed by the 'Sex' indicator columns.
    """
    encoder = OneHotEncoder(drop="first", sparse_output=False)
    # Keep the original index so the concat below aligns row by row.
    indicators = pd.DataFrame(
        encoder.fit_transform(df[["Sex"]]),
        columns=encoder.get_feature_names_out(["Sex"]),
        index=df.index,
    )
    remaining = df.drop(columns="Sex")
    return pd.concat([remaining, indicators], axis=1)


def scale(
    df: pd.DataFrame, scaler: Optional[StandardScaler] = None
) -> Tuple[pd.DataFrame, StandardScaler]:
    """Standardise the numerical abalone measurements.

    Works on a copy of the input, so the caller's dataframe is left untouched.
    Only the seven measurement columns are scaled; every other column is
    passed through unchanged.

    Args:
        df (pd.DataFrame): The input dataframe containing the measurement columns.
        scaler (Optional[StandardScaler], optional): An already fitted scaler to apply.
            When None, a new StandardScaler is fitted on ``df``. Defaults to None.

    Returns:
        Tuple[pd.DataFrame, StandardScaler]: The scaled copy of the dataframe and
            the scaler that was applied (the one passed in, or the newly fitted one).
    """
    measurement_columns = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
    ]
    scaled = df.copy()
    measurements = scaled[measurement_columns]

    # A provided scaler is only applied, never re-fitted.
    if scaler is not None:
        scaled[measurement_columns] = scaler.transform(measurements)
        return scaled, scaler

    fitted = StandardScaler()
    scaled[measurement_columns] = fitted.fit_transform(measurements)
    return scaled, fitted


def preprocess_data(
    df: pd.DataFrame, scaler: Optional[StandardScaler] = None, with_target: bool = True
) -> Tuple[pd.DataFrame, Optional[pd.Series], StandardScaler]:
    """Run the full preprocessing pipeline: one-hot encode, split off the target, scale.

    Args:
        df (pd.DataFrame): The input dataframe with all columns.
        scaler (Optional[StandardScaler], optional): An existing scaler to reuse.
            When None, a new one is fitted on the features. Defaults to None.
        with_target (bool, optional): Whether ``df`` holds the 'Rings' target, which
            is then removed from the features and returned separately. Defaults to True.

    Returns:
        Tuple[pd.DataFrame, Optional[pd.Series], StandardScaler]: The preprocessed
            features, the target (None when ``with_target`` is False), and the scaler used.
    """
    encoded = onehot(df.copy())

    target = None
    if with_target:
        # The target must leave the frame before scaling so only features remain.
        target = encoded["Rings"]
        encoded = encoded.drop(columns="Rings")

    features, used_scaler = scale(encoded, scaler)
    return features, target, used_scaler


def train_model(x: pd.DataFrame, y: pd.Series) -> LinearRegression:
    """Fit a Linear Regression model with default settings.

    Args:
        x (pd.DataFrame): The training features.
        y (pd.Series): The training target.

    Returns:
        LinearRegression: The fitted model.
    """
    regressor = LinearRegression()
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return regressor.fit(x, y)


def evaluate_model(model: LinearRegression, x: pd.DataFrame, y: pd.Series) -> float:
    """Compute the Mean Squared Error (MSE) of the model on the given data.

    Args:
        model (LinearRegression): The trained Linear Regression model.
        x (pd.DataFrame): The features to predict from.
        y (pd.Series): The true target values for ``x``.

    Returns:
        float: The Mean Squared Error between ``y`` and the model's predictions.
    """
    predictions = model.predict(x)
    return mean_squared_error(y_true=y, y_pred=predictions)


def save_model(model: BaseEstimator, artifacts_path: str) -> None:
    """Pickle the trained model to ``model.pkl`` inside ``artifacts_path``.

    The target directory (including missing parents) is created first, so
    saving into a directory that does not exist yet no longer fails. An
    existing ``model.pkl`` is overwritten.

    Args:
        model (BaseEstimator): The trained model to save.
        artifacts_path (str): The directory path where the model will be saved.
            An empty string means the current working directory.

    Returns:
        None
    """
    # os.makedirs("") raises, and an empty path already refers to the cwd.
    if artifacts_path:
        os.makedirs(artifacts_path, exist_ok=True)
    with open(os.path.join(artifacts_path, "model.pkl"), "wb") as f:
        pkl.dump(model, f)


def save_scaler(scaler: StandardScaler, artifacts_path: str) -> None:
    """Pickle the scaler to ``scaler.pkl`` inside ``artifacts_path``.

    The target directory (including missing parents) is created first, so
    saving into a directory that does not exist yet no longer fails. An
    existing ``scaler.pkl`` is overwritten.

    Args:
        scaler (StandardScaler): The scaler to save.
        artifacts_path (str): The directory path where the scaler will be saved.
            An empty string means the current working directory.

    Returns:
        None
    """
    # os.makedirs("") raises, and an empty path already refers to the cwd.
    if artifacts_path:
        os.makedirs(artifacts_path, exist_ok=True)
    with open(os.path.join(artifacts_path, "scaler.pkl"), "wb") as f:
        pkl.dump(scaler, f)


In [None]:
mlflow.set_experiment("abalone-age-experiment")

# Split settings are named once so the values used for the split and the
# values logged to MLflow cannot drift apart.
TEST_SIZE = 0.2
RANDOM_STATE = 42

with mlflow.start_run() as run:
    run_id = run.info.run_id

    # One-hot encoding only maps categories to indicator columns, so doing it
    # on the full frame before the split keeps train and test columns aligned.
    encoded = onehot(df)
    y = encoded["Rings"]
    x = encoded.drop("Rings", axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so test-set statistics never leak into the preprocessing.
    x_train, scaler = scale(x_train)
    x_test = scale(x_test, scaler)[0]

    model = train_model(x_train, y_train)

    mse = evaluate_model(model, x_test, y_test)
    print(f"MSE: {mse}")

    # Make sure the local artifacts directory exists before pickling into it.
    artifacts_path = "../models/"
    os.makedirs(artifacts_path, exist_ok=True)
    save_model(model, artifacts_path)
    save_scaler(scaler, artifacts_path)

    mlflow.log_param("test_size", TEST_SIZE)
    mlflow.log_param("random_state", RANDOM_STATE)
    mlflow.log_metric("mse", mse)
    mlflow.sklearn.log_model(model, "model")
    model_uri = f"runs:/{run_id}/model"

    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
    registered_model = mlflow.register_model(model_uri, "abalone_regression_model")

In [None]:
# !mlflow ui --host 0.0.0.0 --port 5002