In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# Import

In [28]:
from pathlib import Path
from typing import List, Optional, Tuple

import mlflow
import numpy as np
import pandas as pd
from mlflow.tracking import MlflowClient
from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Prepare Data

In [2]:
# Folder holding the input data, resolved relative to the working directory
DATA_FOLDER = Path("..") / "data"

# Location of the abalone CSV inside that folder
train_path = DATA_FOLDER.joinpath("abalone.csv")

In [22]:
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` into a DataFrame using pandas' default options."""
    frame = pd.read_csv(path)
    return frame
def compute_target(
    df: pd.DataFrame,
    rings_column: str = "Rings",
) -> pd.DataFrame:
    """Derive the ``Age`` target from the ring count and drop the ring column.

    Args:
        df: Frame containing at least ``rings_column``.
        rings_column: Name of the column holding the ring count.

    Returns:
        A new DataFrame with an ``Age`` column equal to ``rings_column + 1.5``
        appended and ``rings_column`` removed. The input frame is left
        untouched, so the calling cell can be re-run on the same raw frame.

    Raises:
        KeyError: If ``rings_column`` is not a column of ``df``.
    """
    # `assign` and `drop` both return new frames, so the caller's data is never
    # mutated; the drop uses `rings_column` so custom column names work too.
    return df.assign(Age=df[rings_column] + 1.5).drop(columns=[rings_column])
# Function to extract X and y using DictVectorizer
def extract_x_y(
    df: pd.DataFrame,
    dv: Optional[DictVectorizer] = None,
    with_target: bool = True,
    target_column: str = "Age",
) -> Tuple[csr_matrix, Optional[np.ndarray], DictVectorizer]:
    """Vectorize the feature columns of ``df`` and optionally extract the target.

    Args:
        df: Input frame; every column except ``target_column`` is used as a feature.
        dv: An already fitted ``DictVectorizer`` to reuse. When ``None``, a new
            one is created and fitted on ``df``.
        with_target: Whether to return the target values.
        target_column: Name of the target column.

    Returns:
        A tuple ``(x, y, dv)``: the transformed feature matrix, the target
        values (``None`` when ``with_target`` is false or the target column is
        absent), and the vectorizer that produced ``x``.
    """
    # NOTE(review): every non-target column becomes a feature, including any
    # index column read from the CSV (e.g. "Unnamed: 0") — confirm this is intended.
    # errors="ignore" lets frames without a target column take the same path.
    feature_records = df.drop(columns=[target_column], errors="ignore").to_dict(
        orient="records"
    )

    if dv is None:
        dv = DictVectorizer()
        dv.fit(feature_records)

    x = dv.transform(feature_records)

    y = None
    if with_target and target_column in df.columns:
        y = df[target_column].values

    return x, y, dv

In [23]:
# Read the raw CSV, then derive the Age target from it
train_df = compute_target(load_data(train_path))

In [29]:
# Vectorize the features and pull out the target
X, y, dv = extract_x_y(train_df)

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model

In [32]:
def train_model(x_train: csr_matrix, y_train: np.ndarray):
    """Fit a ``LinearRegression`` on the given features and targets and return it."""
    # `fit` returns the estimator itself, so the fitted model can be returned directly
    return LinearRegression().fit(x_train, y_train)
    
def predict_age(input_data: csr_matrix, model: LinearRegression):
    """Return the predictions of ``model`` for the rows of ``input_data``."""
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root mean squared error (RMSE) between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The RMSE as a plain Python float.
    """
    # The `squared=False` flag of `mean_squared_error` is deprecated in recent
    # scikit-learn releases; taking the square root explicitly gives the same
    # value on every version.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

In [34]:
# Fit on the training split, then score on that same split
model = train_model(X_train, y_train)
prediction = predict_age(X_train, model)
train_me = evaluate_model(y_train, prediction)
# evaluate_model returns the root of the mean squared error, so label it RMSE
print(f'{train_me} is the RMSE on the train dataset')

2.187202344131259 is the mse metrics on train dataset


In [36]:
# Score the fitted model on the held-out split
y_pred_test = predict_age(X_test, model)
test_me = evaluate_model(y_test, y_pred_test)
# This is the held-out metric (an RMSE), so the message must say "test", not "train"
print(f'{test_me} is the RMSE on the test dataset')

2.211613087121831 is the mse metrics on train dataset


# Log to MLflow

In [37]:
# Show where MLflow stores its tracking data
tracking_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{tracking_uri}'")

# Client used to query experiments and manage registered models
client = MlflowClient()
experiments = client.search_experiments()

tracking URI: 'file:///Users/danliliu/Documents/DSB_M2/MLOP/xhec-mlops-project-student/notebooks/mlruns'


In [38]:
# Display the experiments returned by the tracking client
experiments

[<Experiment: artifact_location='file:///Users/danliliu/Documents/DSB_M2/MLOP/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1698057909457, experiment_id='0', last_update_time=1698057909457, lifecycle_stage='active', name='Default', tags={}>]

In [40]:
# Set the experiment name
mlflow_experiment_path = "/mlflow/linear_reg_test"
mlflow.set_experiment(mlflow_experiment_path)

# Name under which the trained model is registered in the model registry
registered_model_name = "linear_reg_test"

# Train/test split settings; logged as run parameters so runs can be compared
test_size = 0.2
random_state = 42

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("Level", "Development")
    mlflow.set_tag("Team", "x-hec-mlop-project-Danli-1")

    # Record the settings that influence the reported metrics
    mlflow.log_params({"test_size": test_size, "random_state": random_state})

    # Prepare data
    train_df = load_data(train_path)
    train_df = compute_target(train_df)
    X, y, dv = extract_x_y(train_df)
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model on the training set
    prediction = predict_age(X_train, model)
    train_me = evaluate_model(y_train, prediction)
    mlflow.log_metric("train_me", train_me)

    # Evaluate model on test set
    y_pred_test = predict_age(X_test, model)
    test_me = evaluate_model(y_test, y_pred_test)
    mlflow.log_metric("test_me", test_me)

    # Log your model
    mlflow.sklearn.log_model(model, "models")

    # Register the logged model in the model registry
    mlflow.register_model(f"runs:/{run_id}/models", registered_model_name)

Successfully registered model 'linear_reg_test'.
2023/10/23 12:47:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: linear_reg_test, version 1
Created version '1' of model 'linear_reg_test'.


In [None]:
# Start the MLflow UI on port 5002.
# NOTE(review): this shell call appears to keep the kernel busy until it is
# interrupted, so later cells cannot run while it is active — consider
# launching it from a terminal instead.
# NOTE(review): --host 0.0.0.0 listens on all network interfaces; confirm that
# exposure is intended outside of a container or remote setup.
!mlflow ui --host 0.0.0.0 --port 5002

[2023-10-23 12:59:21 +0200] [14489] [INFO] Starting gunicorn 21.2.0
[2023-10-23 12:59:21 +0200] [14489] [INFO] Listening at: http://0.0.0.0:5002 (14489)
[2023-10-23 12:59:21 +0200] [14489] [INFO] Using worker: sync
[2023-10-23 12:59:21 +0200] [14490] [INFO] Booting worker with pid: 14490
[2023-10-23 12:59:21 +0200] [14491] [INFO] Booting worker with pid: 14491
[2023-10-23 12:59:21 +0200] [14492] [INFO] Booting worker with pid: 14492
[2023-10-23 12:59:21 +0200] [14493] [INFO] Booting worker with pid: 14493


In [41]:
# Promote the most recently registered version instead of a hard-coded one:
# every execution of the training cell registers a new version, so a fixed
# `1` would keep promoting the first model ever trained.
registered_versions = client.search_model_versions("name='linear_reg_test'")
production_version = max(int(mv.version) for mv in registered_versions)

client.transition_model_version_stage(
    name="linear_reg_test", version=production_version, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1698058043462, current_stage='Production', description=None, last_updated_timestamp=1698058051412, name='linear_reg_test', run_id='c9e118dc2bdb46a3b68ef77a2e689a59', run_link=None, source='file:///Users/danliliu/Documents/DSB_M2/MLOP/xhec-mlops-project-student/notebooks/mlruns/299678146651034789/c9e118dc2bdb46a3b68ef77a2e689a59/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

# Predict

In [43]:
# Reload the raw CSV and preview its first three rows
predict_df = load_data(train_path)
predict_df.head(n=3)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


In [46]:
# Load prediction data
predict_df = load_data(train_path)

# Apply feature engineering
predict_df = compute_target(predict_df)
# Reuse the vectorizer fitted during training so the feature columns line up
# with what the model was trained on; fitting a new one here could produce a
# different column layout on other data.
X_pred, _, _ = extract_x_y(predict_df, dv=dv, with_target=False)

# Load production model.
# The URI must use the *registered model name*, not the experiment path: the
# previous experiment-path form only resolved because its leading "/mlflow"
# segment was parsed as a URL authority and ignored.
model_uri = "models:/linear_reg_test/Production"
model = mlflow.sklearn.load_model(model_uri)

# Make predictions
y_pred = predict_age(X_pred, model)
y_pred

array([10.70433314,  9.31451885, 12.56368176, ..., 12.47000475,
       11.20685023, 12.41903985])