In this notebook, you should implement a first working version of a machine learning model that predicts the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments is of interest and should be pushed.

# Imports

In [11]:
import mlflow
import mlflow.sklearn
import pandas as pd
import kagglehub
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from mlflow.models.signature import infer_signature
import os

# Data

In [3]:
# Fetch the Abalone dataset via kagglehub; the returned value is used below
# as the directory that contains the CSV file.
path = kagglehub.dataset_download("rodolfomendes/abalone-dataset")
csv_path = os.path.join(path, "abalone.csv")

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Modelling

In [4]:
# One-hot encoding the categorical variables
# `df` is rebound to the encoded frame, after which the 'Sex' column no longer
# exists. The membership check turns a second execution of this cell into a
# no-op instead of a KeyError raised by get_dummies.
if 'Sex' in df.columns:
    df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

In [5]:
# Separate the predictors from the target: 'Rings' is what the models predict,
# every other column of `df` is used as a feature.
y = df.loc[:, 'Rings']
X = df.drop('Rings', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardize the features. The scaler is fitted on the training split only
# and the fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Defining the linear regression model
model = LinearRegression()

# Re-attach the feature names to the scaled matrices. Fitting, predicting,
# the signature and the input example then all use the same named-column
# schema. Previously the signature was inferred from an unnamed array while
# the input example was a DataFrame with named columns.
X_train_named = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_named = pd.DataFrame(X_test_scaled, columns=X.columns)

# Track the experiment using MLflow
with mlflow.start_run(run_name="Linear Regression Experiment"):

    # Fit the model
    model.fit(X_train_named, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_named)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log parameters and metrics to MLflow
    mlflow.log_param("model", "Linear Regression")
    mlflow.log_param("scaling", "StandardScaler")
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2_score", r2)

    # Create an input example: the first 5 scaled test rows
    input_example = X_test_named.head(5)

    # Infer model signature (schema) from the named frame, so it matches the
    # columns of the input example
    signature = infer_signature(X_test_named, y_pred)

    # Log the model with input example and signature
    mlflow.sklearn.log_model(model, "linear_regression_model", signature=signature, input_example=input_example)

    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Mean Squared Error: 4.891232447128579
R^2 Score: 0.5481628137889263


In [13]:
# Defining the XGBoost regression model (seeded for reproducible training)
model = XGBRegressor(random_state=42)

# Re-attach the feature names to the scaled matrices. Fitting, predicting,
# the signature and the input example then all use the same named-column
# schema. Previously the signature was inferred from an unnamed array while
# the input example was a DataFrame with named columns.
X_train_named = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_named = pd.DataFrame(X_test_scaled, columns=X.columns)

# Track the experiment using MLflow
with mlflow.start_run(run_name="XGBoost Regressor Experiment"):

    # Fit the model
    model.fit(X_train_named, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_named)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log parameters and metrics to MLflow
    mlflow.log_param("model", "XGBoost Regressor")
    mlflow.log_param("scaling", "StandardScaler")
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2_score", r2)

    # Create an input example: the first 5 scaled test rows
    input_example = X_test_named.head(5)

    # Infer model signature (schema) from the named frame, so it matches the
    # columns of the input example
    signature = infer_signature(X_test_named, y_pred)

    # Log the model with input example and signature
    # NOTE(review): the model is logged through the sklearn flavor; MLflow also
    # provides an xgboost flavor. Confirm which one is intended for loading.
    mlflow.sklearn.log_model(model, "xgboost_regressor_model", signature=signature, input_example=input_example)

    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Mean Squared Error: 5.437235685045327
R^2 Score: 0.49772469428649535
