In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pkl
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('../data/abalone.csv')
# The "Rings" target column is deliberately kept in `df` here:
# preprocess_data() separates it from the features when with_target=True.






In [None]:
def onehot(df):
    """Return a new frame in which the ``Sex`` column is one-hot encoded.

    A fresh ``OneHotEncoder`` (first category dropped, dense output) is
    fitted on the ``Sex`` values of ``df`` on every call. ``Sex`` is removed
    and the generated indicator columns are appended after the remaining
    columns. The input frame is not modified.
    """
    encoder = OneHotEncoder(drop="first", sparse_output=False)
    indicators = pd.DataFrame(
        encoder.fit_transform(df[["Sex"]]),
        columns=encoder.get_feature_names_out(["Sex"]),
        index=df.index,  # keep row alignment with the source frame
    )
    return pd.concat([df.drop("Sex", axis=1), indicators], axis=1)

def scale(df, scaler):
    """Scale the numerical measurement columns of ``df``.

    Args:
        df: Frame containing the seven measurement columns listed below.
        scaler: An already fitted scaler to apply, or ``None`` to create a
            new ``StandardScaler`` and fit it on ``df``.

    Returns:
        A ``(scaled_frame, scaler)`` tuple. ``scaled_frame`` is a copy of
        ``df`` with the measurement columns replaced by their scaled values;
        ``scaler`` is the scaler that was applied (the one passed in, or the
        newly fitted one).
    """
    measurement_cols = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
    ]
    scaled = df.copy()  # never touch the caller's frame
    if scaler is not None:
        values = scaler.transform(scaled[measurement_cols])
    else:
        scaler = StandardScaler()
        values = scaler.fit_transform(scaled[measurement_cols])
    scaled[measurement_cols] = values
    return scaled, scaler

def preprocess_data(df, scaler=None, with_target=True):
    """Encode, split off the target, and scale a raw data frame.

    Args:
        df: Raw frame; must contain ``Sex`` and, when ``with_target`` is
            true, ``Rings``.
        scaler: Fitted scaler to reuse, or ``None`` to fit a new one.
        with_target: Whether ``df`` carries the ``Rings`` target column.

    Returns:
        ``(features, target, scaler)`` where ``target`` is the ``Rings``
        column, or ``None`` when ``with_target`` is false.
    """
    # onehot() returns a new frame, so the caller's data is left untouched.
    features = onehot(df)
    target = features.pop("Rings") if with_target else None
    features, scaler = scale(features, scaler)
    return features, target, scaler

def train_model(x, y):
    """Fit a ``LinearRegression`` on features ``x`` and target ``y`` and return it."""
    # ``fit`` returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(x, y)

def evaluate_model(model, x, y):
    """Return the mean squared error of ``model``'s predictions for ``x`` against ``y``."""
    predictions = model.predict(x)
    return mean_squared_error(y, predictions)


def save_model(model, artifacts_path):
    """Pickle ``model`` to ``model.pkl`` inside ``artifacts_path``.

    The target directory (including missing parents) is created first, so
    saving does not fail with ``FileNotFoundError`` when the artifacts folder
    does not exist yet.

    Args:
        model: Any picklable object.
        artifacts_path: Directory that will hold ``model.pkl``. An empty
            string means the current working directory.
    """
    if artifacts_path:  # os.makedirs("") raises, and "" needs no directory anyway
        os.makedirs(artifacts_path, exist_ok=True)
    with open(os.path.join(artifacts_path, 'model.pkl'), 'wb') as f:
        pkl.dump(model, f)

def save_scaler(scaler, artifacts_path):
    """Pickle ``scaler`` to ``scaler.pkl`` inside ``artifacts_path``.

    The target directory (including missing parents) is created first, so
    saving does not fail with ``FileNotFoundError`` when the artifacts folder
    does not exist yet.

    Args:
        scaler: Any picklable object.
        artifacts_path: Directory that will hold ``scaler.pkl``. An empty
            string means the current working directory.
    """
    if artifacts_path:  # os.makedirs("") raises, and "" needs no directory anyway
        os.makedirs(artifacts_path, exist_ok=True)
    with open(os.path.join(artifacts_path, 'scaler.pkl'), 'wb') as f:
        pkl.dump(scaler, f)
    


In [None]:
# Split the raw rows first so the scaler is fitted on the training rows only;
# fitting it on the full dataset would leak test-set statistics into training.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): onehot() refits its encoder on every call, so each split must
# contain every ``Sex`` category for the feature columns to line up.
x_train, y_train, scaler = preprocess_data(train_df, scaler=None, with_target=True)
# Reuse the scaler fitted on the training rows for the held-out rows.
x_test, y_test = preprocess_data(test_df, scaler=scaler, with_target=True)[:2]

model = train_model(x_train, y_train)
mse = evaluate_model(model, x_test, y_test)
print(f"Test MSE: {mse:.4f}")

# Persist the model together with the scaler it was trained with.
artifacts_path = "../models/"
save_model(model, artifacts_path)
save_scaler(scaler, artifacts_path)