In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code needed to run the experiments should be pushed.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pkl
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# Load the abalone table, keep the 'Rings' column aside as the regression
# target and leave every other column in `df` as the feature frame.
df = pd.read_csv('../data/abalone.csv')
df_y = df.loc[:, 'Rings']
df = df.drop(columns='Rings')

In [None]:
def onehot(df):
    """Return a copy of ``df`` with the ``Sex`` column one-hot encoded.

    A new ``OneHotEncoder`` is fitted on every call (``drop="first"``, dense
    output). The original ``Sex`` column is removed and the encoded columns,
    named by the encoder, are appended on the right. The input frame is not
    modified.
    """
    encoder = OneHotEncoder(drop="first", sparse_output=False)
    dummies = pd.DataFrame(
        encoder.fit_transform(df[['Sex']]),
        columns=encoder.get_feature_names_out(['Sex']),
        index=df.index,  # keep row alignment with the source frame
    )
    remaining = df.drop(columns='Sex')
    return pd.concat([remaining, dummies], axis=1)

def scale(df, scaler):
    """Standardise the numerical abalone measurements of ``df``.

    Args:
        df: DataFrame containing the seven numerical measurement columns
            listed in ``numerical_cols`` below. Other columns are passed
            through untouched.
        scaler: A scaler exposing ``fit``/``transform``, or ``None``. With
            ``None`` a new ``StandardScaler`` is created. A scaler that has
            not been fitted yet is fitted on ``df``; an already fitted scaler
            is applied as-is.

    Returns:
        A ``(scaled_df, scaler)`` tuple. ``scaled_df`` is a copy of ``df``;
        the input frame is not modified.
    """
    numerical_cols = [
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
    ]
    df = df.copy()

    if scaler is None:
        scaler = StandardScaler()

    # Fit only when the scaler carries no learned statistics yet (`scale_` is
    # set by `fit`). Re-fitting a supplied scaler would replace the statistics
    # learned on the training data with those of whatever frame is passed in.
    if not hasattr(scaler, "scale_"):
        # Fit on the numerical columns only, i.e. the same columns that are
        # transformed below.
        scaler.fit(df[numerical_cols])

    df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, scaler
def preprocess_data(df, scaler=None, with_target=True):
    """Encode, split off the target and scale a raw abalone frame.

    Args:
        df: Raw DataFrame; it is copied before any processing.
        scaler: Passed through to ``scale``; ``None`` by default.
        with_target: When true, the ``Rings`` column is removed from the
            features and returned as ``y``; otherwise ``y`` is ``None``.

    Returns:
        ``(features, y, scaler)`` where ``features`` and ``scaler`` are the
        values returned by ``scale``.
    """
    features = onehot(df.copy())

    # Separate the target before scaling so it is never handed to the scaler.
    y = features.pop('Rings') if with_target else None

    features, scaler = scale(features, scaler)
    return features, y, scaler
        

In [None]:
# Standardise the continuous measurement columns into a separate frame,
# leaving `df` itself untouched.
numerical_cols = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight"
]
scaler = StandardScaler()
df_scaled = df.copy()
standardised_values = scaler.fit_transform(df[numerical_cols])
df_scaled[numerical_cols] = standardised_values

In [None]:
# One-hot encode the Sex column. drop="first" omits the first category's
# column, so the encoder emits one column fewer than there are categories.
# NOTE(review): the encoding starts from `df`, so the standardised values in
# `df_scaled` are not part of `df_encoded` — confirm this is intended.
sex_encoder = OneHotEncoder(drop="first", sparse_output=False)
sex_encoded = sex_encoder.fit_transform(df[['Sex']])

# Column names as generated by the fitted encoder.
sex_feature_names = sex_encoder.get_feature_names_out(['Sex'])

# Re-attach the encoded columns to the remaining features, aligned on df's index.
sex_df = pd.DataFrame(sex_encoded, columns=sex_feature_names, index=df.index)
df_encoded = pd.concat([df.drop(columns='Sex'), sex_df], axis=1)

# Quick sanity report of what the encoding did.
summary = [
    ("Original Sex column values:", df['Sex'].unique()),
    ("Encoded columns:", sex_feature_names),
    ("Shape before encoding:", df.shape),
    ("Shape after encoding:", df_encoded.shape),
]
for label, value in summary:
    print(label, value)
print("\nFirst few rows of encoded data:")
print(df_encoded.head())

In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded, df_y, test_size=0.2, random_state=42
)

In [None]:
# Fit an ordinary least-squares regressor on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict ring counts for the held-out rows.
y = model.predict(X_test)

# Report the mean squared error between the held-out targets and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y)
print(f"Mean Squared Error: {mse}")

In [None]:
#save the model
def save_model(model, filename):
    """Serialize ``model`` to ``filename`` using pickle.

    Args:
        model: Any picklable object, e.g. a fitted estimator.
        filename: Destination path. The file is created or overwritten; its
            parent directory must already exist.

    Raises:
        OSError: If ``filename`` cannot be opened for writing.

    Note:
        Pickle files can execute arbitrary code when loaded, so only load
        files written by this function from a trusted location.
    """
    # Write to the path the caller asked for rather than a fixed location.
    with open(filename, 'wb') as f:
        pkl.dump(model, f)

# Persist the fitted regressor under the repository's models directory.
path = '../models/linear_regression_model.pkl'
save_model(model, filename=path)

In [None]:
# Scatter the held-out targets (x axis) against the model's predictions
# (y axis); semi-transparent markers keep overlapping points readable.
plt.figure(figsize=(10, 6))
plt.scatter(x=y_test, y=y, alpha=0.5)
plt.title('Actual vs Predicted Rings')
plt.xlabel('Actual Rings')
plt.ylabel('Predicted Rings')
plt.show()