# Baseline Model — Linear Regression with MLflow

> **Goal:** Train a simple baseline model (Linear Regression) to predict Abalone age, and **track the experiment with MLflow**.  
> Do not commit MLflow artifacts (the local `mlruns_local/` tracking folder — add it to `.gitignore`); only the code belongs in the repo.

In [3]:
# --- Setup ---
import os, pathlib, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn

# NOTE(review): this silences every Python warning for the rest of the session;
# consider narrowing it to specific categories once the noisy source is known.
warnings.filterwarnings("ignore")

# Paths & data
# The dataset location is configurable so the notebook runs on any machine:
#   * set the ABALONE_CSV environment variable to the CSV path, or
#   * place the file at data/abalone.csv relative to the working directory.
DATA_PATH = pathlib.Path(os.environ.get("ABALONE_CSV", "data/abalone.csv")).expanduser()
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Abalone dataset not found at '{DATA_PATH}'. "
        "Set the ABALONE_CSV environment variable or copy the file to data/abalone.csv."
    )
df = pd.read_csv(DATA_PATH)

# Target: Age ≈ Rings + 1.5
if "Rings" not in df.columns:
    raise ValueError("Expected target column 'Rings' not found in dataset.")
df["Age"] = df["Rings"] + 1.5

# Basic feature selection: every column except the target and the column it is derived from.
y = df["Age"].to_numpy()
X = df.drop(columns=["Age", "Rings"])  # drop() already returns a new frame

# One-hot encode categoricals (e.g., Sex) via pandas.get_dummies for a simple baseline.
# drop_first=True removes one level per categorical to avoid a redundant indicator column.
X = pd.get_dummies(X, drop_first=True)

print("X shape:", X.shape, "y shape:", y.shape)
X.head()

X shape: (4177, 9) y shape: (4177,)


Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,False,True
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,False,True
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,False,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,False,True
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,True,False


In [4]:
# --- Train/valid split ---
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple baseline pipeline: scale features -> ordinary least squares.
# with_mean=False scales each feature without centering it. X comes from
# pd.get_dummies in the setup cell, which returns a dense frame by default, so
# this setting is a modelling choice here rather than a sparse-input requirement.
baseline_steps = [
    ("scaler", StandardScaler(with_mean=False)),
    ("linreg", LinearRegression()),
]
pipe = Pipeline(steps=baseline_steps)

In [5]:
# --- MLflow tracking ---
# Use a local folder for tracking; ensure it's ignored by git (.gitignore)
os.makedirs("mlruns_local", exist_ok=True)
mlflow.set_tracking_uri("file:./mlruns_local")
mlflow.set_experiment("abalone-baseline")

with mlflow.start_run(run_name="linear_regression_baseline"):
    # Read model/scaler settings from the pipeline itself so the logged params
    # cannot drift from what is actually trained when the pipeline is edited.
    # random_state and test_size are not recoverable from the split arrays, so
    # they must be kept in sync with the train_test_split call by hand.
    mlflow.log_params({
        "model": type(pipe.named_steps["linreg"]).__name__,
        "scaler_with_mean": pipe.named_steps["scaler"].with_mean,
        "random_state": 42,
        "test_size": 0.2,
    })

    # Train
    pipe.fit(X_train, y_train)

    # Evaluate on the held-out validation split
    y_pred = pipe.predict(X_valid)
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_pred)))
    mae  = float(mean_absolute_error(y_valid, y_pred))
    r2   = float(r2_score(y_valid, y_pred))

    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Save model artifact together with its input/output schema so the logged
    # model documents (and can validate) the columns it expects.
    signature = mlflow.models.infer_signature(X_valid, y_pred)
    mlflow.sklearn.log_model(pipe, artifact_path="model", signature=signature)

    print(f"RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")

2025/10/23 11:16:58 INFO mlflow.tracking.fluent: Experiment with name 'abalone-baseline' does not exist. Creating a new experiment.


RMSE: 2.2116 | MAE: 1.5931 | R2: 0.5482



## How to compare experiments
- Tweak features (e.g., include/exclude variables), preprocessing, or try Ridge/Lasso.
- Re-run this notebook; each execution is recorded as a new run under the `abalone-baseline` experiment in the MLflow UI.

```bash
mlflow ui --backend-store-uri file:./mlruns_local
# Open the shown URL in your browser to compare runs.
```
