In [5]:
# ============================================================
# Linear Regression on Abalone Dataset with Scaler Comparison
# Logs multiple runs to MLflow (stored in notebooks/mlruns)
# ============================================================


import os
from pathlib import Path

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [6]:
# ---------- Configuration ----------
# Seed passed to every train/test split so the runs are comparable.
RANDOM_STATE = 42
# Location of the cleaned abalone CSV, relative to the working directory.
DATA_PATH = Path("../data", "abalone_clean.csv")

In [7]:
# Read the CSV and prepare it in a single chain:
#   1. lower-case / trim the column names so "Sex" and "sex" are the same key,
#   2. one-hot encode `sex`, dropping the first level to avoid a redundant column.
df = (
    pd.read_csv(DATA_PATH)
    .rename(columns=lambda name: name.strip().lower())
    .pipe(pd.get_dummies, columns=["sex"], drop_first=True)
)

In [14]:
# Target is the ring count; the features are every other column except `age`.
# NOTE(review): `age` is presumably derived from `rings` and is dropped to
# avoid target leakage — confirm against the data-cleaning step.
y = df.loc[:, "rings"]
X = df.drop(["rings", "age"], axis=1)

In [15]:
# Split the feature names into the one-hot `sex_*` indicators and everything
# else; the two groups are preprocessed differently further down.
dummy_cols = list(filter(lambda name: name.startswith("sex_"), X.columns))
num_cols = list(filter(lambda name: name not in dummy_cols, X.columns))

In [16]:
# ---------- Experiment Grid ----------
# Each scaler is keyed by its class name; the extra "None" entry holds the
# "passthrough" marker, which leaves the numeric columns unscaled.
scalers = {
    scaler_cls.__name__: scaler_cls()
    for scaler_cls in (StandardScaler, MinMaxScaler, RobustScaler)
}
scalers["None"] = "passthrough"

# Held-out fractions evaluated for every scaler.
test_sizes = [0.2, 0.3]

In [17]:
# ---------- MLflow Setup ----------
# Select the experiment that all runs started below are logged under.
mlflow.set_experiment(experiment_name="Abalone_Age_Prediction_Multiple_Runs")

<Experiment: artifact_location='file:///Users/nandanasreeraj/mlopsproject/xhec-mlops-2025-project/notebooks/mlruns/530722836504791793', creation_time=1761225621947, experiment_id='530722836504791793', last_update_time=1761225621947, lifecycle_stage='active', name='Abalone_Age_Prediction_Multiple_Runs', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [18]:
# ---------- Training & Logging Loop ----------
# One MLflow run per (scaler, test_size) pair. Each run fits a
# ColumnTransformer + LinearRegression pipeline, then logs its parameters,
# test-set metrics, the fitted model and a table of coefficients.
#
# NOTE: ordinary least squares (with an intercept) is invariant to affine
# rescaling of its inputs, so for a given split every scaler is expected to
# produce the same predictions (up to floating-point error) and therefore the
# same MAE / MSE / R². Only the logged coefficients differ between scalers.

# The split depends only on `test_size`, so compute it once per ratio instead
# of once per (scaler, test_size) pair.
splits = {
    test_size: train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE
    )
    for test_size in test_sizes
}

for scaler_name, scaler in scalers.items():
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = splits[test_size]

        run_name = f"LR | scaler={scaler_name} | test_size={test_size}"
        with mlflow.start_run(run_name=run_name):
            # Scale only the numeric columns; the one-hot `sex_*` indicators
            # are forwarded unchanged.
            preproc = ColumnTransformer(
                transformers=[
                    ("scale_num", scaler, num_cols),
                    ("pass_dummies", "passthrough", dummy_cols),
                ]
            )

            pipe = Pipeline(
                steps=[("preprocess", preproc), ("model", LinearRegression())]
            )

            pipe.fit(X_train, y_train)

            y_pred = pipe.predict(X_test)

            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            print(f"Scaler: {scaler_name}, Test size: {test_size}")
            print(f"MAE: {mae:.3f}, MSE: {mse:.3f}, R²: {r2:.3f}\n")

            mlflow.log_params(
                {
                    "scaler": scaler_name,
                    "test_size": test_size,
                    "model_type": "LinearRegression",
                    "random_state": RANDOM_STATE,
                }
            )
            mlflow.log_metrics({"MAE": mae, "MSE": mse, "R2": r2})

            mlflow.sklearn.log_model(
                sk_model=pipe,
                name="linear_regression_model",
                input_example=X_test.head(5),
            )

            # Log the coefficients next to their feature names. Only the
            # feature-name derivation is treated as best-effort: the artifact
            # upload itself sits outside the `try`, so a tracking-store failure
            # surfaces as such instead of being reported as a naming problem.
            coef = pipe.named_steps["model"].coef_
            try:
                feature_names = pipe.named_steps["preprocess"].get_feature_names_out()
                coef_df = pd.DataFrame({"feature": feature_names, "coefficient": coef})
            except Exception as e:
                # Fall back to the raw coefficient list so the run still
                # records the fitted weights.
                mlflow.log_text(
                    f"Could not derive feature names: {e}\nCoefficients: {coef.tolist()}",
                    "feature_coefficients.txt",
                )
            else:
                mlflow.log_text(coef_df.to_csv(index=False), "feature_coefficients.csv")

Scaler: StandardScaler, Test size: 0.2
MAE: 1.550, MSE: 4.461, R²: 0.549

Scaler: StandardScaler, Test size: 0.3
MAE: 1.541, MSE: 4.533, R²: 0.539

Scaler: MinMaxScaler, Test size: 0.2
MAE: 1.550, MSE: 4.461, R²: 0.549

Scaler: MinMaxScaler, Test size: 0.3
MAE: 1.541, MSE: 4.533, R²: 0.539

Scaler: RobustScaler, Test size: 0.2
MAE: 1.550, MSE: 4.461, R²: 0.549

Scaler: RobustScaler, Test size: 0.3
MAE: 1.541, MSE: 4.533, R²: 0.539

Scaler: None, Test size: 0.2
MAE: 1.550, MSE: 4.461, R²: 0.549

Scaler: None, Test size: 0.3
MAE: 1.541, MSE: 4.533, R²: 0.539



In [19]:
# ---------- Tracking Diagnostics ----------
# Print where MLflow is reading from / writing to. `os` and `mlflow` come from
# the import cell at the top of the notebook, so nothing is re-imported here.
print("CWD:", os.getcwd())
print("Tracking URI:", mlflow.get_tracking_uri())

# Inspect the most recent run rather than opening a new one: calling
# `mlflow.start_run(nested=True)` here would register an extra, empty run in
# the experiment every time this cell is executed.
last_run = mlflow.last_active_run()
if last_run is None:
    print("Artifact URI (run context): <no run has been started in this session>")
else:
    print("Artifact URI (run context):", last_run.info.artifact_uri)

CWD: /Users/nandanasreeraj/mlopsproject/xhec-mlops-2025-project/notebooks
Tracking URI: file:///Users/nandanasreeraj/mlopsproject/xhec-mlops-2025-project/notebooks/mlruns
Artifact URI (run context): file:///Users/nandanasreeraj/mlopsproject/xhec-mlops-2025-project/notebooks/mlruns/530722836504791793/e2a7f7a019c9480db477f205e9c63d28/artifacts
