<a href="https://colab.research.google.com/github/ansharyis/ml-colab-project/blob/main/notebooks/03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# DATA_DIR (defined in a later cell) points inside this mount, so this cell
# has to run before any data is read.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Base directory for the project's data inside the mounted Google Drive.
# The training CSV is read from the PROCESSED/ subfolder of this directory.
DATA_DIR = "/content/drive/MyDrive/ML_Project_Data"

## Load the dataset

In [3]:
# Project: ML Weight Prediction
# Notebook: Ridge, Lasso, and Elastic Net Models
# Owner: ....
# Description: Ridge, Lasso, and Elastic Net Models, Rev A

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read the processed training set from the project data folder and report
# its (rows, columns) shape as a quick sanity check.
train_csv_path = f"{DATA_DIR}/PROCESSED/train_df_final_after_null_removal.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)



(20340, 59)


In [4]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor


## Prepare the explanatory variables, the dependent variable, and the preprocessing pipeline

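The preprocessing pipeline handles:

- **Numeric columns:** median imputation followed by standard scaling (scaling matters for the regularised linear models: Ridge, Lasso, and Elastic Net).
- **Categorical columns:** most-frequent imputation followed by one-hot encoding.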

In [7]:
TARGET = "WEIGHTLBTC_A"

# When True the model is fitted on log1p(target); evaluation later inverts
# this with expm1 so that errors are reported in pounds.
USE_LOG_TARGET = True

# -------------------------------------------------
# 1) Prepare X / y
# -------------------------------------------------
# Rows with a missing target cannot be used for supervised training.
df_model = train_df.dropna(subset=[TARGET]).copy()

# Features: every column except the target.
X = df_model.drop(columns=[TARGET])

# Target: cast to float first, then optionally move to log scale.
y = df_model[TARGET].astype(float)
if USE_LOG_TARGET:
    y = np.log1p(y)

# -------------------------------------------------
# 2) Detect numeric vs categorical columns
# -------------------------------------------------
# Columns with object/category dtype are treated as categorical; every
# remaining column (in original order) goes through the numeric branch.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = [col for col in X.columns if col not in cat_cols]

# -------------------------------------------------
# 3) Define Transformers (memory-aware)
# -------------------------------------------------
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OneHotEncoder(
            min_frequency=0.01,       # pool categories seen in <1% of rows (needs a sklearn that supports it)
            handle_unknown="ignore",  # do not fail on categories unseen during fit
            sparse_output=True,       # keep the encoded block sparse to save memory
        ),
    ),
])

# Route each column group to its branch; columns in neither list are dropped.
# The categorical block comes first in the output, followed by the numeric one.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, cat_cols),
        ("num", numeric_transformer, num_cols),
    ],
    verbose_feature_names_out=False,
    remainder="drop",
)

# -------------------------------------------------
# 4) Train/test split
# -------------------------------------------------
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=44, test_size=0.2)


## RIDGE, LASSO, AND ELASTIC NET

In [9]:

# -------------------------------------------------
# 5) Models & Hyperparameters
# -------------------------------------------------
# Each entry pairs an unfitted regularised linear regressor with the grid of
# hyperparameters to search. Grid keys carry the "model__" prefix because the
# estimator is installed as the "model" step of the pipeline built in the loop.
models = {
    "Ridge": {
        "model": Ridge(),
        "params": {"model__alpha": [0.1, 1.0, 10.0, 100.0]}
    },
    "Lasso": {
        "model": Lasso(max_iter=20000),
        "params": {"model__alpha": [0.001, 0.01, 0.1, 1.0]}
    },
    "ElasticNet": {
        "model": ElasticNet(max_iter=20000),
        "params": {
            "model__alpha": [0.01, 0.1, 1.0],
            "model__l1_ratio": [0.2, 0.5, 0.8]
        }
    }
}

# -------------------------------------------------
# 6) Execution Loop (GridSearchCV)
# -------------------------------------------------
# The held-out target in pounds is identical for every model, so undo the log
# transform once here instead of once per loop iteration.
y_test_pounds = np.expm1(y_test) if USE_LOG_TARGET else y_test

# name -> {"MSE", "RMSE", "Best Params", "CV MSE"}
results = {}

for name, config in models.items():
    print(f"\nTraining {name}...")

    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training part of every CV fold.
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", config["model"])
    ])

    search = GridSearchCV(
        estimator=pipeline,
        param_grid=config["params"],
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=1  # keep 1 to avoid memory crash in Colab
    )

    search.fit(X_train, y_train)

    # Predict on the held-out split with the refitted best estimator
    y_pred = search.predict(X_test)

    # Evaluate in pounds: invert log1p when the model was trained on log scale
    y_pred_pounds = np.expm1(y_pred) if USE_LOG_TARGET else y_pred

    mse = mean_squared_error(y_test_pounds, y_pred_pounds)
    rmse = np.sqrt(mse)

    results[name] = {
        "MSE": mse,
        "RMSE": rmse,
        "Best Params": search.best_params_,
        # Mean cross-validated MSE of the best grid point, on the scale the
        # model was trained on (log1p scale when USE_LOG_TARGET is True).
        # The scorer is negated MSE, hence the sign flip.
        "CV MSE": -search.best_score_,
    }

    print("Best Params:", search.best_params_)
    print(f"{name} RMSE (lbs): {rmse:.2f}")
    print(f"{name} MSE  (lbs^2): {mse:.2f}")

# -------------------------------------------------
# 7) Winner
# -------------------------------------------------
# NOTE(review): the winner is chosen by test-set MSE, so the "Final Test RMSE"
# printed below is a slightly optimistic estimate. results[name]["CV MSE"]
# allows ranking the models without touching the test split.
best_model = min(results, key=lambda k: results[k]["MSE"])
print(f"\nWINNER: {best_model}")
print("Best Parameters:", results[best_model]["Best Params"])
print(f"Final Test RMSE: {results[best_model]['RMSE']:.2f}")



Training Ridge...
Best Params: {'model__alpha': 10.0}
Ridge RMSE (lbs): 16.08
Ridge MSE  (lbs^2): 258.51

Training Lasso...
Best Params: {'model__alpha': 0.001}
Lasso RMSE (lbs): 16.06
Lasso MSE  (lbs^2): 257.81

Training ElasticNet...
Best Params: {'model__alpha': 0.01, 'model__l1_ratio': 0.2}
ElasticNet RMSE (lbs): 16.08
ElasticNet MSE  (lbs^2): 258.47

WINNER: Lasso
Best Parameters: {'model__alpha': 0.001}
Final Test RMSE: 16.06
