<a href="https://colab.research.google.com/github/ansharyis/ml-colab-project/blob/main/notebooks/05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Attach Google Drive to the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Root folder of the project data on the mounted Drive; the CSV path used when
# loading the training table is built from this value.
DATA_DIR = "/content/drive/MyDrive/ML_Project_Data"

## Load the dataset

In [11]:
# Project: ML Weight Prediction
# Notebook: Ensemble Model (Bagging and Boosting)
# Owner: ....
# Description: Ensemble Model (Bagging and Boosting) Rev A

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read the training table from the PROCESSED folder under DATA_DIR and print
# its (rows, columns) shape as a quick sanity check.
train_csv_path = f"{DATA_DIR}/PROCESSED/train_df_final_after_null_removal.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)




(20340, 59)


In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

## Prepare explanatory variables, dependent variable, and preprocessing pipeline

This handles:

numeric: median imputation only (no scaling — the tree-based ensemble models used here do not need it)

categorical: most-frequent imputation + one-hot encoding

In [13]:
TARGET = "WEIGHTLBTC_A"
# When True the models are trained on log1p(target); evaluation must then map
# predictions back to pounds with expm1.
USE_LOG_TARGET = True

# -------------------------------------------------
# 1) Prepare X / y
# -------------------------------------------------
# Rows without a target value cannot be used for supervised training.
df_model = train_df.dropna(subset=[TARGET]).copy()
X = df_model.drop(columns=[TARGET])

y = df_model[TARGET].astype(float)
if USE_LOG_TARGET:
    y = np.log1p(y)

# -------------------------------------------------
# 2) Detect numeric vs categorical columns
# -------------------------------------------------
# object/category dtypes are treated as categorical; every remaining column is
# treated as numeric and goes through the median imputer.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# -------------------------------------------------
# 3) Tree-friendly preprocessing (no scaling)
# -------------------------------------------------
categorical_tree = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "onehot",
        OneHotEncoder(
            # A float min_frequency is a fraction of samples; rarer categories
            # are grouped together. Remove if sklearn doesn't support it.
            min_frequency=0.01,
            sparse_output=True,
            handle_unknown="ignore",
        ),
    ),
])

numeric_tree = Pipeline([("imputer", SimpleImputer(strategy="median"))])

tree_preprocessor = ColumnTransformer(
    [
        ("cat", categorical_tree, cat_cols),
        ("num", numeric_tree, num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# -------------------------------------------------
# 4) Train/test split
# -------------------------------------------------
# 80/20 hold-out with a fixed seed so the split is repeatable.
# NOTE(review): the split seed is 44 while the models use 42 — confirm intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=44, test_size=0.2
)


## ENSEMBLE MODEL

In [14]:
# -------------------------------------------------
# 5) Define Ensemble Models (NO tuning)
# -------------------------------------------------
# Bagging baseline: random forest with fixed hyperparameters, all cores.
rf_model = RandomForestRegressor(
    n_estimators=300, max_depth=None, min_samples_leaf=2, random_state=42, n_jobs=-1
)

# Boosting baseline: shallow trees with a small learning rate.
gb_model = GradientBoostingRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
)

models = dict(RandomForest=rf_model, GradientBoosting=gb_model)

# -------------------------------------------------
# 6) Train + Evaluate
# -------------------------------------------------
# One record per model; turned into a summary table after the loop.
results = []

for name, model in models.items():
    print(f"\nTraining {name} (no GridSearch)...")

    # Preprocessing is part of the pipeline, so it is fitted on the training
    # split only.
    pipe = Pipeline([("preprocessor", tree_preprocessor), ("model", model)])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    # Undo the log1p target transform (if used) so errors are in pounds.
    y_pred_pounds = np.expm1(y_pred) if USE_LOG_TARGET else y_pred
    y_test_pounds = np.expm1(y_test) if USE_LOG_TARGET else y_test

    mse = mean_squared_error(y_test_pounds, y_pred_pounds)
    rmse = np.sqrt(mse)

    results.append(dict(model=name, RMSE=rmse, MSE=mse))
    print(f"{name} RMSE: {rmse:.2f} lbs")
    print(f"{name} MSE : {mse:.2f} lbs^2")

# Best (lowest RMSE) model first.
results_df = pd.DataFrame(results).sort_values(by="RMSE")
print("\n=== Summary ===")
print(results_df)


Training RandomForest (no GridSearch)...
RandomForest RMSE: 15.67 lbs
RandomForest MSE : 245.60 lbs^2

Training GradientBoosting (no GridSearch)...
GradientBoosting RMSE: 15.59 lbs
GradientBoosting MSE : 243.16 lbs^2

=== Summary ===
              model       RMSE         MSE
1  GradientBoosting  15.593496  243.157121
0      RandomForest  15.671546  245.597365
