<a href="https://colab.research.google.com/github/ansharyis/ml-colab-project/blob/main/notebooks/05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Attach Google Drive to the Colab filesystem so files stored on Drive can be
# read through ordinary paths under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import os

# Root folder that holds the project data (raw and PROCESSED files).
# Defaults to the shared Google Drive folder used on Colab. Set the
# ML_DATA_DIR environment variable before running the notebook to point it at
# another location (for example a local copy of the data); an unset or empty
# variable falls back to the default.
DATA_DIR = (
    os.environ.get("ML_DATA_DIR")
    or "/content/drive/MyDrive/Machine Learning Group Work"
)

## Load the dataset

In [15]:
# Project: ML Weight Prediction
# Notebook: Ensemble Model (Bagging and Boosting)
# Owner: ....
# Description: Ensemble Model (Bagging and Boosting) Rev A

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Load the processed training CSV from the PROCESSED folder under DATA_DIR and
# print its (rows, columns) shape as a quick sanity check.
train_csv_path = f"{DATA_DIR}/PROCESSED/train_df_final_after_null_removal.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)




FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Machine Learning Group Work/PROCESSED/train_df_final_after_null_removal.csv'

In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [13]:
# Name of the column to predict, and a switch for modelling it on a log1p
# scale. Leave the switch True so that predictions can be mapped back to
# pounds with the inverse transform.
TARGET = "WEIGHTLBTC_A"
USE_LOG_TARGET = True

# =================================================
# 1) Features (X) and target (y)
# =================================================
# Rows with a missing target are dropped; .copy() keeps train_df untouched.
df_model = train_df.dropna(subset=[TARGET]).copy()
X = df_model.drop(columns=[TARGET])

target_as_float = df_model[TARGET].astype(float)
y = np.log1p(target_as_float) if USE_LOG_TARGET else target_as_float

# =================================================
# 2) Split the feature columns by dtype
# =================================================
# Object/category columns are treated as categorical; every other column is
# handled as numeric, in the original column order.
cat_cols = (
    X.select_dtypes(include=["object", "category"])
    .columns
    .tolist()
)
num_cols = [col for col in X.columns if col not in cat_cols]

# =================================================
# 3) Preprocessing for tree models (no scaling step)
#    - categorical: fill with the most frequent value, then one-hot encode
#    - numeric:     fill with the median
# =================================================
onehot_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=True,
    min_frequency=0.01,  # drop this argument if your sklearn doesn't support it
)

categorical_tree = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder),
])

numeric_tree = Pipeline([("imputer", SimpleImputer(strategy="median"))])

tree_preprocessor = ColumnTransformer(
    [
        ("cat", categorical_tree, cat_cols),
        ("num", numeric_tree, num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# =================================================
# 4) Hold out 20% of the rows for testing
# =================================================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

NameError: name 'train_df' is not defined

In [None]:
# =================================================
# 5) Candidate ensemble models and their search grids
# =================================================
# Grid keys carry the "model__" prefix so they address the "model" step of the
# pipeline that is assembled inside the training loop.
rf_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [10, 20, None],
    "model__min_samples_leaf": [1, 4],
}
gb_grid = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 5],
}

ensemble_models = {
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42, n_jobs=-1),
        "params": rf_grid,
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": gb_grid,
    },
}

# =================================================
# 6) Grid-search each model, then score it on the held-out test set
# =================================================
ensemble_results = {}

print("Training Ensemble Models (this may take time)...")

for name, config in ensemble_models.items():
    print(f"\nRunning {name}...")

    # Preprocessing is part of the pipeline, so it is fitted inside the search.
    pipe = Pipeline([
        ("preprocessor", tree_preprocessor),
        ("model", config["model"]),
    ])

    grid = GridSearchCV(
        pipe,
        config["params"],
        cv=3,       # fewer folds to save time and memory
        scoring="neg_mean_squared_error",
        n_jobs=1,   # run sequentially to avoid Colab worker crashes
    )
    grid.fit(X_train, y_train)

    # Predictions from the best pipeline found by the search
    y_pred = grid.best_estimator_.predict(X_test)

    # Report errors in pounds: undo log1p when the target was log-transformed
    if not USE_LOG_TARGET:
        y_pred_pounds, y_test_pounds = y_pred, y_test
    else:
        y_pred_pounds, y_test_pounds = np.expm1(y_pred), np.expm1(y_test)

    mse = mean_squared_error(y_test_pounds, y_pred_pounds)
    rmse = np.sqrt(mse)

    ensemble_results[name] = {"RMSE": rmse, "MSE": mse, "Best Params": grid.best_params_}

    print("Best Params:", grid.best_params_)
    print(f"{name} RMSE: {rmse:.2f} lbs")
    print(f"{name} MSE : {mse:.2f}")

# =================================================
# 7) Rank the models, lowest test RMSE first
# =================================================
print("\n--- FINAL STANDINGS ---")
for name in sorted(ensemble_results, key=lambda model_name: ensemble_results[model_name]["RMSE"]):
    res = ensemble_results[name]
    print(f"{name}: RMSE={res['RMSE']:.2f} lbs | Best Params={res['Best Params']}")