## Assignment 5 — Compare RF, AdaBoost, Gradient Boosting, XGBoost, LightGBM, CatBoost

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")


In [3]:
df = pd.read_csv("E:/Engineering/Machine Learning/ADS-VAC/notebooks/datasets/housing.csv")
df.head()


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### Select Features and Target

We remove non-predictive columns:
id, date, zipcode (categorical), latitude & longitude kept.

In [4]:
df = df.drop(["id", "date"], axis=1)

# One-hot encode zipcode
df = pd.get_dummies(df, columns=["zipcode"], drop_first=True)

X = df.drop("price", axis=1)
y = df["price"]

X.shape, y.shape


((21613, 86), (21613,))

### Train-Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Scaling (helps AdaBoost & Gradient Boosting slightly - diminishing returns for tree models)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Metric Function

In [7]:
def evaluate(model, X_test, y_test):
    preds = model.predict(X_test)
    mae  = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2   = r2_score(y_test, preds)
    return mae, rmse, r2


### Random Forest

In [8]:
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

rf_results = evaluate(rf, X_test, y_test)
rf_results


(72151.77398754448, 146147.98150031906, 0.8587134729992428)

### AdaBoost Regressor

In [9]:
ada = AdaBoostRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
ada.fit(X_train_scaled, y_train)

ada_results = evaluate(ada, X_test_scaled, y_test)
ada_results


(177875.86542584575, 243729.9663120547, 0.6070539173307756)

### Gradient Boosting Regressor

In [10]:
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
gbr.fit(X_train_scaled, y_train)

gbr_results = evaluate(gbr, X_test_scaled, y_test)
gbr_results


(76634.99223355037, 141522.49624136425, 0.8675151961164104)

### XGBoost

In [11]:
xg = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xg.fit(X_train, y_train)

xgb_results = evaluate(xg, X_test, y_test)
xgb_results


(67561.13606364215, 143011.6734266387, 0.8647123575210571)

### LightGBM

In [12]:
lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgbm.fit(X_train, y_train)

lgbm_results = evaluate(lgbm, X_test, y_test)
lgbm_results


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2398
[LightGBM] [Info] Number of data points in the train set: 17290, number of used features: 86
[LightGBM] [Info] Start training from score 537768.047773


(67054.75446498931, 137816.14332794628, 0.8743636555914379)

### CatBoost

In [13]:
cat = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="RMSE",
    verbose=0,
    random_state=42
)
cat.fit(X_train, y_train)

cat_results = evaluate(cat, X_test, y_test)
cat_results


(68059.56854065819, 127026.3077625264, 0.8932660809338475)

### Comparison Table

In [14]:
results_df = pd.DataFrame({
    "Model": ["Random Forest", "AdaBoost", "Gradient Boosting", "XGBoost", "LightGBM", "CatBoost"],
    "MAE":   [rf_results[0], ada_results[0], gbr_results[0], xgb_results[0], lgbm_results[0], cat_results[0]],
    "RMSE":  [rf_results[1], ada_results[1], gbr_results[1], xgb_results[1], lgbm_results[1], cat_results[1]],
    "R2":    [rf_results[2], ada_results[2], gbr_results[2], xgb_results[2], lgbm_results[2], cat_results[2]],
})

results_df.sort_values("RMSE")


Unnamed: 0,Model,MAE,RMSE,R2
5,CatBoost,68059.568541,127026.307763,0.893266
4,LightGBM,67054.754465,137816.143328,0.874364
2,Gradient Boosting,76634.992234,141522.496241,0.867515
3,XGBoost,67561.136064,143011.673427,0.864712
0,Random Forest,72151.773988,146147.9815,0.858713
1,AdaBoost,177875.865426,243729.966312,0.607054


✔ Model Performance Summary

Model   -   Expected Performance

CatBoost    -   Usually best or tied for best

XGBoost -   Very strong, stable model

LightGBM    -   Fast & high-performing

Random Forest   -   Strong baseline model

Gradient Boosting   -   Good but slower

AdaBoost    -   Often weakest on large regression datasets


✔ Why?

CatBoost handles categorical + numerical features smoothly

XGBoost & LightGBM are optimized gradient boosters

Random forest handles noise well

AdaBoost can be unstable with noisy continuous targets

✔ What We Achieved

Trained 6 ensemble models

Compared MAE, RMSE, and R²

Built a model leaderboard

Identified best performer for housing price prediction