In [101]:
# =========================================
# AirBnB NYC: LightGBM (traditional) + sklearn MLP (deep)
# CV in log-space + original price metrics
# =========================================

import numpy as np, pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor as SKMLP

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [103]:
# =============== Load ===============
df = pd.read_csv(r"C:\Users\ASUS\Downloads\data\airbnb_listings_nyc.csv")  
target = "price"
y_raw = df[target].clip(lower=10)      # guard against zeros/negatives if any
y = np.log1p(y_raw)                    # log-transform for stability
X = df.drop(columns=[target])

# Identify columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

# =============== Robust stratification labels for regression ===============
def make_strata(y_cont, n_splits=5, max_bins=10):
    """Return labels suitable for StratifiedKFold by binning a continuous y.
       Decrease the number of bins until every bin has >= n_splits samples."""
    bins = max_bins
    y = pd.Series(y_cont).reset_index(drop=True)
    while bins >= n_splits:
        cats = pd.qcut(y, q=bins, duplicates="drop")
        # Convert to strings so scikit treats it as multiclass, not 'unknown'
        labels = cats.astype(str)
        counts = labels.value_counts()
        if (counts >= n_splits).all():
            return labels.values
        bins -= 1
    # Fallback: all one label (equivalent to plain KFold)
    return np.zeros(len(y), dtype=int)

n_splits = 5
y_strata = make_strata(y, n_splits=n_splits, max_bins=10)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
cv_folds = list(skf.split(X, y_strata))

In [104]:
# =============== Preprocessing ===============
# For MLP (deep): impute + scale numeric; impute + OHE categoricals
prep_mlp = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                      ("sc", RobustScaler())]), num_cols),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=10))]), cat_cols)
])

# For LightGBM: impute + OHE (no scaling)
prep_lgbm = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=10), cat_cols)
])


In [107]:
# =============== Models ===============
# (A) Traditional: LightGBM
lgbm = Pipeline([
    ("prep", prep_lgbm),
    ("model", LGBMRegressor(
        objective="huber",
        n_estimators=1000, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        max_depth=-1, random_state=RANDOM_STATE
    ))
])

# (B) “Deep”: sklearn MLP (neural net) — no extra installs
mlp = Pipeline([
    ("prep", prep_mlp),
    ("model", SKMLP(
        hidden_layer_sizes=(256, 128, 64),
        activation="relu",
        solver="adam",
        alpha=1e-4,                # L2
        batch_size=512,
        learning_rate_init=1e-3,
        max_iter=250,
        early_stopping=True,
        n_iter_no_change=12,
        random_state=RANDOM_STATE,
        verbose=False
    ))
])

In [108]:
# =============== Metrics ===============
def rmse(yt, yp): 
    return np.sqrt(mean_squared_error(yt, yp))

def mae_real(yt_log, yp_log):
    yt, yp = np.expm1(yt_log), np.expm1(yp_log)
    return mean_absolute_error(yt, yp)

def rmse_real(yt_log, yp_log):
    yt, yp = np.expm1(yt_log), np.expm1(yp_log)
    return np.sqrt(mean_squared_error(yt, yp))

from sklearn.metrics import mean_squared_error  

scoring = {
    # log-price space (training target)
    "MAE_log": make_scorer(mean_absolute_error, greater_is_better=False),
    "RMSE_log": make_scorer(lambda yt, yp: rmse(yt, yp), greater_is_better=False),
    "R2_log": make_scorer(r2_score),
    # original price space (business units)
    "MAE_real": make_scorer(mae_real, greater_is_better=False),
    "RMSE_real": make_scorer(rmse_real, greater_is_better=False),
}

In [109]:
# =============== Evaluate helper ===============
def summarize_cv(res, name):
    print(f"\n{name}:")
    for key in sorted(res.keys()):
        if key.startswith("test_"):
            arr = res[key]
            # Flip sign for error metrics built with greater_is_better=False
            if any(m in key for m in ["MAE", "RMSE"]):
                arr = -arr
            print(f"  {key[5:]:<10} {arr.mean():.4f} ± {arr.std():.4f}")

def evaluate(pipe, name):
    res = cross_validate(pipe, X, y, cv=cv_folds, scoring=scoring, n_jobs=-1, return_estimator=False)
    summarize_cv(res, name)

In [110]:
# =============== Run both models ===============
evaluate(lgbm, "LightGBM (traditional)")
evaluate(mlp,  "Sklearn MLP (deep learning)")


LightGBM (traditional):
  MAE_log    0.3023 ± 0.0017
  MAE_real   55.2774 ± 1.1087
  R2_log     0.6228 ± 0.0045
  RMSE_log   0.4253 ± 0.0034
  RMSE_real  217.2194 ± 19.3847

Sklearn MLP (deep learning):
  MAE_log    0.7522 ± 0.0499
  MAE_real   91.2320 ± 4.4157
  R2_log     -0.6012 ± 0.1492
  RMSE_log   0.8757 ± 0.0457
  RMSE_real  245.2074 ± 18.7291
