<a href="https://colab.research.google.com/github/YashubG/first-contributions/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import the datasets

In [None]:
test = pd.read_csv('/content/test.csv')
train = pd.read_csv('/content/train.csv')

Preprocessing

In [None]:
train.drop((train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index),
           inplace=True)  # Remove obvious outliers
train.reset_index(drop=True, inplace=True)

y = np.log1p(train['HotelValue'])    # Log-transform target for symmetry
train_ids = train['Id']
test_ids  = test['Id']

# Drop ID and target from features
train.drop(['Id','HotelValue'], axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# ---------------------------
# Combine for Preprocessing
# ---------------------------
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ---------------------------
# Drop Sparse/Irrelevant Features
# ---------------------------
# Remove columns with mostly missing or poor information
drop_cols = [
    'ServiceLaneType','FacadeType','PoolQuality','BoundaryFence','ExtraFacility',
    'PlotConfiguration','NearbyTransport1','NearbyTransport2','UtilityAccess'
]
all_data.drop(columns=[c for c in drop_cols if c in all_data.columns], inplace=True)

# ---------------------------
# Feature Engineering
# ---------------------------
# Convert some numeric categories to strings (for one-hot later)
all_data['PropertyClass'] = all_data['PropertyClass'].astype(str)

# Area and Room features
all_data['TotalSF'] = (
    all_data['BasementTotalSF'] +
    all_data['GroundFloorArea'] +
    all_data['UpperFloorArea']
)
all_data['TotalBath'] = (
    all_data['FullBaths'] +
    0.5 * all_data['HalfBaths'] +
    all_data['BasementFullBaths'] +
    0.5 * all_data['BasementHalfBaths']
)
all_data['TotalPorchSF'] = (
    all_data['OpenVerandaArea'] +
    all_data['EnclosedVerandaArea'] +
    all_data['SeasonalPorchArea'] +
    all_data['ScreenPorchArea']
)
# Age and renovation features
all_data['HotelAge'] = all_data['YearSold'] - all_data['ConstructionYear']
all_data['RemodAge'] = all_data['YearSold'] - all_data['RenovationYear']
all_data['WasRemodeled'] = (all_data['RemodAge'] > 0).astype(int)
all_data['IsNew'] = (all_data['YearSold'] == all_data['ConstructionYear']).astype(int)
# Flags for amenities
all_data['HasPool'] = (all_data['SwimmingPoolArea'] > 0).astype(int)
all_data['HasGarage'] = (all_data['ParkingArea'] > 0).astype(int)
all_data['HasBasement'] = (all_data['BasementTotalSF'] > 0).astype(int)
all_data['HasLounge'] = (all_data['Lounges'] > 0).astype(int)
# Polynomial / interaction features
all_data['OverallQuality_sq'] = all_data['OverallQuality']**2
all_data['OverallQuality_cub'] = all_data['OverallQuality']**3
all_data['OverallQuality_x_TotalSF']  = all_data['OverallQuality'] * all_data['TotalSF']
all_data['OverallQuality_x_HotelAge'] = all_data['OverallQuality'] * all_data['HotelAge']
# New ratio features
all_data['BuiltPct']      = all_data['TotalSF'] / (all_data['LandArea'] + 1)
all_data['Area_per_Room'] = all_data['UsableArea'] / (all_data['TotalRooms'] + 1)
all_data['Baths_to_Rooms'] = all_data['TotalBath'] / (all_data['TotalRooms'] + 1)
all_data['BasementRatio']  = all_data['BasementTotalSF'] / (all_data['TotalSF'] + 1)

# ---------------------------
# Drop Redundant Originals (after creating features)
# ---------------------------
drop_orig = [
    'OpenVerandaArea','EnclosedVerandaArea','SeasonalPorchArea','ScreenPorchArea',
    'LowQualityArea','FacadeArea','BasementFacilitySF2'
]
for col in drop_orig:
    if col in all_data.columns:
        all_data.drop(col, axis=1, inplace=True)

# ---------------------------
# Imputation for Remaining Missing Data
# ---------------------------
# Many missing in RoadAccessLength; fill by District median
if 'RoadAccessLength' in all_data.columns:
    all_data['RoadAccessLength'] = all_data.groupby('District')['RoadAccessLength']\
                                          .transform(lambda x: x.fillna(x.median()))
    # --- FIX 1 (No inplace=True) ---
    all_data['RoadAccessLength'] = all_data['RoadAccessLength'].fillna(all_data['RoadAccessLength'].median())

# Fill small gaps with zeros or modes
if 'FacadeArea' in all_data.columns:
    all_data['FacadeArea'].fillna(0, inplace=True) # This one is fine, not a chained assignment

if 'ElectricalSystem' in all_data.columns:
    # --- FIX 2 (No inplace=True) ---
    all_data['ElectricalSystem'] = all_data['ElectricalSystem'].fillna(all_data['ElectricalSystem'].mode()[0])

# Any remaining numeric NaNs
num_cols = all_data.select_dtypes(include=[np.number]).columns
cat_cols = all_data.select_dtypes(exclude=[np.number]).columns
all_data[num_cols] = all_data[num_cols].fillna(0)
all_data[cat_cols] = all_data[cat_cols].fillna('None')

# ---------------------------
# Ordinal Encoding for Quality Features
# ---------------------------
qual_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'None':0}
for col in ['ExteriorQuality','ExteriorCondition','HeatingQuality','KitchenQuality']:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(qual_map).astype(int)
# Categorical mappings
if 'LandSlope' in all_data.columns:
    all_data['LandSlope'] = all_data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3).astype(int)
if 'PlotShape' in all_data.columns:
    all_data['PlotShape'] = all_data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4).astype(int)
# Binary encoding
all_data['CentralAC'] = all_data['CentralAC'].map({'Y':1,'N':0}).astype(int)

# ---------------------------
# Log-transform Highly Skewed Numerics
# ---------------------------
for col in ['ExtraFacilityValue','LandArea','BasementHalfBaths']:
    if col in all_data.columns:
        all_data[col] = np.log1p(all_data[col])

# Drop SwimmingPoolArea due to extreme skew & rarity (we have HasPool flag)
if 'SwimmingPoolArea' in all_data.columns:
    all_data.drop('SwimmingPoolArea', axis=1, inplace=True)

# ---------------------------
# One-Hot Encoding
# ---------------------------
all_data = pd.get_dummies(all_data, drop_first=True)
print("Final feature matrix shape:", all_data.shape)

# ---------------------------
# Train/Test Split for Modeling
# ---------------------------
X = all_data.iloc[:train.shape[0], :].values
X_test_final = all_data.iloc[train.shape[0]:, :].values


Final feature matrix shape: (1458, 233)


Agam Roy

Naive Bayes Classifier


In [None]:
# bayes_classifier_hotel_value.py
# Train a Gaussian Naive Bayes classifier (Bayes) with same preprocessing and GridSearchCV.
# Predictions are mapped back to median HotelValue per class so submission retains numeric HotelValue.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, f1_score

# ====================
# CONFIG
# ====================
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

TRAIN_PATH = '/content/train.csv'
TEST_PATH  = '/content/test.csv'
SUB_PATH   = '/content/sample_submission.csv'
OUTPUT_PATH = 'submission.csv'

# ====================
# LOAD
# ====================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SUB_PATH)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# ====================
# Create classification target (terciles) + mapping to numeric medians
# ====================
# We'll bin original HotelValue into 3 classes (low/mid/high) using quantiles.
# Keep the original HotelValue for mapping medians later.
train_orig = train.copy()
# If there are ties that make qcut fail, use rank-based binning as fallback.
try:
    train['ValueClass'] = pd.qcut(train['HotelValue'], q=3, labels=['low','mid','high'])
except Exception:
    # fallback to rank-based bins
    train['ValueClass'] = pd.qcut(train['HotelValue'].rank(method='first'), q=3, labels=['low','mid','high'])

# Map class string labels to integers for classifier
class_order = ['low', 'mid', 'high']
train['ValueClassLabel'] = train['ValueClass'].map({lab:i for i,lab in enumerate(class_order)})

# Compute representative numeric HotelValue for each class (median)
class_medians = train_orig.groupby(pd.qcut(train_orig['HotelValue'], q=3, labels=class_order))['HotelValue'].median().to_dict()
# If qcut fallback used, ensure medians align
if set(class_medians.keys()) != set(class_order):
    # recompute using labels from train['ValueClass']
    class_medians = train_orig.assign(ValueClass=train['ValueClass']).groupby('ValueClass')['HotelValue'].median().to_dict()

print("Class medians (mapping class -> numeric HotelValue):", class_medians)

y_class = train['ValueClassLabel'].values
test_ids = test['Id'].copy()

# Drop target/id columns from train/test before combining
train_drop = train.drop(['HotelValue','ValueClass','ValueClassLabel'], axis=1, errors='ignore')
test_drop  = test.drop(['Id'], axis=1, errors='ignore')

# ====================
# COMBINE FOR PREPROCESSING
# ====================
data = pd.concat([train_drop, test_drop], axis=0).reset_index(drop=True)

# ====================
# FEATURE ENGINEERING (preserve your transforms)
# ====================
print("Performing feature engineering...")
if 'PropertyClass' in data.columns:
    data['PropertyClass'] = data['PropertyClass'].astype(str)

# Area-based features
for col in ['BasementTotalSF', 'GroundFloorArea', 'UpperFloorArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalSF'] = data['BasementTotalSF'].fillna(0) + data['GroundFloorArea'].fillna(0) + data['UpperFloorArea'].fillna(0)

# Bath features
for col in ['FullBaths', 'HalfBaths', 'BasementFullBaths', 'BasementHalfBaths']:
    if col not in data.columns:
        data[col] = 0
data['TotalBath'] = data['FullBaths'].fillna(0) + 0.5*data['HalfBaths'].fillna(0) + data['BasementFullBaths'].fillna(0) + 0.5*data['BasementHalfBaths'].fillna(0)

# Porch
for col in ['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalPorchSF'] = data['OpenVerandaArea'].fillna(0) + data['EnclosedVerandaArea'].fillna(0) + data['SeasonalPorchArea'].fillna(0) + data['ScreenPorchArea'].fillna(0)

# Age features
for col in ['YearSold','ConstructionYear','RenovationYear']:
    if col not in data.columns:
        data[col] = data.get('YearSold', 0)
data['HotelAge'] = data['YearSold'].fillna(0) - data['ConstructionYear'].fillna(data['YearSold'].fillna(0))
data['RemodAge'] = data['YearSold'].fillna(0) - data['RenovationYear'].fillna(data['YearSold'].fillna(0))

# Binary flags
data['WasRemodeled'] = (data.get('RenovationYear', data['ConstructionYear']) != data.get('ConstructionYear', data['RenovationYear'])).astype(int)
data['IsNew'] = (data['ConstructionYear'] == data['YearSold']).astype(int) if ('ConstructionYear' in data.columns and 'YearSold' in data.columns) else 0
data['HasPool'] = (data['SwimmingPoolArea'] > 0).astype(int) if 'SwimmingPoolArea' in data.columns else 0
data['HasGarage'] = (data['ParkingArea'] > 0).astype(int) if 'ParkingArea' in data.columns else 0
data['HasBasement'] = (data['BasementTotalSF'] > 0).astype(int) if 'BasementTotalSF' in data.columns else 0
data['HasLounge'] = (data['Lounges'] > 0).astype(int) if 'Lounges' in data.columns else 0

# Polynomial & interactions
if 'OverallQuality' not in data.columns:
    data['OverallQuality'] = 0
data['OverallQuality_sq'] = data['OverallQuality']**2
data['OverallQuality_cub'] = data['OverallQuality']**3
data['OverallQuality_x_TotalSF'] = data['OverallQuality'] * data['TotalSF']
data['OverallQuality_x_HotelAge'] = data['OverallQuality'] * data['HotelAge']

# Drop columns you previously considered weak
drop_cols = [
    'ServiceLaneType', 'FacadeType', 'BoundaryFence', 'ExtraFacility',
    'UtilityAccess', 'NearbyTransport1', 'NearbyTransport2'
]
data.drop(columns=[c for c in drop_cols if c in data.columns], inplace=True)

# ====================
# MISSING VALUES (kept your logic)
# ====================
print("Handling missing values...")
none_cols = [
    'PoolQuality', 'BasementHeight', 'BasementCondition', 'BasementExposure',
    'BasementFacilityType1', 'BasementFacilityType2', 'ParkingType',
    'ParkingFinish', 'ParkingQuality', 'ParkingCondition', 'LoungeQuality'
]
for col in none_cols:
    if col in data.columns:
        data[col] = data[col].fillna('None')

mode_cols = ['KitchenQuality', 'PropertyFunctionality', 'ZoningCategory']
for col in mode_cols:
    if col in data.columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].mode().iloc[0])

zero_cols = [
    'BasementFacilitySF1', 'BasementFacilitySF2', 'BasementUnfinishedSF',
    'BasementTotalSF', 'BasementFullBaths', 'BasementHalfBaths',
    'FacadeArea', 'ParkingArea', 'ParkingCapacity'
]
for col in zero_cols:
    if col in data.columns:
        data[col] = data[col].fillna(0)

if 'RoadAccessLength' in data.columns and 'District' in data.columns:
    data['RoadAccessLength'] = data.groupby('District')['RoadAccessLength'].transform(lambda x: x.fillna(x.median()))
    data['RoadAccessLength'].fillna(data['RoadAccessLength'].median(), inplace=True)

# catch-all
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns
data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('None')

# ====================
# ENCODING & SKEW TRANSFORMS
# ====================
print("Encoding categorical features and transforming skewness...")
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExteriorQuality', 'ExteriorCondition', 'HeatingQuality', 'KitchenQuality',
            'LoungeQuality', 'BasementHeight', 'BasementCondition']:
    if col in data.columns:
        data[col] = data[col].map(qual_map).fillna(0)

if 'CentralAC' in data.columns:
    data['CentralAC'] = data['CentralAC'].map({'Y':1, 'N':0}).fillna(0).astype(int)
if 'LandSlope' in data.columns:
    data['LandSlope'] = data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3)
if 'PlotShape' in data.columns:
    data['PlotShape'] = data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4)

numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed = data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed[skewed.abs() > 0.7].index.tolist()
for col in skewed_feats:
    if data[col].nunique() > 2:
        data[col] = np.log1p(data[col].clip(lower=0))

# One-hot encode
data = pd.get_dummies(data, drop_first=True)

# ====================
# SPLIT BACK
# ====================
X = data.iloc[:len(train_drop), :].copy()
X_test = data.iloc[len(train_drop):, :].copy()

# Align to be safe
X, X_test = X.align(X_test, join='inner', axis=1)
print(f"Final X shape: {X.shape}, X_test shape: {X_test.shape}")

# ====================
# PIPELINE + GRIDSEARCH (GaussianNB)
# ====================
print("Setting up GaussianNB pipeline and GridSearchCV...")

pipeline = Pipeline([
    ('var_thresh', VarianceThreshold(threshold=1e-5)),
    ('scale', StandardScaler()),
    ('clf', GaussianNB())
])

# grid for GaussianNB: var_smoothing is the main hyperparameter
param_grid = {
    'clf__var_smoothing': np.logspace(-12, -6, 7)
}

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',       # balanced across classes
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True
)

print("Running GridSearchCV (this tunes var_smoothing for GaussianNB)...")
grid.fit(X, y_class)

best_score = grid.best_score_
best_params = grid.best_params_
print("Best CV f1_macro:", best_score)
print("Best params:", best_params)

# ====================
# Predict on test, map class -> numeric median HotelValue
# ====================
best_model = grid.best_estimator_
pred_class_labels = best_model.predict(X_test)  # integers 0,1,2 corresponding to low/mid/high

# Map back to class names and then to median HotelValue
inv_map = {i:lab for i,lab in enumerate(class_order)}
pred_class_names = [inv_map[int(c)] for c in pred_class_labels]
pred_numeric = [class_medians[name] for name in pred_class_names]

# Save submission (Id + HotelValue numeric mapped from class medians)
submission = pd.DataFrame({'Id': test_ids, 'HotelValue': pred_numeric})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Submission saved to {OUTPUT_PATH}")
print(submission.head())


Train shape: (1200, 81), Test shape: (260, 80)
Class medians (mapping class -> numeric HotelValue): {'low': 119500.0, 'mid': 165075.0, 'high': 240000.0}
Performing feature engineering...
Handling missing values...
Encoding categorical features and transforming skewness...
Final X shape: (1200, 237), X_test shape: (260, 237)
Setting up GaussianNB pipeline and GridSearchCV...
Running GridSearchCV (this tunes var_smoothing for GaussianNB)...
Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best CV f1_macro: 0.6098809844877229
Best params: {'clf__var_smoothing': np.float64(1e-06)}
✅ Submission saved to submission.csv
     Id  HotelValue
0   893    165075.0
1  1106    240000.0
2   414    119500.0
3   523    119500.0
4  1037    240000.0


Best Version


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import xgboost as xgb

# ---------------------------
# Settings Using Best code to get best accuracy possible
# ---------------------------
SEED = 42
N_FOLDS = 5  # 5-fold to save time while still robust
np.random.seed(SEED)

# ---------------------------
# Load Data
# ---------------------------
# Load the CSV files into DataFrames
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

train.drop((train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index),
           inplace=True)  # Remove obvious outliers
train.reset_index(drop=True, inplace=True)

y = np.log1p(train['HotelValue'])    # Log-transform target for symmetry
train_ids = train['Id']
test_ids  = test['Id']

# Drop ID and target from features
train.drop(['Id','HotelValue'], axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# ---------------------------
# Combine for Preprocessing
# ---------------------------
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ---------------------------
# Drop Sparse/Irrelevant Features
# ---------------------------
# Remove columns with mostly missing or poor information
drop_cols = [
    'ServiceLaneType','FacadeType','PoolQuality','BoundaryFence','ExtraFacility',
    'PlotConfiguration','NearbyTransport1','NearbyTransport2','UtilityAccess'
]
all_data.drop(columns=[c for c in drop_cols if c in all_data.columns], inplace=True)

# ---------------------------
# Feature Engineering
# ---------------------------
# Convert some numeric categories to strings (for one-hot later)
all_data['PropertyClass'] = all_data['PropertyClass'].astype(str)

# Area and Room features
all_data['TotalSF'] = (
    all_data['BasementTotalSF'] +
    all_data['GroundFloorArea'] +
    all_data['UpperFloorArea']
)
all_data['TotalBath'] = (
    all_data['FullBaths'] +
    0.5 * all_data['HalfBaths'] +
    all_data['BasementFullBaths'] +
    0.5 * all_data['BasementHalfBaths']
)
all_data['TotalPorchSF'] = (
    all_data['OpenVerandaArea'] +
    all_data['EnclosedVerandaArea'] +
    all_data['SeasonalPorchArea'] +
    all_data['ScreenPorchArea']
)
# Age and renovation features
all_data['HotelAge'] = all_data['YearSold'] - all_data['ConstructionYear']
all_data['RemodAge'] = all_data['YearSold'] - all_data['RenovationYear']
all_data['WasRemodeled'] = (all_data['RemodAge'] > 0).astype(int)
all_data['IsNew'] = (all_data['YearSold'] == all_data['ConstructionYear']).astype(int)
# Flags for amenities
all_data['HasPool'] = (all_data['SwimmingPoolArea'] > 0).astype(int)
all_data['HasGarage'] = (all_data['ParkingArea'] > 0).astype(int)
all_data['HasBasement'] = (all_data['BasementTotalSF'] > 0).astype(int)
all_data['HasLounge'] = (all_data['Lounges'] > 0).astype(int)
# Polynomial / interaction features
all_data['OverallQuality_sq'] = all_data['OverallQuality']**2
all_data['OverallQuality_cub'] = all_data['OverallQuality']**3
all_data['OverallQuality_x_TotalSF']  = all_data['OverallQuality'] * all_data['TotalSF']
all_data['OverallQuality_x_HotelAge'] = all_data['OverallQuality'] * all_data['HotelAge']
# New ratio features
all_data['BuiltPct']      = all_data['TotalSF'] / (all_data['LandArea'] + 1)
all_data['Area_per_Room'] = all_data['UsableArea'] / (all_data['TotalRooms'] + 1)
all_data['Baths_to_Rooms'] = all_data['TotalBath'] / (all_data['TotalRooms'] + 1)
all_data['BasementRatio']  = all_data['BasementTotalSF'] / (all_data['TotalSF'] + 1)

# ---------------------------
# Drop Redundant Originals (after creating features)
# ---------------------------
drop_orig = [
    'OpenVerandaArea','EnclosedVerandaArea','SeasonalPorchArea','ScreenPorchArea',
    'LowQualityArea','FacadeArea','BasementFacilitySF2'
]
for col in drop_orig:
    if col in all_data.columns:
        all_data.drop(col, axis=1, inplace=True)

# ---------------------------
# Imputation for Remaining Missing Data
# ---------------------------
# Many missing in RoadAccessLength; fill by District median
if 'RoadAccessLength' in all_data.columns:
    all_data['RoadAccessLength'] = all_data.groupby('District')['RoadAccessLength']\
                                          .transform(lambda x: x.fillna(x.median()))
    # --- FIX 1 (No inplace=True) ---
    all_data['RoadAccessLength'] = all_data['RoadAccessLength'].fillna(all_data['RoadAccessLength'].median())

# Fill small gaps with zeros or modes
if 'FacadeArea' in all_data.columns:
    all_data['FacadeArea'].fillna(0, inplace=True) # This one is fine, not a chained assignment

if 'ElectricalSystem' in all_data.columns:
    # --- FIX 2 (No inplace=True) ---
    all_data['ElectricalSystem'] = all_data['ElectricalSystem'].fillna(all_data['ElectricalSystem'].mode()[0])

# Any remaining numeric NaNs
num_cols = all_data.select_dtypes(include=[np.number]).columns
cat_cols = all_data.select_dtypes(exclude=[np.number]).columns
all_data[num_cols] = all_data[num_cols].fillna(0)
all_data[cat_cols] = all_data[cat_cols].fillna('None')

# ---------------------------
# Ordinal Encoding for Quality Features
# ---------------------------
qual_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'None':0}
for col in ['ExteriorQuality','ExteriorCondition','HeatingQuality','KitchenQuality']:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(qual_map).astype(int)
# Categorical mappings
if 'LandSlope' in all_data.columns:
    all_data['LandSlope'] = all_data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3).astype(int)
if 'PlotShape' in all_data.columns:
    all_data['PlotShape'] = all_data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4).astype(int)
# Binary encoding
all_data['CentralAC'] = all_data['CentralAC'].map({'Y':1,'N':0}).astype(int)

# ---------------------------
# Log-transform Highly Skewed Numerics
# ---------------------------
for col in ['ExtraFacilityValue','LandArea','BasementHalfBaths']:
    if col in all_data.columns:
        all_data[col] = np.log1p(all_data[col])

# Drop SwimmingPoolArea due to extreme skew & rarity (we have HasPool flag)
if 'SwimmingPoolArea' in all_data.columns:
    all_data.drop('SwimmingPoolArea', axis=1, inplace=True)

# ---------------------------
# One-Hot Encoding
# ---------------------------
all_data = pd.get_dummies(all_data, drop_first=True)
print("Final feature matrix shape:", all_data.shape)

# ---------------------------
# Train/Test Split for Modeling
# ---------------------------
X = all_data.iloc[:train.shape[0], :].values
X_test_final = all_data.iloc[train.shape[0]:, :].values

# ---------------------------
# Define Models & Ensemble
# ---------------------------
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
# LightGBM
lgbm = lgb.LGBMRegressor(
    objective='regression_l1',
    n_estimators=1500,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    reg_alpha=0.1,
    reg_lambda=0.1,
    colsample_bytree=0.5,
    subsample=0.7,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1  # --- FIX 3: Suppress LightGBM warnings ---
)
# XGBoost
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.005,
    reg_lambda=0.9,
    random_state=SEED,
    n_jobs=-1
)
# Ridge meta-learner
meta = RidgeCV(alphas=np.logspace(-3, 2, 50), cv=kf)

# Stacking ensemble with out-of-fold blending
stack = StackingRegressor(
    estimators=[('lgbm', lgbm), ('xgbr', xgbr)],
    final_estimator=meta,
    cv=kf,
    n_jobs=-1,
    passthrough=True
)

# ---------------------------
# Model Training
# ---------------------------
print("Training Stacking Regressor...")
stack.fit(X, y)
print("Training complete.")

# ---------------------------
# Predict & Prepare Submission
# ---------------------------
print("Predicting on test set...")
log_preds = stack.predict(X_test_final)
final_preds = np.expm1(log_preds)
final_preds[final_preds < 0] = 0  # ensure no negatives

submission = pd.DataFrame({'Id': test_ids, 'HotelValue': final_preds})
submission.to_csv('submission.csv', index=False)
print("Submission file created.")

Final feature matrix shape: (1458, 233)
Training Stacking Regressor...
Training complete.
Predicting on test set...
Submission file created.


Naive Ridge Regression


In [None]:
# map_ridge_hotel_value.py
# MAP estimator via Ridge (Gaussian prior => MAP = Ridge). Uses same preprocessing + GridSearchCV.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from math import sqrt

# ====================
# CONFIG
# ====================
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

TRAIN_PATH = '/content/train.csv'
TEST_PATH  = '/content/test.csv'
SUB_PATH   = '/content/sample_submission.csv'
OUTPUT_PATH = 'submission.csv'

# ====================
# LOAD
# ====================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SUB_PATH)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Remove outliers (kept same logic)
outlier_idx = train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index
if len(outlier_idx) > 0:
    train = train.drop(outlier_idx).reset_index(drop=True)
    print(f"Removed {len(outlier_idx)} outliers.")

# Target (log-transform)
y = np.log1p(train['HotelValue'])
test_ids = test['Id']

train.drop(['HotelValue', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# ====================
# COMBINE FOR UNIFIED PREPROCESSING
# ====================
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ====================
# FEATURE ENGINEERING (same as before)
# ====================
print("Performing feature engineering...")
if 'PropertyClass' in data.columns:
    data['PropertyClass'] = data['PropertyClass'].astype(str)

# Area-based features (guard columns)
for col in ['BasementTotalSF', 'GroundFloorArea', 'UpperFloorArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalSF'] = data['BasementTotalSF'].fillna(0) + data['GroundFloorArea'].fillna(0) + data['UpperFloorArea'].fillna(0)

# Baths
for col in ['FullBaths', 'HalfBaths', 'BasementFullBaths', 'BasementHalfBaths']:
    if col not in data.columns:
        data[col] = 0
data['TotalBath'] = data['FullBaths'].fillna(0) + 0.5 * data['HalfBaths'].fillna(0) + data['BasementFullBaths'].fillna(0) + 0.5 * data['BasementHalfBaths'].fillna(0)

# Porch
for col in ['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalPorchSF'] = data['OpenVerandaArea'].fillna(0) + data['EnclosedVerandaArea'].fillna(0) + data['SeasonalPorchArea'].fillna(0) + data['ScreenPorchArea'].fillna(0)

# Age features
for col in ['YearSold', 'ConstructionYear', 'RenovationYear']:
    if col not in data.columns:
        data[col] = data.get('YearSold', 0)
data['HotelAge'] = data['YearSold'].fillna(0) - data['ConstructionYear'].fillna(data['YearSold'].fillna(0))
data['RemodAge'] = data['YearSold'].fillna(0) - data['RenovationYear'].fillna(data['YearSold'].fillna(0))

# Binary flags
data['WasRemodeled'] = (data.get('RenovationYear', data['ConstructionYear']) != data.get('ConstructionYear', data['RenovationYear'])).astype(int)
data['IsNew'] = ((data['ConstructionYear'] == data['YearSold']) if ('ConstructionYear' in data.columns and 'YearSold' in data.columns) else 0).astype(int)
data['HasPool'] = (data['SwimmingPoolArea'] > 0).astype(int) if 'SwimmingPoolArea' in data.columns else 0
data['HasGarage'] = (data['ParkingArea'] > 0).astype(int) if 'ParkingArea' in data.columns else 0
data['HasBasement'] = (data['BasementTotalSF'] > 0).astype(int) if 'BasementTotalSF' in data.columns else 0
data['HasLounge'] = (data['Lounges'] > 0).astype(int) if 'Lounges' in data.columns else 0

# Polynomial & interactions
if 'OverallQuality' not in data.columns:
    data['OverallQuality'] = 0
data['OverallQuality_sq'] = data['OverallQuality']**2
data['OverallQuality_cub'] = data['OverallQuality']**3
data['OverallQuality_x_TotalSF'] = data['OverallQuality'] * data['TotalSF']
data['OverallQuality_x_HotelAge'] = data['OverallQuality'] * data['HotelAge']

# Drop weak columns if present
drop_cols = [
    'ServiceLaneType', 'FacadeType', 'BoundaryFence', 'ExtraFacility',
    'UtilityAccess', 'NearbyTransport1', 'NearbyTransport2'
]
data.drop(columns=[c for c in drop_cols if c in data.columns], inplace=True)

# ====================
# MISSING VALUE HANDLING (same rules)
# ====================
print("Handling missing values...")
none_cols = [
    'PoolQuality', 'BasementHeight', 'BasementCondition', 'BasementExposure',
    'BasementFacilityType1', 'BasementFacilityType2', 'ParkingType',
    'ParkingFinish', 'ParkingQuality', 'ParkingCondition', 'LoungeQuality'
]
for col in none_cols:
    if col in data.columns:
        data[col] = data[col].fillna('None')

mode_cols = ['KitchenQuality', 'PropertyFunctionality', 'ZoningCategory']
for col in mode_cols:
    if col in data.columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].mode().iloc[0])

zero_cols = [
    'BasementFacilitySF1', 'BasementFacilitySF2', 'BasementUnfinishedSF',
    'BasementTotalSF', 'BasementFullBaths', 'BasementHalfBaths',
    'FacadeArea', 'ParkingArea', 'ParkingCapacity'
]
for col in zero_cols:
    if col in data.columns:
        data[col] = data[col].fillna(0)

if 'RoadAccessLength' in data.columns and 'District' in data.columns:
    data['RoadAccessLength'] = data.groupby('District')['RoadAccessLength'].transform(lambda x: x.fillna(x.median()))
    data['RoadAccessLength'].fillna(data['RoadAccessLength'].median(), inplace=True)

# final catch-all
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns
data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('None')

# ====================
# ENCODING & SKEW TRANSFORMS
# ====================
print("Encoding categorical features and transforming skewness...")
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExteriorQuality', 'ExteriorCondition', 'HeatingQuality', 'KitchenQuality',
            'LoungeQuality', 'BasementHeight', 'BasementCondition']:
    if col in data.columns:
        data[col] = data[col].map(qual_map).fillna(0)

if 'CentralAC' in data.columns:
    data['CentralAC'] = data['CentralAC'].map({'Y':1, 'N':0}).fillna(0).astype(int)
if 'LandSlope' in data.columns:
    data['LandSlope'] = data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3)
if 'PlotShape' in data.columns:
    data['PlotShape'] = data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4)

numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed = data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed[skewed.abs() > 0.7].index.tolist()
for col in skewed_feats:
    if data[col].nunique() > 2:
        data[col] = np.log1p(data[col].clip(lower=0))

# One-hot encode
data = pd.get_dummies(data, drop_first=True)

# ====================
# SPLIT BACK
# ====================
X = data.iloc[:len(train), :].copy()
X_test = data.iloc[len(train):, :].copy()
X, X_test = X.align(X_test, join='inner', axis=1)
print(f"Final X shape: {X.shape}, X_test shape: {X_test.shape}")

# ====================
# PIPELINE: VarianceThreshold -> StandardScaler -> Ridge (MAP)
# ====================
print("Setting up Ridge (MAP) pipeline and GridSearchCV...")

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

pipeline = Pipeline([
    ('var_thresh', VarianceThreshold(threshold=1e-5)),
    ('scale', StandardScaler()),
    ('reg', Ridge())   # MAP estimator for linear regression with Gaussian prior
])

# Grid: alpha corresponds to prior strength (bigger alpha => stronger shrinkage)
param_grid = {
    'reg__alpha': np.logspace(-4, 3, 20),        # from 1e-4 to 1e3
    'reg__fit_intercept': [True, False],
    'reg__solver': ['auto', 'svd', 'cholesky']   # stable solvers
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',   # on log1p(target)
    cv=kf,
    n_jobs=-1,
    verbose=2,
    refit=True
)

print("Running GridSearchCV for MAP (Ridge) — tuning alpha (prior strength) and solver...")
grid.fit(X, y)

best_rmse = -grid.best_score_
print("Best CV RMSE (log-target):", best_rmse)
print("Best params (MAP via Ridge):", grid.best_params_)

# Use best model to predict
best_model = grid.best_estimator_
log_preds = best_model.predict(X_test)
preds = np.expm1(log_preds)
preds[preds < 0] = 0.0

# Save submission
submission = pd.DataFrame({'Id': test_ids, 'HotelValue': preds})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Submission saved to {OUTPUT_PATH}")
print(submission.head())


Train shape: (1200, 81), Test shape: (260, 80)
Removed 2 outliers.
Performing feature engineering...
Handling missing values...
Encoding categorical features and transforming skewness...
Final X shape: (1198, 235), X_test shape: (260, 235)
Setting up Ridge (MAP) pipeline and GridSearchCV...
Running GridSearchCV for MAP (Ridge) — tuning alpha (prior strength) and solver...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best CV RMSE (log-target): 0.11967611377762848
Best params (MAP via Ridge): {'reg__alpha': np.float64(428.1332398719387), 'reg__fit_intercept': True, 'reg__solver': 'auto'}
✅ Submission saved to submission.csv
     Id     HotelValue
0   893  145320.034031
1  1106  317968.522646
2   414  103249.406448
3   523  167375.626388
4  1037  311081.887484


MAP Bayesian Regression

In [None]:
# bayesian_conjugate_hotel_value.py
# Bayesian linear regression with conjugate Gaussian prior (closed-form posterior)
# Preserves your preprocessing + GridSearchCV for hyperparameter tuning.

import warnings
warnings.filterwarnings('ignore')

import time
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# ====================
# Custom estimator: Conjugate Bayesian Linear Regressor (scikit-learn style)
# ====================
class ConjugateBayesRegressor(BaseEstimator, RegressorMixin):
    """
    Bayesian linear regression with conjugate Gaussian prior:
      prior: w ~ N(0, alpha^{-1} I)
      noise: y = X w + eps, eps ~ N(0, sigma2 I)
    Fits closed-form posterior mean (and optionally posterior covariance).
    Params (for GridSearchCV): alpha_prior (prior precision), sigma2 (noise variance),
    fit_intercept (bool).
    """
    def __init__(self, alpha_prior=1.0, sigma2=1.0, fit_intercept=True, jitter=1e-8):
        self.alpha_prior = alpha_prior
        self.sigma2 = sigma2
        self.fit_intercept = fit_intercept
        self.jitter = jitter  # numerical stability
        # attributes set in fit:
        self.coef_ = None
        self.intercept_ = 0.0
        self.posterior_cov_ = None

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # Handle intercept by centering data if requested
        if self.fit_intercept:
            self.X_mean_ = X.mean(axis=0)
            self.y_mean_ = y.mean()
            Xc = X - self.X_mean_
            yc = y - self.y_mean_
        else:
            self.X_mean_ = np.zeros(X.shape[1], dtype=float)
            self.y_mean_ = 0.0
            Xc = X
            yc = y

        # Compute sufficient statistics
        XtX = Xc.T.dot(Xc)           # shape (p, p)
        Xty = Xc.T.dot(yc)           # shape (p,)

        # posterior precision: (XtX / sigma2) + alpha_prior * I
        p = XtX.shape[0]
        precision = XtX / max(self.sigma2, 1e-12)
        precision.flat[::p+1] += self.alpha_prior  # add alpha_prior to diagonal

        # add jitter for numerical stability
        precision += np.eye(p) * self.jitter

        # posterior mean = precision^{-1} * (X^T y / sigma2)
        rhs = Xty / max(self.sigma2, 1e-12)
        # solve precision * coef = rhs
        coef = np.linalg.solve(precision, rhs)

        # optionally compute posterior covariance (could be large)
        # posterior_cov = precision^{-1}
        try:
            # compute inverse via solve for identity where cost is acceptable
            posterior_cov = np.linalg.inv(precision)
        except np.linalg.LinAlgError:
            posterior_cov = None

        # set attributes
        self.coef_ = coef
        self.posterior_cov_ = posterior_cov
        if self.fit_intercept:
            self.intercept_ = self.y_mean_ - self.X_mean_.dot(self.coef_)
        else:
            self.intercept_ = 0.0

        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return X.dot(self.coef_) + self.intercept_

    # scikit-learn compatibility (optional)
    def get_params(self, deep=True):
        return {"alpha_prior": self.alpha_prior, "sigma2": self.sigma2, "fit_intercept": self.fit_intercept, "jitter": self.jitter}

    def set_params(self, **params):
        for k, v in params.items():
            setattr(self, k, v)
        return self

# ====================
# CONFIG
# ====================
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

TRAIN_PATH = '/content/train.csv'
TEST_PATH  = '/content/test.csv'
SUB_PATH   = '/content/sample_submission.csv'
OUTPUT_PATH = 'submission.csv'

# ====================
# LOAD DATA
# ====================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SUB_PATH)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# remove outliers as you did before
outlier_idx = train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index
if len(outlier_idx) > 0:
    train = train.drop(outlier_idx).reset_index(drop=True)
    print(f"Removed {len(outlier_idx)} outliers.")

# target & ids
y = np.log1p(train['HotelValue'])
test_ids = test['Id']

train.drop(['HotelValue', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# ====================
# COMBINE FOR UNIFIED PREPROCESSING
# ====================
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ====================
# FEATURE ENGINEERING (keeps your transforms)
# ====================
print("Performing feature engineering...")

if 'PropertyClass' in data.columns:
    data['PropertyClass'] = data['PropertyClass'].astype(str)

# Area-based features
for col in ['BasementTotalSF', 'GroundFloorArea', 'UpperFloorArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalSF'] = data['BasementTotalSF'].fillna(0) + data['GroundFloorArea'].fillna(0) + data['UpperFloorArea'].fillna(0)

# Baths
for col in ['FullBaths', 'HalfBaths', 'BasementFullBaths', 'BasementHalfBaths']:
    if col not in data.columns:
        data[col] = 0
data['TotalBath'] = data['FullBaths'].fillna(0) + 0.5 * data['HalfBaths'].fillna(0) + data['BasementFullBaths'].fillna(0) + 0.5 * data['BasementHalfBaths'].fillna(0)

# Porch
for col in ['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalPorchSF'] = data['OpenVerandaArea'].fillna(0) + data['EnclosedVerandaArea'].fillna(0) + data['SeasonalPorchArea'].fillna(0) + data['ScreenPorchArea'].fillna(0)

# Age features
for col in ['YearSold', 'ConstructionYear', 'RenovationYear']:
    if col not in data.columns:
        data[col] = data.get('YearSold', 0)
data['HotelAge'] = data['YearSold'].fillna(0) - data['ConstructionYear'].fillna(data['YearSold'].fillna(0))
data['RemodAge'] = data['YearSold'].fillna(0) - data['RenovationYear'].fillna(data['YearSold'].fillna(0))

# Binary flags
data['WasRemodeled'] = (data.get('RenovationYear', data['ConstructionYear']) != data.get('ConstructionYear', data['RenovationYear'])).astype(int)
data['IsNew'] = (data['ConstructionYear'] == data['YearSold']).astype(int) if ('ConstructionYear' in data.columns and 'YearSold' in data.columns) else 0
data['HasPool'] = (data['SwimmingPoolArea'] > 0).astype(int) if 'SwimmingPoolArea' in data.columns else 0
data['HasGarage'] = (data['ParkingArea'] > 0).astype(int) if 'ParkingArea' in data.columns else 0
data['HasBasement'] = (data['BasementTotalSF'] > 0).astype(int) if 'BasementTotalSF' in data.columns else 0
data['HasLounge'] = (data['Lounges'] > 0).astype(int) if 'Lounges' in data.columns else 0

# Polynomial / interaction terms
if 'OverallQuality' not in data.columns:
    data['OverallQuality'] = 0
data['OverallQuality_sq'] = data['OverallQuality']**2
data['OverallQuality_cub'] = data['OverallQuality']**3
data['OverallQuality_x_TotalSF'] = data['OverallQuality'] * data['TotalSF']
data['OverallQuality_x_HotelAge'] = data['OverallQuality'] * data['HotelAge']

# Drop weak columns if present
drop_cols = [
    'ServiceLaneType', 'FacadeType', 'BoundaryFence', 'ExtraFacility',
    'UtilityAccess', 'NearbyTransport1', 'NearbyTransport2'
]
data.drop(columns=[c for c in drop_cols if c in data.columns], inplace=True)

# ====================
# MISSING VALUE HANDLING
# ====================
print("Handling missing values...")
none_cols = [
    'PoolQuality', 'BasementHeight', 'BasementCondition', 'BasementExposure',
    'BasementFacilityType1', 'BasementFacilityType2', 'ParkingType',
    'ParkingFinish', 'ParkingQuality', 'ParkingCondition', 'LoungeQuality'
]
for col in none_cols:
    if col in data.columns:
        data[col] = data[col].fillna('None')

mode_cols = ['KitchenQuality', 'PropertyFunctionality', 'ZoningCategory']
for col in mode_cols:
    if col in data.columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].mode().iloc[0])

zero_cols = [
    'BasementFacilitySF1', 'BasementFacilitySF2', 'BasementUnfinishedSF',
    'BasementTotalSF', 'BasementFullBaths', 'BasementHalfBaths',
    'FacadeArea', 'ParkingArea', 'ParkingCapacity'
]
for col in zero_cols:
    if col in data.columns:
        data[col] = data[col].fillna(0)

if 'RoadAccessLength' in data.columns and 'District' in data.columns:
    data['RoadAccessLength'] = data.groupby('District')['RoadAccessLength'].transform(lambda x: x.fillna(x.median()))
    data['RoadAccessLength'].fillna(data['RoadAccessLength'].median(), inplace=True)

# Catch-all
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns
data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('None')

# ====================
# ENCODING & SKEW TRANSFORMS
# ====================
print("Encoding categorical features and transforming skewness...")
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExteriorQuality', 'ExteriorCondition', 'HeatingQuality', 'KitchenQuality',
            'LoungeQuality', 'BasementHeight', 'BasementCondition']:
    if col in data.columns:
        data[col] = data[col].map(qual_map).fillna(0)

if 'CentralAC' in data.columns:
    data['CentralAC'] = data['CentralAC'].map({'Y':1, 'N':0}).fillna(0).astype(int)
if 'LandSlope' in data.columns:
    data['LandSlope'] = data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3)
if 'PlotShape' in data.columns:
    data['PlotShape'] = data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4)

numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed = data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed[skewed.abs() > 0.7].index.tolist()
for col in skewed_feats:
    if data[col].nunique() > 2:
        data[col] = np.log1p(data[col].clip(lower=0))

# One-hot encode remaining categoricals
data = pd.get_dummies(data, drop_first=True)

# ====================
# FINAL SPLIT
# ====================
X = data.iloc[:len(train), :].copy()
X_test = data.iloc[len(train):, :].copy()
# Align columns
X, X_test = X.align(X_test, join='inner', axis=1)
print(f"Final X shape: {X.shape}, X_test shape: {X_test.shape}")

# ====================
# PIPELINE and GRIDSEARCH (tune conjugate prior hyperparameters)
# ====================
print("Setting up pipeline and GridSearchCV for conjugate-prior Bayesian regression...")

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

pipeline = Pipeline([
    ('var_thresh', VarianceThreshold(threshold=1e-5)),
    ('scale', StandardScaler()),
    ('reg', ConjugateBayesRegressor())   # our custom estimator
])

# Grid: tune prior precision (alpha_prior), noise variance (sigma2), and fit_intercept.
param_grid = {
    'reg__alpha_prior': [1e-6, 1e-4, 1e-2, 1e-1, 1.0, 10.0, 100.0],
    'reg__sigma2':      [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
    'reg__fit_intercept':[True, False]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
    refit=True
)

print("Running GridSearchCV (this will tune conjugate prior hyperparameters)...")
start_time = time.time()
grid.fit(X, y)
elapsed = time.time() - start_time

best_rmse = -grid.best_score_
print(f"GridSearchCV finished in {elapsed:.1f}s")
print("Best CV RMSE (log-target):", best_rmse)
print("Best params (conjugate-prior bayes):", grid.best_params_)

# final model and predictions
best_model = grid.best_estimator_
log_preds = best_model.predict(X_test)
preds = np.expm1(log_preds)
preds[preds < 0] = 0.0

# Save submission
submission = pd.DataFrame({'Id': test_ids, 'HotelValue': preds})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Submission saved to {OUTPUT_PATH}")
print(submission.head())


Train shape: (1200, 81), Test shape: (260, 80)
Removed 2 outliers.
Performing feature engineering...
Handling missing values...
Encoding categorical features and transforming skewness...
Final X shape: (1198, 235), X_test shape: (260, 235)
Setting up pipeline and GridSearchCV for conjugate-prior Bayesian regression...
Running GridSearchCV (this will tune conjugate prior hyperparameters)...
Fitting 5 folds for each of 84 candidates, totalling 420 fits
GridSearchCV finished in 10.7s
Best CV RMSE (log-target): 0.12144326406040104
Best params (conjugate-prior bayes): {'reg__alpha_prior': 1.0, 'reg__fit_intercept': True, 'reg__sigma2': 100.0}
✅ Submission saved to submission.csv
     Id     HotelValue
0   893  145911.180669
1  1106  326261.781096
2   414  104342.323069
3   523  171838.562488
4  1037  311483.314575


Decision Tree Classifier

In [None]:
# decision_tree_only_hotel_value.py
# Uses scikit-learn DecisionTreeRegressor + GridSearchCV
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer
from math import sqrt

# ====================
# CONFIG
# ====================
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

TRAIN_PATH = '/content/train.csv'
TEST_PATH  = '/content/test.csv'
SUB_PATH   = '/content/sample_submission.csv'
OUTPUT_PATH = 'submission.csv'

# ====================
# LOAD
# ====================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SUB_PATH)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Remove outliers (same logic you used)
outlier_idx = train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index
if len(outlier_idx) > 0:
    train = train.drop(outlier_idx).reset_index(drop=True)
    print(f"Removed {len(outlier_idx)} outliers.")

# Target: log-transform as before (tree can handle either, but we keep consistency)
y = np.log1p(train['HotelValue'])
test_ids = test['Id']

train.drop(['HotelValue', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# ====================
# COMBINE FOR PROCESSING
# ====================
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ====================
# FEATURE ENGINEERING (preserved)
# ====================
print("Performing feature engineering...")
if 'PropertyClass' in data.columns:
    data['PropertyClass'] = data['PropertyClass'].astype(str)

# Area-based
for col in ['BasementTotalSF', 'GroundFloorArea', 'UpperFloorArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalSF'] = data['BasementTotalSF'].fillna(0) + data['GroundFloorArea'].fillna(0) + data['UpperFloorArea'].fillna(0)

# Baths
for col in ['FullBaths', 'HalfBaths', 'BasementFullBaths', 'BasementHalfBaths']:
    if col not in data.columns:
        data[col] = 0
data['TotalBath'] = data['FullBaths'].fillna(0) + 0.5*data['HalfBaths'].fillna(0) + data['BasementFullBaths'].fillna(0) + 0.5*data['BasementHalfBaths'].fillna(0)

# Porch
for col in ['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalPorchSF'] = data['OpenVerandaArea'].fillna(0) + data['EnclosedVerandaArea'].fillna(0) + data['SeasonalPorchArea'].fillna(0) + data['ScreenPorchArea'].fillna(0)

# Age features
for col in ['YearSold','ConstructionYear','RenovationYear']:
    if col not in data.columns:
        data[col] = data['YearSold'] if 'YearSold' in data.columns else 0
data['HotelAge'] = data['YearSold'].fillna(0) - data['ConstructionYear'].fillna(data['YearSold'].fillna(0))
data['RemodAge'] = data['YearSold'].fillna(0) - data['RenovationYear'].fillna(data['YearSold'].fillna(0))

# Binary flags
data['WasRemodeled'] = (data.get('RenovationYear', data['ConstructionYear']) != data.get('ConstructionYear', data['RenovationYear'])).astype(int)
data['IsNew'] = (data['ConstructionYear'] == data['YearSold']).astype(int) if ('ConstructionYear' in data.columns and 'YearSold' in data.columns) else 0
data['HasPool'] = (data['SwimmingPoolArea'] > 0).astype(int) if 'SwimmingPoolArea' in data.columns else 0
data['HasGarage'] = (data['ParkingArea'] > 0).astype(int) if 'ParkingArea' in data.columns else 0
data['HasBasement'] = (data['BasementTotalSF'] > 0).astype(int) if 'BasementTotalSF' in data.columns else 0
data['HasLounge'] = (data['Lounges'] > 0).astype(int) if 'Lounges' in data.columns else 0

# Polynomial / interactions
if 'OverallQuality' not in data.columns:
    data['OverallQuality'] = 0
data['OverallQuality_sq'] = data['OverallQuality']**2
data['OverallQuality_cub'] = data['OverallQuality']**3
data['OverallQuality_x_TotalSF'] = data['OverallQuality'] * data['TotalSF']
data['OverallQuality_x_HotelAge'] = data['OverallQuality'] * data['HotelAge']

# Drop certain weak columns if present
drop_cols = [
    'ServiceLaneType', 'FacadeType', 'BoundaryFence', 'ExtraFacility',
    'UtilityAccess', 'NearbyTransport1', 'NearbyTransport2'
]
data.drop(columns=[c for c in drop_cols if c in data.columns], inplace=True)

# ====================
# MISSING VALUE HANDLING
# ====================
print("Handling missing values...")
none_cols = [
    'PoolQuality', 'BasementHeight', 'BasementCondition', 'BasementExposure',
    'BasementFacilityType1', 'BasementFacilityType2', 'ParkingType',
    'ParkingFinish', 'ParkingQuality', 'ParkingCondition', 'LoungeQuality'
]
for col in none_cols:
    if col in data.columns:
        data[col] = data[col].fillna('None')

mode_cols = ['KitchenQuality', 'PropertyFunctionality', 'ZoningCategory']
for col in mode_cols:
    if col in data.columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].mode().iloc[0])

zero_cols = [
    'BasementFacilitySF1', 'BasementFacilitySF2', 'BasementUnfinishedSF',
    'BasementTotalSF', 'BasementFullBaths', 'BasementHalfBaths',
    'FacadeArea', 'ParkingArea', 'ParkingCapacity'
]
for col in zero_cols:
    if col in data.columns:
        data[col] = data[col].fillna(0)

if 'RoadAccessLength' in data.columns and 'District' in data.columns:
    data['RoadAccessLength'] = data.groupby('District')['RoadAccessLength'].transform(lambda x: x.fillna(x.median()))
    data['RoadAccessLength'].fillna(data['RoadAccessLength'].median(), inplace=True)

# Catch-all fills
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns
data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('None')

# ====================
# ENCODING & SKEW TRANSFORMS
# ====================
print("Encoding categorical features and transforming skewness...")
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExteriorQuality', 'ExteriorCondition', 'HeatingQuality', 'KitchenQuality',
            'LoungeQuality', 'BasementHeight', 'BasementCondition']:
    if col in data.columns:
        data[col] = data[col].map(qual_map).fillna(0)

if 'CentralAC' in data.columns:
    data['CentralAC'] = data['CentralAC'].map({'Y':1, 'N':0}).fillna(0).astype(int)
if 'LandSlope' in data.columns:
    data['LandSlope'] = data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3)
if 'PlotShape' in data.columns:
    data['PlotShape'] = data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4)

# Log-transform skewed numerics (avoid booleans)
numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed = data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed[skewed.abs() > 0.7].index.tolist()
for col in skewed_feats:
    if data[col].nunique() > 2:
        data[col] = np.log1p(data[col].clip(lower=0))

# One-hot encode remaining categoricals
data = pd.get_dummies(data, drop_first=True)

# ====================
# SPLIT BACK
# ====================
X = data.iloc[:len(train), :].copy()
X_test = data.iloc[len(train):, :].copy()
X, X_test = X.align(X_test, join='inner', axis=1)
print(f"Final X shape: {X.shape}, X_test shape: {X_test.shape}")

# ====================
# DECISION TREE PIPELINE + GRIDSEARCH
# ====================
print("Setting up DecisionTreeRegressor pipeline and GridSearchCV...")

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

pipeline = Pipeline([
    ('var_thresh', VarianceThreshold(threshold=1e-5)),   # remove near-constant features
    ('dt', DecisionTreeRegressor(random_state=SEED))
])

# Grid for DecisionTreeRegressor
param_grid = {
    'dt__criterion': ['squared_error', 'friedman_mse'],  # 'mse' deprecated alias; sklearn uses squared_error
    'dt__max_depth': [None, 6, 10, 15, 25],
    'dt__min_samples_split': [2, 5, 10, 20],
    'dt__min_samples_leaf': [1, 2, 4, 8],
    'dt__max_features': [None, 'sqrt', 'log2', 0.3, 0.5]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # on log target
    cv=kf,
    n_jobs=-1,
    verbose=2,
    refit=True
)

print("Running GridSearchCV — this may take a while depending on grid size and CPU...")
grid.fit(X, y)

best_rmse = -grid.best_score_
print("Best CV RMSE (log-target):", best_rmse)
print("Best params:", grid.best_params_)

# Predict (GridSearchCV already refit on full train)
best_model = grid.best_estimator_
log_preds = best_model.predict(X_test)
preds = np.expm1(log_preds)
preds[preds < 0] = 0.0

# Save submission
submission = pd.DataFrame({'Id': test_ids, 'HotelValue': preds})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Submission saved to {OUTPUT_PATH}")
print(submission.head())


Train shape: (1200, 81), Test shape: (260, 80)
Removed 2 outliers.
Performing feature engineering...
Handling missing values...
Encoding categorical features and transforming skewness...
Final X shape: (1198, 235), X_test shape: (260, 235)
Setting up DecisionTreeRegressor pipeline and GridSearchCV...
Running GridSearchCV — this may take a while depending on grid size and CPU...
Fitting 5 folds for each of 800 candidates, totalling 4000 fits
Best CV RMSE (log-target): 0.17078003100334918
Best params: {'dt__criterion': 'friedman_mse', 'dt__max_depth': 10, 'dt__max_features': None, 'dt__min_samples_leaf': 8, 'dt__min_samples_split': 20}
✅ Submission saved to submission.csv
     Id     HotelValue
0   893  145925.782517
1  1106  302159.123847
2   414  123534.557386
3   523  181440.246612
4  1037  290843.688764


MLE

In [None]:
# mle_linear_regression_with_gridsearch.py
# Use MLE (LinearRegression = OLS) + GridSearchCV, preserving your preprocessing pipeline

import warnings
warnings.filterwarnings('ignore')

import time
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

# ================
# CONFIG
# ================
SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

TRAIN_PATH = '/content/train.csv'
TEST_PATH  = '/content/test.csv'
SUB_PATH   = '/content/sample_submission.csv'
OUTPUT_PATH = 'submission.csv'

# ================
# LOAD
# ================
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SUB_PATH)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Remove outliers (same rule you used)
outlier_idx = train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index
if len(outlier_idx) > 0:
    train = train.drop(outlier_idx).reset_index(drop=True)
    print(f"Removed {len(outlier_idx)} outliers.")

# Target: log transform (keep same target transform)
y = np.log1p(train['HotelValue'])
test_ids = test['Id']

train.drop(['HotelValue', 'Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# ================
# COMBINE FOR UNIFIED PREPROCESSING
# ================
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ================
# FEATURE ENGINEERING (kept identical to your prior pipeline)
# ================
print("Performing feature engineering...")

if 'PropertyClass' in data.columns:
    data['PropertyClass'] = data['PropertyClass'].astype(str)

# Area-based
for col in ['BasementTotalSF', 'GroundFloorArea', 'UpperFloorArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalSF'] = data['BasementTotalSF'].fillna(0) + data['GroundFloorArea'].fillna(0) + data['UpperFloorArea'].fillna(0)

# Baths
for col in ['FullBaths', 'HalfBaths', 'BasementFullBaths', 'BasementHalfBaths']:
    if col not in data.columns:
        data[col] = 0
data['TotalBath'] = (
    data['FullBaths'].fillna(0) + 0.5 * data['HalfBaths'].fillna(0)
    + data['BasementFullBaths'].fillna(0) + 0.5 * data['BasementHalfBaths'].fillna(0)
)

# Porches
for col in ['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea']:
    if col not in data.columns:
        data[col] = 0
data['TotalPorchSF'] = (
    data['OpenVerandaArea'].fillna(0) + data['EnclosedVerandaArea'].fillna(0)
    + data['SeasonalPorchArea'].fillna(0) + data['ScreenPorchArea'].fillna(0)
)

# Age features
for col in ['YearSold','ConstructionYear','RenovationYear']:
    if col not in data.columns:
        data[col] = data.get('YearSold', 0)
data['HotelAge'] = data['YearSold'].fillna(0) - data['ConstructionYear'].fillna(data['YearSold'].fillna(0))
data['RemodAge'] = data['YearSold'].fillna(0) - data['RenovationYear'].fillna(data['YearSold'].fillna(0))

# Binary flags
data['WasRemodeled'] = (data.get('RenovationYear', data['ConstructionYear']) != data.get('ConstructionYear', data['RenovationYear'])).astype(int)
data['IsNew'] = (data['ConstructionYear'] == data['YearSold']).astype(int) if ('ConstructionYear' in data.columns and 'YearSold' in data.columns) else 0
data['HasPool'] = (data['SwimmingPoolArea'] > 0).astype(int) if 'SwimmingPoolArea' in data.columns else 0
data['HasGarage'] = (data['ParkingArea'] > 0).astype(int) if 'ParkingArea' in data.columns else 0
data['HasBasement'] = (data['BasementTotalSF'] > 0).astype(int) if 'BasementTotalSF' in data.columns else 0
data['HasLounge'] = (data['Lounges'] > 0).astype(int) if 'Lounges' in data.columns else 0

# Polynomial & interactions
if 'OverallQuality' not in data.columns:
    data['OverallQuality'] = 0
data['OverallQuality_sq'] = data['OverallQuality']**2
data['OverallQuality_cub'] = data['OverallQuality']**3
data['OverallQuality_x_TotalSF'] = data['OverallQuality'] * data['TotalSF']
data['OverallQuality_x_HotelAge'] = data['OverallQuality'] * data['HotelAge']

# Drop weak columns if present
drop_cols = [
    'ServiceLaneType', 'FacadeType', 'BoundaryFence', 'ExtraFacility',
    'UtilityAccess', 'NearbyTransport1', 'NearbyTransport2'
]
data.drop(columns=[c for c in drop_cols if c in data.columns], inplace=True)

# ================
# MISSING VALUE HANDLING (same rules)
# ================
print("Handling missing values...")
none_cols = [
    'PoolQuality', 'BasementHeight', 'BasementCondition', 'BasementExposure',
    'BasementFacilityType1', 'BasementFacilityType2', 'ParkingType',
    'ParkingFinish', 'ParkingQuality', 'ParkingCondition', 'LoungeQuality'
]
for col in none_cols:
    if col in data.columns:
        data[col] = data[col].fillna('None')

mode_cols = ['KitchenQuality', 'PropertyFunctionality', 'ZoningCategory']
for col in mode_cols:
    if col in data.columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(data[col].mode().iloc[0])

zero_cols = [
    'BasementFacilitySF1', 'BasementFacilitySF2', 'BasementUnfinishedSF',
    'BasementTotalSF', 'BasementFullBaths', 'BasementHalfBaths',
    'FacadeArea', 'ParkingArea', 'ParkingCapacity'
]
for col in zero_cols:
    if col in data.columns:
        data[col] = data[col].fillna(0)

if 'RoadAccessLength' in data.columns and 'District' in data.columns:
    data['RoadAccessLength'] = data.groupby('District')['RoadAccessLength'].transform(lambda x: x.fillna(x.median()))
    data['RoadAccessLength'].fillna(data['RoadAccessLength'].median(), inplace=True)

# Final catch-all
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns
data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('None')

# ================
# ENCODING & TRANSFORMS
# ================
print("Encoding categorical features and transforming skewness...")
qual_map = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}
for col in ['ExteriorQuality', 'ExteriorCondition', 'HeatingQuality', 'KitchenQuality',
            'LoungeQuality', 'BasementHeight', 'BasementCondition']:
    if col in data.columns:
        data[col] = data[col].map(qual_map).fillna(0)

if 'CentralAC' in data.columns:
    data['CentralAC'] = data['CentralAC'].map({'Y':1, 'N':0}).fillna(0).astype(int)
if 'LandSlope' in data.columns:
    data['LandSlope'] = data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3)
if 'PlotShape' in data.columns:
    data['PlotShape'] = data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4)

numeric_feats = data.select_dtypes(include=[np.number]).columns
skewed = data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed[skewed.abs() > 0.7].index.tolist()
for col in skewed_feats:
    if data[col].nunique() > 2:
        data[col] = np.log1p(data[col].clip(lower=0))

# One-hot encode
data = pd.get_dummies(data, drop_first=True)

# ================
# FINAL SPLIT
# ================
X = data.iloc[:len(train), :].copy()
X_test = data.iloc[len(train):, :].copy()

# Align columns
X, X_test = X.align(X_test, join='inner', axis=1)
print(f"Final X shape: {X.shape}, X_test shape: {X_test.shape}")

# ================
# PIPELINE & GRIDSEARCH (LinearRegression = MLE)
# ================
print("Setting up LinearRegression (MLE) pipeline and GridSearchCV...")

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

pipeline = Pipeline([
    ('var_thresh', VarianceThreshold()),   # tune threshold in grid
    ('scale', StandardScaler()),
    ('pca', PCA()),                        # optional dimensionality reduction
    ('reg', LinearRegression())
])

# Grid over preprocessing choices and intercept option (OLS has no regularization hyperparameter)
param_grid = {
    'var_thresh__threshold': [0.0, 1e-5, 1e-3],
    'pca__n_components': [None, 50, 100, 200],   # None => PCA leaves all components
    'reg__fit_intercept': [True, False]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
    refit=True
)

print("Running GridSearchCV (this may take some time depending on grid size)...")
start = time.time()
grid.fit(X, y)
elapsed = time.time() - start

best_rmse = -grid.best_score_
print(f"GridSearchCV finished in {elapsed:.1f}s")
print("Best CV RMSE (log-target):", best_rmse)
print("Best params (MLE / OLS):", grid.best_params_)

# Final predictions (GridSearchCV already refit on full training data with best params)
best_model = grid.best_estimator_
log_preds = best_model.predict(X_test)
preds = np.expm1(log_preds)
preds[preds < 0] = 0.0

# Save submission
submission = pd.DataFrame({'Id': test_ids, 'HotelValue': preds})
submission.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Submission saved to {OUTPUT_PATH}")
print(submission.head())


Train shape: (1200, 81), Test shape: (260, 80)
Removed 2 outliers.
Performing feature engineering...
Handling missing values...
Encoding categorical features and transforming skewness...
Final X shape: (1198, 235), X_test shape: (260, 235)
Setting up LinearRegression (MLE) pipeline and GridSearchCV...
Running GridSearchCV (this may take some time depending on grid size)...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
GridSearchCV finished in 13.1s
Best CV RMSE (log-target): 0.12645204539751617
Best params (MLE / OLS): {'pca__n_components': 100, 'reg__fit_intercept': True, 'var_thresh__threshold': 0.0}
✅ Submission saved to submission.csv
     Id     HotelValue
0   893  147340.464456
1  1106  327968.767957
2   414   89316.296657
3   523  171097.429181
4  1037  326351.172306


YashubG:

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

# ---------------------------
# Train Random Forest Model
# ---------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

rf.fit(X, y)

# ---------------------------
# Predict on Test Set
# ---------------------------
log_preds = rf.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p if target was log-transformed
final_preds[final_preds < 0] = 0   # avoid negative values

# ---------------------------
# Save Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("RandomForest.csv", index=False)


✅ Random Forest model trained and submission.csv created successfully!
     Id     HotelValue
0   893  142248.956122
1  1106  320257.951012
2   414  113263.362772
3   523  150685.392453
4  1037  311892.583676


Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd

# ---------------------------
# Train Gradient Boosting Model
# ---------------------------
gb = GradientBoostingRegressor(
    n_estimators=500,     # number of boosting stages
    learning_rate=0.1,   # smaller = more robust
    max_depth=5,          # depth of each tree
    subsample=0.8,        # for stochastic boosting
    random_state=42
)

gb.fit(X, y)

# ---------------------------
# Predict on Test Set
# ---------------------------
log_preds = gb.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p if y was log-transformed
final_preds[final_preds < 0] = 0   # ensure no negatives

# ---------------------------
# Save Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("GradientBoost.csv", index=False)

print("✅ Gradient Boosting model trained and submission.csv created successfully!")
print(submission.head())


✅ Gradient Boosting model trained and submission.csv created successfully!
     Id     HotelValue
0   893  141322.904380
1  1106  322125.270717
2   414  103823.530234
3   523  147100.911476
4  1037  332787.447911


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# ---------------------------
# Train Linear Regression
# ---------------------------
print("Training Linear Regression model...")
lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(X, y)
print("Training complete.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p transform
final_preds[final_preds < 0] = 0   # ensure no negative values

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("linear_regression.csv", index=False)

print("✅ submission.csv created successfully!")
print(submission.head())


Training Linear Regression model...
Training complete.
✅ submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


K-fold Cross validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# K-Fold Cross Validation Setup
# ---------------------------
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# ---------------------------
# Initialize Model
# ---------------------------
lin_reg = LinearRegression(n_jobs=-1)

# ---------------------------
# Cross-validation
# ---------------------------
rmsle_scores = []

print(f"Running {N_FOLDS}-Fold Cross Validation for Linear Regression...\n")

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    print(f"Fold {fold}: RMSLE = {rmsle:.5f}")

print("\n✅ Cross-validation complete.")
print(f"Average RMSLE across {N_FOLDS} folds: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
lin_reg.fit(X, y)
print("\nTraining final model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("kFold.csv", index=False)

print("\n✅ submission.csv created successfully!")
print(submission.head())


Running 5-Fold Cross Validation for Linear Regression...

Fold 1: RMSLE = 0.12644
Fold 2: RMSLE = 0.12862
Fold 3: RMSLE = 0.14658
Fold 4: RMSLE = 0.12526
Fold 5: RMSLE = 0.12147

✅ Cross-validation complete.
Average RMSLE across 5 folds: 0.12967

Training final model on full dataset... Done.

✅ submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


LOOCV

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# Leave-One-Out CV Setup
# ---------------------------
loo = LeaveOneOut()
n_splits = loo.get_n_splits(X)

lin_reg = LinearRegression(n_jobs=-1)
rmsle_scores = []

print(f"Running Leave-One-Out Cross Validation on {n_splits} samples...\n")

# ---------------------------
# LOOCV Loop
# ---------------------------
for i, (train_idx, valid_idx) in enumerate(loo.split(X), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    if i % 100 == 0 or i == n_splits:  # print progress every 100 samples
        print(f"Processed {i}/{n_splits} samples, Current RMSLE: {rmsle:.5f}")

print("\n✅ LOOCV complete.")
print(f"Average RMSLE across all samples: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
lin_reg.fit(X, y)
print("\nTraining final Linear Regression model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("loocv.csv", index=False)

print("\n✅ submission.csv created successfully!")
print(submission.head())


Running Leave-One-Out Cross Validation on 1198 samples...

Processed 100/1198 samples, Current RMSLE: 0.00944
Processed 200/1198 samples, Current RMSLE: 0.05119
Processed 300/1198 samples, Current RMSLE: 0.01004
Processed 400/1198 samples, Current RMSLE: 0.06401
Processed 500/1198 samples, Current RMSLE: 0.17141
Processed 600/1198 samples, Current RMSLE: 0.05238
Processed 700/1198 samples, Current RMSLE: 0.02046
Processed 800/1198 samples, Current RMSLE: 0.07474
Processed 900/1198 samples, Current RMSLE: 0.03780
Processed 1000/1198 samples, Current RMSLE: 0.01810
Processed 1100/1198 samples, Current RMSLE: 0.17214
Processed 1198/1198 samples, Current RMSLE: 0.13056

✅ LOOCV complete.
Average RMSLE across all samples: 0.08278

Training final Linear Regression model on full dataset... Done.

✅ submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


Ridge regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# Leave-One-Out CV Setup
# ---------------------------
loo = LeaveOneOut()
n_splits = loo.get_n_splits(X)

ridge = Ridge(alpha=1.0, random_state=42)  # You can tune alpha later
rmsle_scores = []

print(f"Running Leave-One-Out Cross Validation for Ridge Regression on {n_splits} samples...\n")

# ---------------------------
# LOOCV Loop
# ---------------------------
for i, (train_idx, valid_idx) in enumerate(loo.split(X), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    if i % 100 == 0 or i == n_splits:  # Print progress every 100 samples
        print(f"Processed {i}/{n_splits} samples, Current RMSLE: {rmsle:.5f}")

print("\n✅ LOOCV complete.")
print(f"Average RMSLE across all samples: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
ridge.fit(X, y)
print("\nTraining final Ridge Regression model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = ridge.predict(X_test_final)
final_preds = np.expm1(log_preds)  # Reverse log1p transform
final_preds[final_preds < 0] = 0   # Ensure no negatives

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("ridgeRegression.csv", index=False)

print("\n✅ submission.csv created successfully!")
print(submission.head())


Running Leave-One-Out Cross Validation for Ridge Regression on 1198 samples...

Processed 100/1198 samples, Current RMSLE: 0.00124
Processed 200/1198 samples, Current RMSLE: 0.06196
Processed 300/1198 samples, Current RMSLE: 0.00458
Processed 400/1198 samples, Current RMSLE: 0.07191
Processed 500/1198 samples, Current RMSLE: 0.13652
Processed 600/1198 samples, Current RMSLE: 0.04300
Processed 700/1198 samples, Current RMSLE: 0.02222
Processed 800/1198 samples, Current RMSLE: 0.13733
Processed 900/1198 samples, Current RMSLE: 0.04268
Processed 1000/1198 samples, Current RMSLE: 0.00954
Processed 1100/1198 samples, Current RMSLE: 0.19361
Processed 1198/1198 samples, Current RMSLE: 0.11449

✅ LOOCV complete.
Average RMSLE across all samples: 0.07832

Training final Ridge Regression model on full dataset... Done.

✅ submission.csv created successfully!
     Id     HotelValue
0   893  147219.487747
1  1106  331851.841627
2   414  104549.424486
3   523  166146.445081
4  1037  311794.030132


In [None]:
Lasso Regression

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.exceptions import ConvergenceWarning
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# Lasso model
lasso = Lasso(alpha=0.001, max_iter=50000, random_state=42)

lasso.fit(X_scaled, y)
print("✅ Model trained successfully (no convergence warnings).")

# Predictions
log_preds = lasso.predict(X_test_scaled)
final_preds = np.expm1(log_preds)
final_preds[final_preds < 0] = 0

submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created successfully!")


✅ submission.csv created successfully!


Elastic nets

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ================================
# Scale features (important!)
# ================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ================================
# Elastic Net with Cross-Validation
# ================================
elastic_cv = ElasticNetCV(
    l1_ratio=[.1, .3, .5, .7, .9, .95, 1],
    alphas=np.logspace(-4, 1, 50),
    cv=5,
    max_iter=100000,
    n_jobs=-1,
    random_state=42
)

print("Training Elastic Net with CV...")
elastic_cv.fit(X_scaled, y)
print("Training complete.")
print(f"Best alpha: {elastic_cv.alpha_:.6f}")
print(f"Best l1_ratio: {elastic_cv.l1_ratio_:.2f}")

# ================================
# Predict on Test Data
# ================================
log_preds = elastic_cv.predict(X_test_scaled)
final_preds = np.expm1(log_preds)
final_preds[final_preds < 0] = 0

# ================================
# Create Submission File
# ================================
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created successfully!")
print(submission.head())


Training Elastic Net with CV...
Training complete.
Best alpha: 0.028118
Best l1_ratio: 0.10
✅ submission.csv created successfully!
     Id     HotelValue
0   893  147540.826955
1  1106  317283.985683
2   414  105870.135758
3   523  159955.124294
4  1037  304075.633259


Bayesian Approach + Conjugate Priors

In [None]:
import numpy as np
import pandas as pd

# ---------------------------
# Add intercept term
# ---------------------------
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_test_aug = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])
n, p = X_aug.shape

# ---------------------------
# Prior hyperparameters
# ---------------------------
mu_0 = np.zeros(p)             # prior mean of coefficients
Lambda_0 = np.eye(p) * 1e-6    # prior precision (tiny, almost uninformative)
a_0 = 1e-6                     # prior shape for sigma^2
b_0 = 1e-6                     # prior scale for sigma^2

# ---------------------------
# Posterior for coefficients (beta | sigma^2, y)
# ---------------------------
# Posterior precision and mean
Lambda_n = Lambda_0 + X_aug.T @ X_aug
mu_n = np.linalg.solve(Lambda_n, Lambda_0 @ mu_0 + X_aug.T @ y)

# Posterior parameters for sigma^2
a_n = a_0 + n / 2
residuals = y - X_aug @ mu_n
b_n = b_0 + 0.5 * (residuals.T @ residuals + (mu_n - mu_0).T @ Lambda_0 @ (mu_n - mu_0))

# Posterior mean of sigma^2
sigma2_post = b_n / (a_n - 1)

# Posterior predictive mean for test set
y_pred_test = X_test_aug @ mu_n

# ---------------------------
# Reverse log-transform
# ---------------------------
final_preds = np.expm1(y_pred_test)
final_preds[final_preds < 0] = 0

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("✅ Bayesian regression submission created successfully!")
print(submission.head())


✅ Bayesian regression submission created successfully!
     Id     HotelValue
0   893  147395.155681
1  1106  328934.870491
2   414  105309.390046
3   523  165803.205292
4  1037  311199.689455


Bayesian + MAP estimate

In [None]:
import numpy as np
import pandas as pd

# ---------------------------
# Add intercept term
# ---------------------------
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_test_aug = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])
n, p = X_aug.shape

# ---------------------------
# MAP / Ridge parameters
# ---------------------------
tau2 = 1.0       # prior variance for coefficients
sigma2 = 1.0     # assumed noise variance
lambda_ = sigma2 / tau2  # regularization strength

# ---------------------------
# Compute MAP estimate
# ---------------------------
beta_map = np.linalg.solve(X_aug.T @ X_aug + lambda_ * np.eye(p), X_aug.T @ y)

# ---------------------------
# Predict on test data
# ---------------------------
y_pred_test = X_test_aug @ beta_map

# Reverse log-transform
final_preds = np.expm1(y_pred_test)
final_preds[final_preds < 0] = 0

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("✅ Bayesian MAP regression submission created successfully!")
print(submission.head())


✅ Bayesian MAP regression submission created successfully!
     Id     HotelValue
0   893  146199.896460
1  1106  333192.071502
2   414  105398.256957
3   523  165257.086300
4  1037  312337.850589


K nearest neighbours

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# ---------------------------
# Scale features (important for KNN)
# ---------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ---------------------------
# KNN Model with Grid Search
# ---------------------------
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

knn = KNeighborsRegressor()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_log_error', n_jobs=-1)
print("🔧 Performing Grid Search for KNN...")
grid.fit(X_scaled, y)

print("✅ Grid Search complete!")
print(f"Best parameters: {grid.best_params_}")

# ---------------------------
# Train final KNN with best params
# ---------------------------
knn_best = grid.best_estimator_
knn_best.fit(X_scaled, y)

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = knn_best.predict(X_test_scaled)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0   # ensure no negative values

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created successfully!")
print(submission.head())


🔧 Performing Grid Search for KNN...
✅ Grid Search complete!
Best parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
✅ submission.csv created successfully!
     Id     HotelValue
0   893  129493.630228
1  1106  272446.984455
2   414  101369.955521
3   523  139889.126281
4  1037  334962.130438
