<a href="https://colab.research.google.com/github/YashubG/ML-Project/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import the datasets

In [5]:
test = pd.read_csv('/content/ML_project/test.csv')
train = pd.read_csv('/content/ML_project/train.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/ML_project/test.csv'

Preprocessing

In [None]:
train.drop((train[(train['UsableArea'] > 4000) & (train['HotelValue'] < 300000)].index),
           inplace=True)  # Remove obvious outliers
train.reset_index(drop=True, inplace=True)

y = np.log1p(train['HotelValue'])    # Log-transform target for symmetry
train_ids = train['Id']
test_ids  = test['Id']

# Drop ID and target from features
train.drop(['Id','HotelValue'], axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# ---------------------------
# Combine for Preprocessing
# ---------------------------
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)

# ---------------------------
# Drop Sparse/Irrelevant Features
# ---------------------------
# Remove columns with mostly missing or poor information
drop_cols = [
    'ServiceLaneType','FacadeType','PoolQuality','BoundaryFence','ExtraFacility',
    'PlotConfiguration','NearbyTransport1','NearbyTransport2','UtilityAccess'
]
all_data.drop(columns=[c for c in drop_cols if c in all_data.columns], inplace=True)

# ---------------------------
# Feature Engineering
# ---------------------------
# Convert some numeric categories to strings (for one-hot later)
all_data['PropertyClass'] = all_data['PropertyClass'].astype(str)

# Area and Room features
all_data['TotalSF'] = (
    all_data['BasementTotalSF'] +
    all_data['GroundFloorArea'] +
    all_data['UpperFloorArea']
)
all_data['TotalBath'] = (
    all_data['FullBaths'] +
    0.5 * all_data['HalfBaths'] +
    all_data['BasementFullBaths'] +
    0.5 * all_data['BasementHalfBaths']
)
all_data['TotalPorchSF'] = (
    all_data['OpenVerandaArea'] +
    all_data['EnclosedVerandaArea'] +
    all_data['SeasonalPorchArea'] +
    all_data['ScreenPorchArea']
)
# Age and renovation features
all_data['HotelAge'] = all_data['YearSold'] - all_data['ConstructionYear']
all_data['RemodAge'] = all_data['YearSold'] - all_data['RenovationYear']
all_data['WasRemodeled'] = (all_data['RemodAge'] > 0).astype(int)
all_data['IsNew'] = (all_data['YearSold'] == all_data['ConstructionYear']).astype(int)
# Flags for amenities
all_data['HasPool'] = (all_data['SwimmingPoolArea'] > 0).astype(int)
all_data['HasGarage'] = (all_data['ParkingArea'] > 0).astype(int)
all_data['HasBasement'] = (all_data['BasementTotalSF'] > 0).astype(int)
all_data['HasLounge'] = (all_data['Lounges'] > 0).astype(int)
# Polynomial / interaction features
all_data['OverallQuality_sq'] = all_data['OverallQuality']**2
all_data['OverallQuality_cub'] = all_data['OverallQuality']**3
all_data['OverallQuality_x_TotalSF']  = all_data['OverallQuality'] * all_data['TotalSF']
all_data['OverallQuality_x_HotelAge'] = all_data['OverallQuality'] * all_data['HotelAge']
# New ratio features
all_data['BuiltPct']      = all_data['TotalSF'] / (all_data['LandArea'] + 1)
all_data['Area_per_Room'] = all_data['UsableArea'] / (all_data['TotalRooms'] + 1)
all_data['Baths_to_Rooms'] = all_data['TotalBath'] / (all_data['TotalRooms'] + 1)
all_data['BasementRatio']  = all_data['BasementTotalSF'] / (all_data['TotalSF'] + 1)

# ---------------------------
# Drop Redundant Originals (after creating features)
# ---------------------------
drop_orig = [
    'OpenVerandaArea','EnclosedVerandaArea','SeasonalPorchArea','ScreenPorchArea',
    'LowQualityArea','FacadeArea','BasementFacilitySF2'
]
for col in drop_orig:
    if col in all_data.columns:
        all_data.drop(col, axis=1, inplace=True)

# ---------------------------
# Imputation for Remaining Missing Data
# ---------------------------
# Many missing in RoadAccessLength; fill by District median
if 'RoadAccessLength' in all_data.columns:
    all_data['RoadAccessLength'] = all_data.groupby('District')['RoadAccessLength']\
                                          .transform(lambda x: x.fillna(x.median()))
    # --- FIX 1 (No inplace=True) ---
    all_data['RoadAccessLength'] = all_data['RoadAccessLength'].fillna(all_data['RoadAccessLength'].median())

# Fill small gaps with zeros or modes
if 'FacadeArea' in all_data.columns:
    all_data['FacadeArea'].fillna(0, inplace=True) # This one is fine, not a chained assignment

if 'ElectricalSystem' in all_data.columns:
    # --- FIX 2 (No inplace=True) ---
    all_data['ElectricalSystem'] = all_data['ElectricalSystem'].fillna(all_data['ElectricalSystem'].mode()[0])

# Any remaining numeric NaNs
num_cols = all_data.select_dtypes(include=[np.number]).columns
cat_cols = all_data.select_dtypes(exclude=[np.number]).columns
all_data[num_cols] = all_data[num_cols].fillna(0)
all_data[cat_cols] = all_data[cat_cols].fillna('None')

# ---------------------------
# Ordinal Encoding for Quality Features
# ---------------------------
qual_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'None':0}
for col in ['ExteriorQuality','ExteriorCondition','HeatingQuality','KitchenQuality']:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(qual_map).astype(int)
# Categorical mappings
if 'LandSlope' in all_data.columns:
    all_data['LandSlope'] = all_data['LandSlope'].map({'Gtl':3,'Mod':2,'Sev':1}).fillna(3).astype(int)
if 'PlotShape' in all_data.columns:
    all_data['PlotShape'] = all_data['PlotShape'].map({'Reg':4,'IR1':3,'IR2':2,'IR3':1}).fillna(4).astype(int)
# Binary encoding
all_data['CentralAC'] = all_data['CentralAC'].map({'Y':1,'N':0}).astype(int)

# ---------------------------
# Log-transform Highly Skewed Numerics
# ---------------------------
for col in ['ExtraFacilityValue','LandArea','BasementHalfBaths']:
    if col in all_data.columns:
        all_data[col] = np.log1p(all_data[col])

# Drop SwimmingPoolArea due to extreme skew & rarity (we have HasPool flag)
if 'SwimmingPoolArea' in all_data.columns:
    all_data.drop('SwimmingPoolArea', axis=1, inplace=True)

# ---------------------------
# One-Hot Encoding
# ---------------------------
all_data = pd.get_dummies(all_data, drop_first=True)
print("Final feature matrix shape:", all_data.shape)

# ---------------------------
# Train/Test Split for Modeling
# ---------------------------
X = all_data.iloc[:train.shape[0], :].values
X_test_final = all_data.iloc[train.shape[0]:, :].values


Final feature matrix shape: (1458, 233)


Test Models

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

# ---------------------------
# Train Random Forest Model
# ---------------------------
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

rf.fit(X, y)

# ---------------------------
# Predict on Test Set
# ---------------------------
log_preds = rf.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p if target was log-transformed
final_preds[final_preds < 0] = 0   # avoid negative values

# ---------------------------
# Save Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("RandomForest.csv", index=False)


âœ… Random Forest model trained and submission.csv created successfully!
     Id     HotelValue
0   893  142248.956122
1  1106  320257.951012
2   414  113263.362772
3   523  150685.392453
4  1037  311892.583676


Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd

# ---------------------------
# Train Gradient Boosting Model
# ---------------------------
gb = GradientBoostingRegressor(
    n_estimators=500,     # number of boosting stages
    learning_rate=0.1,   # smaller = more robust
    max_depth=5,          # depth of each tree
    subsample=0.8,        # for stochastic boosting
    random_state=42
)

gb.fit(X, y)

# ---------------------------
# Predict on Test Set
# ---------------------------
log_preds = gb.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p if y was log-transformed
final_preds[final_preds < 0] = 0   # ensure no negatives

# ---------------------------
# Save Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("GradientBoost.csv", index=False)

print("âœ… Gradient Boosting model trained and submission.csv created successfully!")
print(submission.head())


âœ… Gradient Boosting model trained and submission.csv created successfully!
     Id     HotelValue
0   893  141322.904380
1  1106  322125.270717
2   414  103823.530234
3   523  147100.911476
4  1037  332787.447911


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# ---------------------------
# Train Linear Regression
# ---------------------------
print("Training Linear Regression model...")
lin_reg = LinearRegression(n_jobs=-1)
lin_reg.fit(X, y)
print("Training complete.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p transform
final_preds[final_preds < 0] = 0   # ensure no negative values

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("linear_regression.csv", index=False)

print("âœ… submission.csv created successfully!")
print(submission.head())


Training Linear Regression model...
Training complete.
âœ… submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


K-fold Cross validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# K-Fold Cross Validation Setup
# ---------------------------
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# ---------------------------
# Initialize Model
# ---------------------------
lin_reg = LinearRegression(n_jobs=-1)

# ---------------------------
# Cross-validation
# ---------------------------
rmsle_scores = []

print(f"Running {N_FOLDS}-Fold Cross Validation for Linear Regression...\n")

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    print(f"Fold {fold}: RMSLE = {rmsle:.5f}")

print("\nâœ… Cross-validation complete.")
print(f"Average RMSLE across {N_FOLDS} folds: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
lin_reg.fit(X, y)
print("\nTraining final model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("kFold.csv", index=False)

print("\nâœ… submission.csv created successfully!")
print(submission.head())


Running 5-Fold Cross Validation for Linear Regression...

Fold 1: RMSLE = 0.12644
Fold 2: RMSLE = 0.12862
Fold 3: RMSLE = 0.14658
Fold 4: RMSLE = 0.12526
Fold 5: RMSLE = 0.12147

âœ… Cross-validation complete.
Average RMSLE across 5 folds: 0.12967

Training final model on full dataset... Done.

âœ… submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


LOOCV

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# Leave-One-Out CV Setup
# ---------------------------
loo = LeaveOneOut()
n_splits = loo.get_n_splits(X)

lin_reg = LinearRegression(n_jobs=-1)
rmsle_scores = []

print(f"Running Leave-One-Out Cross Validation on {n_splits} samples...\n")

# ---------------------------
# LOOCV Loop
# ---------------------------
for i, (train_idx, valid_idx) in enumerate(loo.split(X), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    if i % 100 == 0 or i == n_splits:  # print progress every 100 samples
        print(f"Processed {i}/{n_splits} samples, Current RMSLE: {rmsle:.5f}")

print("\nâœ… LOOCV complete.")
print(f"Average RMSLE across all samples: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
lin_reg.fit(X, y)
print("\nTraining final Linear Regression model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = lin_reg.predict(X_test_final)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("loocv.csv", index=False)

print("\nâœ… submission.csv created successfully!")
print(submission.head())


Running Leave-One-Out Cross Validation on 1198 samples...

Processed 100/1198 samples, Current RMSLE: 0.00944
Processed 200/1198 samples, Current RMSLE: 0.05119
Processed 300/1198 samples, Current RMSLE: 0.01004
Processed 400/1198 samples, Current RMSLE: 0.06401
Processed 500/1198 samples, Current RMSLE: 0.17141
Processed 600/1198 samples, Current RMSLE: 0.05238
Processed 700/1198 samples, Current RMSLE: 0.02046
Processed 800/1198 samples, Current RMSLE: 0.07474
Processed 900/1198 samples, Current RMSLE: 0.03780
Processed 1000/1198 samples, Current RMSLE: 0.01810
Processed 1100/1198 samples, Current RMSLE: 0.17214
Processed 1198/1198 samples, Current RMSLE: 0.13056

âœ… LOOCV complete.
Average RMSLE across all samples: 0.08278

Training final Linear Regression model on full dataset... Done.

âœ… submission.csv created successfully!
     Id     HotelValue
0   893  147395.657115
1  1106  328934.353821
2   414  105309.178068
3   523  165803.564443
4  1037  311199.481472


Ridge regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ---------------------------
# Leave-One-Out CV Setup
# ---------------------------
loo = LeaveOneOut()
n_splits = loo.get_n_splits(X)

ridge = Ridge(alpha=1.0, random_state=42)  # You can tune alpha later
rmsle_scores = []

print(f"Running Leave-One-Out Cross Validation for Ridge Regression on {n_splits} samples...\n")

# ---------------------------
# LOOCV Loop
# ---------------------------
for i, (train_idx, valid_idx) in enumerate(loo.split(X), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_valid)

    rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(y_pred)))
    rmsle_scores.append(rmsle)

    if i % 100 == 0 or i == n_splits:  # Print progress every 100 samples
        print(f"Processed {i}/{n_splits} samples, Current RMSLE: {rmsle:.5f}")

print("\nâœ… LOOCV complete.")
print(f"Average RMSLE across all samples: {np.mean(rmsle_scores):.5f}")

# ---------------------------
# Train Final Model on Full Data
# ---------------------------
ridge.fit(X, y)
print("\nTraining final Ridge Regression model on full dataset... Done.")

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = ridge.predict(X_test_final)
final_preds = np.expm1(log_preds)  # Reverse log1p transform
final_preds[final_preds < 0] = 0   # Ensure no negatives

# ---------------------------
# Create Submission File
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("ridgeRegression.csv", index=False)

print("\nâœ… submission.csv created successfully!")
print(submission.head())


Running Leave-One-Out Cross Validation for Ridge Regression on 1198 samples...

Processed 100/1198 samples, Current RMSLE: 0.00124
Processed 200/1198 samples, Current RMSLE: 0.06196
Processed 300/1198 samples, Current RMSLE: 0.00458
Processed 400/1198 samples, Current RMSLE: 0.07191
Processed 500/1198 samples, Current RMSLE: 0.13652
Processed 600/1198 samples, Current RMSLE: 0.04300
Processed 700/1198 samples, Current RMSLE: 0.02222
Processed 800/1198 samples, Current RMSLE: 0.13733
Processed 900/1198 samples, Current RMSLE: 0.04268
Processed 1000/1198 samples, Current RMSLE: 0.00954
Processed 1100/1198 samples, Current RMSLE: 0.19361
Processed 1198/1198 samples, Current RMSLE: 0.11449

âœ… LOOCV complete.
Average RMSLE across all samples: 0.07832

Training final Ridge Regression model on full dataset... Done.

âœ… submission.csv created successfully!
     Id     HotelValue
0   893  147219.487747
1  1106  331851.841627
2   414  104549.424486
3   523  166146.445081
4  1037  311794.03013

In [None]:
Lasso Regression

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.exceptions import ConvergenceWarning
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# Lasso model
lasso = Lasso(alpha=0.001, max_iter=50000, random_state=42)

lasso.fit(X_scaled, y)
print("âœ… Model trained successfully (no convergence warnings).")

# Predictions
log_preds = lasso.predict(X_test_scaled)
final_preds = np.expm1(log_preds)
final_preds[final_preds < 0] = 0

submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)
print("âœ… submission.csv created successfully!")


âœ… submission.csv created successfully!


Elastic nets

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_log_error
import numpy as np
import pandas as pd

# ================================
# Scale features (important!)
# ================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ================================
# Elastic Net with Cross-Validation
# ================================
elastic_cv = ElasticNetCV(
    l1_ratio=[.1, .3, .5, .7, .9, .95, 1],
    alphas=np.logspace(-4, 1, 50),
    cv=5,
    max_iter=100000,
    n_jobs=-1,
    random_state=42
)

print("Training Elastic Net with CV...")
elastic_cv.fit(X_scaled, y)
print("Training complete.")
print(f"Best alpha: {elastic_cv.alpha_:.6f}")
print(f"Best l1_ratio: {elastic_cv.l1_ratio_:.2f}")

# ================================
# Predict on Test Data
# ================================
log_preds = elastic_cv.predict(X_test_scaled)
final_preds = np.expm1(log_preds)
final_preds[final_preds < 0] = 0

# ================================
# Create Submission File
# ================================
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("âœ… submission.csv created successfully!")
print(submission.head())


Training Elastic Net with CV...
Training complete.
Best alpha: 0.028118
Best l1_ratio: 0.10
âœ… submission.csv created successfully!
     Id     HotelValue
0   893  147540.826955
1  1106  317283.985683
2   414  105870.135758
3   523  159955.124294
4  1037  304075.633259


Bayesian Approach + Conjugate Priors

In [None]:
import numpy as np
import pandas as pd

# ---------------------------
# Add intercept term
# ---------------------------
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_test_aug = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])
n, p = X_aug.shape

# ---------------------------
# Prior hyperparameters
# ---------------------------
mu_0 = np.zeros(p)             # prior mean of coefficients
Lambda_0 = np.eye(p) * 1e-6    # prior precision (tiny, almost uninformative)
a_0 = 1e-6                     # prior shape for sigma^2
b_0 = 1e-6                     # prior scale for sigma^2

# ---------------------------
# Posterior for coefficients (beta | sigma^2, y)
# ---------------------------
# Posterior precision and mean
Lambda_n = Lambda_0 + X_aug.T @ X_aug
mu_n = np.linalg.solve(Lambda_n, Lambda_0 @ mu_0 + X_aug.T @ y)

# Posterior parameters for sigma^2
a_n = a_0 + n / 2
residuals = y - X_aug @ mu_n
b_n = b_0 + 0.5 * (residuals.T @ residuals + (mu_n - mu_0).T @ Lambda_0 @ (mu_n - mu_0))

# Posterior mean of sigma^2
sigma2_post = b_n / (a_n - 1)

# Posterior predictive mean for test set
y_pred_test = X_test_aug @ mu_n

# ---------------------------
# Reverse log-transform
# ---------------------------
final_preds = np.expm1(y_pred_test)
final_preds[final_preds < 0] = 0

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("âœ… Bayesian regression submission created successfully!")
print(submission.head())


âœ… Bayesian regression submission created successfully!
     Id     HotelValue
0   893  147395.155681
1  1106  328934.870491
2   414  105309.390046
3   523  165803.205292
4  1037  311199.689455


Bayesian + MAP estimate

In [None]:
import numpy as np
import pandas as pd

# ---------------------------
# Add intercept term
# ---------------------------
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_test_aug = np.hstack([np.ones((X_test_final.shape[0], 1)), X_test_final])
n, p = X_aug.shape

# ---------------------------
# MAP / Ridge parameters
# ---------------------------
tau2 = 1.0       # prior variance for coefficients
sigma2 = 1.0     # assumed noise variance
lambda_ = sigma2 / tau2  # regularization strength

# ---------------------------
# Compute MAP estimate
# ---------------------------
beta_map = np.linalg.solve(X_aug.T @ X_aug + lambda_ * np.eye(p), X_aug.T @ y)

# ---------------------------
# Predict on test data
# ---------------------------
y_pred_test = X_test_aug @ beta_map

# Reverse log-transform
final_preds = np.expm1(y_pred_test)
final_preds[final_preds < 0] = 0

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("âœ… Bayesian MAP regression submission created successfully!")
print(submission.head())


âœ… Bayesian MAP regression submission created successfully!
     Id     HotelValue
0   893  146199.896460
1  1106  333192.071502
2   414  105398.256957
3   523  165257.086300
4  1037  312337.850589


K nearest neighbours

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# ---------------------------
# Scale features (important for KNN)
# ---------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_final)

# ---------------------------
# KNN Model with Grid Search
# ---------------------------
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

knn = KNeighborsRegressor()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_log_error', n_jobs=-1)
print("ðŸ”§ Performing Grid Search for KNN...")
grid.fit(X_scaled, y)

print("âœ… Grid Search complete!")
print(f"Best parameters: {grid.best_params_}")

# ---------------------------
# Train final KNN with best params
# ---------------------------
knn_best = grid.best_estimator_
knn_best.fit(X_scaled, y)

# ---------------------------
# Predict on Test Data
# ---------------------------
log_preds = knn_best.predict(X_test_scaled)
final_preds = np.expm1(log_preds)  # reverse log1p
final_preds[final_preds < 0] = 0   # ensure no negative values

# ---------------------------
# Create submission file
# ---------------------------
submission = pd.DataFrame({
    "Id": test_ids,
    "HotelValue": final_preds
})
submission.to_csv("submission.csv", index=False)

print("âœ… submission.csv created successfully!")
print(submission.head())


ðŸ”§ Performing Grid Search for KNN...
âœ… Grid Search complete!
Best parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
âœ… submission.csv created successfully!
     Id     HotelValue
0   893  129493.630228
1  1106  272446.984455
2   414  101369.955521
3   523  139889.126281
4  1037  334962.130438
