In [1]:
# 1. Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [2]:
# 2. Load datasets (train + eval)

train_df = pd.read_csv(r'C:\Users\H.P\Desktop\Housing Regression MLE\data\processed\feature_engineered_train.csv')
eval_df = pd.read_csv(r'C:\Users\H.P\Desktop\Housing Regression MLE\data\processed\feature_engineered_eval.csv')

In [None]:
'''
# 3. Drop high VIF features (both train + eval)
high_vif_features = [
    "median_sale_price" #highest correlation to 'price' => data leakage
]
train_df.drop(columns=high_vif_features, inplace=True)
eval_df.drop(columns=high_vif_features, inplace=True)
'''

# So I am skipping this step because I know my I won't be getting best result by these models. And my
# rest complex models are naturally immune to multicolinearity issues. Otherwise I would have dropped
# high VIF features here in both train and eval datasets. In case I considered this a serious candidate

'\n# 3. Drop high VIF features (both train + eval)\nhigh_vif_features = [\n    "median_sale_price" #highest correlation to \'price\' => data leakage\n]\ntrain_df.drop(columns=high_vif_features, inplace=True)\neval_df.drop(columns=high_vif_features, inplace=True)\n'

In [4]:
# 4. Define target & features

target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_eval = eval_df.drop(columns=[target])
y_eval = eval_df[target]

In [6]:
# 5. Standardization (fit on train, transform eval)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled = scaler.transform(X_eval)

In [7]:
# 6. Train & Evaluate Models

# --- Linear Regression ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_eval_scaled)

print("Linear Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lr)))
print(" R²:", r2_score(y_eval, y_pred_lr))

Linear Regression:
 MAE: 57163.4322166157
 RMSE: 122396.97072119515
 R²: 0.8842285774323847


In [8]:
# --- Ridge Regression ---
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_eval_scaled)

print("\nRidge Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_ridge))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_ridge)))
print(" R²:", r2_score(y_eval, y_pred_ridge))


Ridge Regression:
 MAE: 57163.535494486576
 RMSE: 122398.36753582611
 R²: 0.8842259350117647


In [9]:
# --- Lasso Regression ---
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_eval_scaled)

print("\nLasso Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lasso))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lasso)))
print(" R²:", r2_score(y_eval, y_pred_lasso))


Lasso Regression:
 MAE: 57602.534010093914
 RMSE: 122701.30818468778
 R²: 0.8836521353087576


  model = cd_fast.enet_coordinate_descent(


In [10]:
# --- ElasticNet ---
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_eval_scaled)

print("\nElasticNet Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_elastic))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_elastic)))
print(" R²:", r2_score(y_eval, y_pred_elastic))


ElasticNet Regression:
 MAE: 57192.776489120704
 RMSE: 124255.01302067707
 R²: 0.880686971465787
