In [7]:
# ================================================
# 1. Imports
# ================================================
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [8]:
# ================================================
# 2. Load datasets (train + eval)
# ================================================
train_df = pd.read_csv("/Users/anuvaagarwal/Documents/mlproject/Regression_ML_EndtoEnd/data/processed/feature_engineered_train.csv")
eval_df  = pd.read_csv("/Users/anuvaagarwal/Documents/mlproject/Regression_ML_EndtoEnd/data/processed/feature_engineered_eval.csv")

In [9]:
'''
# ================================================
# 3. Drop high VIF features (both train + eval)
# ================================================
high_vif_features = [
    "median_sale_price" #highest correlation to 'price' => data leakage
]
train_df.drop(columns=high_vif_features, inplace=True)
eval_df.drop(columns=high_vif_features, inplace=True)
'''



In [10]:
# ================================================
# 4. Define target & features
# ================================================
target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_eval = eval_df.drop(columns=[target])
y_eval = eval_df[target]

In [11]:
# ================================================
# 5. Standardization (fit on train, transform eval)
# ================================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled  = scaler.transform(X_eval)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [13]:
import numpy as np
import pandas as pd

print("NaNs in X_train:", np.isnan(X_train).sum())
print("NaNs in X_eval:", np.isnan(X_eval).sum())

print("NaNs in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaNs in X_eval_scaled:", np.isnan(X_eval_scaled).sum())


NaNs in X_train: year                                 0
quarter                              0
month                                0
median_list_price                    0
median_ppsf                          0
median_list_ppsf                     0
homes_sold                           0
pending_sales                        0
new_listings                         0
inventory                            0
median_dom                           0
avg_sale_to_list                     0
sold_above_list                      0
off_market_in_two_weeks              0
bank                                 0
bus                                  0
hospital                             0
mall                                 0
park                                 0
restaurant                           0
school                               0
station                              0
supermarket                          0
Total Population                     0
Median Age                           0
Per Capi

In [14]:
print("NaNs in y_train:", pd.isna(y_train).sum())
print("NaNs in y_eval:", pd.isna(y_eval).sum())


NaNs in y_train: 0
NaNs in y_eval: 0


In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Pipeline: Impute -> Scale -> Linear Regression
lr_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # or "mean"
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_eval)

print("Linear Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lr)))
print(" R²:", r2_score(y_eval, y_pred_lr))




Linear Regression:
 MAE: 53811.93813400827
 RMSE: 121336.13469295991
 R²: 0.8862267031700115




In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

ridge_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0))
])

ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_eval)

print("\nRidge Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_ridge))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_ridge)))
print(" R²:", r2_score(y_eval, y_pred_ridge))





Ridge Regression:
 MAE: 53811.11466043951
 RMSE: 121338.02551498688
 R²: 0.8862231572068496




In [19]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- Lasso Regression with imputation + scaling in a Pipeline ---
lasso_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # handle NaNs
    ("scaler", StandardScaler()),                  # scale features
    ("model", Lasso(alpha=0.1, random_state=42))   # your Lasso model
])

lasso_pipeline.fit(X_train, y_train)
y_pred_lasso = lasso_pipeline.predict(X_eval)

print("\nLasso Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lasso))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lasso)))
print(" R²:", r2_score(y_eval, y_pred_lasso))





Lasso Regression:
 MAE: 54117.4260712303
 RMSE: 121604.4782343721
 R²: 0.8857229111303108


  model = cd_fast.enet_coordinate_descent(


In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# --- ElasticNet Regression with Pipeline ---
elastic_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),   # fix NaNs
    ("scaler", StandardScaler()),                    # scale features
    ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42))
])

elastic_pipeline.fit(X_train, y_train)
y_pred_elastic = elastic_pipeline.predict(X_eval)

print("\nElasticNet Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_elastic))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_elastic)))
print(" R²:", r2_score(y_eval, y_pred_elastic))





ElasticNet Regression:
 MAE: 54234.249320614625
 RMSE: 122295.84870428537
 R²: 0.8844197946433394


