In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error as mape
import warnings
from sklearn.exceptions import ConvergenceWarning
# Ignore every ConvergenceWarning raised from here on in this kernel session.
# NOTE(review): presumably added to quiet the cross-validated fits further down;
# it also hides fits that genuinely stop before converging — confirm this is intended.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# Location of the input CSV; the relative path is resolved against the
# current working directory of the kernel.
CLEANED_DATA_CSV = '../project/output/complete_cleaned_data.csv'

# Read the whole file into a single DataFrame used by the cells below.
all_data = pd.read_csv(CLEANED_DATA_CSV)

In [3]:
# Print the loaded frame (pandas truncates long frames when printing).
print(all_data)

# Candidate features: every column except the 'ClosePrice' target.
# list.remove raises ValueError if the target column is absent.
all_cols = list(all_data.columns)
all_cols.remove('ClosePrice')
X_cols = all_cols
print(len(X_cols))

# Work on a 30% random sample of the rows; the fixed seed makes the
# sample identical on every run.
reduced_df = all_data.sample(frac=0.3, random_state=42)
print(reduced_df)

       ClosePrice  ViewYN  PoolPrivateYN   CloseDate   Latitude   Longitude  \
0        890000.0       1              0  2024-12-19  34.180411 -118.342020   
1       1138000.0       0              0  2024-12-31  32.574359 -117.023836   
2        681877.0       1              0  2024-12-23  33.725080 -117.222302   
3        900000.0       1              0  2024-12-30  34.203479 -118.643567   
4        862000.0       0              0  2024-12-24  34.460368 -118.490755   
...           ...     ...            ...         ...        ...         ...   
47854    265000.0       0              0  2025-05-23  33.701431 -117.199363   
47855   1350000.0       1              1  2025-05-07  34.115293 -116.319679   
47856    757000.0       1              0  2025-05-21  37.319615 -119.551230   
47857    510000.0       0              0  2025-05-01  33.943555 -118.285762   
47858    575000.0       0              0  2025-05-14  34.264846 -118.426665   

              UnparsedAddress PropertyType  LivingA

In [4]:
print("\nTraining Lasso (L1-regularized) regression model with cross-validated alpha...")

# Target: the sale price column of the sampled frame.
y = reduced_df['ClosePrice']

# Features: the columns listed in X_cols, minus 'geometry' when it exists,
# with every missing value replaced by 0.
X = (
    reduced_df[X_cols]
    .drop(columns=['geometry'], errors='ignore')
    .fillna(0)
)

# One-hot encode all object/category columns (first level of each is dropped).
# NOTE(review): this encodes *every* string column, including ID-like ones such as
# addresses or date strings when present, which can make the matrix extremely wide —
# consider excluding such columns before encoding.
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
if not non_numeric_cols.empty:
    print(f"Encoding categorical columns: {list(non_numeric_cols)}")
    X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

print("\nFinal feature matrix shape:", X.shape)

# Confirm that no missing values remain after filling and encoding.
print("Any NaNs?", X.isna().any().any())

# Hold out 20% of the rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features; the scaler's statistics come from the training rows only
# and are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularization strengths: 10 log-spaced values from 1e-2 to 1e4.
alphas_to_try = np.logspace(start=-2, stop=4, num=10)
print("\nTrying alphas in range:", alphas_to_try)

# 5-fold cross-validation picks the alpha; fit() returns the fitted estimator.
model = LassoCV(alphas=alphas_to_try, cv=5).fit(X_train_scaled, y_train)

# Score the selected model on the held-out rows.
y_test_pred = model.predict(X_test_scaled)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mape = mape(y_test, y_test_pred) * 100  # fraction -> percent

# Report the chosen alpha and the test metrics, one per line.
print(
    f"\nBest alpha: {model.alpha_:.6f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test R^2: {test_r2:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    sep="\n",
)


Training Lasso (L1-regularized) regression model with cross-validated alpha...
Encoding categorical columns: ['CloseDate', 'UnparsedAddress', 'PropertyType', 'CountyOrParish', 'ElementarySchool', 'PropertySubType', 'City', 'StateOrProvince', 'MiddleOrJuniorSchool', 'HighSchool', 'Levels', 'HighSchoolDistrict', 'PostalCode', 'CloseDate_Parsed', 'CloseDate_YearMonth']

Final feature matrix shape: (14358, 17818)
Any NaNs? False

Trying alphas in range: [1.00000000e-02 4.64158883e-02 2.15443469e-01 1.00000000e+00
 4.64158883e+00 2.15443469e+01 1.00000000e+02 4.64158883e+02
 2.15443469e+03 1.00000000e+04]

Best alpha: 2154.434690
Test RMSE: 309646.31
Test R^2: 0.7881
Test MAPE: 16.75%


In [5]:
print("\nTraining Ridge (L2-regularized) regression model with cross-validated alpha...")

# Target: the sale price column of the sampled frame.
y = reduced_df['ClosePrice']

# Features: the columns listed in X_cols, minus 'geometry' when it exists,
# with every missing value replaced by 0.
X = (
    reduced_df[X_cols]
    .drop(columns=['geometry'], errors='ignore')
    .fillna(0)
)

# One-hot encode all object/category columns (first level of each is dropped).
# NOTE(review): this encodes *every* string column, including ID-like ones such as
# addresses or date strings when present, which can make the matrix extremely wide —
# consider excluding such columns before encoding.
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
if not non_numeric_cols.empty:
    print(f"Encoding categorical columns: {list(non_numeric_cols)}")
    X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

print("\nFinal feature matrix shape:", X.shape)

# Confirm that no missing values remain after filling and encoding.
print("Any NaNs?", X.isna().any().any())

# Hold out 20% of the rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features; the scaler's statistics come from the training rows only
# and are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularization strengths: 10 log-spaced values from 1e-2 to 1e4.
# NOTE(review): if the selected alpha lands on an edge of this grid, widen the
# range in that direction and re-run.
alphas_to_try = np.logspace(start=-2, stop=4, num=10)
print("\nTrying alphas in range:", alphas_to_try)

# 5-fold cross-validation picks the alpha; fit() returns the fitted estimator.
model = RidgeCV(alphas=alphas_to_try, cv=5).fit(X_train_scaled, y_train)

# Score the selected model on the held-out rows.
y_test_pred = model.predict(X_test_scaled)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mape = mape(y_test, y_test_pred) * 100  # fraction -> percent

# Report the chosen alpha and the test metrics; the RMSE line keeps its
# leading blank line.
print(
    f"\nBest alpha: {model.alpha_:.6f}",
    f"\nTest RMSE: {test_rmse:.2f}",
    f"Test R^2: {test_r2:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    sep="\n",
)


Training Ridge (L2-regularized) regression model with cross-validated alpha...
Encoding categorical columns: ['CloseDate', 'UnparsedAddress', 'PropertyType', 'CountyOrParish', 'ElementarySchool', 'PropertySubType', 'City', 'StateOrProvince', 'MiddleOrJuniorSchool', 'HighSchool', 'Levels', 'HighSchoolDistrict', 'PostalCode', 'CloseDate_Parsed', 'CloseDate_YearMonth']

Final feature matrix shape: (14358, 17818)
Any NaNs? False

Trying alphas in range: [1.00000000e-02 4.64158883e-02 2.15443469e-01 1.00000000e+00
 4.64158883e+00 2.15443469e+01 1.00000000e+02 4.64158883e+02
 2.15443469e+03 1.00000000e+04]

Best alpha: 0.010000

Test RMSE: 334067.72
Test R^2: 0.7534
Test MAPE: 20.79%
