In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

drive.mount('/content/drive')

data_dir = '/content/drive/My Drive/datasets/'

file_list = [
    "CRMLSSold202501_filled.csv",
    "CRMLSSold202502_filled.csv",
    "CRMLSSold202503_filled.csv",
    "CRMLSSold202504_filled.csv",
    "CRMLSSold202505_filled.csv",
    "CRMLSSold202506_filled.csv",
    "CRMLSSold202507_filled.csv",
    "CRMLSSold202508_filled-2.csv",
    "CRMLSSold202509.csv"
]

# Combine all months into one DF
all_data = []
for filename in file_list:
    df = pd.read_csv(os.path.join(data_dir, filename))
    all_data.append(df)
combined_df = pd.concat(all_data, ignore_index=True)

# Preprocess
df = combined_df[(combined_df['PropertyType'] == 'Residential') &
                 (combined_df['PropertySubType'] == 'SingleFamilyResidence')]

# Remove extreme outliers in ClosePrice using IQR
q1 = df['ClosePrice'].quantile(0.25)
q3 = df['ClosePrice'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(df['ClosePrice'] >= lower_bound) & (df['ClosePrice'] <= upper_bound)]

# Handle missing values
for col in ['ViewYN', 'PoolPrivateYN', 'NewConstructionYN']:
    if col in df.columns:
        df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0})
        df[col] = df[col].fillna(0).astype(int)

for col in ['LivingArea', 'BedroomsTotal', 'BathroomsTotalInteger', 'YearBuilt', 'LotSizeSquareFeet', 'Stories']:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

if 'GarageSpaces' in df.columns:
    df['GarageSpaces'] = df['GarageSpaces'].fillna(0)
if 'PostalCode' in df.columns:
    df = df.dropna(subset=['PostalCode'])

# Feature Engineering
df['PPSF'] = df['ClosePrice'] / df['LivingArea']
df['BedBathRatio'] = df['BedroomsTotal'] / df['BathroomsTotalInteger'].replace(0, 1)
df['BathBedRatio'] = df['BathroomsTotalInteger'] / df['BedroomsTotal'].replace(0, 1)
df['PropertyAge'] = 2025 - df['YearBuilt']

# Features
feature_cols = [
    'ViewYN', 'PoolPrivateYN', 'LivingArea', 'YearBuilt', 'BedroomsTotal', 'BathroomsTotalInteger',
    'NewConstructionYN', 'GarageSpaces', 'LotSizeSquareFeet', 'Stories',
    'PPSF', 'BedBathRatio', 'BathBedRatio', 'PropertyAge'
]
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=feature_cols + ['ClosePrice'])

# Split by month
df['CloseDate'] = pd.to_datetime(df['CloseDate'])
df['YearMonth'] = df['CloseDate'].dt.strftime('%Y-%m')

# Training on months jan - aug, test on sep
train_months = [
    '2025-01', '2025-02', '2025-03', '2025-04',
    '2025-05', '2025-06', '2025-07', '2025-08'
]
test_month = '2025-09'

train_df = df[df['YearMonth'].isin(train_months)].copy()
test_df = df[df['YearMonth'] == test_month].copy()

X_train = train_df[feature_cols]
y_train = train_df['ClosePrice']
X_test = test_df[feature_cols]
y_test = test_df['ClosePrice']

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)

# Decision Tree with limited depth and min samples to hopefully reduce overfitting
dt = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_split=20, min_samples_leaf=10)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)

# Random Forest with tuned hyperparameters
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Linear Regression R² (test):      {r2_lr:.4f}")
print(f"Decision Tree R² (test):          {r2_dt:.4f}")
print(f"Random Forest R² (test):          {r2_rf:.4f}")


Mounted at /content/drive


  df = pd.read_csv(os.path.join(data_dir, filename))
  df.replace([np.inf, -np.inf], np.nan, inplace=True)


Linear Regression R² (test):      0.3495
Decision Tree R² (test):          0.9967
Random Forest R² (test):          0.9712
