In [None]:
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

drive.mount('/content/drive')

data_dir = '/content/drive/My Drive/datasets/'

file_list = [
    "CRMLSSold202501_filled.csv",
    "CRMLSSold202502_filled.csv",
    "CRMLSSold202503_filled.csv",
    "CRMLSSold202504_filled.csv",
    "CRMLSSold202505_filled.csv",
    "CRMLSSold202506_filled.csv",
    "CRMLSSold202507_filled.csv"
]

def process_and_compare_models(filepath):
    print(f"--- Processing {filepath} ---")
    df = pd.read_csv(filepath)
    df = df[(df['PropertyType'] == 'Residential') & (df['PropertySubType'] == 'SingleFamilyResidence')]

    # Remove extreme outliers
    q1 = df['ClosePrice'].quantile(0.25)
    q3 = df['ClosePrice'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[(df['ClosePrice'] >= lower_bound) & (df['ClosePrice'] <= upper_bound)]

    # Handle missing values
    for col in ['ViewYN', 'PoolPrivateYN', 'NewConstructionYN']:
        if col in df.columns:
            df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0})
            df[col] = df[col].fillna(0).astype(int)
    for col in ['LivingArea', 'BedroomsTotal', 'BathroomsTotalInteger', 'YearBuilt', 'LotSizeSquareFeet', 'Stories']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    if 'GarageSpaces' in df.columns:
        df['GarageSpaces'] = df['GarageSpaces'].fillna(0)
    if 'PostalCode' in df.columns:
        df = df.dropna(subset=['PostalCode'])

    df['CloseDate'] = pd.to_datetime(df['CloseDate'])
    df['YearMonth'] = df['CloseDate'].dt.strftime('%Y-%m')
    months = sorted(df['YearMonth'].unique())
    train_months = months[:5]
    test_month = months[-1]
    train_df = df[df['YearMonth'].isin(train_months)].copy()
    test_df = df[df['YearMonth'] == test_month].copy()

    # Select features for regression
    feature_cols = [
        'ViewYN', 'PoolPrivateYN', 'LivingArea', 'YearBuilt', 'BedroomsTotal', 'BathroomsTotalInteger',
        'NewConstructionYN', 'GarageSpaces', 'LotSizeSquareFeet', 'Stories'
    ]
    feature_cols = [col for col in feature_cols if col in train_df.columns]

    X_train = train_df[feature_cols]
    y_train = train_df['ClosePrice']
    X_test = test_df[feature_cols]
    y_test = test_df['ClosePrice']

    # Baseline: Linear Regression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    r2_lr = r2_score(y_test, y_pred_lr)

    # Decision Tree
    dt = DecisionTreeRegressor(random_state=42)
    dt.fit(X_train, y_train)
    y_pred_dt = dt.predict(X_test)
    r2_dt = r2_score(y_test, y_pred_dt)

    # Random Forest
    rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    r2_rf = r2_score(y_test, y_pred_rf)

    print(f"Linear Regression R² (test):      {r2_lr:.4f}")
    print(f"Decision Tree R² (test):          {r2_dt:.4f}")
    print(f"Random Forest R² (test):          {r2_rf:.4f}")


for filename in file_list:
    process_and_compare_models(os.path.join(data_dir, filename))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Processing /content/drive/My Drive/datasets/CRMLSSold202501_filled.csv ---
Linear Regression R² (test):      0.3024
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9102
--- Processing /content/drive/My Drive/datasets/CRMLSSold202502_filled.csv ---
Linear Regression R² (test):      0.3173
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9136
--- Processing /content/drive/My Drive/datasets/CRMLSSold202503_filled.csv ---
Linear Regression R² (test):      0.3443
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9166
--- Processing /content/drive/My Drive/datasets/CRMLSSold202504_filled.csv ---
Linear Regression R² (test):      0.3144
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9137
--- Processing /content/drive/My Drive/datasets/CRMLSSold202505

  df = pd.read_csv(filepath)


Linear Regression R² (test):      0.3671
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9197
--- Processing /content/drive/My Drive/datasets/CRMLSSold202507_filled.csv ---
Linear Regression R² (test):      0.3460
Decision Tree R² (test):          1.0000
Random Forest R² (test):          0.9173
