In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Load and clean data
# Keep only the columns used by the models and drop every row that has a
# missing value in any of them.
df = pd.read_csv('data/Melbourne_housing_FULL.csv')
df = df[['Price', 'Rooms', 'Bathroom', 'Landsize', 'Suburb', 'Date',
         'BuildingArea', 'Bedroom2', 'Car', 'YearBuilt']].dropna()
# Sale dates are parsed day-first.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Encode the sale's year-month period as an integer code. The dtype is spelled
# out as 'int64' because a bare `int` resolves to a 32-bit integer on some
# platforms, and pandas refuses to convert Period values to anything but int64.
df['Month'] = df['Date'].dt.to_period('M').astype('int64')

# Log-transform the price
df['log_price'] = np.log(df['Price'])


# Helper: keep only the test rows whose categorical levels were seen in training
def filter_valid_test(df_test, df_train):
    """Return a copy of the rows of ``df_test`` usable with a model fit on ``df_train``.

    A row is kept only when both its ``Suburb`` and its ``Month`` value also
    occur somewhere in ``df_train``. The result is an independent copy, so
    adding columns to it does not touch ``df_test``.
    """
    suburb_seen = df_test['Suburb'].isin(df_train['Suburb'].unique())
    month_seen = df_test['Month'].isin(df_train['Month'].unique())
    keep = suburb_seen & month_seen
    return df_test.loc[keep].copy()

# Regression metrics shared by every model evaluation
def evaluate_model(y_true, y_pred):
    """Score predictions against the observed values.

    Returns a ``(rmse, mae, r2)`` tuple: root mean squared error, mean
    absolute error and the coefficient of determination, in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Numeric predictors shared by the regression formulas and the clustering features
numeric_predictors = ['BuildingArea', 'Bedroom2', 'Car', 'YearBuilt', 'Rooms', 'Bathroom', 'Landsize']

# Formulas with log_price as the response: the base model uses the numeric
# predictors plus categorical Suburb and Month terms; the extended model adds
# the two cluster-derived columns.
base_formula = 'log_price ~ ' + ' + '.join(numeric_predictors + ['C(Suburb)', 'C(Month)'])
extended_formula = f'{base_formula} + Cluster + DistanceToCentroid'

# Columns that are standardised before clustering (Month enters as a number here)
features_to_scale = numeric_predictors + ['Month']


# Cluster counts to try (2 through 31) and the per-k result rows
cluster_range = range(2, 32)
results_list = []

# Sweep over the number of clusters and compare the base OLS model with the
# cluster-augmented one on a held-out test set.
#
# The scaler and KMeans are fitted on the training rows only and then applied
# to every row. Fitting them on the full data set would let the test rows shape
# the Cluster / DistanceToCentroid features they are later scored on, which
# leaks test information into the evaluation.

# The split uses a fixed seed, so it is identical for every k: compute it once.
# Positions (not labels) are split so the same rows can be picked from both the
# DataFrame and the scaled NumPy array.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Standardise using training-row statistics only, then transform all rows.
scaler = StandardScaler().fit(df.iloc[train_pos][features_to_scale])
scaled = scaler.transform(df[features_to_scale])
scaled_train = scaled[train_pos]

# The base model does not use any cluster feature, so it does not depend on k:
# fit and score it once instead of once per iteration.
train_df = df.iloc[train_pos]
test_base = filter_valid_test(df.iloc[test_pos], train_df)
base_model = smf.ols(base_formula, data=train_df).fit()
test_base['PredictedLog_Base'] = base_model.predict(test_base)
rmse_base, mae_base, r2_base = evaluate_model(test_base['log_price'], test_base['PredictedLog_Base'])

for k in cluster_range:
    # KMeans clustering, learned from the training rows only
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_train)
    centroids = kmeans.cluster_centers_

    # Assign every row (train and test) to its nearest training centroid and
    # record the Euclidean distance to it in scaled feature space.
    labels = kmeans.predict(scaled)
    df['Cluster'] = labels
    df['DistanceToCentroid'] = np.linalg.norm(scaled - centroids[labels], axis=1)

    # Re-slice so both partitions carry the cluster columns for this k
    train_df = df.iloc[train_pos]
    test_df = df.iloc[test_pos]

    # Keep only test rows whose Suburb and Month occur in the training rows
    test_ext = filter_valid_test(test_df, train_df)

    # Fit and evaluate the extended OLS model.
    # NOTE(review): `extended_formula` adds `Cluster` as a plain term; the
    # labels are integers, so it is presumably treated as a numeric regressor
    # even though cluster ids carry no order — consider C(Cluster). Confirm.
    extended_model = smf.ols(extended_formula, data=train_df).fit()
    test_ext['PredictedLog_Ext'] = extended_model.predict(test_ext)
    rmse_ext, mae_ext, r2_ext = evaluate_model(test_ext['log_price'], test_ext['PredictedLog_Ext'])

    results_list.append({
        "Clusters": k,
        "OLS Base Test R²": r2_base,
        "OLS Ext Test R²": r2_ext,
        "Diff in R²": r2_ext - r2_base,
        # In-sample (training) R² of the base model
        "OLS Base R²": base_model.rsquared,
        "OLS Ext Train R²": extended_model.rsquared,
        "OLS Base RMSE": rmse_base,
        "OLS Base MAE": mae_base,
        "OLS Ext RMSE": rmse_ext,
        "OLS Ext MAE": mae_ext,
    })

# Collect the per-k result rows into a table, show it rounded to three
# decimals, and save the unrounded values as CSV.
results_cluster_df = pd.DataFrame(data=results_list)
rounded_view = results_cluster_df.round(3)
print(rounded_view)
results_cluster_df.to_csv(path_or_buf='/content/results_cluster.csv', index=False)

    Clusters  OLS Base Test R²  OLS Ext Test R²  Diff in R²  OLS Base R²  \
0          2             0.745            0.774       0.029        0.772   
1          3             0.745            0.783       0.038        0.772   
2          4             0.745            0.782       0.038        0.772   
3          5             0.745            0.791       0.046        0.772   
4          6             0.745            0.765       0.020        0.772   
5          7             0.745            0.784       0.039        0.772   
6          8             0.745            0.784       0.039        0.772   
7          9             0.745            0.783       0.038        0.772   
8         10             0.745            0.766       0.021        0.772   
9         11             0.745            0.770       0.025        0.772   
10        12             0.745            0.774       0.029        0.772   
11        13             0.745            0.792       0.047        0.772   
12        14