In [1]:
import pandas as pd
import numpy as np

## Load Data

In [2]:
# Read cleaned_data.csv into `mega_fd` and report its (rows, columns) shape.
mega_fd = pd.read_csv(filepath_or_buffer="cleaned_data.csv")
print("Loaded cleaned data with shape:", mega_fd.shape)

Loaded cleaned data with shape: (19113, 30)


## Select Features

In [10]:
# Candidate predictors: every column except the target, ClosePrice.
features = mega_fd.drop(columns=['ClosePrice'])

# Identify the numeric predictors from `features` rather than from `X`:
# `X` is only assigned in later cells, so referencing it here raises a
# NameError on a fresh kernel (Restart & Run All).
num_cols = features.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical columns identified:", num_cols)

Numerical columns identified: ['Latitude', 'Longitude', 'LivingArea', 'DaysOnMarket', 'ParkingTotal', 'YearBuilt', 'BathroomsTotalInteger', 'BedroomsTotal', 'Stories', 'LotSizeArea', 'MainLevelBedrooms', 'GarageSpaces', 'AssociationFee', 'LotSizeSquareFeet']


## Predicting Close Price

### KNN - Numerical variables only

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# Baseline: k-nearest-neighbours regression (k=5) on the numeric predictors.
# Rows with a missing value in any numeric predictor or in the target are dropped.
df_knn = mega_fd[[*num_cols, 'ClosePrice']].dropna()
X, y = df_knn[num_cols], df_knn['ClosePrice']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): the predictors are passed to KNN unscaled, so large-magnitude
# columns presumably dominate the neighbour distances — consider standardising.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Score the held-out rows.
y_pred = knn.predict(X_test)
mape, r2 = mean_absolute_percentage_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"KNN MAPE: {mape:.3f}")
print(f"KNN R-squared: {r2:.3f}")

KNN MAPE: 0.430
KNN R-squared: 0.317


### KNN - With categorical variables

In [None]:
# KNN on every fully populated column, categorical ones included.
#
# Rows without a target are dropped *before* incomplete columns are discarded.
# Doing the column-wise dropna first would remove ClosePrice itself if it had
# any nulls, and the subsequent dropna(subset=['ClosePrice']) would then raise
# KeyError. When ClosePrice has no nulls both orders yield the same frame.
df_no_null_cols = mega_fd.dropna(subset=['ClosePrice']).dropna(axis=1)

feature_cols = [col for col in df_no_null_cols.columns if col != 'ClosePrice']

X = df_no_null_cols[feature_cols]
y = df_no_null_cols['ClosePrice']

# One-hot encode the categorical columns; drop_first omits the first level of each.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# k=3 here (the numeric-only KNN cell uses k=5).
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

# Score the held-out rows.
y_pred = knn.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"KNN MAPE (all columns, no nulls): {mape:.2f}")
print(f"KNN R-squared (all columns, no nulls): {r2:.3f}")

KNN MAPE (all columns, no nulls): 0.43
KNN R-squared (all columns, no nulls): 0.272


### Random Forest - Numerical variables only (top and bottom 1% of ClosePrice removed)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# Numeric predictors only (target excluded), restricted to complete rows.
num_cols_rf = [c for c in num_cols if c != 'ClosePrice']
df_rf = mega_fd[[*num_cols_rf, 'ClosePrice']].dropna()

# Trim the target's extremes: keep rows whose ClosePrice lies between the
# 1st and 99th percentiles, bounds included.
# NOTE(review): trimming happens before the train/test split (so the test set
# is trimmed too) and is not applied in the KNN cells — metrics are therefore
# not directly comparable with the KNN results.
q1 = df_rf['ClosePrice'].quantile(0.01)
q99 = df_rf['ClosePrice'].quantile(0.99)
df_rf_filtered = df_rf[df_rf['ClosePrice'].between(q1, q99)]

X, y = df_rf_filtered[num_cols_rf], df_rf_filtered['ClosePrice']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf.predict(X_test)
mape, r2 = mean_absolute_percentage_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Random Forest MAPE (outliers removed): {mape:.2f}")
print(f"Random Forest R-squared (outliers removed): {r2:.3f}")


Random Forest MAPE (outliers removed): 0.11
Random Forest R-squared (outliers removed): 0.866


### XGBoost - Numerical variables only (same outlier-trimmed data as Random Forest)

In [15]:
import xgboost as xgb

# Gradient-boosted trees on the same outlier-trimmed numeric frame
# (`df_rf_filtered` / `num_cols_rf`) built in the random forest cell.
X, y = df_rf_filtered[num_cols_rf], df_rf_filtered['ClosePrice']

# Same test_size and seed as the random forest cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgb_reg = xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score the held-out rows.
y_pred = xgb_reg.predict(X_test)
mape, r2 = mean_absolute_percentage_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"XGBoost MAPE (outliers removed): {mape:.2f}")
print(f"XGBoost R-squared (outliers removed): {r2:.3f}")

XGBoost MAPE (outliers removed): 0.12
XGBoost R-squared (outliers removed): 0.875


### LightGBM - Numerical variables only (same outlier-trimmed data as Random Forest)

In [17]:
import lightgbm as lgb

# LightGBM on the same outlier-trimmed numeric frame
# (`df_rf_filtered` / `num_cols_rf`) built in the random forest cell.
X, y = df_rf_filtered[num_cols_rf], df_rf_filtered['ClosePrice']

# Same test_size and seed as the random forest and XGBoost cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lgb_reg = lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score the held-out rows.
y_pred = lgb_reg.predict(X_test)
mape, r2 = mean_absolute_percentage_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"LightGBM MAPE (outliers removed): {mape:.2f}")
print(f"LightGBM R-squared (outliers removed): {r2:.3f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1989
[LightGBM] [Info] Number of data points in the train set: 14991, number of used features: 14
[LightGBM] [Info] Start training from score 1032721.465396
LightGBM MAPE (outliers removed): 0.12
LightGBM R-squared (outliers removed): 0.868
