In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import xgboost as xgb
import numpy as np
from scipy import stats

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [None]:
# Read the curated listings and discard the `agency` column in one pass.
data = (
    pd.read_csv('../../data/curated/cleaned_real_estate_data.csv')
    .drop(columns=["agency"])
)

Drop features that don't make sense to include

In [None]:
# `lat` / `lon`: location is already captured by postcode.
# `Median_rent_weekly`: data leakage -- it is itself a strong aggregate estimate
# of rent (when it was left in during testing it ranked as the top feature).
data = data.drop(["lat", "lon", "Median_rent_weekly"], axis=1)

Encode categorical

In [None]:
# Categorical columns to one-hot encode; the first level of each is dropped
# (drop_first=True) so it acts as the reference category.
factor_cols = ["property_type", "available_day", "available_month", "available_year"]

# dtype is pinned explicitly: pandas < 2.0 emits uint8 dummies while pandas >= 2.0
# emits bool. uint8 dummies count as numeric and would be picked up by any
# `select_dtypes(include=np.number)` step later on (e.g. outlier filtering),
# so leaving the default makes downstream results depend on the pandas version.
data = pd.get_dummies(data, columns=factor_cols, drop_first=True, dtype=bool)

Outlier removal (to compare model performance against the already-cleaned data)

In [None]:
def filter_z_score(df, threshold=3):
    """
    Drop rows in which any numeric column is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric columns are inspected; all columns are
        returned.
    threshold : float, default 3
        A row is kept only if |z| < threshold in every numeric column.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.

    Notes
    -----
    A z-score is undefined for missing values and for zero-variance columns.
    Such entries are treated as "not an outlier"; otherwise the NaN comparison
    would evaluate to False and a single constant column or missing value
    would silently remove every row.
    """
    df_numeric = df.select_dtypes(include=np.number)  # Select numeric columns
    if df_numeric.empty:
        # No numeric columns (or no rows): nothing can be flagged.
        return df.reset_index(drop=True)

    # Plain float ndarray so scipy sees NaN (not pd.NA) and returns an ndarray.
    values = df_numeric.to_numpy(dtype=float, na_value=np.nan)

    # nan_policy="omit" keeps one NaN from poisoning the whole column's mean/std.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs(stats.zscore(values, nan_policy="omit"))

    # Undefined z-scores (NaN) count as within the threshold.
    within = (z_scores < threshold) | np.isnan(z_scores)
    mask = within.all(axis=1)  # Keep rows where all numeric columns pass

    return df[mask].reset_index(drop=True)

def filter_iqr(df, multiplier=3):
    """
    Drop rows in which any numeric column falls outside its IQR fences.

    For every numeric column the fences are
    ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``; a row is removed
    if at least one of its numeric values lies strictly outside them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric columns are inspected; all columns are
        returned.
    multiplier : float, default 3
        Width of the fences in units of IQR. The default of 3 reproduces the
        previous hard-coded behaviour; 1.5 gives the conventional Tukey fences.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.

    Notes
    -----
    Comparisons against NaN are False, so missing values never flag a row.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    df_numeric = df.select_dtypes(include=np.number)
    q1 = df_numeric.quantile(0.25)
    q3 = df_numeric.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - multiplier * iqr
    upper_fence = q3 + multiplier * iqr

    is_outlier = (df_numeric < lower_fence) | (df_numeric > upper_fence)
    mask = ~is_outlier.any(axis=1)

    return df[mask].reset_index(drop=True)

In [None]:
# Two outlier-filtered variants of the same frame, one per filtering rule,
# so the models can be compared across them.
df_clean_z = filter_z_score(df=data)
df_clean_iqr = filter_iqr(df=data)

In [None]:
def fit_models(data):
    """
    Fit four regressors on `data` and summarise how well each predicts
    `weekly_rent`.

    The frame is split 80/20 (random_state=42) into train and test rows.
    A Random Forest and an XGBoost regressor are fitted on the features as
    given; an RBF-kernel SVR and a KNN regressor are fitted on standardised
    features, with the scaler fitted on the training rows only. Progress and
    cumulative elapsed time are printed as each model finishes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'weekly_rent' column (the target). Every other column
        is used as a feature.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns 'Model', 'MSE' and 'R2' (both measured
        on the test split). For the two tree models the columns 'top1' ..
        'topN' hold ``(feature, importance rounded to 2 dp)`` tuples in
        descending order of importance; they are NaN for SVR and KNN.

    Raises
    ------
    KeyError
        If `data` has no 'weekly_rent' column.

    Notes
    -----
    The test split is drawn from `data` itself, so rows removed before the
    call are absent from the evaluation as well. Scores obtained from
    differently filtered frames are therefore measured on different test rows.
    """
    import time

    if 'weekly_rent' not in data.columns:
        raise KeyError("fit_models expects a 'weekly_rent' target column")

    start_time = time.perf_counter()

    def announce_done(label):
        # Elapsed time is cumulative since the start of fit_models.
        print("{} done. Time elapsed: {:.2f}s".format(label, time.perf_counter() - start_time))

    def score(model, features, target):
        # Test-set MSE and R^2 for an already fitted model.
        predicted = model.predict(features)
        return {
            "MSE": mean_squared_error(target, predicted),
            "R2": r2_score(target, predicted),
        }

    def ranked_importances(model, feature_names):
        # Map top1..topN -> (feature, rounded importance), most important first.
        ranked = pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
        return {
            f"top{rank}": (feat, round(val, 2))
            for rank, (feat, val) in enumerate(ranked.items(), 1)
        }

    # Create train test
    X = data.drop(columns=['weekly_rent'])
    y = data['weekly_rent']

    print("Preparing train/test split...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Collected as plain dicts and turned into a frame once at the end;
    # growing a DataFrame with repeated pd.concat is quadratic and concatenating
    # onto an empty frame is deprecated in recent pandas.
    records = []

    # RFR
    print("Fitting Random Forest Regressor...")
    rfr = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    rfr.fit(X_train, y_train)
    rf_record = {"Model": "Random Forest Regressor", **score(rfr, X_test, y_test)}
    print("Finding Random Forest feature importance...")
    rf_record.update(ranked_importances(rfr, X.columns))
    records.append(rf_record)
    announce_done("Random Forest")

    # XGBoost
    print("Fitting XGBoost Regressor...")
    xgbr = xgb.XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=6, random_state=42)
    xgbr.fit(X_train, y_train, verbose=True)
    xgb_record = {"Model": "XGBoost Regressor", **score(xgbr, X_test, y_test)}
    print("Finding XGBoost feature importance...")
    xgb_record.update(ranked_importances(xgbr, X.columns))
    records.append(xgb_record)
    announce_done("XGBoost")

    # SVR and KNN are distance based, so both use standardised features.
    # The scaler is fitted on the training rows only to keep the test set unseen.
    print("Fitting SVR...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svr = SVR(kernel='rbf')
    svr.fit(X_train_scaled, y_train)
    records.append({"Model": "SVR", **score(svr, X_test_scaled, y_test)})
    announce_done("SVR")

    # KNN
    print("Fitting KNN Regressor...")
    knn = KNeighborsRegressor()
    knn.fit(X_train_scaled, y_train)
    records.append({"Model": "KNN Regressor", **score(knn, X_test_scaled, y_test)})
    announce_done("KNN")

    # Models without importances simply lack the topN keys and get NaN there.
    return pd.DataFrame(records)

In [None]:
# Baseline: fit on `data` as prepared above, without the extra outlier filtering.
fit_models(data)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 10.97s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 12.33s
Fitting SVR...
SVR done. Time elapsed: 20.08s
Fitting KNN Regressor...
KNN done. Time elapsed: 20.15s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top96,top97,top98,top99,top100,top101,top102,top103,top104,top105
0,Random Forest Regressor,21269.260908,0.687634,"(bathrooms, 0.18)","(Median_rent_weekly, 0.15)","(bedrooms, 0.12)","(Bachelor (%), 0.07)","(num_metro_tram_stops, 0.05)","(Median_tot_fam_inc_weekly, 0.04)","(days_listed, 0.04)",...,"(available_year_2019, 0.0)","(available_year_2021, 0.0)","(property_type_Villa, 0.0)","(available_year_2013, 0.0)","(available_year_2017, 0.0)","(property_type_Duplex, 0.0)","(available_year_2014, 0.0)","(property_type_Block of Units, 0.0)","(property_type_Terrace, 0.0)","(available_year_2015, 0.0)"
1,XGBoost Regressor,21633.151215,0.682289,"(Median_rent_weekly, 0.13)","(Bachelor (%), 0.12)","(bathrooms, 0.09)","(bedrooms, 0.04)","(Median_tot_fam_inc_weekly, 0.03)","(available_month_3, 0.03)","(population_est, 0.03)",...,"(property_type_Villa, 0.0)","(available_year_2019, 0.0)","(property_type_Duplex, 0.0)","(property_type_Block of Units, 0.0)","(crime_per_person, 0.0)","(crime_index, 0.0)","(property_type_Terrace, 0.0)","(available_year_2014, 0.0)","(available_year_2015, 0.0)","(available_year_2020, 0.0)"
2,SVR,56114.946525,0.17588,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,47015.236872,0.309521,,,,,,,,...,,,,,,,,,,


In [None]:
# Same models on the frame produced by filter_z_score.
fit_models(df_clean_z)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 8.74s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 10.21s
Fitting SVR...
SVR done. Time elapsed: 15.01s
Fitting KNN Regressor...
KNN done. Time elapsed: 15.05s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top96,top97,top98,top99,top100,top101,top102,top103,top104,top105
0,Random Forest Regressor,7978.869925,0.749934,"(Median_rent_weekly, 0.18)","(bathrooms, 0.15)","(bedrooms, 0.14)","(Bachelor (%), 0.08)","(num_metro_tram_stops, 0.05)","(population_est, 0.04)","(days_listed, 0.03)",...,"(property_type_Duplex, 0.0)","(available_year_2021, 0.0)","(property_type_Block of Units, 0.0)","(available_year_2018, 0.0)","(available_year_2017, 0.0)","(available_year_2016, 0.0)","(property_type_Terrace, 0.0)","(available_year_2014, 0.0)","(available_year_2015, 0.0)","(available_year_2019, 0.0)"
1,XGBoost Regressor,7777.283471,0.756252,"(Median_rent_weekly, 0.19)","(Bachelor (%), 0.11)","(bathrooms, 0.1)","(bedrooms, 0.06)","(population_est, 0.04)","(Mortgage (%), 0.02)","(Certificate_level (%), 0.02)",...,"(property_type_Block of Units, 0.0)","(crime_index, 0.0)","(crime_per_person, 0.0)","(property_type_Terrace, 0.0)","(available_year_2018, 0.0)","(available_year_2015, 0.0)","(available_year_2014, 0.0)","(available_year_2017, 0.0)","(available_year_2016, 0.0)","(available_year_2019, 0.0)"
2,SVR,23561.178824,0.261567,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,20068.529329,0.371031,,,,,,,,...,,,,,,,,,,


In [None]:
# Same models on the frame produced by filter_iqr.
fit_models(df_clean_iqr)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 6.95s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 8.01s
Fitting SVR...
SVR done. Time elapsed: 12.02s
Fitting KNN Regressor...
KNN done. Time elapsed: 12.05s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top96,top97,top98,top99,top100,top101,top102,top103,top104,top105
0,Random Forest Regressor,6695.625328,0.763537,"(bathrooms, 0.19)","(bedrooms, 0.15)","(Median_rent_weekly, 0.13)","(Bachelor (%), 0.1)","(num_metro_tram_stops, 0.05)","(num_metro_bus_stops, 0.03)","(population_est, 0.03)",...,"(property_type_Car Space, 0.0)","(num_regional_bus_stops, 0.0)","(property_type_Terrace, 0.0)","(property_type_Farm, 0.0)","(available_year_2018, 0.0)","(available_year_2014, 0.0)","(available_year_2013, 0.0)","(available_year_2016, 0.0)","(available_year_2015, 0.0)","(available_year_2019, 0.0)"
1,XGBoost Regressor,6814.393183,0.759342,"(bathrooms, 0.14)","(Median_rent_weekly, 0.14)","(Bachelor (%), 0.12)","(bedrooms, 0.06)","(Average_household_size, 0.04)","(population_est, 0.04)","(Mortgage (%), 0.03)",...,"(property_type_Terrace, 0.0)","(property_type_Farm, 0.0)","(available_year_2018, 0.0)","(available_year_2016, 0.0)","(available_year_2015, 0.0)","(available_year_2014, 0.0)","(available_month_12, 0.0)","(available_year_2013, 0.0)","(available_year_2019, 0.0)","(available_year_2020, 0.0)"
2,SVR,21420.708175,0.243505,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,17567.787221,0.379575,,,,,,,,...,,,,,,,,,,
