In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import xgboost as xgb
import numpy as np
from scipy import stats

In [15]:
# Categorical columns that get expanded into indicator (dummy) columns.
factor_cols = ["property_type", "available_day", "available_month", "available_year"]

# Load -> drop the `agency` column -> one-hot encode the factors, in one chain
# so `data` is bound exactly once. drop_first=True omits the first level of
# each factor.
data = (
    pd.read_csv('data/curated/cleaned_real_estate_data.csv')
    .drop(columns=["agency"])
    .pipe(pd.get_dummies, columns=factor_cols, drop_first=True)
)

In [16]:
def filter_z_score(df, threshold=3):
    """
    Drop rows holding a z-score outlier in any numeric column.

    A value is an outlier when its absolute z-score is >= ``threshold``.
    Z-scores are computed per column from the mean and the population
    standard deviation (``ddof=0``), ignoring missing values.

    A z-score is undefined for a missing value and for every value of a
    column with zero (or undefined) spread. Undefined z-scores never flag a
    row, so a single NaN or a constant column cannot wipe out the whole
    frame. Rows with missing values are therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only its numeric columns are inspected; non-numeric
        columns are carried through untouched.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` (all columns) with a fresh 0..n-1 index.
    """
    df_numeric = df.select_dtypes(include=np.number)

    col_mean = df_numeric.mean()
    col_std = df_numeric.std(ddof=0)
    # Zero / NaN spread -> NaN, so the division below yields NaN (never inf)
    # for columns where a z-score is meaningless.
    col_std = col_std.where(col_std > 0)

    z_scores = ((df_numeric - col_mean) / col_std).abs()

    # NaN >= threshold is False, so undefined z-scores do not flag the row.
    is_outlier = (z_scores >= threshold).any(axis=1)

    # Positional mask: independent of how df's index is labelled.
    return df[~is_outlier.to_numpy()].reset_index(drop=True)

def filter_iqr(df, multiplier=3):
    """
    Drop rows holding an IQR outlier in any numeric column.

    For each numeric column the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR`` (``IQR = Q3 - Q1``). A row is removed when any
    of its numeric values lies strictly outside its column's fences.
    Missing values compare False against both fences and never flag a row.

    Note: when a column's IQR is 0 both fences collapse onto Q1 == Q3, so
    every value different from that quartile is treated as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only its numeric columns are inspected; non-numeric
        columns are carried through untouched.
    multiplier : float, default 3
        Fence width in units of IQR. The default reproduces the previously
        hard-coded value; 1.5 gives the conventional box-plot fences.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` (all columns) with a fresh 0..n-1 index.
    """
    df_numeric = df.select_dtypes(include=np.number)

    q1 = df_numeric.quantile(0.25)
    q3 = df_numeric.quantile(0.75)
    reach = multiplier * (q3 - q1)

    too_low = df_numeric < (q1 - reach)
    too_high = df_numeric > (q3 + reach)
    is_outlier = (too_low | too_high).any(axis=1)

    return df[~is_outlier].reset_index(drop=True)

In [17]:
# Two outlier-filtered variants of the same encoded frame. The calls are
# independent of each other; `data` itself is left as the unfiltered baseline.
df_clean_iqr = filter_iqr(data)      # per-column IQR fences
df_clean_z = filter_z_score(data)    # per-column |z| cut-off (default threshold)

In [18]:
def fit_models(data, top_n=None):
    """
    Fit four regressors on ``data`` and summarise their held-out performance.

    ``weekly_rent`` is the target; every other column is used as a feature
    as-is. The rows are split 80/20 into train and test sets
    (``random_state=42``), then a Random Forest, an XGBoost regressor, an
    RBF-kernel SVR and a KNN regressor are fitted on the training set and
    scored on the test set. SVR and KNN are given standardised features; the
    scaler is fitted on the training set only.

    Progress messages are printed along the way. The reported time is the
    cumulative wall-clock time since the function was entered.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``weekly_rent`` column.
    top_n : int or None, default None
        How many of the highest-ranked features to report for the tree-based
        models. ``None`` reports every feature.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``MSE`` and ``R2``. The
        Random Forest and XGBoost rows additionally fill ``top1``, ``top2``,
        ... with feature names ordered by decreasing ``feature_importances_``;
        those columns are NaN for SVR and KNN.
    """
    import time
    start_time = time.time()

    # Create train test
    X = data.drop(columns=['weekly_rent'])
    y = data['weekly_rent']

    print("Preparing train/test split...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    def score(model, train_features, test_features, **fit_kwargs):
        """Fit ``model`` on the training set and return (MSE, R^2) on the test set."""
        model.fit(train_features, y_train, **fit_kwargs)
        predictions = model.predict(test_features)
        return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

    def ranked_features(model):
        """Map 'top1', 'top2', ... to feature names by decreasing importance."""
        importances = pd.Series(model.feature_importances_, index=X.columns)
        # mergesort is stable, so features with exactly equal importance
        # come out in a deterministic order.
        ranked = importances.sort_values(ascending=False, kind="mergesort")
        return {f"top{rank}": name for rank, name in enumerate(ranked.index[:top_n], 1)}

    def report_done(label):
        """Print the completion message with the cumulative elapsed time."""
        print("{} done. Time elapsed: {:.2f}s".format(label, time.time() - start_time))

    # One dict per model; turned into a DataFrame once at the end instead of
    # concatenating onto an empty frame after every model.
    records = []

    # RFR
    print("Fitting Random Forest Regressor...")
    rfr = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    mse_rfr, r2_rfr = score(rfr, X_train, X_test)
    print("Finding Random Forest feature importance...")
    records.append({
        "Model": "Random Forest Regressor",
        "MSE": mse_rfr,
        "R2": r2_rfr,
        **ranked_features(rfr),
    })
    report_done("Random Forest")

    # XGBoost
    print("Fitting XGBoost Regressor...")
    xgbr = xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        verbosity=1,
    )
    mse_xgb, r2_xgb = score(xgbr, X_train, X_test, verbose=True)
    print("Finding XGBoost feature importance...")
    records.append({
        "Model": "XGBoost Regressor",
        "MSE": mse_xgb,
        "R2": r2_xgb,
        **ranked_features(xgbr),
    })
    report_done("XGBoost")

    # SVR and KNN share one scaling, learned from the training rows only so
    # that no test-set statistics leak into the fit.
    print("Fitting SVR...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    mse_svr, r2_svr = score(SVR(kernel='rbf'), X_train_scaled, X_test_scaled)
    records.append({"Model": "SVR", "MSE": mse_svr, "R2": r2_svr})
    report_done("SVR")

    # KNN
    print("Fitting KNN Regressor...")
    mse_knn, r2_knn = score(KNeighborsRegressor(), X_train_scaled, X_test_scaled)
    records.append({"Model": "KNN Regressor", "MSE": mse_knn, "R2": r2_knn})
    report_done("KNN")

    # Keys missing from a record (the top* columns for SVR / KNN) become NaN.
    return pd.DataFrame(records)

In [19]:
# Baseline: all rows of the encoded frame, no outlier filtering from this notebook.
fit_models(data)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 12.61s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 13.97s
Fitting SVR...
SVR done. Time elapsed: 21.33s
Fitting KNN Regressor...
KNN done. Time elapsed: 21.40s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top98,top99,top100,top101,top102,top103,top104,top105,top106,top107
0,Random Forest Regressor,21248.605125,0.687937,bathrooms,Median_rent_weekly,bedrooms,Bachelor (%),num_metro_tram_stops,lon,Median_tot_fam_inc_weekly,...,property_type_Villa,available_year_2020,available_year_2013,available_year_2021,available_year_2017,property_type_Duplex,available_year_2014,property_type_Block of Units,property_type_Terrace,available_year_2015
1,XGBoost Regressor,21455.532702,0.684898,Median_rent_weekly,Bachelor (%),bathrooms,bedrooms,Median_tot_fam_inc_weekly,available_month_3,Certificate_level (%),...,property_type_Villa,available_year_2020,available_year_2017,property_type_Block of Units,crime_per_person,crime_index,property_type_Duplex,property_type_Terrace,available_year_2015,available_year_2014
2,SVR,56242.815065,0.174002,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,47092.752762,0.308383,,,,,,,,...,,,,,,,,,,


In [20]:
# Same models on the rows that passed the z-score outlier filter.
fit_models(df_clean_z)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 8.46s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 9.63s
Fitting SVR...
SVR done. Time elapsed: 14.01s
Fitting KNN Regressor...
KNN done. Time elapsed: 14.06s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top98,top99,top100,top101,top102,top103,top104,top105,top106,top107
0,Random Forest Regressor,8497.190156,0.726589,Median_rent_weekly,bathrooms,bedrooms,Bachelor (%),num_metro_tram_stops,lon,population_est,...,available_year_2020,available_year_2016,available_year_2019,property_type_Block of Units,property_type_Duplex,property_type_Terrace,available_year_2018,available_year_2017,available_year_2015,available_year_2014
1,XGBoost Regressor,8073.762872,0.740213,Median_rent_weekly,Bachelor (%),bathrooms,bedrooms,population_est,Mortgage (%),Total rented (%),...,crime_per_person,crime_index,property_type_Terrace,available_year_2018,available_year_2016,available_year_2015,available_year_2014,available_year_2017,available_year_2019,available_year_2020
2,SVR,23514.390202,0.243385,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,19559.233423,0.370649,,,,,,,,...,,,,,,,,,,


In [21]:
# Same models on the rows that passed the IQR outlier filter.
fit_models(df_clean_iqr)

Preparing train/test split...
Fitting Random Forest Regressor...
Finding Random Forest feature importance...
Random Forest done. Time elapsed: 7.20s
Fitting XGBoost Regressor...
Finding XGBoost feature importance...
XGBoost done. Time elapsed: 8.39s
Fitting SVR...
SVR done. Time elapsed: 11.73s
Fitting KNN Regressor...
KNN done. Time elapsed: 11.77s


Unnamed: 0,Model,MSE,R2,top1,top2,top3,top4,top5,top6,top7,...,top98,top99,top100,top101,top102,top103,top104,top105,top106,top107
0,Random Forest Regressor,6643.125717,0.763837,bathrooms,bedrooms,Median_rent_weekly,Bachelor (%),num_metro_tram_stops,lon,lat,...,available_year_2018,available_year_2016,property_type_Car Space,num_regional_bus_stops,property_type_Terrace,property_type_Farm,available_year_2015,available_year_2014,available_year_2013,available_year_2019
1,XGBoost Regressor,6912.113131,0.754275,Median_rent_weekly,bathrooms,Bachelor (%),bedrooms,population_est,Average_household_size,Mortgage (%),...,crime_per_person,crime_index,property_type_Terrace,property_type_Farm,available_year_2013,available_year_2014,available_year_2015,available_year_2016,available_year_2019,available_year_2020
2,SVR,20933.874701,0.255802,,,,,,,,...,,,,,,,,,,
3,KNN Regressor,17111.010719,0.391704,,,,,,,,...,,,,,,,,,,
