In [1]:
# We might need `imblearn`, so it's useful to install it if you haven't already!
# !pip install imblearn

import math                                                                                # Mathematical functions + calculations
import warnings                                                                            # Warnings management
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd                                                                        # Data analysis + manipulation
import numpy as np                                                                         # Array computations + mathematical functions
import statsmodels.api as sm                                                               # Statistical computations for models
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN                                                      # Performs both under- and oversampling on a dataset
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV, RandomizedSearchCV      # Split data into random train and test subsets
from sklearn.linear_model import LogisticRegression, LinearRegression                                        # Regressor model for classification cases
from sklearn.neural_network import MLPRegressor, MLPClassifier                             # Multilayer perceptron model for regression cases
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report # MSE and MAE used in model evaluation + overview evaluation features
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBClassifier
from IPython.display import display

%matplotlib inline
%env PYTHONWARNINGS=ignore
rng = np.random.RandomState(0)



## 1. HR Analytics

### 1.1 Load and Sample the data

In [None]:
df = pd.read_csv("Datasets/hr_data_new.csv")
df.head()

In [None]:
df.info()

The number of entries for each column match the total entry count for the whole dataset, meaning that there are no null values. They were handled properly in Assignment 1.

There were a number of operations done in the last assignment, including:
- ~~Proportional stratified sampling~~
- Encoding categorical features
- Feature normalisation

This means that I won't need to revisit these operations again for this dataset.

In [None]:
# Defining the X and Y datasets then splitting them
X = df.drop("is_promoted", axis=1)
y = df["is_promoted"]

print("=== Original dataset proportion ===")
print(y.value_counts())
print(f"Ratio (UP : P): {len(y[y == 0]) / len(y[y == 1])}\n")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=rng)
X_train, y_train = SMOTEENN(random_state=rng).fit_resample(X_train, y_train)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)

print("=== Post-split + sampled proportions ===")
print("Training set")
print(y_train.value_counts())
print(f"Ratio (UP : P): {len(y_train[y_train == 0]) / len(y_train[y_train == 1])}")

print("Testing set")
print(y_test.value_counts())
print(f"Ratio (UP : P): {len(y_test[y_test == 0]) / len(y_test[y_test == 1])}")

### 1.2 and 1.3 Build the Model(s) + Evaluate and Improve the Model(s)

#### Logistic Regression

A logistic regression model is suitable for this test case because this problem's target features is a binary statement (a yes or no answer). We can build a binomial logistic regression classifier then later fine tune it as one possible model to evaluate in this problem.

##### Base Model

In [None]:
# Viewing details about the data-model relationship
lg_sm = sm.Logit(y_train, X_train).fit()
lg_sm.summary()

`[!]` With the original dataset from the notebook in Assignment 1, the p-values for all the features are 0.000. While lower p-values generally suggest higher confidence on the coefficients of the features, the fact that all of them are this low suggests a possible issue with the model and the dataset.

I looked up to learn more about it and came to the possibility of two cases happening:
- Perfect separation: the state where a particular combination of predictor variables perfectly predicts the outcome variable
- Multicollinearity: the state where there is high correlation between two or more predictors)

I found that the way sampling was done may have had a significant influence on the way this model learns and adapts. I was previously using stratified proportional sampling where the proportions remained the same as the imbalances attempt to even out, but that failed. This new method simply undersamples the majority class to match the amount in the minority class, and this significantly improved the model as seen in the f-score (initially with a score ~0.1 to now, ~0.6) below.

In [None]:
# Building the classifier
lg = LogisticRegression(max_iter=10000, random_state=rng)
lg.fit(X_train, y_train)

In [None]:
# Determining the accuracy on the training and testing sets
lg_train_cv = cross_validate(lg, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (logistic regression): {sum(lg_train_cv['test_accuracy']) / len(lg_train_cv['test_accuracy'])}")

lg_test_cv = cross_validate(lg, X_test, y_test, cv=skf, scoring=["accuracy", "f1"])
print(f"Testing accuracy (logistic regression) : {sum(lg_test_cv['test_accuracy']) / len(lg_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = lg.predict(X_train)

print("=== Classification report (logistic regression, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = lg.predict(X_test)

print("=== Classification report (logistic regression, testing set) ===")
print(classification_report(y_test, y_test_pred))

##### GridSearch Tuning

In [None]:
# Trying a combination of hyperparameters to determine the best for this classifier
param_grid = {
    "penalty": ["none", "l1", "l2", "elasticnet"],
    "solver": ["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
    "max_iter": [1000, 2000, 4000, 10000]
}

lg = LogisticRegression(random_state=rng)
gs = GridSearchCV(lg, param_grid=param_grid, scoring="f1", cv=skf, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)

    print(f"Training f1-score (logistic regression, tuned): {gs.best_score_}")
    print(f"Best combination (logistic regression, tuned) : {gs.best_params_}")

##### Tuned Model

In [None]:
# Building the classifier
lg = gs.best_estimator_

In [None]:
# Determining the accuracy on the training and testing sets
lg_train_cv = cross_validate(lg, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (logistic regression, tuned): {sum(lg_train_cv['test_accuracy']) / len(lg_train_cv['test_accuracy'])}")

lg_test_cv = cross_validate(lg, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (logistic regression, tuned) : {sum(lg_test_cv['test_accuracy']) / len(lg_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = lg.predict(X_train)

print("=== Classification report (logistic regression, tuned, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = lg.predict(X_test)

print("=== Classification report (logistic regression, tuned, testing set) ===")
print(classification_report(y_test, y_test_pred))

I can make the observation that the model is able to train sufficiently well on the training set as seen by its relatively high accuracy and F1 scores. The testing set shows a model with a higher accuracy at around the 90%, but I think that this is inaccurate. Displaying the classification report proved my point, with a higher precision, recall, and F1 score in the majority set (non-promoted employees) than those that who were.

I can accept this model's behaviour though since there is a major disproportion in the test data. The recall of the minority class remains somewhat high though, meaning of all positive predictions, there are many employees in it that actually were.

#### Multilayered Perceptron ANN

##### Base Model

In [None]:
# Building the classifier
mlp = MLPClassifier(random_state=rng)
with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings
    
    mlp.fit(X_train, y_train)

In [None]:
# Determining the accuracy on the training and testing sets
mlp_train_cv = cross_validate(lg, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (MLP ANN): {sum(mlp_train_cv['test_accuracy']) / len(mlp_train_cv['test_accuracy'])}")

mlp_test_cv = cross_validate(lg, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (MLP ANN) : {sum(mlp_test_cv['test_accuracy']) / len(mlp_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = mlp.predict(X_train)

print("=== Classification report (MLP ANN, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = mlp.predict(X_test)

print("=== Classification report (MLP ANN, testing set) ===")
print(classification_report(y_test, y_test_pred))

##### GridSearch Tuning

In [None]:
# Trying a combination of hyperparameters to determine the best combination
param_grid = {
    "hidden_layer_sizes": [(10, ), (25, ), (50, ), (100, )],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "learning_rate": ["constant", "invscaling", "adaptive"]
}
mlp = MLPClassifier(random_state=rng)
gs = RandomizedSearchCV(mlp, param_distributions=param_grid, scoring="f1", cv=skf, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)
    
    print(f"Best combination (MLP ANN, tuned): {gs.best_params_}")

##### Tuned Model

In [None]:
# Building the classifier
mlp = gs.best_estimator_

In [None]:
# Determining the accuracy on the training and testing sets
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mlp_train_cv = cross_validate(mlp, X_train, y_train, cv=skf, scoring=["accuracy"])
    print(f"Training accuracy (MLP ANN, tuned): {sum(mlp_train_cv['test_accuracy']) / len(mlp_train_cv['test_accuracy'])}")

    mlp_test_cv = cross_validate(mlp, X_test, y_test, cv=skf, scoring=["accuracy"])
    print(f"Testing accuracy (MLP ANN, tuned) : {sum(mlp_test_cv['test_accuracy']) / len(mlp_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = mlp.predict(X_train)

print("=== Classification report (MLP ANN, tuned, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = mlp.predict(X_test)

print("=== Classification report (MLP ANN, tuned, testing set) ===")
print(classification_report(y_test, y_test_pred))

#### Random Forest

##### Base Model

In [None]:
# Building the classifier
rf = RandomForestClassifier(random_state=rng)
rf.fit(X_train, y_train)

In [None]:
# Determining the accuracy on the training and testing sets
rf_train_cv = cross_validate(rf, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (random forest): {sum(rf_train_cv['test_accuracy']) / len(rf_train_cv['test_accuracy'])}")

rf_test_cv = cross_validate(rf, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (random forest) : {sum(rf_test_cv['test_accuracy']) / len(rf_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = rf.predict(X_train)

print("=== Classification report (random forest, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = rf.predict(X_test)

print("=== Classification report (random forest, testing set) ===")
print(classification_report(y_test, y_test_pred))

##### GridSearch Tuning

In [None]:
# Tries a combination of hyperparameters to determine the best for this classifier
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 10, 15],
    "max_features": ["sqrt", "log2", None],
    "class_weight": ["balanced", "balanced_subsample", None]
}

lg = RandomForestClassifier(random_state=rng)
gs = GridSearchCV(lg, param_grid=param_grid, cv=skf, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)

    print(f"Best combination (random forest, tuned) : {gs.best_params_}")

##### Tuned Model

In [None]:
# Building the classifier
rf = gs.best_estimator_

In [None]:
# Determining the accuracy on the training and testing sets
rf_train_cv = cross_validate(rf, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (random forest, tuned): {sum(rf_train_cv['test_accuracy']) / len(rf_train_cv['test_accuracy'])}")

rf_test_cv = cross_validate(rf, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (random forest, tuned) : {sum(rf_test_cv['test_accuracy']) / len(rf_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = rf.predict(X_train)

print("=== Classification report (random forest, tuned, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = rf.predict(X_test)

print("=== Classification report (random forest, tuned, testing set) ===")
print(classification_report(y_test, y_test_pred))

#### XGBoost

In [None]:
# Building the classifier
xgb = XGBClassifier(objective="binary:logistic", random_state=rng)
xgb.fit(X_train, y_train)

In [None]:
# Determining the accuracy on the training and testing sets
xgb_train_cv = cross_validate(xgb, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (XGBoost): {sum(xgb_train_cv['test_accuracy']) / len(xgb_train_cv['test_accuracy'])}")

xgb_test_cv = cross_validate(xgb, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (XGBoost) : {sum(xgb_test_cv['test_accuracy']) / len(xgb_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = xgb.predict(X_train)

print("=== Classification report (XGBoost, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = xgb.predict(X_test)

print("=== Classification report (XGBoost, testing set) ===")
print(classification_report(y_test, y_test_pred))

##### GridSearch Tuning

In [None]:
# Trying a combination of hyperparameters to determine the best for this classifier
param_grid = {
    "booster": ["gbtree", "gblinear", "dart"],
    "max_depth": [5, 10, 15],
    "subsample": [0, 0.5, 1],
    "sampling_method": ["uniform", "gradient_based"],
}

xgb = XGBClassifier(objective="binary:logistic", random_state=rng)
gs = GridSearchCV(xgb, param_grid=param_grid, cv=skf, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)

    print(f"Best combination (XGBooster, tuned) : {gs.best_params_}")

##### Tuned Model

In [None]:
# Building the classifier
xgb = gs.best_estimator_

In [None]:
# Determining the accuracy on the training and testing sets
xgb_train_cv = cross_validate(lg, X_train, y_train, cv=skf, scoring=["accuracy"])
print(f"Training accuracy (random forest): {sum(xgb_train_cv['test_accuracy']) / len(xgb_train_cv['test_accuracy'])}")

xgb_test_cv = cross_validate(lg, X_test, y_test, cv=skf, scoring=["accuracy"])
print(f"Testing accuracy (random forest) : {sum(xgb_test_cv['test_accuracy']) / len(xgb_test_cv['test_accuracy'])}")

In [None]:
# Analyzing the model's performance through main classification metrics
y_train_pred = xgb.predict(X_train)

print("=== Classification report (XGBoost, tuned, training set) ===")
print(classification_report(y_train, y_train_pred))

In [None]:
y_test_pred = xgb.predict(X_test)

print("=== Classification report (XGBoost, tuned, testing set) ===")
print(classification_report(y_test, y_test_pred))

---

## 2. Airbnb

### 2.1 Load and Sample the data

In [52]:
df = pd.read_csv("Datasets/listings_new.csv")
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365
0,North Region,Woodlands,1.44255,103.7958,1.0,83,5.198497,0.693147,0.00995,1.0
1,Central Region,Bukit Timah,1.33235,103.78521,1.0,81,4.51086,2.944439,0.24686,1.0
2,North Region,Woodlands,1.44246,103.79667,1.0,69,1.94591,3.044522,0.182322,1.0
3,East Region,Tampines,1.34541,103.95712,1.0,206,0.693147,2.70805,0.139762,0.967123
4,East Region,Tampines,1.34567,103.95963,1.0,94,0.693147,3.135494,0.198851,0.972603


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighbourhood_group  7497 non-null   object 
 1   neighbourhood        7497 non-null   object 
 2   latitude             7497 non-null   float64
 3   longitude            7497 non-null   float64
 4   room_type            7497 non-null   float64
 5   price                7497 non-null   int64  
 6   minimum_nights       7497 non-null   float64
 7   number_of_reviews    7497 non-null   float64
 8   reviews_per_month    7497 non-null   float64
 9   availability_365     7497 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 585.8+ KB


In [54]:
df.describe()

Unnamed: 0,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365
count,7497.0,7497.0,7497.0,7497.0,7497.0,7497.0,7497.0,7497.0
mean,1.314649,103.848951,1.456583,132.511538,1.85789,1.418348,0.376534,0.565316
std,0.030615,0.044146,0.592793,82.14694,1.267354,1.448394,0.478031,0.400851
min,1.24526,103.66547,0.0,0.0,0.693147,0.0,0.0,0.0
25%,1.29601,103.8361,1.0,62.0,0.693147,0.0,0.0,0.142466
50%,1.31125,103.84981,2.0,118.0,1.386294,1.098612,0.157004,0.70137
75%,1.32255,103.87535,2.0,181.0,2.397895,2.484907,0.620576,0.969863
max,1.45459,103.97342,2.0,378.0,6.908755,5.780744,2.639057,1.0


Because of the hint given for this problem in the previous assignment, I opted to keep some parts of the dataset available but not for use by the model. Instead, it'll be used to split the dataset into further smaller subsets. That way, I can make specific models for a particular criteria (e.g., neighbourhood group) that might yield better results than if I were to create one generalised model for everything.

Because there isn't much time, I have opted to pick two subsets to pursue for this problem:
- A subset for listings in the Central region (`neighbourhood_group` is `Central`)
- A subset for listings offering a private room (`room_type` is `Private room`)

In [55]:
# Splitting the dataset into neighborhood groups and encoding the neighborhood categorical feature
neighborhood_groups = [group for group in df.groupby("neighbourhood_group", as_index=False)]
neighborhoods = [group for group in df.groupby("neighbourhood", as_index=False)]

for group in neighborhood_groups:
    group_df = group[1]
    group_df["neighbourhood"] = OrdinalEncoder(categories=[group_df["neighbourhood"].unique()]).fit_transform(group_df[["neighbourhood"]])
    group_df.drop(["neighbourhood_group", "reviews_per_month", "latitude", "longitude"], axis=1, inplace=True)

for group in neighborhoods:
    group_df = group[1]
    group_df.drop(["neighbourhood_group", "neighbourhood", "reviews_per_month", "latitude", "longitude"], axis=1, inplace=True)

central_df = neighborhood_groups[0][1]
kallang_df = neighborhoods[15][1]

display(central_df)
display(kallang_df)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,availability_365
1,0.0,1.0,81,4.510860,2.944439,1.000000
10,1.0,1.0,44,2.772589,2.944439,0.906849
11,1.0,1.0,40,3.433987,2.397895,0.756164
12,2.0,1.0,65,1.098612,4.836282,0.920548
13,1.0,1.0,44,3.433987,2.639057,0.931507
...,...,...,...,...,...,...
7492,6.0,2.0,100,1.386294,0.000000,0.167123
7493,6.0,2.0,100,1.386294,0.000000,0.167123
7494,11.0,1.0,58,3.433987,0.000000,0.473973
7495,4.0,1.0,56,2.708050,0.000000,0.082192


Unnamed: 0,room_type,price,minimum_nights,number_of_reviews,availability_365
44,1.0,100,5.902633,0.000000,1.000000
59,1.0,49,2.944439,1.098612,0.986301
60,1.0,81,4.510860,2.708050,0.000000
68,1.0,56,2.944439,0.693147,0.909589
77,1.0,40,2.944439,2.197225,0.983562
...,...,...,...,...,...
7464,2.0,110,2.708050,0.000000,1.000000
7469,1.0,49,2.944439,0.000000,0.983562
7475,1.0,58,2.944439,0.000000,0.969863
7479,1.0,129,1.386294,0.000000,0.260274


### 2.2 and 2.3 Build the Model(s) + Evaluate and Improve the Model(s)

#### Central Listings

In [103]:
# Defining the X and Y datasets then splitting them
X = central_df.drop("price", axis=1)
y = central_df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rng)

##### Linear Regression

In [57]:
# Viewing details about the data-model relationship
lr_sm = sm.OLS(y_train, X_train).fit()
lr_sm.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.855
Model:,OLS,Adj. R-squared (uncentered):,0.854
Method:,Least Squares,F-statistic:,4889.0
Date:,"Thu, 08 Feb 2024",Prob (F-statistic):,0.0
Time:,22:44:16,Log-Likelihood:,-23118.0
No. Observations:,4162,AIC:,46250.0
Df Residuals:,4157,BIC:,46280.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
neighbourhood,2.2333,0.205,10.903,0.000,1.832,2.635
room_type,90.3100,1.260,71.693,0.000,87.840,92.780
minimum_nights,-12.7232,0.836,-15.220,0.000,-14.362,-11.084
number_of_reviews,-4.3168,0.643,-6.713,0.000,-5.577,-3.056
availability_365,28.1919,2.386,11.816,0.000,23.514,32.869

0,1,2,3
Omnibus:,555.962,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,824.786
Skew:,0.977,Prob(JB):,7.94e-180
Kurtosis:,3.97,Cond. No.,22.6


###### Base Model

In [58]:
# Building the regressor
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [59]:
# Determining important metrics on the training and testing sets
lr_train_cv = cross_validate(lr, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (linear regression, Central listings): {sum(np.sqrt(-lr_train_cv['test_neg_mean_squared_error'])) / len(lr_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (linear regression, Central listings) : {sum(-lr_train_cv['test_neg_mean_absolute_error']) / len(lr_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (linear regression, Central listings) : {sum(lr_train_cv['test_r2']) / len(lr_train_cv['test_r2'])}")

print()

lr_test_cv = cross_validate(lr, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (linear regression, Central listings) : {sum(np.sqrt(-lr_test_cv['test_neg_mean_squared_error'])) / len(lr_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (linear regression, Central listings)  : {sum(-lr_test_cv['test_neg_mean_absolute_error']) / len(lr_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (linear regression, Central listings)  : {sum(lr_test_cv['test_r2']) / len(lr_test_cv['test_r2'])}")

Training RMSE (linear regression, Central listings): 62.614516152134044
Training MAE (linear regression, Central listings) : 48.60042703468012
Training R^2 (linear regression, Central listings) : 0.42285968479150976

Testing RMSE (linear regression, Central listings) : 61.914760004667166
Testing MAE (linear regression, Central listings)  : 48.18668693147375
Testing R^2 (linear regression, Central listings)  : 0.4393384080090776


In [60]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(lr, open("Models/central_lr.pkl", "wb"))

##### Support Vector Machine

###### Base Model

In [104]:
# Building the regressor
svm = SVR()
svm.fit(X_train, y_train)

SVR()

In [62]:
# Determining important metrics on the training and testing sets
svm_train_cv = cross_validate(svm, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (SVR, Central listings): {sum(np.sqrt(-svm_train_cv['test_neg_mean_squared_error'])) / len(svm_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (SVR, Central listings) : {sum(-svm_train_cv['test_neg_mean_absolute_error']) / len(svm_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (SVR, Central listings) : {sum(svm_train_cv['test_r2']) / len(svm_train_cv['test_r2'])}")

print()

svm_test_cv = cross_validate(svm, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (SVR, Central listings) : {sum(np.sqrt(-svm_test_cv['test_neg_mean_squared_error'])) / len(svm_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (SVR, Central listings)  : {sum(-svm_test_cv['test_neg_mean_absolute_error']) / len(svm_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (SVR, Central listings)  : {sum(svm_test_cv['test_r2']) / len(svm_test_cv['test_r2'])}")

Training RMSE (SVR, Central listings): 76.61187752834294
Training MAE (SVR, Central listings) : 57.95632428197242
Training R^2 (SVR, Central listings) : 0.13650947884049303

Testing RMSE (SVR, Central listings) : 80.00573921993806
Testing MAE (SVR, Central listings)  : 61.94573222463043
Testing R^2 (SVR, Central listings)  : 0.06446892523847816


###### GridSearch Tuning

In [105]:
# Tries a combination of hyperparameters to determine the best for this regressor
param_grid = { 
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "epsilon": [0.1, 0.2, 0.3, 0.4]
}
svm = SVR()
gs = GridSearchCV(svm, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings

    gs.fit(X_train, y_train)

    print(f"Best combination (SVR, tuned, Central listings): {gs.best_params_}")

Best combination (SVR, tuned, Central listings): {'C': 100, 'epsilon': 0.2, 'kernel': 'rbf'}


###### Tuned Model

In [106]:
# Building the regressor
svm = gs.best_estimator_

In [107]:
# Determining important metrics on the training and testing sets
svm_train_cv = cross_validate(svm, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (SVR, tuned, Central listings): {sum(np.sqrt(-svm_train_cv['test_neg_mean_squared_error'])) / len(svm_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (SVR, tuned, Central listings) : {sum(-svm_train_cv['test_neg_mean_absolute_error']) / len(svm_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (SVR, tuned, Central listings) : {sum(svm_train_cv['test_r2']) / len(svm_train_cv['test_r2'])}")

print()

svm_test_cv = cross_validate(svm, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (SVR, tuned, Central listings) : {sum(np.sqrt(-svm_test_cv['test_neg_mean_squared_error'])) / len(svm_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (SVR, tuned, Central listings)  : {sum(-svm_test_cv['test_neg_mean_absolute_error']) / len(svm_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (SVR, tuned, Central listings)  : {sum(svm_test_cv['test_r2']) / len(svm_test_cv['test_r2'])}")

Training RMSE (SVR, tuned, Central listings): 63.24343765507801
Training MAE (SVR, tuned, Central listings) : 45.28343285440935
Training R^2 (SVR, tuned, Central listings) : 0.42526065129946417

Testing RMSE (SVR, tuned, Central listings) : 61.507630856678205
Testing MAE (SVR, tuned, Central listings)  : 43.61076497816154
Testing R^2 (SVR, tuned, Central listings)  : 0.41354110138611777


In [66]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(svm, open("Models/central_svm.pkl", "wb"))

##### Multi-layered Perceptron ANN

###### Base Model

In [67]:
# Building the regressor
mlp = MLPRegressor(random_state=rng)
with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings
    
    mlp.fit(X_train, y_train)

In [68]:
# Determining important metrics on the training and testing sets
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mlp_train_cv = cross_validate(mlp, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Training RMSE (MLP ANN, Central listings): {sum(np.sqrt(-mlp_train_cv['test_neg_mean_squared_error'])) / len(mlp_train_cv['test_neg_mean_squared_error'])}")
    print(f"Training MAE (MLP ANN, Central listings) : {sum(-mlp_train_cv['test_neg_mean_absolute_error']) / len(mlp_train_cv['test_neg_mean_absolute_error'])}")
    print(f"Training R^2 (MLP ANN, Central listings) : {sum(mlp_train_cv['test_r2']) / len(mlp_train_cv['test_r2'])}")

    print()

    mlp_test_cv = cross_validate(mlp, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Testing RMSE (MLP ANN, Central listings) : {sum(np.sqrt(-mlp_test_cv['test_neg_mean_squared_error'])) / len(mlp_test_cv['test_neg_mean_squared_error'])}")
    print(f"Testing MAE (MLP ANN, Central listings)  : {sum(-mlp_test_cv['test_neg_mean_absolute_error']) / len(mlp_test_cv['test_neg_mean_absolute_error'])}")
    print(f"Testing R^2 (MLP ANN, Central listings)  : {sum(mlp_test_cv['test_r2']) / len(mlp_test_cv['test_r2'])}")

Training RMSE (MLP ANN, Central listings): 62.456901785664954
Training MAE (MLP ANN, Central listings) : 48.42957855094611
Training R^2 (MLP ANN, Central listings) : 0.42580454123258776

Testing RMSE (MLP ANN, Central listings) : 62.817824339447796
Testing MAE (MLP ANN, Central listings)  : 49.61645048283369
Testing R^2 (MLP ANN, Central listings)  : 0.42302227244997087


`[!]` The initial training and testing accuracies for a base model MLP regressor (with no hyperparameter customisation) are extremely low. We will need to continue customising the hyperparameters to better suit the use case and (hopefully) improve the accuracy and other metrics overall.

###### GridSearch Tuning

In [69]:
# Tries a combination of hyperparameters to determine the best for this regressor
param_grid = { 
    "activation": ["identity", "logistic", "tanh", "relu"], 
    "hidden_layer_sizes": [(5, ), (10, ), (20, ), (30, ), (40, )], 
    "max_iter": [500, 1000, 2000, 4000, 10000], 
    "solver": ["sgd", "adam"]
}
mlp = MLPRegressor(random_state=rng)
gs = GridSearchCV(mlp, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings

    gs.fit(X_train, y_train)

    print(f"Best combination (MLP ANN, tuned, Central listings): {gs.best_params_}")

Best combination (MLP ANN, tuned, Central listings): {'activation': 'logistic', 'hidden_layer_sizes': (30,), 'max_iter': 4000, 'solver': 'adam'}


###### Tuned Model

In [70]:
# Building the regressor
mlp = gs.best_estimator_

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings
    
    mlp.fit(X_train, y_train)

In [71]:
# Determining important metrics on the training and testing sets
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mlp_train_cv = cross_validate(mlp, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Training RMSE (MLP ANN, tuned, Central listings): {sum(np.sqrt(-mlp_train_cv['test_neg_mean_squared_error'])) / len(mlp_train_cv['test_neg_mean_squared_error'])}")
    print(f"Training MAE (MLP ANN, tuned, Central listings) : {sum(-mlp_train_cv['test_neg_mean_absolute_error']) / len(mlp_train_cv['test_neg_mean_absolute_error'])}")
    print(f"Training R^2 (MLP ANN, tuned, Central listings) : {sum(mlp_train_cv['test_r2']) / len(mlp_train_cv['test_r2'])}")

    print()

    mlp_test_cv = cross_validate(mlp, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Testing RMSE (MLP ANN, tuned, Central listings) : {sum(np.sqrt(-mlp_test_cv['test_neg_mean_squared_error'])) / len(mlp_test_cv['test_neg_mean_squared_error'])}")
    print(f"Testing MAE (MLP ANN, tuned, Central listings)  : {sum(-mlp_test_cv['test_neg_mean_absolute_error']) / len(mlp_test_cv['test_neg_mean_absolute_error'])}")
    print(f"Testing R^2 (MLP ANN, tuned, Central listings)  : {sum(mlp_test_cv['test_r2']) / len(mlp_test_cv['test_r2'])}")

Training RMSE (MLP ANN, tuned, Central listings): 60.033527293586
Training MAE (MLP ANN, tuned, Central listings) : 45.89981998328368
Training R^2 (MLP ANN, tuned, Central listings) : 0.4694939475536522

Testing RMSE (MLP ANN, tuned, Central listings) : 60.51266777019631
Testing MAE (MLP ANN, tuned, Central listings)  : 46.43238901098526
Testing R^2 (MLP ANN, tuned, Central listings)  : 0.46455917278234526


In [72]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(mlp, open("Models/central_mlp.pkl", "wb"))

##### AdaBoost

###### Base Model

In [73]:
# Building the regressor
ab = AdaBoostRegressor(random_state=rng)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=RandomState(MT19937) at 0x7FD9E19A5D40)

In [74]:
# Determining important metrics on the training and testing sets
ab_train_cv = cross_validate(ab, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (AdaBoost, Central listings): {sum(np.sqrt(-ab_train_cv['test_neg_mean_squared_error'])) / len(ab_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (AdaBoost, Central listings) : {sum(-ab_train_cv['test_neg_mean_absolute_error']) / len(ab_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (AdaBoost, Central listings) : {sum(ab_train_cv['test_r2']) / len(ab_train_cv['test_r2'])}")

print()

ab_test_cv = cross_validate(ab, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (AdaBoost, Central listings) : {sum(np.sqrt(-ab_test_cv['test_neg_mean_squared_error'])) / len(ab_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (AdaBoost, Central listings)  : {sum(-ab_test_cv['test_neg_mean_absolute_error']) / len(ab_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (AdaBoost, Central listings)  : {sum(ab_test_cv['test_r2']) / len(ab_test_cv['test_r2'])}")

Training RMSE (AdaBoost, Central listings): 64.78528744201321
Training MAE (AdaBoost, Central listings) : 54.08566039780307
Training R^2 (AdaBoost, Central listings) : 0.38177356822182884

Testing RMSE (AdaBoost, Central listings) : 63.808719606695036
Testing MAE (AdaBoost, Central listings)  : 52.814701848838844
Testing R^2 (AdaBoost, Central listings)  : 0.40366242817158576


In [75]:
# Tries a combination of hyperparameters to determine the best for this classifier
param_grid = {
    "n_estimators": [25, 50, 75, 100],
    "learning_rate": [0.01, 0.25, 0.5, 0.75, 1],
    "loss": ["linear", "square", "exponential"]
}

ab = AdaBoostRegressor(random_state=rng)
gs = GridSearchCV(ab, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)

    print(f"Best combination (AdaBoost, tuned, Central listings) : {gs.best_params_}")

Best combination (AdaBoost, tuned, Central listings) : {'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 100}


###### Tuned Model

In [76]:
# Building the regressor
ab = gs.best_estimator_

In [77]:
# Determining important metrics on the training and testing sets
ab_train_cv = cross_validate(ab, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (AdaBoost, tuned, Central listings): {sum(np.sqrt(-ab_train_cv['test_neg_mean_squared_error'])) / len(ab_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (AdaBoost, tuned, Central listings) : {sum(-ab_train_cv['test_neg_mean_absolute_error']) / len(ab_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (AdaBoost, tuned, Central listings) : {sum(ab_train_cv['test_r2']) / len(ab_train_cv['test_r2'])}")

print()

ab_test_cv = cross_validate(ab, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (AdaBoost, tuned, Central listings) : {sum(np.sqrt(-ab_test_cv['test_neg_mean_squared_error'])) / len(ab_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (AdaBoost, tuned, Central listings)  : {sum(-ab_test_cv['test_neg_mean_absolute_error']) / len(ab_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (AdaBoost, tuned, Central listings)  : {sum(ab_test_cv['test_r2']) / len(ab_test_cv['test_r2'])}")

Training RMSE (AdaBoost, tuned, Central listings): 61.93257804000176
Training MAE (AdaBoost, tuned, Central listings) : 48.38712556137786
Training R^2 (AdaBoost, tuned, Central listings) : 0.4350576404766001

Testing RMSE (AdaBoost, tuned, Central listings) : 61.84684556080165
Testing MAE (AdaBoost, tuned, Central listings)  : 48.15115008783186
Testing R^2 (AdaBoost, tuned, Central listings)  : 0.4399438318182353


In [78]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(ab, open("Models/central_ab.pkl", "wb"))

#### Kallang listings

In [79]:
# Defining the X and Y datasets then splitting them
X = kallang_df.drop("price", axis=1)
y = kallang_df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rng)

##### Linear Regression

In [80]:
# Viewing details about the data-model relationship
lr_sm = sm.OLS(y_train, X_train).fit()
lr_sm.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.824
Model:,OLS,Adj. R-squared (uncentered):,0.823
Method:,Least Squares,F-statistic:,807.2
Date:,"Thu, 08 Feb 2024",Prob (F-statistic):,1.05e-258
Time:,22:49:47,Log-Likelihood:,-3873.0
No. Observations:,695,AIC:,7754.0
Df Residuals:,691,BIC:,7772.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
room_type,97.0666,3.189,30.438,0.000,90.805,103.328
minimum_nights,-15.9476,2.160,-7.384,0.000,-20.188,-11.707
number_of_reviews,1.8011,1.610,1.119,0.264,-1.360,4.962
availability_365,27.3428,5.848,4.676,0.000,15.861,38.824

0,1,2,3
Omnibus:,111.432,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,175.168
Skew:,1.041,Prob(JB):,9.18e-39
Kurtosis:,4.308,Cond. No.,7.32


###### Base Model

In [81]:
# Building the regressor
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [82]:
# Determining important metrics on the training and testing sets
lr_train_cv = cross_validate(lr, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (linear regression, Kallang listings): {sum(np.sqrt(-lr_train_cv['test_neg_mean_squared_error'])) / len(lr_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (linear regression, Kallang listings) : {sum(-lr_train_cv['test_neg_mean_absolute_error']) / len(lr_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (linear regression, Kallang listings) : {sum(lr_train_cv['test_r2']) / len(lr_train_cv['test_r2'])}")

print()

lr_test_cv = cross_validate(lr, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (linear regression, Kallang listings) : {sum(np.sqrt(-lr_test_cv['test_neg_mean_squared_error'])) / len(lr_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (linear regression, Kallang listings)  : {sum(-lr_test_cv['test_neg_mean_absolute_error']) / len(lr_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (linear regression, Kallang listings)  : {sum(lr_test_cv['test_r2']) / len(lr_test_cv['test_r2'])}")

Training RMSE (linear regression, Kallang listings): 64.0229950612638
Training MAE (linear regression, Kallang listings) : 48.77472371196027
Training R^2 (linear regression, Kallang listings) : 0.44884484645932315

Testing RMSE (linear regression, Kallang listings) : 56.96665612967121
Testing MAE (linear regression, Kallang listings)  : 43.35939325581402
Testing R^2 (linear regression, Kallang listings)  : 0.5293206596856599


In [83]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(lr, open("Models/kallang_lr.pkl", "wb"))

##### Support Vector Machine

###### Base Model

In [84]:
# Building the regressor
svm = SVR()
svm.fit(X_train, y_train)

SVR()

In [85]:
# Determining important metrics on the training and testing sets
svm_train_cv = cross_validate(svm, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (SVR, Kallang listings): {sum(np.sqrt(-svm_train_cv['test_neg_mean_squared_error'])) / len(svm_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (SVR, Kallang listings) : {sum(-svm_train_cv['test_neg_mean_absolute_error']) / len(svm_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (SVR, Kallang listings) : {sum(svm_train_cv['test_r2']) / len(svm_train_cv['test_r2'])}")

print()

svm_test_cv = cross_validate(svm, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (SVR, Kallang listings) : {sum(np.sqrt(-svm_test_cv['test_neg_mean_squared_error'])) / len(svm_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (SVR, Kallang listings)  : {sum(-svm_test_cv['test_neg_mean_absolute_error']) / len(svm_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (SVR, Kallang listings)  : {sum(svm_test_cv['test_r2']) / len(svm_test_cv['test_r2'])}")

Training RMSE (SVR, Kallang listings): 79.27903720089148
Training MAE (SVR, Kallang listings) : 56.681451608911175
Training R^2 (SVR, Kallang listings) : 0.15999775396422258

Testing RMSE (SVR, Kallang listings) : 77.5249607846248
Testing MAE (SVR, Kallang listings)  : 61.68478597571823
Testing R^2 (SVR, Kallang listings)  : 0.13871332922888568


###### GridSearch Tuning

In [102]:
# Tries a combination of hyperparameters to determine the best for this regressor
param_grid = { 
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "epsilon": [0.1, 0.2, 0.3, 0.4]
}
svm = SVR()
gs = GridSearchCV(svm, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings

    gs.fit(X_train, y_train)

    print(f"Best combination (SVR, tuned, Kallang listings): {gs.best_params_}")

Best combination (SVR, tuned, Kallang listings): {'C': 100, 'epsilon': 0.4, 'kernel': 'rbf'}


###### Tuned Model

In [87]:
# Building the regressor
svm = gs.best_estimator_

In [88]:
# Determining important metrics on the training and testing sets
svm_train_cv = cross_validate(svm, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (SVR, tuned, Kallang listings): {sum(np.sqrt(-svm_train_cv['test_neg_mean_squared_error'])) / len(svm_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (SVR, tuned, Kallang listings) : {sum(-svm_train_cv['test_neg_mean_absolute_error']) / len(svm_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (SVR, tuned, Kallang listings) : {sum(svm_train_cv['test_r2']) / len(svm_train_cv['test_r2'])}")

print()

svm_test_cv = cross_validate(svm, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (SVR, tuned, Kallang listings) : {sum(np.sqrt(-svm_test_cv['test_neg_mean_squared_error'])) / len(svm_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (SVR, tuned, Kallang listings)  : {sum(-svm_test_cv['test_neg_mean_absolute_error']) / len(svm_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (SVR, tuned, Kallang listings)  : {sum(svm_test_cv['test_r2']) / len(svm_test_cv['test_r2'])}")

Training RMSE (SVR, tuned, Kallang listings): 63.17251662176267
Training MAE (SVR, tuned, Kallang listings) : 42.69706968799497
Training R^2 (SVR, tuned, Kallang listings) : 0.46549092006105114

Testing RMSE (SVR, tuned, Kallang listings) : 57.616946408608364
Testing MAE (SVR, tuned, Kallang listings)  : 40.635313058048425
Testing R^2 (SVR, tuned, Kallang listings)  : 0.5199235831156366


In [89]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(svm, open("Models/kallang_svm.pkl", "wb"))

##### Multi-layered Perceptron ANN

###### Base Model

In [90]:
# Building the regressor
mlp = MLPRegressor(random_state=rng)
with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings
    
    mlp.fit(X_train, y_train)

In [91]:
# Determining important metrics on the training and testing sets
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mlp_train_cv = cross_validate(mlp, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Training RMSE (MLP ANN, Kallang listings): {sum(np.sqrt(-mlp_train_cv['test_neg_mean_squared_error'])) / len(mlp_train_cv['test_neg_mean_squared_error'])}")
    print(f"Training MAE (MLP ANN, Kallang listings) : {sum(-mlp_train_cv['test_neg_mean_absolute_error']) / len(mlp_train_cv['test_neg_mean_absolute_error'])}")
    print(f"Training R^2 (MLP ANN, Kallang listings) : {sum(mlp_train_cv['test_r2']) / len(mlp_train_cv['test_r2'])}")

    print()

    mlp_test_cv = cross_validate(mlp, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Testing RMSE (MLP ANN, Kallang listings) : {sum(np.sqrt(-mlp_test_cv['test_neg_mean_squared_error'])) / len(mlp_test_cv['test_neg_mean_squared_error'])}")
    print(f"Testing MAE (MLP ANN, Kallang listings)  : {sum(-mlp_test_cv['test_neg_mean_absolute_error']) / len(mlp_test_cv['test_neg_mean_absolute_error'])}")
    print(f"Testing R^2 (MLP ANN, Kallang listings)  : {sum(mlp_test_cv['test_r2']) / len(mlp_test_cv['test_r2'])}")

Training RMSE (MLP ANN, Kallang listings): 84.19985911352671
Training MAE (MLP ANN, Kallang listings) : 66.193289653244
Training R^2 (MLP ANN, Kallang listings) : 0.051439129350281165

Testing RMSE (MLP ANN, Kallang listings) : 89.09061866153623
Testing MAE (MLP ANN, Kallang listings)  : 69.78930333824445
Testing R^2 (MLP ANN, Kallang listings)  : -0.13886575266334272


`[!]` The initial training and testing accuracies for a base model MLP regressor (with no hyperparameter customisation) are extremely low. We will need to continue customising the hyperparameters to better suit the use case and (hopefully) improve the accuracy and other metrics overall.

###### GridSearch Tuning

In [92]:
# Tries a combination of hyperparameters to determine the best for this regressor
param_grid = { 
    "activation": ["identity", "logistic", "tanh", "relu"], 
    "hidden_layer_sizes": [(5, ), (10, ), (20, ), (30, ), (40, )], 
    "max_iter": [500, 1000, 2000, 4000, 10000], 
    "solver": ["sgd", "adam"]
}
mlp = MLPRegressor(random_state=rng)
gs = GridSearchCV(mlp, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings

    gs.fit(X_train, y_train)

    print(f"Best combination (MLP ANN, tuned, Kallang listings): {gs.best_params_}")

Best combination (MLP ANN, tuned, Kallang listings): {'activation': 'logistic', 'hidden_layer_sizes': (40,), 'max_iter': 1000, 'solver': 'sgd'}


###### Tuned Model

In [93]:
# Building the regressor
mlp = gs.best_estimator_

with warnings.catch_warnings():
    warnings.simplefilter("ignore") # Temporarily ignores warnings
    
    mlp.fit(X_train, y_train)

In [94]:
# Determining important metrics on the training and testing sets
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mlp_train_cv = cross_validate(mlp, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Training RMSE (MLP ANN, tuned, Kallang listings): {sum(np.sqrt(-mlp_train_cv['test_neg_mean_squared_error'])) / len(mlp_train_cv['test_neg_mean_squared_error'])}")
    print(f"Training MAE (MLP ANN, tuned, Kallang listings) : {sum(-mlp_train_cv['test_neg_mean_absolute_error']) / len(mlp_train_cv['test_neg_mean_absolute_error'])}")
    print(f"Training R^2 (MLP ANN, tuned, Kallang listings) : {sum(mlp_train_cv['test_r2']) / len(mlp_train_cv['test_r2'])}")

    print()

    mlp_test_cv = cross_validate(mlp, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
    print(f"Testing RMSE (MLP ANN, tuned, Kallang listings) : {sum(np.sqrt(-mlp_test_cv['test_neg_mean_squared_error'])) / len(mlp_test_cv['test_neg_mean_squared_error'])}")
    print(f"Testing MAE (MLP ANN, tuned, Kallang listings)  : {sum(-mlp_test_cv['test_neg_mean_absolute_error']) / len(mlp_test_cv['test_neg_mean_absolute_error'])}")
    print(f"Testing R^2 (MLP ANN, tuned, Kallang listings)  : {sum(mlp_test_cv['test_r2']) / len(mlp_test_cv['test_r2'])}")

Training RMSE (MLP ANN, tuned, Kallang listings): 59.72662202878612
Training MAE (MLP ANN, tuned, Kallang listings) : 42.93076308758805
Training R^2 (MLP ANN, tuned, Kallang listings) : 0.5201942623987728

Testing RMSE (MLP ANN, tuned, Kallang listings) : 55.28872269744638
Testing MAE (MLP ANN, tuned, Kallang listings)  : 41.73193899831987
Testing R^2 (MLP ANN, tuned, Kallang listings)  : 0.5582557205517306


In [95]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(mlp, open("Models/kallang_mlp.pkl", "wb"))

##### AdaBoost

###### Base Model

In [96]:
# Building the regressor
ab = AdaBoostRegressor(random_state=rng)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=RandomState(MT19937) at 0x7FD9E19A5D40)

In [97]:
# Determining important metrics on the training and testing sets
ab_train_cv = cross_validate(ab, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (AdaBoost, Kallang listings): {sum(np.sqrt(-ab_train_cv['test_neg_mean_squared_error'])) / len(ab_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (AdaBoost, Kallang listings) : {sum(-ab_train_cv['test_neg_mean_absolute_error']) / len(ab_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (AdaBoost, Kallang listings) : {sum(ab_train_cv['test_r2']) / len(ab_train_cv['test_r2'])}")

print()

ab_test_cv = cross_validate(ab, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (AdaBoost, Kallang listings) : {sum(np.sqrt(-ab_test_cv['test_neg_mean_squared_error'])) / len(ab_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (AdaBoost, Kallang listings)  : {sum(-ab_test_cv['test_neg_mean_absolute_error']) / len(ab_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (AdaBoost, Kallang listings)  : {sum(ab_test_cv['test_r2']) / len(ab_test_cv['test_r2'])}")

Training RMSE (AdaBoost, Kallang listings): 62.54788401319718
Training MAE (AdaBoost, Kallang listings) : 48.09927471146036
Training R^2 (AdaBoost, Kallang listings) : 0.47205290404532174

Testing RMSE (AdaBoost, Kallang listings) : 55.54132974355154
Testing MAE (AdaBoost, Kallang listings)  : 45.20656349721365
Testing R^2 (AdaBoost, Kallang listings)  : 0.5545474498949552


In [98]:
# Tries a combination of hyperparameters to determine the best for this classifier
param_grid = {
    "n_estimators": [25, 50, 75, 100],
    "learning_rate": [0.01, 0.25, 0.5, 0.75, 1],
    "loss": ["linear", "square", "exponential"]
}

ab = AdaBoostRegressor(random_state=rng)
gs = GridSearchCV(ab, param_grid=param_grid, scoring="r2", cv=5, n_jobs=-1)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    gs.fit(X_train, y_train)

    print(f"Best combination (AdaBoost, tuned, Kallang listings) : {gs.best_params_}")

Best combination (AdaBoost, tuned, Kallang listings) : {'learning_rate': 0.01, 'loss': 'square', 'n_estimators': 50}


###### Tuned Model

In [99]:
# Building the regressor
ab = gs.best_estimator_
ab.fit(X_train, y_train)

AdaBoostRegressor(learning_rate=0.01, loss='square',
                  random_state=RandomState(MT19937) at 0x7FD9902A3C40)

In [100]:
# Determining important metrics on the training and testing sets
ab_train_cv = cross_validate(ab, X_train, y_train, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Training RMSE (AdaBoost, tuned, Kallang listings): {sum(np.sqrt(-ab_train_cv['test_neg_mean_squared_error'])) / len(ab_train_cv['test_neg_mean_squared_error'])}")
print(f"Training MAE (AdaBoost, tuned, Kallang listings) : {sum(-ab_train_cv['test_neg_mean_absolute_error']) / len(ab_train_cv['test_neg_mean_absolute_error'])}")
print(f"Training R^2 (AdaBoost, tuned, Kallang listings) : {sum(ab_train_cv['test_r2']) / len(ab_train_cv['test_r2'])}")

print()

ab_test_cv = cross_validate(ab, X_test, y_test, scoring=["neg_mean_squared_error", "neg_mean_absolute_error", "r2"], cv=5)
print(f"Testing RMSE (AdaBoost, tuned, Kallang listings) : {sum(np.sqrt(-ab_test_cv['test_neg_mean_squared_error'])) / len(ab_test_cv['test_neg_mean_squared_error'])}")
print(f"Testing MAE (AdaBoost, tuned, Kallang listings)  : {sum(-ab_test_cv['test_neg_mean_absolute_error']) / len(ab_test_cv['test_neg_mean_absolute_error'])}")
print(f"Testing R^2 (AdaBoost, tuned, Kallang listings)  : {sum(ab_test_cv['test_r2']) / len(ab_test_cv['test_r2'])}")

Training RMSE (AdaBoost, tuned, Kallang listings): 59.355791748060184
Training MAE (AdaBoost, tuned, Kallang listings) : 43.68690086695187
Training R^2 (AdaBoost, tuned, Kallang listings) : 0.5248691709830616

Testing RMSE (AdaBoost, tuned, Kallang listings) : 55.93816693022692
Testing MAE (AdaBoost, tuned, Kallang listings)  : 42.302844182426256
Testing R^2 (AdaBoost, tuned, Kallang listings)  : 0.5490976832479322


In [101]:
# Exporting the model for external use
if not os.path.exists("Models"):
    os.makedirs("Models")

pickle.dump(ab, open("Models/kallang_ab.pkl", "wb"))