In [None]:
import os
import random
import string
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold

from src.utils import *

In [None]:
# Datasets
# Relative paths to the train / validation / test CSV splits of the four
# dataset variants compared in this notebook (one variant per "Approach").
# Note: the validation files live next to the training files under data/train/.

# Approach 1 -- baseline
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

# Approach 2 -- baseline with feature engineering
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

# Approach 3 -- truncated baseline
TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

# Approach 4 -- truncated with feature engineering
TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

In [None]:
def random_string(k: int = 6) -> str:
    """Return ``k`` characters drawn at random (with replacement) from A-Z and 0-9.

    The characters come from the module-level ``random`` generator, so the
    result is meant as a uniqueness suffix (e.g. for file names), not a secret.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=k)
    return "".join(picked)

In [None]:
def save_model(model) -> str:
    """Serialize ``model`` with joblib under ``tmp/model/`` and return its path.

    The file name combines the current Unix timestamp (whole seconds) with a
    random suffix from ``random_string`` so repeated saves do not overwrite
    each other.

    Args:
        model: Any object joblib can serialize (here: the fitted search object).

    Returns:
        The relative path of the written ``.joblib`` file.
    """
    ts = str(int(time.time()))
    res = random_string()
    # NOTE(review): the "lightgbm" prefix is kept unchanged so new artefacts are
    # named like the ones already recorded in this notebook's outputs.
    filename = f"tmp/model/lightgbm_{ts}_{res}.joblib"
    # Make sure the target directory exists; on a fresh checkout it may not,
    # and the dump would otherwise fail after the (expensive) fit has finished.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)
    return filename

def load_model(filename: str):
    """Load and return the object stored at ``filename`` via ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code. Only open files from a trusted source, such as those
    written by ``save_model``.
    """
    return joblib.load(filename)

In [None]:
def save_test_prediction(y_pred) -> str:
    """Write test-set predictions to a CSV under ``tmp/pred/`` and return its path.

    The CSV has an ``Id`` index column (the 0-based row position of each
    prediction) and a single ``Predicted`` column.

    Args:
        y_pred: Array-like of predictions; it must fit a single DataFrame column.

    Returns:
        The relative path of the written CSV file.
    """
    ts = str(int(time.time()))
    res = random_string()
    filename = f"tmp/pred/lightgbm_test_pred_{ts}_{res}.csv"
    # Make sure the target directory exists so the write cannot fail on a
    # fresh checkout where tmp/pred/ has not been created yet.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    pred_df = pd.DataFrame(np.array(y_pred), columns=["Predicted"])
    pred_df.index.name = "Id"
    pred_df.to_csv(filename)
    return filename

In [None]:
def round_to_nearest_price(y, round_interval: float = 50.0):
    """Round every value in ``y`` to the nearest multiple of ``round_interval``.

    Args:
        y: Scalar or array-like of prices / predictions.
        round_interval: Step to snap to (default: 50, i.e. nearest 50 SGD).

    Returns:
        NumPy array (or NumPy scalar for scalar input) of rounded values.
        Exact ties follow ``np.round`` and go to the even multiple
        (25 -> 0, 75 -> 100 with the default interval).

    Raises:
        ValueError: If ``round_interval`` is zero, which would otherwise
            silently produce inf/nan values.
    """
    if np.any(np.asarray(round_interval) == 0):
        raise ValueError("round_interval must be non-zero")
    return np.round(np.asarray(y) / round_interval) * round_interval

# Approach 1

In [None]:
# Approach 1 uses the baseline splits.
train_path, val_path, test_path = BASELINE_TRAIN, BASELINE_VAL, BASELINE_TEST

In [None]:
# Read the train / validation / test CSVs selected by the *_path variables.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Split every frame into (features, label). The helper is not defined in this
# notebook (presumably it comes from the `src.utils` star import); judging by
# its name the label is the monthly rent -- TODO confirm.
# The label half returned for the test frame is discarded.
(X_train, y_train), (X_val, y_val), (X_test, _) = (
    split_features_and_monthly_rent_label(frame)
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Exhaustive grid search: 4 x 4 x 4 x 4 = 256 candidates, each scored with
# 10-fold cross-validation on the training split.
# Both the regressor and the shuffled K-fold splitter are seeded so that the
# fold assignment -- and therefore the selected parameters -- are reproducible
# on Restart & Run All.
regressor = HistGradientBoostingRegressor(max_iter=1000, random_state=42)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors).
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {model.best_params_}")
# Persist the whole fitted search object (not only the best estimator).
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 16, 'max_leaf_nodes': 15, 'min_samples_leaf': 200}
Model saved at tmp/model/lightgbm_1699189588_9X1NWA.joblib


In [None]:
# Fit quality on the training split, using the search object's refitted model.
y_train_pred = model.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 474.97853075536926, Score = 0.560558327900953


In [None]:
# Validation RMSE for the raw predictions and for predictions snapped to the
# nearest price step (default interval of round_to_nearest_price).
y_val_pred = model.predict(X_val)
y_val_pred_round = round_to_nearest_price(y_val_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_val_round = np.sqrt(mean_squared_error(y_val, y_val_pred_round))
print(f"Validation: RMSE = {rmse_val}")
print(f"Validation (rounded prediction): RMSE = {rmse_val_round}")

Validation: RMSE = 475.0136555826826
Validation (rounded prediction): RMSE = 475.04473473558255


# Approach 2

In [None]:
# Approach 2 uses the baseline splits with feature engineering.
train_path, val_path, test_path = (
    BASELINE_W_FEAT_ENG_TRAIN,
    BASELINE_W_FEAT_ENG_VAL,
    BASELINE_W_FEAT_ENG_TEST,
)

In [None]:
# Read the train / validation / test CSVs selected by the *_path variables.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Split every frame into (features, label). The helper is not defined in this
# notebook (presumably it comes from the `src.utils` star import); judging by
# its name the label is the monthly rent -- TODO confirm.
# The label half returned for the test frame is discarded.
(X_train, y_train), (X_val, y_val), (X_test, _) = (
    split_features_and_monthly_rent_label(frame)
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Exhaustive grid search: 4 x 4 x 4 x 4 = 256 candidates, each scored with
# 10-fold cross-validation on the training split.
# Both the regressor and the shuffled K-fold splitter are seeded so that the
# fold assignment -- and therefore the selected parameters -- are reproducible
# on Restart & Run All.
regressor = HistGradientBoostingRegressor(max_iter=1000, random_state=42)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors).
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {model.best_params_}")
# Persist the whole fitted search object (not only the best estimator).
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': None, 'max_leaf_nodes': 15, 'min_samples_leaf': 200}
Model saved at tmp/model/lightgbm_1699197799_URHUMS.joblib


In [None]:
# Fit quality on the training split, using the search object's refitted model.
y_train_pred = model.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 472.58460361211496, Score = 0.5649768025116157


In [None]:
# Validation RMSE for the raw predictions and for predictions snapped to the
# nearest price step (default interval of round_to_nearest_price).
y_val_pred = model.predict(X_val)
y_val_pred_round = round_to_nearest_price(y_val_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_val_round = np.sqrt(mean_squared_error(y_val, y_val_pred_round))
print(f"Validation: RMSE = {rmse_val}")
print(f"Validation (rounded prediction): RMSE = {rmse_val_round}")

Validation: RMSE = 475.6069947849758
Validation (rounded prediction): RMSE = 476.0365882296584


# Approach 3

In [None]:
# Approach 3 uses the truncated baseline splits.
train_path, val_path, test_path = (
    TRUNCATED_BASELINE_TRAIN,
    TRUNCATED_BASELINE_VAL,
    TRUNCATED_BASELINE_TEST,
)

In [None]:
# Read the train / validation / test CSVs selected by the *_path variables.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Split every frame into (features, label). The helper is not defined in this
# notebook (presumably it comes from the `src.utils` star import); judging by
# its name the label is the monthly rent -- TODO confirm.
# The label half returned for the test frame is discarded.
(X_train, y_train), (X_val, y_val), (X_test, _) = (
    split_features_and_monthly_rent_label(frame)
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Exhaustive grid search: 4 x 4 x 4 x 4 = 256 candidates, each scored with
# 10-fold cross-validation on the training split.
# Both the regressor and the shuffled K-fold splitter are seeded so that the
# fold assignment -- and therefore the selected parameters -- are reproducible
# on Restart & Run All.
regressor = HistGradientBoostingRegressor(max_iter=1000, random_state=42)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors).
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {model.best_params_}")
# Persist the whole fitted search object (not only the best estimator).
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 4, 'max_leaf_nodes': 15, 'min_samples_leaf': 100}
Model saved at tmp/model/lightgbm_1699200120_EXYA8Y.joblib


In [None]:
# Fit quality on the training split, using the search object's refitted model.
y_train_pred = model.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 473.907508492817, Score = 0.5625378749771468


In [None]:
# Validation RMSE for the raw predictions and for predictions snapped to the
# nearest price step (default interval of round_to_nearest_price).
y_val_pred = model.predict(X_val)
y_val_pred_round = round_to_nearest_price(y_val_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_val_round = np.sqrt(mean_squared_error(y_val, y_val_pred_round))
print(f"Validation: RMSE = {rmse_val}")
print(f"Validation (rounded prediction): RMSE = {rmse_val_round}")

Validation: RMSE = 474.52836175066955
Validation (rounded prediction): RMSE = 474.802590557381


# Approach 4

In [None]:
# Approach 4 uses the truncated splits with feature engineering.
train_path, val_path, test_path = (
    TRUNCATED_FEAT_ENG_TRAIN,
    TRUNCATED_FEAT_ENG_VAL,
    TRUNCATED_FEAT_ENG_TEST,
)

In [None]:
# Read the train / validation / test CSVs selected by the *_path variables.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Split every frame into (features, label). The helper is not defined in this
# notebook (presumably it comes from the `src.utils` star import); judging by
# its name the label is the monthly rent -- TODO confirm.
# The label half returned for the test frame is discarded.
(X_train, y_train), (X_val, y_val), (X_test, _) = (
    split_features_and_monthly_rent_label(frame)
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Exhaustive grid search: 4 x 4 x 4 x 4 = 256 candidates, each scored with
# 10-fold cross-validation on the training split.
# Both the regressor and the shuffled K-fold splitter are seeded so that the
# fold assignment -- and therefore the selected parameters -- are reproducible
# on Restart & Run All.
regressor = HistGradientBoostingRegressor(max_iter=1000, random_state=42)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors).
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {model.best_params_}")
# Persist the whole fitted search object (not only the best estimator).
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 4, 'max_leaf_nodes': 15, 'min_samples_leaf': 100}
Model saved at tmp/model/lightgbm_1699202250_GS27T6.joblib


In [None]:
# Fit quality on the training split, using the search object's refitted model.
y_train_pred = model.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 475.7846616014707, Score = 0.5590654263174558


In [None]:
# Validation RMSE for the raw predictions and for predictions snapped to the
# nearest price step (default interval of round_to_nearest_price).
y_val_pred = model.predict(X_val)
y_val_pred_round = round_to_nearest_price(y_val_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# sqrt(MSE) yields the same RMSE on every version (single-target label assumed).
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_val_round = np.sqrt(mean_squared_error(y_val, y_val_pred_round))
print(f"Validation: RMSE = {rmse_val}")
print(f"Validation (rounded prediction): RMSE = {rmse_val_round}")

Validation: RMSE = 475.57421027288035
Validation (rounded prediction): RMSE = 475.8965223659446


# Result

| Approach | Best Params | Train RMSE | Validation RMSE |
|:--------|:--------|:--------|:--------|
|1|learning_rate: 0.1, max_depth: 16, max_leaf_nodes: 15, min_samples_leaf: 200|474.98|475.01|
|2|learning_rate: 0.1, max_depth: None, max_leaf_nodes: 15, min_samples_leaf: 200|472.58|475.61|
|3|learning_rate: 0.1, max_depth: 4, max_leaf_nodes: 15, min_samples_leaf: 100|473.91|474.53|
|4|learning_rate: 0.1, max_depth: 4, max_leaf_nodes: 15, min_samples_leaf: 100|475.78|475.57|