In [1]:
import joblib
import numpy as np
import pandas as pd
import random
import string
import time
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from src.utils import *

In [2]:
# Datasets
# Relative CSV paths for each dataset variant. Every variant has a train,
# validation and test file; note that the validation files are stored under
# data/train/ next to the training files, while test files are under data/test/.

# Variant: baseline
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

# Variant: baseline-truncated
TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

# Variant: baseline-w-feature-eng
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

# Variant: truncated-feat-eng
TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

In [3]:
def random_string(k: int = 6) -> str:
    """Return a random string of ``k`` uppercase ASCII letters and digits.

    Characters are drawn with replacement from the module-level ``random``
    generator, so the result is fine for making file names unique but is not
    suitable for anything security-sensitive.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=k)
    return "".join(picked)

In [4]:
def save_model(model) -> str:
    """Serialize ``model`` with joblib under ``tmp/model/`` and return its path.

    The file name combines the current Unix timestamp (whole seconds) with a
    random suffix from ``random_string`` so that repeated saves do not
    overwrite each other. The output directory is created when missing, so the
    call also works on a fresh checkout.

    Args:
        model: Object to persist; anything ``joblib.dump`` can serialize.

    Returns:
        The relative path of the written ``.joblib`` file.
    """
    # Local import keeps this cell runnable without editing the import cell.
    import os

    ts = str(int(time.time()))
    res = random_string()
    # NOTE(review): the "lightgbm" prefix is kept for compatibility with
    # existing artifacts, although this notebook fits a
    # HistGradientBoostingRegressor.
    filename = f"tmp/model/lightgbm_{ts}_{res}.joblib"
    # joblib.dump does not create parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)
    return filename

def load_model(filename: str):
    """Load and return the object stored in the joblib file at ``filename``.

    SECURITY: joblib files are pickle-based, so loading one can execute
    arbitrary code. Only load files that were produced by a trusted source.
    """
    restored = joblib.load(filename)
    return restored

In [5]:
def save_test_prediction(y_pred) -> str:
    """Write predictions to a CSV file under ``tmp/pred/`` and return its path.

    The CSV has an ``Id`` index column (0-based row position) and a single
    ``Predicted`` column. The file name combines the current Unix timestamp
    (whole seconds) with a random suffix from ``random_string``. The output
    directory is created when missing, so the call also works on a fresh
    checkout.

    Args:
        y_pred: Array-like of predictions; it must convert to a single column
            (one value per row).

    Returns:
        The relative path of the written CSV file.
    """
    # Local import keeps this cell runnable without editing the import cell.
    import os

    ts = str(int(time.time()))
    res = random_string()
    filename = f"tmp/pred/lightgbm_test_pred_{ts}_{res}.csv"
    # DataFrame.to_csv does not create parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    pred_df = pd.DataFrame(np.array(y_pred), columns=["Predicted"])
    pred_df.index.name = "Id"
    pred_df.to_csv(filename)
    return filename

In [6]:
def round_to_nearest_price(y, round_interval: float = 50.0):
    """Round every value in ``y`` to the nearest multiple of ``round_interval``.

    Args:
        y: Scalar or array-like of prices.
        round_interval: Step to round to (default: nearest 50 SGD).

    Returns:
        NumPy array (or NumPy scalar for scalar input) of rounded values.
        Exact ties follow ``np.round`` and go to the even multiple, e.g.
        1225 -> 1200 and 1275 -> 1300 for the default interval.
    """
    steps = np.asarray(y) / round_interval
    whole_steps = np.round(steps)
    return whole_steps * round_interval

In [7]:
# Dataset variant used for this run: baseline-w-feature-eng.
train_path, val_path, test_path = (
    BASELINE_W_FEAT_ENG_TRAIN,
    BASELINE_W_FEAT_ENG_VAL,
    BASELINE_W_FEAT_ENG_TEST,
)

In [8]:
# Read the three CSV splits of the selected variant, in train/val/test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Pool the train and validation rows into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_val], ignore_index=True)

In [9]:
# Split the pooled train+validation frame into features and the target.
# NOTE(review): split_features_and_monthly_rent_label is not defined in this
# notebook — presumably it comes from `from src.utils import *` and returns a
# (features, label) pair; confirm against src/utils.
X_train, y_train = split_features_and_monthly_rent_label(df)
# Only the features are kept for the test frame; the second return value is discarded.
X_test, _ = split_features_and_monthly_rent_label(df_test)

In [10]:
# Fixed seed so that the shuffled CV folds and the regressor's internal
# randomness are repeatable across Restart & Run All.
SEED = 42

regressor = HistGradientBoostingRegressor(max_iter=1000, random_state=SEED)

# Exhaustive grid: 4 values for each of 4 hyper-parameters = 256 candidates.
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}

# 10 shuffled folds; with 256 candidates this is 2560 fits plus the final refit.
cv = KFold(n_splits=10, shuffle=True, random_state=SEED)

# No `scoring` is passed, so candidates are ranked by the estimator's default
# score method.
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {model.best_params_}")

# Persist the whole fitted search object (not only the best estimator).
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 8, 'max_leaf_nodes': 31, 'min_samples_leaf': 200}
Model saved at tmp/model/lightgbm_1698600835_5IX3BY.joblib


In [11]:
# Evaluate the refit best model on the same data it was trained on; these are
# training-set numbers, not an estimate of generalization error.
y_train_pred = model.predict(X_train)
# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# Assumes a single-output target (y_train is 1-D) — TODO confirm.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 471.5669953270882, Score = 0.5648983513027435


In [12]:
# Predict on the test features, then derive a copy rounded to the nearest
# price step (default interval of round_to_nearest_price).
y_test_pred = model.predict(X_test)
y_test_pred_round = round_to_nearest_price(y_test_pred)

# Write both versions to disk: raw first, rounded second.
y_test_pred_path = save_test_prediction(y_test_pred)
y_test_pred_round_path = save_test_prediction(y_test_pred_round)

print(
    f"Test prediction saved at {y_test_pred_path}",
    f"Test prediction (rounded prediction) saved at {y_test_pred_round_path}",
    sep="\n",
)

Test prediction saved at tmp/pred/lightgbm_test_pred_1698600836_7V74M6.csv
Test prediction (rounded prediction) saved at tmp/pred/lightgbm_test_pred_1698600836_89VZTN.csv
