# LightGBM

We explore the solution of using LightGBM models using the `HistGradientBoostingRegressor` implementation from `scikit-learn`

| Approach | Best Params | Train RMSE | Validation RMSE |
|:--------|:--------|:--------|:--------|
|1|learning_rate: 0.1, max_depth: 16, max_leaf_nodes: 15, min_samples_leaf: 200|474.98|475.01|
|2|learning_rate: 0.1, max_depth: None, max_leaf_nodes: 15, min_samples_leaf: 200|472.58|475.61|
|3|learning_rate: 0.1, max_depth: 4, max_leaf_nodes: 15, min_samples_leaf: 100|473.91|474.53|
|4|learning_rate: 0.1, max_depth: 4, max_leaf_nodes: 15, min_samples_leaf: 100|475.78|475.57|

In [1]:
import joblib
import numpy as np
import pandas as pd
import random
import string
import time
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Datasets
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
BASELINE_TEST = "data/test/baseline_test.csv"

TRUNCATED_BASELINE_TRAIN = "data/train/baseline-truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline-truncated_val.csv"
TRUNCATED_BASELINE_TEST = "data/test/baseline-truncated_test.csv"

BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline-w-feature-eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline-w-feature-eng_val.csv"
BASELINE_W_FEAT_ENG_TEST = "data/test/baseline-w-feature-eng_test.csv"

TRUNCATED_FEAT_ENG_TRAIN = "data/train/truncated-feat-eng_train.csv"
TRUNCATED_FEAT_ENG_VAL = "data/train/truncated-feat-eng_val.csv"
TRUNCATED_FEAT_ENG_TEST = "data/test/truncated-feat-eng_test.csv"

In [3]:
def split_features_and_monthly_rent_label(df: pd.DataFrame):
    if "monthly_rent" not in df.columns:
        # test data
        X = df.copy()
        return X, None
    X = df.drop(columns="monthly_rent")
    y = df["monthly_rent"]
    return X, y

In [4]:
def random_string(k: int = 6) -> str:
    res = "".join(random.choices(string.ascii_uppercase + string.digits, k=k))
    return str(res)

In [5]:
def save_model(model) -> str:
    ts = str(int(time.time()))
    res = random_string()
    filename = f"tmp/model/lightgbm_{ts}_{res}.joblib"
    joblib.dump(model, filename)
    return filename

def load_model(filename: str):
    return joblib.load(filename)

In [6]:
def save_test_prediction(y_pred) -> str:
    ts = str(int(time.time()))
    res = random_string()
    filename = f"tmp/pred/lightgbm_test_pred_{ts}_{res}.csv"
    y_pred = np.array(y_pred)
    pred_df = pd.DataFrame(y_pred, columns=["Predicted"])
    pred_df.index.name = "Id"
    pred_df.to_csv(filename)
    return filename

## 1. Baseline (Cleaned dataset without feat eng variables)

In [20]:
train_path = BASELINE_TRAIN
val_path = BASELINE_VAL
test_path = BASELINE_TEST

In [21]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

In [22]:
X_train, y_train = split_features_and_monthly_rent_label(df_train)
X_val, y_val = split_features_and_monthly_rent_label(df_val)
X_test, _ = split_features_and_monthly_rent_label(df_test)

### a. Base model

In [23]:
model = HistGradientBoostingRegressor(
    learning_rate = 0.1,
    max_iter = 100,
    max_leaf_nodes = 31,
    max_depth = None,
    min_samples_leaf = 20,
)
model.fit(X_train, y_train)
model_path = save_model(model)
print(f"Model saved at {model_path}")

Model saved at tmp/model/lightgbm_1698581373_4IR31Z.joblib


In [24]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 470.97488611920465, Score = 0.5679353069792842


In [25]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.7323632433293
Validation (rounded prediction): RMSE = 475.8111495120727


In [26]:
y_test_pred = model.predict(X_test)
y_test_pred_path = save_test_prediction(y_test_pred)
print(f"Test prediction saved at {y_test_pred_path}")

Test prediction saved at tmp/pred/lightgbm_test_pred_1698581373_MN5X7H.csv
Test prediction (rounded prediction) saved at tmp/pred/lightgbm_test_pred_1698581373_HNDM4X.csv


### b. Hyper-parameter Tuning

In [None]:
regressor = HistGradientBoostingRegressor(max_iter = 1000)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True)
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {str(model.best_params_)}")
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 16, 'max_leaf_nodes': 15, 'min_samples_leaf': 200}
Model saved at tmp/model/lightgbm_1699189588_9X1NWA.joblib


In [None]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 474.97853075536926, Score = 0.560558327900953


In [None]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.0136555826826
Validation (rounded prediction): RMSE = 475.04473473558255


## 2. Dataset with additional features

In [33]:
train_path = BASELINE_W_FEAT_ENG_TRAIN
val_path = BASELINE_W_FEAT_ENG_VAL
test_path = BASELINE_W_FEAT_ENG_TEST

In [34]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

In [35]:
X_train, y_train = split_features_and_monthly_rent_label(df_train)
X_val, y_val = split_features_and_monthly_rent_label(df_val)
X_test, _ = split_features_and_monthly_rent_label(df_test)

### a. Base model

In [36]:
model = HistGradientBoostingRegressor(
    learning_rate = 0.1,
    max_iter = 100,
    max_leaf_nodes = 31,
    max_depth = None,
    min_samples_leaf = 20,
)
model.fit(X_train, y_train)
model_path = save_model(model)
print(f"Model saved at {model_path}")

Model saved at tmp/model/lightgbm_1698582496_PQ4NHK.joblib


In [37]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 471.1192991113379, Score = 0.5676703021010041


In [38]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.5102896026055
Validation (rounded prediction): RMSE = 475.42218080354644


In [39]:
y_test_pred = model.predict(X_test)
y_test_pred_path = save_test_prediction(y_test_pred)
print(f"Test prediction saved at {y_test_pred_path}")

Test prediction saved at tmp/pred/lightgbm_test_pred_1698582497_7273N9.csv
Test prediction (rounded prediction) saved at tmp/pred/lightgbm_test_pred_1698582497_WUT8XQ.csv


### b. Hyper-parameter Tuning

In [None]:
regressor = HistGradientBoostingRegressor(max_iter = 1000)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True)
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {str(model.best_params_)}")
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': None, 'max_leaf_nodes': 15, 'min_samples_leaf': 200}
Model saved at tmp/model/lightgbm_1699197799_URHUMS.joblib


In [None]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 472.58460361211496, Score = 0.5649768025116157


In [None]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.6069947849758
Validation (rounded prediction): RMSE = 476.0365882296584


## 3. Truncated baseline dataset (extra columns removal)

In [45]:
train_path = TRUNCATED_BASELINE_TRAIN
val_path = TRUNCATED_BASELINE_VAL
test_path = TRUNCATED_BASELINE_TEST

In [46]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

In [47]:
X_train, y_train = split_features_and_monthly_rent_label(df_train)
X_val, y_val = split_features_and_monthly_rent_label(df_val)
X_test, _ = split_features_and_monthly_rent_label(df_test)

### a. Base model

In [48]:
model = HistGradientBoostingRegressor(
    learning_rate = 0.1,
    max_iter = 100,
    max_leaf_nodes = 31,
    max_depth = None,
    min_samples_leaf = 20,
)
model.fit(X_train, y_train)
model_path = save_model(model)
print(f"Model saved at {model_path}")

Model saved at tmp/model/lightgbm_1698582504_RYGOLV.joblib


In [49]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 473.118880464112, Score = 0.5639926223050177


In [50]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 476.43654808355495
Validation (rounded prediction): RMSE = 476.8149536245691


In [51]:
y_test_pred = model.predict(X_test)
y_test_pred_path = save_test_prediction(y_test_pred)
print(f"Test prediction saved at {y_test_pred_path}")

Test prediction saved at tmp/pred/lightgbm_test_pred_1698582504_DDTQWJ.csv
Test prediction (rounded prediction) saved at tmp/pred/lightgbm_test_pred_1698582504_SBW0Y7.csv


### b. Hyper-parameter Tuning

In [None]:
regressor = HistGradientBoostingRegressor(max_iter = 1000)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True)
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {str(model.best_params_)}")
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 4, 'max_leaf_nodes': 15, 'min_samples_leaf': 100}
Model saved at tmp/model/lightgbm_1699200120_EXYA8Y.joblib


In [None]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 473.907508492817, Score = 0.5625378749771468


In [None]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 474.52836175066955
Validation (rounded prediction): RMSE = 474.802590557381


## 4. Truncated baseline + additional features dataset

In [57]:
train_path = TRUNCATED_FEAT_ENG_TRAIN
val_path = TRUNCATED_FEAT_ENG_VAL
test_path = TRUNCATED_FEAT_ENG_TEST

In [58]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

In [59]:
X_train, y_train = split_features_and_monthly_rent_label(df_train)
X_val, y_val = split_features_and_monthly_rent_label(df_val)
X_test, _ = split_features_and_monthly_rent_label(df_test)

### a. Base model

In [60]:
model = HistGradientBoostingRegressor(
    learning_rate = 0.1,
    max_iter = 100,
    max_leaf_nodes = 31,
    max_depth = None,
    min_samples_leaf = 20,
)
model.fit(X_train, y_train)
model_path = save_model(model)
print(f"Model saved at {model_path}")

Model saved at tmp/model/lightgbm_1698582508_1535LE.joblib


In [61]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 470.0750001593485, Score = 0.569584811087428


In [62]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.88728534075983
Validation (rounded prediction): RMSE = 475.914470607202


In [63]:
y_test_pred = model.predict(X_test)
y_test_pred_path = save_test_prediction(y_test_pred)
print(f"Test prediction saved at {y_test_pred_path}")

Test prediction saved at tmp/pred/lightgbm_test_pred_1698582508_XWAA81.csv
Test prediction (rounded prediction) saved at tmp/pred/lightgbm_test_pred_1698582508_PUM68S.csv


### b. Hyper-parameter Tuning

In [None]:
regressor = HistGradientBoostingRegressor(max_iter = 1000)
param_grids = {
    "learning_rate": [0.1, 0.2, 0.5, 1.0],
    "max_leaf_nodes": [15, 31, 63, 127],
    "max_depth": [4, 8, 16, None],
    "min_samples_leaf": [20, 50, 100, 200],
}
cv = KFold(n_splits=10, shuffle=True)
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grids,
    return_train_score=True,
    cv=cv,
)
model.fit(X_train, y_train)
print(f"Best params: {str(model.best_params_)}")
model_path = save_model(model)
print(f"Model saved at {model_path}")

Best params: {'learning_rate': 0.1, 'max_depth': 4, 'max_leaf_nodes': 15, 'min_samples_leaf': 100}
Model saved at tmp/model/lightgbm_1699202250_GS27T6.joblib


In [None]:
y_train_pred = model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
score_train = model.score(X_train, y_train)
print(f"Training: RMSE = {rmse_train}, Score = {score_train}")

Training: RMSE = 475.7846616014707, Score = 0.5590654263174558


In [None]:
y_val_pred = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation: RMSE = {rmse_val}")

Validation: RMSE = 475.57421027288035
Validation (rounded prediction): RMSE = 475.8965223659446
