# LightGBM-style gradient boosting (scikit-learn `HistGradientBoostingRegressor`)

In [30]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [31]:
def load_preprocessed_data(data_path: str, label_col: str):
    """Read a CSV file and split it into a feature frame and a label column.

    Args:
        data_path: Location of the CSV file, passed to ``pd.read_csv``.
        label_col: Name of the column holding the prediction target.

    Returns:
        A ``(X, y)`` pair: ``X`` is the loaded table with ``label_col``
        removed and ``y`` is the ``label_col`` column on its own.
    """
    features = pd.read_csv(data_path)
    # pop() hands back the label column and removes it from the frame.
    target = features.pop(label_col)
    return features, target

## 1. Cleaned dataset without feature-engineered variables

In [32]:
# Experiment 1 inputs: baseline train/validation CSVs and the target column.
label_col = "monthly_rent"
data_path_train, data_path_val = (
    "data/train/baseline_train.csv",
    "data/train/baseline_val.csv",
)

In [33]:
# Hyperparameters passed to HistGradientBoostingRegressor for this experiment.
model_params = {
    "learning_rate": 0.1,
    "max_iter": 100,
    "max_leaf_nodes": 31,
    "max_depth": None,
    "min_samples_leaf": 20,
    # Fixed seed: the estimator draws random numbers when subsampling for
    # binning and when splitting off an early-stopping validation set, so
    # without it repeated runs (and comparisons between the experiments in
    # this notebook) are not reproducible.
    "random_state": 42,
}

In [34]:
# Build a fresh, unfitted regressor from the hyperparameter dict.
model = HistGradientBoostingRegressor(
    **model_params,
)

In [35]:
# Fit on the training split, then report the in-sample error.
X_train, y_train = load_preprocessed_data(
    data_path=data_path_train,
    label_col=label_col,
)
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_train)
score = model.score(X_train, y_train)  # R^2 for scikit-learn regressors
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f"Training: MSE = {mse}, Score = {score}")

Training: MSE = 221761.6449298726, Score = 0.568043798599414


In [36]:
# Evaluate the already-fitted model on the validation split.
X_val, y_val = load_preprocessed_data(
    data_path=data_path_val,
    label_col=label_col,
)
score = model.score(X_val, y_val)  # R^2 for scikit-learn regressors
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Validation: MSE = {mse}, Score = {score}")

Validation: MSE = 226121.3467232415, Score = 0.5387529021447376


## 2. Cleaned dataset with feature-engineered variables

In [37]:
# Experiment 2 inputs: train/validation CSVs that include the engineered
# features, with the same target column as the baseline run.
label_col = "monthly_rent"
data_path_train, data_path_val = (
    "data/train/baseline_w_feature_eng_train.csv",
    "data/train/baseline_w_feature_eng_val.csv",
)

In [38]:
# Hyperparameters passed to HistGradientBoostingRegressor for this experiment.
model_params = {
    "learning_rate": 0.1,
    "max_iter": 100,
    "max_leaf_nodes": 31,
    "max_depth": None,
    "min_samples_leaf": 20,
    # Fixed seed: the estimator draws random numbers when subsampling for
    # binning and when splitting off an early-stopping validation set, so
    # without it repeated runs (and comparisons between the experiments in
    # this notebook) are not reproducible.
    "random_state": 42,
}

In [39]:
# Build a fresh, unfitted regressor from the hyperparameter dict.
model = HistGradientBoostingRegressor(
    **model_params,
)

In [40]:
# Fit on the training split, then report the in-sample error.
X_train, y_train = load_preprocessed_data(
    data_path=data_path_train,
    label_col=label_col,
)
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_train)
score = model.score(X_train, y_train)  # R^2 for scikit-learn regressors
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f"Training: MSE = {mse}, Score = {score}")

Training: MSE = 221821.58280034648, Score = 0.5679579419120743


In [41]:
# Evaluate the already-fitted model on the validation split.
X_val, y_val = load_preprocessed_data(
    data_path=data_path_val,
    label_col=label_col,
)
score = model.score(X_val, y_val)  # R^2 for scikit-learn regressors
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Validation: MSE = {mse}, Score = {score}")

Validation: MSE = 225901.56845204812, Score = 0.5393201125714477


## 3. Cleaned dataset without feature-engineered variables (price per sqm as label)

In [42]:
# Experiment 3 inputs: the price-per-sqm CSVs; the label here is the
# rent_per_sqm column rather than monthly_rent.
label_col = "rent_per_sqm"
data_path_train, data_path_val = (
    "data/train/feature_eng_ppsm_train.csv",
    "data/train/feature_eng_ppsm_val.csv",
)

In [43]:
# Hyperparameters passed to HistGradientBoostingRegressor for this experiment.
model_params = {
    "learning_rate": 0.1,
    "max_iter": 100,
    "max_leaf_nodes": 31,
    "max_depth": None,
    "min_samples_leaf": 20,
    # Fixed seed: the estimator draws random numbers when subsampling for
    # binning and when splitting off an early-stopping validation set, so
    # without it repeated runs (and comparisons between the experiments in
    # this notebook) are not reproducible.
    "random_state": 42,
}

In [44]:
# Build a fresh, unfitted regressor from the hyperparameter dict.
model = HistGradientBoostingRegressor(
    **model_params,
)

In [45]:
# Fit on the training split, then report the in-sample error.
X_train, y_train = load_preprocessed_data(
    data_path=data_path_train,
    label_col=label_col,
)
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_train)
score = model.score(X_train, y_train)  # R^2 for scikit-learn regressors
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
print(f"Training: MSE = {mse}, Score = {score}")

Training: MSE = 30.06692889062316, Score = 0.6332418450731643


In [46]:
# Evaluate the already-fitted model on the validation split.
X_val, y_val = load_preprocessed_data(
    data_path=data_path_val,
    label_col=label_col,
)
score = model.score(X_val, y_val)  # R^2 for scikit-learn regressors
y_pred = model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f"Validation: MSE = {mse}, Score = {score}")

Validation: MSE = 31.618661595473185, Score = 0.6046956702766987
