Import libs

In [1]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

Load dataset

In [2]:
df = pd.read_csv("kangaroo_clean.csv")  # Replace with your actual file
# df.head()

Make a subset of columns

In [3]:
kgr_subset = df[[
                 "type_subtype",
                 "bedroomCount",
                 "locality_normalized",
                 "habitableSurface",
                 "epcScoreMapping",
                 "province",
                 "price",
                 "roomCount",
                 "facedeCount",
                 "gardenSurface",
                 "kitchenType"
            ]]

Remove "price" outliers

In [4]:
kgr_subset = kgr_subset[(kgr_subset["price"] > 80000) & (kgr_subset["price"] < 1200000)] 

Drop missing values in facedecount column

In [5]:
kgr_subset = kgr_subset.dropna(axis=0, subset=["facedeCount"])

Define the target and features 

In [6]:

X = kgr_subset.drop(["price"], axis=1, inplace=False)
y = kgr_subset["price"]

Define categorical or numerical columns

In [7]:
num_cols = ["bedroomCount", "habitableSurface", "facedeCount", "gardenSurface", "roomCount"]
cat_cols = ["type_subtype","locality_normalized", "province", "epcScoreMapping", "kitchenType"]


Preprocess the categorical columns with OneHotEncoder

In [8]:
prep = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ],
    remainder="passthrough"
)

Create a pipeline model with LightGBM Regressor

In [9]:
LGBM_model = Pipeline(steps=[
    ("preprocessing", prep),
    ("regressor", LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42))
])

Split in 80% Training and 20% Testing

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42, test_size=0.2)

Calculate the errors

In [11]:
LGBM_model.fit(X_train, y_train)
y_pred = LGBM_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) 
r2 = r2_score(y_test, y_pred)
mae_proportion = mae / y_test.mean()
print(f"📈 MAE  (Mean Absolute Error)     : {mae:.2f}")
print(f"📊 MAE (en proportion)           : {mae_proportion:.4f}")
print(f"📉 MAPE (Mean Absolute Percentage Error) : {mape * 100:.2f}%")
print(f"🔁 RMSE (Root Mean Squared Error) : {rmse:.2f}")
print(f"📊 Score R² on the test set : {r2:.3f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1471
[LightGBM] [Info] Number of data points in the train set: 37719, number of used features: 456
[LightGBM] [Info] Start training from score 370633.723402
📈 MAE  (Mean Absolute Error)     : 75541.15
📊 MAE (en proportion)           : 0.2045
📉 MAPE (Mean Absolute Percentage Error) : 22.61%
🔁 RMSE (Root Mean Squared Error) : 12016079242.02
📊 Score R² on the test set : 0.694




Save Model

In [12]:
# Save the model
import pickle
pickle.dump(LGBM_model, open("kangaroo_clean.sav", "wb"))