In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
data = './data/df_cleandata.csv'
df_data = pd.read_csv(data, sep = ',')
df_data = pd.DataFrame(df_data)

print(df_data)

            type      subtype  bedroomCount  bathroomCount         province  \
0      APARTMENT    APARTMENT           2.0            1.0         Brussels   
1          HOUSE        HOUSE           4.0            2.0         Brussels   
2      APARTMENT    APARTMENT           2.0            1.0         Brussels   
3      APARTMENT    APARTMENT           2.0            2.0         Brussels   
4      APARTMENT  FLAT_STUDIO           1.0            1.0         Brussels   
...          ...          ...           ...            ...              ...   
76365      HOUSE        VILLA           3.0            1.0          Antwerp   
76366  APARTMENT    APARTMENT           3.0            1.0         Brussels   
76367  APARTMENT    APARTMENT           3.0            1.0  Flemish Brabant   
76368  APARTMENT    APARTMENT           3.0            2.0    West Flanders   
76369  APARTMENT    APARTMENT           3.0            2.0          Antwerp   

       postCode  habitableSurface buildingCondition

In [3]:
df_data.columns

Index(['type', 'subtype', 'bedroomCount', 'bathroomCount', 'province',
       'postCode', 'habitableSurface', 'buildingCondition', 'facedeCount',
       'hasTerrace', 'epcScore', 'price', 'price_per_m2'],
      dtype='object')

In [4]:
# Remove price/m² columns
df = df_data.drop(columns=['price_per_m2'])
print(df.shape)
print(df.columns)

(76370, 12)
Index(['type', 'subtype', 'bedroomCount', 'bathroomCount', 'province',
       'postCode', 'habitableSurface', 'buildingCondition', 'facedeCount',
       'hasTerrace', 'epcScore', 'price'],
      dtype='object')


In [5]:
df= pd.get_dummies(df, columns=['type','province', 'epcScore','subtype','buildingCondition'], drop_first=True)
print(df.head(10))
print(df.dtypes)

   bedroomCount  bathroomCount  postCode  habitableSurface  facedeCount  \
0           2.0            1.0      1040             100.0          1.0   
1           4.0            2.0      1040             270.0          2.0   
2           2.0            1.0      1040              87.0          2.0   
3           2.0            2.0      1040             104.0          2.0   
4           1.0            1.0      1040              71.0          2.0   
5           2.0            1.0      1040              90.0          2.0   
6           3.0            2.0      1040             220.0          2.0   
7           3.0            2.0      1040             220.0          2.0   
8           2.0            1.0      1040             187.0          2.0   
9           1.0            1.0      1040              93.0          2.0   

   hasTerrace     price  type_HOUSE  province_Brussels  \
0           1  399000.0       False               True   
1           1  895000.0        True               True   


In [13]:
# 1. Create model GradientBoostingRegressor

model_gbr = GradientBoostingRegressor(n_estimators=2000,      
                                      learning_rate=0.05,    
                                      max_depth=4,           
                                      random_state=42)

# 2. Create Pipeline : 
pipeline = Pipeline(steps=[('regressor',model_gbr)])

# 3. Prepare data for pipeline : 
X = df.drop('price', axis=1)
y = df['price']

#save the columns model for futures predictions
model_columns = X.columns

# 4. Test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
# 1. Train model GradientBoostingRegressor
pipeline.fit(X_train, y_train)

# 2. Prediction on xtest
y_pred_gbr = pipeline.predict(X_test)

# 3. Model evaluation
mse = mean_squared_error(y_test, y_pred_gbr)
MAE = mean_absolute_error(y_test, y_pred_gbr)
r2 = r2_score(y_test, y_pred_gbr)

print(f"MAE: {MAE:.2f}")
print(f"RMSE: {mse**0.5:.2f}")
print(f"R² score : {r2:.2f}")

MAE: 111019.29
RMSE: 276427.98
R² score : 0.70


In [15]:
price = df['price']
median_price = df['price'].median()
mean_price = df['price'].mean()
absolute_dev = np.abs(price - median_price)
mad_price = np.median(absolute_dev)

print(f"mad price : {mad_price:.2f} €")
print(f"median price : {median_price:.2f} €")
print(f"mean price : {mean_price:.2f} €")


mad price : 114900.00 €
median price : 329900.00 €
mean price : 447606.06 €


In [None]:
min = df['price'].min()
max = df['price'].max()

#rmse = 276669.26 # n_estimators=1000, learning_rate=0.1, max_depth=3
rmse = 274332.41 #n_estimators=1500, learning_rate=0.1, max_depth=5
276427.98
rmse_range = (rmse / (max - min)) * 100
rmse_median = (rmse / median_price) * 100

print(f"min price: {min:.2f}€")
print(f"max price: {max:.2f}€")
print(f"RMSE in range price : {rmse_range:.2f}%")
print(f"RMSE in median price : {rmse_median:.2f}%")

min price: 3141.00€
max price: 15000000.00€
RMSE in range price : 1.83%
RMSE in median price : 83.16%


In [22]:
# Compare MAE with standard deviation 
mad_price = 114900.00
MAE = 111019.29

Ratio = ((MAE-mad_price) / mad_price)*100
print(f"Ratio : {Ratio:.2f}%")


Ratio : -3.38%
