## The Mission
The real estate company Immo Eliza asked you to create a machine learning model to predict prices of real estate properties in Belgium.

After the scraping, cleaning and analyzing, you are ready to preprocess the data and finally build a performant machine learning model!


In [26]:
def handling_csv(path="./data/clean_data/final_cleaned_data.csv"):
    """
    Load the cleaned real-estate dataset into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the project's cleaned dataset,
        so existing calls without arguments keep working.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``; no dtype
        conversion is applied here.
    """
    import pandas as pd

    return pd.read_csv(path)

# Load the cleaned dataset so the following cells can work with `df`.
df = handling_csv()
# display(df.dtypes)  # uncomment to inspect the column dtypes

In [27]:
# Overview of all the columns and the total missing values per column
def overview_function():
    """
    Print the number of missing values per column and return the dataset.

    The data is loaded through ``handling_csv``; the per-column counts are
    written to stdout.
    """
    dataset = handling_csv()
    missing_per_column = dataset.isna().sum()
    print(missing_per_column)
    return dataset

# Print the per-column missing-value counts and keep the loaded frame as `df`.
df = overview_function()

property_ID                         0
locality_name                    2328
postal_code                         0
type                                0
subtype                             0
price (€)                           0
number_of_bedrooms                148
living_area (m²)                    0
equiped_kitchen (yes:1, no:0)       0
furnished (yes:1, no:0)             0
open_fire (yes:1, no:0)             0
terrace (yes:1, no:0)               0
terrace_area (m²)                6750
garden (yes:1, no:0)                0
number_facades                   4197
swimming_pool (yes:1, no:0)         0
state_of_building                2913
province                            0
dtype: int64


### Train-test split --> imputing --> encoding --> scaling

Scaling before splitting can lead to data leakage, so it is important to follow the order indicated above.


### 1. Train_test_split

In [28]:
# `y` is the target to be predicted (the variable that is regressed).
# `X` holds the explanatory variables (the features).

def train_test_split(df):
    """
    Split a dataset into train/test feature matrices and target vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the target column ``"price (€)"`` as well as the
        ``"property_ID"``, ``"locality_name"`` and ``"postal_code"`` columns,
        which are excluded from the features. The frame itself is not modified.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with a
        fixed ``random_state`` of 42.
    """
    # Aliased so the scikit-learn helper is not confused with this wrapper,
    # which carries the same name.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    target_column = "price (€)"
    # The target plus the columns that must not be used as features.
    columns_to_drop = [target_column, "property_ID", "locality_name", "postal_code"]

    # Use the frame handed in by the caller instead of re-reading the CSV.
    X = df.drop(columns=columns_to_drop)
    y = df[target_column]

    X_train, X_test, y_train, y_test = sk_train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test

# Split the features and the target into an 80/20 train/test partition.
X_train, X_test, y_train, y_test = train_test_split(df)


In [29]:
def log_transform_target(y_train, y_test):
    """
    Apply a ``log1p`` transform to the train and test targets.

    Parameters
    ----------
    y_train, y_test : array-like
        Target values on the original scale.

    Returns
    -------
    tuple
        ``(y_train_log, y_test_log, inverse_log_transform)`` where
        ``inverse_log_transform`` applies ``expm1`` and is meant to map
        predictions made in log space back to the original scale.
    """
    # The docstring must be the first statement of the function; placed after
    # the import it would be a plain string expression and `__doc__` stays None.
    import numpy as np

    # log1p(x) = log(1 + x), so a target of zero is handled safely
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)

    def inverse_log_transform(y_pred_log):
        """Undo the ``log1p`` transform with ``expm1``."""
        return np.expm1(y_pred_log)

    return y_train_log, y_test_log, inverse_log_transform


### 2. Imputation

Start with a SimpleImputer; optionally continue with more advanced imputation strategies to get better results.

In [30]:
# Function to impute numerical columns
def impute_numeric_columns(X_train, X_test):
    """
    Impute missing values in the numerical columns with the training-set mean.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. They are not modified; imputed copies are returned.

    Returns
    -------
    tuple
        ``(X_train, X_test, num_cols)``: the imputed copies and the index of
        column names that were treated as numerical (``int64``/``float64``).
    """
    from sklearn.impute import SimpleImputer

    # Work on copies: the caller's frames stay untouched (so re-running a cell
    # starts from the same input) and assigning into a slice of a larger frame
    # cannot trigger pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Select numerical columns automatically
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    if len(num_cols) == 0:
        # Nothing to impute; fitting an imputer on zero columns would fail.
        return X_train, X_test, num_cols

    imputer = SimpleImputer(strategy="mean")

    # Fit on the training data only, then reuse the SAME means for the test
    # data so no test-set information leaks into the imputation.
    X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
    X_test[num_cols] = imputer.transform(X_test[num_cols])

    return X_train, X_test, num_cols

# Mean-impute the numerical feature columns (means come from the training split), then preview.
X_train, X_test, num_cols = impute_numeric_columns(X_train, X_test)
display(X_train.head())

Unnamed: 0,type,subtype,number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",state_of_building,province
13689,Apartment,Studio,2.721205,12.0,1.0,0.0,0.0,0.0,20.772097,0.0,4.0,0.0,New,Brussels
8991,Apartment,Apartment,1.0,73.0,0.0,0.0,0.0,1.0,24.0,0.0,2.877018,0.0,New,Luxembourg
3325,Apartment,Duplex,2.0,102.0,1.0,0.0,0.0,1.0,20.772097,0.0,2.877018,0.0,Normal,Brabant-Wallon
13462,Apartment,Apartment,3.0,114.0,1.0,0.0,0.0,1.0,8.0,1.0,3.0,0.0,New,Brussels
1710,Apartment,Apartment,2.0,93.0,1.0,0.0,0.0,1.0,11.0,0.0,4.0,0.0,New,Antwerp


In [31]:
# Function to impute categorical column "state of building"
def impute_categorical_state(X_train, X_test, column="state_of_building"):
    """
    Fill missing values of a categorical column with the 'unknown' category.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices containing ``column``. They are not modified; filled
        copies are returned.
    column : str, optional
        Name of the column to fill. Defaults to ``"state_of_building"``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` with missing entries of ``column`` replaced by
        the string ``"unknown"``.
    """
    # Work on copies so the caller's frames are left untouched and assigning
    # into a slice of a larger frame cannot trigger SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    X_train[column] = X_train[column].fillna("unknown")
    X_test[column] = X_test[column].fillna("unknown")

    return X_train, X_test

# Replace missing building states with the explicit "unknown" category, then preview.
X_train, X_test = impute_categorical_state(X_train, X_test, column="state_of_building")
display(X_train.head())

Unnamed: 0,type,subtype,number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",state_of_building,province
13689,Apartment,Studio,2.721205,12.0,1.0,0.0,0.0,0.0,20.772097,0.0,4.0,0.0,New,Brussels
8991,Apartment,Apartment,1.0,73.0,0.0,0.0,0.0,1.0,24.0,0.0,2.877018,0.0,New,Luxembourg
3325,Apartment,Duplex,2.0,102.0,1.0,0.0,0.0,1.0,20.772097,0.0,2.877018,0.0,Normal,Brabant-Wallon
13462,Apartment,Apartment,3.0,114.0,1.0,0.0,0.0,1.0,8.0,1.0,3.0,0.0,New,Brussels
1710,Apartment,Apartment,2.0,93.0,1.0,0.0,0.0,1.0,11.0,0.0,4.0,0.0,New,Antwerp


### 3. Encoding : converting categorical data into numeric features using **one-hot encoding**

One-Hot Encoding will be used for "province", "type", "subtype" and "state_of_building" columns.

In [32]:
def encoding_ohe(X_train, X_test):
    """
    One-hot encode the categorical columns of the train and test features.

    The encoder is fitted on ``X_train`` only and then applied to ``X_test``;
    categories that appear only in the test data are ignored
    (``handle_unknown="ignore"``). The columns "province", "type", "subtype"
    and "state_of_building" are replaced by their indicator columns, which are
    appended after the remaining columns. Row indices are preserved.

    Returns
    -------
    tuple
        ``(X_train_final, X_test_final)`` as DataFrames.
    """
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    ohe_cols = ["province", "type", "subtype", "state_of_building"]
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    # Fit on the training split only to prevent data leakage.
    encoded_train = encoder.fit_transform(X_train[ohe_cols])
    encoded_test = encoder.transform(X_test[ohe_cols])
    dummy_names = encoder.get_feature_names_out(ohe_cols)

    def _swap_in_dummies(frame, encoded):
        # Rebuild a DataFrame from the encoded array, aligned on the original
        # index, and put it in place of the raw categorical columns.
        dummies = pd.DataFrame(encoded, columns=dummy_names, index=frame.index)
        remaining = frame.drop(columns=ohe_cols)
        return pd.concat([remaining, dummies], axis=1)

    return _swap_in_dummies(X_train, encoded_train), _swap_in_dummies(X_test, encoded_test)

# One-hot encode the categorical columns and preview the resulting feature matrix.
X_train_final, X_test_final = encoding_ohe(X_train, X_test)
display(X_train_final.head())


Unnamed: 0,number_of_bedrooms,living_area (m²),"equiped_kitchen (yes:1, no:0)","furnished (yes:1, no:0)","open_fire (yes:1, no:0)","terrace (yes:1, no:0)",terrace_area (m²),"garden (yes:1, no:0)",number_facades,"swimming_pool (yes:1, no:0)",...,state_of_building_Excellent,state_of_building_Fully renovated,state_of_building_New,state_of_building_Normal,state_of_building_To be renovated,state_of_building_To demolish,state_of_building_To renovate,state_of_building_To restore,state_of_building_Under construction,state_of_building_unknown
13689,2.721205,12.0,1.0,0.0,0.0,0.0,20.772097,0.0,4.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8991,1.0,73.0,0.0,0.0,0.0,1.0,24.0,0.0,2.877018,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3325,2.0,102.0,1.0,0.0,0.0,1.0,20.772097,0.0,2.877018,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13462,3.0,114.0,1.0,0.0,0.0,1.0,8.0,1.0,3.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1710,2.0,93.0,1.0,0.0,0.0,1.0,11.0,0.0,4.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Final Assembly

In [None]:
import pandas as pd

# Step 1: Load CSV once, through the shared loader, instead of reading the
# same file twice (one copy was stored in an unused variable).
df = handling_csv()

# Step 2: Prepare X and y
X_train, X_test, y_train, y_test = train_test_split(df)

# Log-transform the target; `inverse` maps log-space values back to the
# original scale.
y_train_log, y_test_log, inverse = log_transform_target(y_train, y_test)

# Step 3: Impute numerical values
X_train_imputed_num, X_test_imputed_num, num_cols = impute_numeric_columns(X_train, X_test)

# Step 4: Impute categorical missing values
X_train, X_test = impute_categorical_state(X_train_imputed_num, X_test_imputed_num)

# Step 5: One-hot encoding of the "province", "type", "subtype" and
# "state_of_building" columns.
# `ohe_cols` is not consumed by `encoding_ohe` (which holds its own list); it
# is kept as a notebook-level record of the encoded columns.
ohe_cols = ["province", "type", "subtype", "state_of_building"]
X_train_final, X_test_final = encoding_ohe(X_train, X_test)

# Safety checks: both splits must expose the same feature columns, and every
# feature row must still have a matching target value.
assert list(X_train_final.columns) == list(X_test_final.columns)
assert len(X_train_final) == len(y_train_log)
assert len(X_test_final) == len(y_test_log)

print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape: ", X_test_final.shape)


X_train_final shape: (11636, 48)
X_test_final shape:  (2909, 48)
11636
2909


### 5. Training and Evaluation

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Train XGBoost model
def train_xgboost(X_train, y_train):
    """
    Fit an ``XGBRegressor`` with fixed hyperparameters and return it.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : target values passed to ``fit``.

    Returns
    -------
    The fitted ``XGBRegressor`` instance.
    """
    hyperparams = {
        "n_estimators": 600,
        "learning_rate": 0.05,
        "max_depth": 8,
        "subsample": 1.0,
        "colsample_bytree": 0.6,
        "random_state": 888,
        "n_jobs": -1,
        "min_child_weight": 5,
    }
    regressor = XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    return regressor


def evaluate_model(model,
                   X_train, y_train_log, y_train_real,
                   X_test, y_test_log, y_test_real, inverse):
    """
    Compute MAE, RMSE and R2 for the train and test splits.

    The model's predictions are passed through ``inverse`` before being
    compared with ``y_train_real`` / ``y_test_real``. ``y_train_log`` and
    ``y_test_log`` are accepted but not used in the computation.

    Returns
    -------
    dict
        ``{"train": {...}, "test": {...}}`` where each inner dict has the keys
        ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """

    def _scores(y_true, y_pred):
        # One metric bundle, shared by both splits.
        return {
            "MAE": mean_absolute_error(y_true, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
            "R2": r2_score(y_true, y_pred),
        }

    # Predict first, then map the predictions back through `inverse`.
    train_pred_log = model.predict(X_train)
    test_pred_log = model.predict(X_test)
    train_pred = inverse(train_pred_log)
    test_pred = inverse(test_pred_log)

    return {
        "train": _scores(y_train_real, train_pred),
        "test": _scores(y_test_real, test_pred),
    }

# Train XGBoost on the encoded features against the log-transformed target
model_xgb = train_xgboost(X_train_final, y_train_log)

# Evaluate on both splits; `inverse` is applied to the predictions before
# they are compared with the untransformed targets `y_train` / `y_test`
results_xgb = evaluate_model(
    model_xgb,
    X_train_final, y_train_log, y_train,
    X_test_final, y_test_log, y_test,
    inverse
)

# Last expression of the cell: display the metrics dictionary
results_xgb



{'train': {'MAE': 45476.4453125,
  'RMSE': np.float64(105909.41820253759),
  'R2': 0.8483845591545105},
 'test': {'MAE': 68959.734375,
  'RMSE': np.float64(129614.28069468272),
  'R2': 0.7195080518722534}}

### Scaling (currently disabled)

In [35]:
# def scale_numeric(X_train, X_val, X_test, numeric_cols):
#     from sklearn.preprocessing import StandardScaler
    
#     scaler = StandardScaler()

#     X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
#     X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
#     X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

#     return X_train, X_val, X_test, scaler

# display(X_train.head())