![Photo by Stephen Phillips - Hostreviews.co.uk on UnSplash](https://cf.bstatic.com/xdata/images/hotel/max1024x768/408003083.jpg?k=c49b5c4a2346b3ab002b9d1b22dbfb596cee523b53abef2550d0c92d0faf2d8b&o=&hp=1){fig-align="center" width=50%}


# Import data

In [19]:
import time
from pathlib import Path

import lightgbm as lgb
import pandas as pd
from catboost import CatBoostRegressor
from data import utils
from lets_plot import *
from lets_plot.mapping import as_discrete
from sklearn import compose, impute, model_selection, pipeline, preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBRegressor

# Set up lets-plot so that plots are emitted as HTML output in this notebook.
LetsPlot.setup_html()

**Objective**:
* Examine the necessary sample pre-processing steps before modeling
* Create the required pipeline
* Evaluate multiple algorithms
* Choose a suitable baseline model.





# Prepare dataframe before modelling
## Read in the processed file

In [41]:
# Seed through the project helper before any randomised step runs.
utils.seed_everything(utils.Configuration.seed)

# Columns removed right after loading, before the feature/target split.
columns_to_discard = [
    "external_reference",
    "ad_url",
    "day_of_retrieval",
    "website",
    "reference_number_of_the_epc_report",
    "housenumber",
]
processed_path = utils.Configuration.INTERIM_DATA_PATH.joinpath(
    "2023-10-01_Processed_dataset_for_NB_use.parquet.gzip"
)

# Load the file, shuffle every row with the configured seed, renumber the
# index from zero, and finally drop the columns listed above.
df = pd.read_parquet(processed_path)
df = df.sample(frac=1, random_state=utils.Configuration.seed)
df = df.reset_index(drop=True)
df = df.drop(columns=columns_to_discard)

print(f"Shape of dataframe after read-in a pre-processing: {df.shape}")

# Separate the predictors from the target column named in the configuration.
target_column = utils.Configuration.target_col
X = df.drop(columns=target_column)
y = df[target_column]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of dataframe after read-in a pre-processing: (3660, 50)
Shape of X: (3660, 49)
Shape of y: (3660,)


## Train-test split

In [42]:
# Hold out 20% of the rows as the test set; the split uses the configured seed.
split_parts = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=utils.Configuration.seed,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Shape of X-train: {X_train.shape}")
print(f"Shape of X-test: {X_test.shape}")

Shape of X-train: (2928, 49)
Shape of X-test: (732, 49)


# Implementing the data-processing pipeline


In [20]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a fixed set of columns of a DataFrame.

    Parameters
    ----------
    feature_names_in_ : sequence of column labels
        Columns to keep, in the order they should appear in the output.

    Attributes
    ----------
    n_features_in_ : int
        Number of selected columns. Set at construction and recomputed on
        every call to ``fit``.
    """

    def __init__(self, feature_names_in_):
        self.feature_names_in_ = feature_names_in_
        self.n_features_in_ = len(feature_names_in_)

    def fit(self, X, y=None):
        """Refresh the selected-column count and return ``self``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and
        are not inspected.
        """
        # The count is otherwise only computed in __init__, so it would go
        # stale if feature_names_in_ were replaced afterwards (for example
        # through set_params or direct attribute assignment).
        self.n_features_in_ = len(self.feature_names_in_)
        return self

    def transform(self, X, y=None):
        """Return a deep copy of ``X`` restricted to the selected columns.

        Columns are looked up by label with ``X.loc``, so ``X`` must support
        label-based column indexing.
        """
        return X.loc[:, self.feature_names_in_].copy(deep=True)

In [22]:
# Show only the columns whose dtype is "object".

df.select_dtypes(include="object")

Unnamed: 0,available_as_of,building_condition,energy_class,external_reference,heating_type,kitchen_type,latest_land_use_designation,reference_number_of_the_epc_report,surroundings_type,website,ad_url,housenumber,street,city,postal,state
0,,To renovate,G,7011 - 3689,Gas,Semi equipped,,20230612016983,,http://www.davidrobin.be,https://www.immoweb.be/en/classified/house/for...,226,Chée de Charleroi,Fleurus,6220,Région Wallonne
1,Immediately,To renovate,G,5518336,Gas,Installed,,Not specified,Urban,http://www.latouretpetit.be,https://www.immoweb.be/en/classified/house/for...,130,Baron Albert d'Huartlaan,Kraainem,1950,Vlaams Gewest
2,,To renovate,D,15126 - Esselaar 56,,,"Living area (residential, urban or rural)",20230517-0002893473-RES-1,Countryside,http://www.clavisimmo.be,https://www.immoweb.be/en/classified/town-hous...,163a,Hollebeekstraat,Linkebeek,1630,Vlaams Gewest
3,After signing the deed,To renovate,F,5512392,Fuel oil,Not installed,,20230512008664,Urban,https://honesty.be/,https://www.immoweb.be/en/classified/mansion/f...,92,Av. de Bouillon,Libramont-Chevigny,6800,Région Wallonne
4,After signing the deed,Good,F,5312248,Fuel oil,Installed,,20230526016001,Isolated,http://www.hendrix.be,https://www.immoweb.be/en/classified/house/for...,77,Rue de Rixensart,Rixensart,1332,Région Wallonne
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3655,After signing the deed,Good,F,5508577,Gas,Installed,,202101060002356727RES1,Isolated,http://www.latouretpetit.be,https://www.immoweb.be/en/classified/house/for...,130,Baron Albert d'Huartlaan,Kraainem,1950,Vlaams Gewest
3656,,Good,E,15557 - 111114555,,Installed,,202205180140,,http://www.trevirasquain.be,https://www.immoweb.be/en/classified/house/for...,7d,Av. des Ardennes,Huy,4500,Région Wallonne
3657,After signing the deed,As new,B,5343631,Gas,USA hyper equipped,,31043-G2012-256/EP14854/A001/D01/SD001,,http://www.cambierdenil.be,https://www.immoweb.be/en/classified/villa/for...,10,Albertplein,Knokke-Heist,8300,Vlaams Gewest
3658,After signing the deed,Just renovated,G,5496624,Gas,Hyper equipped,,Not specified,Isolated,http://www.propertylab.be,https://www.immoweb.be/en/classified/house/for...,51,Rue Langeveld,Uccle,1180,Bruxelles
