# IMMO-ELIZA

In [11]:
!pip install category_encoders
!pip install pgeocode



In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import category_encoders as ce
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ridge_regression
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from scipy.stats import gaussian_kde
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pgeocode
import xgboost as xgb
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor

In [25]:
df = pd.read_csv("./data/Kangaroo.csv")

In [26]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [27]:
print((df.isna().mean() * 100).round(2).astype(str) + ' %')

Unnamed: 0                    0.0 %
id                            0.0 %
url                           0.0 %
type                          0.0 %
subtype                       0.0 %
bedroomCount                 8.47 %
bathroomCount               17.04 %
province                      0.0 %
locality                      0.0 %
postCode                      0.0 %
habitableSurface            15.66 %
roomCount                   72.69 %
monthlyCost                 100.0 %
hasAttic                    84.43 %
hasBasement                 63.52 %
hasDressingRoom             96.73 %
diningRoomSurface           91.41 %
hasDiningRoom                82.4 %
buildingCondition           27.84 %
buildingConstructionYear     38.9 %
facedeCount                 33.76 %
floorCount                  53.28 %
streetFacadeWidth            80.7 %
hasLift                      76.3 %
floodZoneType               44.67 %
heatingType                 41.33 %
hasHeatPump                  90.7 %
hasPhotovoltaicPanels       

In [7]:
df = df.drop(columns=["Unnamed: 0", "url"])

df = df.drop(columns=['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople', 'roomCount', 'diningRoomSurface', 
                      'streetFacadeWidth', 'gardenOrientation', 'kitchenSurface', 'floorCount', 'hasDiningRoom', 
                      'hasDressingRoom'])

df = df.drop_duplicates(subset=["id"], keep="first")

binary_cols = [
    'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels', 
    'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice', 
    'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
    'hasAttic'
]

for col in binary_cols:
    df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0}).fillna(0).astype(int)

# Colonnes dépendantes d'autres colonnes
df['hasLivingRoom'] = df['hasLivingRoom'].map({True: 1, False: 0, 'True': 1, 'False': 0})
df.loc[df['hasLivingRoom'].isna(), 'hasLivingRoom'] = df['livingRoomSurface'].notnull().astype(int)

df['hasGarden'] = df['hasGarden'].map({True: 1, False: 0, 'True': 1, 'False': 0})
df.loc[df['hasGarden'].isna(), 'hasGarden'] = df['gardenSurface'].notnull().astype(int)

df['hasTerrace'] = df['hasTerrace'].map({True: 1, False: 0, 'True': 1, 'False': 0})
df.loc[df['hasTerrace'].isna(), 'hasTerrace'] = df['terraceSurface'].notnull().astype(int)

df['hasGarden'] = df['hasGarden'].map({True: 1, False: 0, 'True': 1, 'False': 0})

# When hasLivingRoom = 0 ; livingRoomSurface = 0
df.loc[df['hasLivingRoom'] == 0, 'livingRoomSurface'] = 0

# When hasGarden = 0 ; gardenSurface = 0
df.loc[df['hasGarden'] == 0, 'gardenSurface'] = 0

# When hasTerrace = 0 ; terraceSurface = 0 and terraceOrientation = 0
df.loc[df['hasTerrace'] == 0, 'terraceSurface'] = 0
df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0

#drop number of facade bigger than 4 and transform "facedeCount" into "facadeCount"
df['facadeCount'] = df['facedeCount']
df = df.drop(columns='facedeCount')
df['facadeCount'] = df['facadeCount'].fillna(2)
df = df[df['facadeCount'] <= 4]

# drop lines without price
df = df.dropna(subset="price")

# bedroomCount : lets assume that they have at least one so fill nan by 1
df['bedroomCount'] = df['bedroomCount'].fillna(1).astype(float)

# bathroomCount same as bedrooms
df['bathroomCount'] = df['bathroomCount'].fillna(1).astype(float)

# toiletCount same as bedrooms
df['toiletCount'] = df['toiletCount'].fillna(1).astype(float)

# habitableSurface : replace by median 
df['habitableSurface'] = df['habitableSurface'].fillna(df['habitableSurface'].median())

# buildingCondition : replace by 'NOT_MENTIONED
df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')

# buildingConstructionYear
df['buildingConstructionYear'] = df['buildingConstructionYear'].fillna(df['buildingConstructionYear'].median()).astype(int)


# floodZoneType lts assume that missing values are NON_FLOOD_ZONE
df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')

# heatingType
df['heatingType'] = df['heatingType'].fillna(df['heatingType'].mode()[0])

# hasThermicPanels lets assume that if its not precised, there are not
df['hasThermicPanels'] = df['hasThermicPanels'].fillna(0).astype(float)

# kitchenType
df['kitchenType'] = df['kitchenType'].fillna(df['kitchenType'].mode()[0])

# landSurface
df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())

# livingRoomSurface
df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())

# terraceSurface
median_terrace = df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].notnull()), 'terraceSurface'].median()
df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = median_terrace
df.loc[(df['hasTerrace'] != 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = 0

# terraceOrientation
mode_terrace = df.loc[(df['hasTerrace'] == 1), 'terraceOrientation'].mode()[0]
df.loc[(df['hasTerrace'] == 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = mode_terrace
df.loc[(df['hasTerrace'] != 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = 'NO_TERRACE'

# epcScore
epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
df = df[df['epcScore'].isin(epc_order)]
df['epcScore'] = df['epcScore'].fillna(df['epcScore'].mode()[0])

def transform_data_types(df, col_types):
    for col, dtype in col_types.items():
        df[col] = df[col].astype(dtype)
    return df

col_types = {'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int', 'bathroomCount': 'int',
             'province': 'str', 'locality': 'str', 'postCode': 'int', 'habitableSurface': 'float', 
             'hasBasement': 'int', 'buildingCondition': 'str',
             'buildingConstructionYear': 'int', 'hasLift': 'int', 'floodZoneType': 'str',
             'heatingType': 'str', 'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int', 'hasThermicPanels': 'int',
             'kitchenType': 'str', 'landSurface': 'float', 'hasLivingRoom': 'int', 'livingRoomSurface': 'float',
             'hasGarden': 'int', 'gardenSurface': 'float', 'parkingCountIndoor': 'int', 'parkingCountOutdoor': 'int',
             'hasAirConditioning': 'int', 'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int', 
             'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int', 'hasTerrace': 'int', 'terraceSurface': 'float',
             'terraceOrientation': 'str', 'epcScore': 'str', 'price': 'float', 'facadeCount': 'int'}

df = transform_data_types(df, col_types)

In [8]:
df.columns

Index(['id', 'type', 'subtype', 'bedroomCount', 'bathroomCount', 'province',
       'locality', 'postCode', 'habitableSurface', 'hasAttic', 'hasBasement',
       'buildingCondition', 'buildingConstructionYear', 'hasLift',
       'floodZoneType', 'heatingType', 'hasHeatPump', 'hasPhotovoltaicPanels',
       'hasThermicPanels', 'kitchenType', 'landSurface', 'hasLivingRoom',
       'livingRoomSurface', 'hasGarden', 'gardenSurface', 'parkingCountIndoor',
       'parkingCountOutdoor', 'hasAirConditioning', 'hasArmoredDoor',
       'hasVisiophone', 'hasOffice', 'toiletCount', 'hasSwimmingPool',
       'hasFireplace', 'hasTerrace', 'terraceSurface', 'terraceOrientation',
       'epcScore', 'price', 'facadeCount'],
      dtype='object')

In [9]:
print("Percentage of nan values : ")
print((df.isna().mean() * 100).round(2).astype(str) + ' %')

Percentage of nan values : 
id                          0.0 %
type                        0.0 %
subtype                     0.0 %
bedroomCount                0.0 %
bathroomCount               0.0 %
province                    0.0 %
locality                    0.0 %
postCode                    0.0 %
habitableSurface            0.0 %
hasAttic                    0.0 %
hasBasement                 0.0 %
buildingCondition           0.0 %
buildingConstructionYear    0.0 %
hasLift                     0.0 %
floodZoneType               0.0 %
heatingType                 0.0 %
hasHeatPump                 0.0 %
hasPhotovoltaicPanels       0.0 %
hasThermicPanels            0.0 %
kitchenType                 0.0 %
landSurface                 0.0 %
hasLivingRoom               0.0 %
livingRoomSurface           0.0 %
hasGarden                   0.0 %
gardenSurface               0.0 %
parkingCountIndoor          0.0 %
parkingCountOutdoor         0.0 %
hasAirConditioning          0.0 %
hasArmoredDoor      

In [10]:
print(df.dtypes)

id                            int64
type                         object
subtype                      object
bedroomCount                  int64
bathroomCount                 int64
province                     object
locality                     object
postCode                      int64
habitableSurface            float64
hasAttic                      int64
hasBasement                   int64
buildingCondition            object
buildingConstructionYear      int64
hasLift                       int64
floodZoneType                object
heatingType                  object
hasHeatPump                   int64
hasPhotovoltaicPanels         int64
hasThermicPanels              int64
kitchenType                  object
landSurface                 float64
hasLivingRoom                 int64
livingRoomSurface           float64
hasGarden                     int64
gardenSurface               float64
parkingCountIndoor            int64
parkingCountOutdoor           int64
hasAirConditioning          

## Keep only houses between 100.000 and 1.000.000€

In [13]:
df_clean = df[(df['price']<1000000) & (df['price']>100000)]
df_clean.shape

(59267, 40)

## Handle categrical data :

In [None]:
# Type into isHouse -> if false : Apartment
df_clean['isHouse'] = (df_clean['type'] == 'HOUSE').astype(bool)

# subtype -> in pipeline

# province ? drop or dummies ?
df_clean = pd.get_dummies(df_clean, columns=['province'], prefix='province')
# locality ? drop because zipcode

# building condition 
condition_rating = {
    'to restore': 0,
    'to renovate': 1,
    'to be done up': 2,
    'good': 3,
    'just renovated': 4,
    'as new': 5
}
df_clean['buildingCondition'] = (df_clean['buildingCondition'].astype(str).str.strip().str.lower()
                                    .map(condition_rating).fillna(-1).astype(int))


# heatingType
df_clean = pd.get_dummies(df_clean, columns=['heatingType'], prefix='heating')

# kitchenType
df_clean['kitchenScore'] = df_clean['kitchenType'].map({
    'Not equipped': 0,
    'Semi equipped': 1,
    'Installed': 2,
    'Hyper equipped': 3,
    'USA semi equipped': 1,
    'USA hyper equipped': 3
})

# epcScore
epc_score_rating = {
    'A++': 10,
    'A+': 9,
    'A': 8,
    'B': 7,
    'C': 6,
    'D': 5,
    'E': 4,
    'F': 3,
    'G': 2,
    'G_F': 1
}
df_clean['epcScore'] = (
    df_clean['epcScore']
    .astype(str)
    .str.strip()
    .str.upper()
    .map(epc_score_rating)
)

df_clean = df_clean.drop(columns=['type', 'heatingType', 'terraceOrientation', 'province'])

KeyError: "None of [Index(['province'], dtype='object')] are in the [columns]"

In [2]:
def epcToNumeric(row):
    region = row['region']
    epc_score = row['epcScore']
    
    epc_mapping = {
        'Flanders': {
            'A++': 0,
            'A+': 0,
            'A': 100,
            'B': 200,
            'C': 300,
            'D': 400,
            'E': 500,
            'F': 600,
            'G': 700
        },
        'Wallonia': {
            'A++': 0,
            'A+': 50,
            'A': 90,
            'B': 170,
            'C': 250,
            'D': 330,
            'E': 420,
            'F': 510,
            'G': 600
        },
        'Bruxelles': {
            'A++': 0,
            'A+': 0,
            'A': 45,
            'B': 95,
            'C': 145,
            'D': 210,
            'E': 275,
            'F': 345,
            'G': 450
        }
    }
    
    return epc_mapping.get(region, {}).get(epc_score, None)

In [3]:
def pricePerM2(df):
    df['pricePerM2'] = df['price']/df['habitableSurface']
    return df

In [4]:
def getCoordinates(df):
    nomi = pgeocode.Nominatim('be')
    
    unique_postcodes = df["postCode"].astype(str).unique()

    geo_df = nomi.query_postal_code(list(unique_postcodes))

    geo_df = geo_df[['postal_code', 'latitude', 'longitude']]
    geo_df = geo_df.rename(columns={'postal_code': 'postCode'})

    df['postCode'] = df['postCode'].astype(str)
    geo_df['postCode'] = geo_df['postCode'].astype(str)

    df = df.merge(geo_df, on='postCode', how='left')

    return df

In [None]:
# Make a cleaning function :

def transform_data_types(df, col_types):
        for col, dtype in col_types.items():
            df[col] = df[col].astype(dtype)
        return df

def cleaning(df):
    df = df.drop(columns=["Unnamed: 0", "url"])

    df = df.drop(columns=['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople', 'roomCount', 'diningRoomSurface', 
                          'streetFacadeWidth', 'gardenOrientation', 'kitchenSurface', 'floorCount', 'hasDiningRoom', 
                          'hasDressingRoom'])
    
    
    binary_cols = [
        'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels', 
        'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice', 
        'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
        'hasAttic'
    ]
    
    for col in binary_cols:
        df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0}).fillna(0).astype(int)
    
    # Colonnes dépendantes d'autres colonnes
    df['hasLivingRoom'] = df['hasLivingRoom'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasLivingRoom'].isna(), 'hasLivingRoom'] = df['livingRoomSurface'].notnull().astype(int)
    
    df['hasGarden'] = df['hasGarden'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasGarden'].isna(), 'hasGarden'] = df['gardenSurface'].notnull().astype(int)
    
    df['hasTerrace'] = df['hasTerrace'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasTerrace'].isna(), 'hasTerrace'] = df['terraceSurface'].notnull().astype(int)
    
    # When hasLivingRoom = 0 ; livingRoomSurface = 0
    df.loc[df['hasLivingRoom'] == 0, 'livingRoomSurface'] = 0
    
    # When hasGarden = 0 ; gardenSurface = 0
    df.loc[df['hasGarden'] == 0, 'gardenSurface'] = 0
    
    # When hasTerrace = 0 ; terraceSurface = 0 and terraceOrientation = 0
    df.loc[df['hasTerrace'] == 0, 'terraceSurface'] = 0
    df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0
    
    #drop number of facade bigger than 4 and transform "facedeCount" into "facadeCount"
    df['facadeCount'] = df['facedeCount']
    df = df.drop(columns='facedeCount')
    df['facadeCount'] = df['facadeCount'].fillna(2)
    '''df = df[df['facadeCount'] <= 4]'''
    
    # bedroomCount : lets assume that they have at least one so fill nan by 1
    df['bedroomCount'] = df['bedroomCount'].fillna(1).astype(float)
    
    # bathroomCount same as bedrooms
    df['bathroomCount'] = df['bathroomCount'].fillna(1).astype(float)
    
    # toiletCount same as bedrooms
    df['toiletCount'] = df['toiletCount'].fillna(1).astype(float)
    
    # habitableSurface : replace by median 
    #df['habitableSurface'] = df['habitableSurface'].fillna(df['habitableSurface'].median())
    mediane_by_subtype = df.groupby('subtype')['habitableSurface'].median()
    df['habitableSurface'] = df.apply(
        lambda row: mediane_by_subtype[row['subtype']] if pd.isna(row['habitableSurface']) else row['habitableSurface'],
        axis=1
    )
    
    # buildingCondition : replace by 'NOT_MENTIONED
    df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')
    
    # buildingConstructionYear
    df['buildingConstructionYear'] = df['buildingConstructionYear'].fillna(df['buildingConstructionYear'].median()).astype(int)
    
    
    # floodZoneType lts assume that missing values are NON_FLOOD_ZONE
    df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')
    
    # heatingType
    df['heatingType'] = df['heatingType'].fillna(df['heatingType'].mode()[0])
    
    # hasThermicPanels lets assume that if its not precised, there are not
    df['hasThermicPanels'] = df['hasThermicPanels'].fillna(0).astype(float)
    
    # kitchenType
    df['kitchenType'] = df['kitchenType'].fillna(df['kitchenType'].mode()[0])
    
    # landSurface
    df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())
    
    # livingRoomSurface
    df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())
    
    # terraceSurface
    median_terrace = df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].notnull()), 'terraceSurface'].median()
    df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = median_terrace
    df.loc[(df['hasTerrace'] != 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = 0
    
    # terraceOrientation
    mode_terrace = df.loc[(df['hasTerrace'] == 1), 'terraceOrientation'].mode()[0]
    df.loc[(df['hasTerrace'] == 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = mode_terrace
    df.loc[(df['hasTerrace'] != 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = 'NO_TERRACE'

    
    col_types = {'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int', 'bathroomCount': 'int',
                 'province': 'str', 'locality': 'str', 'postCode': 'int', 'habitableSurface': 'float', 
                 'hasBasement': 'int', 'buildingCondition': 'str',
                 'buildingConstructionYear': 'int', 'hasLift': 'int', 'floodZoneType': 'str',
                 'heatingType': 'str', 'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int', 'hasThermicPanels': 'int',
                 'kitchenType': 'str', 'landSurface': 'float', 'hasLivingRoom': 'int', 'livingRoomSurface': 'float',
                 'hasGarden': 'int', 'gardenSurface': 'float', 'parkingCountIndoor': 'int', 'parkingCountOutdoor': 'int',
                 'hasAirConditioning': 'int', 'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int', 
                 'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int', 'hasTerrace': 'int', 'terraceSurface': 'float',
                 'terraceOrientation': 'str', 'epcScore': 'str', 'facadeCount': 'int'}
    
    df = transform_data_types(df, col_types)
###
###
###
    # Type into isHouse -> if false : Apartment
    df['isHouse'] = (df['type'] == 'HOUSE').astype(int)

    # subtype -> in pipeline

    # province ? drop or dummies ?
    df = pd.get_dummies(df, columns=['province'], prefix='province', dtype=int)
    
    # locality ? drop because zipcode

    # building condition 
    condition_rating = {
        'to restore': 0,
        'to renovate': 1,
        'to be done up': 2,
        'good': 3,
        'just renovated': 4,
        'as new': 5
    }
    df['buildingCondition'] = (df['buildingCondition'].astype(str).str.strip().str.lower()
                                    .map(condition_rating).fillna(-1).astype(int))

    # floodzone type 
    df['floodZoneType'] = (df['floodZoneType'] != 'NON_FLOOD_ZONE').astype(int)
    
    # heatingType
    df = pd.get_dummies(df, columns=['heatingType'], prefix='heating', dtype=int)
    
    # kitchenType
    df = pd.get_dummies(df, columns=['kitchenType'], prefix='kitchen', dtype=int)

    # add region information
    def get_region(zip_code):
        if 1000 <= zip_code <= 1299:
            return "Bruxelles"
        elif 1300 <= zip_code <= 1499 or 4000 <= zip_code <= 7999:
            return "Wallonia"
        else:
            return "Flanders"
    
    df['region'] = df['postCode'].apply(get_region)

    # epcScore
    df['epcScore'] = df.apply(epcToNumeric, axis=1)

    df = pricePerM2(df)
    df = getCoordinates(df)
    
    df = df.dropna(subset=['latitude', 'longitude'])

    df = df.drop(columns=['type', 'locality', 'region'])
    
    return df

In [6]:
def kdePriceM2ProvinceKNN(df):

    scaler = StandardScaler()
    coords_scaled = scaler.fit_transform(df[['latitude', 'longitude']])

    k = 20 
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(coords_scaled)
    distances, indices = knn.kneighbors(coords_scaled)

    kde_scores = []

    for i in range(len(df)):
        neighbor_idxs = indices[i]
        neighbor_prices = df['pricePerM2'].iloc[neighbor_idxs].dropna()

        if len(neighbor_prices) < 2:
            kde_scores.append(np.nan)
        else:
            kde = gaussian_kde(neighbor_prices)
            density = kde(df['pricePerM2'].iloc[i])
            kde_scores.append(density[0])

    df['kde_price_per_m2_knn'] = kde_scores

    df = df.drop(columns=['pricePerM2', 'latitude', 'longitude'])

    return df 

In [7]:
df = pd.read_csv("./data/Kangaroo.csv")
df = df.drop_duplicates(subset=["id"], keep="first")
df = df[(df['price']<2000000) & (df['price']>100000)]

# drop lines without price
df = df.dropna(subset="price")
# epcScore
epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
df = df[df['epcScore'].isin(epc_order)]
df['epcScore'] = df['epcScore'].fillna(df['epcScore'].mode()[0])

transform_data_types(df, {'price':float})

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train = cleaning(df_train)
df_test = cleaning(df_test)

df_train = kdePriceM2ProvinceKNN(df_train)
df_test = kdePriceM2ProvinceKNN(df_test)

X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test.drop(columns=['price'])
y_test = df_test['price']

X_test = X_test[X_train.columns]

In [None]:
# select multiple models

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': linear_model.Lasso(alpha=0.1),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'ElasticNet': ElasticNet(random_state=0),
    'XGBoost': xgb.XGBRegressor(n_estimators=2000, random_state=42, learning_rate=0.1),
    'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.2),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, silent=True),
    'Ridge': Ridge(alpha=1.0),
    'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

}

results = {}
best_mae = float('inf')
best_model_name = ''
best_pipeline = Pipeline([])

for name, model in models.items():
    pipeline = Pipeline([
        ('encoder', ce.TargetEncoder(cols=['subtype', 'terraceOrientation'])),
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    pipeline.fit(X_train.drop(columns='id'), y_train)

    preds = pipeline.predict(X_test.drop(columns='id'))
    
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)

    errors = abs(preds - y_test)
    mape = 100 * (errors / y_test)
    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print(f"{name} : MAE = {mae:.4f}, MSE = {mse:.4f}, accuracy = {accuracy:.4f}")

    results[name] = mae

    if mae < best_mae:
        best_mae = mae
        best_mse = mse
        best_accuracy = accuracy
        best_model_name = name
        best_pipeline = pipeline
        best_model = model

print("Models results :")
for model_name, mae in results.items():
    print(f"{model_name} : MAE = {mae:.4f}ler")

print(f"\n -> Best Model : {best_model_name} with MAE = {best_mae:.4f} and MSE = {best_mse:.4f}; accuracy = {best_accuracy:.4f}")

LinearRegression : MAE = 125018.7514, MSE = 37858284419.1513, accuracy = 65.0324


  model = cd_fast.enet_coordinate_descent(


Lasso : MAE = 125018.6187, MSE = 37858279429.0502, accuracy = 65.0325
DecisionTree : MAE = 99555.3496, MSE = 29712602325.9207, accuracy = 74.8266
RandomForest : MAE = 72203.4755, MSE = 14797549633.3701, accuracy = 81.4964
ElasticNet : MAE = 124787.2085, MSE = 40304657653.1282, accuracy = 65.5949
XGBoost : MAE = 63450.9390, MSE = 11229556288.8620, accuracy = 83.4498


In [162]:
# Applique le préprocessing uniquement
preprocessed_X = best_pipeline[:-1].transform(X_test.drop(columns='id'))

# Récupère les noms des colonnes après le TargetEncoder
try:
    feature_names = best_pipeline.named_steps['encoder'].get_feature_names_out()
except:
    # Fallback si ce n’est pas supporté
    feature_names = X_test.drop(columns='id').columns  # ou crée des noms bidons si ça plante encore

# Maintenant on construit le DataFrame correctement
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': r.importances_mean
})

importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(f"model: {importance_df}")

model:                                                       Feature  Importance
4                                            habitableSurface    0.660922
68                                       kde_price_per_m2_knn    0.247751
3                                                    postCode    0.227995
13                                                landSurface    0.063087
30                                                   epcScore    0.054287
2                                               bathroomCount    0.041386
0                                                     subtype    0.023197
1                                                bedroomCount    0.022862
43                                     province_West Flanders    0.017198
8                                    buildingConstructionYear    0.010532
9                                                     hasLift    0.007459
24                                                toiletCount    0.007191
32                             

In [16]:
model = best_model

pipeline = Pipeline([
    ('encoder', ce.TargetEncoder(cols=['subtype', 'terraceOrientation'])),
    ('scaler', StandardScaler()),
    ('model', model)
])

pipeline.fit(X_train.drop(columns='id'), y_train)
preds = pipeline.predict(X_test.drop(columns='id'))

mae = mean_absolute_error(y_test, preds)
errors = abs(preds - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)

print(f"{best_model_name} : MAE = {mae:.4f}, accuracy = {accuracy:.4f}")


XGBoost : MAE = 38534.1303, accuracy = 84.7865
