# IMMO-ELIZA

In [11]:
!pip install category_encoders
!pip install pgeocode



In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import category_encoders as ce
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ridge_regression
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from scipy.stats import gaussian_kde
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pgeocode
import xgboost as xgb
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
def epcToNumeric(row):
    region = row['region']
    epc_score = row['epcScore']
    
    epc_mapping = {
        'Flanders': {
            'A++': 0,
            'A+': 0,
            'A': 100,
            'B': 200,
            'C': 300,
            'D': 400,
            'E': 500,
            'F': 600,
            'G': 700
        },
        'Wallonia': {
            'A++': 0,
            'A+': 50,
            'A': 90,
            'B': 170,
            'C': 250,
            'D': 330,
            'E': 420,
            'F': 510,
            'G': 600
        },
        'Bruxelles': {
            'A++': 0,
            'A+': 0,
            'A': 45,
            'B': 95,
            'C': 145,
            'D': 210,
            'E': 275,
            'F': 345,
            'G': 450
        }
    }
    
    return epc_mapping.get(region, {}).get(epc_score, None)

In [3]:
def pricePerM2(df):
    df['pricePerM2'] = df['price']/df['habitableSurface']
    return df

In [4]:
def getCoordinates(df):
    nomi = pgeocode.Nominatim('be')
    
    unique_postcodes = df["postCode"].astype(str).unique()

    geo_df = nomi.query_postal_code(list(unique_postcodes))

    geo_df = geo_df[['postal_code', 'latitude', 'longitude']]
    geo_df = geo_df.rename(columns={'postal_code': 'postCode'})

    df['postCode'] = df['postCode'].astype(str)
    geo_df['postCode'] = geo_df['postCode'].astype(str)

    df = df.merge(geo_df, on='postCode', how='left')

    return df

In [5]:
# Make a cleaning function :

def transform_data_types(df, col_types):
        for col, dtype in col_types.items():
            df[col] = df[col].astype(dtype)
        return df

def cleaning(df):
    df = df.drop(columns=["Unnamed: 0", "url"])

    df = df.drop(columns=['monthlyCost', 'hasBalcony', 'accessibleDisabledPeople', 'roomCount', 'diningRoomSurface', 
                          'streetFacadeWidth', 'gardenOrientation', 'kitchenSurface', 'floorCount', 'hasDiningRoom', 
                          'hasDressingRoom'])
    
    
    binary_cols = [
        'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels', 
        'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice', 
        'hasSwimmingPool', 'hasFireplace', 'parkingCountIndoor', 'parkingCountOutdoor',
        'hasAttic'
    ]
    
    for col in binary_cols:
        df[col] = df[col].map({True: 1, False: 0, 'True': 1, 'False': 0}).fillna(0).astype(int)
    
    # Colonnes dépendantes d'autres colonnes
    df['hasLivingRoom'] = df['hasLivingRoom'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasLivingRoom'].isna(), 'hasLivingRoom'] = df['livingRoomSurface'].notnull().astype(int)
    
    df['hasGarden'] = df['hasGarden'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasGarden'].isna(), 'hasGarden'] = df['gardenSurface'].notnull().astype(int)
    
    df['hasTerrace'] = df['hasTerrace'].map({True: 1, False: 0, 'True': 1, 'False': 0})
    df.loc[df['hasTerrace'].isna(), 'hasTerrace'] = df['terraceSurface'].notnull().astype(int)
    
    # When hasLivingRoom = 0 ; livingRoomSurface = 0
    df.loc[df['hasLivingRoom'] == 0, 'livingRoomSurface'] = 0
    
    # When hasGarden = 0 ; gardenSurface = 0
    df.loc[df['hasGarden'] == 0, 'gardenSurface'] = 0
    
    # When hasTerrace = 0 ; terraceSurface = 0 and terraceOrientation = 0
    df.loc[df['hasTerrace'] == 0, 'terraceSurface'] = 0
    df.loc[df['hasTerrace'] == 0, 'terraceOrientation'] = 0
    
    #drop number of facade bigger than 4 and transform "facedeCount" into "facadeCount"
    df['facadeCount'] = df['facedeCount']
    df = df.drop(columns='facedeCount')
    df['facadeCount'] = df['facadeCount'].fillna(2)
    '''df = df[df['facadeCount'] <= 4]'''
    
    # bedroomCount : lets assume that they have at least one so fill nan by 1
    df['bedroomCount'] = df['bedroomCount'].fillna(1).astype(float)
    
    # bathroomCount same as bedrooms
    df['bathroomCount'] = df['bathroomCount'].fillna(1).astype(float)
    
    # toiletCount same as bedrooms
    df['toiletCount'] = df['toiletCount'].fillna(1).astype(float)
    
    # habitableSurface : replace by median 
    #df['habitableSurface'] = df['habitableSurface'].fillna(df['habitableSurface'].median())
    mediane_by_subtype = df.groupby('subtype')['habitableSurface'].median()
    df['habitableSurface'] = df.apply(
        lambda row: mediane_by_subtype[row['subtype']] if pd.isna(row['habitableSurface']) else row['habitableSurface'],
        axis=1
    )
    
    # buildingCondition : replace by 'NOT_MENTIONED
    df['buildingCondition'] = df['buildingCondition'].fillna('NOT_MENTIONED')
    
    # buildingConstructionYear
    df['buildingConstructionYear'] = df['buildingConstructionYear'].fillna(df['buildingConstructionYear'].median()).astype(int)
    
    
    # floodZoneType lts assume that missing values are NON_FLOOD_ZONE
    df['floodZoneType'] = df['floodZoneType'].fillna('NON_FLOOD_ZONE')
    
    # heatingType
    df['heatingType'] = df['heatingType'].fillna(df['heatingType'].mode()[0])
    
    # hasThermicPanels lets assume that if its not precised, there are not
    df['hasThermicPanels'] = df['hasThermicPanels'].fillna(0).astype(float)
    
    # kitchenType
    df['kitchenType'] = df['kitchenType'].fillna(df['kitchenType'].mode()[0])
    
    # landSurface
    df['landSurface'] = df['landSurface'].fillna(df['landSurface'].median())
    
    # livingRoomSurface
    df['livingRoomSurface'] = df['livingRoomSurface'].fillna(df['livingRoomSurface'].median())
    
    # terraceSurface
    median_terrace = df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].notnull()), 'terraceSurface'].median()
    df.loc[(df['hasTerrace'] == 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = median_terrace
    df.loc[(df['hasTerrace'] != 1) & (df['terraceSurface'].isna()), 'terraceSurface'] = 0
    
    # terraceOrientation
    mode_terrace = df.loc[(df['hasTerrace'] == 1), 'terraceOrientation'].mode()[0]
    df.loc[(df['hasTerrace'] == 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = mode_terrace
    df.loc[(df['hasTerrace'] != 1) & (df['terraceOrientation'].isna()), 'terraceOrientation'] = 'NO_TERRACE'

    
    col_types = {'id': 'int', 'type': 'str', 'subtype': 'str', 'bedroomCount': 'int', 'bathroomCount': 'int',
                 'province': 'str', 'locality': 'str', 'postCode': 'int', 'habitableSurface': 'float', 
                 'hasBasement': 'int', 'buildingCondition': 'str',
                 'buildingConstructionYear': 'int', 'hasLift': 'int', 'floodZoneType': 'str',
                 'heatingType': 'str', 'hasHeatPump': 'int', 'hasPhotovoltaicPanels': 'int', 'hasThermicPanels': 'int',
                 'kitchenType': 'str', 'landSurface': 'float', 'hasLivingRoom': 'int', 'livingRoomSurface': 'float',
                 'hasGarden': 'int', 'gardenSurface': 'float', 'parkingCountIndoor': 'int', 'parkingCountOutdoor': 'int',
                 'hasAirConditioning': 'int', 'hasArmoredDoor': 'int', 'hasVisiophone': 'int', 'hasOffice': 'int', 
                 'toiletCount': 'int', 'hasSwimmingPool': 'int', 'hasFireplace': 'int', 'hasTerrace': 'int', 'terraceSurface': 'float',
                 'terraceOrientation': 'str', 'epcScore': 'str', 'facadeCount': 'int'}
    
    df = transform_data_types(df, col_types)
###
###
###
    # Type into isHouse -> if false : Apartment
    df['isHouse'] = (df['type'] == 'HOUSE').astype(int)

    # subtype -> in pipeline

    # province ? drop or dummies ?
    df = pd.get_dummies(df, columns=['province'], prefix='province', dtype=int)
    
    # locality ? drop because zipcode

    # building condition 
    condition_rating = {
        'to restore': 0,
        'to renovate': 1,
        'to be done up': 2,
        'good': 3,
        'just renovated': 4,
        'as new': 5
    }
    df['buildingCondition'] = (df['buildingCondition'].astype(str).str.strip().str.lower()
                                    .map(condition_rating).fillna(-1).astype(int))

    # floodzone type 
    df['floodZoneType'] = (df['floodZoneType'] != 'NON_FLOOD_ZONE').astype(int)
    
    # heatingType
    df = pd.get_dummies(df, columns=['heatingType'], prefix='heating', dtype=int)
    
    # kitchenType
    df = pd.get_dummies(df, columns=['kitchenType'], prefix='kitchen', dtype=int)

    # add region information
    def get_region(zip_code):
        if 1000 <= zip_code <= 1299:
            return "Bruxelles"
        elif 1300 <= zip_code <= 1499 or 4000 <= zip_code <= 7999:
            return "Wallonia"
        else:
            return "Flanders"
    
    df['region'] = df['postCode'].apply(get_region)

    # epcScore
    df['epcScore'] = df.apply(epcToNumeric, axis=1)

    df = pricePerM2(df)
    df = getCoordinates(df)
    
    df = df.dropna(subset=['latitude', 'longitude'])

    df = df.drop(columns=['type', 'locality', 'region'])
    
    return df

In [6]:
def kdePriceM2ProvinceKNN(df):

    scaler = StandardScaler()
    coords_scaled = scaler.fit_transform(df[['latitude', 'longitude']])

    k = 20 
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(coords_scaled)
    distances, indices = knn.kneighbors(coords_scaled)

    kde_scores = []

    for i in range(len(df)):
        neighbor_idxs = indices[i]
        neighbor_prices = df['pricePerM2'].iloc[neighbor_idxs].dropna()

        if len(neighbor_prices) < 2:
            kde_scores.append(np.nan)
        else:
            kde = gaussian_kde(neighbor_prices)
            density = kde(df['pricePerM2'].iloc[i])
            kde_scores.append(density[0])

    df['kde_price_per_m2_knn'] = kde_scores

    df = df.drop(columns=['pricePerM2', 'latitude', 'longitude'])

    return df 

In [7]:
df = pd.read_csv("./data/Kangaroo.csv")
df = df.drop_duplicates(subset=["id"], keep="first")
#df = df[(df['price']<2000000) & (df['price']>100000)]
df = df[(df['price']<1000000) ]

# drop lines without price
df = df.dropna(subset="price")
# epcScore
epc_order = ['A++', 'A+', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
df = df[df['epcScore'].isin(epc_order)]
df['epcScore'] = df['epcScore'].fillna(df['epcScore'].mode()[0])

transform_data_types(df, {'price':float})

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train = cleaning(df_train)
df_test = cleaning(df_test)

df_train = kdePriceM2ProvinceKNN(df_train)
df_test = kdePriceM2ProvinceKNN(df_test)

X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test.drop(columns=['price'])
y_test = df_test['price']

X_test = X_test[X_train.columns]

In [16]:
X_train.dtypes

id                              int64
subtype                        object
bedroomCount                    int64
bathroomCount                   int64
postCode                       object
                               ...   
kitchen_USA_HYPER_EQUIPPED      int64
kitchen_USA_INSTALLED           int64
kitchen_USA_SEMI_EQUIPPED       int64
kitchen_USA_UNINSTALLED         int64
kde_price_per_m2_knn          float64
Length: 62, dtype: object

In [None]:
# select multiple models

models = {
    #'LinearRegression': LinearRegression(),
    #'Lasso': linear_model.Lasso(alpha=0.1),
    #'DecisionTree': DecisionTreeRegressor(random_state=42),
    #'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    #'ElasticNet': ElasticNet(random_state=0),
    'XGBoost': xgb.XGBRegressor(n_estimators=2000, random_state=42, learning_rate=0.1),
    'XGBoostElsa': xgb.XGBRegressor(n_estimators=2000, random_state=42, learning_rate=0.05, subsample= 0.8),
    'XGBoostAlex': xgb.XGBRegressor(n_estimators=2500, random_state=42, learning_rate=0.08, subsample= 0.8),
    'XGBoostAlex2': xgb.XGBRegressor(n_estimators=2500, random_state=42, learning_rate=0.08),
    #'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.2),
    #'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    #'AdaBoost': AdaBoostRegressor(random_state=42),
    #'LightGBM': lgb.LGBMRegressor(random_state=42),
    #'CatBoost': CatBoostRegressor(random_state=42, silent=True),
    #'Ridge': Ridge(alpha=1.0),
    #'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}

results = {}
best_mae = float('inf')
best_model_name = ''
best_pipeline = Pipeline([])

for name, model in models.items():
    pipeline = Pipeline([
        ('encoder', ce.TargetEncoder(cols=['subtype', 'terraceOrientation'])),
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    pipeline.fit(X_train.drop(columns='id'), y_train)

    preds = pipeline.predict(X_test.drop(columns='id'))
    
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)

    errors = abs(preds - y_test)
    mape = 100 * (errors / y_test)
    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print(f"{name} : MAE = {mae:.4f}, MSE = {mse:.4f}, accuracy = {accuracy:.4f}")

    results[name] = mae

    if mae < best_mae:
        best_mae = mae
        best_mse = mse
        best_accuracy = accuracy
        best_model_name = name
        best_pipeline = pipeline
        best_model = model

print("Models results :")
for model_name, mae in results.items():
    print(f"{model_name} : MAE = {mae:.4f}ler")

print(f"\n -> Best Model : {best_model_name} with MAE = {best_mae:.4f} and MSE = {best_mse:.4f}; accuracy = {best_accuracy:.4f}")

LinearRegression : MAE = 89984.1603, MSE = 15928704937.3619, accuracy = 69.5957
Lasso : MAE = 89984.0817, MSE = 15928701927.3055, accuracy = 69.5957
DecisionTree : MAE = 80516.3371, MSE = 14668202836.7372, accuracy = 74.1166
RandomForest : MAE = 56494.0618, MSE = 6894266571.3988, accuracy = 81.4932
ElasticNet : MAE = 92776.0046, MSE = 16949444545.7836, accuracy = 68.0657
XGBoost : MAE = 50247.8558, MSE = 5488290954.6839, accuracy = 83.8174
XGBoostElsa : MAE = 49760.4806, MSE = 5389791103.1069, accuracy = 83.9645
SVR : MAE = 139031.6666, MSE = 37603554723.6041, accuracy = 54.3001
GradientBoosting : MAE = 66794.3751, MSE = 9042786980.0018, accuracy = 77.6679
AdaBoost : MAE = 132921.4580, MSE = 23376165644.2714, accuracy = 43.2115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1919
[L

In [None]:
model = best_model

pipeline = Pipeline([
    ('encoder', ce.TargetEncoder(cols=['subtype', 'terraceOrientation'])),
    ('scaler', StandardScaler()),
    ('model', model)
])

pipeline.fit(X_train.drop(columns='id'), y_train)
preds = pipeline.predict(X_test.drop(columns='id'))

mae = mean_absolute_error(y_test, preds)
errors = abs(preds - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)

print(f"{best_model_name} : MAE = {mae:.4f}, accuracy = {accuracy:.4f}")


XGBoost : MAE = 38534.1303, accuracy = 84.7865


In [9]:

pipeline = Pipeline([
    ('encoder', ce.TargetEncoder(cols=['subtype', 'terraceOrientation'])),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBRegressor(random_state=42))
])

param_grid = {
    'model__n_estimators': [1000, 2000, 3000],
    'model__max_depth': [5, 7, 9],
    'model__learning_rate': [0.005, 0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)

preds = grid_search.predict(X_test.drop(columns='id'))

mae = mean_absolute_error(y_test, preds)
errors = abs(preds - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)

print(f"{best_model_name} : MAE = {mae:.4f}, accuracy = {accuracy:.4f}")

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best params: {'model__learning_rate': 0.01, 'model__max_depth': 9, 'model__n_estimators': 3000, 'model__subsample': 0.8}


ValueError: Unexpected input dimension 61, expected 62

# TRY WITHOUT CLEANING

In [6]:
from preprocessing_simpler import trainTestClean

In [21]:
df_train, df_test = trainTestClean()

In [22]:
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test.drop(columns=['price'])
y_test = df_test['price']

X_test = X_test[X_train.columns]

In [23]:
# select multiple models

model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.031,
    max_depth=9,
    subsample=0.831,
    colsample_bytree=0.395,
    colsample_bylevel=0.449,
    reg_alpha=1.769,
    reg_lambda=3.922,
    random_state=42,
    tree_method='hist'
    )


model.fit(X_train.drop(columns='id'), y_train)

preds = model.predict(X_test.drop(columns='id'))
    
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)

print(f"MAE = {mae:.4f}, MSE = {mse:.4f}")
#print(f"MAE = {0}, MSE = {0}, accuracy = 100% you are the best")


MAE = 82649.2875, MSE = 41837062011.6745


# do nothing, clean nothing


In [26]:
# 1. Conversion des booléens en 0/1
bool_cols = [
    'hasAttic', 'hasBasement', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels',
    'hasThermicPanels', 'hasLivingRoom', 'hasGarden', 'hasAirConditioning',
    'hasArmoredDoor', 'hasVisiophone', 'hasOffice', 'hasSwimmingPool',
    'hasFireplace', 'hasTerrace'
]

for col in bool_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# 2. Encodage label des colonnes catégorielles
cat_cols = df.select_dtypes(include='object').columns

for col in cat_cols:
    df[col], _ = pd.factorize(df[col])

# 3. On garde que les lignes avec un prix connu
df = df[df["price"].notnull()]

# 4. Features et target
X = df.drop("price", axis=1)
y = df["price"]

# 5. Split + entraînement rapide
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# 6. Score
print("Score XGBoost :", model.score(X_test, y_test))

Score XGBoost : 0.7685086452971165


In [25]:
df.dtypes

type                         object
subtype                      object
bedroomCount                float64
bathroomCount               float64
province                     object
locality                     object
postCode                      int64
habitableSurface            float64
hasAttic                     object
hasBasement                  object
buildingCondition            object
buildingConstructionYear    float64
facedeCount                 float64
hasLift                      object
floodZoneType                object
heatingType                  object
hasHeatPump                  object
hasPhotovoltaicPanels        object
hasThermicPanels             object
kitchenType                  object
landSurface                 float64
hasLivingRoom                object
livingRoomSurface           float64
hasGarden                    object
gardenSurface               float64
gardenOrientation            object
parkingCountIndoor          float64
parkingCountOutdoor         