![Photo by Stephen Phillips - Hostreviews.co.uk on UnSplash](https://cf.bstatic.com/xdata/images/hotel/max1024x768/408003083.jpg?k=c49b5c4a2346b3ab002b9d1b22dbfb596cee523b53abef2550d0c92d0faf2d8b&o=&hp=1){fig-align="center" width=50%}


# Import data

In [1]:
import time
from pathlib import Path

import pandas as pd
from data import utils
from lets_plot import *
from lets_plot.mapping import as_discrete

LetsPlot.setup_html()

**Goal**:
- Identify what basic pre-processing steps need to be taken before uploading the data to a database

# Select Columns to Retain Based on the Quantity of Missing Values


In the realm of web scraping, managing the sheer volume of data is often the initial hurdle to conquer. It's not so much about deciding what data to collect but rather what data to retain. As we delve into the vast realm of the Imoweb website, we are met with a plethora of listings, each offering a unique set of information.

For many of these listings, there are commonalities – details like location and price tend to be constants. However, interspersed among them are those one-of-a-kind nuggets of information, such as the number of swimming pools available. While these specific details can certainly be vital in assessing the value of certain listings, the downside is that they can lead to a sparse dataset.

Currently, our primary objective is to pinpoint which features are prevalent across the board, drawing insights from a pre-scraped dataset comprising around 1000 ads. Once we've identified these common denominators, we can streamline our data collection process by retaining these key attributes while discarding the less likely occurrences.

In [31]:
for filename in utils.Configuration.RAW_DATA_PATH.glob("*.gzip"):
    if "data" in filename.stem:
        df = pd.read_parquet(filename)
df

Unnamed: 0,Address,Available as of,CO₂ emission,Covered parking spaces,Energy class,External reference,Flood zone type,Outdoor parking spaces,Planning permission obtained,Possible priority purchase right,...,Surroundings type,Furnished,Heating type,Bedroom 3 surface,Office,Latest land use designation,Street frontage width,"Gas, water & electricity",Price,Double glazing
0,Sint-Denijslaan 1 9000 - Gent,At delivery,Not specified,1,Not specified,5530472,Possible flood zone,1,Yes,No,...,,,,,,,,,,
1,Rue Simon 46/2 6990 - Hotton,Depending on the tenant,71 kg CO₂/m²,2,D,5530890,Property partially or completely located in a ...,,Yes,No,...,,,,,,,,,,
2,Sint-Denijslaan 1 9000 - Gent,At delivery,Not specified,,Not specified,5533819,Possible flood zone,,Yes,,...,,,,,,,,,,
3,Hoogstraat 20 9340 - Lede,After signing the deed,Not specified,,F,5523589,Non flood zone,,Yes,No,...,,,,,,,,,,
4,Heidestatiestraat 22 2920 - Kalmthout,,Not specified,,Not specified,95278 - NWB-23-SEM-0,Non flood zone,,Yes,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Avenuedes Ardennes 7d 4500 - Huy,,Not specified,1,E,18115 - 111115428,,,,,...,,,Fuel oil,,,,,No,"€ 265,000 265000 €",Yes
87,Sint-Denijslaan 1 9000 - Gent,,Not specified,,Not specified,5528536,Non flood zone,,,No,...,,,,,,,,,,
88,Avenue Louise 523 1050 - Ixelles,,87 kg CO₂/m²,2,B,,,4,,,...,,,Gas,21 m² square meters,,,,Yes,"€ 1,495,000 1495000 €",Yes
89,Avenue des Tournesols 14 1640 - Rhode-St-Genèse,,37 kg CO₂/m²,,D,5531996,,,,,...,,,Gas,16 m² square meters,,,5 m,Yes,"€ 445,000 445000 €",Yes


In [38]:
for columns in df:
    print(columns, df[columns].unique().shape)

Address (49,)
Available as of (5,)
CO₂ emission (32,)
Covered parking spaces (4,)
Energy class (8,)
External reference (85,)
Flood zone type (4,)
Outdoor parking spaces (6,)
Planning permission obtained (2,)
Possible priority purchase right (3,)
Primary energy consumption (67,)
Proceedings for breach of planning regulations (2,)
Reference number of the EPC report (60,)
Subdivision permit (3,)
Tenement building (2,)
Website (34,)
Yearly theoretical total energy consumption (20,)
ad_url (91,)
day_of_retrieval (91,)
Kitchen type (7,)
Width of the lot on the street (11,)
Kitchen surface (19,)
Dining room (2,)
Living area (43,)
Bedroom 1 surface (22,)
Basement (1,)
Connection to sewer network (2,)
As built plan (2,)
TV cable (2,)
Cadastral income (40,)
Bedrooms (8,)
Toilets (6,)
Building condition (6,)
Surface of the plot (50,)
Bedroom 2 surface (11,)
Garden surface (1,)
Construction year (42,)
Number of frontages (4,)
Bathrooms (5,)
Living room surface (26,)
Surroundings type (5,)
Furnishe

In [52]:
utils.Configuration.features_to_keep_sales

['day_of_retrieval',
 'ad_url',
 'Reference number of the EPC report',
 'Energy class',
 'Primary energy consumption',
 'Yearly theoretical total energy consumption',
 'CO₂ emission',
 'Tenement building',
 'Address',
 'Bedrooms',
 'Living area',
 'Bathrooms',
 'Surface of the plot',
 'Price',
 'Building condition',
 'Double glazing',
 'Number of frontages',
 'Website',
 'Toilets',
 'External reference',
 'Heating type',
 'Cadastral income',
 'Gas, water & electricity',
 'Latest land use designation',
 'Connection to sewer network',
 'Covered parking spaces',
 'Possible priority purchase right',
 'Proceedings for breach of planning regulations',
 'Construction year',
 'Subdivision permit',
 'Bedroom 1 surface',
 'Bedroom 2 surface',
 'Available as of',
 'Kitchen type',
 'Flood zone type',
 'Living room surface',
 'Planning permission obtained',
 'Kitchen surface',
 'TV cable',
 'Bedroom 3 surface',
 'Furnished',
 'Outdoor parking spaces',
 'Surroundings type',
 'Garden surface',
 'Base

In [81]:
(
    pd.read_csv(r"C:\Users\s0212777\individual_ad_22.csv")
    .set_index("0")
    .transpose()
    .filter(utils.Configuration.features_to_keep_sales)
    .pipe(
        lambda df: df.assign(
            **{
                col: pd.Series(dtype="float64")
                for col in set(utils.Configuration.features_to_keep_sales)
                - set(df.columns)
            }
        )
    )
)

0,Available as of,Construction year,Building condition,Street frontage width,Number of frontages,Surroundings type,Living area,Living room surface,Dining room,Kitchen type,...,Address,Website,External reference,day_of_retrieval,ad_url,Basement,Outdoor parking spaces,Planning permission obtained,Covered parking spaces,Proceedings for breach of planning regulations
1,To be defined,1918,Good,8 m,2,Isolated,133 m² square meters,32 m² square meters,Yes,Installed,...,Rue de la Wallonie 2A 4680 - Oupeye,http://www.nigel-immo.be,5529147,2023-09-22 20:42:43.891253,https://www.immoweb.be/en/classified/house/for...,,,,,


In [85]:
csv_files = []
for i in Path(r"C:\Users\s0212777").glob("*.csv"):
    temp = pd.read_csv(i)
    csv_files.append(temp)

In [86]:
csv_files

[   Unnamed: 0         Available as of  Construction year Building condition  \
 0           1  After signing the deed               2016             As new   
 
   Street frontage width  Number of frontages  \
 0                   7 m                    3   
 
                            Surroundings type            Living area  \
 0  Living area (residential, urban or rural)  135  m² square meters   
 
     Living room surface Dining room  ... Kitchen surface  Office  \
 0  30  m² square meters         Yes  ...             NaN     NaN   
 
   Covered parking spaces Basement Toilets  Connection to sewer network  \
 0                    NaN      NaN     NaN                          NaN   
 
   Proceedings for breach of planning regulations Website  \
 0                                            NaN     NaN   
 
   Width of the lot on the street As built plan  
 0                            NaN           NaN  
 
 [1 rows x 51 columns],
    Unnamed: 0 Available as of  Construction year 

In [92]:
pd.concat(csv_files, axis=0)

Unnamed: 0.1,Unnamed: 0,Available as of,Construction year,Building condition,Street frontage width,Number of frontages,Surroundings type,Living area,Living room surface,Dining room,...,Kitchen surface,Office,Covered parking spaces,Basement,Toilets,Connection to sewer network,Proceedings for breach of planning regulations,Website,Width of the lot on the street,As built plan
0,1,After signing the deed,2016.0,As new,7 m,3.0,"Living area (residential, urban or rural)",135 m² square meters,30 m² square meters,Yes,...,,,,,,,,,,
0,1,To be defined,1918.0,Good,4 m,2.0,Isolated,105 m² square meters,14 m² square meters,Yes,...,12 m² square meters,,,Yes,1.0,,,http://www.penoit.be,,
0,1,To be defined,1900.0,Good,,2.0,"Living area (residential, urban or rural)",130 m² square meters,24 m² square meters,Yes,...,18 m² square meters,,,,3.0,Connected,,http://www.mdi.be,,
0,1,At delivery,,,,,,,,,...,,,1.0,,,,,http://www.immodavinci.be,,
0,1,,1980.0,Good,,4.0,,744 m² square meters,,,...,,,16.0,,5.0,,,https://agency.quares.be,31 m meters,No
0,1,To be defined,1953.0,Good,14 m,3.0,Urban,301 m² square meters,20 m² square meters,Yes,...,25 m² square meters,Yes,2.0,,2.0,Not connected,No,http://www.living-stone.be,,
0,1,After signing the deed,1960.0,To be done up,6 m,3.0,"Living area (residential, urban or rural)",80 m² square meters,20 m² square meters,,...,6 m² square meters,,1.0,Yes,1.0,,,http://www.maxime-realestate.be,,
0,1,After signing the deed,1935.0,As new,,3.0,Urban,90 m² square meters,21 m² square meters,,...,6 m² square meters,,,,2.0,Connected,,http://www.amelotproperties.be,,
0,1,At delivery,,,,,,,,,...,,,,,,,No,http://www.immodavinci.be,,
0,1,,,,,,,,,,...,,,,,,,,http://www.heylenvastgoed.be,,


In [2]:
pd.read_parquet(r"C:\Users\s0212777\OneDrive - Universiteit Antwerpen\Jupyter_projects\Articles\house_price_prediction\data\raw\complete_dataset_2023-09-23.parquet.gzip")

Unnamed: 0,Available as of,Construction year,Building condition,Street frontage width,Number of frontages,Covered parking spaces,Outdoor parking spaces,Surroundings type,Living area,Living room surface,...,Cadastral income,Tenement building,Address,External reference,day_of_retrieval,ad_url,Website,As built plan,Office,Dining room
0,After signing the deed,1971.0,Just renovated,12 m,4.0,2.0,5.0,"Living area (residential, urban or rural)",197 m² square meters,47 m² square meters,...,"€ 1,279 1279 €",No,Stationstraat 30 9600 - Ronse,5411439,2023-09-23 17:11:29.870179,https://www.immoweb.be/en/classified/villa/for...,,,,
1,After signing the deed,1949.0,To renovate,18 m,3.0,1.0,,Isolated,139 m² square meters,10 m² square meters,...,€ 689 689 €,No,Rue de la Wallonie 2A 4680 - Oupeye,5534704,2023-09-23 17:11:29.995637,https://www.immoweb.be/en/classified/house/for...,http://www.nigel-immo.be,,Yes,Yes
2,After signing the deed,1920.0,Good,5.5 m,2.0,,,Urban,200 m² square meters,26 m² square meters,...,€ 917 917 €,No,Sint-Denijslaan 1 9000 - Gent,5531386,2023-09-23 17:11:32.203024,https://www.immoweb.be/en/classified/mansion/f...,http://www.immodavinci.be,No,Yes,
3,After signing the deed,1937.0,To renovate,8 m,2.0,,,Urban,230 m² square meters,,...,"€ 1,623 1623 €",No,Hoogstraat 20 9340 - Lede,5535368,2023-09-23 17:11:34.716784,https://www.immoweb.be/en/classified/house/for...,http://www.immoderas.be,,Yes,
4,Depending on the tenant,1900.0,Good,6.5 m,2.0,,,Urban,360 m² square meters,34 m² square meters,...,"€ 1,621 1621 €",No,"Mechelsesteenweg,157 2018 - Antwerpen",5534431,2023-09-23 17:11:34.857664,https://www.immoweb.be/en/classified/mansion/f...,,,,
5,,1925.0,As new,5 m,2.0,,,,178 m² square meters,30 m² square meters,...,€ 518 518 €,No,Plantin en Moretuslei 135A 2140 - Borgerhout,11351 - 3966,2023-09-23 17:11:35.080346,https://www.immoweb.be/en/classified/mansion/f...,http://www.immovasta.be,No,,
6,,1918.0,To be done up,12 m,2.0,3.0,2.0,Urban,148 m² square meters,,...,€ 936 936 €,No,Chaussee de Charleroi 59 B 5030 - Gembloux,441 - 93363974,2023-09-23 17:11:35.208188,https://www.immoweb.be/en/classified/house/for...,http://www.trevitessier.be,No,,
7,After signing the deed,2017.0,As new,6.8 m,3.0,,2.0,Urban,145 m² square meters,27 m² square meters,...,€ 833 833 €,No,Hoogstraat 20 9340 - Lede,5531684,2023-09-23 17:11:35.374734,https://www.immoweb.be/en/classified/house/for...,http://www.immoderas.be,,,
8,After signing the deed,1929.0,As new,8 m,2.0,,1.0,"Living area (residential, urban or rural)",300 m² square meters,39 m² square meters,...,"€ 2,791 2791 €",No,Avenue de Tervuren 113 1040 - Etterbeek,5532305,2023-09-23 17:11:35.544038,https://www.immoweb.be/en/classified/house/for...,,,,
9,After signing the deed,1967.0,Good,6 m,2.0,,,Urban,249 m² square meters,30 m² square meters,...,"€ 1,026 1026 €",Yes,Hoogstraat 20 9340 - Lede,5523589,2023-09-23 17:11:35.756325,https://www.immoweb.be/en/classified/mixed-use...,http://www.immoderas.be,,Yes,
