![Photo by Stephen Phillips - Hostreviews.co.uk on UnSplash](https://cf.bstatic.com/xdata/images/hotel/max1024x768/408003083.jpg?k=c49b5c4a2346b3ab002b9d1b22dbfb596cee523b53abef2550d0c92d0faf2d8b&o=&hp=1){fig-align="center" width=50%}


# Import data

In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from data import utils
from lets_plot import *
from lets_plot.mapping import as_discrete

LetsPlot.setup_html()

**Goal**:
- Identify what basic pre-processing steps need to be taken before uploading the data to a database

# Select Columns to Retain Based on the Quantity of Missing Values


In the realm of web scraping, managing the sheer volume of data is often the initial hurdle to conquer. It's not so much about deciding what data to collect but rather what data to retain. As we delve into the vast realm of the Imoweb website, we are met with a plethora of listings, each offering a unique set of information.

For many of these listings, there are commonalities – details like location and price tend to be constants. However, interspersed among them are those one-of-a-kind nuggets of information, such as the number of swimming pools available. While these specific details can certainly be vital in assessing the value of certain listings, the downside is that they can lead to a sparse dataset.

Currently, our primary objective is to pinpoint which features are prevalent across the board, drawing insights from a pre-scraped dataset comprising around 1000 ads. Once we've identified these common denominators, we can streamline our data collection process by retaining these key attributes while discarding the less likely occurrences.

In [2]:
for filename in utils.Configuration.RAW_DATA_PATH.glob("*.gzip"):
    if "data" in filename.stem:
        df = pd.read_parquet(filename)
print(df.shape)
df.head()

(60, 50)


Unnamed: 0,Available as of,Construction year,Building condition,Street frontage width,Number of frontages,Covered parking spaces,Outdoor parking spaces,Surroundings type,Living area,Living room surface,...,Cadastral income,Tenement building,Address,External reference,day_of_retrieval,ad_url,Website,As built plan,Office,Dining room
0,After signing the deed,1971,Just renovated,12 m,4,2.0,5.0,"Living area (residential, urban or rural)",197 m² square meters,47 m² square meters,...,"€ 1,279 1279 €",No,Stationstraat 30 9600 - Ronse,5411439,2023-09-23 17:11:29.870179,https://www.immoweb.be/en/classified/villa/for...,,,,
1,After signing the deed,1949,To renovate,18 m,3,1.0,,Isolated,139 m² square meters,10 m² square meters,...,€ 689 689 €,No,Rue de la Wallonie 2A 4680 - Oupeye,5534704,2023-09-23 17:11:29.995637,https://www.immoweb.be/en/classified/house/for...,http://www.nigel-immo.be,,Yes,Yes
2,After signing the deed,1920,Good,5.5 m,2,,,Urban,200 m² square meters,26 m² square meters,...,€ 917 917 €,No,Sint-Denijslaan 1 9000 - Gent,5531386,2023-09-23 17:11:32.203024,https://www.immoweb.be/en/classified/mansion/f...,http://www.immodavinci.be,No,Yes,
3,After signing the deed,1937,To renovate,8 m,2,,,Urban,230 m² square meters,,...,"€ 1,623 1623 €",No,Hoogstraat 20 9340 - Lede,5535368,2023-09-23 17:11:34.716784,https://www.immoweb.be/en/classified/house/for...,http://www.immoderas.be,,Yes,
4,Depending on the tenant,1900,Good,6.5 m,2,,,Urban,360 m² square meters,34 m² square meters,...,"€ 1,621 1621 €",No,"Mechelsesteenweg,157 2018 - Antwerpen",5534431,2023-09-23 17:11:34.857664,https://www.immoweb.be/en/classified/mansion/f...,,,,


In [3]:
def pre_process_dataframe(df):
    def extract_numbers(df, columns):
        for column in columns:
            try:
                df[column] = df[column].str.extract(r"(\d+)").astype("float32")
            except Exception as e:
                print(f"Error processing column {column}: {e}")
        return df

    def map_values(df, columns):
        for column in columns:
            try:
                df[column] = df[column].map({"Yes": True, None: False, "No": False})
            except Exception as e:
                print(f"Error processing column {column}: {e}")
        return df

    number_columns = [
        "construction_year",
        "street_frontage_width",
        "number_of_frontages",
        "covered_parking_spaces",
        "outdoor_parking_spaces",
        "living_area",
        "living_room_surface",
        "kitchen_surface",
        "bedrooms",
        "bedroom_1_surface",
        "bedroom_2_surface",
        "bedroom_3_surface",
        "bathrooms",
        "toilets",
        "surface_of_the_plot",
        "width_of_the_lot_on_the_street",
        "garden_surface",
        "primary_energy_consumption",
        "co2_emission",
        "yearly_theoretical_total_energy_consumption",
        "price",
        "cadastral_income",
    ]

    boolean_columns = [
        "basement",
        "furnished",
        "gas_water__electricity",
        "double_glazing",
        "planning_permission_obtained",
        "tv_cable",
        "dining_room",
        "proceedings_for_breach_of_planning_regulations",
        "subdivision_permit",
        "tenement_building",
    ]

    return (
        df.sort_index(axis=1)
        .fillna(np.nan)
        .rename(
            columns=lambda column: column.lower()
            .replace(" ", "_")
            .replace("&", "")
            .replace(",", "")
        )
        .rename(columns={"co₂_emission": "co2_emission"})
        .pipe(lambda df: extract_numbers(df, number_columns))
        .pipe(lambda df: map_values(df, boolean_columns))
        .assign(
            flood_zone_type=lambda df: df.flood_zone_type.map(
                {
                    "Non flood zone": False,
                    "No": False,
                    "Possible flood zone": True,
                }
            ),
            connection_to_sewer_network=lambda df: df.connection_to_sewer_network.map(
                {
                    "Connected": True,
                    "Not connected": False,
                }
            ),
            as_built_plan=lambda df: df.as_built_plan.map(
                {
                    "Yes, conform": True,
                    "No": False,
                }
            ),
        )
    )


pre_process_dataframe(df).head().style.set_sticky(axis=0)

Unnamed: 0,address,as_built_plan,available_as_of,basement,bathrooms,bedroom_1_surface,bedroom_2_surface,bedroom_3_surface,bedrooms,building_condition,co2_emission,cadastral_income,connection_to_sewer_network,construction_year,covered_parking_spaces,dining_room,double_glazing,energy_class,external_reference,flood_zone_type,furnished,garden_surface,gas_water__electricity,heating_type,kitchen_surface,kitchen_type,latest_land_use_designation,living_area,living_room_surface,number_of_frontages,office,outdoor_parking_spaces,planning_permission_obtained,possible_priority_purchase_right,price,primary_energy_consumption,proceedings_for_breach_of_planning_regulations,reference_number_of_the_epc_report,street_frontage_width,subdivision_permit,surface_of_the_plot,surroundings_type,tv_cable,tenement_building,toilets,website,width_of_the_lot_on_the_street,yearly_theoretical_total_energy_consumption,ad_url,day_of_retrieval
0,Stationstraat 30 9600 - Ronse,,After signing the deed,True,1.0,18.0,13.0,9.0,4.0,Just renovated,9784.0,1.0,True,1971.0,2.0,,True,D,5411439,False,False,485.0,True,Gas,16.0,Hyper equipped,"Living area (residential, urban or rural)",197.0,47.0,4.0,,5.0,True,Yes,465.0,371.0,False,1-RES,12.0,False,992.0,"Living area (residential, urban or rural)",True,False,2.0,,20.0,,https://www.immoweb.be/en/classified/villa/for-sale/ronse/9600/10838435,2023-09-23 17:11:29.870179
1,Rue de la Wallonie 2A 4680 - Oupeye,,After signing the deed,True,1.0,17.0,11.0,,2.0,To renovate,173.0,689.0,True,1949.0,1.0,True,True,G,5534704,False,False,315.0,,Fuel oil,13.0,Semi equipped,"Living area (residential, urban or rural)",139.0,10.0,3.0,Yes,,,No,150.0,699.0,False,20230303013078,18.0,False,413.0,Isolated,True,False,1.0,http://www.nigel-immo.be,18.0,96913.0,https://www.immoweb.be/en/classified/house/for-sale/esneux%20tilff/4130/10838582,2023-09-23 17:11:29.995637
2,Sint-Denijslaan 1 9000 - Gent,False,After signing the deed,True,1.0,22.0,21.0,13.0,3.0,Good,9544.0,917.0,True,1920.0,,,True,C,5531386,False,False,55.0,True,Gas,13.0,Installed,"Living area (residential, urban or rural)",200.0,26.0,2.0,Yes,,,No,529.0,239.0,False,2281999,5.0,False,136.0,Urban,True,False,1.0,http://www.immodavinci.be,,,https://www.immoweb.be/en/classified/mansion/for-sale/gent/9000/10835799,2023-09-23 17:11:32.203024
3,Hoogstraat 20 9340 - Lede,,After signing the deed,,,,,,3.0,To renovate,,1.0,True,1937.0,,,True,E,5535368,,False,,True,Gas,,Semi equipped,"Living area (residential, urban or rural)",230.0,,2.0,Yes,,True,No,245.0,496.0,False,20230804-0002955788-KNR-1,8.0,False,154.0,Urban,True,False,2.0,http://www.immoderas.be,8.0,,https://www.immoweb.be/en/classified/house/for-sale/sint-lievens-houtem/9520/10835368,2023-09-23 17:11:34.716784
4,"Mechelsesteenweg,157 2018 - Antwerpen",,Depending on the tenant,True,5.0,20.0,18.0,18.0,8.0,Good,22.0,1.0,True,1900.0,,,True,C,5534431,False,False,36.0,True,Gas,,Installed,,360.0,34.0,2.0,,,,,1.0,249.0,,Not specified,6.0,,173.0,Urban,True,False,6.0,,6.0,,https://www.immoweb.be/en/classified/mansion/for-sale/antwerp/2018/10835944,2023-09-23 17:11:34.857664


In [4]:
(
    df["Construction year"]
    .str.extract(r"([^a-zA-Z])", expand=True)  # NON-matching alphabetical characters
    .value_counts()
)

1    42
2     6
Name: count, dtype: int64

# Assessing Feature Cardinality

In [5]:
# Assuming df is your DataFrame
number_unique_entries = {
    "column_name": pre_processed_dataframe.columns.tolist(),
    "column_dtype": [
        pre_processed_dataframe[col].dtype for col in pre_processed_dataframe.columns
    ],
    "unique_values_pct": [
        pre_processed_dataframe[col].nunique()
        for col in pre_processed_dataframe.columns
    ],
}

result_df = (
    pd.DataFrame(number_unique_entries)
    .sort_values("unique_values_pct")
    .assign(
        unique_values_pct=lambda x: x.unique_values_pct.div(df.shape[0])
        .mul(100)
        .round(1)
    )
    .pipe(
        lambda df: ggplot(df, aes("unique_values_pct", "column_name"))
        + geom_bar(stat="identity", orientation="y")
        + labs(
            title="Assessing Feature Cardinality",
            subtitle=""" Features with a Low Cardinality (Less than 10 Distinct Values) Can Be  Utilized as Categorical Variables, 
            while Those with Higher Cardinality, typically represented as floats or integers, May Be Employed as They Are
            """,
            x="Percentage of Unique Values per Feature",
            y="",
            caption="https://www.immoweb.be/",
        )
        + theme(
            plot_subtitle=element_text(
                size=12, face="italic"
            ),  # Customize subtitle appearance
            plot_title=element_text(size=15, face="bold"),  # Customize title appearance
        )
        + ggsize(800, 1000)
    )
)
result_df

NameError: name 'pre_processed_dataframe' is not defined