## Importing and reading the file:

In [50]:
import pandas as pd

df = pd.read_csv("immovlan_final_file.csv", encoding="utf-8", on_bad_lines="skip")   # on_bad_lines="skip" drops malformed rows silently (no error, no warning), so the row count may be lower than the file's line count

## Removing duplicate rows:

In [51]:
# The frame contains an "Unnamed: 0" column, which looks like a row index saved by an earlier
# to_csv call. If it differs on every row, comparing whole rows can never find a duplicate,
# so it is left out of the comparison (the column itself is kept in the result).
# NOTE(review): assumes "Unnamed: 0" carries no real information — confirm against the scraper.
comparison_columns = [name for name in df.columns if name != "Unnamed: 0"]
df = df.drop_duplicates(subset=comparison_columns)

## Strip leading and trailing whitespace from strings:

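`applymap()` (renamed `DataFrame.map()` in pandas 2.1) -> applies a function to every single cell in the table, one cell at a time <br>
`lambda x:` -> an anonymous function that is called with each cell value as `x` <br>
`x.strip()` -> removes leading and trailing whitespace from a string <br>
`if isinstance(x, str)` -> checks whether `x` is a string <br>
`else x` -> if it is not a string, the value is left unchanged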


In [52]:
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map; fall back to
# applymap only on older pandas versions that do not provide DataFrame.map.
_cellwise = df.map if hasattr(df, "map") else df.applymap
# Trim surrounding whitespace from every string cell; non-string cells pass through unchanged.
df = _cellwise(lambda x: x.strip() if isinstance(x, str) else x)


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



## Fix encoding:

In [53]:
def fix_encoding(text):
    """Repair mojibake, i.e. UTF-8 text that was decoded with a single-byte codec.

    The string is turned back into bytes with the codec that was presumably used by
    mistake and those bytes are decoded as UTF-8. Latin-1 is tried first, then
    Windows-1252, which covers characters such as "€" or "™" that Latin-1 cannot encode
    (e.g. the broken apostrophe "â€™").

    Args:
        text: Any cell value. Non-string values are returned untouched.

    Returns:
        The repaired string when one of the round trips succeeds, otherwise the
        original value unchanged.
    """
    if not isinstance(text, str):
        return text
    for codec in ("latin1", "cp1252"):
        try:
            # Re-create the raw bytes, then read them as the UTF-8 they originally were.
            return text.encode(codec).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not representable in this codec, or the bytes are not valid UTF-8:
            # the text was not mangled this way, so try the next codec.
            continue
    return text

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map; fall back to
# applymap only on older pandas versions that do not provide DataFrame.map.
_cellwise = df.map if hasattr(df, "map") else df.applymap
df = _cellwise(fix_encoding)


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



## Convert yes/no values to 1/0:

In [54]:
yes_or_no_columns = ["Furnished", "Attic", "Garage", "Elevator", "Garden", "Terrace", "Swimming pool"]
yes_no_codes = {"yes": 1, "no": 0}
for column in yes_or_no_columns:
    # Normalise every value to lowercase text without surrounding spaces before the lookup.
    normalised = df[column].astype(str).str.strip().str.lower()
    # Anything that is not "yes" or "no" (missing values included) has no entry and becomes NaN.
    df[column] = normalised.map(yes_no_codes)

## Cleaning price and area columns:

re.sub(pattern, replacement, string) -> replaces text that matches pattern with replacement

In [55]:

import re

def clean_price(x):
    """Parse a raw price cell into a float.

    Everything except digits, comma, dot and minus is stripped (currency symbols,
    spaces, text). Separators are then interpreted as follows:

    * one comma and no dot ("1,5")            -> the comma is the decimal mark
    * one comma after the last dot ("1.234,56") -> dots are thousands separators,
      the comma is the decimal mark
    * any other comma ("1,234.56", "1,234,567") -> commas are thousands separators

    A dot without any comma is always read as a decimal point, so "175.000"
    parses as 175.0.

    Args:
        x: The cell value; non-strings are converted with str() first.

    Returns:
        The price as a float, or None when no number can be parsed (pandas stores
        None as NaN in a numeric column).
    """
    if not isinstance(x, str):
        x = str(x)
    # [^\d,.-] matches anything that is NOT a digit, comma, dot or minus sign.
    x = re.sub(r'[^\d,.-]', '', x)
    if x.count(',') == 1:
        if '.' not in x:
            # Lone comma: European decimal mark.
            x = x.replace(',', '.')
        elif x.rfind(',') > x.rfind('.'):
            # European grouping such as "1.234,56": drop the dots, keep the comma as decimal.
            x = x.replace('.', '').replace(',', '.')
    # Any comma still present is a thousands separator.
    x = x.replace(',', '')
    try:
        return float(x)
    except ValueError:
        # Empty or malformed remainder (e.g. "", "-", "1.2.3").
        return None

# Turn the raw price text into floats; unparseable cells end up as missing values.
df['Price'] = df['Price'].apply(clean_price)


# --- Area columns: drop the square-metre unit, then convert to numbers ---
area_columns = ['Livable surface', 'Surface garden', 'Surface terrace', 'Total land surface']
for column in area_columns:
    area_text = df[column].astype(str)
    # Remove both spellings of the unit as plain text (no regex).
    for unit in ('m²', 'm2'):
        area_text = area_text.str.replace(unit, '', regex=False)
    # Whatever is not a valid number after trimming is coerced to NaN.
    df[column] = pd.to_numeric(area_text.str.strip(), errors='coerce')

## Drop the rows where price is NaN:

In [56]:
# Keep only the listings that have a usable price.
df = df[df['Price'].notna()]

## Check output:

In [57]:
# Preview the first 20 cleaned rows.
df.head(n=20)

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,,,,0.0,,0.0,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415000.0,New,On contract,1.0,70.0,0.0,,1.0,...,,,1.0,2.0,1.0,,1.0,20.0,,1.0
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399000.0,,,2.0,129.0,,,1.0,...,Gas,Double glass,,4.0,1.0,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229000.0,New,,2.0,82.0,,,,...,,,1.0,3.0,0.0,,1.0,8.0,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320000.0,New,,3.0,106.0,,,,...,,,,,1.0,,1.0,6.0,,
5,https://immovlan.be/en/detail/apartment/for-sa...,vbd46661,185000.0,Excellent,,1.0,56.0,0.0,,,...,Gas,,1.0,4.0,0.0,,1.0,7.0,,
6,https://immovlan.be/en/detail/residence/for-sa...,vbd11776,190000.0,,,5.0,142.0,0.0,1.0,1.0,...,Fuel oil,,0.0,3.0,1.0,,1.0,,320.0,
8,https://immovlan.be/en/detail/apartment/for-sa...,vbd36992,329000.0,New,,3.0,113.0,,,,...,,,1.0,2.0,0.0,,1.0,9.0,,
9,https://immovlan.be/en/detail/residence/for-sa...,rbu64988,327850.0,To be renovated,On contract,3.0,185.0,0.0,1.0,1.0,...,Fuel oil,Double glass,0.0,4.0,1.0,315.0,1.0,21.0,683.0,0.0
10,https://immovlan.be/en/detail/investment-prope...,rbu62634,1475000.0,New,On contract,12.0,,,,1.0,...,Hot air,Double glass,0.0,4.0,,,1.0,,624.0,


## Extracting postcode and city name:

In [58]:
def extract_city_postcode(url):
    """Extract the postcode and city from a listing URL.

    The first path segment made of exactly four digits is taken as the postcode and
    the segment right after it as the city, e.g. ".../4000/liege/vbd20021" gives
    ("4000", "Liege").

    Args:
        url: The listing URL; any value is accepted and converted with str().

    Returns:
        A two-element pandas Series [postcode, city]. The postcode is the four-digit
        string; the city has hyphens replaced by spaces and is title-cased. Both
        elements are None when the URL holds no such pattern.
    """
    # [^/]* limits the city to one path segment. A greedy ".*" would run to the last
    # slash and pull later segments (listing id, trailing slash) into the city name.
    match = re.search(r"/(\d{4})/([^/]*)/", str(url))
    if match is None:
        return pd.Series([None, None])
    postcode = match.group(1)
    city = match.group(2).replace("-", " ").title()
    return pd.Series([postcode, city])

# Parse every URL once; the result has one column for the postcode and one for the city.
location_parts = df["url"].apply(extract_city_postcode)
df[["postcode", "city"]] = location_parts

# Store postcodes as numbers; anything that is not numeric becomes NaN.
df["postcode"] = pd.to_numeric(df["postcode"], errors="coerce")


## Save the clean DataFrame to CSV File:

In [59]:
# Write the cleaned table to disk: no index column, UTF-8 with a byte-order mark,
# and missing values spelled out as "NaN".
df.to_csv(
    "cleaned_data.csv",
    index=False,
    encoding='utf-8-sig',
    na_rep='NaN',
)


## Creating apartments DataFrame:

In [60]:
# NOTE(review): this loads "immovlan_cleaned_file.csv", which is not the "cleaned_data.csv"
# written by the save cell of this notebook. The cells below rely on a "type" column that no
# cell here creates, so the file presumably comes from a separate step — confirm its provenance,
# otherwise a fresh "Restart & Run All" cannot reproduce the charts.
df =  pd.read_csv("immovlan_cleaned_file.csv", encoding="utf-8")

In [61]:
# Remove price outliers with the 1.5 * IQR rule (both fences inclusive).
for col in ["Price"]:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    inside_fences = (df[col] >= lower_bound) & (df[col] <= upper_bound)
    df = df[inside_fences]



In [62]:
# Property types that count as an apartment for this analysis.
apartment_types = [
    "apartment",
    "ground floor",
    "penthouse",
    "studio",
    "duplex",
    "loft",
    "triplex",
]
is_apartment = df["type"].isin(apartment_types)
apartments = df[is_apartment]

apartments.head(10)

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool,type,postal_code,city
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,0.0,,0.0,,,,studio,4000,liege
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415000.0,New,On contract,1.0,70.0,0.0,,1.0,...,2.0,1.0,,1.0,20.0,,1.0,apartment,1410,waterloo
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229000.0,New,,2.0,82.0,,,,...,3.0,0.0,,1.0,8.0,,,apartment,7000,mons
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320000.0,New,,3.0,106.0,,,,...,,1.0,,1.0,6.0,,,apartment,7000,mons
5,https://immovlan.be/en/detail/apartment/for-sa...,vbd46661,185000.0,Excellent,,1.0,56.0,0.0,,,...,4.0,0.0,,1.0,7.0,,,apartment,1400,nivelles
8,https://immovlan.be/en/detail/apartment/for-sa...,vbd36992,329000.0,New,,3.0,113.0,,,,...,2.0,0.0,,1.0,9.0,,,apartment,1480,tubize
11,https://immovlan.be/en/detail/apartment/for-sa...,vbd46746,679000.0,New,On contract,3.0,150.0,,,,...,2.0,0.0,,1.0,6.0,,0.0,apartment,1170,watermaal-bosvoorde
12,https://immovlan.be/en/detail/apartment/for-sa...,rbt86438,224250.0,New,,1.0,71.0,,0.0,1.0,...,2.0,0.0,,1.0,,,0.0,apartment,8900,ieper
13,https://immovlan.be/en/detail/apartment/for-sa...,vbd34109,229900.0,Excellent,Immediately,2.0,77.0,0.0,,1.0,...,4.0,0.0,,0.0,,,0.0,apartment,6790,aubange
16,https://immovlan.be/en/detail/apartment/for-sa...,rbu46377,537720.0,New,On contract,3.0,170.0,0.0,,1.0,...,4.0,1.0,,1.0,21.0,,,apartment,1982,elewijt


In [63]:
import plotly.express as px

# Mean price per city as a two-column frame (city, Price).
avg_price_city = apartments.groupby("city")["Price"].mean().reset_index()

# Bar chart of the averages, coloured by the price itself.
price_chart_options = dict(
    x="city",
    y="Price",
    title="Average Apartment Price by City",
    labels={"city": "City", "Price": "Average Price (€)"},
    color="Price",
)
fig = px.bar(avg_price_city, **price_chart_options)
fig.show()


In [64]:

# Number of apartment listings per city, most frequent city first.
count_by_city = (
    apartments["city"]
    .value_counts()
    .rename_axis("city")
    .reset_index(name="count")
)

listing_chart_options = dict(
    x="city",
    y="count",
    title="Number of Apartment Listings per City",
    labels={"city": "City", "count": "Number of Listings"},
)
fig = px.bar(count_by_city, **listing_chart_options)
fig.show()

In [65]:
import plotly.express as px

# Listings per apartment type as a two-column frame (Type, Count).
type_counts = (
    apartments["type"]
    .value_counts()
    .rename_axis("Type")
    .reset_index(name="Count")
)

# Bar chart: one coloured bar per type, with the count printed above each bar.
fig = px.bar(
    type_counts,
    x="Type",
    y="Count",
    title="Number of Listings by Apartment Type",
    color="Type",
    text="Count",
)
fig.update_traces(textposition="outside")
# The x axis already names every type, so the legend is hidden.
fig.update_layout(showlegend=False)
fig.show()

# Pie chart: the same counts shown as shares of all apartment listings.
fig = px.pie(
    type_counts,
    names="Type",
    values="Count",
    title="Share of Listings by Apartment Type",
)
fig.show()

