## Importing and reading the file:

In [53]:
import pandas as pd

df = pd.read_csv("immovlan_final_file.csv", encoding="utf-8", on_bad_lines="skip")   #skips any broken lines that pandas can’t read

## Removing duplicate rows:

In [54]:
df = df.drop_duplicates()

## Remove whitespaces from strings:

applymap() -> applies a function to every single cell in the table (goes cell by cell) <br>
lambda x: -> for each cell 'do this' <br>
x.strip() -> removes extra spaces from a string <br>
if isinstance(x, str) -> checks if the x is a string <br>
else x -> if not a string, leave it


In [55]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


## Fix encoding:

In [56]:
def fix_encoding(text):
    if isinstance(text, str):
        try:
            return text.encode('latin1').decode('utf-8')   #convert the text to bytes using Latin1 encoding, then convert bytes back to proper UTF-8 text
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text
    return text

df = df.applymap(fix_encoding)

  df = df.applymap(fix_encoding)


## Convert yes and no values to 0, 1:

In [57]:
yes_or_no_columns = ["Furnished", "Attic", "Garage", "Elevator", "Garden", "Terrace", "Swimming pool"]
for column in yes_or_no_columns:
    df[column] = (
        df[column]
        .astype(str)  # make sure everything is text
        .str.strip()  # remove spaces
        .str.lower()  # make all text lowercase
        .map({"yes": 0, "no": 1})  # missing values become NaN automatically
    )

## Cleaning price and area columns:

re.sub(pattern, replacement, string) -> replaces text that matches pattern with replacement

In [58]:

import re

def clean_price(x):
    if not isinstance(x, str):          # converting cell to string if it’s not already
        x = str(x)
    # remove all non-digit characters except comma, dot, minus
    x = re.sub(r'[^\d,.-]', '', x)       #[^\d,.-] -> match anything that is NOT a digit, comma, dot, or minus
    # convert European decimal comma if needed
    if ',' in x and x.count(',') == 1 and '.' not in x:  # there is no dot so we don’t accidentally replace the wrong thing
        x = x.replace(',', '.')
    # remove leftover commas (thousands separators)
    x = x.replace(',', '')
    try:
        return float(x)     # convert the result to a float
    except:
        return None         # if cannot convert-> becomes NaN

df['Price'] = df['Price'].apply(clean_price)


# --- Clean area columns ---
area_columns = ['Livable surface', 'Surface garden', 'Surface terrace', 'Total land surface']
for column in area_columns:
    df[column] = (
        df[column]
        .astype(str)
        .str.replace('m²', '', regex=False)
        .str.replace('m2', '', regex=False)
        .str.strip()
    )
    df[column] = pd.to_numeric(df[column], errors='coerce')  # if cannot convert-> becomes NaN

## Drop the rows where price is NaN:

In [59]:
df = df.dropna(subset=['Price'])         #removes every row where the price is NaN

## Check output:

In [60]:
df.head(20)

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,,,,1.0,,1.0,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415000.0,New,On contract,1.0,70.0,1.0,,0.0,...,,,0.0,2.0,0.0,,0.0,20.0,,0.0
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399000.0,,,2.0,129.0,,,0.0,...,Gas,Double glass,,4.0,0.0,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229000.0,New,,2.0,82.0,,,,...,,,0.0,3.0,1.0,,0.0,8.0,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320000.0,New,,3.0,106.0,,,,...,,,,,0.0,,0.0,6.0,,
5,https://immovlan.be/en/detail/apartment/for-sa...,vbd46661,185000.0,Excellent,,1.0,56.0,1.0,,,...,Gas,,0.0,4.0,1.0,,0.0,7.0,,
6,https://immovlan.be/en/detail/residence/for-sa...,vbd11776,190000.0,,,5.0,142.0,1.0,0.0,0.0,...,Fuel oil,,1.0,3.0,0.0,,0.0,,320.0,
8,https://immovlan.be/en/detail/apartment/for-sa...,vbd36992,329000.0,New,,3.0,113.0,,,,...,,,0.0,2.0,1.0,,0.0,9.0,,
9,https://immovlan.be/en/detail/residence/for-sa...,rbu64988,327850.0,To be renovated,On contract,3.0,185.0,1.0,0.0,0.0,...,Fuel oil,Double glass,1.0,4.0,0.0,315.0,0.0,21.0,683.0,1.0
10,https://immovlan.be/en/detail/investment-prope...,rbu62634,1475000.0,New,On contract,12.0,,,,0.0,...,Hot air,Double glass,1.0,4.0,,,0.0,,624.0,


## Extracting postcode and city name:

In [None]:
def extract_city_postcode(url):
    match = re.search(r"/(\d{4})/(.*)/", str(url))    # looks for a pattern in the text  /4 digits between slashes/ followed by the next text () puts them into seperate groups
    if match:
        postcode = match.group(1)        # gives us the first part of the regex
        city = match.group(2).replace("-", " ").title()   # gives us the second part + replaces the - with space + capitalizes the city
        return pd.Series([postcode, city])          # returns both values together
    else:
        return pd.Series([None, None])

df[["postcode", "city"]] = df["url"].apply(extract_city_postcode)    # run the code

df["postcode"] = pd.to_numeric(df["postcode"], errors="coerce")     # make it numeric


## Save the clean DataFrame to CSV File:

In [62]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv("cleaned_data.csv", index=False, encoding='utf-8-sig', na_rep='NaN')
