Import libraries and setup

In [23]:
import pandas as pd
import numpy as np
import re

Load data and Preview first row 

In [24]:

df = pd.read_csv("immovlan_final_file.csv")
df.head()

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175 000 €,New,On contract,0.0,51 m²,,,,...,,,,,No,,No,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415 000 €,New,On contract,1.0,70 m²,No,,Yes,...,,,Yes,2.0,Yes,,Yes,20 m²,,Yes
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399 000 €,,,2.0,129 m²,,,Yes,...,Gas,Double glass,,4.0,Yes,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229 000 €,New,,2.0,82 m²,,,,...,,,Yes,3.0,No,,Yes,8 m²,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320 000 €,New,,3.0,106 m²,,,,...,,,,,Yes,,Yes,6 m²,,


Check duplicates rows and unique value IDs 

In [30]:
# check 1: Duplicate rows 
duplicate_rows = df.duplicated().sum()
print(f"\n1. Exact duplicate rows: {duplicate_rows}")

# check 2: Duplicate property IDs
if 'Property ID' in df.columns:
    duplicate_ids = df['Property ID'].duplicated().sum()
    print(f"2. Duplicate Property IDs: {duplicate_ids}")

duplicate_remove = df.drop_duplicates()
print(duplicate_remove.shape)


1. Exact duplicate rows: 0
2. Duplicate Property IDs: 0
(16309, 26)


Check for whitespace - 

lambda x: isinstance(x, str) and x != x.strip()
For each cell x in the DataFrame:
isinstance(x, str) → checks if the cell contains a string.
x.strip() → removes leading/trailing whitespace.
x != x.strip() → will be True if trimming changes the value (i.e., whitespace existed).

In [26]:
has_whitespace = df.map(lambda x: isinstance(x, str) and x != x.strip()).any().any()
print("Contains whitespace?", has_whitespace)

Contains whitespace? False


Check Data Types

In [27]:
# df.dtypes

df.info(show_counts=True, memory_usage=True, verbose=True)
df_obj = df.select_dtypes(include=['object'])
print(df_obj.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16309 entries, 0 to 16308
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   url                    16309 non-null  object 
 1   Property ID            16309 non-null  object 
 2   Price                  15725 non-null  object 
 3   State of the property  11846 non-null  object 
 4   Availability           7001 non-null   object 
 5   Number of bedrooms     14083 non-null  float64
 6   Livable surface        13294 non-null  object 
 7   Furnished              8383 non-null   object 
 8   Attic                  3661 non-null   object 
 9   Garage                 7044 non-null   object 
 10  Number of garages      3799 non-null   float64
 11  Kitchen equipment      4380 non-null   object 
 12  Kitchen type           2181 non-null   object 
 13  Number of bathrooms    12042 non-null  float64
 14  Number of showers      3538 non-null   float64
 15  Nu

Fixing columns which are numbers but shown as object 

Price                   object  ❌ Has "€" symbol
Livable surface         object  ❌ Has "m²" unit
Surface garden          object  ❌ Has "m²" unit

In [28]:
# ============================================
# STEP 1: Define cleaning function
# ============================================

def clean_numeric(value):
    """Universal numeric cleaner"""
    if pd.isna(value):
        return np.nan
    
    value = str(value)
    value = re.sub(r'[€$£,\s]', '', value)   # Remove currency, commas, spaces
    value = re.sub(r'm²?', '', value)         # Remove m² or m
    value = re.sub(r'[a-zA-Z]', '', value)    # Remove letters
    
    try:
        return float(value)
    except:
        return np.nan

# ============================================
# STEP 2: Apply to columns
# ============================================

columns = ['Price', 'Livable surface', 'Surface garden','Surface terrace', 'Total land surface']

for col in columns:
    if col in df.columns:
        # Before
        before_type = df[col].dtype
        before_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
        # Clean
        df[col] = df[col].apply(clean_numeric)
        
        # After
        after_type = df[col].dtype
        after_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
    # Report
        print(f"\n✓ {col}:")
        print(f"    {before_type} → {after_type}")
        print(f"    '{before_sample}' → {after_sample}")     


✓ Price:
    object → float64
    '175 000 €' → 175000.0

✓ Livable surface:
    object → float64
    '51 m²' → 51.0

✓ Surface garden:
    object → float64
    '315 m²' → 315.0

✓ Surface terrace:
    object → float64
    '20 m²' → 20.0

✓ Total land surface:
    object → float64
    '320 m²' → 320.0


Convert yes and no values to 1, 0

In [None]:
yes_or_no_columns = ["Furnished", "Attic", "Garage", "Elevator", "Garden", "Terrace", "Swimming pool"]

for column in yes_or_no_columns:
    if column in df.columns:
        df[column] = (      
            df[column]
            .astype(str)                     # make sure everything is text
            .str.strip()                     # remove spaces
            .str.lower()                     # make all text lowercase
            .map({
                "yes": 1, "y": 1, "true": 1, "1": 1,
                "no": 0, "n": 0, "false": 0, "0": 0
            })
        )
print(df[yes_or_no_columns].head(20))

    Furnished  Attic  Garage  Elevator  Garden  Terrace  Swimming pool
0         NaN    NaN     NaN       NaN     0.0      0.0            NaN
1         0.0    NaN     1.0       1.0     1.0      1.0            1.0
2         NaN    NaN     1.0       NaN     1.0      NaN            NaN
3         NaN    NaN     NaN       1.0     0.0      1.0            NaN
4         NaN    NaN     NaN       NaN     1.0      1.0            NaN
5         0.0    NaN     NaN       1.0     0.0      1.0            NaN
6         0.0    1.0     1.0       0.0     1.0      1.0            NaN
7         0.0    NaN     NaN       NaN     1.0      1.0            NaN
8         NaN    NaN     NaN       1.0     0.0      1.0            NaN
9         0.0    1.0     1.0       0.0     1.0      1.0            0.0
10        NaN    NaN     1.0       0.0     NaN      1.0            NaN
11        NaN    NaN     NaN       1.0     0.0      1.0            0.0
12        NaN    0.0     1.0       1.0     0.0      1.0            0.0
13    

Checking for missing values

In [None]:

missing_count = df.isnull().sum()
print(missing_count)

url                          0
Property ID                  0
Price                      584
State of the property     4463
Availability              9308
Number of bedrooms        2226
Livable surface           3015
Furnished                 7926
Attic                    12648
Garage                    9265
Number of garages        12510
Kitchen equipment        11929
Kitchen type             14128
Number of bathrooms       4267
Number of showers        12771
Number of toilets         6295
Type of heating           6966
Type of glazing           9232
Elevator                  5486
Number of facades         6492
Garden                    3523
Surface garden           13578
Terrace                   2721
Surface terrace           8861
Total land surface        9319
Swimming pool            12389
dtype: int64


Replace missing values by Nan

In [None]:
# Replace empty strings with pd.NA
df = df.replace("", pd.NA)

# Convert all columns to object type to allow string "nan"
df = df.astype(object)

# Fill missing values with string "nan"
df = df.fillna("nan")

print(df)

                                                     url Property ID  \
0      https://immovlan.be/en/detail/studio/for-sale/...    vbd20021   
1      https://immovlan.be/en/detail/apartment/for-sa...    vbd30235   
2      https://immovlan.be/en/detail/residence/for-sa...    vbd46297   
3      https://immovlan.be/en/detail/apartment/for-sa...    vbd36813   
4      https://immovlan.be/en/detail/apartment/for-sa...    vbb60643   
...                                                  ...         ...   
16304  https://immovlan.be/en/detail/villa/for-sale/1...    vwd15514   
16305  https://immovlan.be/en/detail/investment-prope...    rbu64401   
16306  https://immovlan.be/en/detail/investment-prope...    rbt71588   
16307  https://immovlan.be/en/detail/apartment/for-sa...    rbu61550   
16308  https://immovlan.be/en/detail/apartment/for-sa...    rbu65159   

          Price State of the property Availability Number of bedrooms  \
0      175000.0                   New  On contract            

Extract information from URLs 

In [40]:
# Extract the parts
df[['type', 'postal_code', 'city']] = df['url'].str.extract(
    r'detail/([^/]+)/for-sale/(\d+)/([^/]+)/'
)

print(df[['type', 'postal_code', 'city']])


                      type postal_code               city
0                   studio        4000              liege
1                apartment        1410           waterloo
2                residence        1501          buizingen
3                apartment        7000               mons
4                apartment        7000               mons
...                    ...         ...                ...
16304                villa        1440  braine-le-chateau
16305  investment-property        2530           boechout
16306  investment-property        8600          diksmuide
16307            apartment        2630         aartselaar
16308            apartment        8630             veurne

[16309 rows x 3 columns]


Save cleaned data into new file 

In [42]:
df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")
df.dtypes

url                      object
Property ID              object
Price                    object
State of the property    object
Availability             object
Number of bedrooms       object
Livable surface          object
Furnished                object
Attic                    object
Garage                   object
Number of garages        object
Kitchen equipment        object
Kitchen type             object
Number of bathrooms      object
Number of showers        object
Number of toilets        object
Type of heating          object
Type of glazing          object
Elevator                 object
Number of facades        object
Garden                   object
Surface garden           object
Terrace                  object
Surface terrace          object
Total land surface       object
Swimming pool            object
type                     object
postal_code              object
city                     object
dtype: object