Import libraries and setup

In [12]:
import pandas as pd
import numpy as np
import re

Load data and Preview first row 

In [13]:

df = pd.read_csv("immovlan_final_file.csv")
df.head()

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175 000 €,New,On contract,0.0,51 m²,,,,...,,,,,No,,No,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415 000 €,New,On contract,1.0,70 m²,No,,Yes,...,,,Yes,2.0,Yes,,Yes,20 m²,,Yes
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399 000 €,,,2.0,129 m²,,,Yes,...,Gas,Double glass,,4.0,Yes,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229 000 €,New,,2.0,82 m²,,,,...,,,Yes,3.0,No,,Yes,8 m²,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320 000 €,New,,3.0,106 m²,,,,...,,,,,Yes,,Yes,6 m²,,


Check duplicates rows and unique value IDs 

In [14]:
# check 1: Duplicate rows 
duplicate_rows = df.duplicated().sum()
print(f"\n1. Exact duplicate rows: {duplicate_rows}")

# check 2: Duplicate property IDs
if 'Property ID' in df.columns:
    duplicate_ids = df['Property ID'].duplicated().sum()
    print(f"2. Duplicate Property IDs: {duplicate_ids}")

duplicate_remove = df.drop_duplicates()
print(duplicate_remove.shape)


1. Exact duplicate rows: 0
2. Duplicate Property IDs: 0
(16309, 26)


Check for whitespace - 

lambda x: isinstance(x, str) and x != x.strip()
For each cell x in the DataFrame:
isinstance(x, str) → checks if the cell contains a string.
x.strip() → removes leading/trailing whitespace.
x != x.strip() → will be True if trimming changes the value (i.e., whitespace existed).

In [15]:
has_whitespace = df.map(lambda x: isinstance(x, str) and x != x.strip()).any().any()
print("Contains whitespace?", has_whitespace)

Contains whitespace? False


Check Data Types

In [16]:
# df.dtypes

df.info(show_counts=True, memory_usage=True, verbose=True)
df_obj = df.select_dtypes(include=['object'])
print(df_obj.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16309 entries, 0 to 16308
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   url                    16309 non-null  object 
 1   Property ID            16309 non-null  object 
 2   Price                  15725 non-null  object 
 3   State of the property  11846 non-null  object 
 4   Availability           7001 non-null   object 
 5   Number of bedrooms     14083 non-null  float64
 6   Livable surface        13294 non-null  object 
 7   Furnished              8383 non-null   object 
 8   Attic                  3661 non-null   object 
 9   Garage                 7044 non-null   object 
 10  Number of garages      3799 non-null   float64
 11  Kitchen equipment      4380 non-null   object 
 12  Kitchen type           2181 non-null   object 
 13  Number of bathrooms    12042 non-null  float64
 14  Number of showers      3538 non-null   float64
 15  Nu

Fixing columns which are numbers but shown as object 

Price                   object  ❌ Has "€" symbol
Livable surface         object  ❌ Has "m²" unit
Surface garden          object  ❌ Has "m²" unit

In [17]:
# ============================================
# STEP 1: Define cleaning function
# ============================================

def clean_numeric(value):
    """Universal numeric cleaner"""
    if pd.isna(value):
        return np.nan
    
    value = str(value)
    value = re.sub(r'[€$£,\s]', '', value)   # Remove currency, commas, spaces
    value = re.sub(r'm²?', '', value)         # Remove m² or m
    value = re.sub(r'[a-zA-Z]', '', value)    # Remove letters
    
    try:
        return float(value)
    except:
        return np.nan

# ============================================
# STEP 2: Apply to columns
# ============================================

columns = ['Price', 'Livable surface', 'Surface garden','Surface terrace', 'Total land surface']

for col in columns:
    if col in df.columns:
        # Before
        before_type = df[col].dtype
        before_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
        # Clean
        df[col] = df[col].apply(clean_numeric)
        
        # After
        after_type = df[col].dtype
        after_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
    # Report
        print(f"\n✓ {col}:")
        print(f"    {before_type} → {after_type}")
        print(f"    '{before_sample}' → {after_sample}")     


✓ Price:
    object → float64
    '175 000 €' → 175000.0

✓ Livable surface:
    object → float64
    '51 m²' → 51.0

✓ Surface garden:
    object → float64
    '315 m²' → 315.0

✓ Surface terrace:
    object → float64
    '20 m²' → 20.0

✓ Total land surface:
    object → float64
    '320 m²' → 320.0


Convert yes and no values to 1, 0

In [18]:
yes_or_no_columns = ["Furnished", "Attic", "Garage", "Elevator", "Garden", "Terrace", "Swimming pool"]

for column in yes_or_no_columns:
    if column in df.columns:
        df[column] = (      
            df[column]
            .astype(str)                     # make sure everything is text
            .str.strip()                     # remove spaces
            .str.lower()                     # make all text lowercase
            .map({
                "yes": 1, "y": 1, "true": 1, "1": 1,
                "no": 0, "n": 0, "false": 0, "0": 0
            })
        )
display(df[yes_or_no_columns].head(20))

Unnamed: 0,Furnished,Attic,Garage,Elevator,Garden,Terrace,Swimming pool
0,,,,,0.0,0.0,
1,0.0,,1.0,1.0,1.0,1.0,1.0
2,,,1.0,,1.0,,
3,,,,1.0,0.0,1.0,
4,,,,,1.0,1.0,
5,0.0,,,1.0,0.0,1.0,
6,0.0,1.0,1.0,0.0,1.0,1.0,
7,0.0,,,,1.0,1.0,
8,,,,1.0,0.0,1.0,
9,0.0,1.0,1.0,0.0,1.0,1.0,0.0


Checking for missing values

In [25]:

missing_count = df.isnull().sum()
display(missing_count)

url                      0
Property ID              0
Price                    0
State of the property    0
Availability             0
Number of bedrooms       0
Livable surface          0
Furnished                0
Attic                    0
Garage                   0
Number of garages        0
Kitchen equipment        0
Kitchen type             0
Number of bathrooms      0
Number of showers        0
Number of toilets        0
Type of heating          0
Type of glazing          0
Elevator                 0
Number of facades        0
Garden                   0
Surface garden           0
Terrace                  0
Surface terrace          0
Total land surface       0
Swimming pool            0
type                     0
postal_code              0
city                     0
dtype: int64

Replace missing values by Nan

In [23]:
# Replace empty strings with pd.NA
df = df.replace("", pd.NA)

# Convert all columns to object type to allow string "nan"
df = df.astype(object)

# Fill missing values with string "nan"
df = df.fillna("nan")

display(df)

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool,type,postal_code,city
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,0.0,,0.0,,,,studio,4000,liege
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415000.0,New,On contract,1.0,70.0,0.0,,1.0,...,2.0,1.0,,1.0,20.0,,1.0,apartment,1410,waterloo
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399000.0,,,2.0,129.0,,,1.0,...,4.0,1.0,,,,,,residence,1501,buizingen
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229000.0,New,,2.0,82.0,,,,...,3.0,0.0,,1.0,8.0,,,apartment,7000,mons
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320000.0,New,,3.0,106.0,,,,...,,1.0,,1.0,6.0,,,apartment,7000,mons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,https://immovlan.be/en/detail/villa/for-sale/1...,vwd15514,995000.0,Excellent,On contract,6.0,300.0,,,1.0,...,4.0,1.0,750.0,1.0,20.0,1448.0,1.0,villa,1440,braine-le-chateau
16305,https://immovlan.be/en/detail/investment-prope...,rbu64401,645000.0,,,4.0,,,,1.0,...,,,,1.0,,213.0,,investment-property,2530,boechout
16306,https://immovlan.be/en/detail/investment-prope...,rbt71588,649000.0,Normal,,5.0,,,,,...,2.0,,,,,110.0,,investment-property,8600,diksmuide
16307,https://immovlan.be/en/detail/apartment/for-sa...,rbu61550,239000.0,,On contract,2.0,100.0,0.0,,1.0,...,,0.0,,0.0,,,,apartment,2630,aartselaar


Extract information from URLs 

In [21]:
# Extract the parts
df[['type', 'postal_code', 'city']] = df['url'].str.extract(
    r'detail/([^/]+)/for-sale/(\d+)/([^/]+)/'
)

print(df[['type', 'postal_code', 'city']])


                      type postal_code               city
0                   studio        4000              liege
1                apartment        1410           waterloo
2                residence        1501          buizingen
3                apartment        7000               mons
4                apartment        7000               mons
...                    ...         ...                ...
16304                villa        1440  braine-le-chateau
16305  investment-property        2530           boechout
16306  investment-property        8600          diksmuide
16307            apartment        2630         aartselaar
16308            apartment        8630             veurne

[16309 rows x 3 columns]


Save cleaned data into new file 

In [24]:
df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")
df.dtypes

url                      object
Property ID              object
Price                    object
State of the property    object
Availability             object
Number of bedrooms       object
Livable surface          object
Furnished                object
Attic                    object
Garage                   object
Number of garages        object
Kitchen equipment        object
Kitchen type             object
Number of bathrooms      object
Number of showers        object
Number of toilets        object
Type of heating          object
Type of glazing          object
Elevator                 object
Number of facades        object
Garden                   object
Surface garden           object
Terrace                  object
Surface terrace          object
Total land surface       object
Swimming pool            object
type                     object
postal_code              object
city                     object
dtype: object

## Create dataframe on type using two categories - Businesses and Land 

In [35]:
df = pd.read_csv("immovlan_cleaned_file.csv")
# Clean up the 'type' column
df['type'] = df['type'].str.lower().str.replace('-', ' ').str.strip()         # removes spaces and capitalizes words
df["type"].unique()                      # display list from column 

# Define subcategories
business_types = ['commercial building', 'industrial building', 'office space', 'business surface']
land_types = ['land', 'development site', 'to parcel out site']

# Filter rows where 'type' matches any of the business or land subcategories
property_df = df[df['type'].isin(business_types + land_types)]

# Preview
display(property_df['type'].value_counts())
display(property_df.head(50))



type
land                   642
commercial building    439
development site       394
industrial building    172
office space           123
business surface        45
to parcel out site      17
Name: count, dtype: int64

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool,type,postal_code,city
21,https://immovlan.be/en/detail/commercial-build...,rbu66908,150000.0,,On contract,,,,,,...,,,,1.0,34.0,,,commercial building,1800,vilvoorde
24,https://immovlan.be/en/detail/land/for-sale/69...,vbd36859,75000.0,,,,,,,,...,,,,,,694.0,,land,6950,nassogne
33,https://immovlan.be/en/detail/land/for-sale/96...,rbu60077,180000.0,,,,,,,,...,,,,,,1514.0,,land,9600,ronse
35,https://immovlan.be/en/detail/commercial-build...,rbu50976,499000.0,,,4.0,,,,,...,,,,,,345.0,,commercial building,1570,galmaarden
37,https://immovlan.be/en/detail/land/for-sale/62...,vbd49230,65000.0,,On contract,,,,,,...,,,,,,460.0,,land,6230,pont-a-celles
47,https://immovlan.be/en/detail/industrial-build...,rbu66785,665528.0,New,,,,,,,...,,,,,,411.0,,industrial building,8000,brugge
49,https://immovlan.be/en/detail/commercial-build...,vbd47313,110000.0,,,,,,,,...,,,,,,,,commercial building,1000,brussels
79,https://immovlan.be/en/detail/land/for-sale/14...,vbd44103,255000.0,,On contract,,,,,,...,,,,,,1750.0,,land,1490,court-saint-etienne
92,https://immovlan.be/en/detail/development-site...,rbu49574,440000.0,,,,,,,,...,,,,,,707.0,,development site,3920,lommel
104,https://immovlan.be/en/detail/office-space/for...,vbd47443,279000.0,,,,,,,1.0,...,,,,,,340.0,,office space,4020,wandre
