## Import libraries and setup

In [2]:
import pandas as pd
import numpy as np
import re


## Load data and Preview first row 

In [3]:

df = pd.read_csv("immovlan_final_file.csv")
df.head()

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175‚ÄØ000 ‚Ç¨,New,On contract,0.0,51 m¬≤,,,,...,,,,,No,,No,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415‚ÄØ000 ‚Ç¨,New,On contract,1.0,70 m¬≤,No,,Yes,...,,,Yes,2.0,Yes,,Yes,20 m¬≤,,Yes
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399‚ÄØ000 ‚Ç¨,,,2.0,129 m¬≤,,,Yes,...,Gas,Double glass,,4.0,Yes,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229‚ÄØ000 ‚Ç¨,New,,2.0,82 m¬≤,,,,...,,,Yes,3.0,No,,Yes,8 m¬≤,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320‚ÄØ000 ‚Ç¨,New,,3.0,106 m¬≤,,,,...,,,,,Yes,,Yes,6 m¬≤,,


## Check duplicates rows and unique value IDs 

In [4]:
# check 1: Duplicate rows 
duplicate_rows = df.duplicated().sum()
print(f"\n1. Exact duplicate rows: {duplicate_rows}")

# check 2: Duplicate property IDs
if 'Property ID' in df.columns:
    duplicate_ids = df['Property ID'].duplicated().sum()
    print(f"2. Duplicate Property IDs: {duplicate_ids}")

duplicate_remove = df.drop_duplicates()
print(duplicate_remove.shape)


1. Exact duplicate rows: 0
2. Duplicate Property IDs: 0
(16309, 26)


## Check for whitespace - 

lambda x: isinstance(x, str) and x != x.strip()
For each cell x in the DataFrame:
isinstance(x, str) ‚Üí checks if the cell contains a string.
x.strip() ‚Üí removes leading/trailing whitespace.
x != x.strip() ‚Üí will be True if trimming changes the value (i.e., whitespace existed).

In [5]:
has_whitespace = df.map(lambda x: isinstance(x, str) and x != x.strip()).any().any()
print("Contains whitespace?", has_whitespace)

Contains whitespace? False


## Check Data Types

In [6]:
# df.dtypes

df.info(show_counts=True, memory_usage=True, verbose=True)
df_obj = df.select_dtypes(include=['object'])
print(df_obj.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16309 entries, 0 to 16308
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   url                    16309 non-null  object 
 1   Property ID            16309 non-null  object 
 2   Price                  15725 non-null  object 
 3   State of the property  11846 non-null  object 
 4   Availability           7001 non-null   object 
 5   Number of bedrooms     14083 non-null  float64
 6   Livable surface        13294 non-null  object 
 7   Furnished              8383 non-null   object 
 8   Attic                  3661 non-null   object 
 9   Garage                 7044 non-null   object 
 10  Number of garages      3799 non-null   float64
 11  Kitchen equipment      4380 non-null   object 
 12  Kitchen type           2181 non-null   object 
 13  Number of bathrooms    12042 non-null  float64
 14  Number of showers      3538 non-null   float64
 15  Nu

## Fixing columns which are numbers but shown as object 

Price                   object  ‚ùå Has "‚Ç¨" symbol
Livable surface         object  ‚ùå Has "m¬≤" unit
Surface garden          object  ‚ùå Has "m¬≤" unit

In [7]:
# ============================================
# STEP 1: Define cleaning function
# ============================================

def clean_numeric(value):
    """Universal numeric cleaner"""
    if pd.isna(value):
        return np.nan
    
    value = str(value)
    value = re.sub(r'[‚Ç¨$¬£,\s]', '', value)   # Remove currency, commas, spaces
    value = re.sub(r'm¬≤?', '', value)         # Remove m¬≤ or m
    value = re.sub(r'[a-zA-Z]', '', value)    # Remove letters
    
    try:
        return float(value)
    except:
        return np.nan

# ============================================
# STEP 2: Apply to columns
# ============================================

columns = ['Price', 'Livable surface', 'Surface garden','Surface terrace', 'Total land surface']

for col in columns:
    if col in df.columns:
        # Before
        before_type = df[col].dtype
        before_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
        # Clean
        df[col] = df[col].apply(clean_numeric)
        
        # After
        after_type = df[col].dtype
        after_sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else None
        
    # Report
        print(f"\n‚úì {col}:")
        print(f"    {before_type} ‚Üí {after_type}")
        print(f"    '{before_sample}' ‚Üí {after_sample}")     


‚úì Price:
    object ‚Üí float64
    '175‚ÄØ000 ‚Ç¨' ‚Üí 175000.0

‚úì Livable surface:
    object ‚Üí float64
    '51 m¬≤' ‚Üí 51.0

‚úì Surface garden:
    object ‚Üí float64
    '315 m¬≤' ‚Üí 315.0

‚úì Surface terrace:
    object ‚Üí float64
    '20 m¬≤' ‚Üí 20.0

‚úì Total land surface:
    object ‚Üí float64
    '320 m¬≤' ‚Üí 320.0


## Convert yes and no values to 1, 0

In [8]:
yes_or_no_columns = ["Furnished", "Attic", "Garage", "Elevator", "Garden", "Terrace", "Swimming pool"]

for column in yes_or_no_columns:
    if column in df.columns:
        df[column] = (      
            df[column]
            .astype(str)                     # make sure everything is text
            .str.strip()                     # remove spaces
            .str.lower()                     # make all text lowercase
            .map({
                "yes": 1, "y": 1, "true": 1, "1": 1,
                "no": 0, "n": 0, "false": 0, "0": 0
            })
        )
display(df[yes_or_no_columns].head(20))

Unnamed: 0,Furnished,Attic,Garage,Elevator,Garden,Terrace,Swimming pool
0,,,,,0.0,0.0,
1,0.0,,1.0,1.0,1.0,1.0,1.0
2,,,1.0,,1.0,,
3,,,,1.0,0.0,1.0,
4,,,,,1.0,1.0,
5,0.0,,,1.0,0.0,1.0,
6,0.0,1.0,1.0,0.0,1.0,1.0,
7,0.0,,,,1.0,1.0,
8,,,,1.0,0.0,1.0,
9,0.0,1.0,1.0,0.0,1.0,1.0,0.0


## Checking for missing values

In [9]:

missing_count = df.isnull().sum()
display(missing_count)

url                          0
Property ID                  0
Price                      584
State of the property     4463
Availability              9308
Number of bedrooms        2226
Livable surface           3015
Furnished                 7926
Attic                    12648
Garage                    9265
Number of garages        12510
Kitchen equipment        11929
Kitchen type             14128
Number of bathrooms       4267
Number of showers        12771
Number of toilets         6295
Type of heating           6966
Type of glazing           9232
Elevator                  5486
Number of facades         6492
Garden                    3523
Surface garden           13578
Terrace                   2721
Surface terrace           8861
Total land surface        9319
Swimming pool            12389
dtype: int64

## Replace missing values by Nan

In [10]:
# Replace empty strings with pd.NA
df = df.replace("", pd.NA)

# Convert all columns to object type to allow string "nan"
df = df.astype(object)

# Fill missing values with string "nan"
df = df.fillna("nan")

display(df)

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Type of heating,Type of glazing,Elevator,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,,,,0.0,,0.0,,,
1,https://immovlan.be/en/detail/apartment/for-sa...,vbd30235,415000.0,New,On contract,1.0,70.0,0.0,,1.0,...,,,1.0,2.0,1.0,,1.0,20.0,,1.0
2,https://immovlan.be/en/detail/residence/for-sa...,vbd46297,399000.0,,,2.0,129.0,,,1.0,...,Gas,Double glass,,4.0,1.0,,,,,
3,https://immovlan.be/en/detail/apartment/for-sa...,vbd36813,229000.0,New,,2.0,82.0,,,,...,,,1.0,3.0,0.0,,1.0,8.0,,
4,https://immovlan.be/en/detail/apartment/for-sa...,vbb60643,320000.0,New,,3.0,106.0,,,,...,,,,,1.0,,1.0,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,https://immovlan.be/en/detail/villa/for-sale/1...,vwd15514,995000.0,Excellent,On contract,6.0,300.0,,,1.0,...,Not specified,,,4.0,1.0,750.0,1.0,20.0,1448.0,1.0
16305,https://immovlan.be/en/detail/investment-prope...,rbu64401,645000.0,,,4.0,,,,1.0,...,,,,,,,1.0,,213.0,
16306,https://immovlan.be/en/detail/investment-prope...,rbt71588,649000.0,Normal,,5.0,,,,,...,Gas,,0.0,2.0,,,,,110.0,
16307,https://immovlan.be/en/detail/apartment/for-sa...,rbu61550,239000.0,,On contract,2.0,100.0,0.0,,1.0,...,Electricity,,1.0,,0.0,,0.0,,,


Extract information from URLs 

In [11]:
# Extract the parts
df[['type', 'postal_code', 'city']] = df['url'].str.extract(
    r'detail/([^/]+)/for-sale/(\d+)/([^/]+)/'
)

print(df[['type', 'postal_code', 'city']])


                      type postal_code               city
0                   studio        4000              liege
1                apartment        1410           waterloo
2                residence        1501          buizingen
3                apartment        7000               mons
4                apartment        7000               mons
...                    ...         ...                ...
16304                villa        1440  braine-le-chateau
16305  investment-property        2530           boechout
16306  investment-property        8600          diksmuide
16307            apartment        2630         aartselaar
16308            apartment        8630             veurne

[16309 rows x 3 columns]


## Save cleaned data into new file 

In [12]:
df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")
df.head(1)



Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool,type,postal_code,city
0,https://immovlan.be/en/detail/studio/for-sale/...,vbd20021,175000.0,New,On contract,0.0,51.0,,,,...,,0.0,,0.0,,,,studio,4000,liege


## Create dataframe on type using two categories - Businesses and Land 

In [13]:
df = pd.read_csv("immovlan_cleaned_file.csv")
# Clean up the 'type' column
df['type'] = df['type'].str.lower().str.replace('-', ' ').str.strip()         # removes spaces and capitalizes words
df["type"].unique()                      # display list from column 

# Define subcategories
business_types = ['commercial building', 'industrial building', 'office space', 'business surface']
land_types = ['land', 'development site', 'to parcel out site']

# Filter rows where 'type' matches any of the business or land subcategories
property_df = df[df['type'].isin(business_types + land_types)]

# Preview
display(property_df['type'].value_counts())
display(property_df.head())



type
land                   642
commercial building    439
development site       394
industrial building    172
office space           123
business surface        45
to parcel out site      17
Name: count, dtype: int64

Unnamed: 0,url,Property ID,Price,State of the property,Availability,Number of bedrooms,Livable surface,Furnished,Attic,Garage,...,Number of facades,Garden,Surface garden,Terrace,Surface terrace,Total land surface,Swimming pool,type,postal_code,city
21,https://immovlan.be/en/detail/commercial-build...,rbu66908,150000.0,,On contract,,,,,,...,,,,1.0,34.0,,,commercial building,1800,vilvoorde
24,https://immovlan.be/en/detail/land/for-sale/69...,vbd36859,75000.0,,,,,,,,...,,,,,,694.0,,land,6950,nassogne
33,https://immovlan.be/en/detail/land/for-sale/96...,rbu60077,180000.0,,,,,,,,...,,,,,,1514.0,,land,9600,ronse
35,https://immovlan.be/en/detail/commercial-build...,rbu50976,499000.0,,,4.0,,,,,...,,,,,,345.0,,commercial building,1570,galmaarden
37,https://immovlan.be/en/detail/land/for-sale/62...,vbd49230,65000.0,,On contract,,,,,,...,,,,,,460.0,,land,6230,pont-a-celles


## Regions 
A. Deterministic postcode -> region mapping 

In [14]:
def postcode_to_region(pc):
    if pd.isna(pc):
        return "Unknown"
    pc = int(pc)

    # Brussels
    if 1000 <= pc <= 1299:
        return "Brussels"

    # Wallonia
    if 1300 <= pc <= 1499:
        return "Wallonia"
    if 4000 <= pc <= 7999:
        return "Wallonia"

    # Flanders
    if 1500 <= pc <= 3999:
        return "Flanders"
    if 8000 <= pc <= 9999:
        return "Flanders"

    return "Unknown"

df["Region"] = df["postal_code"].apply(postcode_to_region)

# Test 
print("Region distribution:")
print(df["Region"].value_counts(dropna=False))

df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")

Region distribution:
Region
Wallonia    8142
Flanders    6724
Brussels    1443
Name: count, dtype: int64


B. Map postcode to province

In [15]:
provinces = {
    "brussels": list(range(1000, 1300)),
    "brabant_walloon": list(range(1300, 1500)),
    "brabant_flemish": list(range(1500, 2000)) + list(range(3000, 3500)),
    "antwerp": list(range(2000, 3000)),
    "limburg": list(range(3500, 4000)),
    "liege": list(range(4000, 4500)),
    "namur": list(range(4500, 5681)),
    "hainaut": list(range(5681, 6600)) + list(range(7000, 8000)),
    "luxembourg": list(range(6600, 7000)),
    "west_flanders": list(range(8000, 9000)),
    "east_flanders": list(range(9000, 10000))
}
def postcode_to_province(postcode):
    # Brussels
    if postcode in provinces["brussels"]:
        return "Brussels"
    # Walloon Brabant
    if postcode in provinces["brabant_walloon"]:
        return "Walloon Brabant"
    # Flemish Brabant
    if postcode in provinces["brabant_flemish"]:
        return "Flemish Brabant"
    # Antwerp
    if postcode in provinces["antwerp"]:
        return "Antwerp"
    # Limburg
    if postcode in provinces["limburg"]:
        return "Limburg"

    # Liege
    if postcode in provinces["liege"]:
        return "Liege"

    # Namur
    if postcode in provinces["namur"]:
        return "Namur"

    # Hainaut
    if postcode in provinces["hainaut"]:
        return "Hainaut"

    # Luxembourg
    if postcode in provinces["luxembourg"]:
        return "Luxembourg"

    # West Flanders
    if postcode in provinces["west_flanders"]:
        return "West Flanders"

    # East Flanders
    if postcode in provinces["east_flanders"]:
        return "East Flanders"

    return "Unknown"

df["province"] = df["postal_code"].apply(postcode_to_province)

# Test
print("Provinces distribution:")
print(df["province"].value_counts(dropna=False))

df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")

Provinces distribution:
province
Namur              2447
Walloon Brabant    1625
Hainaut            1503
Limburg            1502
Brussels           1443
Antwerp            1408
Luxembourg         1387
West Flanders      1293
Flemish Brabant    1283
East Flanders      1238
Liege              1180
Name: count, dtype: int64


## Calculate area per meter square

In [16]:
df["price_per_sqm"] = df["Price"] / df["Livable surface"]
df['Price_per_sqm_land'] = df['Price'] / df['Total land surface']

df.to_csv("immovlan_cleaned_file.csv", index=False, encoding="utf-8")

## Visualization  

In [17]:
import plotly.express as px

print("="*70)
print("üìä BOX PLOT: Price Distribution by Province")
print("="*70)

# Remove missing prices and provinces
df_plot = df[df['Price'].notna() & df['province'].notna()].copy()

# Create interactive box plot
fig = px.box(
    df_plot, 
    x='province', 
    y='Price',
    title='Property Prices by Province',
    labels={'Price': 'Price (‚Ç¨)', 'province': 'Province', 'type': 'Property type'},
    color='province',  # Different color for each province
    hover_data=['Price','type']  # Show price on hover
)

# Customize layout
fig.update_layout(
    height=600,
    xaxis_tickangle=-45,  # Rotate labels
    showlegend=True,
    font=dict(size=12)
)

# Format y-axis as currency
fig.update_yaxes(tickformat='‚Ç¨,.0f')

fig.show()

print(f"\n‚úì Displaying {len(df_plot)} properties across {df_plot['province'].nunique()} provinces")


üìä BOX PLOT: Price Distribution by Province



‚úì Displaying 15725 properties across 11 provinces


In [18]:
import plotly.graph_objects as go 


print("="*70)
print("üìä BAR CHART: Average Price by Province")
print("="*70)

# Calculate average price per province
avg_prices = df.groupby('province')['Price'].agg(['mean', 'median', 'count']).reset_index()
avg_prices = avg_prices.sort_values('mean', ascending=False)

print("\nAverage Prices by Province:")
print(avg_prices)

# Create bar chart
fig = go.Figure()

# Add bars for mean price
fig.add_trace(go.Bar(
    x=avg_prices['province'],
    y=avg_prices['mean'],
    name='Average Price',
    text=avg_prices['mean'].apply(lambda x: f'‚Ç¨{x:,.0f}'),
    textposition='outside',
    marker_color='lightblue',
    hovertemplate='<b>%{x}</b><br>Average: ‚Ç¨%{y:,.0f}<extra></extra>'
))

# Add bars for median price
fig.add_trace(go.Bar(
    x=avg_prices['province'],
    y=avg_prices['median'],
    name='Median Price',
    text=avg_prices['median'].apply(lambda x: f'‚Ç¨{x:,.0f}'),
    textposition='outside',
    marker_color='lightcoral',
    hovertemplate='<b>%{x}</b><br>Median: ‚Ç¨%{y:,.0f}<extra></extra>'
))

fig.update_layout(
    title='Average vs Median Price by Province',
    xaxis_title='Province',
    yaxis_title='Price (‚Ç¨)',
    barmode='group',  # Side-by-side bars
    height=600,
    xaxis_tickangle=-45,
    hovermode='x unified'
)

fig.update_yaxes(tickformat='‚Ç¨,.0f')

fig.show()

print(f"\n‚úì Chart created for {len(avg_prices)} provinces")

üìä BAR CHART: Average Price by Province

Average Prices by Province:
           province           mean    median  count
1          Brussels  548583.616725  429000.0   1435
3   Flemish Brabant  496121.361582  414904.0   1239
2     East Flanders  448566.978723  385000.0   1222
9   Walloon Brabant  442028.372281  385000.0   1609
0           Antwerp  385367.681492  335000.0   1394
10    West Flanders  364727.584712  319500.0   1269
6           Limburg  348713.569952  325000.0   1451
8             Namur  301085.254366  279500.0   2233
7        Luxembourg  301006.332064  275000.0   1313
5             Liege  289565.280215  275000.0   1117
4           Hainaut  259911.762994  232400.0   1443



‚úì Chart created for 11 provinces


In [19]:
from plotly.subplots import make_subplots

print("="*70)
print("üìä DASHBOARD: Comprehensive Province Analysis")
print("="*70)

df_plot = df[df['Price'].notna() & df['province'].notna()].copy()

# Calculate statistics
stats = df_plot.groupby('province').agg({
    'Price': ['mean', 'median', 'count', 'min', 'max']
}).round(0)

stats.columns = ['Mean', 'Median', 'Count', 'Min', 'Max']
stats = stats.reset_index().sort_values('Mean', ascending=False)

# Create subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Average Price by Province',
        'Property Count by Province',
        'Price Range (Min-Max)',
        'Mean vs Median Price'
    ),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'scatter'}, {'type': 'scatter'}]]
)

# 1. Average Price
fig.add_trace(
    go.Bar(x=stats['province'], y=stats['Mean'], name='Avg Price', marker_color='lightblue'),
    row=1, col=1
)

# 2. Property Count
fig.add_trace(
    go.Bar(x=stats['province'], y=stats['Count'], name='Count', marker_color='lightgreen'),
    row=1, col=2
)

# 3. Price Range
fig.add_trace(
    go.Scatter(x=stats['province'], y=stats['Min'], mode='markers', 
               name='Min', marker=dict(size=10, color='red')),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=stats['province'], y=stats['Max'], mode='markers', 
               name='Max', marker=dict(size=10, color='green')),
    row=2, col=1
)

# 4. Mean vs Median
fig.add_trace(
    go.Scatter(x=stats['province'], y=stats['Mean'], mode='lines+markers',
               name='Mean', line=dict(color='blue')),
    row=2, col=2
)
fig.add_trace(
    go.Scatter(x=stats['province'], y=stats['Median'], mode='lines+markers',
               name='Median', line=dict(color='red')),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=900,
    showlegend=True,
    title_text="Province Price Analysis Dashboard"
)

# Rotate x-axis labels
fig.update_xaxes(tickangle=-45)

fig.show()

print(f"\n‚úì Dashboard created with {len(stats)} provinces")

üìä DASHBOARD: Comprehensive Province Analysis



‚úì Dashboard created with 11 provinces


### Map of belgium with median price per province

In [21]:
# Aggregate the median price and number of observations of each province in a Dataframe

province_median_summary = df.groupby("province")["Price"].agg(median_price="median", n_obs="count").reset_index()
province_median_summary.index += 1
province_mean_summary = df.groupby("province")["Price"].agg(mean_price="mean", n_obs="count").reset_index().round(2)
province_mean_summary.index += 1

#Optional
display(province_median_summary)
display(province_mean_summary)

Unnamed: 0,province,median_price,n_obs
1,Antwerp,335000.0,1394
2,Brussels,429000.0,1435
3,East Flanders,385000.0,1222
4,Flemish Brabant,414904.0,1239
5,Hainaut,232400.0,1443
6,Liege,275000.0,1117
7,Limburg,325000.0,1451
8,Luxembourg,275000.0,1313
9,Namur,279500.0,2233
10,Walloon Brabant,385000.0,1609


Unnamed: 0,province,mean_price,n_obs
1,Antwerp,385367.68,1394
2,Brussels,548583.62,1435
3,East Flanders,448566.98,1222
4,Flemish Brabant,496121.36,1239
5,Hainaut,259911.76,1443
6,Liege,289565.28,1117
7,Limburg,348713.57,1451
8,Luxembourg,301006.33,1313
9,Namur,301085.25,2233
10,Walloon Brabant,442028.37,1609


In [22]:
# Shapefile (map) that matches the provinces (with GeoJSON or shapefile)

import geopandas as gpd

# Natural Earth ‚ÄúAdmin 1 ‚Äì States, Provinces‚Äù shapefile for Belgium
url = "https://raw.githubusercontent.com/nvkelso/natural-earth-vector/master/geojson/ne_10m_admin_1_states_provinces.geojson"
provinces_map = gpd.read_file(url).query("iso_a2 == 'BE'")



# Testing: list the columns that contains province names
print(provinces_map[["name", "name_en"]])

                 name           name_en
200     West Flanders     West Flanders
201           Hainaut           Hainaut
203             Namur             Namur
205        Luxembourg        Luxembourg
208             Liege             Li√®ge
760     East Flanders     East Flanders
762           Antwerp           Antwerp
763           Limburg           Limburg
1709         Brussels  Brussels Capital
1710  Flemish Brabant   Flemish Brabant
1711  Walloon Brabant   Walloon Brabant


In [24]:
# Load the map and harmonise province names

# Load Natural-Earth provinces in Belgium
provinces_map = gpd.read_file(url).query("iso_a2 == 'BE'")[["name_en", "geometry"]]

# Rename column to match our data
provinces_map = provinces_map.rename(columns={"name_en": "province"})

# Spelling fixes so names == province_summary
name_fix = {
    "Li√®ge": "Liege",
    "Brussels Capital": "Brussels",
}
provinces_map["province"] = provinces_map["province"].replace(name_fix)

# Testing
print("Mapped provinces:", sorted(provinces_map["province"].unique()))
print("Data provinces:", sorted(province_median_summary["province"].unique()))

Mapped provinces: ['Antwerp', 'Brussels', 'East Flanders', 'Flemish Brabant', 'Hainaut', 'Liege', 'Limburg', 'Luxembourg', 'Namur', 'Walloon Brabant', 'West Flanders']
Data provinces: ['Antwerp', 'Brussels', 'East Flanders', 'Flemish Brabant', 'Hainaut', 'Liege', 'Limburg', 'Luxembourg', 'Namur', 'Walloon Brabant', 'West Flanders']


In [25]:
# Merge price data into the map

# Left join so every province polygon gets its median price and observation count
provinces_map_median = provinces_map.merge(province_median_summary, on="province", how="left")
provinces_map_mean = provinces_map.merge(province_mean_summary, on="province", how="left")

# Testing

# Median
display(provinces_map_median.head())
display(provinces_map_median[["province", "median_price", "n_obs"]].sort_values("median_price"))

# Mean
display(provinces_map_mean.head())
display(provinces_map_mean[["province", "mean_price", "n_obs"]].sort_values("mean_price"))

Unnamed: 0,province,geometry,median_price,n_obs
0,West Flanders,"POLYGON ((2.65062 50.81225, 2.64245 50.81246, ...",319500.0,1269
1,Hainaut,"MULTIPOLYGON (((2.88688 50.69665, 2.87128 50.6...",232400.0,1443
2,Namur,"POLYGON ((4.83136 50.1434, 4.8201 50.14645, 4....",279500.0,2233
3,Luxembourg,"POLYGON ((5.19248 49.68281, 5.16943 49.6872, 5...",275000.0,1313
4,Liege,"POLYGON ((6.03863 50.14841, 6.02736 50.14945, ...",275000.0,1117


Unnamed: 0,province,median_price,n_obs
1,Hainaut,232400.0,1443
3,Luxembourg,275000.0,1313
4,Liege,275000.0,1117
2,Namur,279500.0,2233
0,West Flanders,319500.0,1269
7,Limburg,325000.0,1451
6,Antwerp,335000.0,1394
5,East Flanders,385000.0,1222
10,Walloon Brabant,385000.0,1609
9,Flemish Brabant,414904.0,1239


Unnamed: 0,province,geometry,mean_price,n_obs
0,West Flanders,"POLYGON ((2.65062 50.81225, 2.64245 50.81246, ...",364727.58,1269
1,Hainaut,"MULTIPOLYGON (((2.88688 50.69665, 2.87128 50.6...",259911.76,1443
2,Namur,"POLYGON ((4.83136 50.1434, 4.8201 50.14645, 4....",301085.25,2233
3,Luxembourg,"POLYGON ((5.19248 49.68281, 5.16943 49.6872, 5...",301006.33,1313
4,Liege,"POLYGON ((6.03863 50.14841, 6.02736 50.14945, ...",289565.28,1117


Unnamed: 0,province,mean_price,n_obs
1,Hainaut,259911.76,1443
4,Liege,289565.28,1117
3,Luxembourg,301006.33,1313
2,Namur,301085.25,2233
7,Limburg,348713.57,1451
0,West Flanders,364727.58,1269
6,Antwerp,385367.68,1394
10,Walloon Brabant,442028.37,1609
5,East Flanders,448566.98,1222
9,Flemish Brabant,496121.36,1239


In [26]:
# Median price per province map with interactive tool-tips

import folium, json

# centre on Belgium
m = folium.Map(location=[50.5, 4.5], zoom_start=8)

# build the choropleth
folium.Choropleth(
    geo_data=provinces_map_median.to_json(),
    data=provinces_map_median,
    columns=["province", "median_price"],
    key_on="feature.properties.province",
    fill_color="YlGn",
    nan_fill_color="grey",
    legend_name="Median price (‚Ç¨)"
).add_to(m)

# add hover tool-tips
from folium import features

tooltip = features.GeoJsonTooltip(
    fields=["province", "median_price", "n_obs"],
    aliases=["Province:", "Median price (‚Ç¨):", "Observations:"],
    localize=True
)

features.GeoJson(
    provinces_map_median.to_json(),
    tooltip=tooltip,
    style_function=lambda x: {"fillOpacity": 0, "weight": 0}
).add_to(m)

m

In [27]:
# Mean price per province map with interactive tool-tips

# centre on Belgium
m = folium.Map(location=[50.5, 4.5], zoom_start=8)

# build the choropleth
folium.Choropleth(
    geo_data=provinces_map_mean.to_json(),
    data=provinces_map_mean,
    columns=["province", "mean_price"],
    key_on="feature.properties.province",
    fill_color="YlGn",
    nan_fill_color="grey",
    legend_name="Mean price (‚Ç¨)"
).add_to(m)

# add hover tool-tips
from folium import features

tooltip = features.GeoJsonTooltip(
    fields=["province", "mean_price", "n_obs"],
    aliases=["Province:", "Mean price (‚Ç¨):", "Observations:"],
    localize=True
)

features.GeoJson(
    provinces_map_mean.to_json(),
    tooltip=tooltip,
    style_function=lambda x: {"fillOpacity": 0, "weight": 0}
).add_to(m)

m