# Part 1: Fetch and prepare data

We use the osmnx interface to fetch OSM building data for Dresden.
The data is then enriched and prepared for loading into a vector store that enables semantic search.

In [None]:
!pip install osmnx geopandas



1.1 Fetch building data from OSM

In [None]:
import osmnx as ox
import geopandas as gpd

# Select every OSM feature that carries a `building` tag, whatever its value
tags = {"building": True}

# Download all matching features located within Dresden
gdf = ox.features.features_from_place("Dresden, Germany", tags=tags)

# Preview the first rows
gdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,1546854470,POINT (13.86689 51.14757),Dresden,DE,6.0,1465.0,Seifersdorfer Straße,townhall,yes,Bürgerhaus Schönborn,HiRes aerial imagery,...,,,,,,,,,,
node,1723634518,POINT (13.81303 51.06486),,,,,,,bunker,,,...,,,,,,,,,,
node,2729351830,POINT (13.68528 51.01194),,,86.0,,Birkigter Hang,,shed,,survey,...,,,,,,,,,,
node,2850710674,POINT (13.68304 51.01288),Dresden,DE,25.0,1189.0,Cornelius-Gurlitt-Straße,,yes,,,...,,,,,,,,,,
node,3336622284,POINT (13.67379 51.06486),Dresden,DE,2.0,1157.0,Am Lehmberg,,yes,,survey,...,,,,,,,,,,


In [None]:
# Report how many building features the query returned
print(f"Number of buildings fetched: {len(gdf)}")

Number of buildings fetched: 89923


Many buildings carry no meaningful building type (only `building=yes`). For our tests, we keep such buildings only if they also have a `name` attribute.

In [None]:
# Keep a building when it is named OR when its building type is more specific than "yes"
has_name = gdf['name'].notna()
has_specific_type = gdf['building'] != 'yes'
filtered_gdf = gdf[has_name | has_specific_type]

# Report the size of the filtered GeoDataFrame and preview it
print(f"Number of filtered buildings: {len(filtered_gdf)}")
filtered_gdf.head()


Number of filtered buildings: 49972


Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
node,1546854470,POINT (13.86689 51.14757),Dresden,DE,6.0,1465.0,Seifersdorfer Straße,townhall,yes,Bürgerhaus Schönborn,HiRes aerial imagery,...,,,,,,,,,,
node,1723634518,POINT (13.81303 51.06486),,,,,,,bunker,,,...,,,,,,,,,,
node,2729351830,POINT (13.68528 51.01194),,,86.0,,Birkigter Hang,,shed,,survey,...,,,,,,,,,,
node,5160667643,POINT (13.74831 51.03364),,,,,,,yes,A-Gebäude,,...,,,,,,,,,,
node,5160667644,POINT (13.74763 51.03404),,,,,,,yes,B-Gebäude,,...,,,,,,,,,,


### Simple queries on the dataset

Get some sample buildings by name:

In [None]:
!pip install fuzzywuzzy python-Levenshtein geojson

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting geojson
  Downloading geojson-3.2.0-py3-none-any.whl.metadata (16 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geojson-3.2.0-py3-none-any.whl (15 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━

In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
from typing import Union

# Function to perform fuzzy matching on the 'name' column
def fuzzy_search_by_name(df: gpd.GeoDataFrame, query_name: str, threshold: int = 80) -> gpd.GeoDataFrame:
    """
    Performs a case-insensitive fuzzy search on the 'name' column of a GeoDataFrame.

    Every string-valued name is compared with `query_name` using `fuzz.ratio`;
    rows whose score reaches `threshold` are kept. Rows with a missing or
    non-string name never match.

    Args:
        df: The GeoDataFrame to search.
        query_name: The name to search for.
        threshold: The minimum fuzzy matching score (0-100).

    Returns:
        The matching rows of `df`. Rows are selected with a boolean mask, so the
        index (including its level names), the columns and the CRS of `df` are
        carried over. The result is empty (same columns) when nothing matches or
        when `df` has no 'name' column.
    """
    if 'name' not in df.columns:
        return df.iloc[0:0]

    needle = query_name.lower()

    def is_match(name) -> bool:
        # Missing names are NaN (float), hence the explicit type check
        return isinstance(name, str) and fuzz.ratio(name.lower(), needle) >= threshold

    # astype(bool) keeps the mask a proper boolean indexer even for an empty frame
    mask = df['name'].map(is_match).astype(bool)
    return df[mask]

In [None]:
# Fuzzy search with a nearly exact name: "Deutsches Hygiene Museum"
hygiene_museum = fuzzy_search_by_name(df=filtered_gdf, query_name="Deutsches Hygiene Museum")
hygiene_museum

Unnamed: 0,Unnamed: 1,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata
relation,5651,"POLYGON ((13.74559 51.04509, 13.74597 51.04494...",Dresden,DE,1,1069,Lingnerplatz,,yes,Deutsches Hygiene-Museum,,...,no,,,,,,,,,


In [None]:
# Lexical search can't handle search terms that deviate too much from the stored attribute value
hygiene_museum = fuzzy_search_by_name(df=filtered_gdf, query_name="Hygiene Museum")
hygiene_museum

Unnamed: 0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata


In [None]:
# Fuzzy search with the exact stored name: "Semperoper"
semperoper = fuzzy_search_by_name(df=filtered_gdf, query_name="Semperoper")
semperoper

Unnamed: 0,Unnamed: 1,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata
way,242305584,"POLYGON ((13.73464 51.05477, 13.73467 51.05478...",Dresden,DE,2,1067,Theaterplatz,theatre,yes,Semperoper,,...,,,,,,,,,,


In [None]:
# Fuzzy search with a paraphrased query ("oper dresden") instead of the stored name
semperoper = fuzzy_search_by_name(df=filtered_gdf, query_name="oper dresden")
semperoper

Unnamed: 0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,dog,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata


### Prepare features to load them into a vector store



Enrich the feature metadata with tag descriptions from the OSM Wiki.

In [None]:
import asyncio
import aiohttp
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import re

async def get_osm_tag_description_async(tag: str,
                                        session: aiohttp.ClientSession,
                                        semaphore: asyncio.Semaphore) -> str:
    """
    Async fetch of the first-paragraph description for an OSM tag page.

    The wiki page "Tag:<tag>" is looked up in two steps: first via the plain-text
    intro extract, then (if that yields nothing) by parsing the rendered HTML and
    taking the first non-empty top-level paragraph. The lookup is best effort:
    network, HTTP and parsing problems never raise, they only make the function
    fall through to the next step.

    Args:
        tag: The tag in "key=value" form, e.g. "building=museum".
        session: The aiohttp session used for the requests.
        semaphore: Limits the number of requests running at the same time; it is
            acquired separately for each of the two requests.

    Returns:
        The description text, or the tag value (the part after "=", or the whole
        `tag` if it contains no "=") when no description could be retrieved.
    """
    import logging
    log = logging.getLogger(__name__)

    # Last-resort description; a tag without "=" is used as is instead of raising
    tag_value = tag.split("=", 1)[1] if "=" in tag else tag
    # Use the key that was actually passed in rather than assuming "building"
    page_title = f"Tag:{tag}"
    API = "https://wiki.openstreetmap.org/w/api.php"

    # 1) Try the exintro extract
    # NOTE(review): `prop=extracts` presumably needs the TextExtracts API on the
    # wiki; if it is unavailable this step yields nothing - confirm.
    params_extract = {
        "action":      "query",
        "format":      "json",
        "prop":        "extracts",
        "exintro":     "1",
        "explaintext": "1",
        "redirects":   "1",
        "titles":      page_title
    }
    async with semaphore:
        try:
            async with session.get(API, params=params_extract) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    for page in data.get("query", {}).get("pages", {}).values():
                        extract = (page.get("extract") or "").strip()
                        if extract:
                            return extract
        except Exception as exc:
            # Deliberately best effort: record the failure and try the fallback
            log.debug("Extract lookup failed for %s: %r", page_title, exc)

    # 2) Fallback: fetch full HTML and scrape first <p>
    params_full = {
        "action":    "parse",
        "format":    "json",
        "prop":      "text",
        "redirects": "1",
        "page":      page_title
    }
    async with semaphore:
        try:
            async with session.get(API, params=params_full) as resp2:
                if resp2.status == 200:
                    payload = await resp2.json()
                    html = payload.get("parse", {}).get("text", {}).get("*", "")
                    soup = BeautifulSoup(html, "html.parser")
                    container = soup.find("div", class_="mw-parser-output")
                    if container:
                        # Only direct children, so paragraphs nested in boxes/tables are skipped
                        for paragraph in container.find_all("p", recursive=False):
                            raw = paragraph.get_text(separator=" ", strip=True)
                            clean = re.sub(r"\s+", " ", raw).strip()
                            if clean:
                                return clean
        except Exception as exc:
            log.debug("HTML lookup failed for %s: %r", page_title, exc)

    # 3) Ultimate fallback
    return tag_value

async def get_descriptions_for_gdf_tags(gdf: pd.DataFrame,
                                        max_concurrent: int = 20
                                       ) -> pd.DataFrame:
    """
    Given a GeoDataFrame `gdf` with a 'building' column,
    fetch wiki descriptions for each unique building tag,
    and return a new GeoDataFrame with 'building_description'.

    Args:
        gdf: Frame with a 'building' column; it is copied, not modified.
        max_concurrent: Maximum number of wiki requests in flight at once.

    Returns:
        A copy of `gdf` with an added 'building_description' column. Rows with a
        missing 'building' value get None.

    Raises:
        ValueError: If `max_concurrent` is smaller than 1 (no request could ever
            start, so the call would never finish).
    """
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    gdf = gdf.copy()  # avoid mutating the original
    # One lookup per distinct building value
    tags = [f"building={v}" for v in gdf['building'].dropna().unique()]

    # tag -> description, filled by the concurrent lookups
    descriptions = {}

    # The semaphore alone bounds the concurrency, so every tag can get its own
    # coroutine and no queue or worker pool is needed.
    semaphore = asyncio.Semaphore(max_concurrent)

    async with aiohttp.ClientSession() as session:
        with tqdm(total=len(tags), desc="Fetching tag descriptions") as pbar:

            async def fetch(tag: str) -> None:
                try:
                    descriptions[tag] = await get_osm_tag_description_async(tag, session, semaphore)
                except Exception:
                    descriptions[tag] = tag  # fallback
                finally:
                    pbar.update(1)

            # All lookups have finished before the session is closed
            await asyncio.gather(*(fetch(tag) for tag in tags))

    # Map the descriptions back onto the rows
    gdf.loc[:, 'building_description'] = (
        gdf['building']
           .map(lambda v: descriptions.get(f"building={v}", None) if pd.notna(v) else None)
    )
    return gdf


In [None]:
enriched_gdf = await get_descriptions_for_gdf_tags(filtered_gdf)


Fetching tag descriptions:   0%|          | 0/109 [00:00<?, ?it/s]

In [None]:
# Compare each building value with the description that was fetched for it
description_columns = ['building', 'building_description']
enriched_gdf[description_columns].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,building,building_description
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1
node,1546854470,yes,yes
node,1723634518,bunker,bunker
node,2729351830,shed,shed
node,5160667643,yes,yes
node,5160667644,yes,yes


### We generate multiple subsets of our building data to support different scenarios:
- Scenario 1: Queries for specific buildings by name (e.g. "Deutsches Hygiene Museum")
- Scenario 2: Queries by building type (e.g. "museums in Dresden")

Scenario 1: Buildings with names

In [None]:
# Build the mask from the frame that is being filtered, so the selection does not
# rely on `filtered_gdf` and `enriched_gdf` having identical indexes
features_with_names = enriched_gdf[enriched_gdf['name'].notna()]
print(f"Features with names: {len(features_with_names)}")


Features with names: 3371


In [None]:
# Draw a single random named feature for inspection
sample_feature = features_with_names.sample(n=1)
sample_feature

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,amenity,building,name,source,...,townhall:type,contact:mastodon,levels,name:hsb,polling_station,name:sv,old_name:1900-1945,animal,species:wikidata,building_description
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
way,26370514,"POLYGON ((13.78054 51.02217, 13.78096 51.02198...",Dresden,DE,7,1219,Rudolf-Bergander-Ring,place_of_worship,kingdom_hall,Königreichsaal Jehovas Zeugen,,...,,,,,,,,,,kingdom_hall


In [None]:
from typing import List, Dict, Any

def generate_documents(features_with_names: pd.DataFrame) -> List[Dict[str, str]]:
    """
    Generate a list of text documents, one per row of a feature DataFrame.

    Each document's content is a block of "key: value" lines: first the name (if
    present), then every other non-missing column, and finally the value of the
    'description' column (if the frame has one and it is set).

    Parameters:
    - features_with_names: A pandas DataFrame with feature data. The 'name' and 'description'
      columns are optional; 'geometry' and 'main_tag' are never written to the content.

    Returns:
    - documents: A list of dictionaries, where each dictionary represents a document with
      'id' (the row index as a string) and 'content'.
    """
    # Columns that are not textual or that are emitted under a dedicated label
    skipped_keys = {"geometry", "name", "description", "main_tag"}
    documents: List[Dict[str, str]] = []

    for index, row in features_with_names.iterrows():
        lines: List[str] = []

        # .get() tolerates frames that lack the optional column entirely
        name = row.get("name")
        if pd.notna(name):
            lines.append(f"Name: {name}")

        # Include all non-NaN key-value pairs for enriched context
        for key, value in row.items():
            if key not in skipped_keys and pd.notna(value):
                lines.append(f"{key}: {value}")

        description = row.get("description")
        if pd.notna(description):
            lines.append(f"Description: {description}")

        documents.append({
            "id": str(index),  # Use the DataFrame index as the document ID
            "content": "".join(f"{line}\n" for line in lines),
        })

    return documents


documents = generate_documents(features_with_names)

# Show the content of the first five documents (or fewer, if less exist)
for position, doc in enumerate(documents[:5], start=1):
    print(f"Document {position}:")
    print(doc['content'])
    print("-" * 20)


Document 1:
Name: Bürgerhaus Schönborn
addr:city: Dresden
addr:country: DE
addr:housenumber: 6
addr:postcode: 01465
addr:street: Seifersdorfer Straße
amenity: townhall
building: yes
source: HiRes aerial imagery
building_description: yes

--------------------
Document 2:
Name: A-Gebäude
building: yes
building_description: yes

--------------------
Document 3:
Name: B-Gebäude
building: yes
building_description: yes

--------------------
Document 4:
Name: Geschwisterwohnen WG
building: yes
operator: Outlaw
website: https://www.outlaw-ggmbh.de/wohngruppe-geschwisterwohnen-loebtau/unsere-einrichtung/
building_description: yes

--------------------
Document 5:
Name: SUFW Soziales Kaufhaus
addr:housenumber: 41
addr:postcode: 01239
addr:street: Finsterwalder Straße
building: yes
operator: Sächsisches Umschulungs- und Fortbildungswerk Dresden e. V.
website: https://www.sufw.de/sozialer-moebeldienst-kaufhaus.html
email: gebrauchtes@sufw.de
opening_hours: Mo-Fr 08:00-17:00
phone: +49 351 27206651

We also need structured metadata for each document

In [None]:
import json
from typing import List, Dict, Any

def generate_metadata_objects(features_with_names: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Build one nested metadata dictionary per feature row.

    NOTE: a later cell of this notebook defines a function with the same name,
    which replaces this one once that cell has been run.

    Parameters:
    - features_with_names: A pandas DataFrame with a 'geometry' column (objects exposing
      `__geo_interface__`) and other feature properties.

    Returns:
    - metadata_objects: A list of dictionaries with the keys 'id' (the row index),
      'properties' (all non-missing, non-geometry values) and 'geometry' (the geo
      interface mapping, extended with a 'properties' entry holding the same values).
    """
    def _roundtrip_utf8(value: Any) -> Any:
        # Strings are passed through a UTF-8 encode/decode cycle; other values are kept as is
        return value.encode('utf-8').decode('utf-8') if isinstance(value, str) else value

    collected: List[Dict[str, Any]] = []

    for row_id, row in features_with_names.iterrows():
        # Every column except the geometry, skipping missing values
        props: Dict[str, Any] = {
            column: _roundtrip_utf8(value)
            for column, value in row.items()
            if column != "geometry" and not pd.isna(value)
        }

        # GeoJSON-like mapping of the geometry, with the properties attached to it
        geo = row["geometry"].__geo_interface__
        geo["properties"] = props

        collected.append({"id": row_id, "properties": props, "geometry": geo})

    return collected


metadata_objects = generate_metadata_objects(features_with_names)

# Pretty-print the first metadata object; ensure_ascii=False keeps non-ASCII characters as they are
first_metadata = metadata_objects[0]
print(json.dumps(first_metadata, indent=2, ensure_ascii=False))

{
  "id": [
    "node",
    1546854470
  ],
  "properties": {
    "addr:city": "Dresden",
    "addr:country": "DE",
    "addr:housenumber": "6",
    "addr:postcode": "01465",
    "addr:street": "Seifersdorfer Straße",
    "amenity": "townhall",
    "building": "yes",
    "name": "Bürgerhaus Schönborn",
    "source": "HiRes aerial imagery",
    "building_description": "yes"
  },
  "geometry": {
    "type": "Point",
    "coordinates": [
      13.8668941,
      51.1475694
    ],
    "properties": {
      "addr:city": "Dresden",
      "addr:country": "DE",
      "addr:housenumber": "6",
      "addr:postcode": "01465",
      "addr:street": "Seifersdorfer Straße",
      "amenity": "townhall",
      "building": "yes",
      "name": "Bürgerhaus Schönborn",
      "source": "HiRes aerial imagery",
      "building_description": "yes"
    }
  }
}


In [None]:
from geojson import Feature
from typing import List, Dict, Any

def generate_metadata_objects(features_with_names: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Build one flat metadata dictionary per feature row.

    Parameters:
    - features_with_names: A pandas DataFrame with a 'geometry' column (objects exposing
      `__geo_interface__`) and other feature properties.

    Returns:
    - metadata_objects: A list of dictionaries. Each one contains 'id' (the row index as a
      string), every non-missing, non-geometry column as a top-level key, and 'geometry':
      a JSON string of the geo interface mapping extended with a 'properties' member that
      repeats the column values. A column that is itself called 'id' replaces the index-based id.
    """
    metadata_objects: List[Dict[str, Any]] = []

    for index, row in features_with_names.iterrows():
        # Copy so that adding 'properties' below never touches the object's own mapping
        geometry: Dict[str, Any] = dict(row["geometry"].__geo_interface__)

        # All non-missing, non-geometry fields of the row
        properties: Dict[str, Any] = {
            key: value
            for key, value in row.items()
            if key != "geometry" and not pd.isna(value)
        }

        metadata: Dict[str, Any] = {"id": str(index), **properties}

        # Carry over any extra members of the geo interface (everything except
        # 'type' and 'coordinates') into the embedded properties
        for key, value in geometry.items():
            if key not in ("type", "coordinates"):
                properties[key] = value

        # Store the geometry, including its properties, as a JSON string
        geometry["properties"] = properties
        metadata["geometry"] = json.dumps(geometry, ensure_ascii=False)

        metadata_objects.append(metadata)

    return metadata_objects

metadata_objects = generate_metadata_objects(features_with_names)

# Pretty-print the first metadata object as an example
first_metadata = metadata_objects[0]
print(json.dumps(first_metadata, indent=2, ensure_ascii=False))


{
  "id": "('node', 1546854470)",
  "addr:city": "Dresden",
  "addr:country": "DE",
  "addr:housenumber": "6",
  "addr:postcode": "01465",
  "addr:street": "Seifersdorfer Straße",
  "amenity": "townhall",
  "building": "yes",
  "name": "Bürgerhaus Schönborn",
  "source": "HiRes aerial imagery",
  "building_description": "yes",
  "geometry": "{\"type\": \"Point\", \"coordinates\": [13.8668941, 51.1475694], \"properties\": {\"addr:city\": \"Dresden\", \"addr:country\": \"DE\", \"addr:housenumber\": \"6\", \"addr:postcode\": \"01465\", \"addr:street\": \"Seifersdorfer Straße\", \"amenity\": \"townhall\", \"building\": \"yes\", \"name\": \"Bürgerhaus Schönborn\", \"source\": \"HiRes aerial imagery\", \"building_description\": \"yes\"}}"
}


Scenario 2: Buildings grouped by building type

In [None]:
# Distinct building values, without the generic 'yes' / 'no' entries
all_building_values = enriched_gdf['building'].unique()
is_generic = pd.Series(all_building_values).isin(['yes', 'no'])
building_types = all_building_values[~is_generic]
print(f"Building types: {building_types}")

Building types: ['bunker' 'shed' 'roof' 'house' 'bungalow' 'terrace' 'apartments' 'palace'
 'hotel' 'historic' 'public' 'commercial' 'ruins' 'office' 'industrial'
 'school' 'government' 'university' 'retail' 'hospital' 'museum'
 'kindergarten' 'civic' 'garage' 'sports_centre' 'kingdom_hall'
 'construction' 'parking' 'church' 'parish_hall' 'detached' 'cathedral'
 'theatre' 'train_station' 'dormitory' 'semidetached_house' 'hut'
 'garages' 'residential' 'service' 'carport' 'container' 'toilets'
 'guardhouse' 'college' 'manufacture' 'warehouse' 'sports_hall'
 'gymnasium' 'mosque' 'supermarket' 'tower' 'kiosk' 'ship' 'greenhouse'
 'pagoda' 'power' 'tech_cab' 'hangar' 'grandstand' 'farm' 'bridge'
 'chapel' 'water_works' 'barn' 'cabin' 'cowshed' 'synagogue' 'religious'
 'pavilion' 'allotment_house' 'boathouse' 'wall' 'fire_station'
 'gatehouse' 'porch' 'transportation' 'farm_auxiliary' 'transformer_tower'
 'digester' 'factory' 'stable' 'collapsed' 'shelter' 'outbuilding'
 'electricity' 'carav

In [None]:
# Inspect the description column that was added during enrichment
enriched_gdf['building_description']

Unnamed: 0_level_0,Unnamed: 1_level_0,building_description
element,id,Unnamed: 2_level_1
node,1546854470,yes
node,1723634518,bunker
node,2729351830,shed
node,5160667643,yes
node,5160667644,yes
...,...,...
way,1377014657,allotment_house
way,1377014658,allotment_house
way,1377744971,garage
way,1379303913,ruins


In [None]:
# Check the container type of `building_types` before it is passed on as an array
type(building_types)

numpy.ndarray

In [None]:
from shapely.geometry import mapping
from geojson import Feature, FeatureCollection
import numpy as np
from typing import List, Dict, Tuple, Optional

def generate_grouped_documents_and_metadata(filtered_gdf: pd.DataFrame, building_types: np.ndarray) -> Tuple[List[Dict[str, str]], List[Dict[str, Optional[str]]]]:
    """
    Generate one document and one metadata record per building type.

    Parameters:
    - filtered_gdf: A pandas DataFrame with the columns "building", "building_description" and "geometry".
    - building_types: An array of building types; each one selects the rows whose "building" value equals it.

    Returns:
    - grouped_documents: A list of dictionaries with 'id' (the building type) and 'content'
      (type, number of features and one bullet per distinct description, in order of first appearance).
    - grouped_metadata: A list of dictionaries with 'id', 'tag', 'num_features' and 'geometry',
      the latter being a JSON string of a FeatureCollection holding every feature of the group.
    """
    grouped_documents: List[Dict[str, str]] = []
    grouped_metadata: List[Dict[str, Optional[str]]] = []

    for building_type in building_types:
        # Filter features by building type
        building_group = filtered_gdf[filtered_gdf['building'] == building_type]
        num_features: int = len(building_group)

        # Distinct descriptions as bullets; unique() keeps first-appearance order,
        # so the document text is the same on every run
        descriptions = building_group["building_description"].dropna().unique()
        bullets = "".join(f"- {description}\n" for description in descriptions)

        grouped_documents.append({
            "id": building_type,
            "content": f"Building Type: {building_type}\nNumber of features: {num_features}\n{bullets}",
        })

        # One GeoJSON feature per row, carrying all non-missing, non-geometry values
        features: List[Feature] = [
            Feature(
                geometry=mapping(row["geometry"]),
                properties={
                    key: value
                    for key, value in row.items()
                    if key != "geometry" and not pd.isna(value)
                },
            )
            for _, row in building_group.iterrows()
        ]

        grouped_metadata.append({
            "id": building_type,
            "tag": building_type,
            "num_features": num_features,
            "geometry": json.dumps(FeatureCollection(features), ensure_ascii=False),
        })

    return grouped_documents, grouped_metadata

# Build the per-building-type documents and metadata from the enriched data
grouped_docs, grouped_meta = generate_grouped_documents_and_metadata(
    filtered_gdf=enriched_gdf,
    building_types=building_types,
)


In [None]:
# Print a sample document and metadata
print("Sample grouped document:")
print(json.dumps(grouped_docs[5], indent=2))

print("\nSample grouped metadata:")
print(json.dumps(grouped_meta[5], indent=2))

Sample grouped document:
{
  "id": "terrace",
  "content": "Building Type: terrace\nNumber of features: 788\n- terrace\n"
}

Sample grouped metadata:
{
  "id": "terrace",
  "tag": "terrace",
  "num_features": 788,
  "geometry": "{\"type\": \"FeatureCollection\", \"features\": [{\"type\": \"Feature\", \"geometry\": {\"type\": \"Point\", \"coordinates\": [13.806945, 51.139591]}, \"properties\": {\"building\": \"terrace\", \"addr:flats\": \"642-671\", \"building:levels\": \"1\", \"roof:levels\": \"1\", \"roof:shape\": \"gabled\", \"addr:housename\": \"Kamerun\", \"building_description\": \"terrace\"}}, {\"type\": \"Feature\", \"geometry\": {\"type\": \"Polygon\", \"coordinates\": [[[13.735101, 51.02722], [13.73522, 51.027212], [13.735184, 51.027006], [13.735065, 51.027014], [13.735101, 51.02722]]]}, \"properties\": {\"building\": \"terrace\", \"building:levels\": \"2\", \"roof:levels\": \"1\", \"roof:shape\": \"hipped\", \"roof:material\": \"roof_tiles\", \"roof:colour\": \"red\", \"roof:

In [None]:
# Scenario 1 (named features): parallel lists of texts, metadata and ids
documents_features_w_names = [doc['content'] for doc in documents]
metadatas_features_w_names = metadata_objects
ids_features_w_names = [str(doc['id']) for doc in documents]

# Scenario 2 (features grouped by building type): same three parallel lists
documents_features_grouped = [doc['content'] for doc in grouped_docs]
metadatas_features_grouped = grouped_meta
ids_features_grouped = [doc['id'] for doc in grouped_docs]

# Part 2: Load data into vector store

In [None]:
!pip install chromadb



Connect to the vector store instance (chromadb) using the python client

In [None]:
import chromadb
# Address of the hosted ChromaDB instance
CHROMA_HOST = 'https://klimakonform-maps.geo.tu-dresden.de/chromadb'
chroma_client = chromadb.HttpClient(host=CHROMA_HOST)

# Connectivity check against the server
chroma_client.heartbeat()

1746435617539620149

Load an embedding model

In [None]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Name of the sentence-transformers model used to embed documents and queries
model_name = "paraphrase-multilingual-mpnet-base-v2"

# Embedding function wrapper that chromadb calls to turn texts into vectors
ef = SentenceTransformerEmbeddingFunction(model_name=model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Add documents to the vector store

Add a collection to the chromadb. We create a single collection (index) for both scenarios.

In [None]:
# Create the collection (or get it if it already exists) and bind the embedding
# function through the dedicated `embedding_function` argument, so that upserts
# and queries issued via this handle are embedded with `ef`.
# NOTE(review): a collection that was created earlier with a different embedding
# function presumably has to be deleted and recreated - confirm against the server.
collection = chroma_client.get_or_create_collection(
    name="building_data",
    embedding_function=ef,
)

In [None]:
# Add features with names:
collection.upsert(
    documents=documents_features_w_names,
    metadatas=metadatas_features_w_names,
    ids=ids_features_w_names
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:03<00:00, 27.5MiB/s]


In [None]:
# Add grouped_features:
collection.upsert(
    documents=documents_features_grouped,
    metadatas=metadatas_features_grouped,
    ids=ids_features_grouped
)

In [None]:
# Sample semantic query: request up to 50 results for the text "hospitals"
results = collection.query(query_texts=["hospitals"], n_results=50)

NameError: name 'collection' is not defined

Then we need a method to plot queried data on a map:

In [None]:
import folium
from folium.plugins import MarkerCluster
from typing import Optional, Union, Dict, Any, List

def plot_buildings(query: str, n_results: int = 50) -> Optional[folium.Map]:
    """Search the shared ``collection`` and draw the hits on a folium map.

    Each hit is expected to carry a GeoJSON string under the ``geometry``
    metadata key. The payload may be a FeatureCollection, a single Feature or
    a bare geometry. Features are delegated to ``add_feature_to_map``; a bare
    geometry gets a marker whose popup lists the hit's remaining metadata.

    Args:
        query: Free-text search string sent to ``collection.query``.
        n_results: Maximum number of hits to request. Defaults to 50.

    Returns:
        A ``folium.Map`` centred on the first drawable hit, or ``None`` (after
        printing the reason) when no hit provides usable geometry.
    """
    results = collection.query(query_texts=[query], n_results=n_results)
    # Metadata comes back as one list per query text; exactly one text was sent.
    metadata_batches = results.get('metadatas') or [[]]
    retrieved_metadatas: List[Dict[str, Any]] = [md or {} for md in metadata_batches[0]]

    # Look at every hit: a usable geometry further down the ranking must not be
    # thrown away just because the top hit has none.
    if not any(md.get('geometry') for md in retrieved_metadatas):
        print("No geometries found in the results.")
        return None

    # Decode each payload once, paired with the metadata shown in popups.
    decoded: List[Any] = []
    for md in retrieved_metadatas:
        geojson_str = md.get('geometry')
        if not geojson_str:
            continue
        try:
            geojson_data = json.loads(geojson_str)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Error decoding GeoJSON: {e}")
            continue
        if not isinstance(geojson_data, dict):
            print(f"Error decoding GeoJSON: expected an object, got {type(geojson_data).__name__}")
            continue
        metadata: Dict[str, Any] = {k: v for k, v in md.items() if k != 'geometry'}
        decoded.append((geojson_data, metadata))

    # Centre the map on the first geometry that yields a coordinate pair.
    center: Optional[List[float]] = None
    for geojson_data, _ in decoded:
        if geojson_data.get('type') == 'FeatureCollection' and 'features' in geojson_data:
            # Scan all features; indexing [0] would raise on an empty collection.
            candidates = [
                feature.get('geometry')
                for feature in geojson_data['features'] or []
                if isinstance(feature, dict)
            ]
        elif geojson_data.get('type') == 'Feature':
            candidates = [geojson_data.get('geometry')]
        else:
            candidates = [geojson_data]

        for geometry in candidates:
            if not isinstance(geometry, dict):
                continue
            coordinates = get_representative_coordinates(geometry)
            if coordinates and len(coordinates) >= 2:
                # GeoJSON stores (lon, lat); folium expects (lat, lon).
                center = [coordinates[1], coordinates[0]]
                break
        if center is not None:
            break

    if center is None:
        print("Could not create map: no valid geometries found.")
        return None

    m = folium.Map(location=center, zoom_start=14)
    marker_cluster: MarkerCluster = MarkerCluster().add_to(m)

    for geojson_data, metadata in decoded:
        try:
            if geojson_data.get('type') == 'FeatureCollection' and 'features' in geojson_data:
                for feature in geojson_data['features'] or []:
                    if not isinstance(feature, dict):
                        continue
                    # Prefer the feature's own properties; fall back to the hit metadata.
                    properties = feature.get('properties', {}) or metadata
                    add_feature_to_map(feature, properties, marker_cluster, m)
            elif geojson_data.get('type') == 'Feature':
                properties = geojson_data.get('properties', {}) or metadata
                add_feature_to_map(geojson_data, properties, marker_cluster, m)
            else:
                coordinates = get_representative_coordinates(geojson_data)
                if coordinates and len(coordinates) >= 2:
                    # Keys and values originate from OSM tags, i.e. free text written
                    # by contributors, so escape them before embedding in popup HTML.
                    popup_text = "<div style='font-size:14px;'>"
                    for key, value in metadata.items():
                        popup_text += f"<b>{html.escape(str(key))}:</b> {html.escape(str(value))}<br>"
                    popup_text += f"<b>Type:</b> {html.escape(str(geojson_data.get('type')))}<br>"
                    popup_text += "</div>"

                    folium.Marker(
                        location=[coordinates[1], coordinates[0]],
                        popup=popup_text
                    ).add_to(marker_cluster)

                folium.GeoJson(geojson_data).add_to(m)

        except (KeyError, IndexError, ValueError) as e:
            # Best effort: one malformed payload must not prevent the others from showing.
            print(f"Error processing GeoJSON: {e}")
            continue

    return m

def add_feature_to_map(
    feature: Dict[str, Any],
    properties: Dict[str, Any],
    marker_cluster: MarkerCluster,
    map_obj: folium.Map
) -> None:
    """Draw one GeoJSON Feature: its outline on the map plus a clustered marker.

    Args:
        feature: GeoJSON Feature dict; it is skipped when its ``geometry`` is
            missing, null or empty.
        properties: Key/value pairs listed in the marker popup.
        marker_cluster: Cluster that receives the marker.
        map_obj: Map that receives the GeoJSON layer.
    """
    geometry = feature.get('geometry')
    if not geometry:
        # GeoJSON allows a null geometry; there is nothing to draw for it.
        return

    coordinates = get_representative_coordinates(geometry)
    if coordinates:
        # Property values come from OSM tags (free text written by contributors),
        # so escape them before embedding them in the popup HTML.
        rows = "".join(
            f"<b>{html.escape(str(key))}:</b> {html.escape(str(value))}<br>"
            for key, value in properties.items()
        )
        popup_text = f"<div style='font-size:14px;'>{rows}</div>"

        folium.Marker(
            # GeoJSON stores (lon, lat); folium expects (lat, lon).
            location=[coordinates[1], coordinates[0]],
            popup=popup_text
        ).add_to(marker_cluster)

    folium.GeoJson(feature).add_to(map_obj)

def get_representative_coordinates(geometry: Dict[str, Any]) -> Optional[List[float]]:
    """Return the first position of a GeoJSON geometry, or ``None``.

    The nesting depth of ``coordinates`` depends on the geometry type, so the
    lookup walks down the matching number of list levels, always taking the
    first element. ``None`` is returned for a missing or incomplete geometry,
    an unsupported type, or an empty level on the way down.
    """
    if not geometry:
        return None
    if 'type' not in geometry or 'coordinates' not in geometry:
        return None

    kind: str = geometry['type']
    # Number of list levels wrapped around a single position for each type.
    if kind == 'Point':
        depth = 0
    elif kind in ('MultiPoint', 'LineString'):
        depth = 1
    elif kind in ('MultiLineString', 'Polygon'):
        depth = 2
    elif kind == 'MultiPolygon':
        depth = 3
    else:
        return None

    node: Any = geometry['coordinates']
    for _ in range(depth):
        if not node:
            return None
        node = node[0]
    return node


Try with some sample queries:

In [None]:
plot_buildings("semperoper")

In [None]:
plot_buildings("museums in dresden")

### Issues with using a single collection in the vector store

This works, but we do experience some issues when we have data for different queries in a single collection:
- different query types may require different similarity thresholds to cut off results
- query-by-type / query-by-name could potentially contain the same results and result in duplicates.

**Solution**:
- Create separate indexes for query-for-type / query-for-name
- Use a semantic router to route to the desired collection

In [None]:
!pip install -qU semantic-router

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.1/111.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.2/144.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Semantic Routing for queries to the best matching vector store collection

The semantic router uses sample utterances to match incoming queries based on semantic similarity.

In [None]:
from semantic_router import Route
from semantic_router.routers import SemanticRouter
from semantic_router.encoders import HuggingFaceEncoder

# Route for queries that name one specific building in Dresden.
# Both bare names ("Zwinger") and full sentences are listed as sample utterances.
specific_building = Route(
    name="specific_building",
    utterances=[
        "Tell me about the Zwinger Palace in Dresden",
        "Zwinger",
        "Semperoper",
        "Frauenkirche",
        "Where is the Frauenkirche located?",
        "Show me information on the Semperoper",
        "I want to find the Dresden Castle",
        "What can you tell me about the Military History Museum?",
        "Give me details on the Green Vault",
        "Find the Blue Wonder bridge",
        "Show me the Hygiene Museum",
        "Tell me about the Yenidze building",
        "Locate the Albertinum in Dresden",
    ],
)

# Route for queries that ask for a category of buildings (museums, hospitals, ...).
building_type = Route(
    name="building_type",
    utterances=[
        "List all museums in Dresden",
        "Find the hospitals in Dresden",
        "Show me art galleries in Dresden",
        "Are there any public libraries in Dresden?",
        "What schools are there in Dresden?",
        "Search for historical buildings in Dresden",
        "Look up churches in Dresden",
        "Which theaters are in Dresden?",
        "What kind of tourist attractions are in Dresden?",
        "Find universities in Dresden",
    ],
)

# Build the router from both routes.
routes = [specific_building, building_type]
# No model name is given, so the encoder falls back to the library's default model.
encoder = HuggingFaceEncoder()
# NOTE(review): auto_sync="local" presumably populates the local index with the routes
# at construction time — confirm against the semantic-router documentation.
rl = SemanticRouter(encoder=encoder, routes=routes, auto_sync="local")




The semantic router needs a bit of fine-tuning for our use case. The test samples below were generated by a GPT-4 model.

See the documentation [here](https://github.com/aurelio-labs/semantic-router/blob/main/docs/06-threshold-optimization.ipynb)

In [None]:
# Labelled (utterance, expected route name) pairs used to evaluate and tune the router.
# NOTE(review): several entries (e.g. "Semperoper", "Zwinger", "Frauenkirche",
# "List all museums in Dresden") are identical to utterances the routes were built
# from, so the reported accuracy is optimistic.
test_data = [
    # specific_building
    ("Tell me about the Zwinger Palace", "specific_building"),
    ("Where is the Frauenkirche located?", "specific_building"),
    ("I want info about Dresden Castle", "specific_building"),
    ("What's the history of the Semperoper?", "specific_building"),
    ("Show me details on the Green Vault", "specific_building"),
    ("I'm looking for the Hygiene Museum", "specific_building"),
    ("Find the Blue Wonder bridge", "specific_building"),
    ("Locate the Albertinum in Dresden", "specific_building"),
    ("Semperoper", "specific_building"),
    ("Zwinger", "specific_building"),
    ("Frauenkirche", "specific_building"),
    ("Military History Museum", "specific_building"),
    ("Yenidze", "specific_building"),
    ("Green Vault", "specific_building"),
    ("Hygiene Museum", "specific_building"),
    ("Albertinum", "specific_building"),
    ("Dresden Castle", "specific_building"),
    ("Can you tell me about the Yenidze building?", "specific_building"),
    ("What is the Blue Wonder in Dresden?", "specific_building"),
    ("Give me historical info on the Frauenkirche", "specific_building"),
    ("I'm curious about the Zwinger", "specific_building"),
    ("Tell me the story behind the Semperoper", "specific_building"),
    ("Where is the Military History Museum?", "specific_building"),
    ("I want to learn more about the Green Vault", "specific_building"),
    ("Show me the Albertinum", "specific_building"),
    ("Tell me what the Hygiene Museum is about", "specific_building"),
    ("I heard of Yenidze — what is it?", "specific_building"),
    ("Where can I find Dresden Castle?", "specific_building"),
    ("I’m looking for more on Zwinger Palace", "specific_building"),

    # building_type
    ("List all museums in Dresden", "building_type"),
    ("Museums", "building_type"),
    ("Hospitals", "building_type"),
    ("Art galleries", "building_type"),
    ("Libraries", "building_type"),
    ("Schools", "building_type"),
    ("Churches", "building_type"),
    ("Theaters", "building_type"),
    ("Tourist attractions", "building_type"),
    ("Are there any hospitals in Dresden?", "building_type"),
    ("Show me art galleries in the city", "building_type"),
    ("Search for public libraries in Dresden", "building_type"),
    ("Find all schools around Dresden", "building_type"),
    ("Are there any churches in the city?", "building_type"),
    ("What kind of theaters are in Dresden?", "building_type"),
    ("Universities in Dresden", "building_type"),
    ("Museums Dresden", "building_type"),
    ("Hospitals Dresden", "building_type"),
    ("Libraries in Dresden", "building_type"),
    ("Churches Dresden", "building_type"),
    ("Art galleries Dresden", "building_type"),
    ("Tourist attractions Dresden", "building_type"),
    ("Historic buildings Dresden", "building_type"),
    ("List cultural sites in Dresden", "building_type"),
    ("Where can I find schools in Dresden?", "building_type"),
    ("Are there any interesting museums in Dresden?", "building_type"),
    ("Dresden churches", "building_type"),
    ("Galleries in Dresden", "building_type"),
    ("Looking for universities in the Dresden area", "building_type"),
    ("Please show me public buildings in Dresden", "building_type"),
    ("Top attractions in Dresden", "building_type"),
    ("Show educational buildings in Dresden", "building_type"),
    ("Is there a hospital nearby in Dresden?", "building_type"),
    ("I need a list of all theaters in Dresden", "building_type"),
    ("Find me some tourist spots in Dresden", "building_type"),
    ("Government buildings Dresden", "building_type"),
    ("Kindergartens in Dresden", "building_type"),
    ("Which buildings in Dresden are open to the public?", "building_type"),
    ("Cultural heritage sites in Dresden", "building_type"),
    ("Monuments Dresden", "building_type"),
]

# Split the pairs into parallel tuples: X holds the utterances, y the expected route names.
X, y = zip(*test_data)

# Baseline: accuracy with the thresholds the router starts with.
accuracy = rl.evaluate(X=X, y=y)
print(f"Accuracy: {accuracy * 100:.2f}%")

route_thresholds = rl.get_thresholds()
print("Default route thresholds:", route_thresholds)

# Tune the per-route thresholds on the labelled samples.
# NOTE(review): fit and the evaluation below use the same X, y, so the second accuracy
# is a training score, not a held-out one. In the recorded output both the thresholds
# and the accuracy are unchanged by fit.
rl.fit(X=X, y=y)

route_thresholds = rl.get_thresholds()
print("Updated route thresholds:", route_thresholds)

# Re-evaluate with the tuned thresholds.
accuracy = rl.evaluate(X=X, y=y)
print(f"Accuracy: {accuracy * 100:.2f}%")

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 98.55%
Default route thresholds: {'specific_building': np.float64(0.2828282828282829), 'building_type': np.float64(0.22222222222222224)}


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/500 [00:00<?, ?it/s]

Updated route thresholds: {'specific_building': np.float64(0.2828282828282829), 'building_type': np.float64(0.22222222222222224)}


Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 98.55%


In [None]:
rl("hospitals")

RouteChoice(name='building_type', function_call=None, similarity_score=None)

Now we create separate collections:

In [None]:
# Create one collection per query scenario (or get them if they already exist).
# The embedding function is passed through the dedicated `embedding_function` argument:
# the output recorded for the earlier single-collection upsert shows Chroma's default
# all-MiniLM-L6-v2 model being downloaded, which indicates `ef` was not used to embed
# documents when it was only nested inside `configuration`.
collection_buildings_with_names = chroma_client.get_or_create_collection(
    name="buildings_with_names",
    embedding_function=ef,
)

collection_buildings_grouped_by_type = chroma_client.get_or_create_collection(
    name="buildings_grouped_by_type",
    embedding_function=ef,
)

# Named features go into the collection that serves query-by-name.
collection_buildings_with_names.upsert(
    documents=documents_features_w_names,
    metadatas=metadatas_features_w_names,
    ids=ids_features_w_names
)

# Grouped features go into the collection that serves query-by-type.
collection_buildings_grouped_by_type.upsert(
    documents=documents_features_grouped,
    metadatas=metadatas_features_grouped,
    ids=ids_features_grouped
)


We add the semantic router as a layer before our actual semantic search

In [None]:
import folium
from folium.plugins import MarkerCluster
from typing import Optional, Dict, Any, List, Union

QueryResult = Dict[str, Union[List[Any], List[List[Dict[str, Any]]]]]

def query_with_route(query: str, rl: SemanticRouter) -> QueryResult:
    """
    Pick the collection to search by letting the semantic router classify the query.

    A query routed to "specific_building" is answered from the named-buildings
    collection (5 results); every other outcome is answered from the collection
    grouped by building type (3 results). The chosen path is printed.
    """
    route_name = rl(query).name
    if route_name != "specific_building":
        print("Query by type")
        return collection_buildings_grouped_by_type.query(query_texts=[query], n_results=3)

    print("Query by name")
    return collection_buildings_with_names.query(query_texts=[query], n_results=5)

def plot_buildings_enhanced(query: str, rl: SemanticRouter) -> Optional[folium.Map]:
    """
    Route the query to the best matching collection and plot the hits on a map.

    The results come from ``query_with_route``. Each hit is expected to carry a
    GeoJSON string under the ``geometry`` metadata key: a FeatureCollection, a
    single Feature or a bare geometry. Features are delegated to
    ``add_feature_to_map``; a bare geometry gets a marker whose popup lists the
    hit's remaining metadata.

    Args:
        query: Free-text search string.
        rl: Semantic router used to choose the collection.

    Returns:
        A ``folium.Map`` centred on the first drawable hit, or ``None`` (after
        printing the reason) when no hit provides usable geometry.
    """
    results = query_with_route(query, rl)
    # Metadata comes back as one list per query text; exactly one text was sent.
    metadata_batches = results.get('metadatas') or [[]]
    retrieved_metadatas: List[Dict[str, Any]] = [md or {} for md in metadata_batches[0]]

    # Look at every hit: a usable geometry further down the ranking must not be
    # thrown away just because the top hit has none.
    if not any(md.get('geometry') for md in retrieved_metadatas):
        print("No geometries found in the results.")
        return None

    # Decode each payload once, paired with the metadata shown in popups.
    decoded: List[Any] = []
    for md in retrieved_metadatas:
        geojson_str = md.get('geometry')
        if not geojson_str:
            continue
        try:
            geojson_data = json.loads(geojson_str)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Error decoding GeoJSON: {e}")
            continue
        if not isinstance(geojson_data, dict):
            print(f"Error decoding GeoJSON: expected an object, got {type(geojson_data).__name__}")
            continue
        metadata: Dict[str, Any] = {k: v for k, v in md.items() if k != 'geometry'}
        decoded.append((geojson_data, metadata))

    # Centre the map on the first geometry that yields a coordinate pair.
    center: Optional[List[float]] = None
    for geojson_data, _ in decoded:
        if geojson_data.get('type') == 'FeatureCollection' and 'features' in geojson_data:
            # Scan all features; indexing [0] would raise on an empty collection.
            candidates = [
                feature.get('geometry')
                for feature in geojson_data['features'] or []
                if isinstance(feature, dict)
            ]
        elif geojson_data.get('type') == 'Feature':
            candidates = [geojson_data.get('geometry')]
        else:
            candidates = [geojson_data]

        for geometry in candidates:
            if not isinstance(geometry, dict):
                continue
            coordinates = get_representative_coordinates(geometry)
            if coordinates and len(coordinates) >= 2:
                # GeoJSON stores (lon, lat); folium expects (lat, lon).
                center = [coordinates[1], coordinates[0]]
                break
        if center is not None:
            break

    if center is None:
        print("Could not create map: no valid geometries found.")
        return None

    m = folium.Map(location=center, zoom_start=14)
    marker_cluster: MarkerCluster = MarkerCluster().add_to(m)

    for geojson_data, metadata in decoded:
        try:
            if geojson_data.get('type') == 'FeatureCollection' and 'features' in geojson_data:
                for feature in geojson_data['features'] or []:
                    if not isinstance(feature, dict):
                        continue
                    # Prefer the feature's own properties; fall back to the hit metadata.
                    properties = feature.get('properties', {}) or metadata
                    add_feature_to_map(feature, properties, marker_cluster, m)
            elif geojson_data.get('type') == 'Feature':
                properties = geojson_data.get('properties', {}) or metadata
                add_feature_to_map(geojson_data, properties, marker_cluster, m)
            else:
                coordinates = get_representative_coordinates(geojson_data)
                if coordinates and len(coordinates) >= 2:
                    # Keys and values originate from OSM tags, i.e. free text written
                    # by contributors, so escape them before embedding in popup HTML.
                    popup_text = "<div style='font-size:14px;'>"
                    for key, value in metadata.items():
                        popup_text += f"<b>{html.escape(str(key))}:</b> {html.escape(str(value))}<br>"
                    popup_text += f"<b>Type:</b> {html.escape(str(geojson_data.get('type')))}<br>"
                    popup_text += "</div>"

                    folium.Marker(
                        location=[coordinates[1], coordinates[0]],
                        popup=popup_text
                    ).add_to(marker_cluster)

                folium.GeoJson(geojson_data).add_to(m)

        except (KeyError, IndexError, ValueError) as e:
            # Best effort: one malformed payload must not prevent the others from showing.
            print(f"Error processing GeoJSON: {e}")
            continue

    return m


### Results with separate collections and semantic router

In [None]:
plot_buildings_enhanced(query="hospitals", rl=rl)

Query by type


In [None]:
plot_buildings_enhanced(query="deutsches hygiene museum", rl=rl)

Query by name


In [None]:
plot_buildings_enhanced(query="Bahnhof neustadt", rl=rl)

Query by name
