In [None]:
# Install OSMnx with the %pip magic so the package lands in the environment of
# the running kernel (a bare `!pip` may resolve to a different interpreter).
# The version floor mirrors the ">=1.7" requirement stated by the ingestion cell.
%pip install "osmnx>=1.7"




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\malua\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
"""
01_data_ingestion_FIXED.py
Works with new OSMnx (>=1.7), fixes centroid warnings and list/array errors.
"""

import os
import osmnx as ox
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# --- Run configuration -------------------------------------------------
# Place name handed to the OSM download step.
STUDY_AREA = "Bengaluru, India"

# Folder (relative to the working directory) that receives the export, and
# the GeoJSON file written inside it.
OUTPUT_DIR = "data"
OUTPUT_GEOJSON = os.path.join(OUTPUT_DIR, "geodata.geojson")

# Create the output folder up front; an already existing folder is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --------------------------
# Download OSM (new API)
# --------------------------
def download_osm(place):
    """Fetch the three OSM layers used by the pipeline for ``place``.

    Args:
        place: Place name passed unchanged to the OSMnx ``*_from_place`` calls.

    Returns:
        A ``(buildings, pois, roads)`` tuple: features tagged ``building``,
        features tagged ``amenity``, and the edge table of the drivable
        street graph.
    """
    print("Downloading OSM layers for:", place)

    # Feature layers: any element carrying the tag, whatever its value.
    building_features = ox.features_from_place(place, tags={"building": True})
    amenity_features = ox.features_from_place(place, tags={"amenity": True})

    # Roads come from the street graph; only its edges are kept.
    drive_graph = ox.graph_from_place(place, network_type="drive")
    road_edges = ox.graph_to_gdfs(drive_graph, nodes=False, edges=True)

    return building_features, amenity_features, road_edges



# -----------------------------------------
# Clean layers → keep only selected columns
# -----------------------------------------
def clean_layer(gdf, layer_name):
    """Reduce a layer to the shared schema and tag it with its layer name.

    Args:
        gdf: Table holding a ``geometry`` column and any subset of the
            ``name`` / ``amenity`` / ``building`` / ``highway`` columns.
        layer_name: Value stored in the new ``layer`` column of every row.

    Returns:
        A copy of ``gdf`` containing only the recognised columns (in their
        original order) plus ``layer``; the attribute columns are cast to
        ``str`` and rows without a geometry are removed. The input is not
        modified.
    """
    text_fields = ("name", "amenity", "building", "highway")
    wanted = set(text_fields) | {"geometry"}

    # Keep the input's own column order while discarding everything else.
    trimmed = gdf[[column for column in gdf.columns if column in wanted]].copy()

    # Cast attribute columns to plain strings (works around the
    # "ambiguous truth value" error seen with non-scalar cell values).
    for field in text_fields:
        if field not in trimmed.columns:
            continue
        trimmed[field] = trimmed[field].astype(str)

    trimmed["layer"] = layer_name
    return trimmed.dropna(subset=["geometry"])



# -----------------------------------------
# Build clean text description
# -----------------------------------------
def build_text(row):
    """Compose the one-line text description of a feature.

    Args:
        row: Mapping-like record exposing ``.get`` for the optional
            ``name`` / ``amenity`` / ``building`` / ``highway`` attributes
            and item access for the numeric ``lat`` and ``lon`` values.

    Returns:
        The populated attributes as ``"<Label>: <value>"`` fragments followed
        by the location fragment, joined with ``". "``. Attributes that are
        missing, blank, or the literal string ``"nan"`` are left out.
    """
    # (label shown in the text, key looked up on the row), in output order.
    labelled_fields = (
        ("Name", "name"),
        ("Amenity", "amenity"),
        ("Building type", "building"),
        ("Road type", "highway"),
    )

    fragments = []
    for label, key in labelled_fields:
        # Everything is stringified first, so a float NaN shows up as "nan".
        value = str(row.get(key, "")).strip()
        if value in ("", "nan"):
            continue
        fragments.append(f"{label}: {value}")

    fragments.append(f"Location: lat {row['lat']:.6f}, lon {row['lon']:.6f}")
    return ". ".join(fragments)



# -----------------------------------------
# MAIN
# -----------------------------------------
def main():
    """Run the ingestion pipeline end to end and write the GeoJSON export.

    Steps: download the building, POI and road layers for ``STUDY_AREA``,
    reduce each to the shared schema, merge them into one table, attach a
    representative lon/lat per feature plus a text description, and save the
    result to ``OUTPUT_GEOJSON``. Prints the output path and feature count.
    """
    g_buildings, g_pois, g_roads = download_osm(STUDY_AREA)

    g_buildings = clean_layer(g_buildings, "building")
    g_pois = clean_layer(g_pois, "poi")
    g_roads = clean_layer(g_roads, "road")

    # Merge all
    combined = pd.concat([g_buildings, g_pois, g_roads], ignore_index=True)
    gdf_all = gpd.GeoDataFrame(combined, crs="EPSG:4326")

    # Centroids are taken in a projected CRS (this is what silences the
    # geographic-CRS centroid warning) and then converted back to lon/lat.
    # The projected centroid series is built ONCE and reused for both
    # coordinates instead of recomputing centroid + reprojection per column.
    centroids = gdf_all.to_crs("EPSG:3857").centroid.to_crs(4326)
    gdf_all["lon"] = centroids.x
    gdf_all["lat"] = centroids.y

    # build_text reads the lat/lon columns, so it must run after they exist.
    gdf_all["text"] = gdf_all.apply(build_text, axis=1)

    gdf_all.to_file(OUTPUT_GEOJSON, driver="GeoJSON")
    print("Saved:", OUTPUT_GEOJSON)
    print("Total features:", len(gdf_all))


if __name__ == "__main__":
    main()


Downloading OSM layers for: Bengaluru, India


  polygon = gdf_place["geometry"].unary_union
  polygon = gdf_place["geometry"].unary_union


Saved: data\geodata.geojson
Total features: 1110365


In [None]:
"""
05_evaluation.py
Simple evaluation helpers: Recall@k for retrieval given a ground truth set.
This is a helper — you must prepare ground truth pairs (query -> ground truth ids).
"""

import pandas as pd
import numpy as np

def semantic_search(query, top_k=5):
    """Stand-in for the real semantic search; returns dummy results.

    Exists only so this cell runs without the actual search module
    (avoids a ModuleNotFoundError).

    Args:
        query: Query text; it is only echoed in the printed message.
        top_k: Number of dummy results to produce.

    Returns:
        A list of ``top_k`` dicts ``{"id": i}`` with ``i`` counting up from 0.
    """
    notice = f"Performing dummy semantic search for query: '{query}' with top_k={top_k}"
    print(notice)

    dummy_results = []
    for rank in range(top_k):
        dummy_results.append({"id": rank})
    return dummy_results

def recall_at_k(query, ground_truth_ids, k=5):
    """Compute Recall@k of ``semantic_search`` for a single query.

    Args:
        query: Query text forwarded to ``semantic_search``.
        ground_truth_ids: Iterable of hashable ids that are correct answers
            for ``query``. Repeated ids are counted once.
        k: Cutoff rank; only the first ``k`` retrieved results are scored.

    Returns:
        ``(recall, retrieved_ids)`` where ``recall`` is the fraction of
        distinct ground-truth ids found among the first ``k`` results
        (``0.0`` when no ground truth is given) and ``retrieved_ids`` is the
        list of those at most ``k`` ids in ranked order.
    """
    results = semantic_search(query, top_k=k)

    # Enforce the cutoff locally: a backend that returns more than k rows
    # must not have its extra rows counted as hits for Recall@k.
    retrieved_ids = [result["id"] for result in list(results)[:k]]

    # Deduplicate so a ground-truth id listed twice cannot be counted twice
    # in both the hit count and the denominator.
    relevant = set(ground_truth_ids)
    if not relevant:
        return 0.0, retrieved_ids

    hits = len(relevant.intersection(retrieved_ids))
    return hits / len(relevant), retrieved_ids

# Example usage (you need to create ground truth lists).
# The query and ids below are illustrative only. With the dummy
# semantic_search defined in this cell (ids 0..k-1), neither 12 nor 34 can be
# retrieved at k=10, so the printed recall is 0.0.
if __name__ == "__main__":
    q = "nearest school to MG Road"
    gt = [12, 34]  # replace with real IDs from metadata.csv that are true answers
    # k=10 here must stay in sync with the literal 10 in the print below.
    r, ids = recall_at_k(q, gt, k=10)
    print("Recall@", 10, "=", r, "retrieved ids:", ids)



Performing dummy semantic search for query: 'nearest school to MG Road' with top_k=10
Recall@ 10 = 0.0 retrieved ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
