<a href="https://colab.research.google.com/github/anhpdd/ml-property-valuation-klang-valley/blob/main/notebooks/2_1_Amenity_OSM_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install dependencies
!pip install -q overpy

# Core libraries
import pandas as pd
import geopandas as gpd
import numpy as np

# Geospatial
import ast
import overpy
from shapely.geometry import Point, Polygon, LineString

# Visualization
import plotly.express as px

# HTML requests
import requests
import xml.etree.ElementTree as ET

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Query Amenity by AREA OSM ID

In [3]:
def build_query_by_area_id(area_ids: dict, categories: dict) -> str:
    """
    Build one Overpass QL query covering every category in every area.

    A single ``nwr`` (node / way / relation) statement is generated for each
    (area, category) pair, and all statements are placed in one union.

    Args:
        area_ids: Mapping of area name -> OSM relation ID (int or numeric
            string). Only the IDs are used; the names are ignored.
        categories: Mapping of category name -> dict of OSM tag key/value
            pairs. Every pair of a category is added as a filter, so all of
            them must match.

    Returns:
        The query text. It requests JSON output with a 240 second timeout,
        recurses down from the matched elements and ends with ``out center;``.

    Raises:
        ValueError: If a relation ID cannot be converted with ``int()``.
    """
    statements = []

    for rel_id in area_ids.values():
        # The area filter takes the relation ID offset by 3,600,000,000.
        area_id_for_query = int(rel_id) + 3600000000

        for tag_dict in categories.values():
            # One ["key"="value"] filter per tag pair, concatenated.
            tag_filters = ''.join(f'["{k}"="{v}"]' for k, v in tag_dict.items())

            # Query for nodes, ways, and relations
            statements.append(f'  nwr{tag_filters}(area:{area_id_for_query});')

    # Joined outside the f-string on purpose: a backslash inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    union_body = ''.join(f'{statement}\n' for statement in statements)

    return (
        "\n"
        "[out:json][timeout:240];\n"
        "(\n"
        f"{union_body}\n"
        ");\n"
        "(._;>>;);\n"
        "out center;\n"
    )

def parse_osm_element(element, categories: dict) -> dict:
    """
    Flatten one overpy result element into a plain record.

    Args:
        element: An ``overpy.Node``, ``overpy.Way`` or ``overpy.Relation``.
        categories: Mapping of category name -> dict of required tag pairs.

    Returns:
        dict with the keys ``osm_id``, ``osm_type``, ``name``, ``category``,
        ``tags`` and ``geometry_coords``.

        * ``category`` is the first entry of ``categories`` whose tag pairs
          all equal the element's tags, or ``"Unknown"`` when none does.
        * ``geometry_coords`` is a ``(lat, lon)`` tuple for a node, a list of
          ``(lat, lon)`` tuples for a way with nodes, the centre
          ``(lat, lon)`` for a relation that carries one, otherwise ``None``.
    """
    # First category (in dict order) whose required tags all match wins.
    matched_category = next(
        (
            label
            for label, required_tags in categories.items()
            if all(
                element.tags.get(key) == value
                for key, value in required_tags.items()
            )
        ),
        "Unknown",
    )

    coords = None
    if isinstance(element, overpy.Node):
        coords = (float(element.lat), float(element.lon))
    elif isinstance(element, overpy.Way):
        way_nodes = element.nodes
        if way_nodes:
            coords = [(float(n.lat), float(n.lon)) for n in way_nodes]
    elif isinstance(element, overpy.Relation):
        # Relations only get coordinates when a centre point is available.
        if getattr(element, 'center_lat', None) is not None:
            coords = (float(element.center_lat), float(element.center_lon))

    record = {
        "osm_id": element.id,
        "osm_type": type(element).__name__.lower(),
        "name": element.tags.get("name", "N/A"),
        "category": matched_category,
        "tags": dict(element.tags),
        "geometry_coords": coords,
    }
    return record

## Configuration

In [4]:
# District OSM relation IDs for Klang Valley, keyed by district name.
# The IDs are stored as numeric strings; build_query_by_area_id() converts
# each one with int() before building the area filter.
DISTRICTS = {
    'GOMBAK': '12438352',
    'HULU LANGAT': '12438351',
    'KLANG': '12391135',
    'HULU SELANGOR': '10714199',
    'KUALA LANGAT': '10743362',
    'KUALA LUMPUR': '2939672',
    'KUALA SELANGOR': '10714137',
    'PETALING': '12391134',
    'PUTRAJAYA': '4443881',
    'SABAK BERNAM': '10714136',
    'SEPANG': '10743315'
}

# POI categories to extract: category label -> OSM tag pairs.
# All pairs of an entry must match for an element to receive the label
# (see 'Lake', which requires two tags).
# NOTE(review): 'Mall' is mapped to shop=supermarket, so the "Mall" label
# collects supermarkets - confirm this is the intended proxy.
POI_CATEGORIES = {
    'School': {'amenity': 'school'},
    'Mall': {'shop': 'supermarket'},
    'Park': {'leisure': 'park'},
    'River': {'waterway': 'river'},
    'Lake': {'natural': 'water', 'water': 'lake'}
}

## Query Overpass API

In [5]:
# Build one combined query for every (district, category) pair and run it
# against the default Overpass endpoint. This is a network call and may take
# a while; the query itself asks for a 240 second server-side timeout.
query = build_query_by_area_id(DISTRICTS, POI_CATEGORIES)
print(f"📡 Querying Overpass API for {len(POI_CATEGORIES)} POI types across {len(DISTRICTS)} districts...")

api = overpy.Overpass()
result = api.query(query)
# Only reached when api.query() did not raise.
print("✅ Query successful!")

üì° Querying Overpass API for 5 POI types across 11 districts...
‚úÖ Query successful!


## Parse Results

In [6]:
# Extract POI data from all element types. Elements that matched no configured
# category, or that have no usable coordinates, are skipped.
all_pois = []
for elements in (result.nodes, result.ways, result.relations):
    for element in elements:
        poi = parse_osm_element(element, POI_CATEGORIES)
        if poi['category'] != 'Unknown' and poi['geometry_coords'] is not None:
            all_pois.append(poi)

# Pin the columns (same order as the records produced by parse_osm_element)
# so the frame keeps its schema even when no POI was found; otherwise later
# cells that index pois_df['geometry_coords'] fail with a KeyError.
pois_df = pd.DataFrame(
    all_pois,
    columns=['osm_id', 'osm_type', 'name', 'category', 'tags', 'geometry_coords'],
)
print(f"✅ Parsed {len(pois_df):,} POIs\n")

‚úÖ Parsed 6,653 POIs



## Visualization

In [7]:
if not pois_df.empty:
    # Reduce every POI to a single (lat, lon) pair so it can be drawn as a
    # marker on the scatter map.
    plot_data = []
    for _, row in pois_df.iterrows():
        coords = row['geometry_coords']

        # Handle different coordinate formats
        if isinstance(coords, list) and coords:
            # List of (lat, lon) vertices: the first vertex is used as the marker.
            lat, lon = coords[0]
        elif isinstance(coords, tuple):
            # Single (lat, lon) pair.
            lat, lon = coords
        else:
            continue

        plot_data.append({
            'lat': lat,
            'lon': lon,
            'name': row['name'],
            'category': row['category'],
            'osm_id': row['osm_id']
        })

    plot_df = pd.DataFrame(plot_data)

    # Create interactive map
    fig = px.scatter_mapbox(
        plot_df,
        lat='lat',
        lon='lon',
        color='category',
        hover_name='name',
        hover_data=['osm_id', 'category'],
        title=f'POIs in Klang Valley ({len(plot_df):,} locations)',
        zoom=9,
        height=700
    )
    fig.update_layout(mapbox_style='carto-positron', legend_title_text='POI Type')
    fig.show()

    # Display sample
    print("\n📊 Sample data:")
    print(pois_df[['osm_id', 'name', 'category', 'osm_type']].head(10))

    # Summary by category
    print("\n📈 POIs by category:")
    print(pois_df['category'].value_counts())
else:
    print("⚠️  No POIs found")


üìä Sample data:
      osm_id                                      name category osm_type
0  158731933                                HeroMarket     Mall     node
1  251340254                             TF Value-Mart     Mall     node
2  284005497                                 The Store     Mall     node
3  295130264                              99 Speedmart     Mall     node
4  305358689                       Cambridge Al-Ikhlas   School     node
5  310462382                          SMK Tasik Kesuma   School     node
6  313042272                              99 Speedmart     Mall     node
7  316256514  Institut Latihan Pihak Berkuasa Tempatan   School     node
8  343202973                                     Mydin     Mall     node
9  391716374                       Sekolah Islam Hira'   School     node

üìà POIs by category:
category
School    1735
River     1646
Park      1546
Mall      1377
Lake       349
Name: count, dtype: int64


# Get Train Data through OSM ID

In [8]:
def get_train_data(train_id):
    """
    Download an OSM route relation and tabulate its node members.

    Every ``member`` of type ``node`` becomes one row. Its coordinates, name
    and ``ref`` come from ``get_node_coordinates`` (one extra request per
    node); the relation's ``alt_name``, ``name``, ``from`` and ``to`` tags are
    repeated on each of its rows.

    Args:
        train_id (str): The OpenStreetMap relation ID for the train route.

    Returns:
        pandas.DataFrame: Columns ``Stop ID``, ``Alt Name``, ``Lat``, ``Lon``,
        ``Station Name``, ``Station ID``, ``Name``, ``Start``, ``End``.
        ``None`` is returned (after printing a message) on a request error,
        an XML parse error or any other exception.
    """
    url = f'https://www.openstreetmap.org/api/0.6/relation/{train_id}'
    column_order = (
        'Stop ID', 'Alt Name', 'Lat', 'Lon',
        'Station Name', 'Station ID', 'Name', 'Start', 'End',
    )

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # 4xx / 5xx become HTTPError
        root = ET.fromstring(response.content)

        rows = []
        for relation in root.findall('relation'):
            # Resolve every node member of this relation first.
            stops = []
            for member in relation.findall('member'):
                if member.get('type') != 'node':
                    continue
                ref = member.get('ref')
                lat, lon, station_name, station_id = get_node_coordinates(ref)
                stops.append((ref, lat, lon, station_name, station_id))

            # Relation-level tags; a missing tag yields None.
            relation_tags = {
                tag.get('k'): tag.get('v') for tag in relation.findall('tag')
            }

            # Stamp the relation's tags onto each of its stops.
            for ref, lat, lon, station_name, station_id in stops:
                rows.append({
                    'Stop ID': ref,
                    'Alt Name': relation_tags.get('alt_name'),
                    'Lat': lat,
                    'Lon': lon,
                    'Station Name': station_name,
                    'Station ID': station_id,
                    'Name': relation_tags.get('name'),
                    'Start': relation_tags.get('from'),
                    'End': relation_tags.get('to'),
                })

        # Column-wise lists, in the fixed column order.
        return pd.DataFrame(
            {column: [row[column] for row in rows] for column in column_order}
        )

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for train ID {train_id}: {e}")
    except ET.ParseError as e:
        print(f"Error parsing XML for train ID {train_id}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for train ID {train_id}: {e}")
    return None

def get_node_coordinates(node_id):
    """
    Look up one OSM node and return its position plus two of its tags.

    Args:
        node_id (str): The OpenStreetMap node ID.

    Returns:
        tuple: ``(latitude, longitude, station name, station ID)``.
        Latitude and longitude are floats, or ``None`` when the attribute is
        missing or empty. Station name and station ID are the node's ``name``
        and ``ref`` tag values, or ``None`` when absent. All four items are
        ``None`` when the response has no ``node`` element, or when the
        request, the XML parsing or the float conversion fails (a message is
        printed in the failure case).
    """
    url = f"https://api.openstreetmap.org/api/0.6/node/{node_id}"
    nothing = (None, None, None, None)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        node = ET.fromstring(response.content).find('node')
        if node is None:
            return nothing

        raw_lat = node.get('lat')
        raw_lon = node.get('lon')
        node_tags = {tag.get('k'): tag.get('v') for tag in node.findall('tag')}

        latitude = float(raw_lat) if raw_lat else None
        longitude = float(raw_lon) if raw_lon else None
        return (latitude, longitude, node_tags.get('name'), node_tags.get('ref'))

    except (requests.RequestException, ET.ParseError, ValueError) as e:
        print(f"Error processing node {node_id}: {str(e)}")
        return nothing

## FETCH TRAIN LINE DATA

In [11]:
# RapidKL train lines (OSM relation IDs)
TRAIN_LINES = {
    'LRT3 Shah Alam': 8394085,
    'LRT Kelana Jaya': 8000438,
    'LRT Sri Petaling': 8000387,
    'LRT Ampang': 8000297,
    'MRT Kajang': 5690837,
    'MRT Putrajaya': 11313577,
    'BRT Sunway Line': 11549145
}

# Fetch station data for each line. get_train_data() returns None on failure,
# so a failed line is reported and skipped instead of aborting the loop.
all_stations = []
for line_name, train_id in TRAIN_LINES.items():
    print(f"📡 Fetching {line_name} (ID: {train_id})...", end=' ')

    df = get_train_data(str(train_id))

    if df is not None:
        df['train_id'] = train_id
        df['line_name'] = line_name
        all_stations.append(df)
        print(f"✅ {len(df)} stations")
    else:
        print("❌ Failed")

# pd.concat() raises an opaque "No objects to concatenate" on an empty list;
# fail with an actionable message instead.
if not all_stations:
    raise RuntimeError(
        "No train line could be fetched from the OpenStreetMap API; "
        "check network access and the relation IDs in TRAIN_LINES."
    )

# Combine all lines
train_stations = pd.concat(all_stations, ignore_index=True)

# Create Point geometries from coordinates. Shapely expects (x, y), i.e.
# (lon, lat). Rows without a coordinate get None and are dropped below.
train_stations['geometry'] = train_stations.apply(
    lambda row: Point(row['Lon'], row['Lat'])
    if pd.notna(row['Lat']) and pd.notna(row['Lon'])
    else None,
    axis=1
)

# Remove invalid coordinates
train_stations = train_stations.dropna(subset=['geometry']).reset_index(drop=True)

# Report the number of lines actually fetched, not merely configured.
print(f"\n✅ Total: {len(train_stations)} stations across {len(all_stations)} lines")
train_stations

üì° Fetching LRT3 Shah Alam (ID: 8394085)... ‚úÖ 20 stations
üì° Fetching LRT Kelana Jaya (ID: 8000438)... ‚úÖ 37 stations
üì° Fetching LRT Sri Petaling (ID: 8000387)... ‚úÖ 29 stations
üì° Fetching LRT Ampang (ID: 8000297)... ‚úÖ 18 stations
üì° Fetching MRT Kajang (ID: 5690837)... ‚úÖ 29 stations
üì° Fetching MRT Putrajaya (ID: 11313577)... ‚úÖ 36 stations
üì° Fetching BRT Sunway Line (ID: 11549145)... ‚úÖ 7 stations

‚úÖ Total: 176 stations across 7 lines


Unnamed: 0,Stop ID,Alt Name,Lat,Lon,Station Name,Station ID,Name,Start,End,train_id,line_name,geometry
0,10605496579,LRT3,2.976366,101.459304,Johan Setia,SA26,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4593038 2.9763664)
1,10605496582,LRT3,2.993168,101.445962,Bandar Bukit Tinggi,SA24,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4459616 2.9931685)
2,10605496584,LRT3,3.005442,101.441877,Klang Jaya,SA23,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4418774 3.0054418)
3,10605496586,LRT3,3.016073,101.440726,Sri Andalas,SA22,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4407261 3.016073)
4,10605496587,LRT3,3.026846,101.442338,Taman Selatan,SA21,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4423378 3.0268463)
...,...,...,...,...,...,...,...,...,...,...,...,...
171,11267149848,,3.065489,101.608609,SB4 SunMed,,BRT Sunway Line,Sunway - Setia Jaya,USJ 7,11549145,BRT Sunway Line,POINT (101.6086094 3.0654889)
172,11272247809,,3.070711,101.610732,SB3 Sunway Lagoon,,BRT Sunway Line,Sunway - Setia Jaya,USJ 7,11549145,BRT Sunway Line,POINT (101.6107323 3.0707114)
173,11273547895,,3.076147,101.610223,SB2 Mentari,,BRT Sunway Line,Sunway - Setia Jaya,USJ 7,11549145,BRT Sunway Line,POINT (101.6102234 3.0761465)
174,11273547901,,3.082956,101.612252,SB1 Sunway-Setia Jaya,,BRT Sunway Line,Sunway - Setia Jaya,USJ 7,11549145,BRT Sunway Line,POINT (101.6122517 3.0829564)


In [12]:
# Wrap the station table in a GeoDataFrame (EPSG:4326) and keep only the
# first row for each distinct (Lat, Lon) pair.
train_gdf = gpd.GeoDataFrame(
    train_stations,
    geometry='geometry',
    crs='EPSG:4326',
).drop_duplicates(subset=['Lat', 'Lon'])

# Rename the ID / name columns to the names used by the amenity table, then
# add the constant feature_type and osm_type labels needed for the merge.
train_gdf = train_gdf.rename(
    columns={'Stop ID': 'osm_id', 'Station Name': 'name'}
).assign(
    feature_type='Rail Station',
    osm_type='node',
)

# Inspect dataframe
train_gdf.head()

Unnamed: 0,osm_id,Alt Name,Lat,Lon,name,Station ID,Name,Start,End,train_id,line_name,geometry,feature_type,osm_type
0,10605496579,LRT3,2.976366,101.459304,Johan Setia,SA26,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.4593 2.97637),Rail Station,node
1,10605496582,LRT3,2.993168,101.445962,Bandar Bukit Tinggi,SA24,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.44596 2.99317),Rail Station,node
2,10605496584,LRT3,3.005442,101.441877,Klang Jaya,SA23,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.44188 3.00544),Rail Station,node
3,10605496586,LRT3,3.016073,101.440726,Sri Andalas,SA22,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.44073 3.01607),Rail Station,node
4,10605496587,LRT3,3.026846,101.442338,Taman Selatan,SA21,Shah Alam Line (Johan Setia --> Bandar Utama),Johan Setia,Bandar Utama,8394085,LRT3 Shah Alam,POINT (101.44234 3.02685),Rail Station,node


# CLEAN & VALIDATE GEOMETRIES

In [13]:
def parse_geometry(row):
    """
    Convert a POI row's stored coordinates into a Shapely geometry.

    The value in ``row['geometry_coords']`` is stringified and re-read with
    ``ast.literal_eval``, so both Python objects and their string form are
    accepted. Stored coordinates are ``(lat, lon)``; Shapely geometries are
    built as ``(lon, lat)``.

    Result by ``row['osm_type']``:
        * ``'node'``: ``Point``.
        * ``'relation'``: ``Point`` when the value is a 2-tuple, else ``None``.
        * ``'way'``: ``Polygon`` when there are at least 4 vertices and the
          first equals the last; ``LineString`` when there are at least 2
          vertices; otherwise ``None``.
        * anything else: ``None``.

    ``None`` is also returned when the coordinates are ``None``, the string
    ``'nan'`` (any case), or when parsing / construction raises.
    """
    raw_coords = row['geometry_coords']
    element_kind = row['osm_type']

    # Missing values: real None, or a stringified NaN.
    if raw_coords is None:
        return None
    if isinstance(raw_coords, str) and raw_coords.lower() == 'nan':
        return None

    try:
        parsed = ast.literal_eval(str(raw_coords))

        if element_kind == 'node':
            # (lat, lon) -> Point(lon, lat)
            return Point(parsed[1], parsed[0])

        if element_kind == 'relation':
            # Only a single centre pair (from "out center;") is supported.
            has_center = isinstance(parsed, tuple) and len(parsed) == 2
            return Point(parsed[1], parsed[0]) if has_center else None

        if element_kind == 'way':
            # [(lat, lon), ...] -> [(lon, lat), ...]
            lon_lat = [(pair[1], pair[0]) for pair in parsed]
            vertex_count = len(lon_lat)

            if vertex_count >= 4 and lon_lat[0] == lon_lat[-1]:
                # Closed ring -> area
                return Polygon(lon_lat)
            if vertex_count >= 2:
                # Open way -> line
                return LineString(lon_lat)
            return None

        # Unrecognised element type
        return None

    except Exception:
        # Best effort: anything unparseable yields no geometry.
        return None

In [14]:
# Parse geometries
missing_coords = pois_df['geometry_coords'].isna().sum()
print(f"⚠️  Missing coordinates: {missing_coords}/{len(pois_df)}")

# Built row by row into a list so the assignment does not rely on the shape
# DataFrame.apply(axis=1) returns for a frame without rows.
pois_df['geometry'] = [parse_geometry(row) for _, row in pois_df.iterrows()]

# Remove invalid geometries (parse_geometry returns None for those)
poi_clean = pois_df.dropna(subset=['geometry']).reset_index(drop=True)
removed = len(pois_df) - len(poi_clean)

# Guard the percentage: an empty pois_df would otherwise divide by zero.
valid_pct = len(poi_clean) / len(pois_df) * 100 if len(pois_df) else 0.0
print(f"✅ Valid geometries: {len(poi_clean)}/{len(pois_df)} ({valid_pct:.1f}%)")
if removed > 0:
    print(f"🗑️  Removed: {removed} invalid geometries")

# Create GeoDataFrame; 'category' becomes 'feature_type' to match the column
# name used for the train stations.
amenity_gdf = gpd.GeoDataFrame(
    poi_clean,
    geometry='geometry',
    crs='EPSG:4326'
).rename(columns={'category': 'feature_type'})

print(f"\n📊 Geometry types:\n{amenity_gdf.geometry.geom_type.value_counts()}")
print(f"\n📊 Feature distribution:\n{amenity_gdf['feature_type'].value_counts()}")

‚ö†Ô∏è  Missing coordinates: 0/6653
‚úÖ Valid geometries: 6653/6653 (100.0%)

üìä Geometry types:
Polygon       3322
Point         1694
LineString    1637
Name: count, dtype: int64

üìä Feature distribution:
feature_type
School    1735
River     1646
Park      1546
Mall      1377
Lake       349
Name: count, dtype: int64


# COMBINE AMENITIES + TRAIN STATIONS

In [15]:
# Columns shared by both tables
common_cols = ['osm_id', 'osm_type', 'name', 'feature_type', 'geometry']

amenities_std = amenity_gdf[common_cols].copy()
# Train stations keep three extra columns; amenity rows have no value for
# them after the concat below.
train_std = train_gdf[common_cols + ['Station ID', 'line_name', 'train_id']].copy()

# Combine into single GeoDataFrame and lower-case the feature_type labels
all_pois_gdf = gpd.GeoDataFrame(
    pd.concat([amenities_std, train_std], ignore_index=True),
    geometry='geometry',
    crs='EPSG:4326'
).assign(feature_type=lambda x: x['feature_type'].str.lower())

print(f"\n✅ Combined dataset: {len(all_pois_gdf):,} POIs")
print(f"   • Amenities: {len(amenities_std):,}")
print(f"   • Train stations: {len(train_std):,}")

all_pois_gdf.head()


‚úÖ Combined dataset: 6,818 POIs
   ‚Ä¢ Amenities: 6,653
   ‚Ä¢ Train stations: 165


Unnamed: 0,osm_id,osm_type,name,feature_type,geometry,Station ID,line_name,train_id
0,158731933,node,HeroMarket,mall,POINT (101.54722 3.08735),,,
1,251340254,node,TF Value-Mart,mall,POINT (101.73058 3.08076),,,
2,284005497,node,The Store,mall,POINT (101.84486 2.94398),,,
3,295130264,node,99 Speedmart,mall,POINT (101.52735 3.04738),,,
4,305358689,node,Cambridge Al-Ikhlas,school,POINT (101.53991 3.08302),,,


In [None]:
#all_pois_gdf.to_excel('/content/drive/MyDrive/Colab/Capstone 1/all_pois_gdf.xlsx')