# file: cleanup-gsws-osm.ipynb
# Author: Arjuna rao C
# Update the names of grama/ward sachivalayams to the format "<generic-name>, <unit name>",
# taking the unit name from the cleaned-up Secretariat_name, and add the secretariat code
# as a ref tag with the value GSWS:<secretariat_code>
# 

# input: "ap gsws.osm" file of sachivalayams, about 60 as of 2025-12-18
# input: "data/secretariats_cleanedup.csv" containing info including latitude and longitude
# output: OSM change file "data/ap gsws updates.osc" with updated name and ref tags, to be uploaded via JOSM later.
# 

In [1]:
# Import Required Libraries
import osmnx as ox
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
# Location of the OSM XML export to load
osm_file_path = 'ap gsws.osm'

# Read the XML document and keep a handle on its root element
tree = ET.parse(osm_file_path)
root = tree.getroot()

# Show the root element's tag and attributes
print(root.tag, root.attrib)

# Preview the attributes of the first direct <node> child only,
# so the cell output stays short
first_node = root.find('node')
if first_node is not None:
    print(first_node.attrib)

osm {'version': '0.6', 'generator': 'Overpass API 0.7.62.8 e802775f'}
{'id': '4670946472', 'lat': '17.3764921', 'lon': '82.5323742', 'version': '3', 'timestamp': '2025-12-17T06:26:03Z', 'changeset': '176034708', 'uid': '2742073', 'user': 'arjunaraoc'}


In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString

# Index every <node> by its id so ways can look up coordinates quickly.
# Nodes whose lat/lon is missing or not numeric are skipped.
node_dict = {}
for node_el in root.iterfind('node'):
    try:
        latitude = float(node_el.get('lat'))
        longitude = float(node_el.get('lon'))
    except (TypeError, ValueError):
        continue
    node_id = node_el.get('id')
    node_dict[node_id] = {
        "osm_type": "node",
        "osm_id": node_id,
        "tags": {tag_el.get('k'): tag_el.get('v') for tag_el in node_el.iterfind('tag')},
        "lat": latitude,
        "lon": longitude,
    }

# Node GeoDataFrame: one row per parsed node, lat/lon folded into a Point
nodes_df = pd.DataFrame(node_dict).transpose()
nodes_df = nodes_df.reset_index(drop=True)
if nodes_df.empty:
    gdf_nodes = gpd.GeoDataFrame(columns=["osm_type", "osm_id", "tags", "geometry"], crs="EPSG:4326")
else:
    nodes_df["geometry"] = [
        Point(longitude, latitude)
        for longitude, latitude in zip(nodes_df["lon"], nodes_df["lat"])
    ]
    gdf_nodes = gpd.GeoDataFrame(
        nodes_df.drop(columns=["lat", "lon"]),
        geometry="geometry",
        crs="EPSG:4326",
    )

# Way records: a LineString when two or more referenced nodes are known,
# a Point when exactly one is known, and no geometry otherwise
ways = []
for way_el in root.iterfind('way'):
    coords = []
    for nd_el in way_el.iterfind('nd'):
        ref = nd_el.get('ref')
        if ref in node_dict:
            coords.append((node_dict[ref]["lon"], node_dict[ref]["lat"]))

    if len(coords) >= 2:
        shape = LineString(coords)
    elif len(coords) == 1:
        shape = Point(coords[0])
    else:
        shape = None

    ways.append({
        "osm_type": "way",
        "osm_id": way_el.get('id'),
        "tags": {tag_el.get('k'): tag_el.get('v') for tag_el in way_el.iterfind('tag')},
        "geometry": shape,
    })

ways_df = pd.DataFrame(ways)
if ways_df.empty:
    gdf_ways = gpd.GeoDataFrame(columns=["osm_type", "osm_id", "tags", "geometry"], crs="EPSG:4326")
else:
    gdf_ways = gpd.GeoDataFrame(ways_df, geometry="geometry", crs="EPSG:4326")
    # keep only ways that ended up with a geometry (at least one referenced node present)
    gdf_ways = gdf_ways[gdf_ways.geometry.notna()]

# Stack nodes and ways into the single GeoDataFrame `gdf` used by later cells
gdf = gpd.GeoDataFrame(
    pd.concat([gdf_nodes, gdf_ways], ignore_index=True, sort=False),
    geometry="geometry",
    crs="EPSG:4326",
)

# Convenience column: the feature's "name" tag, or None when the tags are not a dict
gdf["name"] = gdf["tags"].apply(lambda tag_map: tag_map.get("name") if isinstance(tag_map, dict) else None)

print(f"Parsed {len(gdf_nodes)} nodes and {len(gdf_ways)} ways -> combined {len(gdf)} features")

Parsed 60 nodes and 0 ways -> combined 60 features


In [None]:
# Load the cleaned-up secretariat table from CSV into gsws_df
gsws_df = pd.read_csv('data/secretariats_cleanedup.csv')
# Build one point per row from the 'longitude' / 'latitude' columns and wrap the
# table in a GeoDataFrame gsws_gdf tagged as EPSG:4326
secretariat_points = gpd.points_from_xy(x=gsws_df['longitude'], y=gsws_df['latitude'])
gsws_gdf = gpd.GeoDataFrame(gsws_df, geometry=secretariat_points, crs="EPSG:4326")

In [None]:
import geopandas as gpd

# Reproject both layers to the same projected CRS so that max_distance below
# is measured in the CRS's linear unit rather than in degrees.
# NOTE(review): EPSG:7756 is believed to be the "WGS 84 / Andhra Pradesh" state
# grid in metres (not a UTM zone) — confirm and adjust if the data covers
# another region.
gdf = gdf.to_crs(epsg=7756)
gsws_gdf = gsws_gdf.to_crs(epsg=7756)

# For every OSM feature, attach the attributes of the nearest gsws_gdf point
# within 500 units (metres, given the CRS note above); features without a
# match inside that radius are kept with empty gsws columns (how="left").
merged_gdf = gpd.sjoin_nearest(
    gdf,
    gsws_gdf,
    how="left",
    max_distance=500,
    distance_col="distance_m",
)

# sjoin_nearest returns one row per equally-near neighbour, so a tie would
# repeat the same OSM feature (same left index) with different secretariats.
# Keep a single match per feature so each OSM object is updated at most once.
merged_gdf = merged_gdf[~merged_gdf.index.duplicated(keep="first")]

# merged_gdf now contains the OSM features plus the attributes from gsws_gdf


In [None]:
# Drop OSM features that found no secretariat within the search radius.
# Take an explicit copy so the column assignments below act on an independent
# frame instead of a slice of the join result (avoids chained-assignment
# warnings and ambiguity).
merged_gdf = merged_gdf[merged_gdf['secretariat_code'].notna()].copy()
# New display name: "<gen_name>, <unit>"
merged_gdf['new_name'] = merged_gdf['gen_name'] + ", " + merged_gdf['unit']
# Reference tag value: "GSWS:<secretariat_code>"; the nullable Int64 cast
# removes any ".0" suffix the code picked up if the column became float
merged_gdf['ref'] = 'GSWS:' + merged_gdf['secretariat_code'].astype('Int64').astype(str)
# Reproject back to WGS84 so lat/lon can be written to the OSM change file
merged_gdf = merged_gdf.to_crs(epsg=4326)


In [None]:
# generate OSM Change (OSC) file "ap gsws updates.osc" with updates
from lxml import etree
import ast


# Build an osmChange document with one <modify><node> entry per matched
# feature in merged_gdf, carrying the updated name/ref tags, and write it to
# "data/ap gsws updates.osc".
src_root = root

# Metadata attributes copied from each source <node>, keyed by node id
meta_keys = ("version", "timestamp", "changeset", "uid", "user")
node_meta = {
    src_node.get("id"): {key: src_node.get(key) for key in meta_keys}
    for src_node in src_root.findall("node")
}

# Create OSC root using version from source
osm = etree.Element(
    "osmChange",
    version=src_root.get("version", "0.6"),
    generator="GeoPandas OSM Update",
)
modify = etree.SubElement(osm, "modify")

skipped_ids = []
for _, row in merged_gdf.iterrows():
    osm_id = str(row["osm_id"])

    # Only nodes can be emitted as <node> elements: a way id written as a node
    # would target an unrelated object, and a LineString has no single x/y.
    # Rows without a usable name/ref are skipped too, so the literal string
    # "nan" is never written as a tag value.
    if (
        row.get("osm_type", "node") != "node"
        or pd.isna(row["new_name"])
        or pd.isna(row["ref"])
    ):
        skipped_ids.append(osm_id)
        continue

    # Start from the existing tags (a dict, or its string repr) so that
    # unrelated tags are preserved in the modified node
    tags = ast.literal_eval(row["tags"]) if isinstance(row["tags"], str) else dict(row["tags"])

    # Overwrite tags
    tags["name"] = row["new_name"]
    tags["ref"] = row["ref"]

    meta = node_meta.get(osm_id, {})

    # lxml raises TypeError for None attribute values, so absent metadata is
    # left out up front instead of being removed after element creation
    attrs = {
        "id": osm_id,
        "lat": str(row.geometry.y),
        "lon": str(row.geometry.x),
        "version": meta.get("version") or "1",
    }
    for key in ("timestamp", "changeset", "uid", "user"):
        value = meta.get(key)
        if value is not None:
            attrs[key] = value

    node = etree.SubElement(modify, "node", attrib=attrs)

    # Write tags
    for k, v in tags.items():
        etree.SubElement(node, "tag", k=str(k), v=str(v))

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} features (not a node, or missing name/ref): {skipped_ids}")

# Write OSC file
tree = etree.ElementTree(osm)
tree.write(
    "data/ap gsws updates.osc",
    encoding="UTF-8",
    xml_declaration=True,
    pretty_print=True,
)
