# file: todo-gsws-osm.ipynb
# Author: Arjuna rao C
# Purpose: remove the secretariats that are already mapped in the OSM file (matched on ref=GSWS:ID) from the cleaned-up data, thereby arriving at the to-do data

# 

# input: "ap gsws 20251218.osm" file of sachivalayams (about 60 as of 2025-12-18)
# input: "secretariats_cleanedup.csv" containing info including lat, long
# output: "secretariats_to_map 20251218.csv"
# 

In [None]:
# Import Required Libraries
import osmnx as ox
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
# Location of the OSM XML extract to load
osm_file_path = 'data/ap gsws 20251218.osm'

# Load the XML document and keep a handle on its root element
tree = ET.parse(osm_file_path)
root = tree.getroot()

# Show the root element's tag and attributes as a quick sanity check
print(root.tag, root.attrib)

# Preview the attributes of the first <node> child only, to keep the output short
first_node = root.find('node')
if first_node is not None:
    print(first_node.attrib)

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString

# Parse nodes into a dict keyed by OSM node id for quick lookup
# (the way-parsing step resolves its <nd ref="..."> references through it).
node_dict = {}
for node in root.findall('node'):
    nid = node.get('id')
    try:
        lat = float(node.get('lat'))
        lon = float(node.get('lon'))
    except (TypeError, ValueError):
        # lat/lon attribute missing (None) or not a number: skip this node
        continue
    tags = {t.get('k'): t.get('v') for t in node.findall('tag')}
    node_dict[nid] = {"osm_type": "node", "osm_id": nid, "tags": tags, "lat": lat, "lon": lon}

# Build GeoDataFrame for nodes.
# One record per row (the same list-of-records construction used for ways below)
# instead of a one-column-per-node frame that is then transposed: the transpose
# forces every column to object dtype and scales poorly with the node count.
nodes_df = pd.DataFrame(list(node_dict.values()))
if not nodes_df.empty:
    # Shapely points are (x, y) == (lon, lat)
    nodes_df["geometry"] = [
        Point(x, y) for x, y in zip(nodes_df["lon"], nodes_df["lat"])
    ]
    gdf_nodes = gpd.GeoDataFrame(nodes_df.drop(columns=["lat", "lon"]), geometry="geometry", crs="EPSG:4326")
else:
    # No usable nodes: keep the expected schema so later steps still work
    gdf_nodes = gpd.GeoDataFrame(columns=["osm_type", "osm_id", "tags", "geometry"], crs="EPSG:4326")

# Turn every <way> into a record: a LineString when at least two of its
# referenced nodes are known, a Point when exactly one is, otherwise no geometry.
ways = []
for way_elem in root.findall('way'):
    # Node refs missing from node_dict are ignored
    points = [
        (node_dict[ref]["lon"], node_dict[ref]["lat"])
        for ref in (nd.get('ref') for nd in way_elem.findall('nd'))
        if ref in node_dict
    ]
    if len(points) >= 2:
        shape = LineString(points)
    elif len(points) == 1:
        shape = Point(points[0])
    else:
        shape = None
    ways.append({
        "osm_type": "way",
        "osm_id": way_elem.get('id'),
        "tags": {tag.get('k'): tag.get('v') for tag in way_elem.findall('tag')},
        "geometry": shape,
    })

ways_df = pd.DataFrame(ways)
if ways_df.empty:
    # No ways at all: keep the expected schema
    gdf_ways = gpd.GeoDataFrame(columns=["osm_type", "osm_id", "tags", "geometry"], crs="EPSG:4326")
else:
    gdf_ways = gpd.GeoDataFrame(ways_df, geometry="geometry", crs="EPSG:4326")
    # keep only the ways that ended up with a geometry
    gdf_ways = gdf_ways[gdf_ways.geometry.notna()]

# Stack node and way features into one GeoDataFrame named `gdf`
combined = pd.concat([gdf_nodes, gdf_ways], ignore_index=True, sort=False)
gdf = gpd.GeoDataFrame(combined, geometry="geometry", crs="EPSG:4326")


def _name_from_tags(tag_map):
    # Return the "name" tag when the cell holds a tag dict, else None
    if isinstance(tag_map, dict):
        return tag_map.get("name")
    return None


# Convenience column: the feature's name tag, when it has one
gdf["name"] = gdf["tags"].apply(_name_from_tags)

print(f"Parsed {len(gdf_nodes)} nodes and {len(gdf_ways)} ways -> combined {len(gdf)} features")

In [None]:
# read "secretariats_cleanedup.csv" into a DataFrame gsws_df
gsws_df = pd.read_csv('data/secretariats_cleanedup.csv')
# pull the raw ref tag (if any) out of each feature's tags dict
raw_ref = gdf['tags'].apply(lambda x: x.get('ref', None) if isinstance(x, dict) else None)
# Keep only the numeric ID, with or without the "GSWS:" prefix.
# Anything else (missing ref, or a non-numeric ref such as a road number)
# becomes <NA> instead of making the integer conversion raise.
gsws_id = raw_ref.str.extract(r'^(?:GSWS:)?(\d+)$', expand=False)
# store as nullable Int64 so it can be compared with secretariat_code
gdf['ref'] = pd.to_numeric(gsws_id, errors='coerce').astype('Int64')

In [None]:
# cast the 'secretariat_code' column to the nullable Int64 dtype
gsws_df = gsws_df.astype({'secretariat_code': 'Int64'})

In [None]:
# remove rows in gsws_df whose secretariat_code already appears in the ref column of gdf
# (i.e. secretariats that are already present in the OSM file).
# Missing refs are dropped first so that a row with a missing secretariat_code
# is not matched against the missing refs of features that carry no ref.
mapped_refs = gdf['ref'].dropna()
mask = gsws_df['secretariat_code'].isin(mapped_refs)
filtered_df = gsws_df[~mask]
# remove rows in filtered_df where secretariat_code is in [11390446] as these are already mapped but assigned the earlier unit names
filtered_df = filtered_df[~filtered_df['secretariat_code'].isin([11390446])]
# summary statistics of what is left to map (displayed as the cell output)
filtered_df.describe()

In [None]:
# Save the remaining to-do rows as CSV; the DataFrame index is not written
todo_csv_path = "data/secretariats_to_map 20251218.csv"
filtered_df.to_csv(todo_csv_path, index=False)