In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
import time
import json
import os

In [None]:
def _json_safe(value):
    """Convert a single cell value into something ``json.dump`` can serialise.

    Missing scalars (``None``, ``NaN``, ``NaT``) become ``None`` so they are
    written as JSON ``null``; NumPy scalars are unwrapped to the equivalent
    built-in Python value. Anything else is returned unchanged.
    """
    if not pd.api.types.is_scalar(value):
        return value
    if pd.isna(value):
        return None
    return value.item() if hasattr(value, "item") else value


def process_data(source_excel, geolocator=None, out_dir=None, min_delay_seconds=1.0):
    """Geocode the rows of an Excel sheet and write them out as a JSON feature list.

    Rows flagged ``inaccessible == 1`` or ``irrelevant == 1`` are discarded.
    The remaining rows are geocoded from their ``city``/``state``/``country``
    text, rows that could not be geocoded are dropped, and one GeoJSON-style
    ``Feature`` dict per row is written to ``<out_dir>/<excel base name>.json``.

    Parameters
    ----------
    source_excel : str
        Path of the Excel workbook to read with ``pandas.read_excel``.
    geolocator : object, optional
        Any object with a ``geocode(query)`` method returning ``None`` or an
        object exposing ``latitude``/``longitude``. Defaults to
        ``Nominatim(user_agent="euumo_geocoder")``.
    out_dir : str, optional
        Directory for the JSON file. Defaults to ``../UrbMobAI/assets/data``.
        It is created if it does not exist.
    min_delay_seconds : float, optional
        Pause after every geocoder request (default 1 second).

    Returns
    -------
    str
        Path of the JSON file that was written.

    Raises
    ------
    KeyError
        If the sheet lacks one of the filter or output columns.
    """
    df = pd.read_excel(source_excel)

    df = df[(df["inaccessible"] != 1) & (df["irrelevant"] != 1)].copy()

    # geocode city, state, country to latitude & longitude
    if geolocator is None:
        geolocator = Nominatim(user_agent="euumo_geocoder")

    # query string -> (lat, lon); avoids re-requesting the same place for every row
    geocode_cache = {}

    def geocode_row(row):
        components = [row.get("city", ""), row.get("state", ""), row.get("country", "")]
        query = ", ".join([c for c in components if isinstance(c, str) and c.strip()])
        if not query:
            # nothing to look up; do not send an empty query to the service
            return None, None
        if query in geocode_cache:
            return geocode_cache[query]
        try:
            loc = geolocator.geocode(query)
        except Exception as exc:
            # best effort: report the failure and skip the row; the failure is
            # not cached so a later row with the same place is tried again
            print(f"Geocoding failed for {query!r}: {exc}")
            return None, None
        finally:
            # pause after every request, including failed ones
            if min_delay_seconds > 0:
                time.sleep(min_delay_seconds)
        result = (loc.latitude, loc.longitude) if loc else (None, None)
        geocode_cache[query] = result
        return result

    # Build both columns from a plain list so an empty frame works too
    coords = [geocode_row(row) for _, row in df.iterrows()]
    df["latitude"] = [lat for lat, _ in coords]
    df["longitude"] = [lon for _, lon in coords]

    # drop rows without successful geocoding
    df = df.dropna(subset=["latitude", "longitude"])

    # select only the required columns
    output_cols = [
        "id", "latitude", "longitude",
        "year", "city", "country",
        "use_case", "mode", "motivation",
        "stakeholder", "link"
    ]
    df_out = df[output_cols]

    property_cols = [
        "id", "city", "country", "use_case",
        "mode", "motivation", "stakeholder", "link"
    ]

    # Build the features before opening the file so that a failure here does
    # not leave a truncated JSON file behind.
    features = []
    for _, row in df_out.iterrows():
        year = _json_safe(row["year"])
        features.append({
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [float(row["longitude"]), float(row["latitude"])]  # [lon, lat]
            },
            # missing cells become None so JSON gets `null` instead of `NaN`
            "properties": {col: _json_safe(row[col]) for col in property_cols},
            "year": None if year is None else int(year)
        })

    # build the output path
    if out_dir is None:
        out_dir = os.path.join("..", "UrbMobAI", "assets", "data")
    os.makedirs(out_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(source_excel))[0]
    output_file = os.path.join(out_dir, f"{base_name}.json")

    # dump to JSON
    with open(output_file, "w", encoding="utf-8") as fp:
        json.dump(features, fp, ensure_ascii=False, indent=2)

    return output_file


In [None]:
# Run the pipeline on the checked workbook and report where the JSON landed.
source_file = "data/cityweb20250827_check.xlsx"
json_file = process_data(source_file)
print("Processed data written to:", json_file)