In [1]:
import logging
import warnings
import joblib

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.neighbors import NearestNeighbors


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
warnings.resetwarnings()
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option("display.max_columns", None)

In [4]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(funcName)s - %(message)s', datefmt='%I:%M:%S')
logger = logging.getLogger(__name__)

### Preprocessing: Climate solution counts per domain

In [5]:
abstract_solutions = pd.read_parquet('../cities-learning-dec/data/climate_solutions_typology/oa_sentence_solutions_relevant.parquet')
abstract_city_mapping = pd.read_csv('../cities-learning-dec/data/geoparser/clean_places_augmented.csv', index_col=0)
abstract_city_mapping['city_id'] = abstract_city_mapping['city_word_match_id'].fillna(abstract_city_mapping['city_intersection_id'])

abstract_solutions = pd.merge(abstract_solutions, abstract_city_mapping[['id', 'city_id']], on='id')

domain_to_solution_ids = {
    "Mobility": [1, 2, 3, 4, 5, 6, 7],
    "Buildings": [8, 9, 10, 11, 12, 13, 14, 15, 16],
    "Energy": [17, 18, 19, 20, 21, 22, 23],
    "Thermal comfort and Heat stress management": [24, 25, 26, 27],
    "Food provisioning systems": [28, 29, 30, 31, 32, 33, 34],
    "Water": [35, 36, 37, 38, 39, 40],
    "Waste management": [41, 42, 43, 44],
    "Disaster and risk management": [45, 46, 47, 48, 49, 50],
    "Carbon dioxide removal": [51, 52, 53, 54],
}

for domain, solution_ids in domain_to_solution_ids.items():
    abstract_solutions[domain] = abstract_solutions[[f'solution_{i}_match' for i in solution_ids]].max(axis=1)

domains = list(domain_to_solution_ids.keys())
abstract_solution_counts = abstract_solutions.groupby(['city_id','id'])[domains].max()
city_solution_counts = abstract_solution_counts.groupby('city_id')[domains].sum()

city_solution_counts['n_solutions'] = city_solution_counts.sum(axis=1)
city_solution_counts['solution_domain_counts'] = city_solution_counts.to_dict(orient="records")
city_solution_counts = city_solution_counts[['n_solutions', 'solution_domain_counts']].rename_axis('GHS_urban_area_id').reset_index()


### Preprocessing: Nearest neighbors in embedding space

In [6]:
nn_all_runs = []
for i in range(30):
    embeddings = joblib.load(f'../cities-learning-dec/clustering_models/latent_representation/latent_run_{i}.pkl')

    emb_cols = ["latent_0", "latent_1", "latent_2", "latent_3"]
    X = embeddings[emb_cols].values

    nn = NearestNeighbors(n_neighbors=51, metric="euclidean")
    nn.fit(X)

    distances, indices = nn.kneighbors(X)

    # Remove self-index (first column)
    neighbor_indices = indices[:, 1:]
    neighbor_distances = distances[:, 1:]

    city_ids = embeddings["GHS_urban_area_id"].astype(int).values


    N, K = neighbor_indices.shape

    # Repeat each city ID K times â†’ shape (N*K,)
    city_a = np.repeat(city_ids, K)

    # Flatten neighbors and distances (already in the correct order)
    city_b = city_ids[neighbor_indices].ravel()
    dist = neighbor_distances.ravel()

    nn_long = pd.DataFrame({
        "city_a": city_a,
        "city_b": city_b,
        "distance": dist
    })
    nn_all_runs.append(nn_long)

nn_long = pd.concat(nn_all_runs)

nn = (
    nn_long.groupby(['city_a', 'city_b'], as_index=False)
          .distance.mean()
)

nn_top20 = (
    nn.sort_values(['city_a', 'distance'])
     .groupby('city_a')
     .head(20)
     .reset_index(drop=True)
)

nn_top20['distance'] = nn_top20['distance'].round(4)
nn_top20['distance'] = nn_top20['distance'].round(4)

nn_wide = (
    nn_top20
    .groupby("city_a")
    .agg({
        "city_b": list,
        "distance": list
    })
    .reset_index()
)

nn_wide = nn_wide.rename(columns={
    "city_a": "GHS_urban_area_id",
    "city_b": "neighbors",
    "distance": "neighbor_distances"
})

### Merging all data sources

In [7]:
city_characteristics = pd.read_parquet('../cities-learning-dec/data/clustering_data_clean/GHS_UCDB_2024_preproc_2025_04_09_uci_and_nan_imputation_add_vars_included.parquet')
city_assignment_proba = pd.read_csv('../cities-learning-dec/data/clustering_results/dec_clusters_k4.csv')
city_types = pd.read_csv('../cities-learning-dec/data/clustering_results/cities_by_regional_type_clean.csv', index_col=0).rename(columns={'city_id': 'GHS_urban_area_id'})
embeddings = joblib.load('../cities-learning-dec/clustering_models/latent_representation/latent_run_0.pkl')
city_socioeconomics = pd.read_csv('../cities-learning-dec/data/GHS_UCDB_GLOBE_R2024A_V1_0/socioeconomic.csv')[['ID_UC_G0', 'SC_SEC_GDF_2020', 'SC_SEC_HDI_2020']].rename(columns={'ID_UC_G0': 'GHS_urban_area_id', 'SC_SEC_GDF_2020': 'GHS_female_gender_index', 'SC_SEC_HDI_2020': 'GHS_HDI'})

emissions = pd.read_csv('../cities-learning-dec/data/emissions/balance_sheet.csv')
emissions = emissions[emissions['Year'] == 2022][['ID_UC_G0', 'ODIAC']].rename(columns={'ID_UC_G0': 'GHS_urban_area_id', 'ODIAC': 'total_emissions'})

geometries = gpd.read_parquet('../cities-learning-dec/data/clustering_data_clean/GHS_UCDB_2024_preproc_2025_04_03.parquet', columns=['GHS_urban_area_id', 'geometry'])
geometries['lat'] = geometries.centroid.to_crs(epsg=4326).y
geometries['lon'] = geometries.centroid.to_crs(epsg=4326).x
geometries = geometries[['GHS_urban_area_id', 'lat', 'lon']]

cities = pd.merge(city_characteristics, city_types, on='GHS_urban_area_id')
cities = pd.merge(city_assignment_proba, cities, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, embeddings, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, nn_wide, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, city_socioeconomics, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, geometries, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, city_solution_counts, on='GHS_urban_area_id', how='left')
cities = pd.merge(cities, emissions, on='GHS_urban_area_id', how='left')

cities['emissions'] = cities['total_emissions'] / cities['GHS_population']
cities['city_name'].fillna('Unknown', inplace=True)
cities['embedding'] = cities[['latent_0', 'latent_1', 'latent_2', 'latent_3']].to_numpy().tolist()
cities['type_probabilities'] = cities[['mean_prob_cluster_2', 'mean_prob_cluster_3', 'mean_prob_cluster_0', 'mean_prob_cluster_1']].to_numpy().tolist()

### Rename

In [8]:
cities = cities.rename(columns={
    'GHS_urban_area_id': 'id',
    'city_name': 'name',
    'Region': 'region',
    'cluster_name': 'type',
    'GHS_population': 'population',
    'GHS_population_growth': 'population_growth',
    'GHS_population_density': 'population_density',
    'GHS_population_density_growth': 'population_density_growth',
    'GHS_GDP_PPP': 'gdp_ppp',
    'GHS_GDP_PPP_growth': 'gdp_ppp_growth',
    'GHS_critical_infra': 'critical_infrastructure',
    'GHS_greenness_index': 'greenness_index',
    'GHS_precipitation': 'precipitation',
    'GHS_HDI': 'hdi',
    'GHS_female_gender_index': 'female_gender_index',
})


### Compute metric ranks and store as JSON

In [9]:
info_cols = [
    'id',
    'name',
    'country',
    'region',
    'type',
    'n_studies',
    'solution_domain_counts',
    'probability',
    'embedding',
    'neighbors',
    'neighbor_distances',
    'type_probabilities',
    'lat',
    'lon',
]

metric_cols = [
    'population',
    'population_growth',
    'population_density',
    'population_density_growth',
    'gdp_ppp',
    'gdp_ppp_growth',
    'critical_infrastructure',
    'greenness_index',
    'precipitation',
    'hdd',
    'cdd',
    'hdi',
    'female_gender_index',
    'emissions',
]

for col in metric_cols:
    cities[f"{col}_pct"] = (cities[col].rank(pct=True) * 100).round()

pct_cols = [f"{col}_pct" for col in metric_cols]

cities[info_cols + metric_cols + pct_cols].to_json('cities-learning-explorer/public/cities.json', orient='records')