In [None]:
# !pip install git+https://github.com/boettiger-lab/cng-python

In [1]:

import ibis
from ibis import _
from cng.utils import *
from cng.h3 import * 
import os
# Open the local DuckDB database file, requesting the spatial and h3 extensions.
con = ibis.duckdb.connect(
    "local.db",
    extensions=["spatial", "h3"],
)
# NOTE(review): install_h3 arrives via the star imports above (presumably cng.h3);
# its exact effect is not visible from this notebook - confirm.
install_h3()



In [4]:
# Configure S3 access for the NRP endpoint with blank key/secret
# (presumably anonymous access to a public bucket - confirm).
set_secrets(con, key="", secret="", endpoint="s3-west.nrp-nautilus.io")

# Preview the first rows of the hex-indexed countries table.
hexed_countries = con.read_parquet("s3://public-overturemaps/hex/countries.parquet")
hexed_countries.head().execute()

Unnamed: 0,id,country,name,h8,h0
0,8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,8852d4bb59fffff,8053fffffffffff
1,8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,8852890b63fffff,8053fffffffffff
2,8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,8852891749fffff,8053fffffffffff
3,8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,8852d4d325fffff,8053fffffffffff
4,8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,8852f26b39fffff,8053fffffffffff


In [None]:
# Read every object under the chunks/ prefix as a single table and write it
# back out as one parquet file under hex/.
# NOTE(review): key and secret are blank here even though the last line writes
# to the bucket - confirm the write is expected to succeed with these settings.
# NOTE: the name `countries` is rebound to a different table by a later cell.
set_secrets(con, key ="", secret="", endpoint="s3-west.nrp-nautilus.io")
countries = con.read_parquet("s3://public-overturemaps/chunks/**")
countries.to_parquet("s3://public-overturemaps/hex/countries.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:

# Must use scoped secrets with different names for the different endpoints
set_secrets(con, name = "minio") # read/write using AWS env var credentials

def geom_to_cell(df, zoom=8, keep_cols=None):
    """Split each geometry into its parts and compute H3 cells for every part.

    Rows whose ``geom`` has geometry type ``POLYGON`` are first wrapped with
    ``ST_Multi``. The table is then rendered to SQL, and each geometry is
    expanded with ``UNNEST(ST_Dump(...))`` so that every dumped part gets its
    own row before ``h3_polygon_wkt_to_cells_string`` is applied to it.

    Parameters
    ----------
    df : ibis table expression
        Must contain a ``geom`` column. The SQL is executed on the backend
        returned by ``df.get_backend()``.
    zoom : int, default 8
        H3 resolution (0-15) passed to ``h3_polygon_wkt_to_cells_string``.
    keep_cols : list of str, optional
        Names of the columns to carry alongside the geometry. Defaults to
        every column of ``df`` except ``geom``. ``geom`` itself is always
        emitted, so it is ignored if listed here.

    Returns
    -------
    ibis table expression
        The kept columns, the dumped ``geom`` part, and ``h3id`` (the result
        of ``h3_polygon_wkt_to_cells_string``; presumably a list of H3 cell
        ids per row - confirm against the h3 extension).

    Raises
    ------
    ValueError
        If ``zoom`` cannot be converted to an integer or is outside 0-15.
    """
    backend = df.get_backend()

    # The resolution is interpolated into raw SQL below, so force it to a
    # plain integer in the valid H3 range first.
    resolution = int(zoom)
    if not 0 <= resolution <= 15:
        raise ValueError(
            f"zoom must be an H3 resolution between 0 and 15, got {zoom!r}"
        )

    # Default to keeping all columns except geom if not specified. geom is
    # dropped from an explicit list too: it is always selected below and a
    # duplicate would make the outer query ambiguous.
    if keep_cols is None:
        keep_cols = df.columns
    keep_cols = [col for col in keep_cols if col != 'geom']

    # Quote identifiers so reserved words or names with spaces/dashes stay
    # valid, and build the SELECT list so that an empty keep_cols does not
    # leave a dangling comma.
    select_items = ['"' + col.replace('"', '""') + '"' for col in keep_cols]
    select_items.append('UNNEST(ST_Dump(ST_GeomFromWKB(geom))).geom AS geom')
    select_list = ', '.join(select_items)

    # all types must be multi-polygons
    cases = ibis.cases(
        (df.geom.geometry_type() == 'POLYGON', ST_Multi(df.geom)),
        else_=df.geom,
    )

    df = df.mutate(geom=cases)
    sql = ibis.to_sql(df)

    expr = f'''
        WITH t1 AS (
            SELECT {select_list}
            FROM ({sql})
        )
        SELECT *, h3_polygon_wkt_to_cells_string(geom, {resolution}) AS h3id FROM t1
    '''

    return backend.sql(expr)







In [5]:

# Location of the countries parquet file read by the cells below.
SOURCE = "s3://public-overturemaps/countries.parquet"
# Bare last expression: the resulting table expression is the cell's output.
con.read_parquet(SOURCE)

In [None]:
# Prefer the English common name; use the primary name when it is missing.
display_name = ibis.coalesce(_.names['common']['en'], _.names['primary'])

countries = (
    con.read_parquet(SOURCE)
    .mutate(name=display_name)
    .select('geometry', 'id', 'country', 'name', 'class')
    .rename(geom="geometry")
)

countries.head().execute()


Unnamed: 0,geom,id,country,name,class,type
0,"MULTIPOLYGON (((43.15046 12.71288, 43.15056 12...",8ad3cd02-284d-4280-bea3-e3abe1615fef,DJ,Djibouti,land,division_area
1,"MULTIPOLYGON (((43.14654 12.71897, 43.14656 12...",2a6b563b-3199-4829-ae60-1dc49efe72b4,ER,Eritrea,land,division_area
2,"MULTIPOLYGON (((8.37604 35.46793, 8.37964 35.4...",d2ce5979-31c4-4dc7-9252-0b4a9f09b190,DZ,Algeria,land,division_area
3,"MULTIPOLYGON (((25.14205 31.67134, 25.14207 31...",602d6529-e4e7-44ab-9c2b-fe563609c69d,LY,Libya,land,division_area
4,"POLYGON ((43.9489 41.16873, 43.94934 41.16788,...",d9081f68-16c9-433e-a32d-aeb75c7be279,GE,Georgia,land,division_area


In [9]:
# How many rows have an English common name, and which ones do not?
english_name = _.names['common']['en']
countries_check = (
    con.read_parquet(SOURCE)
    .mutate(country_name_en=english_name)
    .select('id', 'country', 'country_name_en')
)

# Materialise once, then derive every count from a single null mask.
stats = countries_check.execute()
is_missing_en = stats['country_name_en'].isna()

total_countries = len(stats)
non_null_en_names = (~is_missing_en).sum()
null_en_names = is_missing_en.sum()

print(f"Total countries: {total_countries}")
print(f"Countries with English names: {non_null_en_names}")
print(f"Countries WITHOUT English names (NAs): {null_en_names}")

# List the rows (if any) that lack an English name.
if null_en_names > 0:
    print("\nCountries missing English names:")
    missing = stats[is_missing_en]
    print(missing[['country', 'country_name_en']])

Total countries: 219
Countries with English names: 216
Countries WITHOUT English names (NAs): 3

Countries missing English names:
    country country_name_en
99       XR            None
185      XB            None
217      XP            None


In [10]:
# Inspect the full `names` struct for the three codes reported as missing
# an English name by the previous cell.
codes_without_english = ['XR', 'XB', 'XP']
missing_countries = (
    con.read_parquet(SOURCE)
    .filter(_.country.isin(codes_without_english))
    .select('id', 'country', 'names')
)

missing_data = missing_countries.execute()
print("Details of countries missing English names:\n")
# Iterate over plain records. The loop variable is deliberately not `_`,
# which would overwrite the ibis deferred `_` imported at the top.
for row in missing_data.to_dict(orient="records"):
    print(f"\nCountry code: {row['country']}")
    print(f"ID: {row['id']}")
    print(f"Names structure: {row['names']}")
    print("-" * 80)

Details of countries missing English names:


Country code: XR
ID: 96e64727-fb8e-4c50-8c5d-5a9543b068ba
Names structure: {'primary': 'VN/CN disputes in Spratly Islands', 'common': {'zh': '中越南沙争议地区', 'vi': 'Tranh chấp chủ quyền Việt Nam của quần đảo Trường Sa'}, 'rules': None}
--------------------------------------------------------------------------------

Country code: XB
ID: a760ade6-5c7e-49c0-a918-5c98d7458e20
Names structure: {'primary': 'Extent of Japanese claim at Dokdo', 'common': None, 'rules': None}
--------------------------------------------------------------------------------

Country code: XP
ID: a8c6a847-a47c-449a-9eae-ca9a74d3f435
Names structure: {'primary': 'CN/VN disputes in Paracel Islands', 'common': {'zh': '中越西沙争议地区', 'vi': 'Tranh chấp chủ quyền Việt Nam của quần đảo Hoàng Sa'}, 'rules': [{'variant': 'alternate', 'language': 'vi', 'perspectives': None, 'value': 'Huyện Hoàng Sa', 'between': None, 'side': None}]}
------------------------------------------------------

In [11]:
# Better approach: take the English common name when present and fall back
# to the primary name otherwise.
english_name = _.names['common']['en']
primary_name = _.names['primary']

countries_with_fallback = (
    con.read_parquet(SOURCE)
    .select('geometry', 'id', 'country', 'region', 'primary', 'names')
    .mutate(country_name=ibis.coalesce(english_name, primary_name))
    .select('geometry', 'id', 'country', 'region', 'primary', 'country_name')
    .rename(geom="geometry")
)

# Spot-check the three previously unnamed entries plus two ordinary countries.
codes_to_check = ['XR', 'XB', 'XP', 'US', 'CN']
test = countries_with_fallback.filter(_.country.isin(codes_to_check)).execute()
test[['country', 'country_name']]

Unnamed: 0,country,country_name
0,CN,China
1,XR,VN/CN disputes in Spratly Islands
2,XP,CN/VN disputes in Paracel Islands
3,US,United States
4,XB,Extent of Japanese claim at Dokdo


## Summary: English Country Names

**Answer:** Yes, extracting `names['common']['en']` **does introduce NAs** for 3 out of 219 entries.

### The 3 missing entries are:
- **XR**: VN/CN disputes in Spratly Islands
- **XB**: Extent of Japanese claim at Dokdo  
- **XP**: CN/VN disputes in Paracel Islands

These are **disputed territories**, not actual countries, which is why they don't have standard English names.

### Recommended Approach:
Use **`ibis.coalesce()`** to fall back to the `primary` name when the English name is missing:

```python
.mutate(
    country_name_en = _.names['common']['en'],
    country_name_primary = _.names['primary']
)
.mutate(
    country_name = ibis.coalesce(_.country_name_en, _.country_name_primary)
)
```

This ensures all 219 entries have a name, using English when available and falling back to the primary name for disputed territories.