In [14]:
import pandas as pd
import duckdb as db
import json

### Download the geonames.org alternative Name Database

- Go to https://download.geonames.org/export/dump/
- Download the `alternateNamesV2.zip`

In [12]:
alternate_names_columns = ['alternateNameId', 'geonameid', 'isolanguage', 'alternate name', 'isPreferredName', 'isShortName', 'isColloquial', 'isHistoric', 'from', 'to']
alternate_names_dtypes = {'alternateNameId': int, 'geonameid': int, 'isolanguage': str, 'alternate name': str, 'isPreferredName':str, 'isShortName':str, 'isColloquial':str, 'isHistoric':str, 'from':str, 'to': str}
filename = 'alternateNamesV2.txt'
alname_df = pd.read_csv(filename, sep='\t', header=None, names=alternate_names_columns, dtype=alternate_names_dtypes)
alname_df.fillna('', inplace=True)

In [13]:
alname_df.head()

Unnamed: 0,alternateNameId,geonameid,isolanguage,alternate name,isPreferredName,isShortName,isColloquial,isHistoric,from,to
0,1284819,2994701,,Roc Mélé,,,,,,
1,1284820,2994701,,Roc Meler,,,,,,
2,1291197,3017832,,Pic de les Abelletes,,,,,,
3,4290387,3017832,,Pic de la Font-Nègre,,,,,,
4,1291198,3017833,,Estany de les Abelletes,,,,,,


### Processing the alternative/foreign language database

Step 1: Preparsing
- ISO language code are 2 letters (Major filter)
- We need to remove where ISO language values are empty
- We need to remove where Geoname ID values are empty
- We need to remove where Alternative name values are empty

In [26]:
alname_df_all_values = db.sql("""SELECT geonameid, isolanguage, "alternate name" as alt_name
FROM alname_df
WHERE geonameid is not null
AND isolanguage !=''
AND alt_name !='' 
AND LENGTH(isolanguage) <= 2
""").df()
alname_df_all_values.head()

Unnamed: 0,geonameid,isolanguage,alt_name
0,3038886,ca,Pic de Tristaina
1,3038899,ca,Tossalet i Vinyals
2,3038914,ca,Canal de la Tosa
3,3038995,ca,Bosc del Solobre
4,3039037,ca,Solà d’Engordany


Optional Step 1: Taking a look at the top 20 listed languages in the database (Disclaimer in step 3)

In [27]:
db.sql('''
SELECT isolanguage, count(*) lng_count
FROM alname_df_all_values
GROUP BY isolanguage
ORDER BY lng_count DESC
LIMIT 20
''').df()

Unnamed: 0,isolanguage,lng_count
0,zh,1008392
1,en,728195
2,no,691172
3,es,556830
4,ru,546593
5,fa,517259
6,id,396136
7,ar,304521
8,th,265978
9,ja,196185


Step 2: Only picking out 10 languages

In [114]:
# 10 Languages selected: 'zh', 'en', 'es', 'fr', 'ja', 'pt', 'ru', 'de', 'ar', 'ko'
alname_df_all_values_selected = db.sql('''
SELECT *
FROM alname_df_all_values
WHERE isolanguage in ('zh', 'en', 'es', 'fr', 'ja', 'pt', 'ru', 'de', 'ar', 'ko')
''').df()
alname_df_all_values_selected

Unnamed: 0,geonameid,isolanguage,alt_name
0,8210545,en,Regans Lagoon
1,8210546,en,Regents Lake
2,8210547,en,Round Lake
3,8210548,en,Lake Ricketson
4,8210549,en,Ritchies Lagoon
...,...,...,...
3815569,7668943,en,Tiger Cave Temple
3815570,7730329,en,Khok Kathiam Airport
3815571,7730329,en,Koke Kathiem Airport
3815572,7778911,en,James Bond Island


Step 3: Taking the first alt name by each language
- Geonames have multiple entries for the same language and same location. There could be dozens of alternative names for the same location within the same language.

In [115]:
alname_df_all_values_selected_only_first_loc = db.sql('''WITH RankedRows AS (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY geonameid, isolanguage) AS row_num
    FROM alname_df_all_values_selected
)
SELECT geonameid, isolanguage, alt_name
FROM RankedRows
WHERE row_num = 1;
''').df()
alname_df_all_values_selected_only_first_loc

Unnamed: 0,geonameid,isolanguage,alt_name
0,650,ru,Бастак
1,1816,ar,حبات نزال
2,2106,ar,مطار رشت الدولي
3,2254,ar,وادي غدير الحمل
4,2287,ar,الفياضية
...,...,...,...
3424813,12746130,ko,인천광역시역사자료관
3424814,12746165,ko,수산물품관리원인천지원
3424815,12746177,ko,송학동삼가
3424816,12746201,ko,한국근대문학관


Step 4: Choose English as one of the mandatory listed languages.
- If English is not listed for a particular geoname entry, we will not select it.

In [116]:
alname_df_all_values_selected_only_first_loc_en_filtered =  db.sql('''
SELECT t2.*
FROM alname_df_all_values_selected_only_first_loc t2
JOIN (
SELECT geonameid
FROM alname_df_all_values_selected_only_first_loc
WHERE isolanguage = 'en') t1
ON t2.geonameid = t1.geonameid
''').df()
alname_df_all_values_selected_only_first_loc_en_filtered

Unnamed: 0,geonameid,isolanguage,alt_name
0,11592844,en,Pybus
1,11593903,en,Southwestern Washington Experiment Station
2,11594363,en,Piana Rocks Lighthouse
3,11608472,en,Santana do Livramento
4,11608475,en,San Pedro Pochutla Airport
...,...,...,...
944962,8556013,en,Windarling Mine
944963,8562582,en,Takaiwa Tunnel
944964,8436037,es,Estación de Ronda
944965,8538664,en,Mount Kanigan


Optional Step 2: Test how many languages are there for each geoname entry

In [155]:
db.sql('''
SELECT count_lang, count(*) as lang_group_count
FROM (
	SELECT geonameid, count(*) as count_lang
	FROM alname_df_all_values_selected_only_first_loc_en_filtered
	GROUP BY geonameid
	ORDER BY count_lang DESC
)
GROUP BY count_lang
ORDER BY count_lang DESC
''').df()

Unnamed: 0,count_lang,lang_group_count
0,10,4150
1,9,1650
2,8,4334
3,7,2545
4,6,2666
5,5,3839
6,4,6798
7,3,12548
8,2,91567
9,1,552969


Optional Step 3: Minimum number of required languages
- If you want a tabular output select the max languges in the input language array (which is 10).
- Here as an example we can select geonames with more than 5 alt names

In [158]:
db.sql('''
SELECT t1.*
FROM alname_df_all_values_selected_only_first_loc_en_filtered t1
JOIN (
	SELECT geonameid, count(*) as count_lang
	FROM alname_df_all_values_selected_only_first_loc_en_filtered
	GROUP BY geonameid
) t2
ON t1.geonameid = t2.geonameid
WHERE t2.count_lang > 4
''').df()

Unnamed: 0,geonameid,isolanguage,alt_name
0,895949,es,Zambia
1,896140,pt,Província Ocidental
2,899274,fr,Samfya
3,912764,ja,カサマ
4,921780,de,Mohéli
...,...,...,...
144023,97990,fr,Bakouba
144024,119505,en,Qazvin
144025,119505,ko,카즈빈
144026,124763,es,Markazi


Optional Step 4: JSONification of the language and alt_names
- The dictionary represented in a JSON like string

In [120]:
alname_df_all_values_selected_only_first_loc_en_filtered_jsonified  = alname_df_all_values_selected_only_first_loc_en_filtered.groupby('geonameid').apply(lambda x: json.dumps(dict(zip(x['isolanguage'], x['alt_name'])), ensure_ascii=False)).reset_index()
alname_df_all_values_selected_only_first_loc_en_filtered_jsonified.columns = ['geonameid', 'iso_language_alt_names']
alname_df_all_values_selected_only_first_loc_en_filtered_jsonified

Unnamed: 0,geonameid,iso_language_alt_names
0,14,"{""ar"": ""تخت أرة دو"", ""en"": ""Takht Arreh Do""}"
1,254,"{""ar"": ""جسر بالارود"", ""en"": ""Pol-e Bala Rud""}"
2,301,"{""en"": ""Shahr-e Ziba"", ""fr"": ""Shahr-e ziba""}"
3,470,"{""ja"": ""アーザーディー広場"", ""ru"": ""Площадь Азади"", ""en..."
4,566,"{""en"": ""Rudafshan Cave"", ""es"": ""Ghar-e-Roodafs..."
...,...,...
683061,12746714,"{""en"": ""Bosiljevo interchange""}"
683062,12746729,"{""en"": ""Ringwood Magistrates Court""}"
683063,12746744,"{""en"": ""Appleton Dock""}"
683064,12746795,"{""ja"": ""葛城地蔵・将軍地蔵"", ""en"": ""Katsuragi Kṣitigarb..."


Step 6: Converting the dataset in to Python Dictionary for JSON output

In [133]:
alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df  = alname_df_all_values_selected_only_first_loc_en_filtered.groupby('geonameid').apply(lambda x: dict(zip(x['isolanguage'], x['alt_name']))).reset_index()
alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df.columns = ['geonameid', 'iso_language_alt_names']

alname_df_all_values_selected_only_first_loc_en_filtered_dict = alname_df_all_values_selected_only_first_loc_en_filtered_dictionariezed_df.set_index('geonameid')['iso_language_alt_names'].to_dict()
alname_df_all_values_selected_only_first_loc_en_filtered_dict

{14: {'ar': 'تخت أرة دو', 'en': 'Takht Arreh Do'},
 254: {'ar': 'جسر بالارود', 'en': 'Pol-e Bala Rud'},
 301: {'en': 'Shahr-e Ziba', 'fr': 'Shahr-e ziba'},
 470: {'ja': 'アーザーディー広場',
  'ru': 'Площадь Азади',
  'en': 'Azadi Square',
  'fr': 'place Azadi',
  'ar': 'ميدان آزادي',
  'pt': 'Praça Azadi',
  'zh': '阿扎迪广场'},
 566: {'en': 'Rudafshan Cave',
  'es': 'Ghar-e-Roodafshan',
  'ar': 'كهف رودافشان الأثري',
  'fr': 'Ghar-e-Roodafshan',
  'de': 'Ghar-e-Roodafshan'},
 677: {'en': 'Azadi Stadium',
  'ko': '아자디 경기장',
  'ru': 'Азади',
  'de': 'Azadi-Stadion',
  'zh': '阿薩迪體育場',
  'fr': 'stade Azadi',
  'ja': 'アザディ・スタジアム',
  'pt': 'Estádio Azadi',
  'ar': 'ملعب آزادي',
  'es': 'Estadio Azadi'},
 820: {'ar': 'بلوط بازه', 'en': 'Balut Bazeh'},
 1102: {'en': 'Sefidestan'},
 1226: {'en': 'Zeyn ol Hajjilu', 'ar': 'زين\u200c الحاجيلو'},
 1248: {'ar': 'سنوكش', 'en': 'Senowkesh'},
 1253: {'ar': 'تازة كند خوشة مهر', 'en': 'Tazeh Kand-e Khusheh Mehr'},
 1424: {'ar': 'سیاه برة', 'en': 'Siah Pareh'},
 1624

### Testing out the different process dataset format

Test 1: Dataset output

In [148]:
print(alname_df_all_values_selected_only_first_loc_en_filtered[alname_df_all_values_selected_only_first_loc_en_filtered["geonameid"]==1269750].to_markdown())

|        |   geonameid | isolanguage   | alt_name   |
|-------:|------------:|:--------------|:-----------|
|  63100 |     1269750 | en            | India      |
|  63101 |     1269750 | ko            | 인도       |
| 177456 |     1269750 | ja            | インド     |
| 465842 |     1269750 | pt            | Índia      |
| 562186 |     1269750 | es            | India      |
| 601777 |     1269750 | zh            | 印度       |
| 721371 |     1269750 | ar            | الهند      |
| 810115 |     1269750 | fr            | Inde       |
| 839386 |     1269750 | ru            | Индия      |
| 922130 |     1269750 | de            | Indien     |


Test 2: JSONified table output

In [122]:
db.sql('''
SELECT *
FROM alname_df_all_values_selected_only_first_loc_en_filtered_jsonified
WHERE geonameid==1269750
''')

┌───────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ geonameid │                                          iso_language_alt_names                                          │
│   int32   │                                                 varchar                                                  │
├───────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│   1269750 │ {"en": "India", "ko": "인도", "ja": "インド", "pt": "Índia", "es": "India", "zh": "印度", "ar": "الهند…  │
└───────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────┘

Test 3: Dictionary (/JSON) output

In [150]:
alname_df_all_values_selected_only_first_loc_en_filtered_dict[1269750]

{'en': 'India',
 'ko': '인도',
 'ja': 'インド',
 'pt': 'Índia',
 'es': 'India',
 'zh': '印度',
 'ar': 'الهند',
 'fr': 'Inde',
 'ru': 'Индия',
 'de': 'Indien'}

### Saving the files

The dataset in CSV and the JSON in JSON is available in the zip file `geoname_alt_names.zip`

In [145]:
alname_df_all_values_selected_only_first_loc_en_filtered.to_csv("geoname_alt_names.csv", index=False, encoding='utf-8')

In [146]:
with open("geoname_alt_names.json", "w", encoding='utf-8') as file:
    json.dump(alname_df_all_values_selected_only_first_loc_en_filtered_dict, file, ensure_ascii=False)