In [1]:
import geonamescache
from collections import Counter
import unidecode
import re
import json

# Load one headline per line, trimming surrounding whitespace and passing each
# headline through unidecode so later matching works on unaccented text.
with open("./data/headlines.txt", encoding="utf-8") as file:
    data = [unidecode.unidecode(headline.strip()) for headline in file]

In [2]:
# Collect the "name" field of every country and city record in geonamescache.
gc1 = geonamescache.GeonamesCache()
countries = [record["name"] for record in gc1.get_countries().values()]
cities = [record["name"] for record in gc1.get_cities().values()]

# Map each unidecoded name back to its original spelling. When two names
# unidecode to the same key, the one appearing later in the list wins.
country_accent_mapping = dict(zip(map(unidecode.unidecode, countries), countries))
city_accent_mapping = dict(zip(map(unidecode.unidecode, cities), cities))

# Order the unaccented names longest-first (ties keep mapping order). These
# lists are joined into regex alternations further down, where earlier
# alternatives are tried first.
unaccented_cities = sorted(city_accent_mapping, key=len, reverse=True)
unaccented_countries = sorted(country_accent_mapping, key=len, reverse=True)

In [3]:
def _build_name_regex(names):
    """Return regex source that matches any of ``names`` as a whole word.

    Parameters
    ----------
    names : iterable of str
        Candidate names, in priority order: at a given position in the text,
        earlier names are tried before later ones.

    Returns
    -------
    str
        A pattern string suitable for ``re.search`` / ``re.compile``.
    """
    # Escape every name so characters such as "." or "(" in place names are
    # matched literally instead of being interpreted as regex syntax.
    escaped = [re.escape(name) for name in names if name]
    if not escaped:
        # An empty alternation would match the empty string everywhere;
        # "(?!)" can never match, so "no names" means "no hits".
        return r"(?!)"
    # One pair of boundary assertions wraps the whole alternation, so the first
    # and last alternatives are anchored exactly like the ones in between.
    # The lookarounds behave like \b for names that start and end with a word
    # character, and still work for names that start or end with punctuation.
    return r"(?<!\w)(?:" + "|".join(escaped) + r")(?!\w)"


country_reg = _build_name_regex(unaccented_countries)
city_reg = _build_name_regex(unaccented_cities)

## Find the first country and the first city mentioned in a headline.
def find_in_headline(headline):
    """Search a headline for one country name and one city name.

    Uses the module-level patterns ``country_reg`` and ``city_reg``. For each
    pattern, only the first match reported by ``re.search`` is kept.

    Parameters
    ----------
    headline : str
        The headline text to search.

    Returns
    -------
    dict
        ``{"headline": ..., "countries": ..., "cities": ...}`` where the
        ``countries`` and ``cities`` values are the matched text, or ``None``
        when the corresponding pattern did not match.
    """
    result = {"headline": headline, "countries": None, "cities": None}
    for key, pattern in (("countries", country_reg), ("cities", city_reg)):
        hit = re.search(pattern, headline)
        if hit:
            result[key] = hit.group(0)
    return result

## Run the lookup over every headline, keeping the results in headline order.
headline_countries_cities = list(map(find_in_headline, data))

In [5]:
def _write_json(path, payload):
    """Serialize ``payload`` with ``json.dumps`` and write it to ``path``."""
    with open(path, "w") as file:
        file.write(json.dumps(payload))


# Persist the per-headline matches and both accent lookup tables.
save_file = "./data/headline_countries_cities.json"
_write_json(save_file, headline_countries_cities)
_write_json("./data/country_accent_mapping.json", country_accent_mapping)
_write_json("./data/city_accent_mapping.json", city_accent_mapping)