In [None]:
# Load the headlines, one per line, with surrounding whitespace removed.
# The encoding is pinned so accented place names decode the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes headlines.txt is UTF-8 encoded -- confirm.
with open("headlines.txt", encoding="utf-8") as file:
    data = [headline.strip() for headline in file]

# Preview the first four headlines.
data[:4]

In [None]:
import geonamescache

# One cache object is created here and reused by later cells via `gc`.
gc = geonamescache.GeonamesCache()

# Collect the "name" field of every country record held by the cache.
country_records = gc.get_countries().values()
countries = [record["name"] for record in country_records]
countries[:4]

In [None]:
# Collect the "name" field of every city record held by the cache.
city_records = gc.get_cities().values()
cities = [record['name'] for record in city_records]
cities[:4]

In [None]:
from collections import Counter

# Tally how many city records share each name, then show the ten most
# frequent names.
city_counts = Counter()
city_counts.update(cities)
city_counts.most_common(10)

## Removing Accent Marks

We need to remove the accent marks from the lists of countries and cities. For this we will use the `unidecode` library. (Method from this [Stack Overflow answer](https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string).) For the cities and the countries from geonamescache, we will map the unaccented name to the accented name. 

In [None]:
import unidecode

# Map each unaccented spelling back to the original name. When two original
# names collapse to the same unaccented key, the later one in the list wins.
country_accent_mapping = dict(
    zip(map(unidecode.unidecode, countries), countries)
)

city_accent_mapping = dict(
    zip(map(unidecode.unidecode, cities), cities)
)

# Spot-check one unaccented key.
city_accent_mapping["Asmar"]

In [None]:
# Strip accents from every headline so they can be matched against the
# unaccented place names.
data = list(map(unidecode.unidecode, data))
data[-4:]

## Searching for Cities and Countries

Next, we'll search each headline for any cities and/or countries. To do this, we use regular expressions created from the unaccented cities and countries.

In [None]:
# The mapping keys are the unaccented spellings to search for.
unaccented_cities = list(city_accent_mapping)
unaccented_countries = set(country_accent_mapping)

print(f"There are {len(unaccented_cities)} cities to look through.")
print(f"There are {len(unaccented_countries)} countries to look through.")

In [None]:
import re

problem_city = "San Jose"

# Alternatives are tried left to right, so with the short name listed first
# the match stops at "San".
short_name_first = r"\bSan\b|\bSan Jose\b"
re.search(short_name_first, problem_city)

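Here we see the second problem. We've matched only `San` instead of the entire city name, because a regular expression tries its alternatives from left to right and stops at the first one that matches. To correct this, we change the ordering of the regular expression so that the longer name comes first.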

In [None]:
# With the longer alternative listed first, the full city name is matched.
re.search(r"\bSan Jose\b|\bSan\b", problem_city)

In [None]:
# Order the city names longest-first so longer names precede their prefixes
# in the alternation built from this list.
unaccented_cities.sort(key=len, reverse=True)
unaccented_cities[:2]

In [None]:
# Order the country names longest-first (this also turns the set into a list).
unaccented_countries = sorted(unaccented_countries, key=len, reverse=True)
unaccented_countries[:2]

In [None]:
# Build a single alternation of whole-word city names.
# - Every name is wrapped in \b on BOTH sides; joining with r'\b|\b' alone
#   leaves the first name without a leading boundary and the last name without
#   a trailing one, so those two could match inside longer words.
# - Names go through re.escape so characters such as "." or "(" are matched
#   literally instead of being treated as regex metacharacters.
# The alternatives keep the order of `unaccented_cities`.
city_regex = (
    r"\b" + r"\b|\b".join(map(re.escape, unaccented_cities)) + r"\b"
)
city_regex[1500:1800]

In [None]:
import numpy as np

# Fix the seed so the same ten sample headlines are drawn on every run.
np.random.seed(50)
test_headlines = np.random.choice(data, 10)

# Print each sampled headline followed by the first city found in it, if any.
for test_headline in test_headlines:
    print(test_headline)
    match = re.search(city_regex, test_headline)
    if match is not None:
        print(match.group(0), "\n")

In [None]:
# Build a single alternation of whole-word country names.
# - Every name is wrapped in \b on BOTH sides; joining with r"\b|\b" alone
#   leaves the first name without a leading boundary and the last name without
#   a trailing one.
# - Names go through re.escape so regex metacharacters in a name are matched
#   literally.
# The alternatives keep the order of `unaccented_countries`.
country_regex = (
    r"\b" + r"\b|\b".join(map(re.escape, unaccented_countries)) + r"\b"
)
country_regex[:100]

In [None]:
# A different fixed seed gives a second reproducible sample of ten headlines.
np.random.seed(100)
test_headlines = np.random.choice(data, 10)

# Print each sampled headline followed by the first country found in it, if any.
for test_headline in test_headlines:
    print(test_headline)
    match = re.search(country_regex, test_headline)
    if match is not None:
        print(match.group(0), "\n")

In [None]:
# Run the city pattern and then the country pattern against one headline.
# .group(0) raises AttributeError if a pattern finds nothing.
test_headline = data[3]
print(test_headline)
for pattern in (city_regex, country_regex):
    print(re.search(pattern, test_headline).group(0))

In [None]:
# Show the original spellings stored for these two unaccented names.
print(city_accent_mapping["Recife"])
print(country_accent_mapping["Brazil"])

Neither of these has accents.

### City and Country Regular Expression Function

Let's encapsulate the logic to find city and country names into a function.

In [None]:
def find_city_and_country_in_headline(headline):
    """
    Locate the first city name and the first country name in a headline.

    The search uses the module-level ``city_regex`` and ``country_regex``
    patterns.

    :param headline: string for headline

    :return dict: keys ``headline`` (the input text), ``countries`` and
        ``cities`` (the matched text, or ``None`` when nothing matched).
    """
    # Pre-populate the keys so the result keeps the headline/countries/cities
    # order regardless of which pattern is searched first.
    found = {"headline": headline, "countries": None, "cities": None}
    for field, pattern in (("cities", city_regex), ("countries", country_regex)):
        hit = re.search(pattern, headline)
        if hit:
            found[field] = hit.group(0)
    return found

In [None]:
# Run the helper on the headline at index 3, the one checked by hand earlier.
find_city_and_country_in_headline(data[3])

In [None]:
# Run the helper on the headline at index 1.
find_city_and_country_in_headline(data[1])

In [None]:
# Apply the helper to every headline, keeping the results in headline order.
headline_cities_and_countries = list(
    map(find_city_and_country_in_headline, data)
)
headline_cities_and_countries[-10:]

In [None]:
import json

# Persist the per-headline matches as a JSON array.
save_file = "headline_cities_and_countries.json"
with open(save_file, "w") as fout:
    json.dump(headline_cities_and_countries, fout)

In [None]:
# Read the file back to confirm it round-trips through JSON.
with open(save_file, "r") as fin:
    check_data = json.load(fin)

In [None]:
# Inspect the last ten records read back from the JSON file.
check_data[-10:]

In [None]:
# Inspect the first five records read back from the JSON file.
check_data[:5]

In [None]:
# Save the unaccented -> original city-name mapping.
with open("city_accent_mapping.json", "w") as fout:
    json.dump(city_accent_mapping, fout)

In [None]:
# Save the unaccented -> original country-name mapping.
with open("country_accent_mapping.json", "w") as fout:
    json.dump(country_accent_mapping, fout)

In [None]:
import pandas as pd

# Load the saved matches into a DataFrame and turn None entries into NaN.
# Note that `data` is rebound here from the list of headlines to a DataFrame.
data = pd.read_json("headline_cities_and_countries.json").replace({None: np.nan})

data.head(10)

In [None]:
# Map each city's UNACCENTED name to its (latitude, longitude).
# The 'cities' column holds unaccented names (they were matched with the
# unaccented regex), so keying by the original accented name would silently
# miss every accented city and leave its coordinates empty.
# When several records share an unaccented name, the last one iterated wins.
locations = {
    unidecode.unidecode(city["name"]): (city["latitude"], city["longitude"])
    for city in gc.get_cities().values()
}

# Rows with no detected city fall back to (None, None), i.e. no coordinates.
data['latitude'] = data['cities'].apply(lambda x: locations.get(x, (None, None))[0])
data['longitude'] = data['cities'].apply(lambda x: locations.get(x, (None, None))[1])

In [None]:
# Preview the first ten rows, now including the latitude and longitude columns.
data.head(10)

In [None]:
# Map each country's UNACCENTED name to its "iso" code.
# The 'countries' column holds unaccented names (they were matched with the
# unaccented regex), so the lookup table is keyed the same way; keying by the
# original name would miss any country whose name carries an accent.
country_codes = {
    unidecode.unidecode(country["name"]): country["iso"]
    for country in gc.get_countries().values()
}

# Rows with no detected country get None.
data['countrycode'] = data['countries'].apply(lambda x: country_codes.get(x, None))

In [None]:
# Preview the first ten rows, now including the countrycode column.
data.head(10)