In [4]:
import re
import geonamescache
from unidecode import unidecode
import pandas as pd

# Load the country table from geonamescache
gc = geonamescache.GeonamesCache()
gcCountries = gc.get_countries()

# Load every city record and order the records from most to least populous,
# so that when a headline could match several cities the more populated one
# comes first in the list
gcCities = gc.get_cities()
sortedCityNames = list(gcCities.values())
sortedCityNames.sort(key=lambda record: record['population'], reverse=True)

# Build the city lookup structures from the population-sorted records:
#   cities        - every decoded, lower-cased city name, in sorted order
#   cityMap       - decoded name -> country code and original city name
#   cityToLatLng  - original city name -> latitude / longitude

# Special cases that were observed: decoded names that get replaced by another spelling.
# There happens to be a city called Of, which matches 'of' in headlines; it is
# stored as 'of city' to prevent it from matching incorrectly.
cityNameOverrides = {
    'antigua guatemala': 'antigua',
    'north druid hills': 'druid hills',
    'st. johns': 'saint johns',
    'cebu city': 'cebu',
    'of': 'of city',
}

cities = []
cityMap = {}
cityToLatLng = {}
for cityRecord in sortedCityNames:
    originalName = cityRecord['name']
    normalizedName = unidecode(originalName).lower()
    normalizedName = cityNameOverrides.get(normalizedName, normalizedName)

    cities.append(normalizedName)

    # Only the first record seen for a decoded name is kept
    if normalizedName not in cityMap:
        cityMap[normalizedName] = {
            'countrycode': cityRecord['countrycode'],
            'name': originalName,
        }

    # Likewise, only the first record seen for an original name keeps its lat/lng
    if originalName not in cityToLatLng:
        cityToLatLng[originalName] = {
            'lat': cityRecord['latitude'],
            'lng': cityRecord['longitude'],
        }

# print(cities)
# print(cityMap)
# print(cityToLatLng)

# Create the regex string for the decoded cities.
#
# Each name is passed through re.escape() so it is matched literally. Without
# escaping, regex metacharacters inside a name are interpreted (for example the
# '.' in 'st. louis' matches any character), and the matched text would then not
# be a key of cityMap.
#
# Duplicate names are dropped (keeping the first occurrence, so the order of
# `cities` is preserved): a repeated alternative can never produce a different
# match, it only makes the pattern larger.
seenCityNames = set()
uniqueCityNames = []
for cityName in cities:
    if cityName not in seenCityNames:
        seenCityNames.add(cityName)
        uniqueCityNames.append(cityName)

citiesRegexString = (
    r'(?P<city>\b('
    + '|'.join(re.escape(cityName) for cityName in uniqueCityNames)
    + r')\b)'
)

# Create pre-compiled regex; the named group 'city' holds the matched city name
cityRegex = re.compile(citiesRegexString)
# print(cityRegex)

# Process headlines and match cities where possible.
#
# For every line of ./data/headlines.txt the stripped headline is recorded
# together with the first city matched by cityRegex and that city's country
# (both None when nothing matches). The result is written to a CSV file.
#
# NOTE(review): city names were passed through unidecode() before the regex was
# built, but headlines are only lower-cased here, so a headline that spells a
# city with accented characters would not match - confirm whether headlines.txt
# can contain such spellings.
count = 0
missed = 0
data = { 'headline': [], 'countries': [], 'cities': [] }
outputPath = './data/section1Output.csv'

# The context manager closes the file even if one of the lookups below raises
with open("./data/headlines.txt", 'r') as file:
    for line in file:
        headline = line.rstrip()
        data['headline'].append(headline)
        city = None
        country = None

        # Matching is done on the lower-cased headline because the regex is
        # built from lower-cased city names
        cityMatch = cityRegex.search(headline.lower())
        if cityMatch is not None:
            cityLower = cityMatch.group('city')
            cityInfo = cityMap[cityLower]
            city = cityInfo['name']
            country = gcCountries[cityInfo['countrycode']]['name']
            count = count + 1
        else:
            missed = missed + 1

        # Appended on every iteration so all three columns stay the same length
        data['countries'].append(country)
        data['cities'].append(city)

print('No city/country matched for [' + str(missed) + '] healines')

# Create dataFrame and output to csv file
dataFrame = pd.DataFrame(data = data)

dataFrame.to_csv(path_or_buf=outputPath, index=False)
print('Finished procssing headlines. Output file can be found at \'' + outputPath + '\'')

No city/country matched for [33] healines
Finished procssing headlines. Output file can be found at './data/section1Output.csv'
