In [1]:

import pandas as pd
import numpy as np
import re
import geonamescache
from unidecode import unidecode


In [4]:
# ----------------------------------------------------
# 
#     Read headlines, line by line
# 
# ----------------------------------------------------

# Forward slashes are accepted by Python on Windows as well as on POSIX
# systems, so the notebook is no longer tied to Windows path separators.
# The `with` block guarantees the file handle is closed once the lines are
# read (the handle was previously left open for the life of the kernel).
with open('discovering-disease-outbreaks-base-master/data/headlines.txt', 'r') as file1:
    # One entry per line, with surrounding whitespace/newline removed.
    HeadLines = [line.strip() for line in file1]

# Preview the first ten headlines.
for line in HeadLines[0:10]:
    print(line)

Zika Outbreak Hits Miami
Could Zika Reach New York City?
First Case of Zika in Miami Beach
Mystery Virus Spreads in Recife, Brazil
Dallas man comes down with case of Zika
Trinidad confirms first Zika case
Zika Concerns are Spreading in Houston
Geneve Scientists Battle to Find Cure
The CDC in Atlanta is Growing Worried
Zika Infested Monkeys in Sao Paulo


In [6]:

def name_to_regex(name):
    """Compile a case-insensitive, whole-name regex for a place name.

    The pattern matches ``name`` literally and, when ``unidecode`` produces a
    different spelling (e.g. accented characters), that transliterated
    spelling as well.

    Parameters
    ----------
    name : str
        Place name to look for.

    Returns
    -------
    re.Pattern
        Compiled pattern with ``re.IGNORECASE`` set.
    """
    decoded_name = unidecode(name)
    # Escape the name so regex metacharacters in it (".", "(", "+", ...) are
    # matched literally; unescaped, "St. Louis" would match "Stx Louis" and a
    # name with an unbalanced "(" would fail to compile.
    escaped_name = re.escape(name)
    if name != decoded_name:
        body = fr'({escaped_name}|{re.escape(decoded_name)})'
    else:
        body = escaped_name
    # Lookarounds behave exactly like \b for names that start and end with a
    # word character, but also anchor correctly when the name starts or ends
    # with punctuation (where \b would demand an adjacent word character).
    return re.compile(fr'(?<!\w){body}(?!\w)', flags=re.IGNORECASE)


# Empty results table; the headline loop below fills in one row per headline.
df_city_county = pd.DataFrame(columns=['headlines','countries','cities'])

# GeonamesCache instance used as the source of city and country names.
gc = geonamescache.GeonamesCache()

# -- City names, plus a {compiled pattern: city name} lookup table.
#    Names that occur more than once collapse onto a single pattern key.
cities = [record['name'] for record in gc.get_cities().values()]
regex_cities = {name_to_regex(city_name): city_name for city_name in cities}

# -- Country names, plus the matching {compiled pattern: country name} table.
countries = [record['name'] for record in gc.get_countries().values()]
regex_countries = {
    name_to_regex(country_name): country_name for country_name in countries
}

    
def _longest_match(text, regex_to_name):
    """Return the longest name whose pattern matches ``text``, or NaN if none.

    Parameters
    ----------
    text : str
        Headline to search.
    regex_to_name : dict
        Maps a compiled regex to the place name it was built from.

    Returns
    -------
    str or float
        The longest matching name, or ``np.nan`` when nothing matches.
    """
    matched = {
        name for pattern, name in regex_to_name.items() if pattern.search(text)
    }
    if not matched:
        # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    # Longest name wins so "Miami Beach" is preferred over "Miami". Equal
    # lengths are broken alphabetically so the result does not depend on set
    # iteration order, which can change from run to run.
    return max(matched, key=lambda name: (len(name), name))


# -- Loop through each headline and parse out its city and country.
#    Rows are collected first and the frame is built once, instead of
#    enlarging the DataFrame one row at a time.
headline_rows = [
    {
        'headlines': text,
        'countries': _longest_match(text, regex_countries),
        'cities': _longest_match(text, regex_cities),
    }
    for text in HeadLines
]

df_city_county = pd.DataFrame(
    headline_rows, columns=['headlines', 'countries', 'cities']
)

display(df_city_county)

Unnamed: 0,headlines,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,,Jerusalem
646,More Zika patients reported in Indang,,Indang
647,Suva authorities confirmed the spread of Rotav...,,Suva
648,More Zika patients reported in Bella Vista,,Bella Vista
