# Discovering Disease Outbreaks from News Headlines

In [1]:
import geonamescache
import re
from unidecode import unidecode
import pandas as pd

## 1. Extracting City and Country Information from News Headlines

In [2]:
gc = geonamescache.GeonamesCache()

In [3]:
def to_regex(name):
    return re.compile(fr'\b{name}\b', flags=re.IGNORECASE)

In [4]:
cities = [city['name'] for city in gc.get_cities().values()]
city_by_regex = {to_regex(name): name for name in cities}

In [5]:
countries = [country['name'] for country in gc.get_countries().values()]
country_by_regex = {to_regex(name): name for name in countries}

In [6]:
headlinesFile = open('../data/headlines.txt')

In [7]:
headlines = [unidecode(headline.strip()) for headline in headlinesFile.readlines()]

In [8]:
headlinesFile.close()

In [9]:
def find_name_in_line(line, name_by_regex):
    for regex, name in sorted(name_by_regex.items(), key=lambda x: x[1]):
        if regex.search(line):
            return name
    return None

In [10]:
df = pd.DataFrame({
    'headline': headlines,
    'city':     [find_name_in_line(headline, city_by_regex)    for headline in headlines],
    'country':  [find_name_in_line(headline, country_by_regex) for headline in headlines],
})

In [11]:
df.head()

Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


In [12]:
df.to_pickle('../data/df.pkl')
