Get headlines into memory

In [1]:
f = open('headlines.txt', 'r')
headlines = [line.strip() for line in f.readlines()]
headlines[:10]

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika',
 'Trinidad confirms first Zika case',
 'Zika Concerns are Spreading in Houston',
 'Geneve Scientists Battle to Find Cure',
 'The CDC in Atlanta is Growing Worried',
 'Zika Infested Monkeys in Sao Paulo']

Get all country names

In [2]:
from geonamescache import GeonamesCache
gc = GeonamesCache()

countries = [country['name']for country in gc.get_countries().values()]
countries[125:150]

['Kazakhstan',
 'Laos',
 'Lebanon',
 'Saint Lucia',
 'Liechtenstein',
 'Sri Lanka',
 'Liberia',
 'Lesotho',
 'Lithuania',
 'Luxembourg',
 'Latvia',
 'Libya',
 'Morocco',
 'Monaco',
 'Moldova',
 'Montenegro',
 'Saint Martin',
 'Madagascar',
 'Marshall Islands',
 'Macedonia',
 'Mali',
 'Myanmar',
 'Mongolia',
 'Macao',
 'Northern Mariana Islands']

Create the regular expression dictionary for the country names.
- Requires the unidecode name to be included in the dictionary key
- SAMPLE: {(unidecode_name|name) : name}

In [3]:
from unidecode import unidecode
import re
def create_regex_from_name(name):
    unidecode_name = unidecode(name)
    if name != unidecode:
        r = fr'\b({name}|{unidecode_name})\b'
    else:
        r = fr'\b{name}\b'
    return re.compile(r, flags=re.IGNORECASE)
    

In [4]:
countries_re = {create_regex_from_name(c): c  for c in countries}

Get city names

In [5]:
cities = [city['name'] for city in gc.get_cities().values()]
cities[:5]

['Andorra la Vella',
 'Umm Al Quwain City',
 'Ras Al Khaimah City',
 'Zayed City',
 'Khawr Fakkān']

Build the regex function for getting the city name from the headlines

In [6]:
cities_re = {create_regex_from_name(c): c for c in cities}

In [16]:
def reg_extract(regex_pattern, city_name):
    match = re.match(regex_pattern, city_name)
    if match is not None:
        start, end = match.start(), match.end()
        return city_name[start:end]
    else:
        return ''

In [13]:
import pandas as pd
df = pd.DataFrame(headlines)
df = df.rename(columns={0:'headlines'})
df

Unnamed: 0,headlines
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika
...,...
645,Rumors about Rabies spreading in Jerusalem hav...
646,More Zika patients reported in Indang
647,Suva authorities confirmed the spread of Rotav...
648,More Zika patients reported in Bella Vista


In [25]:
h ='Mad Cow Disease Hits London'
for regex, name in cities_re.items():
    if regex.search(h):
        print(name)

London


In [28]:
def city_in_headline(headline, re_dictionary):
    for regex, name in re_dictionary.items():
        if regex.search(headline):
            return name
    return None

In [29]:
df['City'] = df['headlines'].apply(lambda x : city_in_headline(x, cities_re))
df

Unnamed: 0,headlines,City
0,Zika Outbreak Hits Miami,Miami
1,Could Zika Reach New York City?,York
2,First Case of Zika in Miami Beach,Of
3,"Mystery Virus Spreads in Recife, Brazil",Recife
4,Dallas man comes down with case of Zika,Man
...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem
646,More Zika patients reported in Indang,Indang
647,Suva authorities confirmed the spread of Rotav...,Suva
648,More Zika patients reported in Bella Vista,Bella Vista


In [30]:
df.describe()

Unnamed: 0,headlines,City
count,650,618
unique,647,504
top,Spanish Flu Spreading through Madrid,Of
freq,2,61


In [75]:
df[df['City'].str.len() <= 4]

Unnamed: 0,headlines,City,Country,Country_Code
45,Lima tries to address Zika Concerns,Lima,Peru,PE
71,Pune woman diagnosed with Zika,Pune,India,IN
80,Authorities are Worried about the Spread of Ma...,Rome,Italy,IT
130,Molo Cholera Spread Causing Concern,Molo,Kenya,KE
171,Zika arrives in Miri,Miri,Malaysia,MY
273,More people in Nadi are infected with HIV ever...,Nadi,Fiji,FJ
283,Rumors about Tuberculosis Spreading in Baud ha...,Baud,India,IN
320,Chikungunya re-emerges in Kobe,Kobe,Japan,JP
363,More Zika patients reported in Waco,Waco,United States,US
365,Erie County sets Zika traps,Erie,United States,US


Problem with data frame.  Ho and San were 'matched' instead of Ho Chi Minh and San Salvador

In [32]:
df[df['City'] == 'San']

Unnamed: 0,headlines,City
361,How to Avoid Respiratory Syncytial Virus in Sa...,San
443,Bronchitis Keeps Spreading in San Mateo,San


In [33]:

def get_cities_in_headline(headline):
    cities_in_headline = set()
    for regex, name in cities_re.items():
        match = regex.search(headline)
        if match:
            if headline[match.start()].isupper():
                cities_in_headline.add(name)
    return list(cities_in_headline)

df['Cities'] = df['headlines'].apply(lambda x: get_cities_in_headline(x))
df


Unnamed: 0,headlines,City,Cities
0,Zika Outbreak Hits Miami,Miami,[Miami]
1,Could Zika Reach New York City?,York,"[New York City, York]"
2,First Case of Zika in Miami Beach,Of,"[Miami Beach, Miami]"
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[Recife]
4,Dallas man comes down with case of Zika,Man,[Dallas]
...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,[Jerusalem]
646,More Zika patients reported in Indang,Indang,[Indang]
647,Suva authorities confirmed the spread of Rotav...,Suva,[Suva]
648,More Zika patients reported in Bella Vista,Bella Vista,"[Vista, Bella Vista]"


In [36]:
def get_longest_string_in_list(input_list):
    if input_list is not None:
        result = ''
        for l in input_list:
            if len(l) > len(result):
                result = l
        return result
    else:
        return None
df['City'] = df['Cities'].apply(lambda x: get_longest_string_in_list(x))
df

Unnamed: 0,headlines,City,Cities
0,Zika Outbreak Hits Miami,Miami,[Miami]
1,Could Zika Reach New York City?,New York City,"[New York City, York]"
2,First Case of Zika in Miami Beach,Miami Beach,"[Miami Beach, Miami]"
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[Recife]
4,Dallas man comes down with case of Zika,Dallas,[Dallas]
...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,[Jerusalem]
646,More Zika patients reported in Indang,Indang,[Indang]
647,Suva authorities confirmed the spread of Rotav...,Suva,[Suva]
648,More Zika patients reported in Bella Vista,Bella Vista,"[Vista, Bella Vista]"


In [39]:
def get_country_from_city_name(city_name):
    """This function assumes that the city with the largest population is the correct one"""
    if city_name:
   
        best_match = max(gc.get_cities_by_name(city_name),
                        key=lambda x: list(x.values())[0]['population'])
        if len(best_match.values()) > 0:
            best_match_data = list(best_match.values())[0]
            return best_match_data['countrycode']
    return None

print(get_country_from_city_name('San Francisco'))
print(get_country_from_city_name('Los Ángeles'))
print(get_country_from_city_name('Miami'))
print(get_country_from_city_name(None))


US
CL
US
None


In [43]:
# get country name from code
def get_country_name_from_code(country_code):
    if country_code:
        for country in gc.get_countries().values():
            if country['iso'] == country_code.upper():
                return country['name']
        return None
    else:
        return None
print(get_country_name_from_code('AE'))
print(get_country_name_from_code('GB'))
print(get_country_name_from_code('US'))
print(get_country_name_from_code('DE'))

United Arab Emirates
United Kingdom
United States
Germany


In [46]:
df['Country_Code'] = df['City'].apply(get_country_from_city_name)
df

Unnamed: 0,headlines,City,Cities,Country,Country_Code
0,Zika Outbreak Hits Miami,Miami,[Miami],United States,US
1,Could Zika Reach New York City?,New York City,"[New York City, York]",United States,US
2,First Case of Zika in Miami Beach,Miami Beach,"[Miami Beach, Miami]",United States,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[Recife],Brazil,BR
4,Dallas man comes down with case of Zika,Dallas,[Dallas],United States,US
...,...,...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,[Jerusalem],Israel,IL
646,More Zika patients reported in Indang,Indang,[Indang],Philippines,PH
647,Suva authorities confirmed the spread of Rotav...,Suva,[Suva],Fiji,FJ
648,More Zika patients reported in Bella Vista,Bella Vista,"[Vista, Bella Vista]",Dominican Republic,DO


In [45]:
df['Country'] = df['Country'].apply(get_country_name_from_code)
df

Unnamed: 0,headlines,City,Cities,Country
0,Zika Outbreak Hits Miami,Miami,[Miami],United States
1,Could Zika Reach New York City?,New York City,"[New York City, York]",United States
2,First Case of Zika in Miami Beach,Miami Beach,"[Miami Beach, Miami]",United States
3,"Mystery Virus Spreads in Recife, Brazil",Recife,[Recife],Brazil
4,Dallas man comes down with case of Zika,Dallas,[Dallas],United States
...,...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,[Jerusalem],Israel
646,More Zika patients reported in Indang,Indang,[Indang],Philippines
647,Suva authorities confirmed the spread of Rotav...,Suva,[Suva],Fiji
648,More Zika patients reported in Bella Vista,Bella Vista,"[Vista, Bella Vista]",Dominican Republic


In [49]:
df = df.drop(columns=['Cities'])
df

Unnamed: 0,headlines,City,Country,Country_Code
0,Zika Outbreak Hits Miami,Miami,United States,US
1,Could Zika Reach New York City?,New York City,United States,US
2,First Case of Zika in Miami Beach,Miami Beach,United States,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil,BR
4,Dallas man comes down with case of Zika,Dallas,United States,US
...,...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,Israel,IL
646,More Zika patients reported in Indang,Indang,Philippines,PH
647,Suva authorities confirmed the spread of Rotav...,Suva,Fiji,FJ
648,More Zika patients reported in Bella Vista,Bella Vista,Dominican Republic,DO


In [53]:
df.describe()

Unnamed: 0,headlines,City,Country,Country_Code
count,650,650.0,611,611
unique,647,579.0,94,94
top,Spanish Flu Spreading through Madrid,,United States,US
freq,2,39.0,302,302


In [73]:
df = df[df['City'] != '']
df

Unnamed: 0,headlines,City,Country,Country_Code
0,Zika Outbreak Hits Miami,Miami,United States,US
1,Could Zika Reach New York City?,New York City,United States,US
2,First Case of Zika in Miami Beach,Miami Beach,United States,US
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil,BR
4,Dallas man comes down with case of Zika,Dallas,United States,US
...,...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,Israel,IL
646,More Zika patients reported in Indang,Indang,Philippines,PH
647,Suva authorities confirmed the spread of Rotav...,Suva,Fiji,FJ
648,More Zika patients reported in Bella Vista,Bella Vista,Dominican Republic,DO


In [None]:
"""Build map and add features"""
import cartopy
from cartopy.crs import PlateCarree
import matplotlib.pyplot as plt 
plt.figure(figsize=(30,15))
ax = plt.axes(projection=PlateCarree())
def add_map_features():
    ax.coastlines()
    ax.add_feature(cartopy.feature.BORDERS)
    ax.add_feature(cartopy.feature.OCEAN)
    ax.add_feature(cartopy.feature.LAND)
    ax.add_feature(cartopy.feature.STATES)

add_map_features()
plt.show()

