# Manning liveProject - Task 2 - Adding Latitude and Longitude Coordinates

In [1]:
# Import required libraries
import pandas as pd
import geonamescache
import re
import unidecode

# Load data from headlines.txt

In [2]:
# Read all headlines and put those into Pandas dataframe
df = pd.read_fwf('headlines.txt')

# Name that column as Headlines
df.columns = ['headlines']

# Get Cities and Countries

In [3]:
# Initialize genamescache
gc = geonamescache.GeonamesCache()

countries = [country['name'] for country in gc.get_countries().values()]
cities = [city['name'] for city in gc.get_cities().values()]


# Remove Accent Marks and sort values by length

In [4]:
country_accent_mapping = {unidecode.unidecode(country): country for country in countries}
unaccented_countries = set(country_accent_mapping.keys())
unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)

city_accent_mapping = {unidecode.unidecode(city): city for city in cities}
unaccented_cities = list(city_accent_mapping.keys())
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)


df['headlines'] = df['headlines'].apply(lambda x: unidecode.unidecode(x))

# Constructing the Regular Expression

In [5]:
city_regex = re.compile(r'\b|\b'.join(unaccented_cities))
country_regex = re.compile(r'\b|\b'.join(unaccented_countries))

# Functions to get country, city and longitude/latitude

In [6]:
def getCountry(value):
    result = country_regex.search(value)
    
    if result:
        return result.group(0)
    return ''
    
def getCity(value):
    result = city_regex.search(value)
    if result:
        temp = result.group(0)
        return temp
    return ''

def getLocation(value):
    location = {}
    if value:
        location_list = [val for val in gc.get_cities_by_name(value)]
        if location_list and len(location_list) > 0:
            for record in location_list:
                list_keys = record.keys()
            
                for key in list_keys:                    
                    location_val = record[key]
                    location['countrycode'] = location_val['countrycode']
                    location['latitude'] = location_val['latitude']
                    location['longitude'] = location_val['longitude']
        else:
            location['countrycode'] = ''
            location['latitude'] = ''
            location['longitude'] = ''
        
    else:
        location['countrycode'] = ''
        location['latitude'] = ''
        location['longitude'] = ''
    return location

def getData(value):
    data = {}
    country = getCountry(value)
    city = getCity(value)
    if city:
        location = getLocation(city)
        data['countrycode'] = location['countrycode']
        data['latitude'] = location['latitude']
        data['longitude'] = location['longitude']
    else:
        location = ''
        data['countrycode'] = ''
        data['latitude'] = ''
        data['longitude'] = ''
        
    data['country'] = country
    data['city'] = city
    data['headlines'] = value
    return data

In [7]:
# df['countries'] = df['headlines'].apply(lambda x: getCountry(x))
# df['cities'] = df['headlines'].apply(lambda x: getCity(x))
# df = df.apply(lambda x: getLocation(x))

list_data = []
for index, val in df.iterrows():
    list_data.append(getData(val['headlines']))
    
df_new = pd.DataFrame(list_data)
df_new

Unnamed: 0,countrycode,latitude,longitude,country,city,headlines
0,US,40.7143,-74.006,,New York City,Could Zika Reach New York City?
1,US,25.7906,-80.13,,Miami Beach,First Case of Zika in Miami Beach
2,BR,-8.05389,-34.8811,Brazil,Recife,"Mystery Virus Spreads in Recife, Brazil"
3,US,44.9193,-123.317,,Dallas,Dallas man comes down with case of Zika
4,UY,-33.5165,-56.8996,,Trinidad,Trinidad confirms first Zika case
5,US,29.7633,-95.3633,,Houston,Zika Concerns are Spreading in Houston
6,,,,,Geneve,Geneve Scientists Battle to Find Cure
7,US,33.749,-84.388,,Atlanta,The CDC in Atlanta is Growing Worried
8,,,,,Sao Paulo,Zika Infested Monkeys in Sao Paulo
9,US,40.6609,-73.9201,,Brownsville,Brownsville teen contracts Zika virus
