In [87]:
import re
import geonamescache
import pandas as pd
import numpy as np

from unidecode import unidecode

class DiseaseOutbreakDataMiner:

    def __init__(self,file_path):
        self.data_file = file_path
        self.regexp = re.compile(r'[a-zA-z.]+')
        self.gc = geonamescache.GeonamesCache()
        self.df = None
        self.headlines = []
        self.countries = []
        self.lats = []
        self.longs =[]
        self.places = []

    def find_city(self,c_word, n_word, nn_word):

        c_city = None
        n_city = None
        nn_city = None
        cn_city = None  
        cnn_city = None

        #print(f"finding geo for city {c_word} {n_word} {nn_word}")

        #Three words
        search_word = c_word + " " + n_word + " " + nn_word
        #print(f"searching for: {search_word}")
        cnn_city = self.gc.get_cities_by_name(search_word)

        if len(cnn_city) > 0:
            #print(f"Searched for city {search_word} got geo: {cnn_city}")
            return (3,search_word,cnn_city)

        #first two words   
        search_word = c_word + " " + n_word
        #print(f"searching for: {search_word}")
        cn_city = self.gc.get_cities_by_name(search_word)

        if len(cn_city) > 0 :
            #print(f"searched for city {search_word} got geo: {cn_city}")
            return (2,search_word,cn_city)

        #last two words
        search_word = n_word + " " + nn_word
        #print(f"searching for: {search_word}")
        nn_city = self.gc.get_cities_by_name(search_word)
        if len(nn_city) > 0:
            #print(f"searched for city {search_word} got geo: {nn_city}")
            return (2,search_word,nn_city)

        #first word
        c_city = self.gc.get_cities_by_name(c_word)
        #print(f"searching for: {c_word}")

        if len(c_city) > 0:
            #print(f"serached for {c_word} geo: {c_city}")
            return (1,c_word,c_city)

        #second word
        n_city = self.gc.get_cities_by_name(n_word)
        #print(f"searching for: {n_word}")

        if len(n_city) > 0:
            #print(f"serached for {n_word} geo: {n_city}")
            return (1,n_word,n_city)

        #third word
        nn_city = self.gc.get_cities_by_name(nn_word)
        #print(f"searching for: {nn_word}")

        if len(nn_city) > 0:
            #print(f"serached for {nn_word} geo: {nn_city}")
            return (1,nn_word,nn_city)    


        return None
    
    def data_to_dataframe(self,headline, geo_points):
        #print(geo_points)

        t_place = np.nan
        t_country = np.nan
        t_lat = np.nan
        t_long = np.nan


        max_point = 0
        for point in geo_points:
            if point[0] > max_point:
                t_place = point[1]
                max_point = point[0]
                loc_data =  point[2]
                for loc in loc_data:
                    for key in loc:
                        t_country = loc[key]["countrycode"]
                        t_lat = loc[key]["latitude"]
                        t_long = loc[key]["longitude"]

                        #print(f"{headline} {t_place} {t_country} {t_lat} {t_long}")

        self.headlines.append(headline)
        self.places.append(t_place)
        self.countries.append(t_country)
        self.lats.append(t_lat)
        self.longs.append(t_long)
    

    def save_dataframe(self):
        data = {"headlines": self.headlines, "cities": self.places, "countries": self.countries, "latittudes": self.lats, "longitudes" : self.longs }
        df = pd.DataFrame(data)
        df.replace(to_replace='None', value=np.nan)
        return df

    def load_data(self):
        geo_collections = {}
        file = open(self.data_file, "r")
        for number,line in enumerate(file.readlines()):
            head_line = unidecode(line)
            result = re.findall(r'[a-zA-Z.]+',line,flags=re.I)
            if result ==  None:
                print("Wrong with Regexp construction")
            else:
                g_pos = []
                for idx,res in enumerate(result):
                    if (idx < (len(result) - 2)):
                        result_city = self.find_city(result[idx], result[idx+1], result[idx+2])
                        if result_city is not None:
                            g_pos.append(result_city)

            geo_collections[number] = g_pos
            self.data_to_dataframe(line,geo_collections[number])


        file.close()
        self.df = self.save_dataframe()


    


In [88]:
data_miner = DiseaseOutbreakDataMiner("data/headlines.txt")
data_miner.load_data()
len(data_miner.df.index)
data_miner.df.head(50)

Unnamed: 0,headlines,cities,countries,latittudes,longitudes
0,Zika Outbreak Hits Miami\n,Miami,US,25.77427,-80.19366
1,Could Zika Reach New York City?\n,New York City,US,40.71427,-74.00597
2,First Case of Zika in Miami Beach\n,Miami Beach,US,25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil\n",Recife,BR,-8.05389,-34.88111
4,Dallas man comes down with case of Zika\n,Dallas,US,44.91928,-123.31705
5,Trinidad confirms first Zika case\n,Trinidad,UY,-33.5165,-56.89957
6,Zika Concerns are Spreading in Houston\n,Houston,US,29.76328,-95.36327
7,Geneva Scientists Battle to Find Cure\n,Geneva,US,41.88753,-88.30535
8,The CDC in Atlanta is Growing Worried\n,Atlanta,US,33.749,-84.38798
9,Zika Infested Monkeys in SÃ£o Paulo\n,,,,


In [64]:
data_miner.gc.get_cities_by_name('São Paulo')

[{'3448439': {'geonameid': 3448439,
   'name': 'São Paulo',
   'latitude': -23.5475,
   'longitude': -46.63611,
   'countrycode': 'BR',
   'population': 10021295,
   'timezone': 'America/Sao_Paulo',
   'admin1code': '27'}}]