## Geoparsing using GPE and LOC from each article

In [2]:
#Mount Google Drive at /content/drive so the cleaned_text folder read later in this notebook is accessible
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Clone the project repository into the current working directory
!git clone https://github.com/alexdseo/Visualization-App-on-World-Events

Cloning into 'Visualization-App-on-World-Events'...
remote: Enumerating objects: 258, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 258 (delta 35), reused 40 (delta 13), pack-reused 186[K
Receiving objects: 100% (258/258), 1.49 GiB | 11.85 MiB/s, done.
Resolving deltas: 100% (68/68), done.


In [4]:
#Move into the cloned repository so relative paths used later (e.g. geoparse.csv) resolve inside it
%cd Visualization-App-on-World-Events

/content/Visualization-App-on-World-Events


In [5]:
import spacy
#from spacy import displacy #visualize option
from collections import Counter
from pprint import pprint
from geopy.geocoders import Nominatim

In [6]:
# Ask spaCy to use the GPU if one is available; the value displayed below is the call's return value
spacy.prefer_gpu()

True

In [7]:
#Load the small English spaCy pipeline; its named entities (GPE/LOC) are used for geoparsing below
import en_core_web_sm
nlp = en_core_web_sm.load()
# Set the pipeline's max_length to 4,000,000
# NOTE(review): presumably raised because some article files exceed spaCy's default limit -- confirm
nlp.max_length = 4000000

In [8]:
#Collect the path of every cleaned article, one sub-folder per news source
import os

# Single place to change the data location (was repeated three times as a literal)
CLEANED_TEXT_DIR = '/content/drive/MyDrive/cleaned_text'

all_filepaths = []

for newssource in os.listdir(CLEANED_TEXT_DIR):
    source_dir = os.path.join(CLEANED_TEXT_DIR, newssource)
    # Skip macOS metadata and any other stray file sitting next to the
    # news-source folders; os.listdir() on a non-directory would raise.
    if newssource == ".DS_Store" or not os.path.isdir(source_dir):
        continue
    for filename in os.listdir(source_dir):
        article_path = os.path.join(source_dir, filename)
        # Only regular files are articles; a nested folder would later break open()
        if filename != ".DS_Store" and os.path.isfile(article_path):
            all_filepaths.append(article_path)

# Sorted so the processing order is deterministic across runs
all_filepaths.sort()

In [9]:
#Number of article files found
len(all_filepaths)

81

In [10]:
import pandas as pd
import numpy as np
#Empty DataFrame and column names for the geoparsing results
#Expected contents: 81 articles, two entity types (GPE and LOC), up to the top 20 entities of each type per article
geoparse=pd.DataFrame()
col_list =['Entity','Frequency','Entity Type', 'Year', 'Month', 'Day', 'Latitude', 'Longitude', 'Country', 'City', 'News Source']

In [11]:
#Geoparse the most frequent GPE/LOC entities of every article with the Nominatim API
#One output row per (article, entity type, entity): frequency, date, coordinates, country, city, news source
geolocator = Nominatim(user_agent="NSF", timeout =10)

ENTITY_TYPES = ('GPE', 'LOC')   # spaCy entity labels to geoparse, processed in this order
TOP_N = 20                      # keep the N most frequent entities per type and article
ARTICLE_YEAR, ARTICLE_MONTH = 2022, 3   # every row is stamped with this year and month

# entity text -> (latitude, longitude, country, city). The same place names recur
# in most articles, so each distinct entity is sent to the geocoder only once.
geocode_cache = {}


def _geocode_entity(entity):
    """Resolve an entity name to (latitude, longitude, country, city).

    Latitude/longitude are returned as strings; any field that cannot be
    resolved is np.nan. Results are memoised in ``geocode_cache``.
    Errors raised by the geocoding service itself are not caught here.
    """
    if entity in geocode_cache:
        return geocode_cache[entity]

    latitude = longitude = country = city = np.nan
    location = geolocator.geocode(entity)
    # geocode() yields None when the entity is unknown -> all fields stay NaN
    if location is not None:
        latitude, longitude = str(location.latitude), str(location.longitude)
        place = geolocator.reverse(latitude + "," + longitude, language='en')
        # Keep the coordinates even if the reverse lookup gives no result or no
        # address; only country/city stay NaN in that case.
        if place is not None:
            address = place.raw.get('address', {})
            country = address.get('country', np.nan)
            city = address.get('city', np.nan)

    geocode_cache[entity] = (latitude, longitude, country, city)
    return geocode_cache[entity]


# Rows are collected in a list and turned into a DataFrame once at the end:
# cheaper than pd.concat per row, and re-running the cell no longer appends
# a second copy of every row to an existing `geoparse`.
records = []

for path in all_filepaths:
    # Context manager so the file handle is closed after reading
    with open(path, "r") as handle:
        text = handle.read().replace('\n', ' ')

    # Layout is .../<news source>/<file name>; derived from the path tail so it
    # does not depend on how deep the data root is.
    source_dir, filename = os.path.split(path)
    news_source = os.path.basename(source_dir)
    # The day is the second '-'-separated field of the file name
    day = int(filename.split('-')[1])

    article = nlp(text)
    for entity_type in ENTITY_TYPES:
        counts = Counter(ent.text for ent in article.ents if ent.label_ == entity_type)
        for entity, frequency in counts.most_common(TOP_N):
            latitude, longitude, country, city = _geocode_entity(entity)
            records.append([entity, frequency, entity_type, ARTICLE_YEAR, ARTICLE_MONTH, day,
                            latitude, longitude, country, city, news_source])

geoparse = pd.DataFrame(records, columns=col_list)



In [12]:
#Preview the geoparsed table
#NOTE(review): in the output below, region names such as 'Latin America', 'the Middle East' and
#'Southeast Asia' resolve to unrelated cities (Paris, Cambridge, Leipzig) -- LOC rows need validation
geoparse

Unnamed: 0,Entity,Frequency,Entity Type,Year,Month,Day,Latitude,Longitude,Country,City,News Source
0,Ukraine,129,GPE,2022,3,1,49.4871968,31.2718321,Ukraine,,aljazeera
1,Russia,117,GPE,2022,3,1,64.6863136,97.7453061,Russia,,aljazeera
2,US,25,GPE,2022,3,1,39.7837304,-100.445882,United States,,aljazeera
3,Moscow,23,GPE,2022,3,1,55.7504461,37.6174943,Russia,Moscow,aljazeera
4,Poland,8,GPE,2022,3,1,52.215933,19.134422,Poland,,aljazeera
...,...,...,...,...,...,...,...,...,...,...,...
2347,Europe,2,LOC,2022,3,31,51.0,10.0,Germany,,fox
2348,Latin America,1,LOC,2022,3,31,48.84774885,2.351138358927914,France,Paris,fox
2349,the Middle East,1,LOC,2022,3,31,42.3638002,-71.1013814,United States,Cambridge,fox
2350,Southeast Asia,1,LOC,2022,3,31,51.3305381,12.3786243,Germany,Leipzig,fox


In [15]:
#Save the geoparsed entities to geoparse.csv in the current directory (UTF-8, without the index column)
geoparse.to_csv('geoparse.csv', encoding='utf-8', index=False)