# Location extraction with spaCy

In [1]:
import numpy as np
import sys, os, pandas as pd

# Import libraries for database connection
import base64
from sqlalchemy import create_engine
import iso639
import psycopg2 as pg

# Import library for natural language processing
import nltk

# Import library to open URLs
import urllib

# Do not truncate cell contents when displaying DataFrames
pd.options.display.max_colwidth = None

# Suppress every warning for the remainder of the session
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load spaCy's pretrained English "trf" pipeline: higher accuracy than the
# alternatives, at the price of a bigger model and slower execution.
import spacy
from spacy import displacy  # displaCy is used to visualise spaCy tokens

nlp = spacy.load('en_core_web_trf')

In [3]:
# Functions that extract each kind of locational entity separately.
# All four share one implementation and differ only in the entity label they
# keep; the public names and signatures are unchanged so existing calls work.
def _filter_entities_by_label(entities, label):
    """Return the entities whose ``label_`` attribute equals ``label``.

    Args:
        entities: Iterable of objects exposing a ``label_`` attribute
            (for example the ``ents`` of a spaCy document).
        label: The entity label to keep, e.g. ``'GPE'``.

    Returns:
        A new list with the matching entities in their original order.
        The list is empty when ``entities`` is empty or nothing matches.
    """
    return [entity for entity in entities if entity.label_ == label]


def filter_location_entities(entities):
    """Return the entities labelled ``'GPE'`` from ``entities``, in order."""
    return _filter_entities_by_label(entities, 'GPE')


def filter_location_entities1(entities):
    """Return the entities labelled ``'FAC'`` from ``entities``, in order."""
    return _filter_entities_by_label(entities, 'FAC')


def filter_location_entities2(entities):
    """Return the entities labelled ``'ORG'`` from ``entities``, in order."""
    return _filter_entities_by_label(entities, 'ORG')


def filter_location_entities3(entities):
    """Return the entities labelled ``'LOC'`` from ``entities``, in order."""
    return _filter_entities_by_label(entities, 'LOC')

In [4]:
# Read the input CSV into a DataFrame and preview its first row
input_csv = 'data2//data_english.csv'
df = pd.read_csv(input_csv)
df.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,message_id,date,text,source,longitude,latittude,split_text,WordsCount,lang
0,0,2,2,1161754851510030342,2019-08-14 21:41:55,we are in and the tickets are REAL,iphone,-96.712301,40.817276,"['we', 'are', 'in', 'and', 'the', 'tickets', 'are', 'REAL']",8,en


In [5]:
# Inspect the column names of the loaded frame
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'message_id', 'date',
       'text', 'source', 'longitude', 'latittude', 'split_text', 'WordsCount',
       'lang'],
      dtype='object')

In [6]:
# Remove the 'Unnamed: *' columns and the split_text / WordsCount columns,
# then preview the first two rows of what remains.
columns_to_drop = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'split_text',
    'WordsCount',
]
df = df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0,message_id,date,text,source,longitude,latittude,lang
0,1161754851510030342,2019-08-14 21:41:55,we are in and the tickets are REAL,iphone,-96.712301,40.817276,en
1,1197029965797175303,2019-11-20 05:52:38,We should have this 475 ml sugar baby,iphone,-84.383281,33.76105,en


In [7]:
# Extracting locational entities.
# Run the spaCy pipeline once per message and reuse the resulting entities for
# all four labels. Parsing the same text separately for each label would repeat
# the (slow) model call four times without changing the result.
entities_per_text = df['text'].astype(str).apply(lambda x: nlp(x).ents)

# One column per entity label, each holding the list of matching entities.
df['GPE'] = entities_per_text.apply(filter_location_entities)
df['FAC'] = entities_per_text.apply(filter_location_entities1)
df['ORG'] = entities_per_text.apply(filter_location_entities2)
df['LOC'] = entities_per_text.apply(filter_location_entities3)
df.head()

Unnamed: 0,message_id,date,text,source,longitude,latittude,lang,GPE,FAC,ORG,LOC
0,1161754851510030342,2019-08-14 21:41:55,we are in and the tickets are REAL,iphone,-96.712301,40.817276,en,[],[],[],[]
1,1197029965797175303,2019-11-20 05:52:38,We should have this 475 ml sugar baby,iphone,-84.383281,33.76105,en,[],[],[],[]
2,1195453434780143622,2019-11-15 21:28:03,Guys I got Pokemon Sword Today,Android,-80.203721,26.663375,en,[],[],[],[]
3,1195688311177785344,2019-11-16 13:01:22,"Today is all about our writing buddies.My writing buddy, @Jacqui_Nelson is amazing! We may be thousands of kilometres apart now, but our connection is strong.We are cheerleaders for one another. Her drive and",Instagram,-123.3645,48.4287,en,[],[],[],[]
4,1184520943399399424,2019-10-16 17:26:14,"The IAFF has bucked the trend lines regarding union membership Our union continues to proudly grow through robust organizing efforts. Over 319,000.....IAFF union strong",iphone,-122.431374,37.787679,en,[],[],"[(IAFF), (IAFF)]",[]


In [8]:
# Persist the frame, including the extracted entity columns, to CSV.
# NOTE(review): to_csv writes the index by default; that is presumably where the
# 'Unnamed: *' columns in the input file came from. Consider index=False once
# downstream readers are confirmed not to rely on that extra column.
output_csv = 'data2//data2_english_locations.csv'
df.to_csv(output_csv)