# Cleaning extracted entities

In [1]:
# Import libriaries 
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# from util import getCursor
from IPython.display import display, HTML # import display from Ipython.display instead of IPython.core.display
import base64

In [3]:
# Define display limits
pd.options.display.max_colwidth = 400
pd.set_option("display.max_rows", None,)

# %matplotlib inline  
%load_ext autoreload
%autoreload 2
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
df = pd.read_csv('data//analysis_data//keyword+temp_filtered_posts.csv')
df.shape

In [None]:
df.columns

In [None]:
df =df.drop(columns =['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1.1','Unnamed: 4'])

In [None]:
df.head(2)

In [None]:
# Count distinct by source
df.groupby(['source']).size().sort_values(ascending=False)

In [None]:
# replace all NaN with empty string
df = df.replace(np.nan, '', regex=True)
df.head()

In [None]:
# Data cleaning: Remove square brakets from entities
df['GPE'] =  df['GPE'].apply(lambda x: x.replace('[','').replace(']',''))
df['FAC'] =  df['FAC'].apply(lambda x: x.replace('[','').replace(']',''))
df['ORG'] =  df['ORG'].apply(lambda x: x.replace('[','').replace(']',''))
df['LOC'] =  df['LOC'].apply(lambda x: x.replace('[','').replace(']',''))
df['TE'] =  df['TE'].apply(lambda x: x.replace('[','').replace(']',''))
df.head(2)

In [None]:
# Extracting rows with TE
df_TE = df[(df['TE'] != '')]
df_TE.shape

In [None]:
# Count distinct by source
df_TE.groupby(['source']).size().sort_values(ascending=False)

In [None]:
df_TE[['en_text','TE']].head(50)

In [None]:
# Droping rows without location entity extracted
# Only drops rows where all conditions are met that is all four entities are equal to zero

index_names = df[(df['GPE']== '') & (df['FAC']== '') & (df['ORG']== '') & (df['LOC']== '')].index
df.drop(index_names, inplace = True)
df.shape

In [None]:
# Count distinct by source for only posts with location extracted
df.groupby(['source']).size().sort_values(ascending=False)

In [None]:
# Extracting rows with TE after dropping rows without location
df_TE = df[(df['TE'] != '')]
df_TE.shape

In [None]:
#df.to_csv('data//analysis_data//all_locations_extracted.csv')

## Clean location entities

In [None]:
# Load dataset after keyword filtering and Temporal Information matching
df = pd.read_csv('data//analysis_data//keyword+temp_filtered_posts_final.csv')
df.shape

In [None]:
# replace all NaN with empty string
df = df.replace(np.nan, '', regex=True)
df.head()

In [None]:
print("Number of unique values:")
print("\n GPE =", df['GPE'].nunique(), "\n ORG =", df['ORG'].nunique(), "\n LOC =", df['LOC'].nunique(), "\n FAC =", df['FAC'].nunique())

In [None]:
# Remove hashtag signs on extracted locations
df['GPE'] = df['GPE'].str.replace('#','')
df['LOC'] = df['LOC'].str.replace('#','')
df['FAC'] = df['FAC'].str.replace('#','')
df['ORG'] = df['ORG'].str.replace('#','')

In [None]:
# Remove @ signs from extracted locations
df['GPE'] = df['GPE'].str.replace('@','')
df['LOC'] = df['LOC'].str.replace('@','')
df['FAC'] = df['FAC'].str.replace('@','')
df['ORG'] = df['ORG'].str.replace('@','')

In [None]:
# Add space when case changes 
df['GPE'] = df['GPE'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['LOC'] = df['LOC'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['FAC'] = df['FAC'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['ORG'] = df['ORG'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['LOC'].unique()

In [None]:
df[['GPE','FAC','LOC','ORG']].head(30)

In [None]:
# Counting comma separated entities
df['#LOC'] = df['LOC'].str.count(',')
df['#GPE'] = df['GPE'].str.count(',')
df['#ORG'] = df['ORG'].str.count(',')
df['#FAC'] = df['FAC'].str.count(',')
df[['GPE','#GPE']].head(10)

In [None]:
# Remove spaces between entities to generate unique entities
df['GPE'] = df['GPE'].str.replace(" ", "")
df['FAC'] = df['FAC'].str.replace(" ", "")
df['LOC'] = df['LOC'].str.replace(" ", "")
df['ORG'] = df['ORG'].str.replace(" ", "")

In [None]:
# Add space when case changes 
df['GPE'] = df['GPE'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['LOC'] = df['LOC'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['FAC'] = df['FAC'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()
df['ORG'] = df['ORG'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ').str.strip()

In [None]:
#Keep only unique entities
df['unq_GPE'] = df['GPE'].str.split(',').apply(pd.unique)
df['unq_FAC'] = df['FAC'].str.split(',').apply(pd.unique)
df['unq_LOC'] = df['LOC'].str.split(',').apply(pd.unique)
df['unq_ORG'] = df['ORG'].str.split(',').apply(pd.unique)
df[['GPE','unq_GPE']].loc[df['#GPE'] ==3]

In [None]:
df['unq_GPE'].dtype

In [None]:
# Data cleaning: Remove square brakets from location entities
df['unq_GPE'] =  df['unq_GPE'].astype(str).apply(lambda x: x.replace('[','').replace(']',''))
df['unq_FAC'] =  df['unq_FAC'].astype(str).apply(lambda x: x.replace('[','').replace(']',''))
df['unq_LOC'] =  df['unq_LOC'].astype(str).apply(lambda x: x.replace('[','').replace(']',''))
df['unq_ORG'] =  df['unq_ORG'].astype(str).apply(lambda x: x.replace('[','').replace(']',''))

In [None]:
df[['GPE','unq_GPE']].loc[df['#GPE'] ==3].head(10)

In [None]:
# Split with comma on unq entities
df['unq_GPE'] =  df['unq_GPE'].astype(str).replace('' '',',')
df[['GPE','unq_GPE']].loc[df['#GPE'] ==3].head(10)

In [None]:
df.to_csv('data//analysis_data//unique.csv')

# Final cleaned unique locations

In [4]:
df = pd.read_csv('data//analysis_data//unique_all_cleaned.csv')
df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,id,Unnamed: 4,date,en_text,source,longitude,...,LOC,TE,#LOC,#GPE,#ORG,#FAC,unq_GPE,unq_FAC,unq_LOC,unq_ORG
0,0,230,1311,1311,3411639,,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,...,,Apr-04,0,0,0,0,,Boston Common,,
1,1,231,1315,1315,3412188,,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,...,the St.Lawrence Cach River,Apr-10,0,0,0,0,Quebec,,the St.Lawrence Cach River,
2,2,232,1320,1320,3412818,,11-Feb-20,I did mine February 11. @NASA be capping,iPhone,-71.436234,...,,Feb-11,0,0,0,0,,,,NASA


In [5]:
df =df.drop(columns =['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1','Unnamed: 0.1.1.1','Unnamed: 4'])
df.head(3)

Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,GPE,FAC,ORG,LOC,TE,#LOC,#GPE,#ORG,#FAC,unq_GPE,unq_FAC,unq_LOC,unq_ORG
0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,,Boston Common,,,Apr-04,0,0,0,0,,Boston Common,,
1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Quebec,,,the St.Lawrence Cach River,Apr-10,0,0,0,0,Quebec,,the St.Lawrence Cach River,
2,3412818,11-Feb-20,I did mine February 11. @NASA be capping,iPhone,-71.436234,41.850653,en,,,NASA,,Feb-11,0,0,0,0,,,,NASA


In [6]:
# replace all NaN with empty string
df = df.replace(np.nan, '', regex=True)
df.head()

Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,GPE,FAC,ORG,LOC,TE,#LOC,#GPE,#ORG,#FAC,unq_GPE,unq_FAC,unq_LOC,unq_ORG
0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,,Boston Common,,,Apr-04,0,0,0,0,,Boston Common,,
1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Quebec,,,the St.Lawrence Cach River,Apr-10,0,0,0,0,Quebec,,the St.Lawrence Cach River,
2,3412818,11-Feb-20,I did mine February 11. @NASA be capping,iPhone,-71.436234,41.850653,en,,,NASA,,Feb-11,0,0,0,0,,,,NASA
3,3413044,14-Feb-20,2/13 @ OnStage Atlanta,instagram,-84.26823,33.79319,en,Atlanta,,On Stage,,Feb-13,0,0,0,0,Atlanta,,,On Stage
4,3413467,15-Jan-20,"The view from the On The Green Magazine office. This is #Beachwood, 9:30 AM on January 15. It's already 70 degrees but later it's supposed to plunge down to 68. Time to plan your golf vacation!#myrtlebeach #myrtlebeachgolf #onthegreen #wintergolf #golfvacation",hootsuite,-78.701422,33.813737,en,,,On The Green Magazine,,Jan-15,0,0,0,0,,,,On The Green Magazine


In [7]:
# Counting comma separated entities
df['#LOC2'] = df['unq_LOC'].str.count(',')
df['#GPE2'] = df['unq_GPE'].str.count(',')
df['#ORG2'] = df['unq_ORG'].str.count(',')
df['#FAC2'] = df['unq_FAC'].str.count(',')
df[['GPE','#GPE','unq_GPE','#GPE2']].head(10)

Unnamed: 0,GPE,#GPE,unq_GPE,#GPE2
0,,0,,0
1,Quebec,0,Quebec,0
2,,0,,0
3,Atlanta,0,Atlanta,0
4,,0,,0
5,"Youngstown,Japan,Japan",2,"Youngstown, Japan",1
6,Idaho,0,Idaho,0
7,,0,,0
8,,0,,0
9,,0,,0


In [9]:
df.to_csv('data//analysis_data//unique_all_cleaned.csv')

In [10]:
df.columns

Index(['id', 'date', 'en_text', 'source', 'longitude', 'latittude', 'lang',
       'GPE', 'FAC', 'ORG', 'LOC', 'TE', '#LOC', '#GPE', '#ORG', '#FAC',
       'unq_GPE', 'unq_FAC', 'unq_LOC', 'unq_ORG', '#LOC2', '#GPE2', '#ORG2',
       '#FAC2'],
      dtype='object')

## Discard GPE > 1 comma

In [22]:
df[['FAC','unq_FAC','#FAC','#FAC2']].loc[df['#FAC'] >=2]

Unnamed: 0,FAC,unq_FAC,#FAC,#FAC2
37,"Park Ave,Park Ave,Park Avenue,94th Street","Park Ave, Park Avenue, 94th Street",3,2
383,"Washington Avenue Corridor Historic District!U Albany,Washington Avenue,the Schuyler Building","Washington Avenue Corridor Historic District!U Albany\n Washington Avenue, the Schuyler Building",2,1
1225,"24th,Mechanic,the Powell Arch","24th, Mechanic, the Powell Arch",2,2
1318,"Cherokee Street,Boone Drive,Big Shanty Drive,Pine Hill Drive","Cherokee Street, Boone Drive, Big Shanty Drive, Pine Hill Drive",3,3
1652,"Rodeo,NRG Stadium,Astrodome,RODEOHOUSTON","Rodeo, NRG Stadium, Astrodome, RODEOHOUSTON",3,3
1732,"Dale Street Shaw,St Thomas Church Hall,Glebe Street","Dale Street Shaw, St Thomas Church Hall, Glebe Street",2,2
2339,"MLKDRIVE.,30th Ave,Lincoln Way","MLKDRIVE., 30th Ave, Lincoln Way",2,2
2398,"UA,SFO,LAX","UA, SFO, LAX",2,2
2432,"Bryant Park,Winter Village,Bryant Park Ice Rink","Bryant Park, Winter Village, Bryant Park Ice Rink",2,2
2788,"Metro2033,VDN Kh,Metro","Metro2033, VDN Kh, Metro",2,2


In [25]:
df.shape

(18510, 24)

In [27]:
df_final = df.loc[df['#GPE2'] <=1]
df_final.shape

(18226, 24)

In [28]:
df_final = df_final.loc[df['#LOC2'] <=1]
df_final.shape

(18219, 24)

In [29]:
df_final = df_final.loc[df['#FAC2'] <=1]
df_final.shape

(18183, 24)

In [30]:
df_final = df_final.loc[df['#ORG2'] <=1]
df_final.shape

(17969, 24)

In [33]:
df_final.columns

Index(['id', 'date', 'en_text', 'source', 'longitude', 'latittude', 'lang',
       'GPE', 'FAC', 'ORG', 'LOC', 'TE', '#LOC', '#GPE', '#ORG', '#FAC',
       'unq_GPE', 'unq_FAC', 'unq_LOC', 'unq_ORG', '#LOC2', '#GPE2', '#ORG2',
       '#FAC2'],
      dtype='object')

In [34]:
df_final = df_final.drop(columns = ['#LOC', '#GPE', '#ORG', '#FAC', 'GPE', 'FAC', 'ORG', 'LOC', '#LOC2', '#GPE2', '#ORG2', '#FAC2'])
df_final.columns

Index(['id', 'date', 'en_text', 'source', 'longitude', 'latittude', 'lang',
       'TE', 'unq_GPE', 'unq_FAC', 'unq_LOC', 'unq_ORG'],
      dtype='object')

In [39]:
df_final = df_final.rename(columns = {'unq_GPE':'GPE','unq_FAC':'FAC', 'unq_LOC':'LOC', 'unq_ORG':'ORG' })
df_final.columns

Index(['id', 'date', 'en_text', 'source', 'longitude', 'latittude', 'lang',
       'TE', 'GPE', 'FAC', 'LOC', 'ORG'],
      dtype='object')

In [40]:
df_final.head(10)

Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,TE,GPE,FAC,LOC,ORG
0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,Apr-04,,Boston Common,,
1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Apr-10,Quebec,,the St.Lawrence Cach River,
2,3412818,11-Feb-20,I did mine February 11. @NASA be capping,iPhone,-71.436234,41.850653,en,Feb-11,,,,NASA
3,3413044,14-Feb-20,2/13 @ OnStage Atlanta,instagram,-84.26823,33.79319,en,Feb-13,Atlanta,,,On Stage
4,3413467,15-Jan-20,"The view from the On The Green Magazine office. This is #Beachwood, 9:30 AM on January 15. It's already 70 degrees but later it's supposed to plunge down to 68. Time to plan your golf vacation!#myrtlebeach #myrtlebeachgolf #onthegreen #wintergolf #golfvacation",hootsuite,-78.701422,33.813737,en,Jan-15,,,,On The Green Magazine
5,3414717,15-Feb-20,"Youve been a piece of my heart since January 2016. Youngstown wont be the same after you travel back to Japan, but I know that youll be a ray of sunshine anywhere you go. #LoveHer #RayOfSunshine #Japan #Japanese",instagram,-80.64921,41.09861,en,Jan-16,"Youngstown, Japan",,,
6,3416128,15-Apr-20,"Ready, set, order... Only for free delivery in Treasure Valley (Idaho)... Order online at Orders made by April 16, 5pm MST will be eligible for Unicorn, Sakasquatch and Beanie drawing. Join us for",instagram,-116.39033,43.61129,en,Apr-16,Idaho,,Treasure Valley,
7,3416192,19-Mar-20,Ava combos! 3/19 @ Tong Dragon Mixed Martial Arts,instagram,-74.172632,39.908788,en,Mar-19,,Tong Dragon Mixed Martial Arts,,
8,3416548,04-Jan-20,2020/1/4: - Drinking an Oatology by @imprintbeer @ Imprint Beer Co.,untappd,-75.284,40.264,en,04/01/2020,,,,Imprint Beer Co.
9,3418141,21-Apr-20,Weekly NYC Covid19 report 4/20,iPhone,-73.883894,40.665907,en,Apr-20,,,,"NYC, Covid19"


In [41]:
df_final.to_csv('geocoding//data//final_posts.csv')

# Preparing geocoding locations

In [42]:
df = pd.read_csv('geocoding//data//final_posts.csv')
df.shape

(17969, 13)

In [43]:
# Print number of unique entities
print("Number of unique values:")
print("\n GPE =", df['GPE'].nunique(), "\n ORG =", df['ORG'].nunique(), "\n LOC =", df['LOC'].nunique(), "\n FAC =", df['FAC'].nunique())

Number of unique values:

 GPE = 4197 
 ORG = 6125 
 LOC = 702 
 FAC = 3000


In [46]:
# replace all NaN with empty string
df = df.replace(np.nan, '', regex=True)
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,TE,GPE,FAC,LOC,ORG
0,0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,Apr-04,,Boston Common,,
1,1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Apr-10,Quebec,,the St.Lawrence Cach River,
2,2,3412818,11-Feb-20,I did mine February 11. @NASA be capping,iPhone,-71.436234,41.850653,en,Feb-11,,,,NASA
3,3,3413044,14-Feb-20,2/13 @ OnStage Atlanta,instagram,-84.26823,33.79319,en,Feb-13,Atlanta,,,On Stage
4,4,3413467,15-Jan-20,"The view from the On The Green Magazine office. This is #Beachwood, 9:30 AM on January 15. It's already 70 degrees but later it's supposed to plunge down to 68. Time to plan your golf vacation!#myrtlebeach #myrtlebeachgolf #onthegreen #wintergolf #golfvacation",hootsuite,-78.701422,33.813737,en,Jan-15,,,,On The Green Magazine


In [47]:
df['LOC'].unique()

array(['', 'the St.Lawrence Cach River', 'Treasure Valley',
       'Georgian Bay', 'Hill Valley', 'bayridge', 'Valley',
       'Latin America', 'Red River', 'Heaven Hill', 'Deep Creek North',
       'South Texas', 'Washington Heights', 'Mars', 'Old Sacramento',
       'Centennial Hills', 'the Hudson Valley',
       'Jeffery Epsteins, Doucheowotz', 'Lake Bluff, Lakebluff',
       'New England', 'Nevada', 'South', 'Rose Hill', 'Lost Cities',
       'Lake Braddock', 'Bayarea', 'ELCENTROCALIFORNIA', 'West, East',
       'Germantown Ave', 'South Miami', 'Sugarland', 'valley',
       'Las Vegas Strip', 'Colorado National Monument', 'So Cal',
       'the Great Northwest', 'the Tidal Basin', 'Cheyenne Mountain',
       'NORTHKOREA', 'Europe', 'DC', 'Earth', 'the French Quarter',
       'Las Trampas', 'Jordan River', 'Forth Ward', 'WY', 'Himalayas',
       'White Mountainsof Arizona', 'saltlake', 'the Gaslight District',
       'North Buffalo', 'Redblacks', '6thstreet', 'Bayside', 'Lake Tahoe',

In [48]:
# Combining locational entities to get finer and more informed place names
# Locations are combined only when both columns are not null

df['FAC_GPE'] = np.where(((df['FAC'] != '') & (df['GPE'] != '')), df['FAC'].str.cat(df['GPE'], sep = ", "), '')
df['ORG_GPE'] = np.where(((df['ORG'] != '') & (df['GPE'] != '')), df['ORG'].str.cat(df['GPE'], sep = ", "), '')
df['LOC_GPE'] = np.where(((df['LOC'] != '') & (df['GPE'] != '')), df['LOC'].str.cat(df['GPE'], sep = ", "), '')

df['FAC_LOC'] = np.where(((df['FAC'] != '') & (df['LOC'] != '')), df['FAC'].str.cat(df['LOC'], sep = ", "), '')
df['ORG_LOC'] = np.where(((df['ORG'] != '') & (df['LOC'] != '')), df['ORG'].str.cat(df['LOC'], sep = ", "), '')

df['LOC_FAC'] = np.where(((df['LOC'] != '') & (df['FAC'] != '')), df['LOC'].str.cat(df['FAC'], sep = ", "), '')
df['FAC_ORG'] = np.where(((df['FAC'] != '') & (df['ORG'] != '')), df['FAC'].str.cat(df['ORG'], sep = ", "), '')

df['ORG_FAC'] = np.where(((df['ORG'] != '') & (df['FAC'] != '')), df['ORG'].str.cat(df['FAC'], sep = ", "), '')
df['LOC_ORG'] = np.where(((df['LOC'] != '') & (df['ORG'] != '')), df['LOC'].str.cat(df['ORG'], sep = ", "), '')
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,TE,GPE,...,ORG,FAC_GPE,ORG_GPE,LOC_GPE,FAC_LOC,ORG_LOC,LOC_FAC,FAC_ORG,ORG_FAC,LOC_ORG
0,0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,Apr-04,,...,,,,,,,,,,
1,1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Apr-10,Quebec,...,,,,"the St.Lawrence Cach River, Quebec",,,,,,


In [49]:
df['FAC_LOC_GPE'] = np.where(((df['FAC_LOC'] != '') & (df['GPE'] != '')), df['FAC_LOC'].str.cat(df['GPE'], sep = ", "), '')
df['ORG_LOC_GPE'] = np.where(((df['ORG_LOC'] != '') & (df['GPE'] != '')), df['ORG_LOC'].str.cat(df['GPE'], sep = ", "), '')
df['LOC_FAC_GPE'] = np.where(((df['LOC_FAC'] != '') & (df['GPE'] != '')), df['LOC_FAC'].str.cat(df['GPE'], sep = ", "), '')
df['FAC_ORG_GPE'] = np.where(((df['FAC_ORG'] != '') & (df['GPE'] != '')), df['FAC_ORG'].str.cat(df['GPE'], sep = ", "), '')
df['ORG_FAC_GPE'] = np.where(((df['ORG_FAC'] != '') & (df['GPE'] != '')), df['ORG_FAC'].str.cat(df['GPE'], sep = ", "), '')
df['LOC_ORG_GPE'] = np.where(((df['LOC_ORG'] != '') & (df['GPE'] != '')), df['LOC_ORG'].str.cat(df['GPE'], sep = ", "), '')
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,date,en_text,source,longitude,latittude,lang,TE,GPE,...,LOC_FAC,FAC_ORG,ORG_FAC,LOC_ORG,FAC_LOC_GPE,ORG_LOC_GPE,LOC_FAC_GPE,FAC_ORG_GPE,ORG_FAC_GPE,LOC_ORG_GPE
0,0,3411639,05-Apr-20,April 4 #fms_gratitude: a gray day that welcomes a fiery night. @ Boston Common,instagram,-71.06923,42.35628,en,Apr-04,,...,,,,,,,,,,
1,1,3412188,10-Apr-20,@_Amixem #lavueDepuismonfinement with the St. Lawrence Cach River by a snow tempt on April 10 ... #Quebec,Android,-69.498244,47.917575,fr,Apr-10,Quebec,...,,,,,,,,,,


In [53]:
df.shape

(17969, 28)

In [54]:
df.to_csv('Geocoding//data//final_posts.csv')

## Extract Unique locations to Geocode

In [51]:
unique_GPE = df.groupby('GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//GPE.csv')
unique_GPE.to_csv(outfilename)

unique_FAC = df.groupby('FAC')['id'].unique()
outfilename = ('geocoding//unq_locations//FAC.csv')
unique_FAC.to_csv(outfilename)

unique_LOC = df.groupby('LOC')['id'].unique()
outfilename = ('geocoding//unq_locations//LOC.csv')
unique_LOC.to_csv(outfilename)

unique_ORG = df.groupby('ORG')['id'].unique()
outfilename = ('geocoding//unq_locations//ORG.csv')
unique_ORG.to_csv(outfilename)

unique_FAC_GPE = df.groupby('FAC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//FAC_GPE.csv')
unique_FAC_GPE.to_csv(outfilename)

unique_LOC_GPE = df.groupby('LOC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//LOC_GPE.csv')
unique_LOC_GPE.to_csv(outfilename)

unique_ORG_GPE = df.groupby('ORG_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//ORG_GPE.csv')
unique_ORG_GPE.to_csv(outfilename)

unique_FAC_LOC_GPE = df.groupby('FAC_LOC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//FAC_LOC_GPE.csv')
unique_FAC_LOC_GPE.to_csv(outfilename)

unique_FAC_ORG_GPE = df.groupby('FAC_ORG_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//FAC_ORG_GPE.csv')
unique_FAC_ORG_GPE.to_csv(outfilename)

unique_LOC_FAC_GPE = df.groupby('LOC_FAC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//LOC_FAC_GPE.csv')
unique_LOC_FAC_GPE.to_csv(outfilename)

unique_ORG_LOC_GPE = df.groupby('ORG_LOC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//ORG_LOC_GPE.csv')
unique_ORG_LOC_GPE.to_csv(outfilename)

unique_ORG_FAC_GPE = df.groupby('ORG_FAC_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//ORG_FAC_GPE.csv')
unique_ORG_FAC_GPE.to_csv(outfilename)

unique_LOC_ORG_GPE = df.groupby('LOC_ORG_GPE')['id'].unique()
outfilename = ('geocoding//unq_locations//LOC_ORG_GPE.csv')
unique_LOC_ORG_GPE.to_csv(outfilename)