<a href="https://colab.research.google.com/github/anya765/climate-evidence-synthesis-nlp/blob/main/geoparse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import os

# When a /content/ directory exists (as on Google Colab), mount Google Drive
# and switch into the project folder stored there.
if os.path.exists("/content/"):
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    os.chdir("/content/drive/MyDrive/climate-science")

from run_cv_experiments import load_data

df = load_data(False)

# Rows that have no "id" take the value of their "OA_id" instead.
missing_id = pd.isna(df["id"])
df.loc[missing_id, "id"] = df.loc[missing_id, "OA_id"]

# Outer-merge the documents with the saved predictions file
# (no `on=` given, so pandas joins on the columns both frames share).
predictions = pd.read_csv('cv_data/INCLUDE/predictions_5_splits.csv')
df = df.merge(predictions, how="outer")

# Rows without a prediction fall back to the value in the INCLUDE column.
no_prediction = pd.isna(df["INCLUDE_prediction"])
df.loc[no_prediction, "INCLUDE_prediction"] = df.loc[no_prediction, "INCLUDE"]
print(df.shape)

# Keep only rows whose INCLUDE_prediction is at least 0.5.
df = df[df["INCLUDE_prediction"] >= 0.5]
print(df.shape)
df.head()

In [None]:
# Resume support: load the places saved by an earlier (possibly interrupted) run and
# work out which documents still have to be geoparsed.
processed_place_df = pd.DataFrame()
if os.path.exists("data/combined_place_df.csv"):
    try:
        processed_place_df = pd.read_csv("data/combined_place_df.csv")
    except pd.errors.EmptyDataError:
        # The checkpoint file exists but holds no columns: nothing was processed yet.
        processed_place_df = pd.DataFrame()

if "doc_id" in processed_place_df.columns:
    # Compare against the doc_id *values* stored in the checkpoint. Passing the
    # DataFrame itself to isin() iterates its column labels, so no document id
    # would ever match and every document would be processed again.
    already_done = df['id'].isin(processed_place_df["doc_id"])
    unprocessed_place_df = df[~already_done]
else:
    unprocessed_place_df = df

In [None]:
! pip install mordecai
! python -m spacy download en_core_web_lg
! python -m spacy download en
! python –m spacy info

In [None]:
# spacy was used below without ever being imported, which raised NameError.
import spacy
from mordecai import Geoparser

# Load the 'en' spaCy model downloaded in the install cell.
nlp = spacy.load('en')

# Geoparser instance reused by the processing loop further down.
geo = Geoparser()

# Quick smoke test on a sentence with two obvious place names.
geo.geoparse("I travelled from Oxford to Ottawa")

In [None]:
%%capture 
import re

places = []
geos = []  # not used in this cell; kept so any existing reference still resolves

CHECKPOINT_PATH = "data/combined_place_df.csv"
CHECKPOINT_EVERY = 1000  # write a checkpoint after this many processed documents

# Copyright / publisher boilerplate. The patterns are applied one after another and
# the text is cut at the first match of each. Compiled once, outside the loop, and
# written as raw strings so the backslashes reach the regex engine unchanged.
BOILERPLATE_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"Copyright \(C\)",
        # "[Cc]", not "[C-c]": the latter is a range that also matches D-Z, a, b and
        # several punctuation characters.
        r"\([Cc]\) [1-2][0-9]{3} Elsevier",
        r"Published by Elsevier",
        r"Copyright\. \(C\)",
        r"\. \(C\) [1-2][0-9]{3} ",
        r"\. \(C\) Copyright",
        r"\. \xA9 [1-2][0-9]{3}",  # \xA9 is the copyright symbol
    )
]

# Agreement names that contain a city name but do not refer to the place itself.
AGREEMENT_PATTERNS = [
    re.compile(pattern, flags=re.I)
    for pattern in (
        "paris agreement",
        "kyoto protocol",
        "montreal protocol",
        "london protocol",
    )
]


def clean_text(title, abstract):
    """Return the text to geoparse for one document.

    Joins title and abstract with a space (a missing value contributes an empty
    string rather than the literal text "nan"), cuts the text at copyright
    boilerplate and removes the agreement names listed in AGREEMENT_PATTERNS.
    """
    parts = ["" if pd.isna(value) else str(value) for value in (title, abstract)]
    text = " ".join(parts)
    for pattern in BOILERPLATE_PATTERNS:
        match = pattern.search(text)
        if match:
            text = text[:match.start()]
    for pattern in AGREEMENT_PATTERNS:
        text = pattern.sub("", text)
    return text


def save_checkpoint():
    """Write previously processed plus newly found places to CHECKPOINT_PATH.

    Returns the combined DataFrame that was written.
    """
    new_places = pd.DataFrame(places)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    frames = [frame for frame in (processed_place_df, new_places) if not frame.empty]
    combined = pd.concat(frames) if frames else processed_place_df.copy()
    print(combined.shape)
    combined.to_csv(CHECKPOINT_PATH, index=False)
    return combined


# Count processed rows with enumerate: the DataFrame index label is not a reliable
# counter once rows have been merged and filtered out.
for n_done, (_, row) in enumerate(unprocessed_place_df.iterrows(), start=1):
    text = clean_text(row['title'], row['abstract'])

    # One dictionary per place found; flatten the nested "geo" dictionary into
    # top-level keys and tag the record with the document it came from.
    for place in geo.geoparse(text):
        if "geo" in place:
            place.update(place.pop("geo"))
        place["doc_id"] = row["id"]
        places.append(place)

    # Periodic save so an interrupted run does not have to start from scratch.
    if n_done % CHECKPOINT_EVERY == 0:
        save_checkpoint()

# Final merge of old and new results.
combined_place_df = save_checkpoint()


In [None]:
# Title and abstract joined per document. Missing values become "" so that one
# missing field does not turn the whole string into NaN; a NaN here makes
# .str.contains return NaN, and a boolean mask containing NaN cannot be used in .loc.
df['tstring'] = df['title'].fillna("") + " " + df['abstract'].fillna("")
tstring_lower = df['tstring'].str.lower()

geocolumns = ["word", "country_conf", "feature_code","lat","lon","place_name","feature_class","geonameid"]

# Water bodies added by hand: (lower-case phrase to search for, values for geocolumns).
MANUAL_PLACES = [
    ("gulf of mexico", ["Gulf of Mexico", 0.8, "GULF", 25, -90, "Gulf of Mexico", "H", 3523271]),
    ("labrador sea", ["Labrador Sea", 0.8, "SEA", 57, -55, "Labrador Sea", "H", 3424929]),
    ("baffin bay", ["Baffin Bay", 0.8, "BAY", 74, -68, "Baffin Bay", "H", 3831554]),
    ("sea of okhotsk", ["Sea of Okhotsk", 0.8, "SEA", 55, 150, "Sea of Okhotsk", "H", 2127380]),
]

for phrase, values in MANUAL_PLACES:
    # Ids of the documents whose title/abstract mention the phrase (plain substring).
    matching_ids = df.loc[tstring_lower.str.contains(phrase, regex=False, na=False), "id"]
    # One row per matching document, every row carrying the same place values.
    manual_rows = pd.DataFrame({"doc_id": matching_ids}).assign(**dict(zip(geocolumns, values)))
    combined_place_df = pd.concat([combined_place_df, manual_rows])


# City names that appear only as part of a climate agreement / conference name are
# not study locations. Each rule is:
#   (place word to drop,
#    case-sensitive regex applied to the raw text, or None,
#    regex applied to the lower-cased text, or None)
# "(?:\S* ){0,15}" allows up to 15 space-separated tokens between the two terms.
# Groups are non-capturing so pandas does not warn about match groups in str.contains.
TREATY_RULES = [
    (
        "kyoto",
        None,
        r"kyoto (?:target|process|emission|gas|agreement|protocol|framework)",
    ),
    (
        "paris",
        r"Paris(?:\S* ){0,15}COP|COP(?:\S* ){0,15}Paris"
        # NOTE(review): the last alternative is "COP ... Agreement", not
        # "Agreement ... Paris" as the Copenhagen rule would suggest. Kept as in the
        # original; confirm whether this is intended.
        r"|Paris(?:\S* ){0,15}Agreement|COP(?:\S* ){0,15}Agreement",
        None,
    ),
    (
        "copenhagen",
        r"Copenhagen(?:\S* ){0,15}COP|COP(?:\S* ){0,15}Copenhagen"
        r"|Copenhagen(?:\S* ){0,3}Accord|Accord(?:\S* ){0,3}Copenhagen",
        None,
    ),
    (
        "berlin",
        r"Berlin(?:\S* ){0,15}COP|COP(?:\S* ){0,15}Berlin",
        None,
    ),
    (
        # This rule previously reused the Berlin variable and dropped "berlin" again,
        # so Glasgow false positives were never removed.
        "glasgow",
        r"Glasgow(?:\S* ){0,15}COP|COP(?:\S* ){0,15}Glasgow",
        None,
    ),
    (
        "cancun",
        r"Cancun(?:\S* ){0,15}COP|COP(?:\S* ){0,15}Cancun",
        r"cancun pledge",
    ),
]

treaty_text = df['tstring']
treaty_text_lower = treaty_text.str.lower()

for place_word, raw_pattern, lower_pattern in TREATY_RULES:
    # Documents that mention the place in an agreement/conference context.
    mentions_treaty = pd.Series(False, index=df.index)
    if raw_pattern is not None:
        mentions_treaty |= treaty_text.str.contains(raw_pattern, na=False)
    if lower_pattern is not None:
        mentions_treaty |= treaty_text_lower.str.contains(lower_pattern, na=False)
    treaty_doc_ids = df.loc[mentions_treaty, 'id']

    is_false_positive = (
        combined_place_df['doc_id'].isin(treaty_doc_ids)
        & (combined_place_df['word'].str.lower() == place_word)
    )
    # Filter with the boolean mask instead of .drop(index): combined_place_df is built
    # by concatenation and has repeated index labels, so dropping by label would also
    # remove unrelated rows that happen to share a label.
    combined_place_df = combined_place_df[~is_false_positive]


geocolumns = ["feature_code", "lat", "lon", "place_name", "feature_class", "geonameid", "country_code3"]

# Hand-made corrections keyed on the matched word. Each value list is in the order of
# geocolumns. Sea and ocean entries use feature_class "H", the GeoNames class that
# contains the SEA and OCN feature codes; several of them previously carried "T" or "P".
WORD_OVERRIDES = {
    "Pakistan": ["PCLI", 30, 70, "Islamic Republic of Pakistan", "A", 1168579, "PAK"],
    "Colombia": ["PCLI", 4, -73.25, "Colombia", "A", 3686110, "COL"],
    "Argentina": ["PCLI", -34, -64, "Argentine Republic", "A", 3865483, "ARG"],
    "Sahara": ["DSRT", 26, 13, "Sahara", "T", 2212709, None],
    "Alps": ["MTS", 46.41667, 10, "Alps", "T", 2661786, None],
    # NOTE(review): geonameid 2661786 is the same id used for "Alps" above, which
    # looks like a copy/paste slip; confirm the GeoNames id of the Mediterranean Sea.
    "Mediterranean Sea": ["SEA", 35, 20, "Mediterranean Sea", "H", 2661786, None],
    "MEDITERRANEAN": ["SEA", 35, 20, "Mediterranean Sea", "H", 2661786, None],
    "East China": ["PCLI", 35, 105, "China", "A", 1814991, "CHN"],
    "South China": ["PCLI", 35, 105, "China", "A", 1814991, "CHN"],
    "Great Lakes": ["LK", 45.68751, -84.43753, "Great Lakes", "H", 4994594, "USA"],
    "Catalonia": ["ADM1", 41.82046, 1.86768, "Catalunya", "A", 3336901, "ESP"],
    "South Pacific": ["OCN", -45, -130, "South Pacific Ocean", "H", 4030483, None],
    "Gulf Coast": ["AREA", 29.36901, -95.00565, "Gulf Coast", "L", 7287689, "USA"],
    "Gulf coast": ["AREA", 29.36901, -95.00565, "Gulf Coast", "L", 7287689, "USA"],
    "Hainan Island": ["ISL", 19.2, 109.7, "Hainan Dao", "T", 1809055, "CHN"],
    "Red Sea": ["SEA", 20.26735, 38.53455, "Red Sea", "H", 350155, None],
    "Himalayan": ["MTS", 28, 84, "Himalayas", "T", 1252558, None],
    "Himalayas": ["MTS", 28, 84, "Himalayas", "T", 1252558, None],
    "North America's": ["CONT", 46.07323, -100.54688, "North America", "L", 6255149, None],
    "Atlantic Ocean": ["OCN", 10, -25, "Atlantic Ocean", "H", 3373405, None],
    "Scandinavia": ["RGN", 63, 12, "Scandinavia", "L", 2614165, None],
    "California (USA": ["ADM1", 37.25022, -119.75126, "California", "A", 5332921, "USA"],
    "California, USA": ["ADM1", 37.25022, -119.75126, "California", "A", 5332921, "USA"],
    "North Pacific": ["OCN", 30, -170, "North Pacific Ocean", "H", 4030875, None],
    "Huai": ["STM", 33.133333, 118.5, "Huai He", "H", 1807690, "CHN"],
    "Washington, DC": ["PPLC", 38.89511, -77.03637, "Washington", "P", 4140963, "USA"],
    # Longitude is east (positive); the previous -21.83333 put this Greek
    # prefecture in the Atlantic.
    "Messinian": ["ADM2", 37.25, 21.83333, "Nomos Messinias", "A", 257149, "GRC"],
    "Ionian Sea": ["SEA", 39, 19, "Ionian Sea", "H", 2463713, None],
    "NYC": ["PPL", 40.71427, -74.00597, "New York City", "P", 5128581, "USA"],
    "Indian Ocean": ["OCN", -10, 70, "Indian Ocean", "H", 1545739, None],
    "North Sea": ["SEA", 55, 3, "North Sea", "H", 2960848, None],
    "Philippine Sea": ["SEA", 20, 135, "Philippine Sea", "H", 1818190, None],
    "Black Sea": ["SEA", 43, 34, "Black Sea", "H", 630673, None],
    "Coral Sea": ["SEA", -20, 155, "Coral Sea", "H", 2194166, None],
    "Timor Sea": ["SEA", -11, 127, "Timor Sea", "H", 2078065, None],
    "Hudson Bay": ["BAY", 60, -85, "Hudson Bay", "H", 5978134, "CAN"],
    "Bering Sea": ["SEA", 60, -175, "Bering Sea", "H", 4031788, None],
    "Okhotsk Sea": ["SEA", 55, 150, "Sea of Okhotsk", "H", 2127380, None],
}

# Overwrite the geo columns of every row whose matched word has an override.
for matched_word, override_values in WORD_OVERRIDES.items():
    combined_place_df.loc[combined_place_df["word"] == matched_word, geocolumns] = override_values

# Corrections keyed on the resolved place name rather than the matched word.
combined_place_df.loc[combined_place_df["place_name"]=="Central Upper Nile",geocolumns] = ["ADM1", 10, 32.7, "Upper Nile", "A", 381229, "SSD"]
combined_place_df.loc[combined_place_df["place_name"]=="Gobolka Woqooyi Galbeed","place_name"] = "Woqooyi Galbeed"

# Resolved place names that are always discarded.
unwanted_place_names = ["Pacific County"]

# Matched words that are always discarded.
unwanted_words = [
    "B.V.", "MMT", "Yellow", "Hadley", "Western North", "colonies",
    "TN", "NH", "Mn", "Tx", "TX", "Tn", "FL",
    "Spartina", "Tamarix", "Eurasia", "Phillyrea",
    "N-15", "LT50", "POSEIDON", "LC50",
    "El Nio", "La Nia", "Red", "Gulf Stream",
    "NH 1", "Quercus",
    "ZJP", "MSW", "CCS", "Tier-3", "N2O", "VKT", "OECD",
    "States", "North to South", "Stabilising", "Mass Railway", "City",
]

# A row survives only if its place name and word are not listed above and its
# word is longer than two characters.
place_name_ok = ~combined_place_df["place_name"].isin(unwanted_place_names)
word_ok = ~combined_place_df["word"].isin(unwanted_words)
word_long_enough = combined_place_df["word"].str.len() > 2

combined_place_df = combined_place_df[place_name_ok & word_ok & word_long_enough]

# Country-level corrections keyed on the matched word; each value list follows the
# column order of geocolumns.
country_overrides = {
    "Ireland": ["PCLI", 53, -8, "Ireland", "A", 2963597, "IRL"],
    "United States": ["PCLI", 39.76, -98.5, "United States", "A", 6252001, "USA"],
    "Czech Republic": ["PCLI", 49.75, 15, "Czechia", "A", 3077311, "CZE"],
    "Czechia": ["PCLI", 49.75, 15, "Czechia", "A", 3077311, "CZE"],
    "China": ["PCLI", 35, 105, "China", "A", 1814991, "CHN"],
    "United Arab Emirates": ["PCLI", 23.75, 54.5, "United Arab Emirates", "A", 290557, "ARE"],
}
for country_word, country_values in country_overrides.items():
    is_country = combined_place_df["word"] == country_word
    combined_place_df.loc[is_country, geocolumns] = country_values

# Write the cleaned place table, then show its size and last rows.
combined_place_df.to_csv('data/places.csv', index=False)

print(combined_place_df.shape)

combined_place_df.tail()