In [3]:
# To use the medium-size English model instead (possibly better for production),
# uncomment the three lines below AND comment out the en_core_web_sm lines that
# follow; otherwise the small model loaded last overwrites `nlp`.
# !python -m spacy download en_core_web_md
# import en_core_web_md
# nlp = en_core_web_md.load()
import en_core_web_sm
# Load the small English spaCy model (vocabulary, syntax & entities).
nlp = en_core_web_sm.load()

In [4]:
import os
import pandas as pd
from collections import Counter
from spacy import explain

# Read the whole book into memory as a plain string.
# NOTE: despite its name, `doc` is a str, not a spacy.tokens.doc.Doc; the parsed
# spaCy Doc is `sentences` below. The context manager closes the file handle
# instead of leaving it to the garbage collector.
with open(os.path.join(os.getcwd(), 'datasets', 'secretagent.txt')) as book_file:
    doc = book_file.read()

# len(doc) is the number of characters in the file, not spaCy tokens; the label
# is kept unchanged so the printed line matches earlier runs.
print('total tokens', len(doc))

# Only the first SAMPLE_CHARS characters are parsed, to keep the run time small.
SAMPLE_CHARS = 100000
sentences = nlp(doc[:SAMPLE_CHARS])

# One (span, label name, label id) triple per named entity found in the sample.
entities = [(ent, ent.label_, ent.label) for ent in sentences.ents]


total tokens 541226


In [5]:
# Very crude list of possible place names: every entity tagged GPE or PERSON
# (PERSON is included as well, so expect many non-places in this list).
place_spans = [span for span, label_name, _ in entities if label_name in ('GPE', 'PERSON')]

# Normalise the strings: verbatim text, surrounding whitespace removed, lower-cased.
normalised_names = {span.orth_.strip().lower() for span in place_spans}

# De-duplicated, with empty strings dropped (whitespace-only entities normalise
# to ''), and sorted so the order -- and therefore any later slice of this list --
# is the same on every kernel restart rather than depending on set ordering.
places = sorted(name for name in normalised_names if name)

labels = [ent.label_ for ent in sentences.ents]

# Useful to refer to for the various labels of entities: print each distinct
# label (in order of first appearance) with spaCy's description of it.
for label_name in Counter(labels):
    print(label_name, explain(label_name))

PERSON People, including fictional
WORK_OF_ART Titles of books, songs, etc.
ORG Companies, agencies, institutions, etc.
DATE Absolute or relative dates or periods
MONEY Monetary values, including unit
ORDINAL "first", "second", etc.
LANGUAGE Any named language
GPE Countries, cities, states
CARDINAL Numerals that do not fall under another type
PRODUCT Objects, vehicles, foods, etc. (not services)
NORP Nationalities or religious or political groups
TIME Times smaller than a day
LOC Non-GPE locations, mountain ranges, bodies of water
FAC Buildings, airports, highways, bridges, etc.
LAW Named documents made into laws.
EVENT Named hurricanes, battles, wars, sports events, etc.
QUANTITY Measurements, as of weight or distance


In [6]:
# Look up places with a geocoder and plot them if they exist. The intent is to keep
# results within a bounding box of Europe; the filter below currently uses Greater London.
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
from pprint import pprint
# Rubric:
# if an individual search takes longer than n_wait seconds, it is not really a
# city name, and the script will thus weed out non-names from the final map.
n_wait = 2
geolocator = Nominatim(timeout=n_wait, user_agent="book_location_resolver") # OSM service

# Do not hammer the service: pause before every request after the first.
# 1.2 s per request is the same overall pace as sleeping 12 s after every tenth
# request, but avoids bursts of back-to-back calls (Nominatim's usage policy
# allows at most one request per second).
request_interval = 1.2

# Upper limit on how many candidate names are sent to the geocoder.
max_places = 2000

lat_lon = []  # geopy Location objects that fell inside the bounding box
valid = []    # the query strings that produced them, in the same order

# Bounding boxes as [min_lon, min_lat, max_lon, max_lat].
bounds_uk = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
# NOTE(review): identical to bounds_uk -- presumably meant to hold a South-East
# box; neither of these two boxes is used by the filter below. Confirm/fix.
bounds_ukse = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
bounds_g_london = [-0.489,51.28,0.236,51.686]

min_lon, min_lat, max_lon, max_lat = bounds_g_london

for idx, original_text in enumerate(places[:max_places]):
    if idx > 0:
        time.sleep(request_interval)

    try:
        # TODO: can in fact in API use viewbox and bounded=1, viewbox=<x1>,<y1>,<x2>,<y2>
        location = geolocator.geocode(original_text)
    except GeocoderTimedOut as e:
        # Per the rubric above, a timed-out lookup is reported and skipped.
        print("Error: geocode failed on input %s with message %s"% (original_text, e))
        continue

    # geocode() returned nothing for this query: nothing to record.
    if not location:
        continue

    # Keep only hits strictly inside the Greater London box.
    if (min_lat < location.latitude < max_lat) and (min_lon < location.longitude < max_lon):
        lat_lon.append(location)
        # we add this as a separate column to the DF as useful to verify the results of geocoding
        valid.append(original_text)

# Build the rows explicitly as (address, (latitude, longitude)) rather than
# relying on pandas unpacking Location objects through their sequence protocol.
df = pd.DataFrame(
    [(loc.address, (loc.latitude, loc.longitude)) for loc in lat_lon],
    columns=['Place Name', 'Coordinates'],
)
df['Original Text'] = valid # add a column of our valid original text queries for the geocoder

# Note: the file is tab-separated even though it carries a .csv extension.
file_name = 'retrieved_locations.csv'
df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)

In [7]:
import folium

def generateBaseMap(default_location=(51.5, 0), default_zoom_start=10):
    '''Create an empty folium map with a scale control.

    default_location   -- (latitude, longitude) the map is centred on; a list is
                          accepted as before. The default is an immutable tuple
                          so it cannot be altered between calls.
    default_zoom_start -- initial zoom level passed to folium.

    Returns the new folium.Map.
    '''
    return folium.Map(
        location=default_location,
        control_scale=True,
        zoom_start=default_zoom_start,
    )

def add_marker(place, coordinates, text="  ", target_map=None):
    '''Add a text-label marker to a folium map.

    place       -- content of the popup shown for the marker.
    coordinates -- (latitude, longitude) pair; converted to a list for folium.
    text        -- label drawn on the map at the marker position. It is
                   HTML-escaped first, so characters such as '&' or '<' in the
                   source text are displayed literally instead of being
                   interpreted as markup.
    target_map  -- map to add the marker to; when omitted, the module-level
                   `base_map` is used (the original behaviour).

    Returns None.
    '''
    from html import escape  # stdlib; imported locally to keep the cell self-contained

    if target_map is None:
        # Looked up at call time, so `base_map` only has to exist before the
        # first call, not before this definition.
        target_map = base_map

    label_html = (
        '<div style="font-size: 12pt; color : #222;text-decoration:underline">'
        + escape(str(text))
        + '</div>'
    )
    label_icon = folium.DivIcon(
        icon_size=(150,36),
        icon_anchor=(7,20),
        html=label_html,
    )
    folium.Marker(location=list(coordinates), popup=place, icon=label_icon).add_to(target_map)
    
# Fresh map centred on the default location.
base_map = generateBaseMap()

# Drop one labelled marker per geocoded row; the three columns are walked in
# lockstep. `result` just collects add_marker's return values.
result = list(map(add_marker, df['Place Name'], df['Coordinates'], df['Original Text']))
base_map # display the map