In [19]:
# Uncomment below to use this  model, for production (possibly?)
#!python -m spacy download xx_ent_wiki_sm
# nlp = en_core_web_md.load()
# import en_core_web_md
import en_core_web_sm
nlp = en_core_web_sm.load()
# import xx_ent_wiki_sm
# load en_core_web_sm of English for vocabluary, syntax & entities
# load in a simpler wikipedia based named entity model 
#nlp = xx_ent_wiki_sm.load()

In [20]:
import os
import pandas as pd
from collections import Counter
from spacy import explain

# load the text or book in spacy.tokens.doc.Doc object
files = ['secretagent', 'innocence-father-brown']
doc = open(os.getcwd() +'/datasets/' + files[1] + '.txt').read()
print('total tokens', len(doc))

# use small sample
sentences = nlp(doc[:40000])

entities=[(i, i.label_, i.label) for i in sentences.ents]


total tokens 460041


In [21]:
# very crude list of possible placenames 
# for Ontosnotes based model use some of these types: 'GPE','PERSON', PER, LOC, ORG and MISC
places = list(map(lambda x:x[0], filter(lambda x: (x[1] in ['GPE', 'LOC', 'PERSON']), entities)))

## normalise these strings
places = list(set(map(lambda x:x.orth_.strip().lower(), places)))

labels = [x.label_ for x in sentences.ents]

# useful to refer to for the various labels of entities
for x in Counter(labels).keys():
    print(x, explain(x))

PERSON People, including fictional
ORG Companies, agencies, institutions, etc.
DATE Absolute or relative dates or periods
MONEY Monetary values, including unit
LANGUAGE Any named language
GPE Countries, cities, states
WORK_OF_ART Titles of books, songs, etc.
NORP Nationalities or religious or political groups
LOC Non-GPE locations, mountain ranges, bodies of water
CARDINAL Numerals that do not fall under another type
TIME Times smaller than a day
FAC Buildings, airports, highways, bridges, etc.
QUANTITY Measurements, as of weight or distance
ORDINAL "first", "second", etc.
EVENT Named hurricanes, battles, wars, sports events, etc.


In [22]:
# lookup places and plot if they exist. Keep within bounding box of europe somehow.
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
from pprint import pprint
'''
rubric:
if an individual search takes longer than n seconds,
it’s not really a city name, and the script will thus 
weed out non-names from the final map. 

Idea to improve this search, is to get results first without bounds, then
discard any of those using python comparison of bounds. 

Then get results using API bounds parameter and then compare returned results with prior results.

Keep only those where the result was the same. 

'''
n_wait=4
geolocator = Nominatim(timeout=n_wait, user_agent="book_location_resolver") # OSM service
 
lat_lon = []
valid = []

bounds_uk = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
bounds_ukse = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
bounds_g_london = [(51.28, -0.489),(51.686, 0.236)]

for idx, original_text in enumerate(places): 
    # do not hammer the service:
    if ( (idx > 0) and (0 == (idx % 10)) ): 
        # delay
        time.sleep(12)
    
    try:
        location = geolocator.geocode(original_text, bounded=True, viewbox=bounds_g_london, country_codes='gb')        
        
        if location:
            print('found: ',original_text)
            lat_lon.append(location)
                
            # we add this as a separate column to the DF as useful to verify the results of geocoding
            valid.append(original_text) 
        else:
            print('not found: ',original_text)
    except GeocoderTimedOut as e:
        print("Error: geocode failed on input %s with message %s"% (original_text, e))

# Use a dataframe to store retrieved results:
df = pd.DataFrame(lat_lon, columns=['Place Name', 'Coordinates'])
df['Original Text'] = valid # add a column of our valid original text queries for the geocoder
file_name = 'retrieved_locations.csv'
df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)
df.head()

found:  soho
not found:  aristide valentin
found:  scotland
found:  europe
found:  paris
found:  the north sea
found:  tottenham
found:  elizabethan
found:  london
not found:  judith boss
not found:  utf-8
found:  england
not found:  ﻿project gutenberg
not found:  m. valentin
found:  nelson
found:  the north pole
not found:  valentin
found:  hampstead heath
found:  hampstead
not found:  harwich
found:  pacific
found:  g. k. chesterton
not found:  the hammer of god
not found:  statuesque
not found:  pale grey jacket
found:  brown
not found:  thou
found:  brussels
found:  essex
found:  earth
found:  williams
not found:  turnip
found:  essex village
found:  north london
found:  holland
not found:  flambeau
found:  victoria
found:  williamson
found:  france
found:  hartlepool


Unnamed: 0,Place Name,Coordinates,Original Text
0,"Soho, City of Westminster, London, Greater Lon...","(51.5131628, -0.1311754)",soho
1,"Royal Bank of Scotland, Feltham, London Boroug...","(51.44027345, -0.404133119376528)",scotland
2,"Europe Road, Royal Borough of Greenwich, Londo...","(51.4932696, 0.0553015)",europe
3,"Paris, 17, Brixton Station Road, Stockwell, Lo...","(51.46339705, -0.113663055448795)",paris
4,"Caterham Sea Cadets & Royal Marines Cadets, Ca...","(51.294297, -0.10579591772903)",the north sea


In [23]:
import folium

def generateBaseMap(default_location=[51.5, 0], default_zoom_start=10):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

def add_marker(place, coordinates, text="  "):
    ''' add a location marker to a folium map 
    '''
    folium.Marker(location=list(coordinates), popup=place, icon=folium.DivIcon(
        icon_size=(150,36),
        icon_anchor=(7,20),
        html='<div style="font-size: 12pt; color : #222;text-decoration:underline">'+ text +'</div>',
        )).add_to(base_map)
    
base_map = generateBaseMap()

result = [add_marker(x, y, z) for x, y, z in zip(df['Place Name'], df['Coordinates'], df['Original Text'])]
base_map.fit_bounds(base_map.get_bounds())
base_map # display the map