In [2]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# Create list of word tokens
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)



['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [3]:
from spacy.lang.en import stop_words
spacy_stopwords = stop_words.STOP_WORDS
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

First ten stop words: ['thereupon', 'their', 'with', 'might', 'the', "'ve", 'eleven', 'whose', 'whereafter', 'out']


In [4]:
import en_core_web_sm
# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()
docs = nlp(u"All is well that ends well.")
for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADJ
that DET
ends VERB
well ADV
. PUNCT


In [5]:
from spacy import displacy

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities
# visual
# displacy.render(nytimes, style = "ent",jupyter = True)

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox Jews, 'PERSON', 380),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [6]:
!python -m spacy download en_core_web_md
import en_core_web_md



[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [23]:
import os
import pandas as pd
from collections import Counter
from spacy import explain
# medium size model, for production
# nlp = en_core_web_md.load()

nlp = en_core_web_sm.load()

# place book in spacy.tokens.doc.Doc object
doc = open(os.getcwd() +'/datasets/secretagent.txt').read()

# use small sample
sents = nlp(doc)
print('total tokens', len(doc))

entities=[(i, i.label_, i.label) for i in sents.ents]
print(entities[:20])




total tokens 541226
[(Gutenberg eBook, 'PERSON', 380), (The Secret Agent, 'WORK_OF_ART', 388), (Joseph Conrad, 'PERSON', 380), (eBook, 'ORG', 383), (eBook, 'ORG', 383), (The Secret Agent
       , 'WORK_OF_ART', 388), (Joseph Conrad



, 'PERSON', 380), (December 24, 2010, 'DATE', 391), (974, 'MONEY', 394), (First, 'ORDINAL', 396), (June 28, 1997, 'DATE', 391), (English, 'LANGUAGE', 389), (UTF-8, 'GPE', 384), (Transcribed, 'PERSON', 380), (1907, 'DATE', 391), (Methuen & Co, 'ORG', 383), (David Price, 'PERSON', 380), (AGENT, 'ORG', 383), (SIMPLE, 'ORG', 383), (JOSEPH CONRAD, 'PERSON', 380)]
PLACES {'wheezy', 'this karl yundt', 'somewhere', 'victoria', 'inspector heat', 'drunks', 'thou', 'grit', 'observatory', 'mrs verloc', 'murder', 'karl yundt', 'went', 'these f. p.', 'an m. p.', 'greenwich park', 'moser', 'mr verloc’s', 'whitehall', 'lewisham', 'states', 'england', 'mumbled', 'transcribed', 'gloomy dampness', 'gutenberg ebook', 'dingy white', 'winnie', 'ill', 'green park', 'utf-8', 'pa

In [27]:
# very crude list of possible placenames
places = list(map(lambda x:x[0], filter(lambda x: (x[1] in ['GPE','PERSON']), entities)))
## normalise these
places = list(set(map(lambda x:x.orth_.strip().lower(), places)))

print('PLACES', places)

labels = [x.label_ for x in sents.ents]
#print(Counter(labels))

for x in Counter(labels).keys():
    print(x, explain(x))

PLACES ['wheezy', 'this karl yundt', 'somewhere', 'victoria', 'inspector heat', 'drunks', 'thou', 'grit', 'observatory', 'mrs verloc', 'murder', 'karl yundt', 'went', 'these f. p.', 'an m. p.', 'greenwich park', 'moser', 'mr verloc’s', 'whitehall', 'lewisham', 'states', 'england', 'mumbled', 'transcribed', 'gloomy dampness', 'gutenberg ebook', 'dingy white', 'winnie', 'ill', 'green park', 'utf-8', 'paris', 'inch', 'the f. p.', 'wherein', 'embassy', 'stevie', 'heat', 'america', 'lausanne', 'alfred wallace', 'apostle', 'girls', 'stott-wartenheim', 'strand', 'alexander ossipon', 'verloc', 'logical', 'spectre', 'artists', 'headlong', 'hygiene', 'somebody', 'f. p.', 'fertile', 'street lodge', 'india', 'aloud', 'commissioners', 'st stephen’s', 'mr vladimir’s', 'the winnie of the belgravian', 'ard', 'brussels', 'madness', 'latorre', 'nevertheless', 'verlocs', 'vladimir', 'stephen', 'outward deference', 'ethelred', 'joseph conrad', 'china bowl', 'caligula', 'sicily', 'this mr vladimir', 'winn'

In [28]:
import folium
import geopandas as gpd

def generateBaseMap(default_location=[51.5, 0], default_zoom_start=10):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map
base_map = generateBaseMap()
base_map

In [40]:
# lookup places and plot if they exist. Keep within bounding box of europe somehow.
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
'''
rubric:
if an individual search takes longer than n seconds,
it’s not really a city name, and the script will thus 
weed out non-names from the final map.
'''
n_wait=1
geolocator = Nominatim(timeout=n_wait)
geolocator = Nominatim() # OSM service
lat_lon = []

bounds_uk = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
bounds_ukse = [-10.8544921875, 49.82380908513249,2.021484375,59.478568831926395] # UK
bounds_g_london = [-0.489,51.28,0.236,51.686]
for idx, city in enumerate(places): 
    # do not hammer the service:
    if ( (idx > 0) and (0 == (idx % 10)) ): 
        # delay
        time.sleep(10)
    
    try:
        location = geolocator.geocode(city)
        if location:
            print(location.latitude, location.longitude)
            if (bounds_g_london[1] < location.latitude < bounds_g_london[3]) and (bounds_g_london[0] < location.longitude < bounds_g_london[2]): 
                
                lat_lon.append(location)
            else: 
                print('out of bounds')
                
    except GeocoderTimedOut as e:
        print("Error: geocode failed on input %s with message %s"% (city, e))


df = pd.DataFrame(lat_lon, columns=['Place Name', 'Coordinates'])

print(places)

df.head(50)

# TODO cache this output


  if sys.path[0] == '':
  del sys.path[0]


-12.094974 -77.0088783
out of bounds
-36.5986096 144.6780052
out of bounds
-37.3168664 145.2517555
out of bounds
46.0839685 -0.9172451
out of bounds
38.9953683 21.9877132
out of bounds
-33.9369444 18.4683333
out of bounds
37.9560721 126.6726497
out of bounds
54.7096079 -3.4510149
out of bounds
8.9483665 125.5429965
out of bounds
54.6320074 9.93283747833048
out of bounds
51.47722835 0.00092473393014189
52.2109951 11.7940564581209
out of bounds
51.5023278 -0.1260826
51.4624325 -0.0101331
39.7837304 -100.4458825
out of bounds
52.7954791 -0.540240286617432
out of bounds
38.6517633 -80.9467723
out of bounds
29.8202205 -94.3840777
out of bounds
48.0423069 7.4178933
out of bounds
51.5066192 -0.1429113
-23.1866119 -50.6570573380358
out of bounds
48.8566101 2.3514992
out of bounds


KeyboardInterrupt: 