In [1]:
# Perform standard imports
import spacy, os
nlp = spacy.load('en_core_web_sm')

In [2]:
# Create a simple Doc object by running the loaded pipeline over one sentence;
# the cells that follow inspect this Doc
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [3]:
# Show the sentence held by the Doc

# Print the full text:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [4]:
# Print the fifth token (index 4) and its associated tags.
# pos_ is the coarse-grained tag and tag_ is the fine-grained tag.
# Output order: text, coarse tag, fine-grained tag, explanation of the fine-grained tag.

print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))

jumped VERB VBD verb, past tense


In [5]:
# Show every token in a table: text, coarse tag, fine-grained tag, tag explanation.
# The nested {10} / {8} / {6} are f-string field widths that keep the columns aligned.

for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [6]:
# Understanding morphology: the same written word can receive different tags.
# Look at the word "read" (token index 1) in this sentence.

doc = nlp(u'I read books on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBP    verb, non-3rd person singular present


In [7]:
# Note the difference for "read": with this sentence the recorded output tags it
# as VBD (past tense) rather than VBP as in the previous example.
# NOTE(review): the original note said this "does not work in this example" —
# presumably because the sentence is ambiguous and the tagger has to guess the
# tense; confirm the intended meaning.

doc = nlp(u'I read a book on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBD    verb, past tense


In [8]:
# Re-create the fox-sentence Doc, because `doc` was reassigned by the "read" examples
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [9]:
# Print the fifth token (index 4): text, coarse tag, fine-grained tag, tag explanation
print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))

jumped VERB VBD verb, past tense


In [10]:
# Print the text, coarse tag, fine-grained tag and tag explanation for every token,
# using fixed column widths (10 / 8 / 6) so the output lines up

for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [15]:
# Count how many tokens carry each coarse POS tag.
# The result is a dict keyed by integer IDs rather than tag names (see the recorded output).
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [16]:
# Walk the counts in ascending key order and look each integer key up in the vocab
# to get its readable tag name; the name is padded to 5 characters
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
96. PUNCT 1
99. VERB  1


In [17]:
# Same frequency count, this time for fine-grained tags
TAG_counts = doc.count_by(spacy.attrs.TAG)

# Keys are resolved to tag names through the vocab; most keys in the recorded
# output are very large integers
for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [18]:
# Size of the vocabulary object attached to this Doc
len(doc.vocab)

57863

In [19]:
# Frequency count of syntactic dependency labels
DEP_counts = doc.count_by(spacy.attrs.DEP)

# Resolve each integer key to its label name through the vocab and print the counts
for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

399. amod  3
412. det   2
426. nsubj 1
436. pobj  1
437. poss  1
440. prep  1
442. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


In [20]:
from spacy import displacy

In [21]:
# Draw the dependency parse of `doc` inline in the notebook
displacy.render(doc, style='dep', jupyter=True)

In [22]:
# Display options passed to the dependency visualizer.
# 'compact' is a real boolean: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it too).
options = {'distance': 75,
           'compact': True,
           'color': 'yellow',
           'bg': '#09a3d5',
           'font': 'Times'}

In [23]:
# Draw the dependency parse again, this time applying the custom display options
displacy.render(doc, style='dep', jupyter=True, options=options)

In [24]:
# A second Doc containing more than one sentence
doc2 = nlp(u"This is a sentence.  This is another sentence, possibly longer than the other")

In [25]:
# Collect the sentences of doc2 into a list

spans = list(doc2.sents)

In [26]:
# Serve the per-sentence dependency visualizations over HTTP on localhost
# (the recorded log shows port 5000).
# NOTE(review): the call appears to block the cell until the server is shut down — confirm.

displacy.serve(spans, style='dep', options = {'distance':75})


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [19/Aug/2019 18:47:37] "GET / HTTP/1.1" 200 10284
127.0.0.1 - - [19/Aug/2019 18:47:37] "GET /favicon.ico HTTP/1.1" 200 10284



    Shutting down server on port 5000.



In [27]:
# Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity found in ``doc``.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is the result of ``spacy.explain`` for the label, passed
    through ``str()``. When ``doc.ents`` is empty, a single fallback message
    is printed instead. Nothing is returned.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        fields = (ent.text, ent.label_, str(spacy.explain(ent.label_)))
        print(' - '.join(fields))

In [28]:
# "May" and "Washington" each appear twice in different roles; list what the model tags
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [29]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# For each entity print its text, its token offsets (start, end),
# its character offsets (start_char, end_char) and its label
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [30]:
# The model does not recognise "Tesla" as a company here:
# it is missing from the entities printed for this sentence


doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [31]:
from spacy.tokens import Span

In [32]:
# Look up the integer ID stored for the "ORG" label string
ORG = doc.vocab.strings[u"ORG"]

In [33]:
# Display the ID that was looked up
ORG

381

In [34]:
# "Tesla" is token 0: build a Span over tokens 0 up to (not including) 1
# and label it with the ORG ID

new_ent = Span(doc,0,1,label=ORG)

In [35]:
# Append the new span to the entities already on the Doc.
# NOTE(review): this mutates `doc`; re-running the cell without rebuilding `doc`
# would add the same span a second time — presumably rejected as an overlap; confirm.
doc.ents=list(doc.ents) + [new_ent]

# Show the entities again; "Tesla" should now be listed
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


# Handle Multiple Additions

In [36]:
# Two adjacent string literals form one text; "vacuum cleaner" occurs twice in it
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [56]:
# Test on another text. The two adjacent literals are joined by Python into one
# string, so the first one must end with a space to keep the sentences apart.

doc2 = nlp(u"Our company created a bran new vacuum cleaner. "
           u"This new vacuum-cleaner is the best in show.")

show_ents(doc2)

No named entities found.


In [57]:
from spacy.matcher import PhraseMatcher

In [58]:
# Create a phrase matcher that shares the pipeline's vocab
matcher = PhraseMatcher(nlp.vocab)

In [59]:
# The phrases to look for: spaced and hyphenated spellings of the same product
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [60]:
# Run each phrase through the pipeline so that every pattern is a Doc
phrase_patterns = [nlp(text) for text in phrase_list]

In [61]:
# Register all patterns under the key 'newproduct'.
# NOTE(review): the None argument presumably fills the callback slot of the
# spaCy 2.x-style add() signature — confirm against the installed version.
matcher.add('newproduct', None, *phrase_patterns)

In [62]:
# Run the matcher over doc2; each result is a 3-tuple whose second and third
# items are used as the start/end offsets when Spans are built from the matches
found_matches = matcher(doc2)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [63]:
from spacy.tokens import Span

In [67]:
# Turn the matches into spans and then into named entities.
# Only the start and end of each span are needed, plus the label ID looked up here.

PROD = doc2.vocab.strings[u"PRODUCT"]

In [70]:
# One PRODUCT-labelled Span per match; match[1] and match[2] are the start and end offsets
new_ents = [Span(doc2, match[1], match[2], label=PROD) for match in found_matches]
new_ents

[vacuum cleaner, vacuum-cleaner]

In [71]:
# Add the new spans to whatever entities doc2 already has, then list them
doc2.ents = list(doc2.ents) + new_ents
show_ents(doc2)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [72]:
# A sentence with two monetary amounts
doc3 = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by $10.")

In [75]:
# Count the entities labelled MONEY

len([ent for ent in doc3.ents if ent.label_ == "MONEY"])

2

In [76]:
# Show the MONEY entities themselves
[ent for ent in doc3.ents if ent.label_ == "MONEY"]

[29.95, 10]

In [77]:
# There is a problem with line breaks in this version of spaCy:
# the "\n" in the text is tagged as a GPE entity (see the recorded output)

doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit

 - GPE - Countries, cities, states
five dollars - MONEY - Monetary values, including unit


In [78]:
# Quick function to remove ents formed on whitespace:
def remove_whitespace_entities(doc):
    """Pipeline component that drops entities made up solely of whitespace.

    Rebuilds ``doc.ents`` as a list holding every entity whose text is not
    whitespace-only, then returns the same ``doc`` object.
    """
    kept = []
    for ent in doc.ents:
        if ent.text.isspace():
            continue
        kept.append(ent)
    doc.ents = kept
    return doc

# Insert this into the pipeline AFTER the ner component.
# Any copy registered by an earlier run of this cell is removed first, so
# re-running the cell neither fails on a duplicate component name nor keeps
# a stale version of the function in the pipeline.
if 'remove_whitespace_entities' in nlp.pipe_names:
    nlp.remove_pipe('remove_whitespace_entities')
nlp.add_pipe(remove_whitespace_entities, after='ner')

In [79]:
# Rerun nlp on the same line-break text now that the whitespace filter is in
# the pipeline, and show the entities:
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [80]:
# Visualize named entities.
# The text holds two sentences (two adjacent literals joined into one string)
# so that it can also be viewed sentence by sentence afterwards.

doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [81]:
# Break the display up by sentence: each sentence's text is run through nlp
# again, so every render call gets a fresh Doc holding just that sentence

for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True)


In [82]:
# Maybe only some entity types are of interest: pass the labels to keep
# (here PRODUCT and ORG) through the 'ents' option

options = {'ents':['PRODUCT', 'ORG']}

for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True, options=options)

In [83]:
# Choose colors: map an entity label to the color it should be drawn with

colors = {'ORG':'red'}


# Keep the PRODUCT/ORG filter and add the color mapping
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True, options=options)

In [87]:
# Choose colors with a gradient effect:
# ORG entities get a radial gradient running from yellow to green

colors = {'ORG':'radial-gradient(yellow,green)'}

# Keep the PRODUCT/ORG filter and add the color mapping
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True, options=options)

In [93]:
# Linear gradient for ORG entities, from orange to red.
# The angle is given in degrees, measured clockwise.

colors = {'ORG':'linear-gradient(90deg, orange, red)'}


# Keep the PRODUCT/ORG filter and add the color mapping
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent',jupyter=True, options=options)

In [94]:
# Serve the entity visualization of the whole Doc over HTTP with the current options
# (the recorded log shows port 5000)
displacy.serve(doc, style='ent', options=options)


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [19/Aug/2019 19:43:36] "GET / HTTP/1.1" 200 2154



    Shutting down server on port 5000.

