In [None]:
import spacy

In [None]:
nlp =spacy.load('en_core_web_sm')

In [None]:
doc = nlp(u"The quick brown box jumped over the lazy dog's back.")

In [None]:
print(doc.text)

The quick brown box jumped over the lazy dog's back.


In [None]:
print(doc[4])

jumped


In [None]:
print(doc[4].pos_)

VERB


In [None]:
print(doc[4].text)

jumped


In [None]:
print(doc[4].tag_)

VBD


In [None]:
for token in doc:
  print(f"{token.text},{token.pos_}, {token.tag_}, {spacy.explain(token.tag_)}")

The,DET, DT, determiner
quick,ADJ, JJ, adjective
brown,ADJ, JJ, adjective
box,NOUN, NN, noun, singular or mass
jumped,VERB, VBD, verb, past tense
over,ADP, IN, conjunction, subordinating or preposition
the,DET, DT, determiner
lazy,ADJ, JJ, adjective
dog,NOUN, NN, noun, singular or mass
's,PART, POS, possessive ending
back,NOUN, NN, noun, singular or mass
.,PUNCT, ., punctuation mark, sentence closer


In [None]:
# Print one row per token. The `:{10}` format specs pad token.text, token.pos_
# and token.tag_ to a minimum width of 10 characters so the columns line up;
# the last column is the description returned by spacy.explain() for the tag.
for token in doc:
  print(f"{token.text:{10}}{token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The       DET        DT         determiner
quick     ADJ        JJ         adjective
brown     ADJ        JJ         adjective
box       NOUN       NN         noun, singular or mass
jumped    VERB       VBD        verb, past tense
over      ADP        IN         conjunction, subordinating or preposition
the       DET        DT         determiner
lazy      ADJ        JJ         adjective
dog       NOUN       NN         noun, singular or mass
's        PART       POS        possessive ending
back      NOUN       NN         noun, singular or mass
.         PUNCT      .          punctuation mark, sentence closer


In [None]:
doc = nlp(u"I read books on NLP")

In [None]:
word = doc[1]

In [None]:
word.text

'read'

In [None]:
token = word
print(f"{token.text:{10}}{token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read      VERB       VBD        verb, past tense


In [None]:
doc = nlp(u"I read a book on NLP")

word = doc[1]

token = word
print(f"{token.text:{10}}{token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read      VERB       VBD        verb, past tense


In [None]:
# Counting POS

doc = nlp(u"The quick brown box jumped over the lazy dog's back.")

In [None]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [None]:
POS_counts

{84: 3, 85: 1, 90: 2, 92: 3, 94: 1, 97: 1, 100: 1}

In [None]:
doc.vocab[83].text

'LANG'

In [None]:
doc[4]

jumped

In [None]:
doc[3].pos

92

In [None]:
# POS_counts is keyed by integer attribute IDs, not strings. Walk the IDs in
# ascending order and look each one up in doc.vocab to print its text form,
# padded to 5 characters, followed by how many tokens carry that value.
for k,v in sorted(POS_counts.items()):
  print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
97. PUNCT 1
100. VERB  1


In [None]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
  print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [None]:
len(doc.vocab)

508

In [None]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
  print(f"{k}. {doc.vocab[k].text:{5}} {v}")

402. amod  2
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
7037928807040764755. compound 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


Visualizing POS

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [None]:
from spacy import displacy

In [None]:
displacy.render(doc, style='dep', jupyter=True)

In [None]:
# Rendering options for the dependency visualizer.
# 'compact' must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled compact mode too).
options = {'distance':110, 'compact':True, 'color':'yellow','bg':'#09a3d5', 'font':'Times'}

In [None]:
displacy.render(doc, style='dep', jupyter=True, options=options)

In [None]:
doc2 = nlp("This is a sentence. This is another sentence, possibly longer than the other")

In [None]:
spans = list(doc2.sents)

In [None]:
#displacy.serve(spans, style='dep', options={'distance':110})

Named Entity Recognition

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
def show_ents(doc):
  """Print every named entity found in `doc`, one per line.

  Each line has the form ``<text> - <label> - <explanation>``, where the
  explanation is whatever ``spacy.explain`` returns for the label (converted
  with ``str``, so an unknown label prints as ``None``).
  When ``doc.ents`` is empty, prints ``No entities found`` instead.
  """
  # Guard clause: nothing to list.
  if not doc.ents:
    print('No entities found')
    return

  for entity in doc.ents:
    fields = [entity.text, entity.label_, str(spacy.explain(entity.label_))]
    print(' - '.join(fields))

In [None]:
doc=nlp(u'Hi how are you?')

In [None]:
show_ents(doc)

No entities found


In [None]:
doc = nlp(u'May I go to Wasington, DC next may to see the Wasington Monument?.')

In [None]:
show_ents(doc)

Wasington - GPE - Countries, cities, states
DC - GPE - Countries, cities, states
the Wasington Monument - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [None]:
doc = nlp(u"can I please have 500 dollars of Microsoft stock?")

In [None]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [None]:
doc = nlp(u'Tesla to build a U.K. for &6 million')

In [None]:
show_ents(doc)

U.K. - GPE - Countries, cities, states
6 million - CARDINAL - Numerals that do not fall under another type


In [None]:
from spacy.tokens import Span

In [None]:
ORG = doc.vocab.strings[u'ORG']

In [None]:
ORG

383

In [None]:
new_ent = Span(doc,0,1,label=ORG)

In [None]:
doc.ents = list(doc.ents) + [new_ent]

In [None]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
6 million - CARDINAL - Numerals that do not fall under another type


In [None]:
# The two adjacent literals are concatenated by Python; the trailing space after
# the first sentence keeps the text from running together as "cleaner.This".
doc = nlp(u"Our company created a brand new vacuum cleaner. " u"This new vacuum-cleaner is the best in show")

In [None]:
show_ents(doc)

No entities found


In [None]:
from spacy.matcher import PhraseMatcher

In [None]:
matcher = PhraseMatcher(nlp.vocab)

In [None]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [None]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [None]:
matcher.add('newproduct', None, *phrase_patterns)

In [None]:
found_matches = matcher(doc)

In [None]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [None]:
from spacy.tokens import Span

In [None]:
PROD =doc.vocab.strings[u"PRODUCT"]

In [None]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [None]:
new_ents= [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [None]:
doc.ents = list(doc.ents) + new_ents

In [None]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [None]:
doc = nlp(u"Original I paid $29.95 for this car toy, but now it is marked down by 10 dollars")

In [None]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.95, 10 dollars]

In [None]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

In [None]:
# Visualization

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
from spacy import displacy

In [None]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for profit of $6 million")

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# The two adjacent literals are concatenated by Python; the trailing space after
# the first sentence keeps the text from running together as "million.By",
# which matters because a later cell iterates over doc.sents.
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for profit of $6 million. " u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
for sent in doc.sents:
  displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [None]:
options = {'ents':['PRODUCT', 'ORG']}

In [None]:
displacy.render(doc, style='ent', jupyter=True, options = options)

In [None]:
#Choosing colors
colors = {'ORG': 'red'}
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options = options)

In [None]:
#Choosing colors
colors = {'ORG': 'radial-gradient(yellow, green)'}
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options = options)

In [None]:
#Choosing colors
colors = {'ORG': 'linear-gradient(yellow, red)'}
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options = options)

In [None]:
#Choosing colors
colors = {'ORG': 'linear-gradient(45deg, yellow, red)'}
options = {'ents':['PRODUCT', 'ORG'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options = options)

In [None]:
#displacy.serve(doc, style='ent', options=options)

Sentence Segmentation

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc= nlp(u"This is the first sentence. This is another sentence. this is the last sentence.")

In [None]:
for sent in doc.sents:
  print(sent)

This is the first sentence.
This is another sentence.
this is the last sentence.


In [None]:
# doc.sents cannot be indexed directly, so subscripting it raises TypeError.
# The error is the point of this cell; catch and print it so the demonstration
# does not abort a full "Run All" of the notebook.
try:
  doc.sents[0]
except TypeError as err:
  print(f"TypeError: {err}")

TypeError: ignored

In [None]:
doc[0]

This

In [None]:
# Unlike doc[0], doc.sents[0] raises TypeError because doc.sents does not
# support indexing. Catch and print the error so this cell does not stop a
# full "Run All" of the notebook; use list(doc.sents)[0] to get a sentence.
try:
  doc.sents[0]
except TypeError as err:
  print(f"TypeError: {err}")

TypeError: ignored

In [None]:
list(doc.sents)[0]

This is the first sentence.

In [None]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [None]:
doc = nlp(u'"Management is going the right things; leadership is doing the right things." -Peter Drucker')

In [None]:
doc.text

'"Management is going the right things; leadership is doing the right things." -Peter Drucker'

In [None]:
for sent in doc.sents:
  print(sent)
  print('\n')

"Management is going the right things; leadership is doing the right things."


-Peter


Drucker




In [None]:
# ADD A SEGMENTATION RULE

def set_custtom_boundries(doc):
  """Exploration helper: print each token of `doc` followed by its index.

  For every token, the token itself is printed on one line and its `i`
  attribute on the next. Nothing is returned and `doc` is not modified.
  """
  for tok in doc:
    print(tok, tok.i, sep='\n')

set_custtom_boundries(doc)

"
0
Management
1
is
2
going
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"
15
-Peter
16
Drucker
17


In [None]:
def set_custtom_boundries(doc):
  """Exploration helper: print every token except the last, with its index.

  Iterates over `doc[:-1]`; for each token, the token is printed on one line
  and its `i` attribute on the next. Nothing is returned and `doc` is not
  modified.
  """
  all_but_last = doc[:-1]
  for tok in all_but_last:
    print(tok, tok.i, sep='\n')

doc[:-1]

"Management is going the right things; leadership is doing the right things." -Peter

In [None]:
def set_custtom_boundries(doc):
  """Mark the token that follows every semicolon as the start of a sentence.

  Intended to be used as a pipeline component: it receives `doc`, sets
  `is_sent_start = True` on each token that comes directly after a ';' token,
  and returns the same `doc` object.

  Args:
    doc: the document to annotate; it is indexed by token position and its
      tokens expose `text`, `i` and a writable `is_sent_start`.

  Returns:
    The same `doc`, after all tokens have been examined.
  """
  # The last token is skipped so that `token.i + 1` is always a valid index.
  for token in doc[:-1]:
    if token.text == ';':
      doc[token.i+1].is_sent_start = True

  # Return only after the whole loop has run. Previously the return sat inside
  # the loop body, so the function exited after the first token (and returned
  # None for an empty doc), and the semicolon rule never took effect.
  return doc

In [None]:
# Register the custom boundary function as a pipeline component, placed before
# the parser.
# NOTE(review): this mutates `nlp`; re-running the cell presumably fails because
# a component with the same name is already in the pipeline -- confirm.
nlp.add_pipe(set_custtom_boundries, before='parser')

# Display the component names in order to confirm where it was inserted.
nlp.pipe_names

['tagger', 'set_custtom_boundries', 'parser', 'ner']

In [None]:
doc[:-1]

"Management is going the right things; leadership is doing the right things." -Peter

In [None]:
doc4 = nlp(u'"Management is going the right things; leadership is doing the right things." -Peter Drucker')

In [None]:
for sent in doc4.sents:
  print(sent)

"Management is going the right things; leadership is doing the right things."
-Peter
Drucker


In [None]:
# Reload the model before changing segmentation rules again.
# (The module is `spacy`; the earlier spelling `space` raised NameError.)
nlp = spacy.load('en_core_web_sm')

NameError: ignored

In [None]:
mystring = u"This ia a snetence. This is another .\n\nThis is a \nthird sentence."

In [None]:
print(mystring)

This ia a snetence. This is another .

This is a 
third sentence.


In [None]:
doc = nlp(mystring)

In [None]:
for sentence in doc.sents:
  print(sentence)

This ia a snetence.
This is another .


This is a 
third sentence.


In [None]:
from spacy.pipeline import SentenceSegmenter

In [None]:
def split_on_newlines(doc):
  """Yield consecutive slices of `doc`, breaking after newline tokens.

  A token whose text starts with a newline character closes the current
  slice: the slice runs up to and including that token, and the next token
  begins a new slice. The final slice (`doc[start:]`) is always yielded, so an
  empty `doc` produces one empty slice.

  Note: the token that opens a new slice is not itself checked for a leading
  newline, so a newline token directly after another newline token does not
  trigger a further break.

  Args:
    doc: a sliceable sequence of tokens exposing `text` and `i`.

  Yields:
    Slices of `doc` covering it from start to end without gaps.
  """
  span_start = 0
  break_pending = False

  for token in doc:
    if not break_pending:
      # Only remember that a break is due; the slice is emitted on the next token.
      break_pending = token.text.startswith('\n')
      continue

    yield doc[span_start:token.i]
    span_start = token.i
    break_pending = False

  yield doc[span_start:]

In [None]:
sbd =SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [None]:
nlp.add_pipe(sbd)

In [None]:
doc =nlp(mystring)

In [None]:
for sentence in doc.sents:
  print(sentence)

This ia a snetence. This is another .


This is a 

third sentence.
