# Parts of Speech Tagging

In [1]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [2]:
doc= nlp(u"Apple is looking at buying a U.K. startup for &1 Billion.")

In [3]:
doc.text

'Apple is looking at buying a U.K. startup for &1 Billion.'

In [9]:
# One row per token: text, coarse POS, fine-grained tag, dependency label, tag description.
for tok in doc:
    tag_description = spacy.explain(tok.tag_)
    print(f'{tok.text:10}{tok.pos_:8}{tok.tag_:6}{tok.dep_:10}{tag_description:10}')

Apple     PROPN   NNP   nsubj     noun, proper singular
is        VERB    VBZ   aux       verb, 3rd person singular present
looking   VERB    VBG   ROOT      verb, gerund or present participle
at        ADP     IN    prep      conjunction, subordinating or preposition
buying    VERB    VBG   pcomp     verb, gerund or present participle
a         DET     DT    det       determiner
U.K.      PROPN   NNP   compound  noun, proper singular
startup   NOUN    NN    dobj      noun, singular or mass
for       ADP     IN    prep      conjunction, subordinating or preposition
&         CCONJ   CC    cc        conjunction, coordinating
1         NUM     CD    compound  cardinal number
Billion   NUM     CD    conj      cardinal number
.         PUNCT   .     punct     punctuation mark, sentence closer


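**spaCy uses the context of the whole sentence to work out the tense: in the two examples below, "read" is tagged as present tense (VBP) in the first and as past tense (VBD) in the second.**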

In [10]:
doc = nlp("I read books on NLP.")
# Columns: token text, coarse POS, fine-grained tag, description of the tag.
for tok in doc:
    tag_description = spacy.explain(tok.tag_)
    print(f'{tok.text:10}{tok.pos_:8}{tok.tag_:6}{tag_description:10}')

I         PRON    PRP   pronoun, personal
read      VERB    VBP   verb, non-3rd person singular present
books     NOUN    NNS   noun, plural
on        ADP     IN    conjunction, subordinating or preposition
NLP       PROPN   NNP   noun, proper singular
.         PUNCT   .     punctuation mark, sentence closer


In [12]:
doc2 = nlp("I read a book on NLP.")
# Columns: token text, coarse POS, fine-grained tag, description of the tag.
for tok in doc2:
    tag_description = spacy.explain(tok.tag_)
    print(f'{tok.text:10}{tok.pos_:8}{tok.tag_:6}{tag_description:10}')

I         PRON    PRP   pronoun, personal
read      VERB    VBD   verb, past tense
a         DET     DT    determiner
book      NOUN    NN    noun, singular or mass
on        ADP     IN    conjunction, subordinating or preposition
NLP       PROPN   NNP   noun, proper singular
.         PUNCT   .     punctuation mark, sentence closer


### counting POS Tags

In [14]:
doc= nlp(u"Apple is looking at buying a U.K. startup for &1 Billion.")
pos_counts=doc.count_by(spacy.attrs.POS)
pos_counts

{96: 1, 99: 3, 84: 2, 88: 1, 89: 1, 91: 1, 92: 2, 95: 2}

In [15]:
doc.vocab[98].text

'SYM'

In [16]:
# List the counts ordered by POS id, resolving each id to its name through the vocab.
for pos_id, count in sorted(pos_counts.items()):
    pos_name = doc.vocab[pos_id].text
    print(f'{pos_id}.{pos_name:5}:{count}')

84.ADP  :2
88.CCONJ:1
89.DET  :1
91.NOUN :1
92.NUM  :2
95.PROPN:2
96.PUNCT:1
99.VERB :3


In [18]:
# Same listing as for POS, but keyed by the hash of the fine-grained tag.
tag_counts = doc.count_by(spacy.attrs.TAG)
for tag_hash, count in sorted(tag_counts.items()):
    tag_name = doc.vocab[tag_hash].text
    print(f'{tag_hash}.{tag_name:5}:{count}')

1292078113972184607.IN   :2
1534113631682161808.VBG  :2
8427216679587749980.CD   :2
12646065887601541794..    :1
13927759927860985106.VBZ  :1
15267657372422890137.DT   :1
15308085513773655218.NN   :1
15794550382381185553.NNP  :2
17571114184892886314.CC   :1


# Visualizing Parts of Speech

In [20]:
from spacy import displacy

In [21]:
doc=nlp(u"A quick brown fox jumps over the lazy dog.")

In [22]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

In [26]:
# Columns: token text, coarse POS, dependency label, description of the dependency label.
for tok in doc:
    # The description is printed as-is, so a missing one shows up as "None" (see the ROOT row).
    dep_description = spacy.explain(tok.dep_)
    print(f' {tok.text:10} {tok.pos_:7} {tok.dep_:7} {dep_description}')

 A          DET     det     determiner
 quick      ADJ     amod    adjectival modifier
 brown      ADJ     amod    adjectival modifier
 fox        NOUN    nsubj   nominal subject
 jumps      VERB    ROOT    None
 over       ADP     prep    prepositional modifier
 the        DET     det     determiner
 lazy       ADJ     amod    adjectival modifier
 dog        NOUN    pobj    object of preposition
 .          PUNCT   punct   punctuation


**displacy the tree differently**

In [27]:
# Settings for the displaCy dependency visualizer.
# - 'compact' is a real boolean: a string such as 'True' only works because any
#   non-empty string is truthy (so 'False' would enable compact mode as well).
# - The font option is spelled 'font'; a key named 'fonts' is never read by displaCy,
#   so the requested typeface would silently not be applied.
options={'distance':110,'compact':True,'color':'yellow','bg':'#09a3d5','font':'Times'}
displacy.render(doc,style='dep',jupyter=True,options=options)

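**Outside a Jupyter notebook, drop the `jupyter=True` argument: `displacy.render` then returns the markup as a string (or use `displacy.serve` to view it in a browser).**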

# Named Entity Recognition

In [31]:
def show_ents(doc):
    """Print every named entity of ``doc`` together with its label.

    Each entity goes on its own line in the form
    ``text-----label-----description``, where the description is whatever
    ``spacy.explain`` returns for the label, converted to a string.
    When ``doc.ents`` is empty a single notice line is printed instead.
    """
    if not doc.ents:
        print('No named entities found!')
        return
    for ent in doc.ents:
        fields = [ent.text, ent.label_, str(spacy.explain(ent.label_))]
        print('-----'.join(fields))

In [32]:
doc=nlp(u"I am heading to New Yorks City and will visit Statue of Liberty tomorrow")

In [33]:
show_ents(doc)

New Yorks City-----GPE-----Countries, cities, states
Statue of Liberty-----ORG-----Companies, agencies, institutions, etc.
tomorrow-----DATE-----Absolute or relative dates or periods


### add our own named entity

In [34]:
doc=nlp(u"Tesla is planning to build a new plant in U.K. for $40 Million.")
show_ents(doc)

U.K.-----GPE-----Countries, cities, states
$40 Million-----MONEY-----Monetary values, including unit


ents returns a tuple of entities.

In [35]:
type(doc.ents)

tuple

In [36]:
type(doc.ents[0])

spacy.tokens.span.Span

step one: find the hash value of 'ORG'

In [38]:
from spacy.tokens import Span
ORG=doc.vocab.strings['ORG']
ORG

381

step two: create a new Span on the doc and assign it the hash value as its label

In [39]:
new_entity=Span(doc,0,1,label=ORG)

step three: append the new entity

In [40]:
doc.ents=list(doc.ents)+[new_entity]

In [41]:
show_ents(doc)

Tesla-----ORG-----Companies, agencies, institutions, etc.
U.K.-----GPE-----Countries, cities, states
$40 Million-----MONEY-----Monetary values, including unit


### Adding multiple named entities for all matching spans

In [52]:
doc=nlp("Our company plans to introduce new vacuum cleaner.If this works out the new vacuum cleaner will be our first product.")
show_ents(doc)

first-----ORDINAL-----"first", "second", etc.


In [46]:
from spacy.matcher import PhraseMatcher
matcher=PhraseMatcher(nlp.vocab)

# Spelling variants of the product name that should all be recognised.
# (The last variant was previously misspelled 'vaccumcleaner', which could never
# match the intended word.)
phrase_list=['vacuum cleaner','vacuum-cleaner','vacuumcleaner']
# The matcher compares token text only, so tokenizing with make_doc() is enough;
# running the full pipeline on every pattern would be wasted work.
phrase_patterns=[nlp.make_doc(text) for text in phrase_list]

In [47]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [48]:
matcher.add('clientproducts',None,*phrase_patterns)

In [53]:
matches=matcher(doc)
matches

[(245706805299067201, 6, 8), (245706805299067201, 15, 17)]

create a label

In [54]:
prod=doc.vocab.strings[u'PRODUCT']

create a list of spans

In [55]:
# Each match is a (match_id, start, end) triple; only the token offsets are needed here.
new_entities = [Span(doc, start, end, label=prod) for _match_id, start, end in matches]

In [56]:
len(new_entities)

2

In [57]:
type(new_entities[0])

spacy.tokens.span.Span

In [58]:
doc.ents=list(doc.ents)+new_entities

In [59]:
show_ents(doc)

vacuum cleaner-----PRODUCT-----Objects, vehicles, foods, etc. (not services)
vacuum cleaner-----PRODUCT-----Objects, vehicles, foods, etc. (not services)
first-----ORDINAL-----"first", "second", etc.


### counting entities of a certain type (label)

In [60]:
doc=nlp(u"I found a furniture priced at $2000 which is marked down by 500 dollars.")
show_ents(doc)

2000-----MONEY-----Monetary values, including unit
500 dollars-----MONEY-----Monetary values, including unit


In [63]:
# Count the entities labelled MONEY without building an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

## Noun Chunks

In [65]:
doc=nlp(u"Autonomous cars shift insurance liability towards manufacturers.")

# Per noun chunk: chunk text, its root token, the root's dependency label, and the root's head.
for chunk in doc.noun_chunks:
    root = chunk.root
    print('-----'.join((chunk.text, root.text, root.dep_, root.head.text)))

Autonomous cars-----cars-----nsubj-----shift
insurance liability-----liability-----dobj-----shift
manufacturers-----manufacturers-----pobj-----towards


In [66]:
type(doc.noun_chunks)

generator

`noun_chunks` is not a container: it is a **generator**, so calling `len()` on it fails with
`TypeError: object of type 'generator' has no len()`.

In [67]:
type(doc.sents)

generator

---
# Visualizing Named Entities

In [68]:
doc=nlp(u"Over the last quarter, Apple sold nearly 20 thousand iPhone 12 for a profit of $200 million.")

In [69]:
displacy.render(doc,style='ent',jupyter=True)

In [70]:
options={'ents':['ORG','DATE','MONEY']}
displacy.render(doc,style='ent',jupyter=True,options=options)

In [72]:
#colors={'ORG':'orange','MONEY':'yellow'}
colors={'ORG':'Linear-gradient(90deg,orange,green)','MONEY':'radial-gradient(red,yellow)'}

options={'ents':['ORG','DATE','MONEY'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

---
# Sentence Segmentation

In [73]:
doc=nlp(u"This is the first sentence. This is another sentence. This is the third sentence.")

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the third sentence.


In [74]:
type(doc.sents)

generator

In [75]:
# doc.sents is a generator, so indexing it raises TypeError. The error is the point
# of this cell, so catch and display it instead of letting it abort a full
# top-to-bottom run of the notebook.
try:
    doc.sents[0]
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: 'generator' object is not subscriptable

`doc.sents` does not hold the sentences in memory; it is a generator that produces them on demand, so it cannot be indexed directly.

In [76]:
list(doc.sents)[0]

This is the first sentence.

Converting the generator to a list makes the individual sentences accessible by index.

In [77]:
len(list(doc.sents))

3

In [78]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [79]:
(list(doc.sents)[1]).start

6

In [80]:
(list(doc.sents)[1]).end

11

### Add sentence segmentation rule

In [83]:
doc1=nlp(u"'Management is about doing things right; leadership is about doing right things.' --- Peter Drucken")
for sent in doc1.sents:
    print(sent)

'Management is about doing things right; leadership is about doing right things.' ---
Peter Drucken


In [84]:
def set_custom_boundary(doc):
    """Mark the token that follows each ';' as the start of a new sentence.

    The last token is excluded from the scan because nothing comes after it.
    Returns the same ``doc`` object once the ``is_sent_start`` flags are set.
    """
    # Lazy generator: tokens are still inspected one by one, in document order.
    after_semicolon = (tok.i + 1 for tok in doc[:-1] if tok.text == ';')
    for next_index in after_semicolon:
        doc[next_index].is_sent_start = True
    return doc

In [85]:
# Register the component only once: add_pipe raises if a component with the same
# name is already in the pipeline, which would break re-running this cell.
if 'set_custom_boundary' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundary,before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundary', 'parser', 'ner']

In [87]:
doc2=nlp(u"'Management is about doing things right; leadership is about doing right things.' --- Peter Drucken")
for sent in doc2.sents:
    print(sent)

'Management is about doing things right;
leadership is about doing right things.' ---
Peter Drucken
