In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# imports

import spacy
#nlp = spacy.load('en_core_web_sm') #not so fast internet
nlp = spacy.load('en_core_web_lg') #very fast internet

In [4]:
# tokenization

doc = nlp('I am flying to Manila.')
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila', '.']


In [5]:
# lemmatization

doc = nlp('this product integrates both libraries for downloading and applying patches')
for token in doc:
    print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [6]:
# part of speech tagging

doc = nlp('I have flown to Singapore. I am flying to Manila.')
for token in doc:
    print(token.text, token.pos_, token.tag_)

I PRON PRP
have AUX VBP
flown VERB VBN
to ADP IN
Singapore PROPN NNP
. PUNCT .
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [7]:
spacy.explain('AUX')

'auxiliary'

In [12]:
spacy.explain('VB')

'verb, base form'

In [10]:
doc

I have flown to Singapore. I am flying to Manila.

In [11]:
for token in doc:
    print(token.text, token.lemma_)

I I
have have
flown fly
to to
Singapore Singapore
. .
I I
am be
flying fly
to to
Manila Manila
. .


In [13]:
print([w.text for w in doc if w.tag_=='VBG' or w.tag_=='VB'])

['flying']


In [14]:
doc

I have flown to Singapore. I am flying to Manila.

In [15]:
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Singapore, .]
[I, am, flying, to, Manila, .]


In [16]:
doc = nlp('The Golden Gate Bridge is an iconic landmark in San Francisco.')
print([w.text for w in doc])

['The', 'Golden', 'Gate', 'Bridge', 'is', 'an', 'iconic', 'landmark', 'in', 'San', 'Francisco', '.']


In [17]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[7:9])
for token in doc:
    print(token.text, token.lemma_, token.pos_)

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconic iconic ADJ
landmark landmark NOUN
in in ADP
San Francisco San Francisco PROPN
. . PUNCT


In [18]:
# dependency parsing

doc = nlp('I want a green apple.')
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [19]:
from spacy import displacy
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [21]:
# entity recognition

doc = nlp('The firm earned $1.5 million in 2017.')
phrase = ''
for token in doc:
    if token.tag_ == '$':
        phrase = token.text
        i = token.i+1
        while doc[i].tag_ == 'CD':
            phrase += doc[i].text + ' '
            i += 1
        phrase = phrase[:-1]
        print(phrase)

$1.5 million


In [20]:
spacy.explain('CD')

'cardinal number'

In [22]:
doc = nlp('The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc:
    if token.tag_ == '$':
        phrase = token.text
        i = token.i+1
        while doc[i].tag_ == 'CD':
            phrase += doc[i].text + ' '
            i += 1
        phrase = phrase[:-1]
        print(phrase)

$1.5 million
$1.2 million


In [26]:
doc = nlp('I want to but an Apple computer.')
from IPython.core.display import display, HTML

from spacy import displacy
html = displacy.render(doc,style='ent',page=True)

display(HTML(html))

<IPython.core.display.HTML object>

In [24]:
spacy.explain('GPE')

'Countries, cities, states'

In [27]:
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [28]:
# Word Similarity
nlp('apple').similarity(nlp('banana'))

0.5831844168885263

In [29]:
nlp('king').similarity(nlp('queen'))

0.7252610345406867

In [30]:
doc = nlp('I want a green apple.')
doc.similarity(doc[2:5])

0.8776482403927138

In [31]:
doc.similarity(doc)

1.0