In [1]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.0.3-cp38-cp38-win_amd64.whl (11.8 MB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp38-cp38-win_amd64.whl (6.5 MB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.5-cp38-cp38-win_amd64.whl (112 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp38-cp38-win_amd64.whl (1.8 MB)
Collecting spacy-legacy<3.1.0,>=3.0.0
  Downloading spacy_legacy-3.0.1-py2.py3-none-any.whl (7.0 kB)
Collecting catalogue<2.1.0,>=2.0.1
  Downloading catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp38-cp38-win_amd64.whl (36 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting thinc<8.1.0,>=8.0.0
  Downloading thinc-8.0.1-cp38-cp38-win_amd64.whl (1.0 MB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting srsly<3.0.0,>=2.4.0
  Downloading srsly-2.4.0-cp38-cp38-win_amd64.whl (451 kB)
Collecting pathy
 

In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0-py3-none-any.whl (778.8 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.0.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
import spacy
print(spacy.__version__)

3.0.3


In [4]:
nlp = spacy.load('en_core_web_lg')

In [7]:
## tokenization

doc = nlp(u'I am flying to Manila.')
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila', '.']


In [8]:
### lemmatization

doc = nlp(u'this product integrates both libraries for downloading and applying patches')
for token in doc:
    print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


In [10]:
### Part of speech tagging

doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
    print(token.text, token.pos_, token.tag_)

I PRON PRP
have AUX VBP
flown VERB VBN
to ADP IN
Cebu PROPN NNP
. PUNCT .
Now ADV RB
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [11]:
spacy.explain('PRP')

'pronoun, personal'

In [12]:
## Separating sentences

doc = nlp(u'I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


In [14]:
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Francisco.')
[doc[i] for i in range(len(doc))]

[The, Golden, Gate, Bridge, is, an, iconic, landmark, in, San, Francisco, .]

In [17]:
## Retokenizer

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
    

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[7:9])

for token in doc:
    print(token.text, token.lemma_, token.pos_)

The the DET
Golden Gate Bridge Golden Gate Bridge PROPN
is be AUX
an an DET
iconic iconic ADJ
landmark landmark NOUN
in in ADP
San Francisco San Francisco PROPN
. . PUNCT


In [18]:
### Syntactic Parsing

doc = nlp(u'I want a green apple.')
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
. PUNCT punct punctuation


In [19]:
from spacy import displacy
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [22]:
doc = nlp(u'The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc:
    if token.tag_ == '$':
        i = token.i + 1
        while doc[i].tag_ == 'CD':
            phrase += doc[i].text + ' '
            i += 1
        break

phrase = phrase[:-1]
print(phrase)

1.5 million


In [23]:
## taking 1.5 million and 1.2 million

doc = nlp(u'The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc:
    if token.tag_ == '$':
        phrase = token.text
        i = token.i + 1
        while doc[i].tag_ == 'CD':
            phrase += doc[i].text + ' '
            i += 1
        phrase = phrase[:-1]
        print(phrase)

$1.5 million
$1.2 million


In [24]:
## using regex

import re

pattern = '\$.*million'
test_string = 'The firm earned $1.5 million in 2017'
result = re.findall(pattern, test_string)
print(result)

['$1.5 million']


In [25]:
## using regex

import re

pattern = '\$.+?million'
test_string = 'The firm earned $1.5 million in 2017, in comparison with $1.2 million in 2016.'
result = re.findall(pattern, test_string)
print(result)

['$1.5 million', '$1.2 million']


In [26]:
from IPython.core.display import display, HTML

doc = nlp(u'I want a Greek pizza.')

from spacy import displacy
html = displacy.render(doc, style='ent', page=True)

display(HTML(html))

<IPython.core.display.HTML object>

In [27]:
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [29]:
### Similarity

doc = nlp('I want a green apple.')
doc.similarity(doc[2:5])

0.8776482403927138

In [31]:
doc.similarity(doc)

1.0

In [33]:
nlp('apple').similarity(nlp('banana'))

0.5831844168885263

In [34]:
nlp('king').similarity(nlp('queen'))

0.7252610345406867

In [36]:
nlp('susceptible').similarity(nlp('vulnerable'))

0.6251328421409517

In [41]:
nlp('Pneumonoultramicroscopicsilicovolcanoconiosis').vector

array([ 1.3118e+00,  3.1203e-02,  6.9682e-01, -1.4502e-01, -5.4314e-01,
       -3.3900e-01,  1.1069e+00,  3.1314e-01, -9.4353e-02, -1.4020e+00,
        2.7567e-01, -6.6789e-01, -1.1626e-01, -3.3193e-01,  1.2936e-01,
        5.1401e-01,  1.2313e+00, -1.5955e+00,  2.9266e-01, -1.2529e+00,
       -9.0306e-01,  2.2884e-01,  2.8904e-01,  3.2362e-01,  8.0127e-01,
       -3.0785e-01,  6.9487e-01, -1.8954e-01,  2.4882e-01,  3.4997e-01,
        1.2864e+00,  7.5302e-01,  6.0450e-02,  3.4085e-01, -1.2464e-01,
       -3.2313e-01,  3.7892e-01,  2.8298e-01, -8.0175e-02,  7.9767e-01,
        8.4346e-01, -3.0442e-01, -8.4389e-01, -1.2494e-02, -5.9192e-02,
        8.2790e-01,  2.2620e-01, -6.4191e-02,  1.1475e-01,  6.1042e-02,
        7.5387e-01,  5.8426e-01, -2.1308e-01, -4.7659e-01,  5.5876e-01,
        4.2223e-02,  3.8060e-01,  1.8259e-01, -8.8816e-02,  3.3387e-01,
        1.1277e-01, -9.0535e-02,  6.5429e-01, -5.1379e-02, -7.4891e-02,
       -2.6348e-01,  7.6872e-04, -1.0968e-01,  2.0391e-01,  7.25