In [1]:
import spacy
import en_core_web_lg

print(f'SpaCy:\t{spacy.__version__}')
print(f'en_core_web_lg:\t{en_core_web_lg.__version__}')

SpaCy:	3.0.3
en_core_web_lg:	3.0.0


In [2]:
# nlp = spacy.load("en_core_web_lg")
nlp = en_core_web_lg.load()

In [3]:
doc = nlp("The big brown fox jump over the lazy dog. Today is International Womens' day")
doc

The big brown fox jump over the lazy dog. Today is International Womens' day

In [4]:
### Tokenization
tokens = [w.text for w in doc]
print(tokens)

['The', 'big', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'Today', 'is', 'International', 'Womens', "'", 'day']


In [5]:
### Lemmatization
for token in doc:
    print(token.text, token.lemma_)

The the
big big
brown brown
fox fox
jump jump
over over
the the
lazy lazy
dog dog
. .
Today today
is be
International International
Womens Womens
' '
day day


In [6]:
### POS Tagging
for token in doc:
    print(f'{token.text:20} {token.pos_:10} {spacy.explain(token.tag_)}')

The                  DET        determiner
big                  ADJ        adjective
brown                ADJ        adjective
fox                  NOUN       noun, singular or mass
jump                 NOUN       noun, singular or mass
over                 ADP        conjunction, subordinating or preposition
the                  DET        determiner
lazy                 ADJ        adjective
dog                  NOUN       noun, singular or mass
.                    PUNCT      punctuation mark, sentence closer
Today                NOUN       noun, singular or mass
is                   AUX        verb, 3rd person singular present
International        PROPN      noun, proper singular
Womens               PROPN      noun, proper singular
'                    PART       possessive ending
day                  NOUN       noun, singular or mass


In [7]:
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[The, big, brown, fox, jump, over, the, lazy, dog, .]
[Today, is, International, Womens, ', day]


In [8]:
[doc[i] for i in range(len(doc))]

[The,
 big,
 brown,
 fox,
 jump,
 over,
 the,
 lazy,
 dog,
 .,
 Today,
 is,
 International,
 Womens,
 ',
 day]

In [9]:
doc2 = doc.copy()
with doc2.retokenize() as retokenizer:
    retokenizer.merge(doc2[1:4])
with doc2.retokenize() as retokenizer:
    retokenizer.merge(doc2[7:9])
for token in doc2:
    print(f'{token.text:30} {token.lemma_:20}  {token.pos_:10} {spacy.explain(token.tag_)}')

The                            the                   DET        determiner
big brown fox                  big brown fox         ADJ        adjective
jump                           jump                  NOUN       noun, singular or mass
over                           over                  ADP        conjunction, subordinating or preposition
the                            the                   DET        determiner
lazy                           lazy                  ADJ        adjective
dog                            dog                   NOUN       noun, singular or mass
. Today                        . today               NOUN       noun, singular or mass
is                             be                    AUX        verb, 3rd person singular present
International                  International         PROPN      noun, proper singular
Womens                         Womens                PROPN      noun, proper singular
'                              '                     PART       p

In [10]:
for token in doc:
    print(f'{token.text:20} {token.pos_:10}  {token.dep_:15} {spacy.explain(token.dep_)}')

The                  DET         det             determiner
big                  ADJ         amod            adjectival modifier
brown                ADJ         amod            adjectival modifier
fox                  NOUN        compound        compound
jump                 NOUN        ROOT            None
over                 ADP         prep            prepositional modifier
the                  DET         det             determiner
lazy                 ADJ         amod            adjectival modifier
dog                  NOUN        pobj            object of preposition
.                    PUNCT       punct           punctuation
Today                NOUN        nsubj           nominal subject
is                   AUX         ROOT            None
International        PROPN       compound        compound
Womens               PROPN       poss            possession modifier
'                    PART        case            case marking
day                  NOUN        attr            

In [36]:
displacy.render(doc, style='dep', jupyter=True)

<IPython.core.display.HTML object>

In [38]:
doc2 = nlp(u'The form earned $1.5 billion in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc2:
    if token.tag_ == '$':
        i = token.i + 1
        while doc2[i].tag_ == 'CD':
            phrase += doc2[i].text + ' '
            i += 1
phrase = phrase[:-1]
print(phrase)

1.5 billion 1.2 million


In [39]:
doc2 = nlp(u'The form earned $1.5 billion in 2017, in comparison with $1.2 million in 2016.')
phrase = ''
for token in doc2:
    if token.tag_ == '$':
        phrase = token.text
        i = token.i + 1
        while doc2[i].tag_ == 'CD':
            phrase += doc2[i].text + ' '
            i += 1
        phrase = phrase[:-1]
        print(phrase)

$1.5 billion
$1.2 million


In [40]:
import re
text = doc2.text
pattern = '\$.+?[bm]illion'
result = re.findall(pattern, text)
result

['$1.5 billion', '$1.2 million']

In [42]:
from spacy import displacy
from IPython.core.display import display, HTML

In [45]:
html = displacy.serve(doc, style='dep')
display(HTML(html))

<IPython.core.display.HTML object>

In [51]:
html = displacy.render(nlp(u'I want to fly to Japan'), style='ent', page=True)
display(HTML(html))

<IPython.core.display.HTML object>

In [50]:
spacy.explain('GPE')

'Countries, cities, states'

In [55]:
### Similarity
doc3 = nlp(u'I want a green apple.')
doc3.similarity(doc3[2:5])

0.8776482403927138

In [57]:
nlp('apple').similarity(nlp('banana'))

0.5831844567891399

In [58]:
nlp('king').similarity(nlp('queen'))

0.7252610345406867

In [59]:
nlp('banana').vector

array([ 2.0228e-01, -7.6618e-02,  3.7032e-01,  3.2845e-02, -4.1957e-01,
        7.2069e-02, -3.7476e-01,  5.7460e-02, -1.2401e-02,  5.2949e-01,
       -5.2380e-01, -1.9771e-01, -3.4147e-01,  5.3317e-01, -2.5331e-02,
        1.7380e-01,  1.6772e-01,  8.3984e-01,  5.5107e-02,  1.0547e-01,
        3.7872e-01,  2.4275e-01,  1.4745e-02,  5.5951e-01,  1.2521e-01,
       -6.7596e-01,  3.5842e-01, -4.0028e-02,  9.5949e-02, -5.0690e-01,
       -8.5318e-02,  1.7980e-01,  3.3867e-01,  1.3230e-01,  3.1021e-01,
        2.1878e-01,  1.6853e-01,  1.9874e-01, -5.7385e-01, -1.0649e-01,
        2.6669e-01,  1.2838e-01, -1.2803e-01, -1.3284e-01,  1.2657e-01,
        8.6723e-01,  9.6721e-02,  4.8306e-01,  2.1271e-01, -5.4990e-02,
       -8.2425e-02,  2.2408e-01,  2.3975e-01, -6.2260e-02,  6.2194e-01,
       -5.9900e-01,  4.3201e-01,  2.8143e-01,  3.3842e-02, -4.8815e-01,
       -2.1359e-01,  2.7401e-01,  2.4095e-01,  4.5950e-01, -1.8605e-01,
       -1.0497e+00, -9.7305e-02, -1.8908e-01, -7.0929e-01,  4.01