[View in Colaboratory](https://colab.research.google.com/github/rksharma55555/testspacy/blob/master/test_spacy.ipynb)

In [1]:
!pip install spacy




In [2]:
!python -m spacy download en              # default English model (~50MB)

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 53.5MB/s 

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [0]:
#!python -m spacy download en              # default English model (~50MB)
!python -m spacy download en_core_web_md  # larger English model (~1GB)


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K

In [4]:
import spacy.cli
spacy.cli.download("en_core_web_sm")




[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [0]:
#Installing en model
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
print(doc)

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Apple is looking at buying U.K. startup for $1 billion
Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [0]:
!python -m spacy download en_core_web_lg

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    15% |████▉                           | 129.9MB 68.2MB/s eta 0:00:11

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I love coffee')
print(doc.vocab.strings[u'coffee'])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


In [0]:
#Word Vectors and Similarity
import spacy

nlp = spacy.load('en_core_web_md')
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print('apple <-> banana', apple.similarity(banana))
print('pasta <-> hippo', pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

apple <-> banana 0.5831845
pasta <-> hippo 0.12069741
True True True True


In [0]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

def set_sentiment(matcher, doc, i, matches):
    doc.sentiment += 0.1

pattern1 = [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji

doc = nlp(u"A text about Google I/O 😀😀")
matches = matcher(doc)

for match_id, start, end in matches:
   string_id = nlp.vocab.strings[match_id]
   span = doc[start:end]
   print(string_id, span.text)
print('Sentiment', doc.sentiment)

GoogleIO Google I/O
HAPPY 😀😀
HAPPY 😀
Sentiment 0.20000000298023224


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()
# choose the file on your computer to upload it then
#import data

Saving testspcy.py to testspcy (2).py


{'testspcy.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'Net income was $9.4 million compared to the prior year of $2.7 million.\',\r\n    \'Revenue exceeded twelve billion dollars, with a loss of $1b.\',\r\n]\r\n\r\n\r\n@plac.annotations(\r\n    model=("Model to load (needs parser and NER)", "positional", None, str))\r\ndef main(model=\'en_core_web_sm\'):\r\n    nlp = spacy.load(model)\r\n    print("Loaded model \'%s\'" % model)\r\n    print("Processing %d texts" % le

In [0]:
#Check if file upload is correct or not
!cat testspcy.py

#!/usr/bin/env python
# coding: utf8
"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)

In [0]:
#Run the uploaded code file
!python testspcy.py

Loaded model 'en_core_web_sm'
Processing 2 texts
Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()


Saving testspcy2.py to testspcy2.py


{'testspcy2.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \\\r\na discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \\\r\npayable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \\\r\npayable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 201

In [0]:
!python testspcy2.py

Loaded model 'en_core_web_sm'
Processing 1 texts
the amount	MONEY	26,600,000
a discount	MONEY	133,266


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()

Saving testspcy2.py to testspcy2 (1).py


{'testspcy2.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \\\r\na discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \\\r\npayable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \\\r\npayable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 201

In [0]:
!cat testspcy2.py


#!/usr/bin/env python
# coding: utf8
"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \
a discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \
payable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \
payable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 2013.'
]


@plac.annotations(
    model=("Model

In [0]:
!python testspcy2.py

Loaded model 'en_core_web_sm'
Processing 1 texts
the amount	MONEY	26,600,000
a discount	MONEY	133,266


In [0]:
#POS Tagging
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

# =============================================================================
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalisation, punctuation, digits.
# is alpha: Is the token an alpha character?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?
# =============================================================================


Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [0]:
#Noun Chunking
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


**Navigating the parse tree**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [0]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)


{shift}


**Iterating around the local tree**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"bright red apples on the tree")
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

['bright', 'red']
['on']
2
1


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Credit and mortgage account holders must submit their requests")

root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
          descendant.n_rights,
          [ancestor.text for ancestor in descendant.ancestors])

Credit nmod 0 2 ['holders', 'submit']
and cc 0 0 ['Credit', 'holders', 'submit']
mortgage compound 0 0 ['account', 'Credit', 'holders', 'submit']
account conj 1 0 ['Credit', 'holders', 'submit']
holders nsubj 1 0 ['submit']


In [0]:
#Visualizing
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
displacy.render(doc, style='dep', jupyter=True)

**Named Entity Recognition 101**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


**Accessing entity annotations**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # [u'San', u'B', u'GPE']
print(ent_francisco)  # [u'Francisco', u'I', u'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [0]:

#!python -m spacy download custom_ner_model


/usr/bin/python3: No module named spacy


In [0]:
!python -m spacy validate

/usr/bin/python3: No module named spacy


In [0]:
!cd /usr/local/lib/python3.6/dist-packages/spacy


In [0]:
!ls -ltr /usr/local/lib/python3.6/dist-packages/spacy

total 15096
-rw-r--r--  1 root staff    1093 Jun 18 05:50 __main__.py
-rw-r--r--  1 root staff     630 Jun 18 05:50 __init__.py
-rw-r--r--  1 root staff       0 Jun 18 05:50 __init__.pxd
-rw-r--r--  1 root staff   18993 Jun 18 05:50 _ml.py
-rw-r--r--  1 root staff     730 Jun 18 05:50 about.py
-rw-r--r--  1 root staff    4531 Jun 18 05:50 attrs.pyx
-rw-r--r--  1 root staff    1027 Jun 18 05:50 attrs.pxd
-rwxr-xr-x  1 root staff  409680 Jun 18 05:50 attrs.cpython-36m-x86_64-linux-gnu.so
-rw-r--r--  1 root staff   12528 Jun 18 05:50 glossary.py
-rw-r--r--  1 root staff   17690 Jun 18 05:50 errors.py
-rw-r--r--  1 root staff    3855 Jun 18 05:50 compat.py
-rwxr-xr-x  1 root staff 1902744 Jun 18 05:50 gold.cpython-36m-x86_64-linux-gnu.so
-rw-r--r--  1 root staff     754 Jun 18 05:50 gold.pxd
-rw-r--r--  1 root staff    3969 Jun 18 05:50 lemmatizer.py
-rw-r--r--  1 root staff   31074 Jun 18 05:50 language.py
-rw-r--r--  1 root staff   20730 Jun 18 05:50 gold.pyx
-rwxr-xr-x 

In [10]:
#Uploading prhase matcher
#Uploading code file from local system
from google.colab import files
files.upload()

Saving phrase_matcher.py to phrase_matcher.py


{'phrase_matcher.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""Match a large set of multi-word expressions in O(1) time.\r\nThe idea is to associate each word in the vocabulary with a tag, noting whether\r\nthey begin, end, or are inside at least one pattern. An additional tag is used\r\nfor single-word patterns. Complete patterns are also stored in a hash set.\r\nWhen we process a document, we look up the words in the vocabulary, to\r\nassociate the words with the tags.  We then search for tag-sequences that\r\ncorrespond to valid candidates. Finally, we look up the candidates in the hash\r\nset.\r\nFor instance, to search for the phrases "Barack Hussein Obama" and "Hilary\r\nClinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with\r\nthe I tag, and Obama and Clinton with the L tag.\r\nThe document "Barack Clinton and Hilary Clinton" would have the tag sequence\r\n[{B}, {L}, {}, {B}, {L}], so we\'d get two matches. However, only the second\r\ncandidate is

In [12]:
import spacy
nlp = spacy.load('en')
doc = nlp('Hello     World!')
for token in doc:
    print('"' + token.text + '"')

"Hello"
"    "
"World"
"!"


In [13]:
import spacy
nlp = spacy.load('en')
doc = nlp('Hello     World!')
for token in doc:
    print('"' + token.text + '"', token.idx)
 
# "Hello" 0
# "    " 6
# "World" 10
# "!" 15

"Hello" 0
"    " 6
"World" 10
"!" 15


In [14]:
doc = nlp("Next week I'll   be in Madrid.")
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))
 

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	-PRON-	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	VERB	MD
  	15	  	False	True	  	SPACE	_SP
be	17	be	False	False	xx	VERB	VB
in	20	in	False	False	xx	ADP	IN
Madrid	23	madrid	False	False	Xxxxx	PROPN	NNP
.	29	.	True	False	.	PUNCT	.


**Sentence detection**

In [15]:
doc = nlp("These are apples. These are oranges.")
 
for sent in doc.sents:
    print(sent)

These are apples.
These are oranges.


**Part Of Speech Tagging**

In [16]:
doc = nlp("Next week I'll be in Madrid.")
print([(token.text, token.tag_) for token in doc])

[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


Named Entity Recognition
bold text

In [17]:
doc = nlp("Next week I'll be in Madrid.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Next week DATE
Madrid GPE


**Entity Types**

In [18]:
doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ")
for ent in doc.ents:
    print(ent.text, ent.label_)

2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


**displayCY**

In [19]:
from spacy import displacy
 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

**Chunking**

In [20]:

doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


**Dependency Parsing**

In [21]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
 
for token in doc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


**If this doesn’t help visualizing the dependency tree, displaCy comes in handy:**

In [22]:
from spacy import displacy
 
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

**Word Vectors**

In [24]:
!python -m spacy download en_core_web_lg


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    50% |████████████████                | 428.6MB 54.1MB/s eta 0:00:08

[K    100% |████████████████████████████████| 852.3MB 13.3MB/s 
[?25hInstalling collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... [?25l- \ | / - \ | / - done
[?25hSuccessfully installed en-core-web-lg-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



In [25]:
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['banana'].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [28]:
from scipy import spatial
 
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
 
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector
 
# We now need to find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
maybe_king = man - woman + queen
computed_similarities = []
 
for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector:
        continue
 
    similarity = cosine_similarity(maybe_king, word.vector)
    computed_similarities.append((word, similarity))
 
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])
 
# ['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'KINGS', 'kings', 'Kings']

['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'Kings', 'KINGS', 'kings']


**Computing Similarity**

In [29]:
banana = nlp.vocab['banana']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']
 
print(dog.similarity(animal), dog.similarity(fruit)) # 0.6618534 0.23552845
print(banana.similarity(fruit), banana.similarity(animal)) # 0.67148364 0.2427285

0.66185343 0.23552851
0.67148364 0.24272855


In [30]:
target = nlp("Cats are beautiful animals.")
 
doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")
 
print(target.similarity(doc1))  # 0.8901765218466683
print(target.similarity(doc2))  # 0.9115828449161616
print(target.similarity(doc3))  # 0.7822956752876101

0.8901766262114666
0.9115828449161616
0.7822956256736615


**Creating Document level Extension**

In [32]:
!pip install -U nltk

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 17.8MB/s 
[?25hRequirement not upgraded as not directly required: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0)
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25l- \ | / - done
[?25h  Stored in directory: /content/.cache/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.3


In [35]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /content/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [36]:
import spacy
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer
 
sentiment_analyzer = SentimentIntensityAnalyzer()
def polarity_scores(doc):
    return sentiment_analyzer.polarity_scores(doc.text)
 
Doc.set_extension('polarity_scores', getter=polarity_scores)
 
nlp = spacy.load('en')
doc = nlp("Really Whaaat event apple nice! it!")
print(doc._.polarity_scores)
# {'neg': 0.0, 'neu': 0.596, 'pos': 0.404, 'compound': 0.5242}

{'neg': 0.0, 'neu': 0.596, 'pos': 0.404, 'compound': 0.5242}


In [39]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
from nltk.corpus import wordnet as wn
from spacy.tokens import Token
 
 
def penn_to_wn(tag):
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
 
class WordnetPipeline(object):
    def __init__(self, nlp):
        Token.set_extension('synset',force=True)
 
    def __call__(self, doc):
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue
 
            ss = wn.synsets(token.text, wn_tag)[0]
            token._.set('synset', ss)
 
        return doc
 
 
nlp = spacy.load('en')
wn_pipeline = WordnetPipeline(nlp)
nlp.add_pipe(wn_pipeline, name='wn_synsets')
doc = nlp("Paris is the awesome capital of France.")
 
for token in doc:
    print(token.text, "-", token._.synset)

ValueError: ignored