[View in Colaboratory](https://colab.research.google.com/github/rksharma55555/testspacy/blob/master/test_spacy.ipynb)

In [1]:
!pip install spacy




In [2]:
!python -m spacy download en              # default English model (~50MB)

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 53.5MB/s 

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [0]:
#!python -m spacy download en              # default English model (~50MB)
!python -m spacy download en_core_web_md  # larger English model (~1GB)


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K

In [4]:
import spacy.cli
spacy.cli.download("en_core_web_sm")




[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [0]:
#Installing en model
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
print(doc)

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Apple is looking at buying U.K. startup for $1 billion
Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [0]:
!python -m spacy download en_core_web_lg

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    15% |████▉                           | 129.9MB 68.2MB/s eta 0:00:11

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I love coffee')
print(doc.vocab.strings[u'coffee'])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


In [0]:
#Word Vectors and Similarity
import spacy

nlp = spacy.load('en_core_web_md')
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")

apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]

print('apple <-> banana', apple.similarity(banana))
print('pasta <-> hippo', pasta.similarity(hippo))
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

apple <-> banana 0.5831845
pasta <-> hippo 0.12069741
True True True True


In [0]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

def set_sentiment(matcher, doc, i, matches):
    doc.sentiment += 0.1

pattern1 = [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji

doc = nlp(u"A text about Google I/O 😀😀")
matches = matcher(doc)

for match_id, start, end in matches:
   string_id = nlp.vocab.strings[match_id]
   span = doc[start:end]
   print(string_id, span.text)
print('Sentiment', doc.sentiment)

GoogleIO Google I/O
HAPPY 😀😀
HAPPY 😀
Sentiment 0.20000000298023224


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()
# choose the file on your computer to upload it then
#import data

Saving testspcy.py to testspcy (2).py


{'testspcy.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'Net income was $9.4 million compared to the prior year of $2.7 million.\',\r\n    \'Revenue exceeded twelve billion dollars, with a loss of $1b.\',\r\n]\r\n\r\n\r\n@plac.annotations(\r\n    model=("Model to load (needs parser and NER)", "positional", None, str))\r\ndef main(model=\'en_core_web_sm\'):\r\n    nlp = spacy.load(model)\r\n    print("Loaded model \'%s\'" % model)\r\n    print("Processing %d texts" % le

In [0]:
#Check if file upload is correct or not
!cat testspcy.py

#!/usr/bin/env python
# coding: utf8
"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    for text in TEXTS:
        doc = nlp(text)

In [0]:
#Run the uploaded code file
!python testspcy.py

Loaded model 'en_core_web_sm'
Processing 2 texts
Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()


Saving testspcy2.py to testspcy2.py


{'testspcy2.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \\\r\na discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \\\r\npayable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \\\r\npayable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 201

In [0]:
!python testspcy2.py

Loaded model 'en_core_web_sm'
Processing 1 texts
the amount	MONEY	26,600,000
a discount	MONEY	133,266


In [0]:
#Uploading code file from local system
from google.colab import files
files.upload()

Saving testspcy2.py to testspcy2 (1).py


{'testspcy2.py': b'#!/usr/bin/env python\r\n# coding: utf8\r\n"""A simple example of extracting relations between phrases and entities using\r\nspaCy\'s named entity recognizer and the dependency parse. Here, we extract\r\nmoney and currency values (entities labelled as MONEY) and then check the\r\ndependency tree to find the noun phrase they are referring to \xe2\x80\x93 for example:\r\n$9.4 million --> Net income.\r\nCompatible with: spaCy v2.0.0+\r\n"""\r\nfrom __future__ import unicode_literals, print_function\r\n\r\nimport plac\r\nimport spacy\r\n\r\n\r\nTEXTS = [\r\n    \'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \\\r\na discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \\\r\npayable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \\\r\npayable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 201

In [0]:
!cat testspcy2.py


#!/usr/bin/env python
# coding: utf8
"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    'The School has Revenue Bonds, Series 2010 in the amount of $26,600,000.  These bonds were issued at \
a discount of $133,266.  One group of bonds totaling $9,650,000 matures on July 1, 2030, interest \
payable at 6.00%, and the other group of bonds totaling $16,950,000 matures on July 1, 2042, interest \
payable at 7.00%.  Interest payments began July 1, 2011, and principal payments began July 1, 2013.'
]


@plac.annotations(
    model=("Model

In [0]:
!python testspcy2.py

Loaded model 'en_core_web_sm'
Processing 1 texts
the amount	MONEY	26,600,000
a discount	MONEY	133,266


In [0]:
#POS Tagging
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

# =============================================================================
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalisation, punctuation, digits.
# is alpha: Is the token an alpha character?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?
# =============================================================================


Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [0]:
#Noun Chunking
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


**Navigating the parse tree**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [0]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)


{shift}


**Iterating around the local tree**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"bright red apples on the tree")
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

['bright', 'red']
['on']
2
1


In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Credit and mortgage account holders must submit their requests")

root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
          descendant.n_rights,
          [ancestor.text for ancestor in descendant.ancestors])

Credit nmod 0 2 ['holders', 'submit']
and cc 0 0 ['Credit', 'holders', 'submit']
mortgage compound 0 0 ['account', 'Credit', 'holders', 'submit']
account conj 1 0 ['Credit', 'holders', 'submit']
holders nsubj 1 0 ['submit']


In [0]:
#Visualizing
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
displacy.render(doc, style='dep', jupyter=True)

**Named Entity Recognition 101**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


**Accessing entity annotations**

In [0]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # [u'San', u'B', u'GPE']
print(ent_francisco)  # [u'Francisco', u'I', u'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [0]:

#!python -m spacy download custom_ner_model


/usr/bin/python3: No module named spacy


In [0]:
!python -m spacy validate

/usr/bin/python3: No module named spacy


In [0]:
!cd /usr/local/lib/python3.6/dist-packages/spacy


In [0]:
!ls -ltr /usr/local/lib/python3.6/dist-packages/spacy

total 15096
-rw-r--r--  1 root staff    1093 Jun 18 05:50 __main__.py
-rw-r--r--  1 root staff     630 Jun 18 05:50 __init__.py
-rw-r--r--  1 root staff       0 Jun 18 05:50 __init__.pxd
-rw-r--r--  1 root staff   18993 Jun 18 05:50 _ml.py
-rw-r--r--  1 root staff     730 Jun 18 05:50 about.py
-rw-r--r--  1 root staff    4531 Jun 18 05:50 attrs.pyx
-rw-r--r--  1 root staff    1027 Jun 18 05:50 attrs.pxd
-rwxr-xr-x  1 root staff  409680 Jun 18 05:50 attrs.cpython-36m-x86_64-linux-gnu.so
-rw-r--r--  1 root staff   12528 Jun 18 05:50 glossary.py
-rw-r--r--  1 root staff   17690 Jun 18 05:50 errors.py
-rw-r--r--  1 root staff    3855 Jun 18 05:50 compat.py
-rwxr-xr-x  1 root staff 1902744 Jun 18 05:50 gold.cpython-36m-x86_64-linux-gnu.so
-rw-r--r--  1 root staff     754 Jun 18 05:50 gold.pxd
-rw-r--r--  1 root staff    3969 Jun 18 05:50 lemmatizer.py
-rw-r--r--  1 root staff   31074 Jun 18 05:50 language.py
-rw-r--r--  1 root staff   20730 Jun 18 05:50 gold.pyx
-rwxr-xr-x 

In [0]:
import spacy

ModuleNotFoundError: ignored