In [1]:
import spacy

en_core_web_sm is trained on OntoNotes5 which is an annotated corpus comprising new, blogs, transcripts, etc. A statistical model was generated using these documents and this model is then used to predict things in subsequent documents 


In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
type(nlp)

spacy.lang.en.English

### Basic spacy usage

In [11]:
s = "He didn't want to pay $20 for this book."
doc = nlp(s)

NameError: name 'nlp' is not defined

In [None]:
print([t.text for t in doc])

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [None]:
print(doc[0])
print(type(doc[0]))

He
<class 'spacy.tokens.token.Token'>


In [None]:
print(doc[0:3])
print(type(doc[0:3]))

He didn't
<class 'spacy.tokens.span.Span'>


In [None]:
print([(t.text, t.i) for t in doc])

[('He', 0), ('did', 1), ("n't", 2), ('want', 3), ('to', 4), ('pay', 5), ('$', 6), ('20', 7), ('for', 8), ('this', 9), ('book', 10), ('.', 11)]


In [None]:
print(doc.text)

He didn't want to pay $20 for this book.


In [None]:
s = """Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next. First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs."""

doc = nlp(s)

In [None]:
for sent in doc.sents:
    print(sent)
    print("*"*15)

Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next.
***************
First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs.
***************


In [None]:
s = "He didn't want to pay $20 for this book."
doc = nlp(s)

In [None]:
print([t.text for t in doc])

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [None]:
print([t.is_currency for t in doc])
print([t.is_digit for t in doc])

[False, False, False, False, False, False, True, False, False, False, False, False]
[False, False, False, False, False, False, False, True, False, False, False, False]


In [None]:
for t in doc:
    if t.is_currency:
        if doc[t.i+1].is_digit:
            print(t.text + doc[t.i+1].text)

$20


In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer

s = "Let's go to N.Y.C. for the weekend."
TreebankWordTokenizer().tokenize(s)

['Let', "'s", 'go', 'to', 'N.Y.C.', 'for', 'the', 'weekend', '.']

In [None]:
nlp = spacy.load('en_core_web_sm')
s = "He told Dr. Lovato that he was done with the tests and would post the results shortly."
doc = nlp(s)

In [None]:
print([t.lower_ for t in doc])

['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [None]:
print([t.lower_ if not t.is_sent_start else t for t in doc])

[He, 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [None]:
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

{'put', 'against', 'get', 'indeed', 'on', 'other', 'him', 'during', 'now', 'herein', 'thereby', 'has', 'if', 'own', 'anyway', 'herself', 'who', 'also', 'meanwhile', 'front', 'do', 'take', 'their', 'seem', 'thence', 'itself', 'elsewhere', 'the', 'most', 'six', 'least', 'latterly', "'ll", 'up', 'towards', 'next', 'whereas', 'just', 'throughout', 'either', 'beyond', 'over', '‘ll', 'much', '‘s', 'becomes', 'its', 'former', 'mine', 'nobody', 'alone', 'anyhow', 'nowhere', 'moreover', 'am', 'my', 'though', 'there', 'you', 'where', 'whereupon', '’ve', 'along', 'or', 'some', 'due', 'above', 'until', "'ve", 'doing', 'around', '‘ve', 'upon', 'nothing', 'five', 'namely', 'afterwards', 'i', 'already', 'become', 'others', 'often', 'again', 'below', 'under', 'eight', 'when', 'been', 'whom', 'sixty', 'go', 'down', 'which', 'move', 'seemed', 'forty', 'whoever', 'latter', 'twelve', 'same', 'seeming', 'from', 'than', 'well', 'while', 'almost', 'hereupon', 'toward', 'fifteen', 'call', 'enough', 'else', 'n

In [10]:
print([t for t in doc if not t.is_stop])

[told, Dr., Lovato, tests, post, results, shortly, .]


### Lemmatization

In [11]:
[(t.text, t.lemma_) for t in doc]

[('He', 'he'),
 ('told', 'tell'),
 ('Dr.', 'Dr.'),
 ('Lovato', 'Lovato'),
 ('that', 'that'),
 ('he', 'he'),
 ('was', 'be'),
 ('done', 'do'),
 ('with', 'with'),
 ('the', 'the'),
 ('tests', 'test'),
 ('and', 'and'),
 ('would', 'would'),
 ('post', 'post'),
 ('the', 'the'),
 ('results', 'result'),
 ('shortly', 'shortly'),
 ('.', '.')]

### NLTK

In [2]:
from nltk.stem.snowball import SnowballStemmer
s = 'He told Dr. Lovato that he was done with the tests and would post the results shortly.'

In [3]:
stemmer = SnowballStemmer(language='english')

In [9]:
stemmer.stem(s)

'he told dr. lovato that he was done with the tests and would post the results shortly.'

### Advanced preprocessing

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')
s = "John watched an old movie at the cinema."
doc = nlp(s)

In [13]:
[(t.text, t.pos_) for t in doc]

[('John', 'PROPN'),
 ('watched', 'VERB'),
 ('an', 'DET'),
 ('old', 'ADJ'),
 ('movie', 'NOUN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('cinema', 'NOUN'),
 ('.', 'PUNCT')]

In [14]:
spacy.explain('PROPN')

'proper noun'

In [15]:
[(t.text, t.tag_) for t in doc]

[('John', 'NNP'),
 ('watched', 'VBD'),
 ('an', 'DT'),
 ('old', 'JJ'),
 ('movie', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('cinema', 'NN'),
 ('.', '.')]

In [16]:
spacy.explain('VBD')

'verb, past tense'

### NER

In [17]:
s = "Volkswagen is developing an electric sedan which could potentially come to America next fall."
doc = nlp(s)

[(t.text, t.ent_type_) for t in doc]

[('Volkswagen', 'ORG'),
 ('is', ''),
 ('developing', ''),
 ('an', ''),
 ('electric', ''),
 ('sedan', ''),
 ('which', ''),
 ('could', ''),
 ('potentially', ''),
 ('come', ''),
 ('to', ''),
 ('America', 'GPE'),
 ('next', 'DATE'),
 ('fall', 'DATE'),
 ('.', '')]

In [18]:
spacy.explain('GPE')

'Countries, cities, states'

In [19]:
print([(t.text, t.ent_type_) for t in doc if t.ent_type !=0])

[('Volkswagen', 'ORG'), ('America', 'GPE'), ('next', 'DATE'), ('fall', 'DATE')]


In [24]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Volkswagen', 'ORG'), ('America', 'GPE'), ('next fall', 'DATE')]


In [25]:
print([(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents])

[('Volkswagen', 'ORG', 0, 10), ('America', 'GPE', 75, 82), ('next fall', 'DATE', 83, 92)]


In [27]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [28]:
s = "Ridley Scott directed The Martian."
doc = nlp(s)
displacy.render(doc, style='ent', jupyter=True)

### Parsing

In [29]:
s = "She enrolled in the course at the university."
doc = nlp(s)

displacy.render(doc, style='dep', jupyter=True)

In [30]:
spacy.explain('nsubj')

'nominal subject'

In [31]:
[(t.text, t.dep_) for t in doc]

[('She', 'nsubj'),
 ('enrolled', 'ROOT'),
 ('in', 'prep'),
 ('the', 'det'),
 ('course', 'pobj'),
 ('at', 'prep'),
 ('the', 'det'),
 ('university', 'pobj'),
 ('.', 'punct')]

In [32]:
[(t.text, t.dep_, t.head.text) for t in doc]

[('She', 'nsubj', 'enrolled'),
 ('enrolled', 'ROOT', 'enrolled'),
 ('in', 'prep', 'enrolled'),
 ('the', 'det', 'course'),
 ('course', 'pobj', 'in'),
 ('at', 'prep', 'course'),
 ('the', 'det', 'university'),
 ('university', 'pobj', 'at'),
 ('.', 'punct', 'enrolled')]

### Matcher

In [3]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
s = "I want to book a hotel room."
doc = nlp(s)


In [4]:
pattern = [
  {'TEXT': 'book'},
  {'POS': 'DET', 'OP': '?'},
  {'POS': 'NOUN', 'OP': '+'},
]

In [7]:
matcher.add('USER_INTENT', [pattern])

In [8]:
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['book a hotel', 'book a hotel room']


In [9]:
doc = nlp("I want to book a flight and hotel room in Berlin.")
for noun_phrase in doc.noun_chunks:
  print("phrase: {}, root head: {}".format(noun_phrase, noun_phrase.root.head))

phrase: I, root head: want
phrase: a flight and hotel room, root head: book
phrase: Berlin, root head: in


In [12]:
def yodize(s: str):
    doc = nlp(s)
    for t in doc:
        if t.dep_ == "ROOT":
            print(t.text)
            seq = [doc[t.i + 1: -1].text, doc[0:t.i].text, t.text + "."]
            seq[0] = seq[0].capitalize()
            print(" ".join(seq))

In [13]:
yodize("I will fly to Texas.")

fly
To texas I will fly.


In [14]:
s = "We'll be in Osaka on Feb 13th and leave on Feb 24th."
doc = nlp(s)

In [26]:
print([ent for ent in doc.ents if ent.label_ == "DATE"])

[Feb 13th, Feb 24th]


In [27]:
from spacy.matcher import PhraseMatcher
s = "Caesar Augustus was the founder of the Roman Principate (the first phase of the Roman Empire)."
doc = nlp(s)

In [36]:
matcher = PhraseMatcher(nlp.vocab)


In [37]:
terms = ["Caesar Augustus", "Roman Empire"]
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)
matches = matcher(doc)

In [38]:
print([(start, end) for match_id, start, end in matches])

[(0, 2), (15, 17)]
