In [1]:
from spacy.lang.en import English

nlp = English()

In [2]:
doc = nlp("Hello World!")

for token in doc:
    print(token.text)

Hello
World
!


In [3]:
token = doc[1]

In [4]:
print(token.text)

World


In [5]:
span = doc[1:3]

In [6]:
span

World!

In [7]:
print(span.text)

World!


In [8]:
doc = nlp("It costs $5.")

print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])

print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4]
Text:     ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [10]:
doc = nlp("She ate the pizza")

for token in doc:
    print(token.text, token.pos_)

She PRON
ate VERB
the DET
pizza NOUN


In [11]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [12]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [13]:
spacy.explain("GPE")

'Countries, cities, states'

In [14]:
spacy.explain("NNP")

'noun, proper singular'

In [15]:
spacy.explain("dobj")

'direct object'

In [16]:
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

matcher = Matcher(nlp.vocab)

pattern = [{'TEXT':'iPhone'}, {'TEXT': "X"}]
matcher.add("IPHONE_PATTERN", None, pattern)

doc = nlp("Upcoming iPhone X release date leaked")

In [17]:
matches = matcher(doc)

In [18]:
doc = nlp("Upcoming iPhone X release date leaked")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start: end]
    print(matched_span.text)

iPhone X


In [19]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

In [20]:
doc = nlp("2018 FIFA World Cup: France won!")

In [21]:
matcher.add("FIFA PATTERN", None, pattern)

In [22]:
matches = matcher(doc)

In [23]:
for match_id, start, end in matches:
    matched_span = doc[start: end]
    print(matched_span.text)

2018 FIFA World Cup:


In [24]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]

In [25]:
matcher.add("LOVE PATTERN", None, pattern)

In [26]:
doc = nlp("I loved dogs but now I love cats more.")

In [27]:
matches = matcher(doc)

In [28]:
for match_id, start, end in matches:
    matched_span = doc[start: end]
    print(matched_span.text)

loved dogs
love cats


In [29]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},
    {"POS": "NOUN"}
]

In [30]:
doc = nlp("I bought a smartphone. Now I'm buying apps.")
matcher.add("BUY PATTERN", None, pattern)

In [31]:
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start: end]
    print(matched_span.text)

bought a smartphone
buying apps


In [32]:
doc = nlp("I bought a smartphone. Now I'm buying apps. I loved dogs but now I love cats more. 2018 FIFA World Cup: France won! Upcoming iPhone X release date leaked")

In [33]:
# Look up the hash for "coffee", then try to map that hash back to its string.
# NOTE: the reverse lookup raises KeyError [E018] (see the recorded output below)
# because "coffee" has not been added to this vocab's StringStore yet; the same
# lookup succeeds in a later cell after nlp("I love coffee") has been processed.
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'. This usually refers to an issue with the `Vocab` or `StringStore`."

In [34]:
nlp.vocab.strings['coffee']

3197928453018144401

In [35]:
doc = nlp("I love coffee")
print("hash value:", nlp.vocab.strings["coffee"])
print("string value:", nlp.vocab.strings[3197928453018144401])

hash value: 3197928453018144401
string value: coffee


In [36]:
doc = nlp("I love coffee")
print("hash value:", doc.vocab.strings["coffee"])

hash value: 3197928453018144401


In [37]:
doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

In [38]:
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [39]:
from spacy.tokens import Doc

words = ["Hello","world", "!"]
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [40]:
doc

Hello world!

In [41]:
from spacy.tokens import Doc, Span

In [42]:
span = Span(doc, 0, 2)
span_with_label = Span(doc, 0, 2, label="GREETING")

doc_ents = [span_with_label]

In [43]:
doc_ents

[Hello world]

In [44]:
import spacy

In [45]:
nlp = spacy.load("en_core_web_md")

doc1 = nlp("I like fast food")
doc2 = nlp("I like cats")
print(doc1.similarity(doc2))

0.8126938142087882


In [46]:
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.73695457


In [47]:
doc = nlp("I like pizza")
token = nlp("soap")[0]
print(doc.similarity(token))

0.3253198600655889


In [48]:
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199091710787739


In [49]:
nlp = spacy.load("en_core_web_md")

doc = nlp("I have a banana")

print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [50]:
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

0.9501447503553421


In [51]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER":"cats"}]
matcher.add("LOVE CATS", None, pattern)

pattern = [{"TEXT": "very", "OP": "+"},{"TEXT":"happy"}]
matcher.add("VERY_HAPPY", None, pattern)

doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [52]:
# Token-based matcher for the two-token phrase "golden retriever" (case-insensitive).
matcher = Matcher(nlp.vocab)
matcher.add("DOG", None, [{"LOWER": "golden"}, {"LOWER": "retriever"}])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span: ", span.text)
    # The span root and its syntactic head come from the dependency parse.
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Only look at the preceding token when one exists: with start == 0,
    # doc[start - 1] would be doc[-1], i.e. the LAST token of the doc.
    if start > 0:
        previous_token = doc[start - 1]
        print("Previous token:", previous_token.text, previous_token.pos_)

Matched span:  Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


In [53]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add("DOG", None, pattern)
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)

Matched span: Golden Retriever


In [54]:
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [55]:
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f810293ce50>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f8182084220>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f81820846a0>)]


In [56]:
def custom_component(doc):
    """Pass-through pipeline component: receives a doc and hands it back.

    Any processing of the doc would be done here before returning it.
    """
    return doc

nlp.add_pipe(custom_component)

### Components
    last - If True, add last         nlp.add_pipe(component, last=True)
    first - If True, add first       nlp.add_pipe(component, first=True)
    before - Add before component    nlp.add_pipe(component, before="ner")
    after - Add after component      nlp.add_pipe(component, after="tagger")
    
"ner" = named entity recognizer

In [58]:
nlp = spacy.load("en_core_web_sm")

def custom_component(doc):
    """Report the length of the doc on stdout, then return the doc unchanged."""
    token_count = len(doc)
    print("Doc length:", token_count)
    return doc

nlp.add_pipe(custom_component, first=True)
print("Pipeline:", nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [59]:
doc = nlp("Hello world!")

Doc length: 3


In [None]:
# Test exercise adding a phrase matcher to the nlp pipeline.
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# Turn each animal name into a Doc so it can be used as a phrase pattern
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    """Label every phrase-matcher hit in the doc as an "ANIMAL" entity.

    Replaces whatever is in doc.ents with the matched spans and returns
    the same doc so the pipeline can continue.
    """
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
# Use label_ (the string name); label without the underscore is the integer hash
print([(ent.text, ent.label_) for ent in doc.ents])

In [61]:
from spacy.tokens import Doc, Token, Span

# Set meta data. Define attributes globally first.

Doc.set_extension("title", default=None)
Token.set_extension("is_color", default=False)
Span.set_extension("has_color", default=False)

In [None]:
# Attributes and meta data
doc._.title='My Document'
token._.is_color = True
span._.has_color = False

In [64]:
# Attribute extensions

doc = nlp("The sky is blue.")
doc[3]._.is_color = True

Doc length: 5


In [65]:
doc[3]._.is_color

True

In [67]:
from spacy.tokens import Token

def get_is_color(token):
    """Getter for the Token extension: is the token's text a known color word?

    The comparison is exact (case-sensitive) against 'red', 'yellow', 'blue'.
    """
    known_colors = ('red', 'yellow', 'blue')
    return any(token.text == color for color in known_colors)

Token.set_extension("is_color", getter=get_is_color, force=True)
doc = nlp("The sky is red")
print(doc[3]._.is_color, "-", doc[3].text)

Doc length: 4
True - red


In [69]:
def get_has_color(span):
    """Getter for the Span extension: does any token's text name a color?

    Returns True on the first token whose text is exactly "red", "yellow"
    or "blue"; False when no token matches (including an empty span).
    """
    color_words = ("red", "yellow", "blue")
    for token in span:
        if token.text in color_words:
            return True
    return False

Span.set_extension("has_color", getter=get_has_color, force=True)
doc = nlp("The sky is yellow")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

Doc length: 4
True - sky is yellow
False - The sky


In [70]:
def has_token(doc, token_text):
    """Method for the Doc extension: is there a token whose text equals token_text?

    Returns True if at least one token in the doc has exactly that text,
    otherwise False.
    """
    for token in doc:
        if token.text == token_text:
            return True
    return False

Doc.set_extension("has_token", method=has_token, force=True)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

Doc length: 5
True - blue
False - cloud


In [None]:
# Find if there is a number in a doc

from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# Define the getter function
def get_has_number(doc):
    """Getter for the Doc extension: does any token look like a number?

    Returns True as soon as one token reports a truthy like_num, and
    False when none does.
    """
    for token in doc:
        if token.like_num:
            return True
    return False


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension("has_number", getter=get_has_number, force=True)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)

In [None]:
# Transform a span into a new version

from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

# Define the method
def to_html(span, tag):
    """Return the span's text enclosed in an opening and closing HTML tag.

    Example: tag "strong" and text "Hello" give "<strong>Hello</strong>".
    """
    template = "<{0}>{1}</{0}>"
    return template.format(tag, span.text)


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

In [None]:
# Finds entities and then turns them into Wikipedia URLs

import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    """Build a Wikipedia search URL for a named-entity span.

    Args:
        span: Object exposing ``label_`` (entity label string) and ``text``.

    Returns:
        The Wikipedia search URL for the span text, with spaces replaced by
        underscores, when the label is one of the linkable labels;
        otherwise None.
    """
    # "LOC" is the label the spaCy English models emit for locations;
    # "LOCATION" is kept so spans labeled that way keep working.
    linkable_labels = ("PERSON", "ORG", "GPE", "LOC", "LOCATION")
    if span.label_ not in linkable_labels:
        return None
    entity_text = span.text.replace(" ", "_")
    return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

In [None]:
# Needs capitals and countries to be loaded, but will find countries and print capitals

import json

# Imported here so the cell does not depend on names left over from earlier cells
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span


with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/en/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    """Set doc.ents to one "GPE" span per phrase-matcher hit and return the doc."""
    # Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)


def get_capital(span):
    """Getter that looks up the span text in CAPITALS; None when it is absent."""
    return CAPITALS.get(span.text)


# Register the Span extension attribute "capital" with the getter get_capital
Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

In [None]:
# BAD

docs = [nlp(text) for text in LOTS_OF_TEXTS]

# GOOD

docs = list(nlp.pipe(LOTS_OF_TEXTS))

In [74]:
Doc.set_extension("id", default=None, force=True)
Doc.set_extension("page_number", default=None, force=True)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number":16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

Doc length: 4
Doc length: 3


In [None]:
# Using only the tokenizer

# BAD

doc = nlp("Hello World")

# GOOD

doc = nlp.make_doc("Hello world!")

In [75]:
# Disable tagger and parser

# NOTE: `text` must already hold the string to process; no earlier cell in view
# defines it, which is why the recorded output below is a NameError.
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)

NameError: name 'text' is not defined

In [None]:
# Efficient way of pulling out tokens from lots of docs.

import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])

In [None]:
# How to set multiple pieces of meta data quickly

import json
from spacy.lang.en import English
from spacy.tokens import Doc

with open("exercises/en/bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = English()

# Register the Doc extension "author" (default None)
Doc.set_extension("author", default=None)

# Register the Doc extension "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

In [None]:
# Pattern matcher for iPhones labeled 'GADGET'

import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    # One list of matched spans is printed per text
    print([doc[start:end] for match_id, start, end in matcher(doc)])

In [None]:
# Creates labeled training data based on pre-made matcher

import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")

In [77]:
# spaCy lets you build your own model pathways

In [None]:
TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
]

In [None]:
# Sketch of the basic training loop.
# NOTE: `random` and `path_to_model` are not defined by any earlier cell in
# view; they must exist before this cell can run.
# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)

In [None]:
# Start with blank English model
nlp = spacy.blank("en")
# Create blank entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add a new label
ner.add_label("GADGET")

# Start the training
nlp.begin_training()
# Train for 10 iterations
for itn in range(10):
    random.shuffle(examples)
    # Divide examples into batches
    for batch in spacy.util.minibatch(examples, size=2):
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

In [None]:
# You should be minimizing the loss function. Ideal sample size is several hundred or a few thousand examples

import spacy
import random
import json

with open("exercises/en/gadgets.json", encoding="utf8") as f:
    TRAINING_DATA = json.loads(f.read())

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)

# Problem 1:

Models can "forget" things. When updating a model with new examples, make sure to mix in examples of what it previously labeled correctly.

In [None]:
# BAD:

TRAINING_DATA = [
    ("Reddit is a website", {"entities": [(0, 6, "WEBSITE")]})
]

# GOOD:

TRAINING_DATA = [
    ("Reddit is a website", {"entities": [(0, 6, "WEBSITE")]}),
    ("Obama is a person", {"entities": [(0, 5, "PERSON")]})
]

# Problem 2:

Predictions are made based on local context. The label scheme should be consistent, and the less specific the better: stick with general labels.

In [None]:
# BAD:

LABELS = ["ADULT_SHOES", "CHILDRENS_SHOES", "BANDS_I_LIKE"]

# GOOD

LABELS = ["CLOTHING", "BAND"]

In [None]:
TRAINING_DATA = [
    (
        "i went to amsterdem last year and the canals were beautiful",
        {"entities": [(10, 19, "GPE")]},
    ),
    (
        "You should visit Paris once in your life, but the Eiffel Tower is kinda boring",
        {"entities": [(17, 22, "GPE")]},
    ),
    (
        "There's also a Paris in Arkansas, lol",
        {"entities": [(15, 20, "GPE"), (24, 32, "GPE")]},
    ),
    (
        "Berlin is perfect for summer holiday: lots of parks, great nightlife, cheap beer!",
        {"entities": [(0, 6, "GPE")]},
    ),
]

In [None]:
TRAINING_DATA = [
    (
        "Reddit partners with Patreon to help creators build communities",
        {"entities": [(0, 6, "WEBSITE"), (21, 28, "WEBSITE")]},
    ),
    ("PewDiePie smashes YouTube record", {"entities": [(0, 9, "PERSON"), (18, 25, "WEBSITE")]}),
    (
        "Reddit founder Alexis Ohanian gave away two Metallica tickets to fans",
        {"entities": [(0, 6, "WEBSITE"), (15, 29, "PERSON")]},
    ),
    # And so on...
]