In [10]:
import spacy

# Spacy Best Practices
1. If need to output strings, convert to strings as late as possible to main complex relationships with hashs, etc.
2. Use token attributes whenever possible. E.g. token.pos_, token.text, etc.

# Training Best Practices

1. Model can forget certain things it already learned - for example, if it learned to identify people and now you are teaching it how to identify a website, it might forget how to identify a person. A solution to this is to constantly mix in examples of what it previously learned so that it can be reinforced in the model's "brain"
2. Label scheme needs to be consistent and not overly specific --> It can be difficult for a model to learn to identify adult clothing or children's clothing. Having the model identify just clothing would be better.


In [3]:
## BAD CODE
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Get all tokens and part-of-speech tags
token_texts = [token.text for token in doc] #BAD
pos_tags = [token.pos_ for token in doc] #BAD

#BAD - does not use token attributes
for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)

Found proper noun before a verb: Berlin


In [4]:
## GOOD CODE
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Get all tokens and part-of-speech tags
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc] #GOOD - uses token attributes

for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)

Found proper noun before a verb: Berlin


# Basics

In [6]:
from spacy.lang.en import English

nlp = English()

doc = nlp("Hello World")

for token in doc:
    print(token.text)

word = doc[1]
print(word)

chunk = doc[0:]
print("chunk", chunk)


Hello
World
World
chunk Hello World


In [7]:
# breaking down a sentence
doc = nlp("it costs $5.")
print("Índex: ", [token.i for token in doc])
print("Text: ", [token.text for token in doc])
print("Is_alpha", [token.is_alpha for token in doc]) #is_alpha means does characters at certain index consist of members of alphabet
print("is_punct", [token.is_punct for token in doc])
print("like_num", [token.like_num for token in doc])

Índex:  [0, 1, 2, 3, 4]
Text:  ['it', 'costs', '$', '5', '.']
Is_alpha [True, True, False, False, False]
is_punct [False, False, False, False, True]
like_num [False, False, False, True, False]


In [8]:
# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. " "Now less than 4% are.")

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i+1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


# Statistical Models
### Allow for text to be interpreted in context

In [9]:
nlp = spacy.load("en_core_web_sm") #small language model that allows for English lanuage to be understood

"""
en_core_web_sm
Statistical model allow you to generalize based on a set of training examples. 
Once they’re trained, they use binary weights to make predictions. 
"""

'\nen_core_web_sm\nStatistical model allow you to generalize based on a set of training examples. \nOnce they’re trained, they use binary weights to make predictions. \n'

In [10]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

#always add _ to get string representation

# {___:<12} 12 character gap between each value, formatting
for token in doc:
    print(f"{token.text:<12} {token.lemma_:<10} {token.pos_:<10}{token.tag_:<10}{token.dep_:<10} {token.shape_:<10} {token.is_alpha :<10} {token.is_stop :<10}")

Apple        apple      PROPN     NNP       nsubj      Xxxxx      1          0         
is           be         VERB      VBZ       aux        xx         1          1         
looking      look       VERB      VBG       ROOT       xxxx       1          0         
at           at         ADP       IN        prep       xx         1          1         
buying       buy        VERB      VBG       pcomp      xxxx       1          0         
U.K.         u.k.       PROPN     NNP       compound   X.X.       0          0         
startup      startup    NOUN      NN        dobj       xxxx       1          0         
for          for        ADP       IN        prep       xxx        1          1         
$            $          SYM       $         quantmod   $          0          0         
1            1          NUM       CD        compound   d          0          0         
billion      billion    NUM       CD        pobj       xxxx       1          0         


In [11]:
# doc.ents will return all real world objects that are assigned a name
#e.g. person, name, org, country

for ent in doc.ents:
    print(ent.text, ent.label_) #label_ says type of real world object

Apple ORG
U.K. GPE
$1 billion MONEY


In [12]:
spacy.explain("GPE")

'Countries, cities, states'

# Rule-Based Matching
Allows us to create patterns that will find matches inside the text

In [13]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
doc = nlp("2018 FIFA World Cup: France won!")

In [14]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

In [15]:
matcher.add("PATTERN", None, pattern)
matches = matcher(doc)

In [16]:
for match_id, start, end in matches:
    matched_span = doc[start:end] # start and end are character numbers that show where the match is
    print(matched_span.text)

2018 FIFA World Cup:


In [17]:
# patterns that require NLP
pattern_2 = [
    {"LEMMA:", "love", "POS:", "VERB"},
    {"POS": "NOUN"}
]

doc = nlp("I loved dogs but now I love cats more.")

In [18]:
matcher.add("PATTERN_2", None, pattern_2)
matches = matcher(doc) 

#matches would be loved dogs and love cats

AttributeError: 'set' object has no attribute 'items'

In [19]:
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [20]:
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns

# "OP" --> define how often a token pattern should be matched:
# ! -> Negate the pattern, by requiring it to match exactly 0 times.
# ? -> Make the pattern optional, by allowing it to match 0 or 1 times.
# + -> Require the pattern to match 1 or more times.
# * -> Allow the pattern to match 0 or more times.

pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 4
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice responses


In [21]:
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

doc2 = nlp("According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14.")

# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"IS_ALPHA": False} , {"LOWER": "free"} ,{"POS": "NOUN"}] # ad-free is stored as ad,-,free

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


# Phrase-Based Matching

Takes doc objects as patterns <br>
More efficient than Rule-Based <br>
Great for matching lists of large words <br>

In [22]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever") #CASE-SENSITIVE!!!
pattern2 = nlp("golden retriever")

matcher.add("DOG", None, pattern)
matcher.add("DOG2", None, pattern2)

doc = nlp("I have a golden retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched Span: ", span.text)

Matched Span:  golden retriever


# Shared Vocabulary

1. Add any new work to Spacy shared vocab (nlp.vocab.strings)
2. Automatically generates a hash value for this word using a hash function, which can be reversed to get original string, as shown below

In [23]:
print(nlp.vocab.strings["Alzeheimer's"])

1214040188179850886


In [24]:
print(nlp.vocab.strings["Alzheimer"])

6081433347173383943


In [25]:
doc = nlp("I love coffee")
print("hash value: ", nlp.vocab.strings["coffee"])
print("string value: ", nlp.vocab.strings[nlp.vocab.strings["coffee"]])

hash value:  3197928453018144401
string value:  coffee


In [26]:
lexeme = nlp.vocab["coffee"] #lexeme holds all attributes about a certain word
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


# Doc + Span Class

Manually creating spaCy objects

In [27]:
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
label = nlp.vocab.strings["PERSON"]
span = Span(doc, 2, 4, label=label)
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = list(doc.ents) + [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


# Word Vectors

Need medium or large models that has word vectors

Allow for comparing objects and predicting similarity

In [28]:
nlp = spacy.load("en_core_web_md")
doc1 = nlp("I'm hungry")
doc2 = nlp("I'm starving")
print(doc1.similarity(doc2))

0.955339749893996


In [29]:
span = doc1[1:3]
print(token)
print(doc1.similarity(span))

billion
0.9368200277285751


In [30]:
print(doc1[2].vector) #getting word vector

[-2.5519e-01 -4.5706e-01  1.5926e-01 -2.8942e-01 -1.1358e-01  4.0340e-01
  2.2152e-01  2.1713e-01  3.2619e-01  2.0210e+00 -1.4097e-01 -5.6283e-02
 -9.5612e-02 -5.6380e-01 -1.8752e-01 -5.8578e-02 -8.6614e-02  3.3660e-01
  3.3917e-02  3.4581e-01 -8.0374e-03 -6.2276e-01 -3.4314e-01 -5.4515e-01
 -2.2171e-01 -3.8986e-01  2.0809e-01  1.9913e-01  1.6447e-01 -5.2185e-01
 -6.8712e-01 -6.3531e-01  5.3983e-02 -4.4942e-01  5.0645e-01  1.2429e-01
 -1.0032e-01 -1.1886e-01 -3.4388e-01  5.7359e-01 -2.2550e-01 -3.3255e-01
 -1.9401e-02 -3.4872e-01 -2.8209e-01 -2.3237e-01 -5.5767e-02  7.1624e-01
 -1.8586e-01  3.2787e-02 -4.4107e-01  2.6368e-01 -4.5362e-01 -2.7263e-01
  1.5438e-01  2.3338e-01 -3.1650e-01  4.6242e-02  2.3259e-01 -9.2897e-03
 -2.8470e-01 -3.2090e-01  2.5173e-01 -8.8286e-01 -2.1064e-01 -8.7721e-01
  5.7446e-01 -6.1218e-04  1.2941e-01  1.3231e-01 -3.9421e-02  2.7136e-01
  1.6217e-01 -6.0974e-01 -4.0965e-02  1.3208e-01 -4.9916e-02  2.8669e-01
  7.3196e-02  1.9804e-01 -7.7658e-03  3.6383e-01  7

# Processing Pipelines

Functions applied to doc to add pos tags, entities, etc.

![title](Images/parts_of_processing_pipeline.png)


In [32]:
print(nlp.pipe_names)
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.Tagger object at 0x0C5D4FD0>), ('parser', <spacy.pipeline.DependencyParser object at 0x0A8C32A0>), ('ner', <spacy.pipeline.EntityRecognizer object at 0x0C5F3420>)]


## Custom Pipeline Components

Spacy supports base set of piplines (parser, tagger, entity recognizer), and also has ways to add your own 

### Options on where to add component
1. last = True, adds component last
2. first = True, adds component first
3. before = "ner", adds component before ner component
4. after = "ner", adds component after ner

In [33]:
nlp = spacy.load("en_core_web_sm")

def custom_component(doc):
    print("Length: ", len(doc))
    return doc

nlp.add_pipe(custom_component, first = True)

doc = nlp("Hello World!")

Length:  3


In [53]:

animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label=nlp.vocab.strings("ANIMAL")) for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

TypeError: an integer is required

# Optimizing Spacy Pipelines

## Processing Large Volumes of Text

1. Use nlp.pipe method

```py
#BAD

docs = [nlp(text) for text in LOTS_OF_TEXTS]

#GOOD

docs = list(nlp.pipe(LOTS_OF_TEXTS))
```

2. Instead of running whole pipeline every time (which can be time-consuming), you can use methods that will do only what you need instead of the whole thing.

```py
doc = nlp("Hello World") #whole thing
doc = nlp.make_doc("Hello World") #makes only tokenized doc object, no ner or any other components of the pipeline

# you can also disable certain components:

with nlp.disable_pipes("tagger", "parser"):
    doc = nlp(text)
    print(doc.ents)
    
#disabled components are restored after with block is run
```

In [93]:
# nlp.pipe also allows you to pass in data through tuple format (text, context) tuples, yieldss (doc, context tuples)
data = [
    ("This is some text", {"id": 1, "page_num": 15}),
    ("Add another text", {"id": 2, "page_num": 16})
]

for doc, context in nlp.pipe(data, as_tuples = True):
    print(doc.text, context["page_num"])

This is some text 15
Add another text 16


# Custom Extension Attributes

Allows you to add custom metadata to documents, tokens, and spans

1. Attribute Extensions - set default val that can be overriden
2. Property Extensions - def get/set methods, getter function is called wehn you retrieve the attribute value,, span extensions should mostly always use a a getter
3. Method extensions make the attribute a callable method

In [70]:
doc = nlp("The sky is blue.")

from spacy.tokens import Doc, Token, Span

## attribute extensions

Doc.set_extension("title", default = None, force = True)
Token.set_extension("is_color", force = True, getter = get_is_color) #cant do default and getter within the same extension
Span.set_extension("has_color", default = False, force = True)

doc._.title = "My text"
token._.is_color = True
span._.has_color = False

doc[3]._.is_color = True #overriding default val of false

In [71]:
#getters

def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

print(doc[3]._.is_color, "->", doc[3].text)

True -> blue


In [87]:
## using getters with a span

def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", force = True, getter = get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

In [88]:
#method extensions
def has_token(doc, token_text):
    return token_text in[token.text for token in doc]

Doc.set_extension("has_token", force = True, method = has_token)

doc = nlp("The sky is blue")
print(doc._.has_token("blue"), " -> blue")
print(doc._.has_token("cloud"), " -> cloud")

True  -> blue
False  -> cloud


In [90]:
#method extensions

# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", force = True, method = to_html)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

<strong>Hello world</strong>


# Creating Training Data for Model to Learn Generalizations Specific to Project

In [8]:
TEXTS = [
  "How to preorder the iPhone X",
  "iPhone X is coming",
  "Should I pay $1,000 for the iPhone X?",
  "The iPhone 8 reviews are here",
  "iPhone 11 vs iPhone 8: What's the difference?",
  "I need a new phone! Any tips?"
]

import json
from spacy.matcher import Matcher
from spacy.lang.en import English

nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])

[iPhone X]
[iPhone X]
[iPhone X]
[iPhone 8]
[iPhone 11, iPhone 8]
[]


In [9]:
## Generating Training Data
nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
("iPhone 11 vs iPhone 8: What's the difference?", {'entities': [(0, 9, 'GADGET'), (13, 21, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


# Creating New Training Pipeline and Running Model on data

In [13]:
## Creating a new training pipeline for Entity Recoginizing

import random

data = [
    ['How to preorder the iPhone X', {'entities': [[20, 28, 'GADGET']]}], 
    ['iPhone X is coming', {'entities': [[0, 8, 'GADGET']]}], 
    ['Should I pay $1,000 for the iPhone X?', {'entities': [[28, 36, 'GADGET']]}], 
    ['The iPhone 8 reviews are here', {'entities': [[4, 12, 'GADGET']]}], 
    ['Your iPhone goes up to 11 today', {'entities': [[5, 11, 'GADGET']]}], 
    ['I need a new phone! Any tips?', {'entities': []}]
]

# Create a blank "en" model
nlp = spacy.blank("en")
nlp.vocab.vectors.name = 'spacy_pretrained_vectors'

# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

# Add the label "GADGET" to the entity recognizer
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)

{'ner': 4.506747016371719}
{'ner': 0.9478891206886438}
{'ner': 0.0015377429287959771}
{'ner': 8.57005368438091e-07}
{'ner': 1.1351266841395656e-10}
{'ner': 2.6114565732663255e-11}
{'ner': 7.87826427000158e-12}
{'ner': 4.708774008385329e-12}
{'ner': 3.4910373842099664e-12}
{'ner': 2.5390248219380898e-12}
