---
title: "Spacy 101"
format: html
date: 2024-09-06
description: ''
---

Notes from <https://spacy.io/usage/spacy-101>

In [17]:
# !pip install spacy tabulate
import spacy
from spacy import displacy
from tabulate import tabulate

# Load the `en_core_web_sm` pipeline and run it over one sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Apple is looking at buying a UK startup for $1 billion")

# Iterating over the processed document yields its tokens; show them as a list.
print(list(doc))

[Apple, is, looking, at, buying, a, UK, startup, for, $, 1, billion]


In [18]:
# Render displaCy's named-entity ("ent") visualization for the sample `doc`.
displacy.render(doc, style="ent")

In [19]:
# Render displaCy's dependency-parse ("dep") visualization for the sample `doc`.
displacy.render(doc, style="dep")

In [20]:
# Column titles for the per-token attribute table.
headers = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]

# One row per token in `doc`, with values in the same order as `headers`.
data = [
    [
        tok.text,
        tok.lemma_,
        tok.pos_,
        tok.tag_,
        tok.dep_,
        tok.shape_,
        tok.is_alpha,
        tok.is_stop,
    ]
    for tok in doc
]

print(tabulate(data, headers=headers, tablefmt="grid"))

+---------+---------+-------+-------+----------+---------+------------+-----------+
| text    | lemma   | pos   | tag   | dep      | shape   | is_alpha   | is_stop   |
+=========+=========+=======+=======+==========+=========+============+===========+
| Apple   | Apple   | PROPN | NNP   | nsubj    | Xxxxx   | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| is      | be      | AUX   | VBZ   | aux      | xx      | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| looking | look    | VERB  | VBG   | ROOT     | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| at      | at      | ADP   | IN    | prep     | xx      | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| buying  | buy     | VERB  | VBG   | pcomp    | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----

### Named Entities


In [21]:
# Column titles for the named-entity table.
headers = ["text", "start", "end", "label"]

# One row per entity span in `doc.ents`: its text, its start/end offsets
# within the doc, and its entity label.
data = [
    [entity.text, entity.start, entity.end, entity.label_]
    for entity in doc.ents
]

print(tabulate(data, headers=headers, tablefmt="grid"))

+------------+---------+-------+---------+
| text       |   start |   end | label   |
+============+=========+=======+=========+
| Apple      |       0 |     1 | ORG     |
+------------+---------+-------+---------+
| UK         |       6 |     7 | GPE     |
+------------+---------+-------+---------+
| $1 billion |       9 |    12 | MONEY   |
+------------+---------+-------+---------+



<img src="https://spacy.io/images/pipeline.svg" alt="Spacy Pipeline" width="50%">

<!-- ### Architecture -->
<!-- <img src="https://spacy.io/images/architecture.svg" alt="Spacy Pipeline" width="50%"> -->



### Custom matcher for NER
Let's create a custom matcher to identify which franchise a movie belongs to.

In [37]:
from spacy.matcher import PhraseMatcher
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")

# Movie titles to recognise, grouped by franchise.
bond_movies = [
    "Casino Royale",
    "Quantum of Solace",
    "Skyfall",
    "Spectre",
    "No Time To Die",
]
star_wars_movies = [
    "The Phantom Menace",
    "Attack of the Clones",
    "Revenge of the Sith",
    "A New Hope",
    "The Force Awakens",
    "The Last Jedi",
    "The Rise of Skywalker",
]

# Turn every title into a Doc with nlp.make_doc so it can be used as a phrase pattern.
bond_patterns = list(map(nlp.make_doc, bond_movies))
star_wars_patterns = list(map(nlp.make_doc, star_wars_movies))

# Register each franchise's patterns under its own match label.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("BOND_MOVIE", bond_patterns)
matcher.add("STAR_WARS_MOVIE", star_wars_patterns)

# Sample sentences to scan for known titles.
texts = [
    "I watched No Time To Die last night in India. Great movie!",
    "The Last Jedi is an American movie. I watched it in 2019.",
]

# Process each sentence and report every matched title with its franchise label.
for text in texts:
    doc = nlp(text)
    for match_id, start, end in matcher(doc):
        # match_id is a hash; look the label string back up in the vocab's string store.
        label = nlp.vocab.strings[match_id]
        title = doc[start:end].text
        print(f"Entity: '{title}'\t Label: '{label}'")


Entity: 'No Time To Die'	 Label: 'BOND_MOVIE'
Entity: 'The Last Jedi'	 Label: 'STAR_WARS_MOVIE'
