## Combining extensions with pipeline components

Extension attributes can be combined with custom pipeline components.

Demonstration:
Build a phrase matcher that contains every country name and is available as the variable `matcher`.

In [2]:
import json
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Load the lookup data. The encoding is passed explicitly so that non-ASCII
# names decode the same way on every platform instead of depending on the
# locale's default encoding.
with open("countries.json", encoding="utf8") as f:
    # Passed to nlp.pipe below, so this is expected to be an iterable of
    # strings (presumably country names -- confirm against the data file).
    COUNTRIES = json.load(f)

with open("capitals.json", encoding="utf8") as f:
    # Queried later in the notebook with .get(span.text), i.e. a mapping
    # keyed by the matched text.
    CAPITALS = json.load(f)

# English() provides the vocab and tokenizer used to turn each country string
# into a Doc pattern for the phrase matcher.
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
# Register every tokenized country as a pattern under the single key
# "COUNTRY"; the second argument (None) is the on-match callback slot.
matcher.add("COUNTRY", None, *nlp.pipe(COUNTRIES))

In [3]:
def _longest_non_overlapping(matches):
    """Reduce matcher hits to non-overlapping ``(start, end)`` token ranges.

    ``matches`` is an iterable of ``(match_id, start, end)`` tuples. Longer
    hits win over shorter ones that share a token; ties go to the hit that
    starts earlier. Exact duplicates collapse to one range. The result is
    sorted by start position.
    """
    kept = []
    used_tokens = set()
    # Visit the longest hits first so that a short hit nested inside a longer
    # one is rejected.
    by_length = sorted(matches, key=lambda m: (m[2] - m[1], -m[1]), reverse=True)
    for _match_id, start, end in by_length:
        token_range = range(start, end)
        if used_tokens.isdisjoint(token_range):
            kept.append((start, end))
            used_tokens.update(token_range)
    return sorted(kept)


def countries_component(doc):
    """Label every country found by ``matcher`` as a 'GPE' entity on ``doc``.

    The previous value of ``doc.ents`` is replaced. Overlapping hits (one
    pattern contained in, or sharing tokens with, another) are reduced to the
    longest non-overlapping ones first, because spaCy raises an error when
    overlapping spans are assigned to ``doc.ents``.

    Returns the same ``doc`` so the function can be used as a pipeline
    component.
    """
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE")
        for start, end in _longest_non_overlapping(matches)
    ]
    return doc

In [5]:
# Add the component to the end of the pipeline. A copy left over from an
# earlier run of this cell is removed first, so the cell can be re-executed
# without a "name already exists" error and always installs the current
# definition of countries_component.
if "countries_component" in nlp.pipe_names:
    nlp.remove_pipe("countries_component")
nlp.add_pipe(countries_component, last=True)
print(nlp.pipe_names)


def get_capital(span):
    """Return the CAPITALS entry for the span's text, or None if there is none."""
    return CAPITALS.get(span.text)


# Expose the getter as the custom attribute span._.capital.
# force=True overwrites a registration from an earlier run of this cell
# instead of raising.
Span.set_extension("capital", getter=get_capital, force=True)

['countries_component']


In [6]:
# Run the pipeline on a sample sentence, then show each entity's text,
# label and the capital resolved through the custom `capital` extension.
doc = nlp("Czech Republic may help Slovakia protect its airspace")
entity_summary = [
    (entity.text, entity.label_, entity._.capital)
    for entity in doc.ents
]
print(entity_summary)

[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]
