In [9]:
import spacy

# Load the "en_core_web_sm" pipeline once; the cells below reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Each entry pairs a text with a context dict of metadata about that text.
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

# With as_tuples=True the pipeline takes (text, context) pairs and hands back
# (doc, context) pairs, so the metadata stays attached to its document.
doc_context_pairs = nlp.pipe(data, as_tuples=True)
for doc, context in doc_context_pairs:
    page_number = context["page_number"]
    print(f"{doc.text} {page_number}")

This is a text 15
And another text 16


In [11]:
from spacy.tokens import Doc

# Register the custom attributes that will hold each text's metadata.
# force=True overwrites an existing registration, so re-running this cell
# does not raise because the extension already exists.
Doc.set_extension("id", default=None, force=True)
Doc.set_extension("page_number", default=None, force=True)

# (text, context) pairs: the context dict travels alongside its text.
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

# Keep the processed docs; otherwise each one is dropped as soon as the loop
# moves on and the metadata attached to it can never be read back.
docs = []
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]
    docs.append(doc)

In [17]:
# When only the entity recognizer's output is needed, the tagger and parser
# can be switched off for the duration of a single block.
pipes_to_skip = ["tagger", "parser"]
with nlp.select_pipes(disable=pipes_to_skip):
    # Inside the block, only the components that were not disabled are run.
    doc = nlp("This is yet another text")
    print(doc.ents)

# Leaving the `with` block automatically restores the disabled components.

()
