In [1]:
import spacy

In [2]:
import re

In [4]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [None]:
# Literal "Paul", one space, then a capitalised word: [A-Z] is the initial
# and \w+ requires at least one more word character after it.
pattern = r"Paul [A-Z]\w+"

In [7]:
# re.finditer yields one match object per hit; printing a match object shows
# its character span and the matched text.
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [8]:
from spacy.tokens import Span

In [16]:
# Tokenizer-only English pipeline: it has no NER component, so the Doc it
# produces starts out with an empty doc.ents.
nlp = spacy.blank("en")
doc = nlp(text)

# Keep whatever entities the Doc already carries (none for a blank pipeline).
original_ents = list(doc.ents)

# Translate every regex hit from character offsets to token offsets.
# doc.char_span() gives None when a hit does not line up with token
# boundaries; such hits are left out.
mwt_ents = [
    (span.start, span.end, span.text)
    for span in (doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text))
    if span is not None
]

# Turn each (token_start, token_end, text) triple into a PERSON-labelled Span.
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)

# Write the combined entity list back onto the Doc.
doc.ents = original_ents

# Show the resulting entities with their labels.
for ent in doc.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


The code processes a text string using a blank spaCy English model to tokenize it, applies a regular expression pattern (e.g., r"Paul [A-Z]\w+") to identify specific patterns like names, and converts these matches into spaCy Span objects labeled as "PERSON" entities. It stores these entities in a list and assigns them to the Doc’s ents attribute, effectively performing custom named entity recognition (NER) for patterns matching the regex. Finally, it prints the recognized entities. This is useful for extracting specific entities (e.g., names) from text when a pre-trained NER model is not available or suitable.

In [12]:
# Inspect the (token_start, token_end, text) triples collected from the regex hits.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [20]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Tag "Paul <Capitalised-word>" mentions as PERSON entities.

    Runs a regex over ``doc.text``, maps each hit from character offsets to a
    token-aligned span, and merges the resulting PERSON spans into ``doc.ents``.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` updated.
    """
    # Imported locally so this component does not depend on an import made in
    # a later cell.
    from spacy.util import filter_spans

    pattern = r"Paul [A-Z]\w+"
    original_ents = list(doc.ents)
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span returns None when the hit does not align with token
        # boundaries; those hits are skipped.
        span = doc.char_span(start, end)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
    for start, end, _name in mwt_ents:
        original_ents.append(Span(doc, start, end, label="PERSON"))
    # doc.ents rejects overlapping spans, so resolve overlaps first. Without
    # this, the component fails when an earlier component has already tagged
    # an overlapping span. filter_spans keeps the (first) longest span, so
    # entities set upstream win ties against the regex spans.
    doc.ents = filter_spans(original_ents)
    return doc
    


The code defines a custom spaCy pipeline component named "paul_ner" using the @Language.component decorator. It processes a doc object by:

1. Saving the original named entities from doc.ents into original_ents.
2. Initializing an empty list mwt_ents to store matched spans.
3. Using a regex pattern to find matches in the text, converting each match’s character span to a token-based Span, and adding valid spans (start index, end index, text) to mwt_ents.
4. Converting each matched span into a new Span labeled as "PERSON" and appending it to original_ents.
5. Updating doc.ents with the new list of entities.
6. Returning the modified doc for use in the spaCy pipeline.

This effectively adds custom entity recognition for patterns matching the regex (e.g., names like "Paul Smith").


In [21]:
# Second tokenizer-only pipeline; the custom component is added by the name
# it was registered under with @Language.component.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [22]:
# Run the pipeline on the sample text. "paul_ner" is the only component added
# to this blank pipeline, so every entity printed here comes from it.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [28]:
from spacy.language import Language
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Tag token-aligned occurrences of "Hollywood" as CINEMA entities.

    The function carries its own name rather than reusing ``paul_ner``, so
    defining it no longer rebinds the PERSON component defined earlier in the
    notebook. The registered component name, "cinema_ner", is unchanged.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` replaced by the overlap-free
        combination of the existing entities and the new CINEMA spans.
    """
    pattern = r"Hollywood"
    cinema_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span returns None when the hit does not align with token
        # boundaries; those hits are skipped.
        span = doc.char_span(start, end, label="CINEMA")
        if span is not None:
            cinema_ents.append(span)
    # Existing entities go first: filter_spans keeps the (first) longest span
    # on overlap, so a "Hollywood" hit inside a longer upstream entity is
    # dropped instead of making the doc.ents assignment fail.
    doc.ents = filter_spans(list(doc.ents) + cinema_ents)
    return doc
    


In [29]:
# Load the pretrained small English pipeline and add the custom component by
# its registered name.
# NOTE(review): no position argument is given; presumably the component lands
# after the built-in NER — confirm with nlp3.pipe_names.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.paul_ner(doc)>

In [30]:
# Run the combined pipeline and list every entity with its label.
# NOTE(review): the recorded output contains no CINEMA entity. "Hollywood"
# sits inside the longer "Paul Hollywood" PERSON span, which the filter_spans
# call in the component presumably keeps instead — confirm.
doc3 = nlp3(text)
for ent in doc3.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
