# Advanced RegEx

In [2]:
import re

In [3]:
# Sample sentence: "Paul" appears three times, twice followed by a capitalized surname.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [4]:
# Literal "Paul", a space, one uppercase ASCII letter, then one or more word characters.
pattern = r"Paul [A-Z]\w+"

The `[A-Z]` matches the capital first letter of the last name, and the `\w+` matches the one or more word characters that follow it (the rest of the last name).

In [5]:
# Lazily iterate over every regex hit in the sample text and show each
# match object (its repr includes the character span and matched text).
matches = re.finditer(pattern, text)
for found in matches:
    print(found)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


**The `span` in each match above gives the start and end indices of the matched characters in the original string.**

In [6]:
import spacy
from spacy.tokens import Span

### Using `match.span()` and `doc.char_span`

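We use `match.span()` to get the character offsets of each regex match, and `doc.char_span` to convert those character offsets into a spaCy `Span` aligned to token boundaries (it returns `None` when the offsets do not line up with tokens).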

In [9]:
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)

# Stage 1: regex hits -> token-aligned spans.
# Every regex match is echoed; only matches that char_span can align to
# token boundaries are recorded, as (token_start, token_end, text) triples.
mwt_ents = []  # multi-word token entities
print("These are the character spans")
for hit in re.finditer(pattern, doc.text):
    char_start, char_end = hit.span()
    aligned = doc.char_span(char_start, char_end)
    print(hit)
    if aligned is None:
        continue
    mwt_ents.append((aligned.start, aligned.end, aligned.text))

# Stage 2: build a "PERSON" Span from each recorded triple and add it to
# the entities the doc already had.
print("These are the spaCy spans")
for tok_start, tok_end, _name in mwt_ents:
    person = Span(doc, tok_start, tok_end, label="PERSON")
    print(person)
    original_ents.append(person)

# Stage 3: write the extended entity list back onto the document.
doc.ents = original_ents

print("These are the multi word token entities")
print(mwt_ents)

print("These are the entity texts and labels from the spans we created")
for entity in doc.ents:
    print(entity.text, entity.label_)

These are the character spans
<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>
These are the spaCy spans
Paul Newman
Paul Hollywood
These are the multi word token entities
[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
These are the entity texts and labels from the spans we created
Paul Newman PERSON
Paul Hollywood PERSON


**The span here is different from the one before.**

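**This span gives us the indices of the TOKENS, not the characters from the initial string. For example, `(0, 2)` covers tokens 0 and 1 ("Paul" and "Newman"), because the end index is exclusive.**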

# Custom Components

In [10]:
from spacy.language import Language

In [28]:
from spacy.util import filter_spans

pattern = r"Paul [A-Z]\w+"
# Compile once at definition time so that rebinding the notebook-level
# `pattern` in a later cell cannot silently change what this component matches.
_PAUL_NAME_RE = re.compile(pattern)


@Language.component("paul_ner")
def paul_ner(doc):
    """Tag every "Paul <Surname>" regex match in ``doc`` as a PERSON entity.

    Each match's character offsets are converted to a token-aligned span
    with ``doc.char_span``; matches that do not line up with token
    boundaries (``char_span`` returns ``None``) are skipped.

    The new spans are merged with the entities already on the document and
    passed through ``filter_spans`` before assignment, so the component also
    works when an earlier pipe has produced an overlapping entity (assigning
    overlapping spans to ``doc.ents`` raises). Existing entities are listed
    first, so on a tie in length the earlier entity is kept.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc`` with its ``ents`` updated.
    """
    person_ents = []
    for match in _PAUL_NAME_RE.finditer(doc.text):
        # char_span can attach the label directly; no need to round-trip
        # through token indices and rebuild the Span by hand.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            person_ents.append(span)

    doc.ents = filter_spans(list(doc.ents) + person_ents)
    return doc

In [29]:
# Blank English pipeline (tokenizer only) with the custom "paul_ner"
# component registered above as its single pipe.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [30]:
# Run the sample text through the pipeline; the entities come solely from paul_ner.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [39]:
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Propose every "Hollywood" regex match in ``doc`` as a CINEMA entity.

    Each match's character offsets are converted to a token-aligned span
    with ``doc.char_span``; matches that do not line up with token
    boundaries (``char_span`` returns ``None``) are skipped.

    The candidates are merged with the entities already on the document and
    de-overlapped with ``filter_spans``, which prefers longer spans. A
    CINEMA candidate that sits inside a longer existing entity is therefore
    dropped rather than causing an overlap error on assignment.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc`` with its ``ents`` updated.
    """
    pattern = r"Hollywood"

    cinema_ents = []
    for match in re.finditer(pattern, doc.text):
        # Label the span as it is created instead of rebuilding it from
        # token indices afterwards.
        span = doc.char_span(*match.span(), label="CINEMA")
        if span is not None:
            cinema_ents.append(span)

    # Existing entities go first so they keep their position in the list
    # handed to filter_spans, exactly as before.
    doc.ents = filter_spans(list(doc.ents) + cinema_ents)
    return doc

In [40]:
# Load the pretrained "en_core_web_sm" pipeline and add the custom component.
# add_pipe with no position argument appends, so cinema_ner runs last.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [41]:
# Process the sample text and list each final entity with its label.
doc3 = nlp3(text)

for ent in doc3.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON


### Why does the above code not work without `filter_spans`?

**This is because we are using a custom component that looks for Hollywood.**

**As a result, the token "Hollywood" ends up in more than one entity span once the `en_core_web_sm` model has also run, and `doc.ents` cannot hold overlapping spans.**

**One span is `Paul Hollywood`, extracted as a PERSON by the `en_core_web_sm` model; the other is `Hollywood`, extracted as CINEMA by our custom pipe.**

### How do we fix it?

We use `filter_spans`!

`filter_spans` gives priority to **longer spans**.

**Therefore Paul Hollywood is recognized as a person and Hollywood is not recognized as CINEMA.**