# Overview
This notebook will test a few matching methods for speed:
- Built-in `re` module
- Custom `RegexMatcher` in spaCy
- spaCy `Matcher`
- spaCy `PhraseMatcher`

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

In [3]:
import spacy

In [4]:
from regex_matcher import RegexMatcher

In [5]:
texts = [
    """
    It was the best of times,
    it was the worst of times,
    it was the age of wisdom,
    it was the age of foolishness,
    it was the epoch of belief,
    it was the epoch of incredulity,
    it was the season of Light,
    it was the season of Darkness,
    it was the spring of hope,
    it was the winter of despair,
    """,
    "June 24, 06/25/2020, August 9, 08/09/2019, Dec 12, 12/12/2012"
        ]

In [51]:
nlp = spacy.load("en_core_web_sm")

In [52]:
nlp.remove_pipe("parser")
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1ba0690aa08>)

In [53]:
nlp.pipe_names

['tagger']

In [54]:
regex_patterns = ["the [a-z]+ of [A-Z][a-z]+",
          "[a-zA-Z]+ \d+",
          "(\d+/\d+/\d+)"]

# I. Built-in `re` module

In [55]:
compiled_res = [re.compile(pattern) for pattern in regex_patterns]

In [56]:
%%timeit
for text in texts:
    for pattern in compiled_res:
        pattern.findall(text)

21.9 µs ± 111 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# II. Custom RegexMatcher

In [57]:
regex_matcher = RegexMatcher(nlp.vocab)
regex_matcher.add("examples", regex_patterns)

In [58]:
%%timeit
docs = nlp.pipe(texts)
for doc in docs:
    regex_matcher(doc)

3.4 ms ± 71.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# III. Matcher

In [59]:
from spacy.matcher import Matcher

In [60]:
patterns = [
    [{"TEXT": "the"}, {"IS_ALPHA": True, "IS_LOWER": True}, 
         {"TEXT": "of"}, {"TEXT": {"REGEX": "[A-Z][a-z]+"}}],
    [{"IS_ALPHA": True}, {"IS_DIGIT": True}],
    [{"SHAPE": 'dd/dd/dddd'}],
]

In [61]:
matcher = Matcher(nlp.vocab)
matcher.add("patterns", patterns)

In [62]:
%%timeit
docs = nlp.pipe(texts)
for doc in docs:
    matcher(doc)

3.35 ms ± 134 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [63]:
nlp("08/09/2019")[0].shape_

'dd/dd/dddd'

# III. PhraseMatcher

In [64]:
from spacy.matcher import PhraseMatcher

In [65]:
# First, get all of the matched phrases
matched_texts = []
for text in texts:
    for pattern in compiled_res:
        matched_texts += pattern.findall(text)

In [66]:
matched_texts

['the season of Light',
 'the season of Darkness',
 'June 24',
 'August 9',
 'Dec 12',
 '06/25/2020',
 '08/09/2019',
 '12/12/2012']

In [67]:
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_patterns = list(nlp.tokenizer.pipe(matched_texts))
phrase_matcher.add("example", phrase_patterns)

In [68]:
%%timeit
docs = nlp.pipe(texts)
for doc in docs:
    phrase_matcher(doc)

3.22 ms ± 84.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
pattern.findall(text)

['06/25/2020', '08/09/2019', '12/12/2012']