In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Load the small English pipeline and build a Matcher on top of its vocabulary.
# The matcher is created from `nlp.vocab`, the same vocab used by every Doc
# that `nlp` produces below.
nlp = spacy.load('en_core_web_sm')
m_tool = Matcher(nlp.vocab)

## Rule-Based Matching
1. spaCy ships with a `Matcher` class that lets you define custom token-based rules for matching phrases.
2. Define one or more patterns, add them to the `Matcher`, and then apply it to a document.

In [3]:
# Each pattern is a list of token specs; every dict describes exactly one token.

# One token: "quickbrownfox" (compared on the lower-cased text).
p1 = [
    {'LOWER': 'quickbrownfox'},
]

# Five tokens: the three words with one punctuation token between each pair.
p2 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True},
    {'LOWER': 'fox'},
]

# Three tokens: the three words as consecutive tokens.
p3 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brown'},
    {'LOWER': 'fox'},
]

# Two tokens: "quick" followed by "brownfox".
p4 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brownfox'},
]


In [6]:
# Register all four patterns under the single rule key 'QBF'; a hit on any
# one of them is reported with that key's id.
m_tool.add('QBF', [p1 , p2, p3, p4])

In [7]:
# Sample text containing one occurrence of each spelling targeted by p1-p4.
# NOTE: the backslash continues the string literal onto the next line, so that
# line's leading spaces are part of the text. The token offsets printed below
# were produced with this exact text (presumably including a whitespace token
# for that run of spaces), so do not reflow the literal.
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

In [10]:
# Calling the matcher on a Doc yields a list of (match_id, start, end) tuples:
#   match_id - integer id of the rule key the matching pattern was added under
#   start    - index of the first matched token
#   end      - index one past the last matched token (used as a slice end)
phrase_matches = m_tool(sentence)
print(phrase_matches)

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31)]


In [11]:
# Decode every match into something readable: rule name plus matched text.
for match_id, start, end in phrase_matches:
    # Map the integer id back to the rule key string via the vocab's string store.
    string_id = nlp.vocab.strings[match_id]
    # Slice the matched tokens out of the document (end is exclusive).
    span = sentence[start:end]
    print(match_id, string_id, start, end, span.text)

12825528024649263697 QBF 1 6 quick-brown-fox
12825528024649263697 QBF 13 16 quick brown fox
12825528024649263697 QBF 21 22 quickbrownfox
12825528024649263697 QBF 29 31 quick brownfox


### More options for rule-based matching



In [12]:
# Drop the earlier 'QBF' rules so the key can be redefined with a new pattern.
# The membership check keeps this cell safe to re-run: calling remove() for a
# key that is no longer registered raises an error.
if 'QBF' in m_tool:
    m_tool.remove('QBF')

In [15]:
# 'OP': '*' lets a token spec match zero or more times, so this one pattern
# covers any amount of punctuation (including none) between the three words.
p1 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'fox'},
]
m_tool.add('QBF', [p1])

In [16]:
# New sample text with varying amounts of punctuation between the words.
# This rebinds `sentence`, replacing the document used in the earlier example.
sentence = nlp(u'The quick--brown--fox jumps over the  quick-brown---fox')

In [18]:
# Run the matcher on the current `sentence`; the result is a list of
# (match_id, start, end) tuples.
phrase_matches = m_tool(sentence)

# Decode every match into the rule name and the matched text.
for match_id, start, end in phrase_matches:
    # Map the integer id back to the rule key string via the vocab's string store.
    string_id = nlp.vocab.strings[match_id]
    # Slice the matched tokens out of the document (end is exclusive).
    span = sentence[start:end]
    print(match_id, string_id, start, end, span.text)

12825528024649263697 QBF 1 6 quick--brown--fox
12825528024649263697 QBF 10 15 quick-brown---fox


### Phrase-based matching