### <mark>Matchers</mark>

    Matcher(vocab, validate=True, *, fuzzy_compare=levenshtein_compare)
    Match sequences of tokens, based on pattern rules.

In [3]:
# Load the 'en_core_web_sm' pipeline; `nlp` turns raw text into Doc objects
# and its vocab is handed to the matcher created below.
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
from spacy.matcher import Matcher
# Token-pattern matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [6]:
# Three spellings of the same term, each written as a list of per-token rules.
# 'LOWER' compares against the lowercased token text, so case is ignored.

# One token: "solarpower"
pattern1 = [{'LOWER': 'solarpower'}]

# Two tokens: "solar" followed by "power"
pattern2 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

# Three tokens: "solar", exactly one punctuation token, "power"
pattern3 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# All patterns are registered under a single key and passed as one list.
# (The older call form was matcher.add('SolarPower', None, pattern1, ...).)
solar_power_patterns = [pattern1, pattern2, pattern3]
matcher.add('SolarPower', solar_power_patterns)

In [7]:
# Sample text that contains each spelling the patterns target:
# "Solar Power", "solarpower" and "Solar-power".
sample_text = (
    'The Solar Power industry continues to grow as demand for solarpower increases. '
    'Solar-power cars are gaining popularity.'
)
doc = nlp(sample_text)

In [8]:
# Running the matcher yields (match_id, start, end) triples; start/end are
# token offsets into `doc`, with `end` exclusive.
found_matches = matcher(doc)
for match_id, start, end in found_matches:
    matched_span = doc[start:end]
    print(f'Word ID {match_id} , starts at {start} & ends at {end} , and word is {matched_span}')

Word ID 8656102463236116519 , starts at 1 & ends at 3 , and word is Solar Power
Word ID 8656102463236116519 , starts at 10 & ends at 11 , and word is solarpower
Word ID 8656102463236116519 , starts at 13 & ends at 16 , and word is Solar-power


In [9]:
# Drop the rules stored under 'SolarPower' so the key can be registered again
# with a revised set of patterns.
matcher.remove('SolarPower')

In [10]:
# Revised patterns: the separate "two tokens" and "punctuation in between"
# variants are folded into one pattern.
pattern1 = [{'LOWER': 'solarpower'}]
# 'OP': '*' makes the punctuation token optional and repeatable (zero or more),
# so this covers both "solar power" and "solar-power".
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

In [12]:
# Register the two revised patterns under the 'SolarPower' key.
matcher.add('SolarPower', [pattern1, pattern2])

In [13]:
# Extend the rule set with the "-powered" variants; pattern1/pattern2 are
# redefined unchanged so the whole set lives in one cell.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solarpowered'}]
# Same shape as pattern2, but ending in "powered".
pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered'}]

In [14]:
# Clear the current 'SolarPower' rules before registering the extended set.
matcher.remove('SolarPower')

In [15]:
# Register all four patterns ("power" and "powered" variants) under one key.
matcher.add('SolarPower', [pattern1, pattern2, pattern3, pattern4])

In [16]:
# Re-run the matcher with the extended rule set. Each result is a
# (match_id, start, end) triple of token offsets into `doc`.
found_matches = matcher(doc)
for match_id, start, end in found_matches:
    matched_span = doc[start:end]
    print(f'Word ID {match_id} , starts at {start} & ends at {end} , and word is {matched_span}')

Word ID 8656102463236116519 , starts at 1 & ends at 3 , and word is Solar Power
Word ID 8656102463236116519 , starts at 10 & ends at 11 , and word is solarpower
Word ID 8656102463236116519 , starts at 13 & ends at 16 , and word is Solar-power


#### PhraseMatcher

In [17]:
# Setup for the PhraseMatcher section: import spaCy and load the
# 'en_core_web_sm' pipeline again.
import spacy
nlp = spacy.load('en_core_web_sm')

In [18]:
from spacy.matcher import PhraseMatcher
# Note: this rebinds `matcher` to a PhraseMatcher built on the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)

In [19]:
# Read the article text from disk, then parse it once the file is closed.
with open('reaganomics.txt') as article_file:
    article_text = article_file.read()
doc3 = nlp(article_text)

In [20]:
# Terms to search for in the article, as plain strings.
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [21]:
# The phrase matcher is given Doc objects rather than strings, so each phrase
# is run through the pipeline first.
phrase_patterns = [
    nlp(phrase)
    for phrase in phrase_list
]
# Bare expression so the notebook displays the resulting list.
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [22]:
# Register every phrase Doc under one key. The patterns are passed as a single
# list, matching the list form already used for Matcher.add earlier in this
# notebook, instead of the legacy (key, None, *docs) form.
matcher.add('VoodooEconomics', phrase_patterns)

In [23]:
# Find every registered phrase in the article. Each match is a
# (match_id, start, end) triple of token offsets into `doc3`.
matches = matcher(doc3)

for a,b,c in  matches : 
    # Show three tokens of context on each side. The left edge is clamped at 0:
    # a negative start would be read as an offset from the end of the doc and
    # give an empty or wrong window for matches in the first three tokens.
    # (An end past the last token is already truncated by slicing.)
    context_start = max(b - 3, 0)
    print(f'Word ID {a} , starts at {b} & ends at {c} , and phrase is {doc3[context_start:c+3]}')
    print('---------------------------------------------------------------------------------')

Word ID 3473369816841043438 , starts at 41 & ends at 45 , and phrase is commonly associated with supply-side economics, referred to
---------------------------------------------------------------------------------
Word ID 3473369816841043438 , starts at 49 & ends at 53 , and phrase is referred to as trickle-down economics or voodoo economics
---------------------------------------------------------------------------------
Word ID 3473369816841043438 , starts at 54 & ends at 56 , and phrase is down economics or voodoo economics by political opponents
---------------------------------------------------------------------------------
Word ID 3473369816841043438 , starts at 61 & ends at 65 , and phrase is opponents, and free-market economics by political advocates
---------------------------------------------------------------------------------
Word ID 3473369816841043438 , starts at 673 & ends at 677 , and phrase is following from the supply-side economics movement, which
-----------------