# Rule-based Matching
spaCy offers a rule-matching tool called `Matcher` that allows you to build a library of token patterns, then match those patterns against a Doc object to return a list of found matches. You can match on any part of the token including text and annotations, and you can add multiple patterns to the same matcher.

## Import the needed libraries

In [2]:
import pandas as pd
import numpy as np
import spacy as sp

In [3]:
# Load the 'en_core_web_sm' pipeline; `nlp` is called on raw text below to build Doc objects.
nlp = sp.load('en_core_web_sm')

### Import the spaCy `Matcher` class

In [4]:
from spacy.matcher import Matcher

In [67]:
# Create the Matcher on the pipeline's vocabulary so it shares `nlp`'s string store.
matcher = Matcher(nlp.vocab)

### Defining the pattern

In [6]:
# Three token patterns, one per spelling of "solar power".

# One token whose lowercase form is "solarpower", e.g. SolarPower.
pattern1 = [{"LOWER": "solarpower"}]

# "solar", exactly one punctuation token, then "power", e.g. Solar-power.
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

# Two adjacent tokens "solar" and "power", e.g. solar power.
pattern3 = [
    {"LOWER": "solar"},
    {"LOWER": "power"},
]


In [9]:
# Register all three patterns under the single match ID 'SolarPower'.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

### Applying the matcher on a doc 

In [30]:
# Sample text containing each spelling the patterns target:
# "Solar Power", "solarpower" and "Solar-power".
doc = nlp(
    'The Solar Power industry continues to grow as demand '
    'for solarpower increases. Solar-power cars are gaining popularity.'
)

In [31]:
# Run the matcher over the Doc. Each result is a tuple of
# (match_id, start_index, end_index), where the indices are token positions in `doc`.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [32]:
# Print the matches in a more readable form

In [33]:
# Show every match as: numeric ID, rule name, token range, matched text.
for match_id, start, end in found_matches:
    print(
        match_id,
        nlp.vocab.strings[match_id],  # look the hash up to get the rule name
        start,
        end,
        doc[start:end].text,          # slice the Doc to get the matched span
    )

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


### Setting pattern options and quantifiers

In [68]:
# Second sample text: "solar" and "powered" joined by a double hyphen,
# used to exercise the quantifier/lemma patterns defined in the next cell.
doc2 = nlp(u'Solar--powered energy runs solar--powered cars.')

In [73]:
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
# 'OP': '*' lets the punctuation token occur zero or more times, and matching
# on LEMMA instead of LOWER also catches inflected forms such as "powered".
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LEMMA': 'power'}]

# Remove the old patterns to avoid duplication: add() extends the patterns
# already stored under an existing key rather than replacing them. The
# membership check keeps this cell safe to run on a freshly created matcher,
# where remove() would otherwise fail because the key does not exist yet.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2])

In [72]:
# Apply the redefined patterns to the second sample; results are
# (match_id, start_index, end_index) tuples over the tokens of `doc2`.
found_matches = matcher(doc2)
print(found_matches)  

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


### PhraseMatcher

In [52]:
from spacy.matcher import PhraseMatcher

In [53]:
# Create a PhraseMatcher on the shared vocabulary.
# NOTE: this rebinds `matcher`, replacing the token-based Matcher used above.
matcher = PhraseMatcher(nlp.vocab)

In [57]:
# Read the article (stored as cp1252) and close the file before parsing.
with open('../TextFiles/reaganomics.txt', encoding='cp1252') as source_file:
    raw_text = source_file.read()

# Build the Doc that the phrase matcher will be run against.
doc3 = nlp(raw_text)

In [61]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object. This PhraseMatcher was created
# with its default settings and so compares token text only; tokenizing with
# nlp.make_doc() is therefore enough and skips the rest of the pipeline.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Pass the patterns as a single list, the same call form used for
# Matcher.add() earlier in this notebook, instead of the legacy
# add(key, None, *patterns) signature.
matcher.add('VoodooEconomics', phrase_patterns)

# Each match is a (match_id, start_index, end_index) tuple over doc3's tokens.
matches = matcher(doc3)

matches


[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

In [63]:
# Report each phrase match: numeric ID, rule name, token range, matched text.
for match_id, start, end in matches:
    rule_name = nlp.vocab.strings[match_id]   # hash -> registered rule name
    matched_text = doc3[start:end].text       # text covered by the match
    print(f"{match_id} {rule_name} {start} {end} {matched_text}")

3473369816841043438 VoodooEconomics 41 45 supply-side economics
3473369816841043438 VoodooEconomics 49 53 trickle-down economics
3473369816841043438 VoodooEconomics 54 56 voodoo economics
3473369816841043438 VoodooEconomics 61 65 free-market economics
3473369816841043438 VoodooEconomics 673 677 supply-side economics
3473369816841043438 VoodooEconomics 2987 2991 trickle-down economics
