# Phrase Matching and Vocabulary

### Rule Based Matching

spaCy supports rule-based matching through its `Matcher` class.

In [0]:
import spacy

In [0]:
# Load the 'en_core_web_sm' pipeline; its vocab is shared with the matchers built below.
nlp = spacy.load('en_core_web_sm')

In [0]:
from spacy.matcher import Matcher
# The matcher is created on the pipeline's vocabulary (nlp.vocab).
matcher = Matcher(nlp.vocab)

In [0]:
# Token patterns for the different spellings of "solar power" we want to catch.
# Each pattern is a list with one dict per token; 'LOWER' compares against the
# lower-cased token text, so the match is case-insensitive.

# solarpower (written as a single token)
pattern1 = [{'LOWER': 'solarpower'}]

# solar power (two consecutive tokens)
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

# solar-power (one punctuation token between the two words)
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

# Register all three patterns under a single rule name.
# The patterns are passed as one list: this is the signature required by
# spaCy v3 and accepted since v2.2.2. No on_match callback is supplied, which
# is equivalent to the old positional `None`.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [0]:
# Sample text containing the three spellings targeted by the patterns above.
# Adjacent string literals are joined by the parser, so this is one sentence pair.
doc = nlp(
    'The Solar Power industry continues to grow as demand '
    'for solarpower increases. Solar-power cars are gaining popularity.'
)

In [6]:
# Run the matcher over the doc. The loop in the next cell unpacks each result
# as a (match_id, start, end) tuple.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [7]:
# Report every hit: numeric id, rule name, token offsets and the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                      # slice of the doc covered by this match
    string_id = nlp.vocab.strings[match_id]    # look the id up in the string store
    print(f'{match_id} {string_id} {start} {end} {span.text}')

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


### Phrase matching

In [0]:
# Import the PhraseMatcher class and build one on the pipeline's vocabulary.
# NOTE: this rebinds `matcher`, so from here on the name no longer refers to
# the rule-based Matcher created earlier in the notebook.
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [12]:
# Read the article and run it through the pipeline.
# NOTE(review): the relative path looks like a mounted Google Drive folder —
# confirm it exists in the environment where this notebook is run.
with open('drive/My Drive/Pytorch_DataSet/reaganomics.txt', encoding='cp1252') as f:
    doc3 = nlp(f.read())

# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object. make_doc only runs the tokenizer,
# which is all the PhraseMatcher needs for its default verbatim-text matching
# and avoids running the full pipeline on every phrase:
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Register the phrases under one rule name. The Docs are passed as a single
# list (the signature required by spaCy v3 and accepted since v2.2.2); no
# on_match callback is supplied:
matcher.add('VoodooEconomics', phrase_patterns)

# Build a list of matches:
matches = matcher(doc3)

# (match_id, start, end)
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]