In [18]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [19]:
from spacy.matcher import Matcher

In [20]:
matcher = Matcher(nlp.vocab) # loading our matcher

In [21]:
# SolarPower
pattern1 = [{'LOWER':'solarpower'}]
# Solar-power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
# Solar power
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [22]:
# creating matchers
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [25]:
doc = nlp(u"The Solar Power industry continues to grow as solar power increases. Solar-power is amazing.")

In [26]:
found_matches = matcher(doc)

In [27]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 10), (8656102463236116519, 12, 15)]


In [28]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]     # get string representation
    span = doc[start:end]                       # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 10 solar power
8656102463236116519 SolarPower 12 15 Solar-power


In [29]:
# remove matcher
matcher.remove('SolarPower')

In [30]:
# solarPower SolarPower
pattern1 = [{'LOWER':'solarpowers'}]
# solar.power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

In [32]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [33]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [34]:
found_matches = matcher(doc2)

In [35]:
print(found_matches)

[(8656102463236116519, 0, 3)]


In [36]:
# Part 2
from spacy.matcher import PhraseMatcher

In [37]:
matcher = PhraseMatcher(nlp.vocab)

In [40]:
with open('../TextFiles/reaganomics.txt', encoding='cp1252') as f:
    doc3 = nlp(f.read())

In [41]:
phrase_list = ['voodoo economics', 'supply-sides economics', 'tickle-down economics', 'free-market economics']

In [42]:
phrase_patterns = [nlp(text) for text in phrase_list] # create patterns for phrases

In [43]:
phrase_patterns

[voodoo economics,
 supply-sides economics,
 tickle-down economics,
 free-market economics]

In [44]:
matcher.add('EconMatcher', None, *phrase_patterns) # creating matcher

In [45]:
found_matches = matcher(doc3)

In [46]:
found_matches

[(3680293220734633682, 54, 56), (3680293220734633682, 61, 65)]

In [47]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]     # get string representation
    span = doc3[start:end]                      # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
