In [1]:
import spacy
# Load the `en_core_web_sm` pipeline. The model package must already be
# installed in the kernel's environment
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

In [2]:
from spacy.matcher import Matcher

In [3]:
# Token-pattern matcher. It is built on the pipeline's vocab because a matcher
# must share its vocabulary with the documents it is applied to.
matcher = Matcher(nlp.vocab)

Creating Patterns

In [4]:
# Three token-level spellings of "solar power". LOWER compares against the
# lowercased token text, so the match is case-insensitive.
p1 = [{"LOWER": "solarpower"}]                                     # single token
p2 = [{"LOWER": "solar"}, {"LOWER": "power"}]                      # two adjacent tokens
p3 = [{"LOWER": "solar"}, {"IS_PUNCT": True}, {"LOWER": "power"}]  # one punctuation token in between

# Matcher.add takes the rule name followed by a *list of patterns*. The optional
# callback is not part of that list; it is passed separately as the keyword
# argument `on_match` (omitted here, so no callback is registered).
solar_power_patterns = [p1, p2, p3]
matcher.add("SolarPower", solar_power_patterns)

In [5]:
# Sample text containing three spellings of the target phrase:
# "Solar Power", "solarpower" and "Solar-power".
#
# The backslash after "demand" is part of the text and is written as an escaped
# `\\`. A bare "\ " is an invalid escape sequence (SyntaxWarning on Python
# 3.12+), although it produces the same runtime string. Judging by the recorded
# match offsets, the backslash is tokenized as its own token, so removing it
# would shift the start/end indices of the later matches.
doc = nlp(
    "The Solar Power industry continues to grow as demand \\ for solarpower increases. "
    "Solar-power cars are gaining popularity."
)

In [6]:
# Apply the matcher to the document. Each hit is a (match_id, start, end)
# tuple, where start/end are token indices into `doc`.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 11, 12), (8656102463236116519, 14, 17)]


In [7]:
# Make each hit readable: look up the rule name behind the hash and slice the
# matched tokens out of the document.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # hash -> rule name
    span = doc[start:end]                    # matched token span
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 11 12 solarpower
8656102463236116519 SolarPower 14 17 Solar-power


In [8]:
# A more compact rule set: the `'OP': '*'` quantifier lets the punctuation
# token occur zero or more times, so pa2 alone covers both "solar power" and
# "solar-power".
pa1 = [{'LOWER' : 'solarpower'}]
pa2 = [{'LOWER' : 'solar'}, {'IS_PUNCT' : True, 'OP' : '*'}, {'LOWER' : 'power'}]

# Matcher.add *extends* the patterns already stored under an existing key.
# Drop the earlier p1/p2/p3 rules first; otherwise they keep matching and the
# next run would not show that pa1/pa2 are sufficient on their own.
# The membership check keeps the cell safe to re-run.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')
matcher.add('SolarPower', [pa1, pa2])

In [9]:
# Re-run the matcher on the same document with the updated rule set.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 11, 12), (8656102463236116519, 14, 17)]


Be careful with lemmas!

In [10]:
# Clear the rules currently registered under this key so the old patterns do
# not linger alongside the new ones.
matcher.remove('SolarPower')

# Same shape as before, except the last token is matched on its LEMMA instead
# of its lowercased text, so any token whose lemma is "power" qualifies.
p1 = [{"LOWER": "solarpower"}]
p2 = [{"LOWER": "solar"}, {"IS_PUNCT": True, "OP": "*"}, {"LEMMA": "power"}]
matcher.add("SolarPower", [p1, p2])

In [11]:
# Test sentence that uses the inflected form "powered" twice.
doc2 = nlp("Solar-powered energy runs solar-powered cars.")

In [12]:
# Match the lemma-based rules against doc2. The result is stored under its own
# descriptive name (the previous name was a misspelling of "found").
found_matches_doc2 = matcher(doc2)
print(found_matches_doc2)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


Phrase Matcher

In [13]:
from spacy.matcher import PhraseMatcher
# NOTE: this rebinds `matcher`. From here on the name refers to a PhraseMatcher,
# and the token-pattern Matcher created earlier is no longer reachable through it.
matcher = PhraseMatcher(nlp.vocab)

In [15]:
# Write the sample paragraph to disk so it can be read back as a text file.
reaganomics_text = "Reaganomics is a portmanteau of Reagan and economics, created by Paul Harvey. It refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates."
with open("/content/reaganomics.txt", "w") as f:
    f.write(reaganomics_text)

In [16]:
# Read the paragraph back and run it through the pipeline.
# The file is opened in a `with` block so the handle is closed as soon as the
# text has been read. latin-1 maps every byte to a character, so decoding
# cannot fail.
# Naming note: `doc3` is the file handle (closed after this block) and `f` is
# the parsed spaCy Doc that the following cells match against.
with open("/content/reaganomics.txt", 'r', encoding='latin-1') as doc3:
    f = nlp(doc3.read())

In [17]:
# Phrases to look for in the text.
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

# Convert each phrase to a Doc. `nlp.make_doc` only runs the tokenizer, which
# is all a PhraseMatcher needs when matching on verbatim token text (its
# default), and avoids running the full pipeline on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Register all phrase Docs under one rule name, passing them as a single list.
# This is the same call form used for Matcher.add earlier in this notebook; the
# older `add(key, None, *docs)` form is a legacy signature. An optional
# callback would be passed as the keyword argument `on_match`.
matcher.add('VoodooEconomics', phrase_patterns)

# Each result is a (match_id, start, end) tuple of token indices into `f`.
matches = matcher(f)
matches

[(3473369816841043438, 36, 40),
 (3473369816841043438, 44, 48),
 (3473369816841043438, 49, 51),
 (3473369816841043438, 56, 60)]

In [18]:
# Re-display the match tuples computed in the previous cell.
matches

[(3473369816841043438, 36, 40),
 (3473369816841043438, 44, 48),
 (3473369816841043438, 49, 51),
 (3473369816841043438, 56, 60)]