In [51]:
import spacy

In [52]:
# Load the en_core_web_sm pipeline; the matchers built below are all constructed from its vocab.
nlp = spacy.load('en_core_web_sm')

## Rule-based Matching
spaCy offers a rule-matching tool called `Matcher` that allows you to build a library of token patterns, then match those patterns against a Doc object to return a list of found matches. You can match on any part of the token including text and annotations, and you can add multiple patterns to the same matcher.

In [16]:
from spacy.matcher import Matcher

In [20]:
# Token-pattern matcher constructed from the pipeline's vocab.
matcher = Matcher(nlp.vocab)

In [21]:
# Three token patterns, one per spelling of the term.

# "solarpower": a single token, compared on its lowercase form
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar-power": the two words with one punctuation token between them
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# "solar power": the two words as adjacent tokens
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

In [27]:
# Register all three patterns under one rule name; no on_match callback is attached.
matcher.add(
    'SolarPower',
    patterns=[pattern1, pattern2, pattern3],
    on_match=None,
)

In [28]:
# Sample text containing each spelling once: "Solar Power", "solarpower" and "Solar-power".
doc = nlp("The Solar Power industry continues to grow a solarpower increases. Solar-power is great.")

In [29]:
# Calling the matcher on a Doc returns the matches as (match_id, start, end) tuples.
found_matches = matcher(doc)

In [30]:
# Raw result: every tuple carries the same integer id because all patterns share one rule name.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [31]:
# Print each match in readable form: id, rule name, token range and matched text.
for match_id, start, end in found_matches:
    # Look the integer id up in the vocab's string store to recover the rule name.
    string_id = nlp.vocab.strings[match_id]
    # start/end are token indices into doc, so slicing gives the matched span.
    span = doc[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [32]:
# Remove the 'SolarPower' rule (and its patterns) from the matcher so the
# name can be registered again with revised patterns.

matcher.remove('SolarPower')

In [33]:
# "solarpower": a single token, compared on its lowercase form
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar" and "power" separated by any number of punctuation tokens
# (the '*' operator allows zero or more), e.g. solar.-_power
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [34]:
# Register the two revised patterns under the 'SolarPower' rule name; no callback.
matcher.add(
    'SolarPower',
    patterns=[pattern1, pattern2],
    on_match=None,
)

In [35]:
# Test text with a doubled hyphen ("Solar--power") and a double space before "solarpower".
doc2 = nlp("Solar--power is  solarpower.")

In [36]:
# Run the re-registered patterns over doc2; this rebinds found_matches from the earlier run.
found_matches = matcher(doc2)

In [37]:
# Show the (match_id, start, end) tuples found in doc2.
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 6)]


## PhraseMatcher
In the above section we used token patterns to perform rule-based matching. An alternative - and often more efficient - method is to match on terminology lists. In this case we create a Doc object for each phrase in a list and add those Docs to a `PhraseMatcher` instead.

In [38]:
from spacy.matcher import PhraseMatcher

In [39]:
# Rebinds `matcher`: from here on it is a PhraseMatcher, not the token Matcher used above.
matcher = PhraseMatcher(nlp.vocab)

In [40]:
# Read the whole file first, then parse it once the file handle is closed.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm the file's encoding and pass it explicitly.
with open('reaganomics.txt') as f:
    article_text = f.read()
doc3 = nlp(article_text)

In [41]:
# Terminology list to look for in the document.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [42]:
# Build one pattern Doc per phrase. The PhraseMatcher is created with its
# default settings, which compare token text only, so tokenizing with
# nlp.make_doc is sufficient; running the full pipeline via nlp(text) on every
# phrase is unnecessary work and slows pattern creation for large lists.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [46]:
# Register every phrase Doc under the single rule name 'EcoMatcher'; no callback.
matcher.add('EcoMatcher', list(phrase_patterns), on_match=None)

In [47]:
# Run the phrase matcher over the document; rebinds found_matches from the earlier sections.
found_matches = matcher(doc3)

In [48]:
# Display the (match_id, start, end) tuples as the cell's output.
found_matches

[(2351661100535932681, 41, 45),
 (2351661100535932681, 49, 53),
 (2351661100535932681, 54, 56),
 (2351661100535932681, 61, 65),
 (2351661100535932681, 673, 677),
 (2351661100535932681, 2987, 2991)]

In [50]:
# Print each phrase match in readable form: id, rule name, token range and matched text.
for match_id, start, end in found_matches:
    # Look the integer id up in the vocab's string store to recover the rule name.
    string_id = nlp.vocab.strings[match_id]
    # start/end are token indices into doc3, so slicing gives the matched span.
    span = doc3[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

2351661100535932681 EcoMatcher 41 45 supply-side economics
2351661100535932681 EcoMatcher 49 53 trickle-down economics
2351661100535932681 EcoMatcher 54 56 voodoo economics
2351661100535932681 EcoMatcher 61 65 free-market economics
2351661100535932681 EcoMatcher 673 677 supply-side economics
2351661100535932681 EcoMatcher 2987 2991 trickle-down economics


## Dependency Matcher

The DependencyMatcher lets you match patterns within the dependency parse using Semgrex operators. It requires a model containing a parser such as the DependencyParser. Instead of defining a list of adjacent tokens as in Matcher patterns, the DependencyMatcher patterns match tokens in the dependency parse and specify the relations between them.

In [53]:
from spacy.matcher import DependencyMatcher

In [54]:
# Rebinds `matcher` again: from here on it is a DependencyMatcher.
matcher = DependencyMatcher(nlp.vocab)

In [55]:
# A dependency pattern is a list of node dicts; this one has a single anchor node.
pattern = [
    {
        # unique name for this node within the pattern
        "RIGHT_ID": "anchor_founded",
        # token pattern the node must satisfy: the exact text "founded"
        "RIGHT_ATTRS": {"ORTH": "founded"},
    },
]

In [56]:
# Parse a sentence containing the anchor word (this rebinds `doc` from the Matcher section).
doc = nlp("Smith founded two companies.")

# Register the single-node pattern under the key "FOUNDED" and run the matcher.
matcher.add("FOUNDED", [pattern])
matches = matcher(doc)

# Each match pairs the rule's id with a list of matched token indices.
print(matches)

[(4851363122962674176, [1])]


In [61]:
# Token at index 1 of doc -- the index reported in the match output.
doc[1]

founded