In [1]:
import spacy
from spacy.matcher import Matcher

import pandas as pd

# Load the small English pipeline; `nlp` is the shared pipeline object the
# cells below use for its vocabulary (`nlp.vocab`) and to turn text into Docs.
nlp = spacy.load('en_core_web_sm')

# Pattern Matching

In [2]:
# Rule-based token matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Three alternative token patterns registered under one rule name;
# a span that satisfies any one of them is reported as a match.
patterns = [
    # One token: SolarPower, solarpower, SOLARPOWER
    [{'LOWER': 'solarpower'}],
    # Three tokens with exactly one punctuation token in the middle:
    # SOLAR-POWER, Solar-power, solar-power
    [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}],
    # Two adjacent tokens: Solar Power, solar power, Solar power
    [{'LOWER': 'solar'}, {'LOWER': 'power'}],
]
matcher.add('SolarPower', patterns)

string = u'The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.'
doc = nlp(string)

# Each entry is a 3-tuple; the columns below label them match / start / stop.
matches = matcher(doc)

print(matches)
pd.DataFrame(matches, columns=['match', 'start', 'stop'])

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


Unnamed: 0,match,start,stop
0,8656102463236116519,1,3
1,8656102463236116519,8,9
2,8656102463236116519,11,14


In [3]:
# Start from a fresh matcher so only the patterns defined in this cell apply.
matcher = Matcher(nlp.vocab)

patterns = [
    # Single-token spelling, any casing.
    [{'LOWER': 'solarpower'}],
    # 'OP': '*' makes the punctuation token optional and repeatable, so one
    # pattern covers "solar power", "solar-power" and "Solar--power".
    [
        {'LOWER': 'solar'},
        {'IS_PUNCT': True, 'OP': '*'},
        {'LOWER': 'power'},
    ],
]
matcher.add('SolarPower', patterns)

doc = nlp(u'Solar--power is solarpower, yeah.')
matches = matcher(doc)

print(matches)
pd.DataFrame(matches, columns=['match', 'start', 'stop'])

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


Unnamed: 0,match,start,stop
0,8656102463236116519,0,3
1,8656102463236116519,4,5


# Phrase Matching 

In [8]:
from spacy.matcher import PhraseMatcher
# The `nlp` pipeline loaded in the first cell is reused from here on; calling
# spacy.load('en_core_web_sm') a second time would only repeat that work.

In [13]:
# Phrase matcher built on the pipeline's vocabulary. No `attr` argument is
# given, so phrases are compared on the verbatim token text.
matcher = PhraseMatcher(nlp.vocab)

# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm the encoding of reaganomics.txt and pass it explicitly.
with open('reaganomics.txt') as f:
    doc = nlp(f.read())

phrases = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

# Patterns only need to be tokenized: make_doc runs the tokenizer alone,
# avoiding the full pipeline that nlp(text) would apply to each phrase
# while producing the same tokens.
patterns = [nlp.make_doc(text) for text in phrases]

matcher.add('EconMatcher', patterns)

matches = matcher(doc)

matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [15]:
# Report every match: rule name (looked up from its hash in the vocab's
# string store), the raw hash, the start/end indices and the matched text.
for match_id, start, end in matches:
    matcher_name = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(f'{matcher_name} {match_id} {start} {end} {span.text}')

EconMatcher 3680293220734633682 41 45 supply-side economics
EconMatcher 3680293220734633682 49 53 trickle-down economics
EconMatcher 3680293220734633682 54 56 voodoo economics
EconMatcher 3680293220734633682 61 65 free-market economics
EconMatcher 3680293220734633682 673 677 supply-side economics
EconMatcher 3680293220734633682 2987 2991 trickle-down economics
