In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Load the small English pipeline; its vocabulary is shared with the matcher below.
nlp = spacy.load('en_core_web_sm')
# Token-pattern matcher bound to that vocabulary; rules are registered in later cells.
m_tool = Matcher(nlp.vocab)

## Rule-Based Matching
1. spaCy comes with a `Matcher` class that lets you specify custom token-based rules for matching phrases.
2. Define the patterns, add them to the `Matcher`, and then apply it to a document.

In [3]:
# Four token-level spellings of the same phrase. Each dict describes one token;
# 'LOWER' compares against the lowercased token text.

# Everything fused into a single token.
p1 = [
    {'LOWER': 'quickbrownfox'},
]

# Words separated by exactly one punctuation token each.
p2 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True},
    {'LOWER': 'fox'},
]

# Three consecutive word tokens.
p3 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brown'},
    {'LOWER': 'fox'},
]

# First word on its own, last two fused.
p4 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brownfox'},
]


In [4]:
# Register all four pattern variants under the single rule key 'QBF'.
m_tool.add('QBF', [p1 , p2, p3, p4])

In [5]:
# Sample text containing each spelling variant of the phrase.
# NOTE(review): the backslash continuation is inside the string literal, so the
# leading spaces of the following source line become part of the text given to
# nlp. Presumably that run of spaces shows up as its own token and is reflected
# in the match offsets printed further down; confirm before reformatting.
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

In [6]:
# Applying the matcher to a Doc yields a list of 3-tuples, one per match:
#   1st element: integer match id (nlp.vocab.strings maps it back to the rule key)
#   2nd element: start token index of the match
#   3rd element: end token index, used as an exclusive slice bound
phrase_matches = m_tool(sentence)
print(phrase_matches)

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31)]


In [7]:
# Report every match: raw id, rule name, token bounds and the matched text.
for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]  # integer id -> rule key
    span = sentence[start:end]               # tokens covered by this match
    print(f"{match_id} {string_id} {start} {end} {span.text}")

12825528024649263697 QBF 1 6 quick-brown-fox
12825528024649263697 QBF 13 16 quick brown fox
12825528024649263697 QBF 21 22 quickbrownfox
12825528024649263697 QBF 29 31 quick brownfox


### More options for rule-based matching



In [8]:
# Drop the rules registered under 'QBF'; the next cell adds a new pattern under the same key.
m_tool.remove('QBF')

In [9]:
# One pattern for all hyphenated spellings: the 'OP': '*' operator lets each
# punctuation slot match zero or more tokens.
p1 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'fox'},
]
m_tool.add('QBF', [p1])

In [10]:
# Sample text with repeated hyphens between the words (note the double space
# before the second phrase, which is part of the literal).
sentence = nlp(u'The quick--brown--fox jumps over the  quick-brown---fox')

In [11]:
# Run the wildcard rule over the new text and list what it found.
phrase_matches = m_tool(sentence)

for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]  # integer id -> rule key
    span = sentence[start:end]               # tokens covered by this match
    fields = (match_id, string_id, start, end, span.text)
    print(" ".join(str(field) for field in fields))

12825528024649263697 QBF 1 6 quick--brown--fox
12825528024649263697 QBF 10 15 quick-brown---fox


### Phrase-based matching

In [12]:
import bs4 as bs
import urllib.request
import re
import nltk

In [14]:

# Download the raw HTML bytes of the Wikipedia article. The context manager
# closes the HTTP response once the body has been read, instead of leaving the
# connection open for the lifetime of the kernel.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

In [16]:
# Parse the downloaded HTML. Naming a specific parser (the stdlib-backed
# 'html.parser') keeps results the same across environments; the generic 'html'
# feature lets Beautiful Soup pick whichever parser happens to be installed.
parsed_article = bs.BeautifulSoup(article, 'html.parser')

In [17]:
# Collect every <p> element of the parsed page.
paragraphs = parsed_article.find_all('p')

In [18]:
# Concatenate the text of all paragraphs into one string. A single join avoids
# rebuilding the growing string on every iteration.
article_text = "".join(paragraph.text for paragraph in paragraphs)

In [20]:
# Normalise the article: lowercase it, blank out every non-letter character,
# then squeeze each run of whitespace down to a single space.
lowered_text = article_text.lower()
letters_only = re.sub('[^a-zA-Z]', ' ', lowered_text)
processed_article = re.sub(r'\s+', ' ', letters_only)

In [21]:
# NOTE(review): spacy is imported and 'en_core_web_sm' is loaded here a second
# time (the first cells of the notebook already do both); this rebinds `nlp` to
# a freshly loaded pipeline.
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm')
# Phrase matcher bound to the vocabulary of the pipeline loaded just above.
phrase_matcher = PhraseMatcher(nlp.vocab)

In [22]:
# Terms to look for in the article.
phrases = ['machine learning', 'robots', 'intelligent agents']

# PhraseMatcher patterns are Doc objects. nlp.make_doc only tokenizes, which is
# all the default (verbatim text) matching needs and skips the cost of running
# the full pipeline on every term.
patterns = [nlp.make_doc(text) for text in phrases]


In [23]:
# Show the pattern Docs built from the phrase list.
print(patterns)

[machine learning, robots, intelligent agents]


In [25]:
# Register all phrase patterns under the key 'AI'. The patterns are passed as a
# single list, the same call form used for Matcher.add earlier in this notebook,
# rather than the legacy (key, on_match, *docs) form.
phrase_matcher.add('AI', patterns)

In [26]:
# Run the pipeline over the cleaned article text, then apply the phrase matcher
# to the resulting Doc; the result is unpacked below as (match_id, start, end).
sentence = nlp(processed_article)
matched_phrases = phrase_matcher(sentence)

In [28]:
# List each phrase hit: raw id, rule name, token bounds and the matched text.
for match_id, start, end in matched_phrases:
    string_id = nlp.vocab.strings[match_id]  # integer id -> rule key
    span = sentence[start:end]               # tokens covered by this match
    row = (match_id, string_id, start, end, span.text)
    print(*row)

5530044837203964789 AI 30 32 intelligent agents
5530044837203964789 AI 249 251 machine learning
5530044837203964789 AI 1053 1055 machine learning
5530044837203964789 AI 1527 1529 intelligent agents
5530044837203964789 AI 1686 1688 machine learning
5530044837203964789 AI 2200 2201 robots
5530044837203964789 AI 2629 2631 machine learning
5530044837203964789 AI 2671 2673 machine learning
5530044837203964789 AI 2698 2700 machine learning
5530044837203964789 AI 2999 3000 robots
5530044837203964789 AI 4195 4196 robots
5530044837203964789 AI 4390 4391 robots
5530044837203964789 AI 5232 5233 robots
5530044837203964789 AI 6681 6682 robots
5530044837203964789 AI 6802 6803 robots
5530044837203964789 AI 6846 6847 robots


## Stop words



In [29]:
# Print the pipeline's default stop-word set (a set of strings, so the order shown is arbitrary).
print(nlp.Defaults.stop_words)

{'thereby', 'just', 'again', 'last', 'all', 'fifty', 'such', 'you', 'various', '‘re', 'being', 'some', 'indeed', 'much', 'whoever', 'already', 'others', '’ll', 'at', 'latterly', 'make', 'from', 'yourself', '’ve', 'was', "'ll", 'even', 'enough', 'beyond', 'most', 'whereby', 'four', '’re', '‘ve', 'above', 'becoming', 'never', 'nobody', 'still', 'anyhow', 'to', 'her', 'six', 'an', 'under', 'sixty', 'or', 'its', "'s", 'whether', 'otherwise', 'once', 'beside', 'three', 'whole', 'elsewhere', 'which', 'hundred', 'wherein', 'namely', 'less', 'those', 'become', 'except', '’m', 'i', 'also', 'n‘t', 'part', 'their', 'but', 'used', 'twelve', 'towards', 'yet', '‘ll', '‘m', 'sometimes', 'further', 'none', 'nor', '’d', 'using', 'first', 'get', 'serious', 'made', 'between', 'myself', "'m", 'among', 'while', 'thereupon', 'done', 'below', 'mostly', 'former', 'side', 'are', 'seeming', 'move', 'another', 'had', 'it', 'be', 'therefore', 'because', 'thru', 'seems', 'mine', 'why', 'been', 'toward', 'per', '‘d

In [30]:

# Check the stop-word flag on the vocabulary entry for 'wonder'.
nlp.vocab['wonder'].is_stop

False

In [31]:
# Same check for 'is', for comparison.
nlp.vocab['is'].is_stop



True

In [32]:
# Add 'wonder' to the default stop-word set.
# NOTE(review): presumably this alone does not update the flag on an existing
# vocabulary entry, which would explain the explicit assignment in the next cell; confirm.
nlp.Defaults.stop_words.add('wonder')

In [33]:
# Set the stop-word flag directly on the vocabulary entry for 'wonder'.
nlp.vocab['wonder'].is_stop = True

In [34]:
# Re-check the flag after the update above.
nlp.vocab['wonder'].is_stop

True