In [1]:
# Import prerequisite libraries
import spacy
from spacy.matcher import Matcher

#### Basic Example

In [2]:
# Load the pipeline and create a token matcher that shares its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# A single-token pattern: any token whose LIKE_EMAIL attribute is true,
# registered under the rule name "EMAIL_ADDRESS".
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])

# Process a sample sentence and collect the matches found in it.
doc = nlp("This is an email address: abs.alchemy20@gmail.com")
matches = matcher(doc)

In [3]:
# Show the raw matches: a list of (match_id, start, end) tuples, where
# match_id is the hash of the rule name and start/end are token indices
# into doc (used elsewhere in this notebook as doc[start:end]).
print(matches)

[(16571425990740197027, 6, 7)]


In [4]:
# The first element of a match tuple is the hashed rule name; look it up
# in the vocabulary to recover the original string.
match_id, start, end = matches[0]
print(nlp.vocab[match_id].text)

EMAIL_ADDRESS


#### Attributes Taken by Matcher
*ORTH - The exact verbatim text of a token (str)*

*TEXT - The exact verbatim text of a token (str)*

*LOWER-The lowercase form of the token text(str)*

*LENGTH-The length of the token text(int)*

*IS_ALPHA*

*IS_ASCII*

*IS_DIGIT*

*IS_LOWER*

*IS_UPPER*

*IS_TITLE*

*IS_PUNCT*

*IS_SPACE*

*IS_STOP*

*IS_SENT_START*

*LIKE_NUM*

*LIKE_URL*

*LIKE_EMAIL*

*SPACY*

*POS*

*TAG*

*MORPH*

*DEP*

*LEMMA*

*SHAPE*

*ENT_TYPE*

*_ - Custom extension attributes (Dict[str, Any])*

*OP*

#### Applied Matcher

In [5]:
# Read the whole corpus file into memory as a single string.
# The encoding is passed explicitly because open() otherwise falls back to
# the platform's locale encoding, which can fail or mis-decode non-ASCII
# characters on some systems.
# NOTE(review): assumes dataset/nlp_wiki.txt is UTF-8 encoded — confirm.
with open("dataset/nlp_wiki.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [8]:
# Uncomment the line below to inspect the loaded text
# (left disabled, presumably to avoid a very long output — confirm).
#print(text)

#### Grabbing All Proper Nouns

In [6]:
# Load the small English pipeline again for the proper-noun examples below.
# This rebinds `nlp`, which was already loaded with the same model name in
# the basic example at the top of the notebook.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Match every individual token whose part-of-speech tag is PROPN.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUNS", [pattern])

doc = nlp(text)
matches = matcher(doc)

# Report the total number of hits, then the first ten match tuples
# together with the text each one covers.
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

35
(3232560085755078826, 11, 12) NLP
(3232560085755078826, 21, 22) NLP
(3232560085755078826, 23, 24) corpus
(3232560085755078826, 62, 63) corpora
(3232560085755078826, 73, 74) Corpora
(3232560085755078826, 83, 84) Biblical
(3232560085755078826, 89, 90) Corpora
(3232560085755078826, 143, 144) Wikipedia
(3232560085755078826, 201, 202) English
(3232560085755078826, 202, 203) Wikipedia


##### Improving with Multi-Word Tokens

In [8]:
# "OP": "+" lets the PROPN token repeat one or more times, so a run of
# consecutive proper nouns can be matched as one multi-token span.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern])

doc = nlp(text)
matches = matcher(doc)

# Each match is (match_id, start, end); doc[start:end] is the matched span.
# Overlapping spans are all reported here, which is why both "English" and
# "English Wikipedia" show up and the count is higher than for the
# single-token pattern.
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

38
(3232560085755078826, 11, 12) NLP
(3232560085755078826, 21, 22) NLP
(3232560085755078826, 23, 24) corpus
(3232560085755078826, 62, 63) corpora
(3232560085755078826, 73, 74) Corpora
(3232560085755078826, 83, 84) Biblical
(3232560085755078826, 89, 90) Corpora
(3232560085755078826, 143, 144) Wikipedia
(3232560085755078826, 201, 202) English
(3232560085755078826, 201, 203) English Wikipedia


##### Greedy Keyword Argument

In [9]:
# Same one-or-more PROPN pattern, but registered with greedy="LONGEST" so
# that overlapping candidates are reduced to the longest span.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)

# As the output shows, the results are no longer in document order:
# the two-token spans are listed before the single-token ones.
print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

32
(3232560085755078826, 201, 203) English Wikipedia
(3232560085755078826, 307, 309) Wikipedia Dump
(3232560085755078826, 341, 343) English Wikipedia
(3232560085755078826, 11, 12) NLP
(3232560085755078826, 21, 22) NLP
(3232560085755078826, 23, 24) corpus
(3232560085755078826, 62, 63) corpora
(3232560085755078826, 73, 74) Corpora
(3232560085755078826, 83, 84) Biblical
(3232560085755078826, 89, 90) Corpora


##### Sorting by Order of Appearance

In [10]:
# Longest-match PROPN spans, as in the greedy example.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")

doc = nlp(text)

# Order the matches by their start token index (element 1 of each tuple)
# so they are listed in the order they appear in the document.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

32
(3232560085755078826, 11, 12) NLP
(3232560085755078826, 21, 22) NLP
(3232560085755078826, 23, 24) corpus
(3232560085755078826, 62, 63) corpora
(3232560085755078826, 73, 74) Corpora
(3232560085755078826, 83, 84) Biblical
(3232560085755078826, 89, 90) Corpora
(3232560085755078826, 143, 144) Wikipedia
(3232560085755078826, 201, 203) English Wikipedia
(3232560085755078826, 214, 215) Install


##### Adding in Sequences

In [11]:
# A two-step sequence: one or more PROPN tokens immediately followed by a
# token tagged VERB. The rule keeps the name "PROPER_NOUNS".
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "PROPN", "OP": "+"},
    {"POS": "VERB"},
]
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")

doc = nlp(text)

# Sort by start token index so matches are listed in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

1
(3232560085755078826, 214, 216) Install gensim


#### Finding Quotes and Speakers