In [1]:
import spacy

In [2]:
from spacy.matcher import Matcher

In [3]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])
doc = nlp("This is an email address: wgemmalemma@domain.com")
matches = matcher(doc)

In [8]:
print("Matches found:", matches) # Lexeme, start token, end token

Matches found: [(16571425990740197027, 6, 7)]


In [6]:
print(nlp.vocab[matches[0][0]].text)  


EMAIL_ADDRESS


In [7]:
print(nlp.vocab[matches[0][1]].text)  # Print the matched text

IS_SPACE


In [9]:
print(nlp.vocab[matches[0][2]].text)  

IS_TITLE


# Grabbing all Proper Nouns

In [10]:
with open("data/wiki_mlk.txt", "r") as file:
    text = file.read()

In [11]:
nlp = spacy.load("en_core_web_sm")


In [12]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)


In [13]:
print("Matches found:", len(matches))

Matches found: 101


In [15]:
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 49, 50) King


# Improving it with multi word tokens

In [16]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern])
doc = nlp(text)
matches = matcher(doc)

In [17]:
print("Matches found:", len(matches))

Matches found: 174


In [18]:
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 0, 1) Martin
(3232560085755078826, 0, 2) Martin Luther
(3232560085755078826, 1, 2) Luther
(3232560085755078826, 0, 3) Martin Luther King
(3232560085755078826, 1, 3) Luther King
(3232560085755078826, 2, 3) King
(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 1, 4) Luther King Jr.
(3232560085755078826, 2, 4) King Jr.
(3232560085755078826, 3, 4) Jr.


Too many repetitions

# Greedy Keyword Argument

In [19]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)

In [20]:
print("Matches found:", len(matches))

Matches found: 60


In [21]:
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 83, 88) Martin Luther King Sr.
(3232560085755078826, 469, 474) Martin Luther King Jr. Day
(3232560085755078826, 536, 541) Martin Luther King Jr. Memorial
(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 128, 132) Southern Christian Leadership Conference
(3232560085755078826, 247, 251) Director J. Edgar Hoover
(3232560085755078826, 6, 9) Michael King Jr.
(3232560085755078826, 325, 328) Nobel Peace Prize
(3232560085755078826, 422, 425) James Earl Ray
(3232560085755078826, 463, 466) Congressional Gold Medal


# Sorting it to appearance

In [22]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])

In [23]:
print("Matches found:", len(matches))

Matches found: 60


In [24]:
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 6, 9) Michael King Jr.
(3232560085755078826, 10, 11) January
(3232560085755078826, 15, 16) April
(3232560085755078826, 49, 50) King
(3232560085755078826, 69, 71) Mahatma Gandhi
(3232560085755078826, 83, 88) Martin Luther King Sr.
(3232560085755078826, 89, 90) King
(3232560085755078826, 113, 114) King
(3232560085755078826, 117, 118) Montgomery


# Adding in sequences

In [25]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB"}]
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])

In [26]:
print("Matches found:", len(matches))

Matches found: 7


In [27]:
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 49, 51) King advanced
(3232560085755078826, 89, 91) King participated
(3232560085755078826, 113, 115) King led
(3232560085755078826, 167, 169) King helped
(3232560085755078826, 247, 252) Director J. Edgar Hoover considered
(3232560085755078826, 322, 324) King won
(3232560085755078826, 485, 488) United States beginning


# Finding quotes and speakers

In [28]:
import json

In [29]:
with open('data/alice.json') as f:
    data = json.load(f)

In [30]:
text = data[0][2][0]
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


Replacing non standard quotation:

In [31]:
text = data[0][2][0].replace("`", "'")
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [32]:
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "'"}, 
{"IS_ALPHA": True, "OP": "+"}, 
{"IS_PUNCT": True, "OP": "*"}, 
{"ORTH": "'"}]

matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])

In [33]:
print("Matches found:", len(matches))

Matches found: 2


In [34]:
for match in matches:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 47, 58) 'and what is the use of a book,'
(3232560085755078826, 60, 67) 'without pictures or conversation?'


## Find Speaker

In [35]:
speak_lemmas = ["think", "say"]
text = data[0][2][0].replace("`", "'")
matcher = Matcher(nlp.vocab)
pattern1 = [{"ORTH": "'"},
{"IS_ALPHA": True, "OP": "+"},
{"IS_PUNCT": True, "OP": "*"},
{"ORTH": "'"},
{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
{"POS": "PROPN", "OP": "+"},
{"ORTH": "'"},
{"IS_ALPHA": True, "OP": "+"},
{"IS_PUNCT": True, "OP": "*"},
{"ORTH": "'"}]

In [36]:
matcher.add("PROPER_NOUNS", [pattern1], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])

In [37]:
print("Matches found:", len(matches))

Matches found: 1


In [38]:
for match in matches:
    print(match, doc[match[1]:match[2]])

(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


## Problem

In [39]:
# over the whole chapter
for text in data[0][2]:
    text = text.replace("`", "'")
    doc = nlp(text)
    matches = matcher(doc)
    matches.sort(key = lambda x: x[1])
    print(len(matches))
    for match in matches[:10]:
        print(match, doc[match[1]:match[2]])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


?????

## Adding more patterns

In [40]:
speak_lemmas = ["think", "say"]
text = data[0][2][0].replace( "`", "'")
matcher = Matcher(nlp.vocab)

In [41]:
pattern1 = [{'ORTH': "'"}, 
{'IS_ALPHA': True, "OP": "+"}, 
{'IS_PUNCT': True, "OP": "*"}, 
{'ORTH': "'"}, 
{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, 
{"POS": "PROPN", "OP": "+"}, 
{'ORTH': "'"}, 
{'IS_ALPHA': True, "OP": "+"}, 
{'IS_PUNCT': True, "OP": "*"}, 
{'ORTH': "'"}]


In [42]:
pattern2 = [{'ORTH': "'"}, 
{'IS_ALPHA': True, "OP": "+"}, 
{'IS_PUNCT': True, "OP": "*"}, 
{'ORTH': "'"}, 
{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, 
{"POS": "PROPN", "OP": "+"}]


In [43]:
pattern3 = [{"POS": "PROPN", "OP": "+"},
{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, 
{'ORTH': "'"}, 
{'IS_ALPHA': True, "OP": "+"}, 
{'IS_PUNCT': True, "OP": "*"}, 
{'ORTH': "'"}]


In [44]:
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')

In [45]:
for text in data[0][2]:
    text = text.replace("`", "'")
    doc = nlp(text)
    matches = matcher(doc)
    matches.sort(key = lambda x: x[1])
    print(len(matches))
    for match in matches[:10]:
        print(match, doc[match[1]:match[2]])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0


This is where being a domain expert and knowing the text comes into play.