In [2]:
import spacy
from spacy.matcher import Matcher

In [3]:
# Load spaCy's "en_core_web_sm" pipeline; its vocab is what the Matcher objects
# in the following cells are constructed with.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Build a matcher on the pipeline's vocabulary and register one single-token
# rule: any token for which LIKE_EMAIL is true.
# The rule name's spelling is kept as-is; it is the string that a later lookup
# of the match ID prints.
matcher = Matcher(nlp.vocab)
pattern = [
    {"LIKE_EMAIL": True},
]
matcher.add("EMAIL_ADRESS", [pattern])

In [5]:
# Run the pipeline on a sample sentence to obtain a Doc.
# NOTE(review): the address below looks like a real one -- consider a
# placeholder such as user@example.com before sharing the notebook.
doc = nlp("This is an email adress: mahtabhaque2587@gmail.com")
# Applying the matcher to the Doc yields a list of (match_id, start, end) tuples.
matches  = matcher(doc)

In [6]:
# Each entry of `matches` is a 3-tuple; its fields are described below.
print(matches)
# index 0: match ID -- the hash of the rule name given to matcher.add()
# index 1: start token index
# index 2: end token index (exclusive)

[(2197859665807148658, 6, 7)]


In [7]:
# The first field of a match tuple is the hash of the rule name. Resolve it
# back to the string through the vocab's string store rather than creating a
# Lexeme object only to read its .text.
match_id = matches[0][0]
print(nlp.vocab.strings[match_id])

EMAIL_ADRESS


In [8]:
# Read the whole sample file into a single string.
# The text contains non-ASCII characters (e.g. an en dash), so the encoding is
# pinned explicitly instead of relying on the platform's default.
with open("data/wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [9]:
# Display the text that was just read from disk.
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famous 

In [10]:
# NOTE(review): this loads the same "en_core_web_sm" pipeline that is already
# bound to `nlp` earlier in the notebook -- presumably redundant; confirm
# before removing.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Single-token rule: every token whose part-of-speech tag is "PROPN"
# (proper noun) is reported as its own match.
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "PROPN"},
]
# The rule name identifies which rule produced a match.
matcher.add("PROPER_NOUN", [pattern])

# The matcher operates on a processed Doc, not on the raw string.
doc = nlp(text)
matches = matcher(doc)

# Total number of matches found.
print(len(matches))

# Preview the first ten matches. Each one is (match_id, start, end):
#   match_id -- hash of the rule name "PROPER_NOUN"
#   start    -- index of the first matched token
#   end      -- index one past the last matched token (exclusive)
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

101
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 49, 50) King


In [12]:
# Multi-token proper nouns: "OP": "+" lets the PROPN token repeat one or more
# times, so a run of consecutive proper nouns can match as one span.
# No greedy filter is set here, so every sub-span of a run is reported too
# ("Martin", "Martin Luther", "Luther", ...).
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "PROPN", "OP": "+"},
]
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

# Preview the first ten (match_id, start, end) tuples with their spans.
for match_id, start, end in matches[:10]:
    span = doc[start:end]
    print((match_id, start, end), span)

174
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [13]:
# Same one-or-more PROPN rule, now registered with greedy="LONGEST" so that
# among overlapping candidates only the longest span is kept.
# NOTE: with this filter the matches are not returned in document order;
# sort by start index if order matters.
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "PROPN", "OP": "+"},
]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

# Preview the first ten (match_id, start, end) tuples with their spans.
for match_id, start, end in matches[:10]:
    span = doc[start:end]
    print((match_id, start, end), span)

60
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 469, 474) Martin Luther King Jr. Day
(451313080118390996, 536, 541) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 128, 132) Southern Christian Leadership Conference
(451313080118390996, 247, 251) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 325, 328) Nobel Peace Prize
(451313080118390996, 422, 425) James Earl Ray
(451313080118390996, 463, 466) Congressional Gold Medal


In [14]:
# Proper-noun run immediately followed by a verb, e.g. "King advanced".
matcher = Matcher(nlp.vocab)
pattern = [
    {"POS": "PROPN", "OP": "+"},  # one or more proper nouns ...
    {"POS": "VERB"},              # ... directly followed by a verb token
]
# greedy="LONGEST" keeps only the longest of any overlapping candidates.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
# Order the matches by start token index so they read in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))

for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

7
(451313080118390996, 49, 51) King advanced
(451313080118390996, 89, 91) King participated
(451313080118390996, 113, 115) King led
(451313080118390996, 167, 169) King helped
(451313080118390996, 247, 252) Director J. Edgar Hoover considered
(451313080118390996, 322, 324) King won
(451313080118390996, 485, 488) United States beginning


In [15]:
import json

# Parse the JSON file into Python objects.
# JSON is conventionally UTF-8 (RFC 8259), so the encoding is pinned explicitly
# instead of relying on the platform's default text encoding.
with open("data/alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [16]:
# Use the first entry of data[0][2] as the working sample.
# NOTE(review): presumably data[0][2] is a list of paragraph strings --
# confirm against the layout of the JSON file.
text = data[0][2][0]
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


In [17]:
# a cleaner text:
text = text.replace("`","'")
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [None]:
# Lemmas of the reporting verbs that may sit between a quotation and its speaker.
speak_lemmas = ["think", "say"]

matcher = Matcher(nlp.vocab)

# Rule for an interrupted quotation:
#     'quote' <reporting verb> <speaker> 'quote'
# A quoted segment is modelled as: an opening ', one or more alphabetic
# tokens, zero or more punctuation tokens, and a closing '. Because the
# punctuation slot comes after the alphabetic run, a quote with punctuation
# between its words does not fit this rule.
pattern = [
    # --- first quoted segment ---
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
    # --- reporting verb: a VERB whose lemma is one of speak_lemmas ---
    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
    # --- speaker: one or more proper-noun tokens ---
    {"POS": "PROPN", "OP": "+"},
    # --- second quoted segment ---
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
]

# Register the rule; greedy="LONGEST" keeps only the longest of any
# overlapping candidates.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)

# Order the (match_id, start, end) tuples by start token index.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))

# Show at most the first ten matches together with the matched span.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [23]:
# Run the matcher currently bound to `matcher` over every entry of data[0][2],
# printing the match count and up to ten matches for each entry.
for raw_text in data[0][2]:
    # Backticks are replaced with apostrophes before the text is processed.
    text = raw_text.replace("`","'")
    doc = nlp(text)
    matches = matcher(doc)
    print(len(matches))
    # Order by start token index so matches read in document order.
    matches = sorted(matches, key=lambda m: m[1])

    for match_id, start, end in matches[:10]:
        print((match_id, start, end), doc[start:end])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [24]:

# Reusable pattern fragments (each is a list of token specs).
# A quoted segment: opening ', one or more alphabetic tokens, optional
# punctuation tokens, closing '.
quote = [
    {'ORTH': "'"},
    {'IS_ALPHA': True, "OP": "+"},
    {'IS_PUNCT': True, "OP": "*"},
    {'ORTH': "'"},
]
# A reporting verb whose lemma is one of speak_lemmas.
speak_verb = [{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}]
# The speaker: one or more proper-noun tokens.
speaker = [{"POS": "PROPN", "OP": "+"}]

matcher = Matcher(nlp.vocab)
pattern1 = quote + speak_verb + speaker + quote  # 'quote' verb Speaker 'quote'
pattern2 = quote + speak_verb + speaker          # 'quote' verb Speaker
pattern3 = speaker + speak_verb + quote          # Speaker verb 'quote'
# All three variants share one rule name; greedy='LONGEST' keeps only the
# longest of any overlapping candidates.
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')

for raw_text in data[0][2]:
    # Backticks are replaced with apostrophes before the text is processed.
    text = raw_text.replace("`", "'")
    doc = nlp(text)
    # Order by start token index so matches read in document order.
    matches = sorted(matcher(doc), key=lambda m: m[1])
    print(len(matches))
    for match_id, start, end in matches[:10]:
        print((match_id, start, end), doc[start:end])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0
