In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp('Hello World!')

In [4]:
doc

Hello World!

In [5]:
for token in doc:
    print(token)

Hello
World
!


In [6]:
pattern = [{"LOWER": "hello", 'OP': '?'},
          {"IS_PUNCT": True, 'OP': '?'},
          {"LOWER": "world"}]

In [7]:
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [8]:
doc = nlp("Hello, World!")

In [9]:
matches = matcher(doc)

In [10]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [11]:
for token in doc:
    print(token)

Hello
,
World
!


In [12]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


### Regular Expression

In [13]:
text = "my phone number is 1256. ohh its wrong! Correct one is 1256348790. Call me!"

In [14]:
import re

In [15]:
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1256348790'>

In [16]:
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1256'>

In [17]:
re.findall(r'\d{4,10}', text)

['1256', '1256348790']

In [18]:
re.findall(r'\w{4,}', text)

['phone', 'number', '1256', 'wrong', 'Correct', '1256348790', 'Call']

### Wildcard Text

In [19]:
re.findall(r'c...', text)

['ct o']

In [20]:
text = "this is cat but no that. i want hat and cat both"

In [21]:
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [22]:
text = 'hi thanks for watching <3'

In [23]:
re.findall(r'\d$', text)

['3']

In [24]:
re.findall(r'^\d', text)

[]

### Exclusion

In [25]:
text

'hi thanks for watching <3'

In [26]:
re.findall(r'[^\d]+', text)

['hi thanks for watching <']

In [27]:
re.findall(r'[^\D]+', text)

['3']

In [28]:
text = "you can get free-videos on kgp-talkie"

In [29]:
re.findall(r'[\w]+-[\w]+', text)

['free-videos', 'kgp-talkie']

### Regular Expression in Spacy

In [37]:
text = "Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google."

In [38]:
text

'Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google.'

In [39]:
pattern = [{'TEXT': 'Google'}, {'TEXT': 'I'}, {'TEXT': '/'}, {'TEXT': 'O'}]

In [40]:
def callback_method(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = doc[start:end]
    print(entity.text)

In [41]:
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [42]:
doc = nlp(text)

In [43]:
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 6, 10), (11578853341595296054, 10, 14)]

### Find Word Google

In [44]:
pattern = [{'TEXT': 'Google'}, {'TEXT': 'I', 'OP': '?'}, {'TEXT': '/', 'OP': '?'}, {'TEXT': 'O', 'OP': '?'}]

In [45]:
def callback_method(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = doc[start:end]
    print(entity.text)

In [46]:
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [47]:
doc = nlp(text)

In [48]:
matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


[(11578853341595296054, 0, 1),
 (11578853341595296054, 6, 7),
 (11578853341595296054, 6, 8),
 (11578853341595296054, 6, 9),
 (11578853341595296054, 6, 10),
 (11578853341595296054, 10, 11),
 (11578853341595296054, 10, 12),
 (11578853341595296054, 10, 13),
 (11578853341595296054, 10, 14),
 (11578853341595296054, 23, 24)]

## Using Linguistic Annotations

In [66]:
matcher = Matcher(nlp.vocab)

In [67]:
matched_sents = []

In [68]:
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}]

In [69]:
def callback_method(matcher, doc, i, matches):
    matched_id, start, end = matches[i]
    span = doc[start:end]
    sent = span.sent
    
    match_ents = [{
        'start': span.start_char - sent.start_char,
        'end': span.end_char - sent.start_char,
        'label': 'MATCH'
    }]
    
    matched_sents.append({'text': sent.text, 'ents': match_ents})

In [70]:
matcher.add("fb", [pattern], on_match=callback_method)

In [71]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [72]:
matches = matcher(doc)

In [73]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [74]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [75]:
displacy.render(matched_sents, style='ent', manual=True)

### Phone Numbers

In [76]:
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "dddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]

In [77]:
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", [pattern])

In [85]:
doc = nlp("Call me at (123) 4560-7890")

In [86]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '-', '7890']


In [87]:
matches = matcher(doc)
matches

[(7978097794922043545, 3, 9)]

In [88]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 4560-7890


###  Email Address Matching

In [89]:
pattern = [{"TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+"}}]

In [90]:
matcher = Matcher(nlp.vocab)
matcher.add("Email", [pattern])

In [94]:
doc = nlp("Yo! my mail is ankithans1947@gmail.com & abc@bh.co.in")

In [95]:
matches = matcher(doc)
matches

[(11010771136823990775, 5, 6), (11010771136823990775, 7, 8)]

In [96]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

ankithans1947@gmail.com
abc@bh.co.in


### Hashtags and emoji Detection on social media

In [98]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
neg_emoji = ["🥲", "😥", "😫", "😔", "😠", "😑"]

In [99]:
pos_emoji

['😀', '😃', '😂', '🤣', '😊', '😍']

In [124]:
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [125]:
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [141]:
neg_patterns

[[{'ORTH': '\U0001f972'}],
 [{'ORTH': '😥'}],
 [{'ORTH': '😫'}],
 [{'ORTH': '😔'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😑'}]]

In [148]:
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == 'HAPPY':
        doc.sentiment += 0.1
        
    elif doc.vocab.strings[match_id] == 'SAD':
        doc.sentiment -= 0.1

In [149]:
matcher = Matcher(nlp.vocab)

In [150]:
matcher.add("HAPPY", [*pos_patterns], on_match=label_sentiment)
matcher.add("SAD", [*neg_patterns], on_match=label_sentiment)

In [151]:
matcher.add("HASHTAG", [[{'TEXT': '#'}, {'IS_ASCII': True}]])

In [152]:
doc = nlp("Hello world 😫 #ankitHans")

In [153]:
matches = matcher(doc)

In [154]:
for match_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span)

SAD 😫
HASHTAG #ankitHans


### Efficient Phrase Matching

In [155]:
from spacy.matcher import PhraseMatcher

In [156]:
matcher = PhraseMatcher(nlp.vocab)

In [165]:
terms = ['BARACK OBAMA', 'ANGELA MERKEL', 'WASHINGTON D.C.']

In [192]:
pattern = [nlp.make_doc(text) for text in terms]

In [193]:
pattern

[BARACK OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [194]:
matcher.add('term', [*pattern])

In [195]:
doc = nlp("German chancellor ANGELA MERKEL and US President BARACK OBAMA "
         "converse in the oval Office inside the White House in WASHINGTON D.C.")

In [196]:
doc

German chancellor ANGELA MERKEL and US President BARACK OBAMA converse in the oval Office inside the White House in WASHINGTON D.C.

In [197]:
matches = matcher(doc)

In [198]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 19, 21)]

In [199]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

ANGELA MERKEL
BARACK OBAMA
WASHINGTON D.C.


### Custom Rule Bases Entity Recognition

In [200]:
from spacy.pipeline import EntityRuler

In [201]:
nlp = spacy.load('en_core_web_sm')

In [202]:
ruler = EntityRuler(nlp)

In [204]:
patterns = [{"label": "ORG", "pattern": "KGP Talkie"},
           {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]

In [206]:
patterns

[{'label': 'ORG', 'pattern': 'KGP Talkie'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [217]:
ruler.add_patterns(patterns)

In [220]:
nlp.add_pipe('entity_ruler')

<spacy.pipeline.entityruler.EntityRuler at 0x192534f3500>

In [221]:
doc = nlp("KGP Talkie is opening its first big office in San Francisco.")

In [222]:
for ent in doc.ents:
    print(ent.text, ent.label_)

KGP Talkie PERSON
first ORDINAL
San Francisco GPE
