#### Installation and imports

In [3]:
conda install -c conda-forge spacy


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Download the small English pipeline that spacy.load('en_core_web_sm') expects below.
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


You should consider upgrading via the 'D:\Anaconda\python.exe -m pip install --upgrade pip' command.


In [5]:
pip install spacy-lookup

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'D:\Anaconda\python.exe -m pip install --upgrade pip' command.


### Matching

In [6]:
# Rule-based matching: finds and returns the matched spans, and also gives access to the tokens within the doc and their relationships.

In [7]:
# Token-based matching: rules operate on individual tokens; custom callbacks can also be attached to a rule.

### code

In [8]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
doc = nlp('Hello World!')

In [11]:
doc

Hello World!

In [12]:
for token in doc:
    print(token)

Hello
World
!


In [13]:
# Token-based pattern, one dict per token: an optional token whose lower-cased
# text is "hello", an optional punctuation token, then a required token whose
# lower-cased text is "world".
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [14]:
# Build a fresh matcher and register the pattern under the key 'HelloWorld'.
# Patterns go in a list as the second argument: this is the only form spaCy v3
# accepts and it is also supported from v2.2.2 onwards, whereas the older
# add(key, on_match, *patterns) call is rejected by v3.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [15]:
doc = nlp('Hello, World!')

In [16]:
matches = matcher(doc)

In [17]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [18]:
# For every match show: the hash id, the rule name it resolves to in the
# vocabulary's string store, the token boundaries, and the matched text.
for match_id, start, end in matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


### Regular Expressions

In [19]:
text = "my phone number is 1234. Ohh its wrong! Correct one is 1234678963. Okay, call me"

In [20]:
import re
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1234678963'>

In [21]:
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1234'>

In [22]:
# Every run of 4 to 10 consecutive digits (the quantifier is greedy).
re.compile(r'\d{4,10}').findall(text)

['1234', '1234678963']

In [23]:
# Every run of at least four word characters (\w also matches digits and
# underscore, which is why '1234' shows up in the result).
re.compile(r'\w{4,}').findall(text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234678963', 'Okay', 'call']

#### Wildcard text

In [24]:
re.findall(r'c....', text)

['ct on', 'call ']

In [25]:
text = "This is a cat, but that is not a cat. The hat fell on the cat."

In [26]:
re.findall(r'.a.',text)

[' a ', 'cat', 'hat', ' a ', 'cat', 'hat', 'cat']

In [27]:
text = 'hi thanks, it was a pleasure talking to you too <3'

In [28]:
re.findall(r'\d$', text)

['3']

In [29]:
re.findall(r'[^\d]',text)

['h',
 'i',
 ' ',
 't',
 'h',
 'a',
 'n',
 'k',
 's',
 ',',
 ' ',
 'i',
 't',
 ' ',
 'w',
 'a',
 's',
 ' ',
 'a',
 ' ',
 'p',
 'l',
 'e',
 'a',
 's',
 'u',
 'r',
 'e',
 ' ',
 't',
 'a',
 'l',
 'k',
 'i',
 'n',
 'g',
 ' ',
 't',
 'o',
 ' ',
 'y',
 'o',
 'u',
 ' ',
 't',
 'o',
 'o',
 ' ',
 '<']

In [30]:
text = "you can learn anything free-of-cost these days"

In [31]:
re.findall(r'[\w]+-[\w]+-[\w]+',text)

['free-of-cost']

#### Regex in spaCy

In [32]:
text = "Google announced a new Pixel phone at Google I/O Google I/O is a great place to get all updates from Google."

In [33]:
text

'Google announced a new Pixel phone at Google I/O Google I/O is a great place to get all updates from Google.'

In [34]:
# A pattern is a list of dicts, one per token. "Google I/O" is matched as four
# tokens ("Google", "I", "/", "O"), each compared on its exact text.
pattern = [{'TEXT': word} for word in ('Google', 'I', '/', 'O')]

In [74]:
# Now with a callback attached to the rule.
def callback_method(matcher, doc, i, matches):
    """Print the text of the i-th match.

    Args:
        matcher: the matcher invoking the callback (not used here).
        doc: the object being matched; sliced by token indices.
        i: index of the current match inside ``matches``.
        matches: sequence of ``(match_id, start, end)`` triples.
    """
    _, first, last = matches[i]
    print(doc[first:last].text)

In [36]:
# Register the "Google I/O" pattern and have callback_method run on every hit.
# Patterns are passed as a list and the callback as the on_match keyword: this
# is the call form spaCy v3 requires and v2.2.2+ also accepts; the older
# add(key, on_match, *patterns) form fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [37]:
doc = nlp(text)

In [38]:
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 7, 11), (11578853341595296054, 11, 15)]

## Using Linguistic Annotations

#### Matched Entities 

In [39]:
# Find phrases such as "Facebook is ..." or "Facebook was ..."

In [40]:
matcher = Matcher(nlp.vocab)

In [41]:
matched_sents = []

In [42]:
# "facebook" in any casing, a token whose lemma is "be", zero or more adverbs,
# then an adjective.
pattern = [
    {"LOWER": "facebook"},
    {"LEMMA": "be"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]

In [43]:
def callback_method_fb(matcher, doc, i, matches):
    """Record the sentence containing the i-th match in ``matched_sents``.

    Appends one dict to the module-level ``matched_sents`` list with the
    sentence text under 'text' and, under 'ents', a single entry labelled
    'MATCH' whose 'start'/'end' are character offsets of the matched span
    measured from the start of that sentence.

    Args:
        matcher: the matcher invoking the callback (not used here).
        doc: the object being matched; sliced by token indices.
        i: index of the current match inside ``matches``.
        matches: sequence of ``(match_id, start, end)`` triples.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent

    # Offsets are shifted so they index into the sentence text, not the whole doc.
    highlighted = [
        {
            'start': hit.start_char - sentence.start_char,
            'end': hit.end_char - sentence.start_char,
            'label': 'MATCH',
        }
    ]

    matched_sents.append({'text': sentence.text, 'ents': highlighted})

In [44]:
# Register the pattern with its callback. List-of-patterns plus on_match keyword
# is the form required by spaCy v3 and accepted from v2.2.2; the older
# add(key, on_match, *patterns) form fails on v3.
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [45]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [46]:
matches = matcher(doc)

In [47]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [48]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [49]:
# Highlight the matched span inside each collected sentence using displaCy's
# entity style; manual=True because matched_sents holds ready-made dicts
# ('text' + 'ents') rather than Doc objects.
displacy.render(matched_sents, style = 'ent', manual = True)

### Extracting Phone Numbers

In [50]:
# Phone number laid out as "(ddd) dddd[-]dddd": literal parentheses around a
# token of shape "ddd", then two tokens of shape "dddd" with an optional
# hyphen token between them.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "dddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]

In [51]:
# Fresh matcher holding only the phone-number rule.
# Patterns are passed as a list (second argument): required by spaCy v3 and
# accepted from v2.2.2; the older add(key, None, pattern) form fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", [pattern])

In [52]:
doc = nlp ("Call me at (123) 4516 7890")

In [53]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4516', '7890']


In [54]:
matches = matcher(doc)
matches

[(7978097794922043545, 3, 8)]

In [55]:
# Print the text covered by each match.
for match_id, start, end in matches:
    print(doc[start:end].text)

(123) 4516 7890


#### Email address matching

In [56]:
# A single token whose text satisfies an "<local part>@<domain>" regular
# expression built from letters, digits, '-', '_' and '.'.
pattern = [
    {
        "TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+"},
    },
]

In [57]:
# Fresh matcher holding only the e-mail rule.
# Patterns are passed as a list (second argument): required by spaCy v3 and
# accepted from v2.2.2; the older add(key, None, pattern) form fails on v3.
matcher = Matcher(nlp.vocab)
matcher.add("Email", [pattern])

In [58]:
text = "Email me at email2me@anmolpant.com and talk.me@gmail.com"

In [59]:
doc = nlp(text)

In [60]:
matches = matcher(doc)

In [61]:
matches

[(11010771136823990775, 3, 4), (11010771136823990775, 5, 6)]

In [62]:
# Print the text covered by each match.
for match_id, start, end in matches:
    print(doc[start:end].text)

email2me@anmolpant.com
talk.me@gmail.com


### Hashtags and Emoji detection in social media

In [63]:
# Symbols counted as positive / negative sentiment markers.
# NOTE(review): these literals look like UTF-8 emoji that were decoded with a
# legacy 8-bit codec - confirm the file encoding. They are left untouched here
# because the test document further down uses the same byte sequences.
pos_emoji = [
    "üòÇ",
    "üòç",
    "üòò",
    "üòä",
    "üòè",
    "üòÑ",
]
neg_emoji = [
    "üò¢",
    "üò°",
    "üòî",
    "üòí",
    "üò©",
    "üò≠",
]

In [64]:
pos_emoji

['üòÇ', 'üòç', 'üòò', 'üòä', 'üòè', 'üòÑ']

In [65]:
#add patterns
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [66]:
pos_patterns

[[{'ORTH': 'üòÇ'}],
 [{'ORTH': 'üòç'}],
 [{'ORTH': 'üòò'}],
 [{'ORTH': 'üòä'}],
 [{'ORTH': 'üòè'}],
 [{'ORTH': 'üòÑ'}]]

In [67]:
neg_patterns

[[{'ORTH': 'üò¢'}],
 [{'ORTH': 'üò°'}],
 [{'ORTH': 'üòî'}],
 [{'ORTH': 'üòí'}],
 [{'ORTH': 'üò©'}],
 [{'ORTH': 'üò≠'}]]

In [68]:
def label_sentiment(matcher, doc, i, matches):
    """Nudge ``doc.sentiment`` according to which rule produced the i-th match.

    The match id is resolved to its rule name through ``doc.vocab.strings``:
    'HAPPY' raises the sentiment by 0.1, 'SAD' lowers it by 0.1, and any other
    rule leaves it unchanged.

    Args:
        matcher: the matcher invoking the callback (not used here).
        doc: the object being matched; its ``sentiment`` attribute is updated in place.
        i: index of the current match inside ``matches``.
        matches: sequence of ``(match_id, start, end)`` triples.
    """
    match_id, _, _ = matches[i]
    adjustment = {'HAPPY': 0.1, 'SAD': -0.1}.get(doc.vocab.strings[match_id])
    if adjustment is not None:
        doc.sentiment += adjustment

In [69]:
matcher = Matcher(nlp.vocab)

In [70]:
# Each rule gets its whole list of single-emoji patterns, and label_sentiment
# runs on every hit. List-of-patterns plus on_match keyword is the form
# required by spaCy v3 and accepted from v2.2.2; the older
# add(key, on_match, *patterns) form fails on v3.
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
matcher.add("SAD", neg_patterns, on_match=label_sentiment)

In [71]:
# Hashtag rule: a '#' token followed by any ASCII token; no callback.
# The single pattern is wrapped in a list (second argument): required by
# spaCy v3 and accepted from v2.2.2; the older add(key, None, pattern) form
# fails on v3.
matcher.add('HASHTAG', [[{'TEXT': '#'}, {'IS_ASCII': True}]])

In [72]:
doc = nlp("Hello world üòä #anmolpant")

In [73]:
matches = matcher(doc)

In [76]:
# Print the rule name (resolved from the hash id) next to the matched text.
for match_id, start, end in matches:
    print(doc.vocab.strings[match_id], doc[start:end].text)

HAPPY üòä
HASHTAG #anmolpant


### Efficient phrase matching

In [77]:
from spacy.matcher import PhraseMatcher

In [78]:
matcher = PhraseMatcher(nlp.vocab)

In [85]:
terms = ['Barack Obama', "Prince Charles", "Donald Trump", "Washington D.C."]

In [86]:
pattern = [nlp.make_doc(text) for text in terms]

In [87]:
pattern

[Barack Obama, Prince Charles, Donald Trump, Washington D.C.]

In [88]:
# `pattern` is already a list of Doc objects, so it is passed directly as the
# second argument. This is the form required by spaCy v3 and accepted from
# v2.2.2; the older add(key, None, *docs) form fails on v3.
matcher.add('term', pattern)

In [89]:
doc = nlp ("After the retirement of Barack Obama, Donald Trump met Prince Charles in Washington D.C.")

In [90]:
matches = matcher(doc)

In [93]:
# Print the text covered by each match.
for match_id, start, end in matches:
    print(doc[start:end].text)

Barack Obama
Donald Trump
Prince Charles
Washington D.C.
