#### Installation and imports

In [1]:
conda install -c conda-forge spacy


Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys

# Run the download with the kernel's own interpreter. A bare `python` is looked
# up on PATH and may belong to a different environment than this notebook, in
# which case the model would not be importable from here.
!"{sys.executable}" -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


You should consider upgrading via the 'D:\Anaconda\python.exe -m pip install --upgrade pip' command.


In [3]:
# Explicit %pip magic: installs into the environment of the running kernel
# and does not depend on IPython's automagic setting being enabled.
%pip install spacy-lookup

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'D:\Anaconda\python.exe -m pip install --upgrade pip' command.


### Matching

In [4]:
# Rule-based matching: finds and returns the matched components, and also gives access to the tokens within the doc and their relationships.

In [5]:
# Token-based matching: rules that operate on individual tokens; it also allows custom callbacks.

### code

In [6]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [7]:
nlp = spacy.load('en_core_web_sm')

In [8]:
doc = nlp('Hello World!')

In [9]:
doc

Hello World!

In [10]:
for token in doc:
    print(token)

Hello
World
!


In [11]:
# Token-based matching: the pattern is a list with one dict per token, in order.
# 'OP': '?' marks a token as optional (it may appear zero or one time).
pattern = [
    {'LOWER': 'hello', 'OP': '?'},  # optional token whose lowercase form is "hello"
    {'IS_PUNCT': True, 'OP': '?'},  # optional punctuation token
    {'LOWER': 'world'},             # required token whose lowercase form is "world"
]

In [12]:
matcher = Matcher(nlp.vocab)
# Pass the patterns as a list. This is the signature spaCy v3 expects; the
# older add(key, on_match, *patterns) call used before fails there. According
# to the spaCy docs the list form is also accepted from v2.2.2 onwards.
matcher.add('HelloWorld', [pattern])

In [13]:
doc = nlp('Hello, World!')

In [14]:
matches = matcher(doc)

In [15]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [16]:
# Each match is a (match_id, start, end) tuple. match_id is the hash of the
# rule name; start and end are token indices into doc, with end exclusive.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # map the hash back to the rule name
    span = doc[start:end]  # the tokens covered by this match
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


### Regular Expressions

In [17]:
text = "my phone number is 1234. Ohh its wrong! Correct one is 1234678963. Okay, call me"

In [18]:
import re
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1234678963'>

In [19]:
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1234'>

In [20]:
#between 4-10 digits
re.findall(r'\d{4,10}', text)

['1234', '1234678963']

In [21]:
# at least 4 consecutive word characters (letters, digits or underscore)
re.findall(r'\w{4,}', text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234678963', 'Okay', 'call']

#### Wildcard text

In [22]:
re.findall(r'c....', text)

['ct on', 'call ']

In [23]:
text = "This is a cat, but that is not a cat. The hat fell on the cat."

In [24]:
re.findall(r'.a.',text)

[' a ', 'cat', 'hat', ' a ', 'cat', 'hat', 'cat']

In [25]:
text = 'hi thanks, it was a pleasure talking to you too <3'

In [26]:
re.findall(r'\d$', text)

['3']

In [27]:
re.findall(r'[^\d]',text)

['h',
 'i',
 ' ',
 't',
 'h',
 'a',
 'n',
 'k',
 's',
 ',',
 ' ',
 'i',
 't',
 ' ',
 'w',
 'a',
 's',
 ' ',
 'a',
 ' ',
 'p',
 'l',
 'e',
 'a',
 's',
 'u',
 'r',
 'e',
 ' ',
 't',
 'a',
 'l',
 'k',
 'i',
 'n',
 'g',
 ' ',
 't',
 'o',
 ' ',
 'y',
 'o',
 'u',
 ' ',
 't',
 'o',
 'o',
 ' ',
 '<']

In [28]:
text = "you can learn anything free-of-cost these days"

In [29]:
re.findall(r'[\w]+-[\w]+-[\w]+',text)

['free-of-cost']

#### Regex in spaCy

In [30]:
text = "Google announced a new Pixel phone at Google I/O Google I/O is a great place to get all updates from Google."

In [31]:
text

'Google announced a new Pixel phone at Google I/O Google I/O is a great place to get all updates from Google.'

In [32]:
# A pattern is a list of dicts, one per token. "Google I/O" is tokenized
# into four tokens, so the pattern needs four entries.
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I'},
    {'TEXT': '/'},
    {'TEXT': 'O'},
]

In [33]:
# Now the same kind of matching, this time with a callback.
def callback_method(matcher, doc, i, matches):
    """Print the text of the match at position ``i``.

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The document that was matched; sliced by token index.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first, last = matches[i]
    print(doc[first:last].text)

In [34]:
matcher = Matcher(nlp.vocab)
# Patterns go in a list and the callback is passed by keyword. This is the
# signature spaCy v3 expects; the older add(key, on_match, *patterns) call used
# before fails there. According to the spaCy docs this form is also accepted
# from v2.2.2 onwards.
matcher.add('Google', [pattern], on_match=callback_method)

In [35]:
doc = nlp(text)

In [36]:
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 7, 11), (11578853341595296054, 11, 15)]

## Using Linguistic Annotations

#### Matched Entities 

In [37]:
# Goal: find phrases such as "Facebook is ..." / "Facebook was ..."

In [38]:
matcher = Matcher(nlp.vocab)

In [41]:
matched_sents = []

In [42]:
# "facebook" + a form of "be" + any number of adverbs + an adjective
pattern = [
    {"LOWER": "facebook"},      # token whose lowercase form is "facebook"
    {"LEMMA": "be"},            # token whose lemma is "be"
    {"POS": "ADV", "OP": "*"},  # zero or more adverbs
    {"POS": "ADJ"},             # an adjective
]

In [43]:
def callback_method_fb(matcher, doc, i, matches):
    """Record the sentence containing match ``i`` together with the match offsets.

    Appends one dict to the module-level ``matched_sents`` list with the
    sentence text under ``'text'`` and a single-entry ``'ents'`` list that
    marks where the match sits inside that sentence.

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The document that was matched; sliced by token index.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, first, last = matches[i]
    hit = doc[first:last]
    sentence = hit.sent

    # Offsets are made relative to the sentence start, because only the
    # sentence text (not the whole document) is stored alongside them.
    offset = sentence.start_char
    highlight = {
        'start': hit.start_char - offset,
        'end': hit.end_char - offset,
        'label': 'MATCH',
    }

    matched_sents.append({'text': sentence.text, 'ents': [highlight]})

In [44]:
# Patterns go in a list and the callback is passed by keyword. This is the
# signature spaCy v3 expects; the older add(key, on_match, *patterns) call used
# before fails there. According to the spaCy docs this form is also accepted
# from v2.2.2 onwards.
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [45]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [46]:
matches = matcher(doc)

In [47]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [48]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [49]:
# Highlight each match inside its sentence using displaCy's manual entity mode
displacy.render(matched_sents, style = 'ent', manual = True)

### Extracting Phone Numbers