In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
# Load the small English pipeline by package name; the 'en_core_web_sm'
# model package must already be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp('Hello World!')

In [4]:
doc

Hello World!

In [5]:
# Iterating over a Doc yields its tokens one by one; note that the trailing
# '!' comes out as a token of its own rather than being attached to 'World'.
for token in doc:
    print(token)

Hello
World
!


In [6]:
# Token pattern, one dict per token position:
#   1. an optional token whose lower-cased text is "hello"
#   2. an optional punctuation token
#   3. a required token whose lower-cased text is "world"
# 'OP': '?' marks a position as optional (zero or one occurrence).
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [7]:
# Build a matcher on the pipeline's vocabulary, then register the pattern
# under the key 'HelloWorld'. The second argument to add() is a list of
# patterns, which is why the single pattern is wrapped in a list.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [8]:
doc = nlp("Hello, World!")

In [9]:
# Calling the matcher on a Doc returns a list of (match_id, start, end)
# tuples. Because the first two pattern positions are optional, the same
# text can be reported as several overlapping matches.
matches = matcher(doc)

In [10]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [11]:
# Show how the sentence was tokenized: the comma and the '!' are separate
# tokens, so start/end offsets in the matches count them individually.
for token in doc:
    print(token)

Hello
,
World
!


In [12]:
# Each match is (match_id, start, end): match_id is the integer hash of the
# rule key, and start/end are token offsets into doc (end is exclusive).
for match_id, start, end in matches:
    # Look the hash up in the string store to recover the rule name.
    string_id = nlp.vocab.strings[match_id]
    # Slice the Doc by token offsets to get the matched span.
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


### Regular Expression

In [13]:
text = "my phone number is 1256. ohh its wrong! Correct one is 1256348790. Call me!"

In [14]:
import re

In [15]:
# search() returns only the first match. \d{10} needs ten consecutive
# digits, so the 4-digit '1256' earlier in the text cannot satisfy it.
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1256348790'>

In [16]:
# \d{4} is satisfied by the first run of four digits, which here is the
# standalone '1256' (the match object reports its character span).
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1256'>

In [17]:
# {4,10} is greedy: each match takes as many consecutive digits as are
# available, from a minimum of 4 up to a maximum of 10.
re.findall(r'\d{4,10}', text)

['1256', '1256348790']

In [18]:
# \w covers letters, digits and underscore, so the two numbers are returned
# alongside the words; runs shorter than 4 characters ('my', 'is', 'ohh') are dropped.
re.findall(r'\w{4,}', text)

['phone', 'number', '1256', 'wrong', 'Correct', '1256348790', 'Call']

### Wildcard Text

In [19]:
# Each '.' matches any single character except a newline, spaces included.
# The literal 'c' is case-sensitive, so the capital 'C' in 'Correct' and
# 'Call' does not start a match; only the lower-case 'c' inside 'Correct' does.
re.findall(r'c...', text)

['ct o']

In [20]:
text = "this is cat but no that. i want hat and cat both"

In [21]:
# Any character, then 'a', then any character. findall() returns
# non-overlapping matches scanned left to right, and '.' may match a space,
# which is why fragments such as 'wan' and ' an' appear in the result.
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [22]:
text = 'hi thanks for watching <3'

In [23]:
# '$' anchors the match to the end of the string (or just before a trailing
# newline), so this finds a digit only when it is the final character.
re.findall(r'\d$', text)

['3']

In [24]:
# '^' anchors the match to the start of the string; text begins with 'h',
# not a digit, so the result is an empty list.
re.findall(r'^\d', text)

[]

### Exclusion

In [25]:
text

'hi thanks for watching <3'

In [26]:
# Inside [...], a leading '^' negates the class: this matches runs of one or
# more characters that are NOT digits.
re.findall(r'[^\d]+', text)

['hi thanks for watching <']

In [27]:
# Double negation: \D means "not a digit", so [^\D] means "not a non-digit",
# which is equivalent to \d. The result is therefore the runs of digits.
re.findall(r'[^\D]+', text)

['3']

In [28]:
text = "you can get free-videos on kgp-talkie"

In [29]:
# Hyphenated compounds: word characters joined by one or more hyphens.
# Repeating the "-word" group keeps multi-hyphen compounds such as
# "state-of-the-art" whole instead of splitting them into two-part pieces.
# The group is non-capturing so findall() still returns the full match text.
re.findall(r'\w+(?:-\w+)+', text)

['free-videos', 'kgp-talkie']

### Regular Expression in spaCy

In [37]:
text = "Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google."

In [38]:
text

'Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google.'

In [39]:
# "Google I/O" is tokenized into four tokens, so the pattern needs one
# exact-text ('TEXT') constraint per token, in order.
pattern = [{'TEXT': piece} for piece in ('Google', 'I', '/', 'O')]

In [40]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the span for the match currently being reported.

    Intended to be passed as ``on_match`` when registering a pattern, so it
    receives the four positional arguments below.

    Args:
        matcher: The matcher that produced the match (not used here).
        doc: The matched document; sliced by token offsets to get the span.
        i: Index into ``matches`` of the match being reported.
        matches: Sequence of ``(match_id, start, end)`` tuples.

    Returns:
        None. The span text is written to stdout.
    """
    # Only the token offsets are needed; the match id is deliberately ignored.
    _, start, end = matches[i]
    print(doc[start:end].text)

In [41]:
# Start from a fresh matcher so only the 'Google' rule is registered.
# on_match supplies a callback that the matcher invokes for each match found.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [42]:
doc = nlp(text)

In [43]:
# Running the matcher fires the on_match callback once per match (the printed
# lines) and also returns the list of (match_id, start, end) tuples, which
# the notebook displays as the cell's value.
matcher(doc)

Google I/O
Google I/O


[(11578853341595296054, 6, 10), (11578853341595296054, 10, 14)]

### 