In [16]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [17]:
# Load the small English pipeline. The model package must already be installed,
# e.g. via `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm')

In [18]:
doc = nlp('Hello World!')

In [19]:
doc

Hello World!

In [20]:
# Iterating over a Doc yields its tokens one at a time; punctuation is a token too.
for token in doc:
    print(token)

Hello
World
!


In [75]:
# Token pattern: an optional "hello", an optional punctuation token, then a
# required "world". LOWER compares against the lowercased token text, and
# 'OP': '?' marks a token as optional (zero or one occurrence). Because the
# first two tokens are optional, every variant ending at "world" is reported.
pattern = [
    {'LOWER': 'hello', 'OP': '?'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'world'},
]

In [76]:
# The matcher is built on the pipeline's vocab; the pattern is registered
# under the rule name 'HelloWorld' (add takes a list of patterns per rule).
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [87]:
doc = nlp("Hello, World!")

In [88]:
# Each match is a (match_id, start, end) tuple; start/end are token offsets into doc.
matches = matcher(doc)

In [89]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [90]:
# Show the tokenization of the new doc: the comma is its own token, which is
# what the IS_PUNCT entry of the pattern can match.
for token in doc:
    print(token)

Hello
,
World
!


In [91]:
# Report every match: its hash, rule name, token offsets, and matched text.
for match_id, start, end in matches:
    # match_id is a hash; the vocab's string store maps it back to the rule name.
    string_id = nlp.vocab.strings[match_id]
    # start/end are token offsets, so slicing the Doc yields the matched span.
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


### Regular Expression

In [92]:
text = "my phone number is 1256. ohh its wrong! Correct one is 1256348790. Call me!"

In [93]:
import re

In [95]:
# \d{10} matches ten consecutive digits; re.search returns the first match only.
re.search(r'\d{10}', text)

<re.Match object; span=(55, 65), match='1256348790'>

In [97]:
# Four consecutive digits; re.search stops at the first hit, so the earliest run wins.
re.search(r'\d{4}', text)

<re.Match object; span=(19, 23), match='1256'>

In [103]:
# findall returns every non-overlapping match; {4,10} is greedy, so each match
# takes as many digits as it can, up to ten.
re.findall(r'\d{4,10}', text)

['1256', '1256348790']

In [107]:
# Runs of four or more word characters (letters, digits, underscore).
re.findall(r'\w{4,}', text)

['phone', 'number', '1256', 'wrong', 'Correct', '1256348790', 'Call']

### Wildcard Text

In [115]:
# A lowercase 'c' plus the next three characters of any kind; matching is
# case-sensitive, so a capital 'C' does not start a match.
re.findall(r'c...', text)

['ct o']

In [116]:
text = "this is cat but no that. i want hat and cat both"

In [117]:
# A literal 'a' with one arbitrary character on each side. '.' also matches
# spaces, and findall scans left to right without overlapping matches.
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [118]:
text = 'hi thanks for watching <3'

In [120]:
# '$' anchors to the end of the string: matches a digit only in the last position.
re.findall(r'\d$', text)

['3']

In [122]:
# '^' anchors to the start of the string: matches a digit only in the first position.
re.findall(r'^\d', text)

[]

### Exclusion

In [123]:
text

'hi thanks for watching <3'

In [125]:
# A leading '^' inside [...] negates the class: runs of one or more non-digit characters.
re.findall(r'[^\d]+', text)

['hi thanks for watching <']

In [131]:
# Double negative: "not a non-digit" is a digit, so this is equivalent to r'\d+'.
re.findall(r'[^\D]+', text)

['3']

In [132]:
text = "you can get free-videos on kgp-talkie"

In [134]:
# Hyphenated terms: one or more word characters, a literal '-', then one or
# more word characters.
re.findall(r'[\w]+-[\w]+', text)

['free-videos', 'kgp-talkie']

### Regular Expression in spaCy

In [136]:
text = "Google announced a new Pixel at Google I/O. Google I/O is a great place to get all updates from Google."

In [None]:
text

In [None]:
# One token spec per piece of "Google I/O". TEXT matches the exact token text,
# so this pattern expects "I/O" to arrive as three tokens: 'I', '/', 'O'.
pattern = [{'TEXT': piece} for piece in ('Google', 'I', '/', 'O')]

In [None]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the span covered by the ``i``-th match.

    This function is passed as the ``on_match`` hook to ``Matcher.add`` in
    this notebook.

    Args:
        matcher: The matcher that fired; not used here.
        doc: The document being matched; sliced by the match's token offsets.
        i: Index into ``matches`` of the match to report.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _match_id, first, last = matches[i]
    print(doc[first:last].text)

In [None]:
# Fresh matcher for this section. on_match registers callback_method as the
# hook invoked for matches of the 'Google' rule.
matcher = Matcher(nlp.vocab)
matcher.add('Google', [pattern], on_match=callback_method)

In [None]:
doc = nlp(text)

In [None]:
# Running the matcher triggers callback_method for each match (it prints the
# span text); the call also returns the match list, which the cell displays.
matcher(doc)