In [4]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [5]:
'phone' in text

True

In [6]:
import re

In [7]:
pattern = 'phone'

In [8]:
match = re.search(pattern, text)
match

<re.Match object; span=(12, 17), match='phone'>

In [9]:
# match has many useful methods we can use
match.span()  # Returns start and end index for matched string

(12, 17)

In [10]:
match.start()

12

In [11]:
match.end()

17

#### For multiple matches, we can use the re.findall() method

In [19]:
text = "My phone once - 888-333-1234, my phone twice - 408-333-2456, my phone thrice - 333-212-4444."

In [20]:
matches = re.findall(pattern, text)

In [21]:
matches

['phone', 'phone', 'phone']

#### re.finditer() iterates through the given text and yields a match object for each match found

In [22]:
for match in re.finditer(pattern, text):
    print(match.span())

In [23]:
# This will print all the matched strings found
for match in re.finditer(pattern, text):
    print(match.group())

### This is how we use regex in Python. See also the re.compile() method and capture groups, which allow grabbing parts of a matched string

In [29]:
pattern = r'\d{3}-\d{3}-\d{4}'
pattern

'\\d{3}-\\d{3}-\\d{4}'

In [30]:
re.search(r'\d{3}-\d{3}-\d{4}', text)

<re.Match object; span=(16, 28), match='888-333-1234'>

In [31]:
for match in re.finditer(pattern, text):
    print(match.group())

888-333-1234
408-333-2456
333-212-4444


In [34]:
# The r prefix on the pattern is optional: without it, just escape each backslash with another backslash!
new_pattern = '\\d{3}-\\d{3}-\\d{4}'

In [35]:
for match in re.finditer(new_pattern, text):
    print(match.group())

888-333-1234
408-333-2456
333-212-4444


In [38]:
# What if we want to grab just the area codes from all phone numbers?
# We define groups in the pattern by wrapping its parts in parentheses; compiling it with re.compile() is optional but lets us reuse it
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
phone_pattern

re.compile(r'(\d{3})-(\d{3})-(\d{4})', re.UNICODE)

In [39]:
# phone_pattern is already compiled, so call its own search() method
# instead of passing it back through the module-level re.search().
results = phone_pattern.search(text)

In [40]:
results.group()

'888-333-1234'

In [41]:
# Now, we can grab each group we defined in the pattern by its index. Group indexes start at 1; group(0) is the whole match
results.group(1)

'888'

In [44]:
# We can grab all the area codes too! Similarly, we can get the other groups by indices 2 onwards!
# phone_pattern is already compiled, so we iterate with its own finditer() method.
for match in phone_pattern.finditer(text):
    print(match.group(1))

888
408
333


## Additional Regex Syntax

In [46]:
# The pipe operator for OR:
re.search(r'cat|dog', "There's a cat there")

<re.Match object; span=(10, 13), match='cat'>

In [47]:
re.search(r'cat|dog', "There's a dog there")

<re.Match object; span=(10, 13), match='dog'>

In [48]:
# Wildcard operator (.) - matches any single character except a newline.

In [50]:
re.findall(r'at', "The cat and a hat sat there")

['at', 'at', 'at']

In [51]:
# if we had to grab letter(s) before 'at', we will need a wildcard
re.findall(r'.at', "The cat and a hat sat there")

['cat', 'hat', 'sat']

In [60]:
# Starts with (a number). The caret symbol (^) anchors the match to the start of the string!
re.findall(r'^\d', '1 is a number')

['1']

In [57]:
# Ends with (a number, as example). $ anchors the match to the end of the string!
re.findall(r'\d$', 'My role number is 1')

['1']

In [58]:
re.findall(r'\d$', 'My role number is 1, what about you?')

[]

In [72]:
# Caret symbol (^) also is used to exclude something in search, but there, it is used along with square brackets
# Example: I want to exclude digits from below sentence
phrase = "There are 3 numbers 34 inside 5 this sentence"
pattern = r'[^\d]'
pattern

'[^\\d]'

In [73]:
# So, it will find every single character that is not a digit
re.findall(pattern, phrase)

['T',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [74]:
# Add a + after [] in the pattern to grab runs of consecutive non-digit characters together
pattern = r'[^\d]+'
pattern

'[^\\d]+'

In [75]:
re.findall(pattern, phrase)

['There are ', ' numbers ', ' inside ', ' this sentence']

In [78]:
# Remove the punctuation ('!', '.', '?') from a sentence
test_phrase = "This is a string! But it has punctuations. How can I remove it?"
pattern = r'[^!.? ]+'
pattern

'[^!.? ]+'

In [81]:
clean = re.findall(pattern, test_phrase)
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuations',
 'How',
 'can',
 'I',
 'remove',
 'it']

In [83]:
# You can then join the elements of the list with a space between them as below:
" ".join(clean)

'This is a string But it has punctuations How can I remove it'

In [89]:
# Group characters in [ ] for inclusion. In the examples above, we used [^ ] for exclusion
# For example, find all the words that have a hyphen in between them
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are.'

In [90]:
pattern = r'[\w]+-[\w]+'
pattern

'[\\w]+-[\\w]+'

In [91]:
re.findall(pattern, text)

['hypen-words', 'long-ish']

In [92]:
# You can also use parentheses for purposes other than extracting groups, e.g. to scope alternatives
# Find words that start with 'cat' and end with one of several options, e.g. 'fish', 'nap', or 'erpillar'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [95]:
# 'erpillar' (not 'pillar') is the suffix that follows 'cat' in "caterpillar",
# so this alternation matches catfish, catnap and caterpillar alike.
re.search(r'cat(fish|nap|erpillar)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [96]:
# 'erpillar' (not 'pillar') is the suffix that follows 'cat' in "caterpillar",
# so this alternation matches catfish, catnap and caterpillar alike.
re.search(r'cat(fish|nap|erpillar)', texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [99]:
re.search(r'cat(fish|nap|erpillar)', textthree)

<re.Match object; span=(26, 37), match='caterpillar'>

In [101]:
# When the pattern contains a group, re.findall() returns only the part captured inside the parentheses
# 'erpillar' (not 'pillar') is the suffix that follows 'cat' in "caterpillar",
# so this alternation matches catfish, catnap and caterpillar alike.
re.findall(r'cat(fish|nap|erpillar)', text)

['fish']

In [102]:
# 'erpillar' (not 'pillar') is the suffix that follows 'cat' in "caterpillar",
# so this alternation matches catfish, catnap and caterpillar alike.
re.findall(r'cat(fish|nap|erpillar)', texttwo)

['nap']