# Regular Expressions (Part One)

In [1]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
# find 'phone' in text
'phone' in text

True

In [3]:
# import regex library
import re

In [4]:
pattern = 'phone'

In [5]:
# find phone in text using regex, which returns a Match object
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
# re.search returns None when no match is found, so the notebook displays no output
re.search(pattern, text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern, text)

In [10]:
# get the spanned indices of the match from start to end
match.span()

(12, 17)

In [12]:
# get the starting index of the matched string
match.start()

12

In [26]:
# get the ending index of the matched string
match.end()

17

In [14]:
text = 'my phone once, my phone twice'

In [15]:
match = re.search('phone', text)

In [16]:
# search only returns the first matched occurrence
match

<re.Match object; span=(3, 8), match='phone'>

In [17]:
# use findall to return all matched occurrences
matches = re.findall('phone', text)

In [18]:
# list of matched strings by default
matches

['phone', 'phone']

In [19]:
len(matches)

2

In [20]:
# use finditer to get an iterator that yields a Match object for each occurrence
for match in re.finditer('phone', text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [23]:
# convert the iterator to a list so individual Match objects can be accessed by index
matches = list(re.finditer('phone', text))

In [24]:
matches

[<re.Match object; span=(3, 8), match='phone'>,
 <re.Match object; span=(18, 23), match='phone'>]

In [25]:
# get the actual match using group()
matches[0].group()

'phone'

# Regular Expressions (Part Two)

#### Character identifiers

* \d = digit
* \w = alphanumeric or underscore (a "word" character)
* \s = whitespace
* \D = non-digit
* \W = non-word character (anything except alphanumerics and underscore)
* \S = non-whitespace

#### Quantifiers

* \* = occurs 0 or more times
* \+ = occurs 1 or more times
* ? = occurs 1 time or none
* {n} = occurs exactly n times
* {n,m} = occurs n to m times (no space after the comma; with a space, Python's re treats the braces as literal text)

In [36]:
text = 'My phone number is 408-555-7777'

In [41]:
# search for a phone number using character identifiers
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text)

In [42]:
phone

<re.Match object; span=(19, 31), match='408-555-7777'>

In [43]:
phone.group()

'408-555-7777'

In [44]:
# quantify the pattern which makes it easier to write a search pattern
phone = re.search(r'\d{3}-\d{3}-\d{4}', text)

In [45]:
phone

<re.Match object; span=(19, 31), match='408-555-7777'>

In [46]:
phone.group()

'408-555-7777'

In [47]:
# wrap parts of the pattern in parentheses to create capture groups that can be accessed individually later on; re.compile builds a reusable pattern object
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [48]:
# call search() directly on the compiled pattern object instead of passing it to re.search
results = phone_pattern.search(text)

In [49]:
# return the actual result
results.group()

'408-555-7777'

In [50]:
# return the first group in the match
results.group(1)

'408'

In [51]:
# return the second group in the match
results.group(2)

'555'

In [52]:
# return the third group in the match
results.group(3)

'7777'

In [53]:
# an IndexError is raised when requesting a group number that does not exist in the pattern
results.group(4)

IndexError: no such group

# Regular Expressions (Part Three)

In [54]:
re.search(r'cat', 'The cat is here.')

<re.Match object; span=(4, 7), match='cat'>

In [55]:
re.search(r'cat', 'The dog is here.')

In [56]:
# match either alternative using | (or)
re.search(r'cat|dog', 'The cat is here.')

<re.Match object; span=(4, 7), match='cat'>

In [57]:
re.search(r'cat|dog', 'The dog is here.')

<re.Match object; span=(4, 7), match='dog'>

In [59]:
# use . wildcard for any character
re.findall(r'.at', 'The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [61]:
# be careful using wildcards
re.findall(r'...at', 'The cat in the hat went splat.')

['e cat', 'e hat', 'splat']

In [62]:
# use ^ to match only at the start of the string
re.findall(r'^\d', '1 is a number')

['1']

In [63]:
re.findall(r'^\d', 'Text 2 is a number')

[]

In [64]:
# use $ to match only at the end of the string (or just before a trailing newline)
re.findall(r'\d$', 'The number is 3')

['3']

In [65]:
phrase = 'There are 3 numbers 34 inside 5 this sentence'

In [68]:
# use [^...] to match any character except those listed inside the brackets
pattern = r'[^\d]+'

In [69]:
re.findall(pattern, phrase)

['There are ', ' numbers ', ' inside ', ' this sentence']

In [70]:
# useful for stripping punctuation from text
test_phrase = 'This is a string! But there are punctuations, so how can we remove them?'

In [73]:
cleaned = re.findall(r'[^!.?, ]+', test_phrase)

In [74]:
' '.join(cleaned)

'This is a string But there are punctuations so how can we remove them'

In [75]:
text = 'Only find the hyphen-words in this sentence. But you do not know how long-ish they are.'

In [76]:
# square brackets define a character set; [\w]+ matches one or more consecutive word characters
pattern = r'[\w]+'

In [77]:
re.findall(pattern, text)

['Only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [78]:
pattern = r'[\w]+-[\w]+'

In [79]:
re.findall(pattern, text)

['hyphen-words', 'long-ish']

In [87]:
text1 = 'Hello, would you like some catfish?'
text2 = "Hello, would you like to take a catnap?"
text3 = "Hello, have you seen this caterpillar?"

In [88]:
re.search(r'cat(fish|nap|erpillar)', text1)

<re.Match object; span=(27, 34), match='catfish'>

In [89]:
re.search(r'cat(fish|nap|erpillar)', text2)

<re.Match object; span=(32, 38), match='catnap'>

In [90]:
re.search(r'cat(fish|nap|erpillar)', text3)

<re.Match object; span=(26, 37), match='caterpillar'>