### Regular Expressions

- Allow us to search a text for strings matching a specific pattern


In [1]:
import re
result = re.search(r"aza", "plaza")
print(result)

<re.Match object; span=(2, 5), match='aza'>


In [2]:
result = re.search(r"aza", "bazar")
print(result)

<re.Match object; span=(1, 4), match='aza'>


In [3]:
result = re.search(r"aza", "maze")
print(result)

None


In [4]:
print(re.search(r"p.ng", "sponge"))

<re.Match object; span=(1, 5), match='pong'>


In [5]:
print(re.search(r"p.ng", "ping"))

<re.Match object; span=(0, 4), match='ping'>


In [6]:
print(re.search(r"p.ng", "Pangaea", re.IGNORECASE)) #Ignore the uppercase/lowercase

<re.Match object; span=(0, 4), match='Pang'>


In [7]:
# Character class: [pP] matches either a lowercase 'p' or an uppercase 'P'
print(re.search(r"[pP]ython", "python"))

<re.Match object; span=(0, 6), match='python'>


In [8]:
print(re.search(r"[a-z]way", "The end of the highway"))

<re.Match object; span=(18, 22), match='hway'>


In [9]:
#[0-9], [a-z], [uU], [a-zA-Z0-9]
print(re.search(r"[a-zA-Z0-9]$", "cloud9"))

<re.Match object; span=(5, 6), match='9'>


In [10]:
print(re.search(r"[^a-zA-Z]", "This is a sentence with spaces."))
# Inside square brackets, a leading ^ negates the class: this matches the first character that is NOT a letter

<re.Match object; span=(4, 5), match=' '>


In [11]:
print(re.search(r"[^a-zA-Z ]", "This is a sentence with spaces.")) 
# Here we add a space to the negated class, so spaces are excluded too and the first match is the '.'

<re.Match object; span=(30, 31), match='.'>


In [12]:
print(re.search(r"cat|dog", "I like cats")) 
# Here the pipe(|) character will match cat `or` dog

<re.Match object; span=(7, 10), match='cat'>


In [13]:
print(re.findall(r"cat|dog", "I love cats and dogs.")) 
# re.findall() returns a list of all non-overlapping matches instead of just the first one

['cat', 'dog']


###### `.*` matches any character (except a newline) repeated as many times as possible, including zero times

In [14]:
print(re.search(r"Py.*n", "Pygmalion"))

<re.Match object; span=(0, 9), match='Pygmalion'>


In [15]:
print(re.search(r"Py.*n", "Python Programming")) 
# .* is greedy: it takes as many characters as possible, so the match runs up to the last 'n'
print(re.search(r"Py[a-z]*n", "Python Programming")) 
# [a-z]* only matches lowercase letters, so the match cannot go past the end of the word
print(re.search(r"Py[a-z]*n", "Pyn")) 
# This proves that * also matches zero characters

<re.Match object; span=(0, 17), match='Python Programmin'>
<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 3), match='Pyn'>


##### The `Plus( + )` character matches one or more occurrences of the character that comes before it.

In [16]:
#Examples
print(re.search(r"o+l+", "goldfish"))

<re.Match object; span=(1, 3), match='ol'>


In [17]:
print(re.search(r"o+l+", "woolly"))

<re.Match object; span=(1, 5), match='ooll'>


In [18]:
print(re.search(r"o+l+", "boil"))

None


##### The `Question Mark( ? )` character makes the character that comes before it optional

In [19]:
print(re.search(r"p?each", "To each their own"))

<re.Match object; span=(3, 7), match='each'>


In [20]:
print(re.search(r"p?each", "I like peaches"))

<re.Match object; span=(7, 12), match='peach'>


In [21]:
pattern = r"^[a-zA-Z_][a-zA-Z0-9_]*$"
print(re.search(pattern, "_this_is_a_vald_variable_name"))
print(re.search(pattern, "this isn't a valid variable"))
print(re.search(pattern, "9_not_valid_variable"))

<re.Match object; span=(0, 29), match='_this_is_a_vald_variable_name'>
None
None


#### Capturing groups
##### Portions of the pattern that are enclosed in parentheses

#### `\w` matches any word character (letter, digit, or underscore)

In [22]:
result = re.search(r"^(\w*), (\w*)$", "Lovelace, Ada")
print(result)

<re.Match object; span=(0, 13), match='Lovelace, Ada'>


In [23]:
print(result.groups())
print(result[0])
print(result[1])
print(result[2])

('Lovelace', 'Ada')
Lovelace, Ada
Lovelace
Ada


In [24]:
def rearrange_name(name):
    """Turn a "Last, First" name into "First Last".

    The surname may contain word characters, spaces, dots and hyphens
    (for example "Day-Lewis" or "Van Rossum"). Everything after the first
    ", " is treated as the given name(s), so middle initials are kept.

    Args:
        name: The name string to rearrange.

    Returns:
        The rearranged name, or ``name`` unchanged when it does not have
        the "Last, First" shape.
    """
    # [\w .-] rather than plain \w so that multi-word and hyphenated
    # surnames are captured; a comma can never be part of the surname.
    result = re.search(r"^([\w .-]*), (.*)$", name)
    if result is None:
        return name
    return f"{result[2]} {result[1]}"

print(rearrange_name("Lovelace, Ada"))
print(rearrange_name("Ritchie, Dennis"))
print(rearrange_name("Ritchie, Dennis M."))

Ada Lovelace
Dennis Ritchie
Dennis M. Ritchie


#### `{n}` matches exactly n repetitions of the preceding element

In [25]:
# Example
print(re.search(r"[a-zA-Z]{5}", "a scary ghost appeared"))

#find all
print(re.findall(r"[a-zA-Z]{5}", "a scary ghost appeared"))

<re.Match object; span=(2, 7), match='scary'>
['scary', 'ghost', 'appea']


#### `\b` matches a word boundary (the beginning or end of a word)

In [26]:
# Example
print(re.findall(r"\b[a-zA-Z]{5}\b", "a scary ghost appeared"))

['scary', 'ghost']


#### `{n1,n2}` matches between n1 and n2 repetitions of the preceding element (written without a space)

In [27]:
# Example
print(re.findall(r"\w{5,10}", "I really like straberries"))

['really', 'straberrie']


In [28]:
print(re.findall(r"\w{5,}", "I really like straberries")) 
# {5,} matches five or more word characters, with no upper limit

print(re.findall(r"s\w{,20}", "I really like straberrires")) 
# Starting from s followed by up to 20 alphanumeric characters

['really', 'straberries']
['straberrires']


#### Extracting a PID Using regexes in Python

#### `\d` matches a single digit; `\d+` matches one or more digits

In [61]:
def extract_pid(log_line):
    """Pull the bracketed process ID and the upper-case word after it from a log line.

    Args:
        log_line: A single line of log text.

    Returns:
        A ``(pid, word)`` tuple of strings when the line contains
        ``[digits]: ``; ``word`` is the run of upper-case letters that
        follows and may be empty. Returns an empty string when the line
        has no such marker.
    """
    match = re.search(r"\[(\d+)\]: ([A-Z]*)", log_line)
    # Guard clause: nothing that looks like "[digits]: " in this line.
    if match is None:
        return ""
    # group(1, 2) yields both captured groups as one tuple.
    return match.group(1, 2)

print(extract_pid("July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade"))

('12345', 'ERROR')


## Other important methods 
#### `re.split()` 

In [62]:
re.split(r"[.?!]", "One sentence. Another One? And the last one!")

['One sentence', ' Another One', ' And the last one', '']

In [63]:
re.split(r"([.?!])", "One sentence. Another One? And the last one!") 
# Wrapping the pattern in a capturing group keeps the punctuation marks as elements of the list

['One sentence', '.', ' Another One', '?', ' And the last one', '!', '']

#### `re.sub()`

In [64]:
re.sub(r"[\w.%+-]+@[\w.-]+", "[REDACTED]" , "Recieved an email for go_nuts95@my.example.com")

'Recieved an email for [REDACTED]'

In [65]:
re.sub(r"^([\w .-]*), ([\w .-]*)$", r"\2 \1", "Lovelace, Ada") # Here index are changed
#                 Indexes reversed  --^  -^

'Ada Lovelace'