# Regular expression

A regular expression is a sequence of characters that defines a search pattern.

In [15]:
import re

# . -> matches any single character other than a newline
text = "cat cop cut"
# re.search() stops at the first match, so only "cat" is printed.
# "cut" would also match c.t; "cop" never can because it does not end in "t".
print(re.search(r"c.t", text).group())

cat


In [16]:
# ^ anchors the pattern to the very beginning of the string
pattern = "^hello"
text = "hello world"
text2 = "world hello"

# "hello" opens the first string, so a match object comes back
first_hit = re.search(pattern, text, flags=re.IGNORECASE)
print(first_hit.group(0))

# in the second string "hello" is not at the start, so search() gives None
print(re.search(pattern, text2, flags=re.IGNORECASE))

hello
None


In [None]:
# $ anchors the pattern to the end of the string
text = "hello world"
pattern = r"world$"
# "world" is the last thing in text, so the search succeeds
print(re.search(pattern, text, re.I).group())

world


In [None]:
# * quantifier: the preceding token may repeat zero or more times
text = "ac abc abbc abbbc"
# "b*" allows any number of b's (including none) between "a" and "c"
pattern = r"ab*c"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['ac', 'abc', 'abbc', 'abbbc']


In [18]:
# + quantifier: the preceding token must appear one or more times
text = "ac abc abbc abbbc"
# "b+" requires at least one b between "a" and "c", so "ac" is skipped
pattern = r"ab+c"
matches = re.compile(pattern).findall(text)
print(matches)

['abc', 'abbc', 'abbbc']


In [None]:
# ? quantifier: the preceding token is optional (zero or one occurrence)
text = "colour color"
# "u?" accepts the word with or without the "u"
pattern = r"colou?r"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['colour', 'color']


In [23]:
# {m,n} quantifier: an exact count or a bounded range of repetitions
text = "abc abbc abbbc abbbbc abbbbbc abbbbbbbbbc"
# "b{2,4}" accepts between two and four b's, inclusive
pattern = r"ab{2,4}c"
matches = re.compile(pattern).findall(text)
print(matches)

['abbc', 'abbbc', 'abbbbc']


In [None]:
# [...] character class: matches exactly one character from the listed set
text = "cat cot cut cop"
# "[ou]" accepts either "o" or "u" in the middle position
pattern = r"c[ou]t"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['cot', 'cut']


In [None]:
# [^...] negated class: a leading caret matches any one character NOT listed
text = "cat cut cot"
# "[^o]" accepts any middle character except "o"
pattern = r"c[^o]t"
matches = re.compile(pattern).findall(text)
print(matches)

['cat', 'cut']


In [None]:
# \d - matches a digit
text = "Room 101, Room 102, Room 103"

# a bare \d picks up every digit individually
pattern = r"\d"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

# \d+ groups each run of consecutive digits into one match
pattern = r"\d+"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['1', '0', '1', '1', '0', '2', '1', '0', '3']
['101', '102', '103']


In [28]:
# \D - matches any character that is not a digit
text = "Room 101"
# \D+ takes the whole run of non-digits, trailing space included
pattern = r"\D+"
matches = re.compile(pattern).findall(text)
print(matches)

['Room ']


In [30]:
# \s - matches a whitespace character
text = "Hello     World"
# \s+ captures the whole run of spaces between the words as one match
pattern = r"\s+"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['     ']


In [31]:
# \S - matches any character that is not whitespace
text = "Hello      World"
# \S+ yields each run of non-space characters, i.e. the individual words
pattern = r"\S+"
matches = re.compile(pattern).findall(text)
print(matches)

['Hello', 'World']


In [None]:
# \w - matches a word character (letter, digit or underscore)
text = "John Doe, 123 New Stree, House_12"
# \w+ collects each run of word characters; spaces and commas split them
pattern = r"\w+"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['John', 'Doe', '123', 'New', 'Stree', 'House_12']


In [None]:
# \W - matches any character that is not a letter, digit or underscore
text = "John Doe, 123 New Stree, House_12"
# \W+ returns the separators (spaces and comma-space pairs) between words
pattern = r"\W+"
matches = re.compile(pattern).findall(text)
print(matches)

[' ', ', ', ' ', ' ', ', ']


In [38]:
# (...) - capturing groups: each parenthesised part is returned separately
text = "John Doe, Jane Smith"
# two groups -> one (first_word, second_word) tuple per match
pattern = r"(\w+) (\w+)"
matches = [found.groups() for found in re.finditer(pattern, text)]
print(matches)

[('John', 'Doe'), ('Jane', 'Smith')]


In [None]:
# \b - word boundary; wrapping \d{10} in \b accepts only numbers that are
# exactly ten digits long (longer or shorter digit runs are rejected)
text = "1234567890 12345678901 123456789"
pattern = r"\b\d{10}\b"
matches = re.compile(pattern).findall(text)
print(matches)

['1234567890']


In [43]:
# Ten-digit numbers whose first digit is 6, 7, 8 or 9; the \b anchors stop
# the pattern from matching inside a longer run of digits
text = "Valid numbers: 9876543210, 9123456789. Invalid: 123456789, 0987654321"
pattern = r"\b[6789]\d{9}\b"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)

['9876543210', '9123456789']


In [None]:
# Four groups of four digits joined by hyphens; the space-separated variant
# in the text is deliberately not matched
text = "1234-1234-1234-1234 , 1234 1234 1234 1234"
pattern = r"\b\d{4}-\d{4}-\d{4}-\d{4}\b"
matches = re.compile(pattern).findall(text)
print(matches)

['1234-1234-1234-1234']
