https://www.datacamp.com/tutorial/python-regular-expression-tutorial

In [1]:
import re

In [4]:
# a raw string literal. It changes how the string literal is interpreted. Such literals are stored as they appear.
pattern = r"Cookie"
sequence = "Cookie"

if re.match(pattern, sequence):
    print("Match!")
else: print("Not a match!")

Match!


In [5]:
#. - A period. Matches any single character except the newline character.

#the search function, you scan through the given string/sequence, 
#looking for the first location where the regular expression produces a match.
#The group function returns the string matched by the re. You will see both these functions in more detail later.
re.search(r'Co.k.e', 'Cookie').group()

'Cookie'

In [6]:
#^ - A caret. Matches the start of the string.
re.search(r'^Eat', "Eat cake!").group()

## However, the code below will not give the same result. Try it for yourself:
# re.search(r'^eat', "Let's eat cake!").group()

'Eat'

In [7]:
#$ - Matches the end of string
re.search(r'cake$', "Cake! Let's eat cake").group()

'cake'

In [8]:
#[abc] - Matches a or b or c.
#[a-zA-Z0-9] - Matches any letter from (a to z) or (A to Z) or (0 to 9).
re.search(r'[0-6]', 'Number: 5').group()

'5'

In [9]:
## Matches any character except 5
re.search(r'Number: [^5]', 'Number: 0').group()

## This will not match and hence a NONE value will be returned
#re.search(r'Number: [^5]', 'Number: 5').group()

'Number: 0'

In [10]:
#\w - Lowercase 'w'. Matches any single letter, digit, or underscore.
#\W - Uppercase 'W'. Matches any character not part of \w (lowercase w).

print("Lowercase w:", re.search(r'Co\wk\we', 'Cookie').group())

## Matches any character except single letter, digit or underscore
print("Uppercase W:", re.search(r'C\Wke', 'C@ke').group())

## Uppercase W won't match single letter, digit
print("Uppercase W won't match, and return:", re.search(r'Co\Wk\We', 'Cookie'))


Lowercase w: Cookie
Uppercase W: C@ke
Uppercase W won't match, and return: None


In [11]:
#\s - Lowercase 's'. Matches a single whitespace character like: space, newline, tab, return.
#\S - Uppercase 'S'. Matches any character not part of \s (lowercase s).
print("Lowercase s:", re.search(r'Eat\scake', 'Eat cake').group())
print("Uppercase S:", re.search(r'cook\Se', "Let's eat cookie").group())


Lowercase s: Eat cake
Uppercase S: cookie


In [12]:
#\d - Lowercase d. Matches decimal digit 0-9.
#\D - Uppercase d. Matches any character that is not a decimal digit.
print("How many cookies do you want? ", re.search(r'\d+', '100 cookies').group())

How many cookies do you want?  100


In [24]:
#\t - Lowercase t. Matches tab.
#\n - Lowercase n. Matches newline.
#\r - Lowercase r. Matches return
#\A - Uppercase a. Matches only at the start of the string. Works across multiple lines as well.
#\Z - Uppercase z. Matches only at the end of the string.
#TIP: ^ and \A are effectively the same, and so are $ and \Z. Except when dealing with MULTILINE mode. Learn more about it in the flags section.

#\b - Lowercase b. Matches only the beginning or end of the word.

# Example for \t
print( re.search(r'Eat\tcake', 'Eat     cake').group())

#Example for \b
print(re.search(r'\b[A-E]ookie', 'Cookie').group())

AttributeError: 'NoneType' object has no attribute 'group'

In [25]:
#+ - Checks if the preceding character appears one or more times starting from that
re.search(r'Co+kie', 'Cooookie').group()

'Cooookie'

In [26]:
#* - Checks if the preceding character appears zero or more times starting from that position.
re.search(r'Ca*o*kie', 'Cookie').group()

'Cookie'

In [27]:
#? - Checks if the preceding character appears exactly zero or one time starting from that position.
re.search(r'Colou?r', 'Color').group()

'Color'

In [28]:
#{x} - Repeat exactly x number of times.
#{x,} - Repeat at least x times or more.
#{x, y} - Repeat at least x times but no more than y times.
re.search(r'\d{9,10}', '0987654321').group()

'0987654321'

In [29]:
#Parts of a regular expression pattern bounded by parenthesis () are called groups. 
#The parenthesis does not change what the expression matches, but rather forms groups within the matched sequence

statement = 'Please contact us at: support@datacamp.com'
match = re.search(r'([\w\.-]+)@([\w\.-]+)', statement)
if statement:
  print("Email address:", match.group()) # The whole matched text
  print("Username:", match.group(1)) # The username (group 1)
  print("Host:", match.group(2)) # The host (group 2

Email address: support@datacamp.com
Username: support
Host: datacamp.com


In [30]:
#Another way of doing the same is with the usage of <> brackets instead. 
#This will let you create named groups. Named groups will make your code more readable. 
#The syntax for creating named group is: (?P<name>...).
#Replace the name part with the name you want to give to your group. 
#The ... represent the rest of the matching syntax.

statement = 'Please contact us at: support@datacamp.com'
match = re.search(r'(?P<email>(?P<username>[\w\.-]+)@(?P<host>[\w\.-]+))', statement)
if statement:
  print("Email address:", match.group('email'))
  print("Username:", match.group('username'))
  print("Host:", match.group('host'))

Email address: support@datacamp.com
Username: support
Host: datacamp.com


In [31]:
#Greedy vs. Non-Greedy Matching
pattern = "cookie"
sequence = "Cake and cookie"

heading  = r'<h1>TITLE</h1>'
re.match(r'<.*>', heading).group()

'<h1>TITLE</h1>'

In [32]:
#Adding ? after the qualifier makes it perform the match in a non-greedy or minimal fashion; 
#That is, as few characters as possible will be matched.
#When you run <.*>, you will only get a match with <h1>.
heading  = r'<h1>TITLE</h1>'
re.match(r'<.*?>', heading).group()

'<h1>'