# Regular Expressions

In [1]:
import re

In [2]:
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.6/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

![](img/regex_functions.png)

![](img/regex_flags.png)

In [3]:
import IPython
url = 'https://www.debuggex.com/cheatsheet/regex/python'
iframe = '<iframe src=' + url + ' width=1000 height=750></iframe>'
IPython.display.HTML(iframe)

## match

In [4]:
m = re.match('python', 'python.org')

In [5]:
m

<_sre.SRE_Match object; span=(0, 6), match='python'>

In [6]:
m.group()

'python'

In [7]:
m = re.match('python', 'www.python.org')

In [8]:
m is None

True

In [9]:
m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist")
m.group(0)       # The entire match

'Isaac Newton'

In [10]:
m.group(1)       # The first parenthesized subgroup.

'Isaac'

In [11]:
m.group(2)       # The second parenthesized subgroup.

'Newton'

In [12]:
m.group(1, 2)    # Multiple arguments give us a tuple.

('Isaac', 'Newton')

In [13]:
m.span()

(0, 12)

## search

In [14]:
line = "Cats are smarter than dogs";

searchObj = re.search( r'(.*) are (.*?) .*', line, re.M|re.I)

In [15]:
searchObj

<_sre.SRE_Match object; span=(0, 26), match='Cats are smarter than dogs'>

In [16]:
print("searchObj.group(0) : ", searchObj.group(0))
print("searchObj.group(1) : ", searchObj.group(1))
print("searchObj.group(2) : ", searchObj.group(2))

searchObj.group(0) :  Cats are smarter than dogs
searchObj.group(1) :  Cats
searchObj.group(2) :  smarter


In [17]:
# . matches any character
re.search(r'Co.k.e', 'Cookie').group()

'Cookie'

In [18]:
re.search(r'Co.k.e', 'Co?k,e').group()

'Co?k,e'

In [19]:
# \w matches a letter or digit
re.search(r'Co\wk\we', 'Cookie').group()

'Cookie'

In [20]:
re.search(r'Co\wk\we', 'Co9kAe').group()

'Co9kAe'

In [21]:
re.search(r'Co\wk\we', 'Co,k*e').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [22]:
# anti-w
re.search(r'C\Wke', 'C@ke').group()

'C@ke'

In [23]:
re.search(r'C\Wke', 'Coke').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [24]:
# \s matches single space character
re.search(r'Eat\scake', 'Eat cake').group()

'Eat cake'

In [25]:
re.search(r'Eat\scake', 'Eat,cake').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [26]:
# anti-s
re.search(r'Cook\Se', 'Cookie').group()

'Cookie'

In [27]:
re.search(r'Cook\Se', 'Cook e').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [28]:
# \d matches decimal digit 0-9
re.search(r'c\d\dkie', 'c00kie').group()

'c00kie'

In [29]:
# Caret ^ matches a pattern at the start of the string.
re.search(r'^Eat', 'Eat cake').group()

'Eat'

In [30]:
# $ matches a pattern at the end of string.
re.search(r'cake$', 'Eat cake').group()

'cake'

In [31]:
re.search(r'cake$', 'Eat cake.').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [32]:
re.search(r'Number: 1|2|3|4|5|6', 'Number: 5').group()

'5'

In [33]:
# [abc] matches one of them
# [a-zA-Z0-9] matches a letter no matter the case or a digit
re.search(r'Number: [0-6]', 'Number: 5').group()

'Number: 5'

In [34]:
# [^5] matches any character except 5
re.search(r'Number: [^5]', 'Number: 0').group()

'Number: 0'

In [35]:
# \b matches only the beginning or end of the word
re.search(r'\b[A-E]ookie', 'Cookie').group()

'Cookie'

In [36]:
# This checks for '\' in the string instead of '\t' due to the '\' used 
re.search(r'Back\\st', 'Back\stail').group()

'Back\\st'

In [37]:
# This treats '\s' as an escape character because it lacks '\' at the start of '\s'
re.search(r'Back\stail', 'Back tail').group()

'Back tail'

## Repetitions

In [38]:
re.search(r'Co+kie', 'Cooookie').group()

'Cooookie'

In [39]:
# Checks for any occurrence of a or o or both in the given sequence
re.search(r'Ca*o*kie', 'Cookie').group()

'Cookie'

In [40]:
# Checks for exactly zero or one occurrence of a or o or both in the given sequence
re.search(r'Colou?r', 'Color').group()

'Color'

In [41]:
re.search(r'\d{7,11}', 'Phone: 03121234567').group()

'03121234567'

In [42]:
re.search(r'\d{7,11}', 'Phone: 1234567').group()

'1234567'

### Greedy vs non-greedy

In [43]:
pattern = "cookie"
sequence = "Cake and cookie"

heading  = r'<h1>TITLE</h1>'
re.match(r'<.*>', heading).group()

'<h1>TITLE</h1>'

In [44]:
# *? matches as little text as possible
heading  = r'<h1>TITLE</h1>'
re.match(r'<.*?>', heading).group()

'<h1>'

## findall

In [45]:
email_address = "Please contact us at: support@datacamp.com, xyz@datacamp.com"

In [46]:
addresses = re.search(r'[\w\.-]+@[\w\.-]+', email_address)

In [47]:
addresses.group(0)

'support@datacamp.com'

In [48]:
addresses.group(1)

IndexError: no such group

In [49]:
email_address = "Please contact us at: support@datacamp.com, xyz@datacamp.com"

#'addresses' is a list that stores all the possible match
addresses = re.findall(r'[\w\.-]+@[\w\.-]+', email_address)
for address in addresses: 
    print(address)

support@datacamp.com
xyz@datacamp.com


In [50]:
text = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
addresses = re.findall(r'([\w\.-]+)@([\w\.-]+)', text)
print(addresses)

[('alice', 'google.com'), ('bob', 'abc.com')]


In [51]:
for a in addresses:
    print(a[0], a[1])

alice google.com
bob abc.com


## sub

In [52]:
re.sub?

In [53]:
email_address = "Please contact us at: xyz@datacamp.com"
new_email_address = re.sub(r'([\w\.-]+)@([\w\.-]+)', r'support@datacamp.com', email_address)
print(new_email_address)

Please contact us at: support@datacamp.com


## compile

In [54]:
text = "Cake and cookie"

pattern = re.compile(r"cookie")
pattern.search(text).group()

'cookie'

In [55]:
re.search(pattern, sequence).group()

'cookie'

# Exercises

In [56]:
filename='Tesla_M0172_beta00.0_alpha9.0.txt'

In [57]:
pattern = re.compile(r'M(\d+)_beta(-*\d+\.\d)_alpha(-*\d+\.\d)')
m = re.search(pattern, filename)

In [58]:
m.group(2)

'00.0'

In [59]:
model=float(m.group(1))
beta=float(m.group(2))
alpha=float(m.group(3))

print(model, beta, alpha)

172.0 0.0 9.0


In [60]:
p = re.compile(r'(?<=_)([a-zA-Z]+)(-?\d+[.]?\d?)')
matches = p.findall(filename)
params = {k: float(v) for (k, v) in matches}

In [73]:
p = re.compile(r'(?<=_)([a-zA-Z]+)(-?\d+[.\d+]?)')
matches = p.findall('M172_beta0._alpha9.txt')
{k: float(v) for (k, v) in matches}

{'alpha': 9.0, 'beta': 0.0}

In [66]:
params

{'M': 172.0, 'alpha': 9.0, 'beta': 0.0}

In [61]:
import re
import requests
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'

def get_book(url):
    # Sends a http request to get the text from project Gutenberg
    raw = requests.get(url).text
    # Discards the metadata from the beginning of the book
    start = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*",raw ).end()
    # Discards the metadata from the end of the book
    stop = re.search(r"II", raw).start()
    # Keeps the relevant text
    text = raw[start:stop]
    return text

def preprocess(sentence): 
    return re.sub('[^A-Za-z0-9.]+' , ' ', sentence).lower()

book = get_book(the_idiot_url)
processed_book = preprocess(book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part i i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [62]:
# Find the number of the pronoun "the" in the corpus
len(re.findall(r'the', processed_book))

302

In [63]:
# Try to convert every single stand-alone instance of 'i' to 'I' in the corpus. 
# Make sure not to change the 'i' occuring in a word
processed_book = re.sub(r'\si\s', " I ", processed_book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part I i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [64]:
# Find the number of times anyone was quoted ("") in the corpus
len(re.findall(r'\”', book))

96

In [65]:
# What are the words connected by '--' in the corpus?
re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', book)
# re.findall(r'\w*--\w*', book)

['ironical--it',
 'malicious--smile',
 'fur--or',
 'astrachan--overcoat',
 'it--the',
 'Italy--was',
 'malady--a',
 'money--and',
 'little--to',
 'No--Mr',
 'is--where',
 'I--I',
 'I--',
 '--though',
 'crime--we',
 'or--judge',
 'gaiters--still',
 '--if',
 'through--well',
 'say--through',
 'however--and',
 'Epanchin--oh',
 'too--at',
 'was--and',
 'Andreevitch--that',
 'everyone--that',
 'reduce--or',
 'raise--to',
 'listen--and',
 'history--but',
 'individual--one',
 'yes--I',
 'but--',
 't--not',
 'me--then',
 'perhaps--',
 'Yes--those',
 'me--is',
 'servility--if',
 'Rogojin--hereditary',
 'citizen--who',
 'least--goodness',
 'memory--but',
 'latter--since',
 'Rogojin--hung',
 'him--I',
 'anything--she',
 'old--and',
 'you--scarecrow',
 'certainly--certainly',
 'father--I',
 'Barashkoff--I',
 'see--and',
 'everything--Lebedeff',
 'about--he',
 'now--I',
 'Lihachof--',
 'Zaleshoff--looking',
 'old--fifty',
 'so--and',
 'this--do',
 'day--not',
 'that--',
 'do--by',
 'know--my',
 'il