# Regular Expressions

In [2]:
import re

In [2]:
# help(re)

![](img/regex_functions.png)

In [3]:
import IPython
url = 'https://www.debuggex.com/cheatsheet/regex/python'
iframe = '<iframe src=' + url + ' width=1000 height=750></iframe>'
IPython.display.HTML(iframe)

## Methods
```
match()    Determine if the RE matches at the beginning of the string.

search()   Scan through a string, looking for any location where this RE matches.

findall()  Find all substrings where the RE matches, and return them as a list.

```

## match

In [5]:
# re.match(pattern, string, flags)
m = re.match('python', 'python.org')

In [5]:
m

<_sre.SRE_Match object; span=(0, 6), match='python'>

In [6]:
m.group()

'python'

In [7]:
m = re.match('python', 'www.python.org')

In [8]:
m is None

True

In [18]:
m = re.match('python', 'PyThOn', re.IGNORECASE)

In [19]:
m

<_sre.SRE_Match object; span=(0, 6), match='PyThOn'>

![](../img/regex_match_methods.png)

In [18]:
m.group(0)

'PyThOn'

In [6]:
m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist")
m.group(0)       # The entire match

'Isaac Newton'

In [8]:
m.group(1)       # The first parenthesized subgroup.

'Isaac'

In [9]:
m.group(2)       # The second parenthesized subgroup.

'Newton'

In [10]:
m.group(1, 2)    # Multiple arguments give us a tuple.

('Isaac', 'Newton')

In [11]:
m.span()

(0, 12)

## search

In [23]:
m = re.search('python', 'www.python.org')
m.group()

'python'

In [39]:
re.search(r'\d+', 'Emergency Phone: 911').group()

'911'

In [40]:
re.search(r'\d+/\d+/\d+', 'Exam date is 24/02/2019').group()

'24/02/2019'

#### Exercise



## Metacharacters

```
.    ^    $    *    +    ?    {    }    [    ]    \    |    (    )   

```

<pre>
.
Matches any character.

\d
Matches any decimal digit; this is equivalent to the class [0-9].

\D
Matches any non-digit character; this is equivalent to the class [^0-9].

\s
Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v].

\S
Matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v].

\w
Matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_].

\W
Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_].

\b
Matches a word boundary: the empty string at the beginning or end of a word.
</pre>

In [17]:
# . matches any character
re.match(r'Co.k.e', 'Cookie').group()

'Cookie'

In [18]:
re.match(r'Co.k.e', 'Co?k,e').group()

'Co?k,e'

In [19]:
# \w matches a letter or digit
re.match(r'Co\wk\we', 'Cookie').group()

'Cookie'

In [20]:
re.match(r'Co\wk\we', 'Co9kAe').group()

'Co9kAe'

In [21]:
re.match(r'Co\wk\we', 'Co,k*e').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [22]:
# anti-w
re.match(r'C\Wke', 'C@ke').group()

'C@ke'

In [23]:
re.match(r'C\Wke', 'Coke').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [24]:
# \s matches single space character
re.match(r'Eat\scake', 'Eat cake').group()

'Eat cake'

In [25]:
re.match(r'Eat\scake', 'Eat,cake').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [26]:
# anti-s
re.match(r'Cook\Se', 'Cookie').group()

'Cookie'

In [27]:
re.match(r'Cook\Se', 'Cook e').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [28]:
# \d matches decimal digit 0-9
re.match(r'c\d\dkie', 'c00kie').group()

'c00kie'

In [29]:
# Caret ^ matches a pattern at the start of the string.
re.search(r'^Eat', 'Eat cake').group()

'Eat'

In [30]:
# $ anchors the pattern at the end of the string.
# re.search is required here: re.match only tries the pattern at position 0,
# and 'Eat cake' does not start with 'cake'.
re.search(r'cake$', 'Eat cake').group()

'cake'

In [31]:
# re.match only tries the pattern at the start of the string, and 'Eat cake.' does
# not start with 'cake', so the result is None here regardless of '$'.
# re.search with the same pattern also returns None, because the string ends with '.'.
re.match(r'cake$', 'Eat cake.').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [15]:
re.match(r'Number: (1|2|3|4|5|6)', 'Number: 5').group()

'Number: 5'

In [37]:
# In a raw string the regex engine receives '\s' unchanged and treats it as the
# whitespace class, so it matches the space in 'Back tail'.
re.match(r'Back\stail', 'Back tail').group()

'Back tail'

#### Exercise

Write a regular expression that matches both theater and theatre.

### Inclusion  [ ]

The first metacharacters we’ll look at are [ and ]. They’re used for specifying a character class, which is a set of characters that you wish to match. Characters can be listed individually, or a range of characters can be indicated by giving two characters and separating them by a '-'. For example, [abc] will match any of the characters a, b, or c; this is the same as [a-c], which uses a range to express the same set of characters. If you wanted to match only lowercase letters, your RE would be [a-z].

In [84]:
m = re.match(r"[ab1]", "b")
m.group()

'b'

In [36]:
m = re.match(r"[a-z]", "python")
m.group()

'p'

In [38]:
# 'Python' starts with an uppercase 'P', which [a-z] does not match, so re.match
# would return None. re.search scans forward and finds the first lowercase letter.
m = re.search(r"[a-z]", "Python")
m.group(0)

'y'

In [24]:
m = re.match(r"[a-zA-Z]", "Python")
m.group(0)

'P'

In [30]:
m = re.match(r"[0-9]", "123456")
m.group(0)

'1'

In [32]:
m = re.match(r"[%+^$]", "$")
m.group(0)

'$'

In [33]:
m = re.match(r"[%+^$]", "^")
m.group(0)

'^'

In [33]:
# [abc] matches one of them
# [a-zA-Z0-9] matches a letter no matter the case or a digit
re.search(r'Number: [0-6]', 'Number: 5').group()

'Number: 5'

In [35]:
# \b matches only the beginning or end of the word
re.match(r'\b[A-E]ookie', 'Cookie').group()

'Cookie'

#### Exercise

Write a regular expression that matches one of the currency symbols among ₺ € $ at the beginning of a string.

### Exclusion  ^
<pre>
You can match the characters not listed within the class by complementing the set. This is indicated by including a '^' as the first character of the class. For example, [^5] will match any character except '5'. A '^' that appears anywhere else inside the class has no special meaning and simply matches the '^' character; outside a character class, '^' anchors the match at the start of the string.
</pre>

In [34]:
# [^5] matches any character except 5
re.match(r'Number: [^5]', 'Number: 0').group()

'Number: 0'

In [65]:
re.match('[^$]', "€ 40").group()

'€'

In [85]:
re.match('[^(1-9)]', "0").group()

'0'

In [87]:
re.match('[^(1-9)]', "3").group()

AttributeError: 'NoneType' object has no attribute 'group'

In [92]:
re.match('[^0-9A-Z]', "c").group()

'c'

In [93]:
re.match('[^0-9A-Z]', "5").group()

AttributeError: 'NoneType' object has no attribute 'group'

In [95]:
re.match('[^0-9A-Z]', "G").group()

AttributeError: 'NoneType' object has no attribute 'group'

In [80]:
re.match('[A-Z^0]', "0").group()

'0'

In [78]:
# '^' is not the first character of the class, so it does not negate it:
# the class matches A-Z, a literal '^', or a digit 1-9.
re.match('[A-Z^1-9]', "1").group()

'1'

#### Exercise
Write a regular expression that matches all digits except the ones starting with 2.

### Escape Special Characters  \

<pre>
Perhaps the most important metacharacter is the backslash, \. As in Python string literals, the backslash can be followed by various characters to signal various special sequences. It’s also used to escape all the metacharacters so  you can still match them in patterns; for example, if you need to match a [ or \, you can precede them with a backslash to remove their special meaning: \[ or \\.
</pre>

In [70]:
re.match(r'[', "[[").group()

error: unterminated character set at position 0

In [71]:
re.match(r'\[', "[1,2,3]").group()

'['

In [72]:
re.match(r'\.', ".com").group()

'.'

In [73]:
re.match(r'\*', "*com").group()

'*'

In [36]:
# The pattern r'Back\\st' contains an escaped backslash, so it matches a literal
# backslash followed by 'st' instead of the whitespace class \s.
# The subject is written as a raw string so that '\s' is a backslash plus 's'
# without relying on how Python treats an unrecognised escape sequence.
re.match(r'Back\\st', r'Back\stail').group()

'Back\\st'

#### Exercise
Write a regex that matches one of the following characters:

. , * [ ] ( ) 

### Repetitions  *  +

In [15]:
re.match(r'ca*t', 'cat').group()

'cat'

In [16]:
re.match(r'ca*t', 'ct').group()

'ct'

In [68]:
re.match(r'ca*t', 'caaaaaaat').group()

'caaaaaaat'

In [69]:
re.match(r'ca+t', 'cat').group()

'cat'

In [19]:
re.match(r'ca+t', 'caaaaat').group()

'caaaaat'

In [20]:
re.match(r'ca+t', 'ct').group()

AttributeError: 'NoneType' object has no attribute 'group'

In [38]:
re.match(r'Co+kie', 'Cooookie').group()

'Cooookie'

In [39]:
# Checks for any occurrence of a or o or both in the given sequence
re.match(r'Ca*o*kie', 'Cookie').group()

'Cookie'

In [40]:
# u? matches zero or one occurrence of 'u', so the pattern accepts both 'Color' and 'Colour'
re.match(r'Colou?r', 'Color').group()

'Color'

In [41]:
# \d{7,11} matches a run of 7 to 11 digits. re.search is needed because the
# digits come after the 'Phone: ' prefix, not at the start of the string.
re.search(r'\d{7,11}', 'Phone: 03121234567').group()

'03121234567'

In [42]:
# Seven digits is the lower bound of {7,11}. re.search is needed because the
# digits come after the 'Phone: ' prefix, not at the start of the string.
re.search(r'\d{7,11}', 'Phone: 1234567').group()

'1234567'

In [9]:
re.match(r'ab+?',  "ab").group()

'ab'

### Greedy vs non-greedy

<pre>
Repetitions such as * or + are greedy; when repeating a RE, the matching engine will try to repeat it as many times as possible. If later 
portions of the pattern don’t match, the matching engine will then back up and try again with fewer repetitions
</pre>

![](../img/regex.png)

In [43]:
pattern = "cookie"
sequence = "Cake and cookie"

heading  = r'<h1>TITLE</h1>'
re.match(r'<.*>', heading).group()

'<h1>TITLE</h1>'

In [44]:
# *? matches as little text as possible
heading  = r'<h1>TITLE</h1>'
re.match(r'<.*?>', heading).group()

'<h1>'

## findall

In [60]:
email_address = "Please contact us at: no.reply@datacamp.com, xyz@datacamp.com"

In [61]:
addresses = re.search(r'[\w\.-]+@[\w\.-]+', email_address)

In [63]:
addresses.group(0)

'no.reply@datacamp.com'

In [64]:
addresses.group(1)

IndexError: no such group

In [66]:
email_address = "Please contact us at: no.reply@datacamp.com, xyz@datacamp.com"

#'addresses' is a list that stores all the possible match
addresses = re.findall(r'[\w\.-]+@[\w\.-]+', email_address)
for address in addresses: 
    print(address)

no.reply@datacamp.com
xyz@datacamp.com


#### Exercise
Count words in a string with regular expression

## Grouping

In [33]:
line = "Cats are smarter than dogs"

searchObj = re.search( r'(.*) are (.*?) .*', line)

In [34]:
print("searchObj.group(0) : ", searchObj.group(0))
print("searchObj.group(1) : ", searchObj.group(1))
print("searchObj.group(2) : ", searchObj.group(2))

searchObj.group(0) :  Cats are smarter than dogs
searchObj.group(1) :  Cats
searchObj.group(2) :  smarter


In [67]:
text = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
addresses = re.findall(r'([\w\.-]+)@([\w\.-]+)', text)
print(addresses)

[('alice', 'google.com'), ('bob', 'abc.com')]


In [51]:
for a in addresses:
    print(a[0], a[1])

alice google.com
bob abc.com


In [28]:
s = """
From: author@example.com
User-Agent: Thunderbird 1.5.0.9 (X11/20061227)
MIME-Version: 1.0
To: editor@example.com
"""

In [31]:
re.findall(r'\w+: (.+)', s)

['author@example.com',
 'Thunderbird 1.5.0.9 (X11/20061227)',
 '1.0',
 'editor@example.com']

## sub

In [52]:
re.sub?

In [53]:
email_address = "Please contact us at: xyz@datacamp.com"
new_email_address = re.sub(r'([\w\.-]+)@([\w\.-]+)', r'support@datacamp.com', email_address)
print(new_email_address)

Please contact us at: support@datacamp.com


#### Exercise
Write a regular expression that substitutes hotmail into outlook.

Input: "abc@hotmail.com"

Expected output: "abc@outlook.com"

In [16]:
# Lets try and reverse the order of the day and month in a date 
# string. Notice how the replacement string also contains metacharacters
# (the back references to the captured groups) so we use a raw 
# string for that as well.
# This will reorder the string and print:
#   24 of June, 9 of August, 12 of Dec

s = "June 24, August 9, Dec 12"
print(re.sub(r"([a-zA-Z]+) (\d+)", r"\2 of \1", s))

24 of June, 9 of August, 12 of Dec


#### Exercise

Write a regular expression that substitutes the date format dd-mm-yyyy into mm/dd/yyyy.

Input: "24-02-2018"

Expected output: "02/24/2018"

## compile

In [54]:
text = "Cake and cookie"

pattern = re.compile(r"cookie")
pattern.search(text).group()

'cookie'

In [55]:
# A compiled pattern can also be passed to the module-level functions.
# Uses `text` from the previous cell rather than a variable left over from an
# unrelated earlier section, so the cell works after Restart & Run All.
re.search(pattern, text).group()

'cookie'

#### Exercise

Match different date formats such as 24.02.2018 or 24/02/2018 or 24-02-18 or 24 02 2018

# Exercises

In [56]:
# The name must start with 'M' followed directly by digits: the
# M(\d+)_beta..._alpha... pattern used below does not match a 'Model12' prefix.
filename = 'M172_beta00.0_alpha9.0.txt'

In [57]:
pattern = re.compile(r'M(\d+)_beta(-*\d+\.\d)_alpha(-*\d+\.\d)')
m = re.search(pattern, filename)

In [58]:
m.group(2)

'00.0'

In [59]:
model=float(m.group(1))
beta=float(m.group(2))
alpha=float(m.group(3))

print(model, beta, alpha)

172.0 0.0 9.0


In [60]:
p = re.compile(r'(?<=_)([a-zA-Z]+)(-?\d+[.]?\d?)')
matches = p.findall(filename)
params = {k: float(v) for (k, v) in matches}

In [73]:
# Each parameter is "<name><number>" directly after an underscore.
# The fractional part is an optional group (?:\.\d*)? — a dot followed by any
# number of digits. A character class such as [.\d+]? would match only ONE
# character ('.', a digit or a literal '+'), truncating values like '0.25'.
p = re.compile(r'(?<=_)([a-zA-Z]+)(-?\d+(?:\.\d*)?)')
matches = p.findall('M172_beta0._alpha9.txt')
{k: float(v) for (k, v) in matches}

{'alpha': 9.0, 'beta': 0.0}

In [66]:
params

{'M': 172.0, 'alpha': 9.0, 'beta': 0.0}

In [61]:
import re
import requests
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'

def _extract_opening(raw, stop_pattern):
    """Return the slice of ``raw`` after the Gutenberg start marker and before the first ``stop_pattern`` match that follows it."""
    # Accept both the "THIS" and the "THE" wording of the start marker.
    marker = re.search(r"\*\*\* START OF TH(?:IS|E) PROJECT GUTENBERG EBOOK .* \*\*\*", raw)
    if marker is None:
        raise ValueError("Project Gutenberg start marker not found in the downloaded text")
    start = marker.end()
    # Look for the stop pattern only AFTER the start marker, so an occurrence in
    # the header cannot produce an empty or reversed slice.
    stop_match = re.compile(stop_pattern).search(raw, start)
    stop = stop_match.start() if stop_match else len(raw)
    return raw[start:stop]


def get_book(url, stop_pattern=r"II", timeout=30):
    """Download a Project Gutenberg text and return its opening section.

    The returned text starts right after the "*** START OF ... PROJECT
    GUTENBERG EBOOK ... ***" marker and ends just before the first match of
    ``stop_pattern`` that follows it. With the default ``"II"`` this cuts the
    text at the first occurrence of the characters "II" (it does not strip the
    trailing Gutenberg licence as such). If ``stop_pattern`` never matches,
    everything after the start marker is returned.

    Parameters
    ----------
    url : str
        Address of the plain-text book.
    stop_pattern : str, optional
        Regular expression marking where the kept text ends.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The selected part of the book.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the start marker is not present in the downloaded text.
    """
    # Sends a http request to get the text from project Gutenberg
    response = requests.get(url, timeout=timeout)
    # Fail early on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    return _extract_opening(response.text, stop_pattern)

def preprocess(sentence):
    """Normalise text: collapse every run of characters other than ASCII
    letters, digits and '.' into a single space, then lowercase the result."""
    collapsed = re.sub('[^A-Za-z0-9.]+', ' ', sentence)
    return collapsed.lower()

book = get_book(the_idiot_url)
processed_book = preprocess(book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part i i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [62]:
# Count the occurrences of the character sequence 'the' in the corpus.
# Note: the pattern has no word boundaries, so it also counts 'the' inside
# longer words such as 'them' or 'there'; r'\bthe\b' would count only the article.
len(re.findall(r'the', processed_book))

302

In [63]:
# Try to convert every single stand-alone instance of 'i' to 'I' in the corpus.
# Make sure not to change the 'i' occurring in a word.
# Note: \s is required on both sides, so an 'i' followed by punctuation (e.g. 'i.')
# or sitting at the very start or end of the text is left unchanged.
processed_book = re.sub(r'\si\s', " I ", processed_book)
print(processed_book)

 produced by martin adamson david widger with corrections by andrew sly the idiot by fyodor dostoyevsky translated by eva martin part I i. towards the end of november during a thaw at nine o clock one morning a train on the warsaw and petersburg railway was approaching the latter city at full speed. the morning was so damp and misty that it was only with great difficulty that the day succeeded in breaking and it was impossible to distinguish anything more than a few yards away from the carriage windows. some of the passengers by this particular train were returning from abroad but the third class carriages were the best filled chiefly with insignificant persons of various occupations and degrees picked up at the different stations nearer town. all of them seemed weary and most of them had sleepy eyes and a shivering expression while their complexions generally appeared to have taken on the colour of the fog outside. when day dawned two passengers in one of the third class carriages fou

In [64]:
# Find the number of times anyone was quoted ("") in the corpus
len(re.findall(r'\”', book))

96

In [65]:
# What are the words connected by '--' in the corpus?
re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', book)
# re.findall(r'\w*--\w*', book)

['ironical--it',
 'malicious--smile',
 'fur--or',
 'astrachan--overcoat',
 'it--the',
 'Italy--was',
 'malady--a',
 'money--and',
 'little--to',
 'No--Mr',
 'is--where',
 'I--I',
 'I--',
 '--though',
 'crime--we',
 'or--judge',
 'gaiters--still',
 '--if',
 'through--well',
 'say--through',
 'however--and',
 'Epanchin--oh',
 'too--at',
 'was--and',
 'Andreevitch--that',
 'everyone--that',
 'reduce--or',
 'raise--to',
 'listen--and',
 'history--but',
 'individual--one',
 'yes--I',
 'but--',
 't--not',
 'me--then',
 'perhaps--',
 'Yes--those',
 'me--is',
 'servility--if',
 'Rogojin--hereditary',
 'citizen--who',
 'least--goodness',
 'memory--but',
 'latter--since',
 'Rogojin--hung',
 'him--I',
 'anything--she',
 'old--and',
 'you--scarecrow',
 'certainly--certainly',
 'father--I',
 'Barashkoff--I',
 'see--and',
 'everything--Lebedeff',
 'about--he',
 'now--I',
 'Lihachof--',
 'Zaleshoff--looking',
 'old--fifty',
 'so--and',
 'this--do',
 'day--not',
 'that--',
 'do--by',
 'know--my',
 'il

#### Exercise
Parse Ansys input file

In [77]:
with open('../io/ansys_dynamic_analysis.txt','r') as f:
    lines = f.read().splitlines()

In [78]:
print(lines)

['  MODE    FREQUENCY (HERTZ)', '', '', '   FREQUENCY RANGE REQUESTED=   10 MODES ABOVE   1.00000     HERTZ', '', '    1     31.39597765453', '    2     197.7769261804', '    3     310.3047954715', '    4     560.2065008854', '    5     609.3867968072', '    6     1118.292750872', '    7     1842.107031029', '    8     1880.162355068', '    9     1895.676378805', '   10     2923.145449564', '   ', '   NAME         VALUE         TYPE     DIMENSIONS', '     AZ6       617.123580      SCALAR', '     AZ6A      381172.736      SCALAR', '     AZ6B      617.391882      SCALAR', '     UZ6      1.029165510E-02  SCALAR', '     UZ6A     1.060781103E-04  SCALAR', '     UZ6B     1.029942282E-02  SCALAR', '     VZ6       2.06301454      SCALAR', '     VZ6A      4.25731207      SCALAR', '     VZ6B      2.06332549      SCALAR', '', '', '']


In [94]:
ulines = [line for line in lines if line.strip() not in {'\n', ''}]

In [96]:
for line in ulines:
    print(line)

  MODE    FREQUENCY (HERTZ)
   FREQUENCY RANGE REQUESTED=   10 MODES ABOVE   1.00000     HERTZ
    1     31.39597765453
    2     197.7769261804
    3     310.3047954715
    4     560.2065008854
    5     609.3867968072
    6     1118.292750872
    7     1842.107031029
    8     1880.162355068
    9     1895.676378805
   10     2923.145449564
   NAME         VALUE         TYPE     DIMENSIONS
     AZ6       617.123580      SCALAR
     AZ6A      381172.736      SCALAR
     AZ6B      617.391882      SCALAR
     UZ6      1.029165510E-02  SCALAR
     UZ6A     1.060781103E-04  SCALAR
     UZ6B     1.029942282E-02  SCALAR
     VZ6       2.06301454      SCALAR
     VZ6A      4.25731207      SCALAR
     VZ6B      2.06332549      SCALAR


In [98]:
m = re.search(r'\((\w+)\)', ulines[0])

In [99]:
m

<_sre.SRE_Match object; span=(20, 27), match='(HERTZ)'>

In [49]:
freq_unit = m.group(1)

In [116]:
s = ulines[1]
s

'   FREQUENCY RANGE REQUESTED=   10 MODES ABOVE   1.00000     HERTZ'

In [117]:
m = re.search(r'(\d+)', s)

In [118]:
num_modes = int(m.group(1))

In [119]:
num_modes

10

In [150]:
modes = []
pattern = re.compile(r'(\d+).*?(\d+\.\d+)')
for line in ulines[2:2+num_modes]:
    print(line)
    m = re.search(pattern, line)
    modes.append(m.group(2))

    1     31.39597765453
    2     197.7769261804
    3     310.3047954715
    4     560.2065008854
    5     609.3867968072
    6     1118.292750872
    7     1842.107031029
    8     1880.162355068
    9     1895.676378805
   10     2923.145449564


In [151]:
m

<_sre.SRE_Match object; span=(3, 24), match='10     2923.145449564'>

In [152]:
m.group(2)

'2923.145449564'

In [153]:
modes

['31.39597765453',
 '197.7769261804',
 '310.3047954715',
 '560.2065008854',
 '609.3867968072',
 '1118.292750872',
 '1842.107031029',
 '1880.162355068',
 '1895.676378805',
 '2923.145449564']

In [156]:
s = ulines[2+num_modes]
s

'   NAME         VALUE         TYPE     DIMENSIONS'

In [159]:
headers = re.findall(r'\w+', s)
headers

['NAME', 'VALUE', 'TYPE', 'DIMENSIONS']

In [169]:
seq = []
for line in ulines[2+num_modes+1:]:
    m = re.search(r'\s*(\S+)\s*(\S+)\s*(\S+)' ,line)
    seq.append((m.group(1),m.group(2),m.group(3)))

In [170]:
seq

[('AZ6', '617.123580', 'SCALAR'),
 ('AZ6A', '381172.736', 'SCALAR'),
 ('AZ6B', '617.391882', 'SCALAR'),
 ('UZ6', '1.029165510E-02', 'SCALAR'),
 ('UZ6A', '1.060781103E-04', 'SCALAR'),
 ('UZ6B', '1.029942282E-02', 'SCALAR'),
 ('VZ6', '2.06301454', 'SCALAR'),
 ('VZ6A', '4.25731207', 'SCALAR'),
 ('VZ6B', '2.06332549', 'SCALAR')]

In [171]:
d = {}
for item in seq:
    d[item[0]] = float(item[1])

In [172]:
d

{'AZ6': 617.12358,
 'AZ6A': 381172.736,
 'AZ6B': 617.391882,
 'UZ6': 0.0102916551,
 'UZ6A': 0.0001060781103,
 'UZ6B': 0.01029942282,
 'VZ6': 2.06301454,
 'VZ6A': 4.25731207,
 'VZ6B': 2.06332549}