## Regular Expressions

<img src="regex.png">

re.search(pattern, string)

In [1]:
import re

In [2]:
# Two sample strings over the letters A, C, G and T, used as search targets.
seq0, seq1 = "AAACCCTTTGGG", "AAGCGTTGGG"

In [3]:
# "GGA" does not occur anywhere in seq0, so re.search returns None.
m = re.search("GGA", seq0)
m is None

True

In [4]:
# "." matches any single character (except a newline by default), so "G.G"
# looks for a G, any one character, then another G.
m = re.search("G.G", seq1)
m is None

False

In [5]:
# Display the match object; its repr shows the matched span and text.
m

<_sre.SRE_Match object; span=(2, 5), match='GCG'>

In [6]:
# start() is the index of the first character of the match.
m.start()

2

In [7]:
# end() is the index just past the last character of the match.
m.end()

5

In [8]:
# re.search stops at the first match, so group() returns only that first
# matching substring, even if the pattern occurs again later in the string.
m.group()

'GCG'

re.findall(pattern, string)

In [9]:
# findall returns every match of the pattern as a list of strings,
# not just the first one.
re.findall("G.G", seq1)

['GCG', 'GGG']

In [11]:
# (?:...) is a non-capturing group, so findall returns the whole match.
# With a capturing group, (\w+), it would return only the captured word.
re.findall(r"a (?:\w+) dog", "It's a cute dog!")

['a cute dog']

re.compile(pattern)

In [12]:
# test whether a string matches the pattern
pattern = re.compile(r"m{2,4}")
print(pattern.match(""))
print(pattern.match("m"))
print(pattern.match("mm"))
print(pattern.match("mmm"))
print(pattern.match("mmmm"))
print(pattern.match("ammmmm"))
print(pattern.match("mammm"))
print(pattern.match("mmmmammm"))

None
None
<_sre.SRE_Match object; span=(0, 2), match='mm'>
<_sre.SRE_Match object; span=(0, 3), match='mmm'>
<_sre.SRE_Match object; span=(0, 4), match='mmmm'>
None
None
<_sre.SRE_Match object; span=(0, 4), match='mmmm'>


In [13]:
# "a{5}" needs five consecutive "a" characters at the start of the string;
# "aaabb" only has three, so match() returns None.
pattern = re.compile(r"a{5}")
print("Found it!" if pattern.match("aaabb") else "Nope...")

Nope...


In [20]:
# substitute any substrings matching the pattern with another string
s = "In principio erat verbum, et verbum erat apud Deum."
pattern = re.compile(r"verbum")
pattern.sub("XXX", s)

'In principio erat XXX, et XXX erat apud Deum.'

In [15]:
# Remove every vowel from s. A character class is the idiomatic way to say
# "any one of these characters"; [aeiouAEIOU] matches exactly the same
# characters as the alternation a|A|e|E|o|O|u|U|i|I, but is shorter and
# easier to extend.
vowel_pattern = re.compile(r"[aeiouAEIOU]")
without_vowels = vowel_pattern.sub("", s)
print(without_vowels)

n prncp rt vrbm, t vrbm rt pd Dm.


In [16]:
# "|" separates alternatives: any occurrence of "ri", "um" or "Th" is replaced.
# An alternative that never occurs in the string is simply never substituted.
p = re.compile(r"ri|um|Th")
print(p.sub("X", s))

In pXncipio erat verbX, et verbX erat apud DeX.


In [23]:
# [^A-Z] is a negated character class: it matches every character that is
# NOT an uppercase ASCII letter. Replacing those with spaces leaves only the
# capitals, so despite its name, without_ups holds just the uppercase letters.
ups = re.compile(r"[^A-Z]")
without_ups = ups.sub(" ", s)
print(without_ups)

I                                             D    


In [24]:
# Collapse every run of one or more spaces into a single space.
paper = (
    "My thesis on  biology     contains a lot of  double spaces.   "
    "I will remove  them."
)
mult = re.compile(r" +")
print(mult.sub(" ", paper))

My thesis on biology contains a lot of double spaces. I will remove them.


Escape characters

In [26]:
# An unescaped "." matches any character; "\." matches only a literal full stop.
dot = re.compile(r"\.")
print(dot.sub("X", s))

In principio erat verbum, et verbum erat apud DeumX


In [27]:
s = "In principio [erat] verbum, et verbum erat apud Deum."
# Unescaped, "[|]" is a character class containing only "|", so it never
# matches a square bracket. Escaping the brackets makes them literal, and
# the "|" between them then acts as alternation.
brackets_wrong = re.compile(r"[|]")
brackets_right = re.compile(r"\[|\]")
for bracket_pattern in (brackets_wrong, brackets_right):
    print(bracket_pattern.sub("X", s))

In principio [erat] verbum, et verbum erat apud Deum.
In principio XeratX verbum, et verbum erat apud Deum.


In [28]:
# Split a string using a pattern
s = """This is a text  on three   lines
with   multiple instances of  
double spaces."""
whitespace = re.compile(r"\s+")
print(whitespace.split(s))

['This', 'is', 'a', 'text', 'on', 'three', 'lines', 'with', 'multiple', 'instances', 'of', 'double', 'spaces.']


In [29]:
# Display the repr of s: it makes the newlines and repeated spaces visible.
s

'This is a text  on three   lines\nwith   multiple instances of  \ndouble spaces.'

### Great Tutorial:
https://github.com/jiffyclub/notebooks/blob/master/regex-intro.ipynb