In [2]:
#chapter 7: regular expressions

#many techniques involve searching for fixed patterns but sometimes we want more flexibility
#for this we use regular expressions

#modules in python
#module = collection of specialized tools
#modules are not automatically available they must be loaded
#use an import statement followed by the module name to load a module at the top of the program

#module that deals with regular expressions is re

#load the re module
import re

#when you want to use tools from the module you must prefix it with the module name
#re.search(pattern, string)

#if you forget to load the module or to prefix the function with the module name you will get a NameError

#raw strings


In [3]:
#raw strings

#put the letter r before the opening quotation mark to tell python not to interpret backslash escape
#sequences (such as \t or \n) inside the string - the r stands for raw
#the regular expression patterns in these notes are all written as raw strings

#example: the raw string prints the backslashes literally
print(r'\t\n')
#vs: the normal string turns \t into a tab and \n into a newline
print('l\tll\nlll')

\t\n
l	ll
lll


In [4]:
#searching for a pattern in a string

#re.search determines whether or not a pattern appears somewhere in a string
#it returns a match object (which counts as true) when the pattern is found and None (which counts as false) otherwise
#re.search takes two arguments (both strings)
    #1st-pattern you want to search for
    #2nd-the string you want to search in
    
#check if a dna sequence contains an EcoRI restriction site
dna = 'ATCGCGAATTCAC'
if re.search(r'GAATTC', dna):
    print('restriction site found')

restriction site found


In [5]:
#alternation

#find AvaII restriction sites; the two motifs searched for below are GGACC and GGTCC
#using a complex condition with an or statement
if re.search(r'GGACC', dna) or re.search(r'GGTCC', dna):
    print('restriction site found')
#using a single regular expression: (A|T) matches either A or T at that position
if re.search(r'GG(A|T)CC', dna):
    print('restriction site found')

In [6]:
#character groups

#for wider variation, alternation gets long-winded
if re.search(r'GC(A|T|G|C)GC', dna):
    print('restriction site found')
    
#better way
#a pair of square brackets with a list of characters inside them tells python to accept any one of those characters at that spot
#these bracketed lists are referred to as character groups

#example
if re.search(r'GC[ATGC]GC', dna):
    print('restriction site found')
    
#in situations where you want to match any character in that position use a period
#(by default a period matches any character except a newline)
#example
if re.search(r'GC.GC', dna):
    print('restriction site found')
    
#instead of listing all the characters you want to match, you can list the characters you DON'T want to match
#do this by using square brackets with a ^ followed by the characters you don't want matched
#example: any single character other than X, Y or Z is accepted between GC and GC
if re.search(r'GC[^XYZ]GC', dna):
    print('restriction site found')

In [7]:
#Quantifiers

#quantifiers describe variation in the number of times a section of a pattern is repeated

#a question mark following a character denotes that the character is optional (can match 0 or 1 times)

#example
if re.search(r'GAT?C', dna):
    print('pattern found')
    #in this the T is optional and the pattern will match either GATC or GAC
    
#if you want the question mark to apply to more than one character in a row you can group the characters in ().
#for example GGG(AAA)?TTT will match either GGGAAATTT or GGGTTT

#a plus sign following a character denotes that the character must be present but can be repeated any number of times
if re.search(r'GA+TC', dna):
    print('found')
    #this will match GATC, GAATC, GAAATC, GAAAATC, etc but NEVER GTC
    
#an asterisk following a character means the character is optional and can be repeated (will match 0 or more)
#for example GA*TC will match GTC, GATC, GAATC, etc

#to specify an exact number of repeats use curly brackets containing a number
#for example GA{5}T will match only GAAAAAT

#curly brackets with a pair of numbers separated by a comma specify a range of acceptable repeats
#for example GA{2,4}T will match GAAT, GAAAT and GAAAAT

In [8]:
#Positions
#don't represent characters but positions in an input string
#^ matches the start of a string
#$ matches the end of a string
#^AAA will match AAATGCGT but NOT CGTCTGAAATCTTA
#GGG$ will match CTACGATTTGGG but NOT GGGATATCTA (the $ goes at the end of the pattern)

#combining
#any of these tools can be combined
#example
#^ATG[ATGC]{30,1000}A{5,10}$ will match a string that starts with ATG, continues with 30 to 1000 bases
#and then has 5 to 10 As at the very end (a simple description of a full length mRNA)

#re.match is similar to re.search except re.match will only match if the pattern is found at the start of the string
#whereas re.search will match if the pattern can be found anywhere in the string
#(re.fullmatch is the function that requires the pattern to match the entire string)

#extracting part of the string that matched
#store the result of re.search then use the group method on the resulting object
#re.search returns a value called a match object(represents the results of re.search)-can use methods to get data out

#if you use the group method on the match object you get the portion of the input string that matched the pattern

#store/group match object
#note: re.search returns None when nothing matches, so calling group on the result would then raise an AttributeError
dna = 'ATGACGTACGTACGACTG'
m = re.search(r'GA[ATGC]{3}AC', dna)
print(m.group())

GACGTAC


In [5]:
#pulling several pieces out of a single match

#capturing: wrap each piece you want to extract in parentheses
#the captured pieces are numbered from 1, left to right
#the group method returns the pieces you ask for by number; group 0 is the whole match
import re

dna = 'ATGACGTACGTACGACTG'
m = re.search(r'GA([ATGC]{3})AC([ATGC]{2})AC', dna)
#asking for several groups at once returns them together as a tuple
entire_match, first_bit, second_bit = m.group(0, 1, 2)
print(f'entire match: {entire_match}')
print(f'first bit: {first_bit}')
print(f'second bit: {second_bit}')

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [6]:
#locating a match within the string
#a match object records where the match was found, not just what was matched
#span returns the same two numbers as the start and end methods, as a (start, end) pair
#python starts counting at 0, and end is the position just after the last matched character

m = re.search(r'GA([ATGC]{3})AC([ATGC]{2})AC', dna)
match_start, match_end = m.span()
print(f'start: {match_start}')
print(f'end: {match_end}')



start: 2
end: 13


In [7]:
#positions of individual captured groups are available too: pass the group number to span
#(span(n) gives the same pair of values as start(n) and end(n))
for group_number in (1, 2):
    group_start, group_end = m.span(group_number)
    print(f'group {group_number} start: {group_start}')
    print(f'group {group_number} end: {group_end}')

group 1 start: 4
group 1 end: 7
group 2 start: 9
group 2 end: 11


In [8]:
#splitting a string with a regular expression as the delimiter

#the split function in the re module accepts a regular expression
    #arguments
        #the regular expression to use as the delimiter
        #the string to split

#break a dna sequence apart wherever an ambiguity code (any character other than A, T, G or C) occurs
ambiguity_pattern = r'[^ATGC]'
dna = 'ACTNGCATRGCTACGTYACGATSCGAWTCG'
runs = re.split(ambiguity_pattern, dna)
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [9]:
#finding multiple matches

#re.findall returns a list of all non-overlapping matches of a pattern in a string
    #for a pattern without capturing groups the result is a list of strings, not match objects,
    #so it holds no positional info
    #arguments
        #pattern
        #string

#find all runs of 4 or more A/T bases in a dna sequence
#'{4,}' leaves the upper bound open - a cap such as {4,100} would report a run longer than
#100 bases as several separate matches
AT_RUN_PATTERN = r'[AT]{4,}'

dna = 'ACTGCATTATATCGTACGAAATTATACGCGCG'
runs = re.findall(AT_RUN_PATTERN, dna)
print(runs)
    


['ATTATAT', 'AAATTATA']


In [15]:
#re.finditer returns an iterator of match objects, one for each non-overlapping match
    #to do anything useful you must use the return value in a loop

#example: report the position of every run of 3 or more A/T bases
#'{3,}' leaves the upper bound open - a cap such as {3,100} would report a run longer than
#100 bases as several separate regions
AT_RICH_PATTERN = r'[AT]{3,}'

dna = 'ACTGCATTATATCGTACGAAATTATACGCGCG'
runs = re.finditer(AT_RICH_PATTERN, dna)
for match in runs:
    #start is the index of the first matched base, end is the index just after the last one
    run_start = match.start()
    run_end = match.end()
    print('AT rich region from ' + str(run_start) + ' to ' + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26
