# Common variables

In [13]:
all_nucleotides = ["a", "t", "g", "c", "u"]
dna_bases = ["A", "C", "G", "T"]
codons = [a+b+c for a in dna_bases for b in dna_bases for c in dna_bases]
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
codon_table = dict(zip(codons, amino_acids))

## Common functions linked to DN

#### Count Codons

In [3]:
# Count codons
def count_codons(sequence:str, frame:int=0, num_nucleotide: int=3):
    ''' 
    This function returns a dictionary with the different codons within a sequence and the number of their occurrence.
    There is also the possibility to provide the reading frame in which to start, but then whatever comes before is not taken into account.
    Also can specify how many nucleotides per codon (default 3)
    
    '''
    codons = {}
    for i in range(frame, len(sequence), num_nucleotide):
        codon = sequence[i: i+num_nucleotide]
        if codon in codons:
            codons[codon] += 1
        else:
            codons[codon] = 1
    return codons

#### Translate sequence

In [5]:
# Translate sequence

def translate_sequence(sequence: str, frame: int=0, start_at = ""):
    '''
    This function returns a string containing the translated amino acids to the sequence provided. The sequence can either be RNA or DNA. 
    However, this needs the codon_table variable!
    If you want to find the ATG codon, just add find_start
    
    '''
    # Generates the translation dictionary
    dna_bases = ["A", "C", "G", "T"]
    codons = [a+b+c for a in dna_bases for b in dna_bases for c in dna_bases]
    amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
    codon_table = dict(zip(codons, amino_acids))
    
    # Actual code
    sequence = sequence.upper().replace("U", "T")
    start = 0
    translated_sequence = ""
    if len(start_at) > 0: 
        start = sequence.find(start_at.upper())
    for j in range(start, len(sequence), 3):
        codon = sequence[j:j+3]
        if codon in codon_table:
            translated_sequence += codon_table[codon]
    return translated_sequence


    

#### Random DNA generator

In [11]:
def gen_random_dna(seq_length: int = 100, nucleotides:str="ATGC", use_as_start: str = ""):
    '''
    This function generates a random string of DNA. You could also make it RNA by passing on the nucleotides you want as arguments. 
    You can also specify the sequence you want it to start with.
    '''
    import random as rd
    sub_this = len(use_as_start)
    sequence = use_as_start
    for n in range(0,seq_length - sub_this):
        nucleotide = rd.choice(nucleotides)
        sequence += nucleotide
    return sequence

## Common functions linked to File handling

#### Open and sort .FASTA file 

## MatPlotLib stuff

#### Info
MatPlotLib is a very powerful plotting program. Below are some things to take into account when making your plots.
Watch out, the MatPlotLib cheat sheet uses the module numpy to generate random data. We are not allowed to use numpy!

#### Plot types

In [None]:
import matplotlib as plt
import matplotlib.pyplot as plt
# line plot
plt.plot([x],[y])
# scatter (useful, but not in the exercises
plt.scatter([x],[y])
# bar
plt.bar([x],[height])
# histogram
plt.hist([x],binsize) #if x is a list of lists, multiple histograms will be shown in one plot (one for each list)


#### Extra stuff

In [None]:
# Make your plots fancy with the following:
plt.title('Title') 
plt.ylabel('Y')
plt.xlabel('X')
plt.legend(["1","2","3"])
plt.xticks([1, 2, 3, 4], ["A", "C", "G", "T"]) # change the name of the ticks on the x axis, see next cell for a different way to do this

In [None]:
# extra variables within the plotting commands
# the following two are the same:
plt.plot(x, y, 'go--', linewidth=2, markersize=12)
plt.plot(x, y, color='green', marker='o', linestyle='dashed',
     linewidth=2, markersize=12)
# add labels to your lines
plt.plot([x],[y], label = "plot")
# change tick names immediately
plt.plot([x],[y], tick_label = ["A", "C", "G", "T"])

In [None]:
# make a bar plot with two sets of data
plt.bar([1,3,5,7], data1, align = 'edge', width = -0.8)
plt.bar([1,3,5,7], data2, align = 'edge')
plt.show()

## RegEx stuff and explanation/examples

#### Info

Much is already on the regex cheat sheet, so here are just some extra notes.
Useful in big files for "stuff which starts with" or "ends with" or finding the stop codon.

#### Not on the cheat sheet

In [19]:
import re
# "$" is to be placed after the character at the end
end = "ACG$" #not "$ACG"

# groups are referred to as \g<n>. This will result in an error message, but also the correct string
ex1 = "ATCGCC"
ex1_subbed = re.sub("(AT)(C)(G)", "\g<2>", ex1) # this only keeps group 2, resulting in the string CCC

# re.findall(pattern, string) is used to make a list of all strings that match
# re.search(pattern, string) returns the position of the first instance of that string
    # re.search.group -> prints the pattern that was found
    # re.search.start -> prints the index of the first char of the pattern
    # re.search.end -> prints the index of the first char after the pattern
    # re.search.span -> prints a tuple of (start, end)
# re.sub(pattern to replace, what to replace it with, string) finds and replaces parts of a string
# re.split(pattern, string) splits the string into a list, removing the pattern
    # grouping of the pattern retains it as an item on the list (a separate item for each group)

CCC


  ex1_subbed = re.sub("(AT)(C)(G)", "\g<2>", ex1)


#### Stop codon search

In [20]:
# for finding a stop codon at the end of the sequence
# remove $ to find all stop codons, remove bool() to display them

import re
sequence= "ATGCAATCATATGCTTCTGCTAGGTTAA"
stop_codon = "T(AA|AG|GA)$" # the stop codons are TAA, TAG and TGA
print(bool(re.search(stop_codon, sequence))) # prints True or False, based on if the last three letters form a stop codon or not

True


#### Two restriction enzymes

In [None]:
# for using two restriction enzymes in tandem, and printing the resulting fragments and their length
# provide your own dna
import re

bamh1 = "(G)(GATCC)"
acc1 = "(GT)([AC][GT]AC)"
cut1 = re.sub(bamh1, "\g<1>\n\g<2>", dna) #provide your own dna
cut2 = re.sub(acc1, "\g<1>\n\g<2>", cut1)
end_result = re.split("\\n", cut2)
print("Number of fragments:", len(end_result))
for seq in end_result:
    print("Sequence:", seq,"\nLength:", len(seq))

In [None]:
# this is the example Basten Snoek gives for groups in groups, so do not copy this directly!
import re
# put the pattern of the enzyme that is cutting first first, to make sure none of its cut sites are first cut by the second enzyme
dna_digest = re.sub("((G)(GATCC))|((GT)([AC][GT]AC))","\g<2>\g<5>s\g<3>\g<6>",dna)
split_output = re.split("s",dna_digest) # Basten chooses to place an s, whereas I used a \n in the example above. Anything works, since you remove it when splitting anyway
print (split_output)
print (len(split_output))
items = {}
for item in split_output:
    #print (len(item))
    items[item] = len(item)

print (items)


# Assignment Challenge: Loops within dictionaries (Day 2)

"For this assignment, we're going to write a program that translates DNA sequences into protein sequences.

Loop over the DNA sequence 'ACTGACTGACTGAATTCGACTG' in steps of 3 such that you get the codons.
Translate each of these codons to their amino acids as defined in the dictionary codon_table defined below. Print the entire translated protein sequence.
Did you take the reading frames into account? If not, get the codons for the other reading frames as well. Translate each reading frame into a protein sequence.
Now translate all of the valid DNA sequences in the list seqs. Invalid sequences should not be translated.""


In [4]:
### make sure you understands what happens here and run this cell
bases = ['T', 'C', 'A', 'G']
codons = [a+b+c for a in bases for b in bases for c in bases]
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
codon_table = dict(zip(codons, amino_acids))
### The zip creates a list of tuples between two types of containers that have the same amounts of entries


In [None]:
# Translate the following sequence while reading frames into account
### If you want to make the code even better, you can combine the search codon logic with that of the next cell
sequence='ACTGACTGACTGAATTCGACTG'
frames = {}
translated_sequences = {}

for i in range(3):
    cods = []
    for j in range(i, len(sequence), 3):
        cods.append(sequence[j:j+3])
    print(cods)
    frames[f'Reading frame {i}'] = cods

print(frames)

for k, frame in frames.items():
    translation = ""
    for codon in frame:
        if codon in codon_table:
            translation += codon_table[codon]
    translated_sequences[k] = translation


print(translated_sequences)

In [None]:
# Loop over the seqs and translate the valid DNA sequences
seqs = ['actgactgactgaattcgactg','caucgaucgcgauacacgaucagcuacg','augcagacgacguacgu','atcgatcgatcgatcacgt','atcgtagctactagctagc','acgatcgtagctacgta','cgaucagucgaucgauccagcga','cguacguagcacaugcagucaguauacguacggacgacgac','catgactgactgatcgatgctgactgactg','atcggatctgaactgactg','actgactgactgactg','caucgaucgcgauacacgaucagcuacg','augcagacgacguacgu','atcgatcgaattcgatcgatcacgt','atcgtagctactagctagc','acgatcgaattcgtagctacgta','cgaucagucgaucgauccagcga','cguacguagcacaugcagucaguauacguacggacgacgac','catgactgactgatcgatgaattcgctgactgactg','aucggauccgaaccgacag']
translated_seqs = []

for seq in seqs:
    seq = seq.upper()
    seq = seq.replace("U", "T")
    translation = ""
    for i in range(0, len(seq), 3):
        cod = seq[i:i+3]
        if cod in codon_table:
            translation += codon_table[cod]
    translated_seqs.append(translation)
    print(translation)


# File Handling Good practices:

You should close the file after doing everything