### Helper Utils

In [27]:
from collections import defaultdict
def parse_input(file):
    graph =  defaultdict(list)

    for line in file.readlines():
        x, y = line.strip().split(" -> ")
        u, v = int(x), list( map(int,y.split(",")))
        graph[u].extend(v)

    return graph

def reverse_complement(text):
    text = text[::-1]
    switcher = {
        "A" : "T",
        "T" : "A",
        "G" : "C",
        "C" : "G"}
    reversed = ""
    for i in range(len(text)):
        reversed += switcher.get(text[i])
    return reversed

RNA2PEPTIDE = {
    'UUU': 'F',     'CUU': 'L',     'AUU': 'I',     'GUU': 'V',
    'UUC': 'F',     'CUC': 'L',     'AUC': 'I',     'GUC': 'V',
    'UUA': 'L',     'CUA': 'L',     'AUA': 'I',     'GUA': 'V',
    'UUG': 'L',     'CUG': 'L',     'AUG': 'M',     'GUG': 'V',
    'UCU': 'S',     'CCU': 'P',     'ACU': 'T',     'GCU': 'A',
    'UCC': 'S',     'CCC': 'P',     'ACC': 'T',     'GCC': 'A',
    'UCA': 'S',     'CCA': 'P',     'ACA': 'T',     'GCA': 'A',
    'UCG': 'S',     'CCG': 'P',     'ACG': 'T',     'GCG': 'A',
    'UAU': 'Y',     'CAU': 'H',     'AAU': 'N',     'GAU': 'D',
    'UAC': 'Y',     'CAC': 'H',     'AAC': 'N',     'GAC': 'D',
    'UAA': 'Stop',  'CAA': 'Q',     'AAA': 'K',     'GAA': 'E',
    'UAG': 'Stop',  'CAG': 'Q',     'AAG': 'K',     'GAG': 'E',
    'UGU': 'C',     'CGU': 'R',     'AGU': 'S',     'GGU': 'G',
    'UGC': 'C',     'CGC': 'R',     'AGC': 'S',     'GGC': 'G',
    'UGA': 'Stop',  'CGA': 'R',     'AGA': 'R',     'GGA': 'G',
    'UGG': 'W',     'CGG': 'R',     'AGG': 'R',     'GGG': 'G'
}

DNA2PEPTIDE = {
    'TTT': 'F',      'CTT': 'L',      'ATT': 'I',      'GTT': 'V',
    'TTC': 'F',      'CTC': 'L',      'ATC': 'I',      'GTC': 'V',
    'TTA': 'L',      'CTA': 'L',      'ATA': 'I',      'GTA': 'V',
    'TTG': 'L',      'CTG': 'L',      'ATG': 'M',      'GTG': 'V',
    'TCT': 'S',      'CCT': 'P',      'ACT': 'T',      'GCT': 'A',
    'TCC': 'S',      'CCC': 'P',      'ACC': 'T',      'GCC': 'A',
    'TCA': 'S',      'CCA': 'P',      'ACA': 'T',      'GCA': 'A',
    'TCG': 'S',      'CCG': 'P',      'ACG': 'T',      'GCG': 'A',
    'TAT': 'Y',      'CAT': 'H',      'AAT': 'N',      'GAT': 'D',
    'TAC': 'Y',      'CAC': 'H',      'AAC': 'N',      'GAC': 'D',
    'TAA': 'Stop',   'CAA': 'Q',      'AAA': 'K',      'GAA': 'E',
    'TAG': 'Stop',   'CAG': 'Q',      'AAG': 'K',      'GAG': 'E',
    'TGT': 'C',      'CGT': 'R',      'AGT': 'S',      'GGT': 'G',
    'TGC': 'C',      'CGC': 'R',      'AGC': 'S',      'GGC': 'G',
    'TGA': 'Stop',   'CGA': 'R',      'AGA': 'R',      'GGA': 'G',
    'TGG': 'W',      'CGG': 'R',      'AGG': 'R',      'GGG': 'G' 
}

mass_peptide = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114, 
                'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M':131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}

### Find an Eulerian Cycle in a Graph

In [2]:
def find_cycle(graph, start):
    cycle = []
    u = graph[start].pop()
    while u != start :
        cycle.append(u)
        u = graph[u].pop()
    cycle.append(u)

    isolated = [k for k, v in graph.items() if not v]
    for k in isolated:
        del graph[k]

    return cycle

def find_eulerian_cycle(graph, start):
    cycle = [start] + find_cycle(graph, start)
    visited = True
    while visited:
        visited = False
        for i, node in enumerate(cycle):
            if node in graph:
                visited = True
                cycle = cycle[:i+1] + find_cycle(graph, node) + cycle[i+1:]
                break
    return cycle

In [11]:
with open('test.txt') as inFile:
        graph = parse_input(inFile)
print('->'.join(map(str, find_eulerian_cycle(graph,0))))

0->3->2->6->8->7->9->6->5->4->2->1->0


### Reconstruct a String from its k-mer Composition

In [4]:
def get_prefix (pattern):
    return pattern[:len(pattern) - 1]
def get_suffix (pattern):
    return pattern[1:]

def constructDeBruijnFromKmers (kmers):
    adj_list = {}
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix] = []
        adj_list[suffix] = []
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix].append(suffix)
    return adj_list

def get_start(adj_list):
    deg = {}
    for v in adj_list:       
        if v not in deg.keys():
            deg[v] = -len(adj_list[v])
        else:
            deg[v] -= len(adj_list[v])   
        for u in adj_list[v]:
            if u not in deg.keys():
                deg[u] = 1
            else:
                deg[u] += 1   
    for v in deg:
        if deg[v] == -1:
            source = v    
    return source

def eulerian_path(graph, v,path, nodes):

    if v not in graph.keys():
        path.append(v)
        if len(nodes) == 0:
            return
        else:
            eulerian_path(graph, nodes.pop(),path,nodes ) 
    elif len(graph[v]) == 0:
        path.append(v)
        if len(nodes) == 0:
            return
        else:
            eulerian_path(graph, nodes.pop(),path, nodes )
    else:
        nodes.append(v)
        eulerian_path( graph, graph[v].pop(0),path, nodes)
    return path, nodes


def constructSequence(kmers):
    dB = constructDeBruijnFromKmers(kmers)
    start = get_start(dB)
    path_ = []
    nodes_ = []
    path, nodes = eulerian_path(dB, start, path_,nodes_)
    path.reverse()
    seq = path[0]
    for kmer in path[1:] :
        seq += path[-1]
    print(seq)

In [6]:
f = open('test.txt', 'r') 
k = int(f.readline().strip())
kmers = list(str(l.strip('\n')) for l in f.readlines())
print(kmers)
constructSequence(kmers)

['ATG', 'ATG', 'TGT', 'TGG', 'CAT', 'GGA', 'GAT', 'AGA']
AGGTGTGTGTGTGTGT


### Generate Contigs from a Collection of Reads

In [2]:
def count_degree(graph):
    nodes = set(list(v for v in graph.keys()))
    for v in graph:
        nodes.update(graph[v])
    nodes = list(nodes)

    deg = {}
    for v in nodes:
            
        if v not in deg.keys():
            if v in graph.keys():
                deg[v] = (0, len(graph[v]))
            else:
                deg[v] = (0,0)
        else:
            if v in graph.keys():
                deg[v] = (deg[v][0], deg[v][1]+len(graph[v]))
        
        if v in graph.keys():
            for w in graph[v]:
                if w not in deg.keys():
                    deg[w] = (1,0)
                else:
                    deg[w] = (deg[w][0]+1,deg[w][1])
        
    return deg


def max_non_branching_paths (graph, degree):
    paths = []
    for v in degree:
        if degree[v] != (1,1):
            if degree[v][1] > 0:
                for w in graph[v] : 
                    path = str (v + w[-1])
                    while degree[w] == (1,1):
                        w = graph[w][0]
                        path += w[-1]
                    paths.append(path)
    return paths

def construct_contigs(reads) :
    graph = constructDeBruijnFromKmers(reads)
    degree = count_degree(graph)
    print(degree)
    paths = max_non_branching_paths(graph, degree)
    print(paths)

In [7]:
f = open('test.txt', 'r') 
reads = list(str(l.strip('\n')) for l in f.readlines())
construct_contigs(reads)

{'': (1, 1), 'GG': (1, 1), 'GA': (2, 1), 'TG': (2, 2), 'GT': (1, 0), 'AT': (2, 2), 'AG': (0, 1), 'CA': (0, 1)}
['GAT', 'TGT', 'TGGA', 'ATG', 'ATG', 'AGA', 'CAT']


### Translate an RNA String into an Amino Acid String

In [12]:
def translateRNAtoPeptide(rna):
    peptide = ""
    for i in range(0,len(rna),3):
        peptide += RNA2PEPTIDE[rna[i:i+3]]
    return peptide

translateRNAtoPeptide("AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA")

'MAMAPRTEINSTRING'

### Find Substrings of a Genome Encoding a Given Amino Acid String

In [34]:
def translateDNAtoPeptide(dna):
    # print("substring -> ", dna)
    peptide = ""
    for i in range(0,len(dna),3):
        cur = DNA2PEPTIDE[dna[i:i+3]]
        if cur != "Stop" :
            peptide += cur
    return peptide

def aminoAcid_to_genome(dna, peptides):
    for i in range(len(dna) - 3 * len(peptides)):
        substr = dna[i: i+ (3*len(peptides))]
        # print(substr)
        rev_substr = reverse_complement(substr)
        # print("---", rev_substr)
        acids = translateDNAtoPeptide(substr)
        rev_acids = translateDNAtoPeptide(rev_substr)
        if ((acids == peptides) or (rev_acids == peptides)):
            print(substr)
       

In [36]:
f = open('test.txt', 'r') 
lines = f.readlines()
dna_ = lines[0].strip()
peptide_ = lines[1].strip()
print("dna: ", dna_)
print("peptide : ", peptide_)
aminoAcid_to_genome(dna_, peptide_)

dna:  ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA
peptide :  MA
ATGGCC
GGCCAT
ATGGCC


### Generate the Theoretical Spectrum of a Cyclic Peptide

In [29]:
test_mass = {"a" : 1, "b" :2 , "c" :3,  "d":5}
def generateFragments(peptide):
    length = len(peptide)
    cycle = [ peptide + peptide[:i] for i in range(length) ]
    fragments = [cycle[i][k:k+i+1] for i in range(length-1) for k in range(length)]
    fragments.append(peptide)

    return fragments
def generateSpectrum(peptide):
    fragments = generateFragments(peptide)
    print(fragments)
    spectrum = []
    spectrum.append(0)
    for f in fragments :
        mass = 0
        for i,p in enumerate(f):
            mass += mass_peptide [p]
        spectrum.append(mass)
    
    print(spectrum)

generateSpectrum("LEQN")
        

['L', 'E', 'Q', 'N', 'LE', 'EQ', 'QN', 'NL', 'LEQ', 'EQN', 'QNL', 'NLE', 'LEQN']
[0, 113, 129, 128, 114, 242, 257, 242, 227, 370, 371, 355, 356, 484]
