### Helper Utils

In [4]:
from collections import defaultdict
def parse_input(file):
    """Parse an adjacency list written as ``u -> v1,v2,...`` lines.

    Args:
        file: An iterable of text lines, typically an open text file.
            Blank lines are ignored.

    Returns:
        A ``defaultdict(list)`` mapping each integer source node to the
        list of its integer successors. Successors of a node that appears
        on several lines are concatenated in input order.

    Raises:
        ValueError: If a non-blank line does not have exactly one
            `` -> `` separator or contains a non-integer node.
    """
    graph = defaultdict(list)

    for line in file:
        line = line.strip()
        if not line:
            # Tolerate empty lines (e.g. a trailing newline at end of file)
            # instead of failing on the " -> " unpack below.
            continue
        source, targets = line.split(" -> ")
        node = int(source)
        successors = [int(target) for target in targets.split(",")]
        graph[node].extend(successors)

    return graph

### Find an Eulerian Cycle in a Graph

In [2]:
def find_cycle(graph, start):
    """Walk unused edges from ``start`` until the walk returns to ``start``.

    Each traversed edge is removed from ``graph`` (the last listed successor
    is taken first). Once the walk closes, every node whose successor list
    is empty is deleted from ``graph``.

    Args:
        graph: Mapping of node -> list of successors; mutated in place.
        start: Node at which the closed walk begins and ends.

    Returns:
        The nodes visited after ``start``, in order, ending with ``start``.
    """
    walk = []
    current = start
    while True:
        current = graph[current].pop()
        walk.append(current)
        if current == start:
            break

    # Drop nodes with no remaining outgoing edges so that membership in
    # ``graph`` means "still has unused edges".
    for exhausted in [node for node, edges in graph.items() if not edges]:
        del graph[exhausted]

    return walk

def find_eulerian_cycle(graph, start):
    """Build a cycle from ``start`` by repeatedly splicing in sub-cycles.

    An initial closed walk is found with ``find_cycle``. While some node on
    the current tour is still a key of ``graph`` (i.e. still has unused
    edges), a new closed walk from the first such node is inserted right
    after that node's position.

    Args:
        graph: Mapping of node -> list of successors; consumed in place by
            ``find_cycle``.
        start: Node at which the tour starts.

    Returns:
        The list of nodes of the tour, beginning with ``start``.
    """
    tour = [start]
    tour.extend(find_cycle(graph, start))

    while True:
        # Position of the first tour node that still has unused edges.
        splice_at = next(
            (pos for pos, vertex in enumerate(tour) if vertex in graph),
            None,
        )
        if splice_at is None:
            return tour
        detour = find_cycle(graph, tour[splice_at])
        tour = tour[:splice_at + 1] + detour + tour[splice_at + 1:]

In [11]:
# Load the adjacency list and print one Eulerian cycle starting at node 0.
with open('test.txt') as inFile:
    graph = parse_input(inFile)
print('->'.join(str(node) for node in find_eulerian_cycle(graph, 0)))

0->3->2->6->8->7->9->6->5->4->2->1->0


### Reconstruct a String from its k-mer Composition

In [25]:
def get_prefix(pattern):
    """Return ``pattern`` without its last element."""
    return pattern[:-1]
def get_suffix(pattern):
    """Return ``pattern`` without its first element."""
    return pattern[1:]

def constructDeBruijnFromKmers(kmers):
    """Build a de Bruijn graph adjacency list from a collection of k-mers.

    Every k-mer contributes one edge from its prefix (all but the last
    character) to its suffix (all but the first character). Both endpoints
    are always present as keys, so a node with no outgoing edge maps to an
    empty list.

    Args:
        kmers: Iterable of equal-length strings. It is traversed only once,
            so one-shot iterators (generators, file objects) work too.

    Returns:
        dict mapping each node to the list of its successors, with one
        entry per k-mer (repeated k-mers yield repeated edges).
    """
    adj_list = {}
    for kmer in kmers:
        prefix = get_prefix(kmer)
        suffix = get_suffix(kmer)
        # Register the prefix before the suffix so key order matches the
        # order in which nodes are first encountered.
        adj_list.setdefault(prefix, []).append(suffix)
        adj_list.setdefault(suffix, [])
    return adj_list

def get_start(adj_list):
    """Pick the node at which an Eulerian path through the graph starts.

    The start of an Eulerian path is the node with one more outgoing than
    incoming edge. If no such node exists (for example every node is
    balanced, so the graph has an Eulerian cycle), any node with an
    outgoing edge is a valid start, and the first one in ``adj_list`` is
    returned.

    Args:
        adj_list: Mapping of node -> list of successors. Successors need
            not appear as keys.

    Returns:
        The chosen start node.

    Raises:
        ValueError: If ``adj_list`` is empty.
    """
    # balance[node] = in-degree - out-degree
    balance = {}
    for node, successors in adj_list.items():
        balance[node] = balance.get(node, 0) - len(successors)
        for successor in successors:
            balance[successor] = balance.get(successor, 0) + 1

    sources = [node for node, value in balance.items() if value == -1]
    if sources:
        # With several candidates, keep the last one, as the original
        # scan-and-overwrite loop did.
        return sources[-1]

    # No node has a surplus outgoing edge: fall back to any node that can
    # be left, instead of failing on an unbound local.
    for node, successors in adj_list.items():
        if successors:
            return node
    if adj_list:
        return next(iter(adj_list))
    raise ValueError("cannot choose a start node in an empty graph")

def eulerian_path(graph, v, path, nodes):
    """Trace an Eulerian path from ``v``, consuming the edges of ``graph``.

    The traversal follows unused edges (first listed successor first),
    pushing each node it leaves onto ``nodes``. When the current node has
    no unused edge left it is appended to ``path`` and the traversal backs
    up to the most recently pushed node. Nodes are therefore recorded in
    the order they are finished, so ``path`` holds the route in reverse
    and the caller must reverse it to get the forward order.

    This is implemented as a loop rather than one recursive call per step,
    so long paths do not hit Python's recursion limit, and the same
    ``(path, nodes)`` pair is returned in every case, including a start
    node that has no outgoing edges.

    Args:
        graph: Mapping of node -> list of successors; edges are removed
            in place as they are used.
        v: Node to start from.
        path: List that receives the finished nodes; mutated in place.
        nodes: List used as the stack of nodes still to back up to;
            mutated in place and empty when the function returns.

    Returns:
        The tuple ``(path, nodes)``, i.e. the same two list objects that
        were passed in.
    """
    while True:
        # ``get`` avoids inserting a key when ``graph`` is a defaultdict.
        edges = graph.get(v)
        if edges:
            nodes.append(v)
            v = edges.pop(0)
            continue

        # Dead end: v is finished.
        path.append(v)
        if not nodes:
            break
        v = nodes.pop()

    return path, nodes


def constructSequence(kmers):
    """Spell the string described by a path of overlapping k-mers.

    The result is the first k-mer followed by the last character of every
    subsequent k-mer.

    Args:
        kmers: Sequence of non-empty strings in path order.

    Returns:
        The reconstructed string, or ``""`` when ``kmers`` is empty.
    """
    if not kmers:
        # An empty path spells the empty string rather than raising
        # IndexError on kmers[0].
        return ""
    return kmers[0] + "".join(kmer[-1] for kmer in kmers[1:])

In [28]:
# Read the input, build the de Bruijn graph, walk an Eulerian path through it
# and spell the resulting string. The file is opened in a ``with`` block so
# the handle is closed even if parsing fails.
with open('test.txt', 'r') as f:
    # First line is an integer (presumably the k-mer length); it is parsed
    # but not used by the steps below.
    k = int(f.readline().strip())
    kmers = [line.strip('\n') for line in f]
print(kmers)
dB = constructDeBruijnFromKmers(kmers)
print(dB)
start = get_start(dB)
print(start)
path_ = []
nodes_ = []
path, nodes = eulerian_path(dB, start, path_, nodes_)
# eulerian_path records nodes in finishing order, i.e. back to front.
path.reverse()
print(path)
seq = constructSequence(path)
print(seq)

['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']
{'CTT': ['TTA'], 'TTA': ['TAC'], 'ACC': ['CCA'], 'CCA': [], 'TAC': ['ACC'], 'GGC': ['GCT'], 'GCT': ['CTT']}
GGC
['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']
GGCTTACCA


### Generate Contigs from a Collection of Reads

In [None]:
def count_degree(graph):
    """Compute the in-degree and out-degree of every node of ``graph``.

    Args:
        graph: Mapping of node -> list of successors. Successors need not
            appear as keys.

    Returns:
        dict mapping every node (keys and successors alike) to the tuple
        ``(in_degree, out_degree)``.
    """
    # The table must be created here; without it the first lookup raised
    # NameError.
    deg = {}

    for v, successors in graph.items():
        in_deg, out_deg = deg.get(v, (0, 0))
        deg[v] = (in_deg, out_deg + len(successors))

        # Each edge v -> w adds one to w's in-degree.
        for w in successors:
            in_deg, out_deg = deg.get(w, (0, 0))
            deg[w] = (in_deg + 1, out_deg)

    return deg


def max_non_branching_paths (graph, degree):
    paths = []
    for v in degree:
        if degree[v]