### (3A) Generate the k-mer Composition of a String

In [1]:
def KmerComposition(k,Text):
    kmers = []
    for i in range( len(Text) - k + 1):
        kmer = Text[i : i +k]
        kmers.append(kmer)

    return kmers

print(KmerComposition(5, "ABCDEFGHIJK"))

['ABCDE', 'BCDEF', 'CDEFG', 'DEFGH', 'EFGHI', 'FGHIJ', 'GHIJK']


### (3B) Reconstruct a String from its Genome Path

In [2]:
def geneReconstruct(kmers):
    text = kmers[0]
    for i in range(1,len(kmers)):
        print(kmers[i][-1:])
        text += kmers[i][-1:]
    return text

geneReconstruct(["ACCGA",
"CCGAA",
"CGAAG",
"GAAGC",
"AAGCT"])

A
G
C
T


'ACCGAAGCT'

### (3C) Construct the Overlap Graph of a Collection of k-mers

In [3]:
def get_prefix (pattern):
    return pattern[:len(pattern) - 1]
def get_suffix (pattern):
    return pattern[1:]

def overlapGraph(patterns):
    adj_matrix = [[0 for i in range(len(patterns))] for j in range(len(patterns))]

    for i in range ( len (patterns)):
        for j in range (len (patterns)):
            if i != j and get_suffix(patterns[i]) == get_prefix(patterns[j]) :
                adj_matrix[i][j] = 1

    for m in range (len(adj_matrix)):
        for n in range (len(adj_matrix)) :
            if adj_matrix[m][n] == 1:
                print (patterns[m], " -> ", patterns[n])



overlapGraph(["ATGCG", "GCATG", "CATGC", "AGGCA", "GGCAT"])

GCATG  ->  CATGC
CATGC  ->  ATGCG
AGGCA  ->  GGCAT
GGCAT  ->  GCATG


### (3D) Construct the de Bruijn Graph of a String

In [5]:
import collections
def constructDeBruijn(k, Text):
    adj_list_pair = {}
    for i in range (len(Text) - k + 1):
        kmer = Text[i : i +k]
        prefix = get_prefix(kmer)
        suffix = get_suffix (kmer)
        if prefix not in adj_list_pair:
            adj_list_pair[prefix] = (suffix,)
        else :
            adj_list_pair[prefix] += (suffix, )
    
    od = collections.OrderedDict(sorted(adj_list_pair.items()))
    for key in od.keys():
        print(str(key) + " -> ")
        if len(adj_list_pair[key]) == 1:
            print(adj_list_pair[key][0] + "\n")
        else:
            adj_list_pair[key] = sorted(adj_list_pair[key])
            print(adj_list_pair[key][0])
            del adj_list_pair[key][0]
            for item in adj_list_pair[key]:
                print("," + str(item))

constructDeBruijn(4,"AAGATTCTCTAC" )

AAG -> 
AGA

AGA -> 
GAT

ATT -> 
TTC

CTA -> 
TAC

CTC -> 
TCT

GAT -> 
ATT

TCT -> 
CTA
,CTC
TTC -> 
TCT



### (3E) Construct the de Bruijn Graph of a Collection of k-mers

In [6]:
def get_prefix (pattern):
    return pattern[:len(pattern) - 1]
def get_suffix (pattern):
    return pattern[1:]

def constructDeBruijnFromKmers (kmers):
    adj_list = {}
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix] = []
        adj_list[suffix] = []
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix].append(suffix)
    return adj_list

constructDeBruijnFromKmers(["GAGG",
"CAGG",
"GGGG",
"GGGA",
"CAGG",
"AGGG",
"GGAG"])

{'GAG': ['AGG'],
 'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GGG': ['GGG', 'GGA'],
 'GGA': ['GAG']}

### (3F) Find an Eulerian Cycle in a Graph

In [7]:
def find_cycle(graph, start):
    cycle = []
    u = graph[start].pop()
    while u != start :
        cycle.append(u)
        u = graph[u].pop()
    cycle.append(u)

    isolated = [k for k, v in graph.items() if not v]
    for k in isolated:
        del graph[k]

    return cycle

def find_eulerian_cycle(graph, start):
    cycle = [start] + find_cycle(graph, start)
    visited = True
    while visited:
        visited = False
        for i, node in enumerate(cycle):
            if node in graph:
                visited = True
                cycle = cycle[:i+1] + find_cycle(graph, node) + cycle[i+1:]
                break
    return cycle
    
with open('test.txt') as inFile:
        graph = parse_input(inFile)
print('->'.join(map(str, find_eulerian_cycle(graph,0))))

### (3G) Find an Eulerian Path in a Graph

In [8]:
def eulerian_path(graph, v,path, nodes):

    if v not in graph.keys():
        path.append(v)
        if len(nodes) == 0:
            return
        else:
            eulerian_path(graph, nodes.pop(),path,nodes ) 
    elif len(graph[v]) == 0:
        path.append(v)
        if len(nodes) == 0:
            return
        else:
            eulerian_path(graph, nodes.pop(),path, nodes )
    else:
        nodes.append(v)
        eulerian_path( graph, graph[v].pop(0),path, nodes)
    return path, nodes

### (3H) Reconstruct a String from its k-mer Composition

In [11]:
def get_start(adj_list):
    deg = {}
    for v in adj_list:       
        if v not in deg.keys():
            deg[v] = -len(adj_list[v])
        else:
            deg[v] -= len(adj_list[v])   
        for u in adj_list[v]:
            if u not in deg.keys():
                deg[u] = 1
            else:
                deg[u] += 1   
    for v in deg:
        if deg[v] == -1:
            source = v    
    return source



def constructSequence(kmers):
    dB = constructDeBruijnFromKmers(kmers)
    start = get_start(dB)
    path_ = []
    nodes_ = []
    path, nodes = eulerian_path(dB, start, path_,nodes_)
    path.reverse()
    seq = path[0]
    for kmer in path[1:] :
        seq += path[-1]
    print(seq)

# f = open('test.txt', 'r') 
# k = int(f.readline().strip())
# kmers = list(str(l.strip('\n')) for l in f.readlines())
# print(kmers)
# constructSequence(kmers)

### (3J) Reconstruct a String from its Paired Composition

### (3K) Generate the Contigs from a Collection of Reads

In [12]:
def count_degree(graph):
    nodes = set(list(v for v in graph.keys()))
    for v in graph:
        nodes.update(graph[v])
    nodes = list(nodes)

    deg = {}
    for v in nodes:
            
        if v not in deg.keys():
            if v in graph.keys():
                deg[v] = (0, len(graph[v]))
            else:
                deg[v] = (0,0)
        else:
            if v in graph.keys():
                deg[v] = (deg[v][0], deg[v][1]+len(graph[v]))
        
        if v in graph.keys():
            for w in graph[v]:
                if w not in deg.keys():
                    deg[w] = (1,0)
                else:
                    deg[w] = (deg[w][0]+1,deg[w][1])
        
    return deg


def max_non_branching_paths (graph, degree):
    paths = []
    for v in degree:
        if degree[v] != (1,1):
            if degree[v][1] > 0:
                for w in graph[v] : 
                    path = str (v + w[-1])
                    while degree[w] == (1,1):
                        w = graph[w][0]
                        path += w[-1]
                    paths.append(path)
    return paths

def construct_contigs(reads) :
    graph = constructDeBruijnFromKmers(reads)
    degree = count_degree(graph)
    print(degree)
    paths = max_non_branching_paths(graph, degree)
    print(paths)

### (3L) Construct a String Spelled by a Gapped Genome Path

### (3M) Generate All Maximal Non-Branching Paths in a Graph

In [None]:
def max_non_branching_paths (graph, degree):
    paths = []
    for v in degree:
        if degree[v] != (1,1):
            if degree[v][1] > 0:
                for w in graph[v] : 
                    path = str (v + w[-1])
                    while degree[w] == (1,1):
                        w = graph[w][0]
                        path += w[-1]
                    paths.append(path)
    return paths