In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0.

        a, b       -- strings to compare
        min_length -- smallest overlap length that counts

        An overlap can never be longer than 'b', so when 'b' is
        shorter than 'min_length' the result is always 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b, letting an
        # overlap shorter than min_length pass the check below
        return 0
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

In [85]:
# Suffix "TGC" of the first string is also a prefix of the second.
overlap(a="ATTGC", b="TGCCCC")

3

In [2]:
import itertools

def scs(ss):
    """ Returns shortest common superstring of given strings,
        assuming no string is a strict substring of another.

        ss -- sized sequence of strings; an empty sequence
              yields the empty string.

        Every ordering of the strings is tried, so the amount of
        work grows factorially with len(ss). """
    if len(ss) == 0:
        # permutations() would yield a single empty tuple and
        # ssperm[0] would raise IndexError; nothing to cover means ''
        return ''
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for left, right in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings in the permutation
            olen = overlap(left, right, min_length=1)
            # add non-overlapping portion of the right-hand string
            sup += right[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

In [86]:
# Exhaustive shortest common superstring of three short strings.
scs(ss=['ACGGATGAGC', 'GAGCGGA', 'GAGCGAG'])

'ACGGATGAGCGAGCGGA'

In [3]:
def pick_maximal_overlap(reads, k):
    """ Find the ordered pair of reads whose suffix/prefix
        overlap (of length >= k) is the longest.

        Returns a tuple (read_a, read_b, overlap_length).  When no
        pair overlaps by at least k, the tuple is (None, None, 0).
        On ties the first pair encountered wins. """
    winner = (None, None, 0)
    for left, right in itertools.permutations(reads, 2):
        shared = overlap(left, right, min_length=k)
        # strict '>' keeps the earliest pair when lengths tie
        if shared > winner[2]:
            winner = (left, right, shared)
    return winner

In [5]:
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain.

        reads -- iterable of strings; it is copied first, so the
                 caller's collection is left untouched
        k     -- minimum overlap length for two reads to be merged

        Returns the concatenation of the reads that are left once
        no pair overlaps by at least k characters. """
    reads = list(reads)  # work on a copy: merging removes/appends entries
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        # replace the best-overlapping pair with its merged string
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [15]:
# Run the greedy merge over nine reads with a minimum overlap of 15
# and keep the resulting string in `s`.
s=greedy_scs(['AAAAAAAAAAAAAAATATTTTTATTAAATAATTAAAATAAGAAAAAATAAAAATATAATTATTAATATTTATATTTATTTTTTTTATAAAAATAATATTT',
 'AAAAAAAAAAAAAAATTAAAAATTTTAATTCTTGTAAATTTAATATATATAATCAAAAAAAAACTTTTAAATTTTTAGATTTACATAATTTTTATAAATT',
 'AAAAAAAAAAAAAAATTATTTTTTAAATTATAAAAATATTTTTTTAAAATATTTTTACTTTAAAAAATTATATATAAATATTATATATAAATTTAAAGGA',
 'AAAAAAAAAAAAACAATCTTTTAATAGAAGAATAAAATAAGAGTTAACATCTAAAAAATCAATGAATCTTTTATGTGTTGTTATTTGAAAATGATCTCTA',
 'AAAAAAAAAAAAATAATTATATTCTTTATTTATTTCAATAAAGATATATTTATTATTGTAAATATTTTCTATATTTATAAAAAATAAATCTTTTTTTTTA',
 'AAAAAAAAAAAAATTAATAAAAAAAAATATTTTTTTTAAAAACTTTATATTTAACAATTTATATAAAAAAAATTTTAAATTTATAAATTTTAATTTTTTT',
 'AAAAAAAAAAAACAATCTTTTAATAGAAGAATAAAATAAGAGTTAACATCTAAAAAATCAATGAATCTTTTATGTGTTGTTATTTGAAAATGATCTCTAG',
 'AAAAAAAAAAAATATAAAATATAAAAAATGTTAAAAAAATATTTATCAATAAATTTTTTGTTAATTATAAATTTCGAATATAATTTTAAATCTTTATTAT',
 'AAAAAAAAAAAATATTTTTATTCAATTTAAAAATATTTTTTTTTATAATAATTTCTTTTATAATAATATTTATATTTTTAATAAATTAAAAAAAGCAAAT'],15)

In [83]:
import time
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time() follows the wall clock and can
# jump if the system time is adjusted.
start = time.perf_counter()
greedy_scs(['ACGGATGAGC', 'GAGCGGA', 'GAGCGAG'],2)
end = time.perf_counter()
print(end - start)  # elapsed seconds for one greedy_scs call


9.560585021972656e-05


In [67]:
import numpy as np
# Collect the 4-bit binary strings for 0..8 in `l`, echoing each one.
# NOTE(review): `s` is not initialised in this cell, so the digits are
# appended to whatever an earlier cell left in `s` -- confirm intent.
l = []
for i in range(9):
    bits = np.binary_repr(i, width=4)
    print(bits)
    s = s + bits
    l.append(bits)
print(l)


0000
0001
0010
0011
0100
0101
0110
0111
1000
['0000', '0001', '0010', '0011', '0100', '0101', '0110', '0111', '1000']


In [65]:
# Exhaustive shortest common superstring of the strings collected in `l`.
k = scs(ss=l)
len(k)  # NOTE(review): value is discarded (not the cell's last expression)
print(k)
# Candidate strings noted alongside this cell (presumably from earlier
# runs -- confirm); the last one matches the printed result:
#000010011010111
#0000110010111101
#001000011010111

001000011010111


In [66]:
def de_bruijn_ize(st, k):
    """ Break 'st' into its k-mers and describe each one by its
        left and right (k-1)-mers.

        Returns a pair (nodes, edges): 'edges' is a list with one
        (left k-1-mer, right k-1-mer) tuple per k-mer, in order of
        position in 'st'; 'nodes' is the set of all k-1-mers that
        appear in any edge. """
    # one edge per k-mer start position
    edges = [(st[pos:pos + k - 1], st[pos + 1:pos + k])
             for pos in range(len(st) - k + 1)]
    # every endpoint of every edge is a node
    nodes = {mer for edge in edges for mer in edge}
    return nodes, edges

In [88]:
# Build the graph pieces for a short example string using 3-mers.
nodes, edges = de_bruijn_ize(st="ACGCGTCG", k=3)
print(nodes, edges)

{'AC', 'CG', 'GT', 'GC', 'TC'} [('AC', 'CG'), ('CG', 'GC'), ('GC', 'CG'), ('CG', 'GT'), ('GT', 'TC'), ('TC', 'CG')]
