In [15]:
def overlap(first, second, min_len = 3):
    """Return the length of the longest suffix of `first` that is a prefix of `second`.

    Candidate positions are located by searching `first` for the leading
    `min_len` characters of `second`; the leftmost candidate whose whole
    suffix matches the start of `second` wins, which makes it the longest.

    Args:
        first: string whose suffix is examined.
        second: string whose prefix is examined.
        min_len: number of leading characters of `second` used as the anchor.

    Returns:
        The overlap length as an int, or 0 when no overlap is found.
    """
    start = 0
    while (start < len(first) - min_len + 1):
        # Jump to the next occurrence of second's leading min_len characters.
        start = first.find(second[:min_len], start)
        if (start == -1):
            return 0
        # Accept only if everything from `start` to the end of `first`
        # is a prefix of `second`.
        if second.startswith(first[start:]):
            return len(first) - start
        start += 1
    # The loop can end without a decision (e.g. `first` shorter than
    # `min_len`, or `second` empty).  Report "no overlap" explicitly
    # instead of falling through and returning None.
    return 0

import itertools

import itertools

def scs(ss):
    """ Returns a shortest common superstring of the given
        strings, which must be the same length.

        Every ordering of `ss` is tried.  Within an ordering each string
        is glued onto the running superstring, skipping the part that
        overlaps the previous string.  The first shortest result that is
        encountered is returned; an empty `ss` gives ''.  The number of
        orderings grows factorially with len(ss). """
    if len(ss) == 0:
        # permutations() of an empty sequence yields one empty tuple, so
        # ssperm[0] below would raise IndexError.  The empty string is
        # the superstring of no strings.
        return ''
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for prev, nxt in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings in the permutation
            olen = overlap(prev, nxt, 1)
            # add the non-overlapping portion of the later string
            sup += nxt[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest


In [19]:
def scs_all(ss):
    """ Returns the set of all distinct shortest superstrings obtained
        by merging the given strings (which must be the same length)
        in every possible order.

        Each ordering is merged left to right using the suffix/prefix
        overlap of adjacent strings.  Only the results of minimal length
        are kept.  An empty `ss` gives {''}. """
    if len(ss) == 0:
        # permutations() of an empty sequence yields one empty tuple, so
        # ssperm[0] below would raise IndexError.
        return {''}
    shortest_len = None
    sets_shortest_sup = set()
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], 1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_len is None or len(sup) < shortest_len:
            # strictly shorter: discard everything collected so far
            shortest_len = len(sup)
            sets_shortest_sup = {sup}
        elif len(sup) == shortest_len:
            # ties with the current best are all kept
            sets_shortest_sup.add(sup)
    return sets_shortest_sup

In [56]:
# Question 1: a shortest common superstring of the example strings, and its length.
strs = ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
# scs() searches every permutation of its input, so run it once and reuse the result.
strs_scs = scs(strs)
print(strs_scs)
print(len(strs_scs))

CCTTGGATTGC
11


In [57]:
# Question 2: every distinct shortest superstring found for `strs`.
strs_scs_all = scs_all(strs)
print(strs_scs_all)

{'CCTTGGATTGC', 'GATTGCCTTGG', 'TGGATTGCCTT', 'TGCCTTGGATT'}


In [52]:
def buildSetsByKmer(reads, k = 30):
    """Index the reads by the length-k substrings they contain.

    Args:
        reads: sequence of strings.
        k: substring length used as the index key.

    Returns:
        A dict mapping each k-mer to the set of positions in `reads`
        of the reads that contain it.  Reads shorter than k add nothing.
    """
    index = {}
    for read_idx, read in enumerate(reads):
        for offset in range(len(read) - k + 1):
            # Create the bucket on first sight, then record this read in it.
            index.setdefault(read[offset:offset + k], set()).add(read_idx)
    return index

def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps.

        A k-mer index of the reads is built first, so each read is
        compared only against reads that contain its length-k suffix.
        The result is a tuple (reada, readb, best_olen), where reada's
        suffix overlaps readb's prefix; both reads are None when no
        overlap is found. """
    kmerLoc = buildSetsByKmer(reads, k)
    reada, readb = None, None
    best_olen = 0
    for i, read in enumerate(reads):
        # A read shorter than k has no k-mer in the index, so its suffix
        # is not a key; .get() treats that as "no candidates" instead of
        # raising KeyError.
        for j in kmerLoc.get(read[-k:], ()):
            if i == j:
                # never overlap a read with itself
                continue
            olen = overlap(read, reads[j], min_len=k)
            if olen > best_olen:
                reada, readb = read, reads[j]
                best_olen = olen
    return reada, readb, best_olen

def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain.

        On each round the pair of reads with the largest overlap is
        replaced by its merge.  When no pair overlaps any more, the
        remaining strings are concatenated and returned.  The caller's
        `reads` list is left unmodified. """
    # Merge on a private copy: the remove/append calls below would
    # otherwise rewrite the list the caller passed in.
    reads = list(reads)
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        # keep read_a whole and append only the part of read_b beyond the overlap
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [53]:
def readFastq(filename):
    """Read a file in four-line records and collect lines 2 and 4 of each.

    Args:
        filename: path of the file to open for reading.

    Returns:
        A tuple (sequences, qualities) of two lists holding, per record,
        the right-stripped second line and the right-stripped fourth line.
        Reading stops at the first record whose second line is empty.
    """
    sequences, qualities = [], []
    with open(filename) as handle:
        while True:
            # One record = four consecutive lines; lines 1 and 3 are ignored.
            record = [handle.readline() for _ in range(4)]
            bases = record[1].rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(record[3].rstrip())
    return sequences, qualities

In [54]:
# Questions 3 and 4: greedily assemble the reads (minimum overlap 30)
# and report the length of the assembly.
fastq_path = 'Wk4-Data/ads1_week4_reads.fq'
reads, _ = readFastq(fastq_path)
assem_genome = greedy_scs(reads, 30)
print(len(assem_genome))

15894


In [55]:
# Questions 3 and 4: tally how often each character occurs in the assembly.
import collections
count = collections.Counter(assem_genome)
print(count)

Counter({'A': 4633, 'C': 3789, 'G': 3749, 'T': 3723})
