In [1]:
def editDistance(x, y):
    """Return the fewest edits needed to match x against some substring of y.

    This is the approximate-matching form of the edit-distance dynamic
    programme. Row 0 of the table is all zeros, so an alignment of x may
    start at any offset of y for free, and the result is the minimum over
    the last row, so the alignment may also end at any offset of y.

    Args:
        x: the pattern (an indexable sequence, typically a string).
        y: the text in which the pattern is searched.

    Returns:
        The smallest number of substitutions, insertions and deletions
        over all alignments of x to a substring of y. An empty x gives 0;
        an empty y gives len(x).
    """
    rows, cols = len(x) + 1, len(y) + 1
    # table[i][j] is the cheapest alignment of x[:i] ending at position j of y.
    # Column 0 holds i (x[:i] matched against nothing); row 0 holds zeros.
    table = [[i] + [0] * (cols - 1) for i in range(rows)]
    for i, x_char in enumerate(x, start=1):
        above, current = table[i - 1], table[i]
        for j, y_char in enumerate(y, start=1):
            mismatch = 0 if x_char == y_char else 1
            current[j] = min(
                current[j - 1] + 1,        # horizontal step: skip a character of y
                above[j] + 1,              # vertical step: skip a character of x
                above[j - 1] + mismatch,   # diagonal step: match or substitute
            )
    # Best score over every possible end position in y.
    return min(table[-1])

In [2]:
def readGenome(filename):
    """Read a FASTA-style file and return its sequence lines as one string.

    Lines that begin with '>' are treated as headers and skipped. Every
    other line has its trailing whitespace removed and is appended to the
    result in file order.

    Args:
        filename: path of the file to read.

    Returns:
        The concatenated sequence as a single str ('' for an empty file).
    """
    pieces = []
    with open(filename, 'r') as fasta:
        for record_line in fasta:
            # Header lines carry genome information, not sequence.
            if record_line.startswith('>'):
                continue
            pieces.append(record_line.rstrip())
    return ''.join(pieces)

In [3]:
# Load the sequence from the FASTA excerpt as one string (header lines dropped).
genome = readGenome('chr1.GRCh38.excerpt.fasta')

In [4]:
# Fewest edits needed to match this pattern against any substring of the genome.
editDistance('GCTGATCGATCGTACG', genome)

3

In [5]:
# Same approximate-match query for a second pattern.
editDistance('GATTTACCAGATTGAG', genome)

2

In [7]:
# Small example with a short text, independent of the genome file.
editDistance('GCGTATGC', 'TATTGGCTATACGGTT')

2

In [8]:
def readFastq(filename):
    """Read a FASTQ file and return its base sequences.

    The file is consumed four lines at a time: name, bases, placeholder and
    base qualities. Only the base line of each record is kept. Reading stops
    at the first record whose base line is empty, which is how end-of-file
    shows up when reading with readline().

    Args:
        filename: path of the FASTQ file to read.

    Returns:
        A list of sequence strings in file order, with trailing whitespace
        removed ([] for an empty file).
    """
    sequences = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            # The quality line must still be consumed to stay aligned on
            # 4-line records, but it is never returned, so it is not stored.
            fh.readline()
            if not seq:
                break
            sequences.append(seq)
    return sequences

In [9]:
# Load the read sequences from the FASTQ file (readFastq returns bases only).
reads = readFastq('ERR266411_1.for_asm.fastq')
# Toy input for quick manual checks; uncomment to bypass the file:
# reads = ['CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT']

In [10]:
def overlap(s, p, min_len=3):
    """Return the length of the longest suffix of s that is a prefix of p.

    Candidate positions are the occurrences in s of the first min_len
    characters of p, tried from left to right, so the first hit is also the
    longest overlap.

    Args:
        s: string whose suffix is examined.
        p: string whose prefix is examined.
        min_len: number of leading characters of p used as the search seed.

    Returns:
        The overlap length, or 0 when no candidate position yields a
        suffix of s that p starts with.
    """
    seed = p[:min_len]
    pos = s.find(seed)
    while pos != -1:
        # The seed matched at pos; accept only if the whole tail of s
        # from there on is a prefix of p.
        if p.startswith(s[pos:]):
            return len(s) - pos
        pos = s.find(seed, pos + 1)
    return 0

In [24]:
from collections import defaultdict
def map_kmers(reads, k):
    """Index reads by every length-k substring they contain.

    Args:
        reads: iterable of read strings.
        k: substring length.

    Returns:
        A defaultdict(set) mapping each k-mer to the set of reads in which
        it occurs. Reads shorter than k contribute nothing.
    """
    index = defaultdict(set)
    for read in reads:
        last_start = len(read) - k
        for start in range(last_start + 1):
            index[read[start:start + k]].add(read)
    return index

In [45]:
from itertools import permutations
def overlaps(reads, k=3):
    """Find all ordered pairs of distinct reads that overlap by at least k.

    A k-mer index (built by map_kmers) limits the comparisons: a read is only
    checked against reads that contain its last k characters, since any
    suffix/prefix overlap of length >= k must include that k-mer.

    Args:
        reads: collection of read strings. It is iterated twice (once to
            build the index, once to query it), so it must be re-iterable.
        k: minimum overlap length; also the k-mer size of the index.

    Returns:
        A tuple (olaps, used_reads):
        olaps maps (a, b) to the length of the longest suffix of a that is
        a prefix of b, for every pair where that length is >= k;
        used_reads is the set of reads appearing in at least one such pair.
    """
    kmer_map = map_kmers(reads, k)
    olaps = {}
    used_reads = set()
    for read in reads:
        # .get() keeps the lookup read-only; indexing the defaultdict would
        # insert an empty set for every suffix that is not in the index.
        for b in kmer_map.get(read[-k:], ()):
            if b != read:
                # Seed the search with k characters rather than overlap()'s
                # default of 3, so the threshold used to search and the
                # threshold used to accept are the same value.
                olap = overlap(read, b, min_len=k)
                if olap >= k:
                    used_reads.add(read)
                    used_reads.add(b)
                    olaps[(read, b)] = olap
    return olaps, used_reads

In [46]:
%%time
# Compute all read pairs overlapping by at least 30 bases, and the reads involved.
laps, used_reads = overlaps(reads, 30)

CPU times: user 2.37 s, sys: 44.1 ms, total: 2.41 s
Wall time: 2.41 s


In [47]:
# Count the overlap pairs by iterating over the dict, then print len() of the
# same dict; the two figures always agree, so the loop is only a cross-check.
count = 0
for pair, v in laps.items():
    count += 1
print(count)
print(len(laps))
# Number of distinct reads that take part in at least one overlap pair.
print(len(used_reads))


904746
904746
9750


In [41]:
# NOTE(review): this displays the entire overlap dict. This cell's execution
# count is lower than that of the cells above, so the saved output may come
# from an earlier run - confirm after Restart & Run All, and consider showing
# only a small sample.
laps

{('TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTC',
  'AAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTCT'): 99,
 ('TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTC',
  'AACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTCTG'): 98,
 ('AGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATC',
  'CGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAA'): 48,
 ('AGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATC',
  'AAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGCTAACACTTTCGGATATTTCTGATG'): 57,
 ('AGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATC',
  'TGCGCGCTTCGATAAAAATGATT