In [80]:
from readGenome import *
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
def editDistance(x, y):
    """Return the fewest edits needed to match pattern x somewhere inside text y.

    This is the approximate-matching variant of edit distance: the first row
    of the dynamic-programming matrix is all zeros, so an alignment of x may
    begin at any offset of y at no cost, and the answer is the minimum over
    the last row, so the alignment may also end at any offset of y.
    It is NOT the global edit distance between x and y.

    Only two rows of the matrix are kept at a time, so memory use is
    proportional to len(y) rather than len(x) * len(y).

    Parameters:
        x: the pattern (any indexable sequence with a length, e.g. a str).
        y: the text to search in (same kind of sequence).

    Returns:
        int: the smallest number of substitutions, insertions and deletions
        over all substrings of y aligned against x.  0 when x is empty,
        len(x) when y is empty.
    """
    # Row 0: the empty prefix of x matches at every offset of y with 0 edits.
    prevRow = [0] * (len(y) + 1)
    for i in range(1, len(x) + 1):
        # Column 0: aligning x[:i] against an empty stretch of y costs i edits.
        curRow = [i] + [0] * len(y)
        for j in range(1, len(y) + 1):
            distHor = curRow[j-1] + 1
            distVer = prevRow[j] + 1
            # Diagonal move is free on a character match, 1 on a mismatch.
            distDiag = prevRow[j-1] + (0 if x[i-1] == y[j-1] else 1)
            curRow[j] = min(distHor, distVer, distDiag)
        prevRow = curRow
    # The best match may end anywhere in y, so take the minimum of the last row.
    return min(prevRow)

In [81]:
# Sanity check on a small example: fewest edits for the best approximate
# match of pattern P anywhere inside text T.
P = 'GCGTATGC'
T = 'TATTGGCTATACGGTT'
editDistance(P,T)

2

In [82]:
# Question 1: fewest edits for the best approximate match of P within the
# sequence loaded from the FASTA excerpt.
# readGenome is presumably supplied by the `from readGenome import *` cell — TODO confirm.
P = 'GCTGATCGATCGTACG'
T = readGenome('chr1.GRCh38.excerpt.fasta')
editDistance(P,T)

3

In [83]:
# Same query as the Question 1 cell, with a different pattern P.
# NOTE(review): T is re-read from the same FASTA file used in the previous cell;
# the reload looks redundant if that cell has already run — confirm before removing.
P = 'GATTTACCAGATTGAG'
T = readGenome('chr1.GRCh38.excerpt.fasta')
editDistance(P,T)

2

In [95]:
def overlap(a, b, min_length=3):
    """ Length of the longest suffix of 'a' that is also a prefix
        of 'b'.  Candidate positions are found by searching 'a' for
        the first 'min_length' characters of 'b', so shorter overlaps
        are never reported.  Returns 0 when there is no such overlap. """
    seed = b[:min_length]  # the part of b every qualifying overlap must contain
    pos = a.find(seed)     # leftmost candidate => longest possible suffix first
    while pos != -1:
        # seed occurs at pos; accept only if all of a[pos:] is a prefix of b
        if b.startswith(a[pos:]):
            return len(a) - pos
        pos = a.find(seed, pos + 1)  # try the next occurrence to the right
    return 0
        
def makeKmer(reads, k):
    """Index reads by every length-k substring they contain.

    Parameters:
        reads: iterable of read strings.
        k: k-mer length.

    Returns:
        dict mapping each k-mer (str of length k) to the set of reads in
        which it occurs.  Reads shorter than k contribute nothing.
    """
    allKmer = {}
    for read in reads:
        for start in range(len(read)):
            kmer = read[start:start + k]
            if len(kmer) != k:
                # The window has run off the end of the read; every later
                # window would be shorter still.
                break
            # Create the bucket on first sight of this k-mer, then record the read.
            allKmer.setdefault(kmer, set()).add(read)
    return allKmer

            
def naive_overlap_map(reads, k):
    """Compute suffix/prefix overlaps for every ordered pair of reads.

    Parameters:
        reads: sequence of read strings.
        k: minimum overlap length, passed to overlap() as min_length.

    Returns:
        dict mapping (a, b) to the overlap length reported by overlap(a, b),
        containing only pairs whose overlap is greater than 0.
    """
    # Imported here so the function does not depend on `permutations` leaking
    # in through a star import; no cell in this notebook imports it explicitly.
    from itertools import permutations

    olaps = {}
    # permutations(reads, 2) yields every ordered pair of distinct positions,
    # so both (a, b) and (b, a) are tested.
    for a, b in permutations(reads, 2):
        olen = overlap(a, b, min_length=k)
        if olen > 0:
            olaps[(a, b)] = olen
    return olaps

def findOverlaps(reads, k):
    """Find ordered read pairs whose suffix/prefix overlap is at least k long.

    A k-mer index (makeKmer) is built once; for each read, only the reads
    that contain its last k characters are checked with overlap(), since any
    read overlapping it by k or more must contain that suffix.

    Parameters:
        reads: sequence of read strings.
        k: minimum overlap length (also the k-mer size of the index).

    Returns:
        (pairs, count): pairs is a list of (read, other) tuples for which
        overlap(read, other, min_length=k) > 0, and count is len(pairs).
    """
    kmerIndex = makeKmer(reads, k)
    pairs = []
    for read in reads:
        # Reads sharing this read's length-k suffix are the only candidates.
        candidates = kmerIndex.get(read[-k:])
        if candidates is None:
            continue
        for other in list(candidates):
            # A read is never paired with itself (compared by value).
            if other == read:
                continue
            if overlap(read, other, min_length=k) > 0:
                pairs.append((read, other))
    return pairs, len(pairs)

In [34]:
# Scratch check: a dict whose values are sets can be grown in place by
# looking up a key and calling .add() on the stored set.
a = 'TAA'
lookup = {'TAA': set(),
 'AAA': set(),
 'AAC': set(),
 'ACA': set(),
 'CAA': set(),
 'AAG': set()}
lookup[a].add('sdkfls')
lookup[a].add('sdfds')


set

In [91]:
# Test case 1: three reads that overlap by exactly 3 characters in a cycle,
# so a minimum overlap of 3 should report pairs and a minimum of 4 should not.
# NOTE(review): findOverlaps returns an (overlaps, count) tuple, but the output
# recorded for this cell shows bare lists — it was probably produced by an
# earlier version of the function; re-run to refresh.
reads = ['ABCDEFG', 'EFGHIJ', 'HIJABC']
findOverlaps(reads,3)
findOverlaps(reads, 4)

[('ABCDEFG', 'EFGHIJ'), ('EFGHIJ', 'HIJABC'), ('HIJABC', 'ABCDEFG')]

[]

In [92]:
# Test case 2: six short reads checked at minimum overlap lengths 4 and 5.
# NOTE(review): findOverlaps returns an (overlaps, count) tuple, but the output
# recorded for this cell shows bare lists — it was probably produced by an
# earlier version of the function; re-run to refresh.
reads = ['CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT']
findOverlaps(reads, 4)
findOverlaps(reads, 5)

[('CGTACG', 'TACGTA'),
 ('CGTACG', 'GTACGT'),
 ('CGTACG', 'GTACGA'),
 ('CGTACG', 'TACGAT'),
 ('TACGTA', 'CGTACG'),
 ('TACGTA', 'ACGTAC'),
 ('GTACGT', 'TACGTA'),
 ('GTACGT', 'ACGTAC'),
 ('ACGTAC', 'GTACGT'),
 ('ACGTAC', 'CGTACG'),
 ('ACGTAC', 'GTACGA'),
 ('GTACGA', 'TACGAT')]

[('CGTACG', 'GTACGT'),
 ('CGTACG', 'GTACGA'),
 ('TACGTA', 'ACGTAC'),
 ('GTACGT', 'TACGTA'),
 ('ACGTAC', 'CGTACG'),
 ('GTACGA', 'TACGAT')]

In [96]:
# Load the reads from the FASTQ file and find all read pairs that overlap by
# at least 30 characters.
# readFastq is presumably supplied by the `from readGenome import *` cell and
# appears to return (sequences, qualities) — TODO confirm.
# `count` equals len(overlaps), since findOverlaps increments it once per appended pair.
reads,qualities = readFastq('ERR266411_1.for_asm.fastq')
overlaps, count = findOverlaps(reads, 30)

In [102]:
# Edges: one per (read, match) pair found by findOverlaps
len(overlaps)
# Nodes with at least one outgoing edge: distinct reads appearing as the
# first element of a pair
len({edge[0] for edge in overlaps})

904746

7161