# Proyecto ABC

In [2]:
import time
from collections import defaultdict

import numpy as np
import screed

In [3]:
# Invertible Hash function
def InvertibleHash(x, p):
    """Scramble the integer ``x`` with a chain of invertible steps modulo 2**p.

    Every step is either a multiplication by an odd constant (reduced with
    the ``p``-bit mask) or an xor with a right-shifted copy of the value.

    Parameters
    ----------
    x : int
        Value to hash.
    p : int
        Number of bits kept; the result lies in ``[0, 2**p - 1]``.

    Returns
    -------
    int
        The hashed value.
    """
    mask = (2 ** p) - 1
    # ~x + (x << 21) is x * (2**21 - 1) - 1 for Python integers.
    x = (x * ((1 << 21) - 1) - 1) & mask
    # Each round: xor-shift, then multiply by an odd constant.
    #   265       = 1 + (1 << 3) + (1 << 8)
    #   21        = 1 + (1 << 2) + (1 << 4)
    #   2**31 + 1 = 1 + (1 << 31)
    for shift, factor in ((24, 265), (14, 21), (28, (1 << 31) + 1)):
        x ^= x >> shift
        x = (x * factor) & mask
    return x

# Natural hash
def NaturalHash(kmer, k):
    """Encode the first ``k`` bases of ``kmer`` as a base-4 integer.

    Bases map to digits A=0, C=1, G=2, T=3, and the leftmost base is the
    most significant digit.

    Parameters
    ----------
    kmer : str
        Sequence over the alphabet ``ACGT`` with at least ``k`` characters.
    k : int
        Number of bases to encode.

    Returns
    -------
    int
        Value in ``[0, 4**k - 1]``.

    Raises
    ------
    KeyError
        If a base outside ``ACGT`` is encountered.
    IndexError
        If ``kmer`` has fewer than ``k`` characters.
    """
    digit = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # Shifting left by 2 * pos multiplies by 4 ** pos.
    return sum(digit[kmer[k - 1 - pos]] << (2 * pos) for pos in range(k))

# Composition of string and integer hash to avoid errors with Poly-A's
def Phi(kmer, k):
    """Hash a k-mer: base-4 encoding followed by the invertible integer hash.

    The plain encoding gives an all-``A`` k-mer the value 0; passing it
    through ``InvertibleHash`` on ``2 * k`` bits scrambles that ordering.

    Parameters
    ----------
    kmer : str
        Sequence over ``ACGT`` with at least ``k`` characters.
    k : int
        k-mer length.

    Returns
    -------
    int
        Hash value in ``[0, 4**k - 1]``.
    """
    encoded = NaturalHash(kmer, k)
    n_bits = 2 * k
    return InvertibleHash(encoded, n_bits)

In [4]:
# Compute minimizers

# def MinimizerSketch(s, w, k):
#     M = set()
#     for i in range(len(s) - w - k + 1):
#         m = np.Inf
#         for j in range(w):
#             kmer = s[i + j: i + j + k]
#             rckmer = screed.rc(kmer)
#             u = Phi(kmer, k)
#             v = Phi(rckmer, k)
#             if u != v: 
#                 m = min(m, min(u, v))

#         for j in range(w):
#             kmer = s[i + j: i + j + k]
#             rckmer = screed.rc(kmer)
#             u = Phi(kmer, k)
#             v = Phi(rckmer, k)
#             if u < v and u == m:
#                 M.add((m, i + j, 0))
#             elif v < u and v == m:
#                 M.add((m, i + j, 1))
#     return M

def LocalMinimizers(queue):
    """Return every entry of a window that carries the smallest hash value.

    Parameters
    ----------
    queue : iterable of tuple
        Window entries whose first field is the hash value. An entry whose
        hash is infinite is a placeholder and is never returned.

    Returns
    -------
    list of tuple
        The entries tied for the minimum hash, in sorted order. Empty when
        the window is empty or holds only placeholder entries.
    """
    ranked = sorted(queue)
    if not ranked:
        # An empty window has no minimizer (previously an IndexError).
        return []
    smallest = ranked[0][0]
    # `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
    if not smallest < np.inf:
        return []
    # After sorting, the entries tied for the minimum form the leading run.
    return [entry for entry in ranked if entry[0] == smallest]


def MinimizerSketch(s, w, k):
    """Compute the (w, k)-minimizer sketch of the sequence ``s``.

    Each k-mer is represented by the smaller of the hashes of its forward
    and reverse-complement forms. A k-mer whose two hashes are equal is
    ignored. For every window of ``w`` consecutive k-mers, the entries with
    the smallest hash are reported.

    Parameters
    ----------
    s : str
        Sequence over the alphabet ``ACGT``.
    w : int
        Number of consecutive k-mers per window; must be at least 1.
    k : int
        k-mer length.

    Returns
    -------
    list of tuple
        ``(hash, position, strand)`` triples. ``strand`` is 0 when the
        forward k-mer has the smaller hash and 1 when its reverse
        complement does. The list is empty when ``s`` is shorter than
        ``k``. When ``s`` holds fewer than ``w`` k-mers, the available
        k-mers are treated as a single window.

    Raises
    ------
    ValueError
        If ``w`` is smaller than 1.
    KeyError
        If ``s`` contains a base outside ``ACGT``.
    """
    if w < 1:
        raise ValueError('w must be at least 1')

    def canonical(pos):
        # Classify the k-mer starting at `pos` by its smaller strand hash.
        kmer = s[pos: pos + k]
        u = Phi(kmer, k)
        v = Phi(screed.rc(kmer), k)
        if u < v:
            return (u, pos, 0)
        if u > v:
            return (v, pos, 1)
        # Equal hashes: strand is ambiguous, so store a placeholder that
        # can never be the window minimum. `np.inf` (not the removed
        # `np.Inf` alias) exists in every NumPy release.
        return (np.inf, -1, -1)

    n_kmers = len(s) - k + 1
    if n_kmers <= 0:
        return []

    # First window; shorter than w when the sequence has too few k-mers.
    first = min(w, n_kmers)
    queue = [canonical(pos) for pos in range(first)]
    M = list(LocalMinimizers(queue))

    for pos in range(first, n_kmers):
        # Slide the window one k-mer to the right.
        queue.append(canonical(pos))
        queue.pop(0)
        newest = queue[-1]

        if not M:
            # Every earlier window held only placeholders; rescan so the
            # first usable k-mer is picked up instead of indexing M[-1].
            M.extend(LocalMinimizers(queue))
        elif newest[0] <= M[-1][0]:
            # The incoming k-mer is a new (or tied) window minimum.
            M.append(newest)
        elif pos - M[-1][1] >= w:
            # The last reported minimizer left the window. The loop
            # position is used here because a placeholder entry stores -1
            # as its position.
            M.extend(LocalMinimizers(queue))

    return M

In [5]:
# Index target sequences
def Index(T, w, k):
    """Build a hash table from minimizer hash to its occurrences in ``T``.

    Parameters
    ----------
    T : list of str
        Target sequences.
    w : int
        Window size passed to ``MinimizerSketch``.
    k : int
        k-mer length passed to ``MinimizerSketch``.

    Returns
    -------
    collections.defaultdict
        Maps each minimizer hash to a list of
        ``(target index, position, strand)`` triples in sorted order.
    """
    occurrences = [
        (h, t, pos, strand)
        for t, sequence in enumerate(T)
        for h, pos, strand in MinimizerSketch(sequence, w, k)
    ]
    H = defaultdict(list)
    # Sorting first means keys are inserted in increasing hash order and
    # each bucket is ordered by (target, position, strand).
    for h, t, pos, strand in sorted(occurrences):
        H[h].append((t, pos, strand))
    return H

In [6]:
# Map a query sequence

def Map(H, q, w, k, epsilon, min_chain=4):
    """Map the query ``q`` against an index and print clusters of hits.

    Every query minimizer found in ``H`` produces one hit per indexed
    occurrence. A hit is ``(t, 0, i - i_h, i_h)`` when query and target
    minimizers share the same strand, and ``(t, 1, i + i_h, i_h)``
    otherwise, where ``i`` is the query position and ``i_h`` the target
    position. Hits are sorted and split into runs that share target and
    strand flag and whose consecutive third fields differ by less than
    ``epsilon``. Each run with at least ``min_chain`` hits is printed.

    Parameters
    ----------
    H : mapping
        Index as returned by ``Index``.
    q : str
        Query sequence.
    w : int
        Window size; should match the one used to build ``H``.
    k : int
        k-mer length; should match the one used to build ``H``.
    epsilon : int
        A gap of at least this size between consecutive third fields
        closes the current run.
    min_chain : int, optional
        Minimum number of hits for a run to be printed. Defaults to 4,
        the value that was previously hard-coded.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    A = []
    for h, i, r in MinimizerSketch(q, w, k):
        # `.get` does not insert missing keys when H is a defaultdict.
        for t, i_h, r_h in H.get(h, ()):
            if r == r_h:
                A.append((t, 0, i - i_h, i_h))
            else:
                A.append((t, 1, i + i_h, i_h))

    A.sort()
    b = 0  # start index of the run currently being collected
    last = len(A) - 1
    for e in range(len(A)):
        run_ends = (
            e == last
            or A[e + 1][0] != A[e][0]
            or A[e + 1][1] != A[e][1]
            or A[e + 1][2] - A[e][2] >= epsilon
        )
        if run_ends:
            chain = A[b: e + 1]
            if len(chain) >= min_chain:
                print(chain)
            b = e + 1


In [7]:
w = 4
k = 8
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock and can jump.
t0 = time.perf_counter()
MinimizerSketch('AAATCCTGCTACCACATCGCCAGACACCA', w, k)
# The last expression is the elapsed time in seconds.
time.perf_counter() - t0

0.00022602081298828125

In [8]:
# Toy targets: a reference, a copy with one substituted base, and the
# reverse complement of the reference.
T = [
    'AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
    'AAATCCTGCTACCACATCGCCAGTCACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
]
T.append(screed.rc(T[0]))
w, k = 5, 15

H = Index(T, w, k)
# The query is a substring of the first target.
q = 'CATCGCCAGACACCACAACCGACAACGACGAGATTGAT'
epsilon = 500
Map(H, q, w, k, epsilon)


[(0, 0, -14, 17), (0, 0, -14, 20), (0, 0, -14, 25), (0, 0, -14, 28), (0, 0, -14, 32), (0, 0, -14, 36), (1, 0, -14, 28), (1, 0, -14, 32), (1, 0, -14, 36), (2, 1, 41, 19), (2, 1, 41, 23), (2, 1, 41, 27), (2, 1, 41, 30), (2, 1, 41, 35), (2, 1, 41, 38)]
[(0, 0, -14, 17), (0, 0, -14, 20), (0, 0, -14, 25), (0, 0, -14, 28), (0, 0, -14, 32), (0, 0, -14, 36)]
[(2, 1, 41, 19), (2, 1, 41, 23), (2, 1, 41, 27), (2, 1, 41, 30), (2, 1, 41, 35), (2, 1, 41, 38)]


In [9]:
# Same toy targets as the previous demo, mapped with a denser sketch
# (smaller window and shorter k-mers).
T = [
    'AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
    'AAATCCTGCTACCACATCGCCAGTCACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
]
T.append(screed.rc(T[0]))
w, k = 3, 10

H = Index(T, w, k)
# The query is a substring of the first target.
q = 'CATCGCCAGACACCACAACCGACAACGACGAGATTGAT'
epsilon = 500
Map(H, q, w, k, epsilon)


[(0, 0, -14, 14), (0, 0, -14, 15), (0, 0, -14, 16), (0, 0, -14, 18), (0, 0, -14, 20), (0, 0, -14, 21), (0, 0, -14, 22), (0, 0, -14, 23), (0, 0, -14, 24), (0, 0, -14, 27), (0, 0, -14, 28), (0, 0, -14, 30), (0, 0, -14, 32), (0, 0, -14, 33), (0, 0, -14, 36), (0, 0, -14, 38), (0, 0, -14, 40), (1, 0, -14, 24), (1, 0, -14, 27), (1, 0, -14, 28), (1, 0, -14, 30), (1, 0, -14, 32), (1, 0, -14, 33), (1, 0, -14, 36), (1, 0, -14, 38), (1, 0, -14, 40), (2, 1, 46, 20), (2, 1, 46, 22), (2, 1, 46, 24), (2, 1, 46, 27), (2, 1, 46, 28), (2, 1, 46, 30), (2, 1, 46, 32), (2, 1, 46, 33), (2, 1, 46, 36), (2, 1, 46, 37), (2, 1, 46, 38), (2, 1, 46, 39), (2, 1, 46, 40), (2, 1, 46, 42), (2, 1, 46, 44), (2, 1, 46, 45), (2, 1, 46, 46)]
[(0, 0, -14, 14), (0, 0, -14, 15), (0, 0, -14, 16), (0, 0, -14, 18), (0, 0, -14, 20), (0, 0, -14, 21), (0, 0, -14, 22), (0, 0, -14, 23), (0, 0, -14, 24), (0, 0, -14, 27), (0, 0, -14, 28), (0, 0, -14, 30), (0, 0, -14, 32), (0, 0, -14, 33), (0, 0, -14, 36), (0, 0, -14, 38), (0, 0, -14, 

In [10]:
def GetSequencesFromFile(fileName):
    """Read every sequence stored in a file that screed can parse.

    Parameters
    ----------
    fileName : str
        Path of the sequence file.

    Returns
    -------
    list
        The ``sequence`` attribute of each record, in file order.
    """
    # The context manager closes the underlying file once reading is done.
    with screed.open(fileName) as records:
        return [record.sequence for record in records]

In [11]:
# Load the reference file and index only its first sequence.
mtuberculosis = GetSequencesFromFile('Mtuberculosis.fasta')
test = [mtuberculosis[0]]
w, k = 5, 15

H = Index(test, w, k)


In [12]:
# NOTE(review): this slice of the reference is overwritten by the first read
# in the loop below; presumably kept as a known-positive query. Confirm.
q = mtuberculosis[0][110000: 112000]
i = 0
# `epsilon` is the value assigned in the earlier demo cells.
# The context manager closes the reads file when the loop finishes or is
# interrupted.
with screed.open('SRR8186772_subsample_16k.fasta') as reads:
    for i, record in enumerate(reads, start=1):
        print('Tiempo: {}'.format(i))
        q = record.sequence
        Map(H, q, w, k, epsilon)

Tiempo: 1
[]
Tiempo: 2
[(0, 0, -4267871, 4270404), (0, 0, -4182524, 4182758), (0, 0, -4182524, 4182761), (0, 0, -4137626, 4138535), (0, 0, -4099850, 4102152), (0, 0, -4024013, 4026434), (0, 0, -4002200, 4004723), (0, 0, -3978009, 3980180), (0, 0, -3960219, 3961701), (0, 0, -3893256, 3894369), (0, 0, -3841549, 3843851), (0, 0, -3779227, 3780769), (0, 0, -3767884, 3770190), (0, 0, -3753967, 3756273), (0, 0, -3743596, 3745902), (0, 0, -3656372, 3657909), (0, 0, -3434771, 3435008), (0, 0, -3290538, 3292240), (0, 0, -2699133, 2701309), (0, 0, -2617542, 2619304), (0, 0, -2587774, 2589429), (0, 0, -2422713, 2423748), (0, 0, -2318714, 2321020), (0, 0, -2272124, 2273949), (0, 0, -2047955, 2050466), (0, 0, -1814089, 1815385), (0, 0, -1763613, 1763860), (0, 0, -1748421, 1750110), (0, 0, -1611199, 1613777), (0, 0, -1329281, 1331230), (0, 0, -1246153, 1248528), (0, 0, -1217062, 1217519), (0, 0, -1120796, 1123316), (0, 0, -309705, 309907), (0, 0, -309605, 310821), (0, 0, -55909, 57850), (0, 1, 36799

KeyboardInterrupt: 