# Proyecto ABC

In [2]:
import numpy as np
import screed 
from collections import defaultdict

In [3]:
# Invertible Hash function
def InvertibleHash(x, p):
    m = (2 ** p) - 1
    x = ((~x) + (x << 21)) & m
    x = x ^ (x >> 24)
    x = (x + (x << 3) + (x << 8)) & m
    x = x ^ (x >> 14)
    x = (x + (x << 2) + (x << 4)) & m
    x = x ^ (x >> 28)
    x = (x + (x << 31)) & m
    return x

# Natural hash
def NaturalHash(kmer, k):
    values =  {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    x = 0
    for i in range(k):
        x += values[kmer[k - 1 - i]]  * (4 ** i)
    return x

# Composition of string and integer hash to avoid errors with Poly-A's
def Phi(kmer, k):
    return InvertibleHash(NaturalHash(kmer, k), 2 * k)

In [4]:
def individual_hash(cadena):
    values =  {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    array = np.zeros(len(cadena) - 1)
    for i in range(len(cadena) - 1):
        array[i] = values[cadena[i]] * 4 + values[cadena[i + 1]]
    return array



def NaturalHash(kmer, k):
    values =  {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    x = 0
    for i in range(k):
        x += values[kmer[k - 1 - i]]  * (4 ** i)
    return x

def hamming_distance(a, b):
    dist = 0
    for i in range(len(a)):
        if a[i] != b[i]:
            dist += 1
    return dist

def squared_distance(a, b):
    return sum((b - a)**2)

def local_representative_sketch(s, w, k, d): 
    t = len(s)
    representatives = []
    for i in range(0, t - w, w // d):
        windowlist= []
        for j in range(w - k + 1):
            kmer = s[i + j: i + j + k]
            windowlist.append(individual_hash(kmer))
            windowlist.append(individual_hash(screed.rc(kmer)))

        mean = np.mean(windowlist, axis=0)

        local_representative = windowlist[0]
        max_distance = squared_distance(local_representative, mean)
        max_kmer_pos = 0
        for j in range(1, len(windowlist)):
            testkmerarray = windowlist[j]
            test = squared_distance(testkmerarray, mean)

            if test > max_distance:
                local_representative = testkmerarray
                max_distance = test
                max_kmer_pos = j
        
        if max_kmer_pos < w - k + 1:
            max_kmer = s[i + max_kmer_pos: i + max_kmer_pos + k]
            representatives.append((Phi(max_kmer, k), i + max_kmer_pos, 0))
        else:
            max_kmer = screed.rc(s[i + max_kmer_pos - w + k - 1: i + max_kmer_pos - w + k - 1 + k])
            representatives.append((Phi(max_kmer, k), i + max_kmer_pos - w + k - 1, 1))
    return representatives
        

In [5]:
# Compute minimizers

# def MinimizerSketch(s, w, k):
#     M = set()
#     for i in range(len(s) - w - k + 1):
#         m = np.Inf
#         for j in range(w):
#             kmer = s[i + j: i + j + k]
#             rckmer = screed.rc(kmer)
#             u = Phi(kmer, k)
#             v = Phi(rckmer, k)
#             if u != v: 
#                 m = min(m, min(u, v))

#         for j in range(w):
#             kmer = s[i + j: i + j + k]
#             rckmer = screed.rc(kmer)
#             u = Phi(kmer, k)
#             v = Phi(rckmer, k)
#             if u < v and u == m:
#                 M.add((m, i + j, 0))
#             elif v < u and v == m:
#                 M.add((m, i + j, 1))
#     return M

def LocalMinimizers(queue):
    i = 1
    ranked = sorted(queue)
    t = ranked[0][0]
    if t < np.Inf:
        for M in ranked[1:]:
            if M[0] == t: 
                i += 1
            else:
                break 
        return ranked[:i]
    return []


def MinimizerSketch(s, w, k): 
    queue = [] 
    M = []
    for i in range(w):
        kmer = s[i: i + k]
        rckmer = screed.rc(kmer)
        u = Phi(kmer, k)
        v = Phi(rckmer, k)
        if u < v:
            queue.append((u, i, 0))
        if u == v: 
            queue.append((np.Inf, -1, -1))
        if u > v:
            queue.append((v, i, 1))
    M.extend(LocalMinimizers(queue))


    for i in range(w, len(s) - k + 1):
        kmer = s[i: i + k]
        rckmer = screed.rc(kmer)
        u = Phi(kmer, k)
        v = Phi(rckmer, k)
        if u < v:
            queue.append((u, i, 0))
        if u == v: 
            queue.append((np.Inf, -1, -1))
        if u > v:
            queue.append((v, i, 1))
        
        queue.pop(0)

        lastm = M[-1][0]
        lasti = M[-1][1]
        m = queue[-1][0]
        i = queue[-1][1]
        if m <= lastm:
            M.append(queue[-1])
        elif i - lasti >= w:
            M.extend(LocalMinimizers(queue))
    
    return M

In [6]:
# Index target sequences
def Index(T, w, k, d):
    A = []
    for t in range(len(T)):
        M = local_representative_sketch(T[t], w, k, d)
        for minimizer in M:
            h, i, r = minimizer
            seqminimizer = (h, t, i, r)
            A.append(seqminimizer)
    A.sort()
    H = defaultdict(list)
    for a in A:
        H[a[0]] = []
    for a in A:
        H[a[0]].append((a[1], a[2], a[3])) 
    return H

In [7]:
# Map a query sequence

def Map(H, q, w, k, d, epsilon):
    A = []
    M = local_representative_sketch(q, w, k, d)
    for minimizer in M:
        h, i, r = minimizer
        if h in H.keys():
            for hminimizer in H[h]:
                t, i_h, r_h = hminimizer
                if r == r_h:
                    A.append((t, 0, i - i_h, i_h)) 
                else:
                    A.append((t, 1, i + i_h, i_h))
    
    A.sort()
    b = 0
    maxchain = []
    for e in range(len(A)):
        if e == len(A) - 1 or A[e + 1][0] != A[e][0] or (
            A[e + 1][1] != A[e][1] or A[e + 1][2] - A[e][2] >= epsilon):
            chain = A[b: (e + 1)]
            if len(chain) >= 4 and len(chain) > len(maxchain):
                    maxchain = chain
            b = e + 1
    return maxchain


In [8]:
w = 12
k = 8
d = 3
import time
t0 = time.time()
local_representative_sketch('AAATCCTGCTACCACATCGCCAGACACCA', w, k, 3)
time.time() - t0

0.0007548332214355469

In [9]:
T = ['AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG', 
    'AAATCCTGCTACCACATCGCCAGTCACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
    screed.rc('AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG')]
w = 30
k = 15
d = 5

H = Index(T, w, k, d)
q = 'AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGA'
epsilon = 500
Map(H, q, w, k, d, epsilon)


[]

In [10]:
local_representative_sketch(q, w, k, d)

[(178319391, 0, 0), (773567123, 18, 1), (437538454, 22, 1)]

In [11]:
local_representative_sketch('AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG', w, k, d)

[(178319391, 0, 0),
 (773567123, 18, 1),
 (437538454, 22, 1),
 (582068975, 31, 0),
 (189083587, 25, 0),
 (940920792, 37, 1),
 (266032859, 36, 1)]

In [12]:
T = ['AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG', 
    'AAATCCTGCTACCACATCGCCAGTCACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG',
    screed.rc('AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG')]
w = 3
k = 10

H = Index(T, w, k)
q = 'CATCGCCAGACACCACAACCGACAACGACGAGATTGAT'
epsilon = 500
Map(H, q, w, k, epsilon)


TypeError: Index() missing 1 required positional argument: 'd'

In [13]:
def GetSequencesFromFile(fileName):
    sequences = []
    for record in screed.open(fileName):
        sequences.append(record.sequence)
    return sequences

In [21]:
mtuberculosis = GetSequencesFromFile('MtuberculosisTest.fasta')
test = [mtuberculosis[0]]
w = 30
k = 15
d = 10


H = Index(test, w, k, d)


In [22]:
# Para formatear el resultado en un archivo SAM dummy
i = 0
with open('test.sam', 'w') as f:
    for record in screed.open('SRR8186772_subsample_16k.fasta'):
        i += 1
        print('tiempo: ', i)
        Seq = record.sequence
        maxchain = Map(H, Seq, w, k, d, epsilon)
        Seq_ID = record.name
        if not maxchain:
            flag = 4
        elif maxchain[0][1] == 0:
            print(maxchain)
            flag = 0
        else:
            print(maxchain)
            flag = 16
        if maxchain:
            pos = maxchain[0][3] + 1
        else: 
            pos = 0
        # Seq_ID = "El identificador (encabezado) de la secuencia, los que en el archivo con las secuencias empeiza con >. También podemos poner cualquier cosa por ahora"
        # flag = "0 si mapea, 4 si no mapea, 16 si mapea pero en el reverso complemento (este es opcional, puede ser 0)."
        # Seq = "La secuencia como tal. También podríamos poner cualqueir secuencia tipo AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA."
        f.write(f'{Seq_ID}\t{flag}\t*\t{pos}\t0\t*\t*\t0\t0\t{Seq}\t*\n')
    

tiempo:  1
tiempo:  2
tiempo:  3
tiempo:  4
tiempo:  5
tiempo:  6
tiempo:  7
tiempo:  8
tiempo:  9
tiempo:  10
tiempo:  11
tiempo:  12
tiempo:  13
[(0, 1, 62830, 62382), (0, 1, 62830, 62523), (0, 1, 62830, 62790), (0, 1, 62833, 60966), (0, 1, 62833, 60966), (0, 1, 62833, 60972), (0, 1, 62833, 62719), (0, 1, 62833, 62719), (0, 1, 62834, 61355), (0, 1, 62835, 61305), (0, 1, 62835, 61308), (0, 1, 62835, 61308), (0, 1, 62835, 61309), (0, 1, 62835, 61402), (0, 1, 62835, 61404), (0, 1, 62835, 61409), (0, 1, 62835, 61410), (0, 1, 62835, 61412), (0, 1, 62835, 61416), (0, 1, 62836, 61450), (0, 1, 62836, 61450), (0, 1, 62837, 60662), (0, 1, 62837, 60830), (0, 1, 62837, 60830), (0, 1, 62838, 60864), (0, 1, 62838, 60867), (0, 1, 62838, 60870), (0, 1, 62838, 60871), (0, 1, 62838, 61683), (0, 1, 62844, 60551), (0, 1, 62844, 60551), (0, 1, 62844, 60552), (0, 1, 62849, 62016), (0, 1, 62850, 60175), (0, 1, 62850, 60176), (0, 1, 62850, 60179), (0, 1, 62850, 60179), (0, 1, 62850, 60185), (0, 1, 62850, 60

In [18]:
local_representative_sketch('AAATCCTGCTACCACATAGCCAGACACCACAACCGACAACGACGATATTGATGACAGCGCTGCGGCACGG', 20, 15, 3)

[(178319391, 0, 0),
 (869724757, 8, 1),
 (1025245313, 16, 1),
 (108159648, 20, 0),
 (189083587, 25, 0),
 (442223201, 31, 0),
 (1004952505, 41, 1),
 (223034630, 46, 1),
 (1060069703, 52, 1)]

In [None]:
screed.rc('CCGTGCCGCAGC')

'GCTGCGGCACGG'

In [None]:
b = individual_hash('GACTACGACGAGATG')

In [None]:
hamming_distance(a, b)

2

In [None]:
len('AAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGG')

70