# Proyecto ABC

In [112]:
import numpy as np
import screed 

In [113]:
# Invertible Hash function
def InvertibleHash(x, p):
    m = (2 ** p) - 1
    x = ((~x) + (x << 21)) & m
    x = x ^ (x >> 24)
    x = (x + (x << 3) + (x << 8)) & m
    x = x ^ (x >> 14)
    x = (x + (x << 2) + (x << 4)) & m
    x = x ^ (x >> 28)
    x = (x + (x << 31)) & m
    return x

# Natural hash
def NaturalHash(kmer, k):
    values =  {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    x = 0
    for i in range(k):
        x += values[kmer[k - 1 - i]]  * (4 ** i)
    return x

# Composition of string and integer hash to avoid errors with Poly-A's
def Phi(kmer, k):
    return InvertibleHash(NaturalHash(kmer, k), 2 * k)

In [195]:
# Compute minimizers
def MinimizerSketch(s, w, k):
    M = list()
    for i in range(len(s) - w - k + 1):
        m = np.Inf
        for j in range(w):
            kmer = s[i + j: i + j + k]
            rckmer = screed.rc(kmer)
            u = Phi(kmer, k)
            v = Phi(rckmer, k)
            if u != v: 
                m = min(m, min(u, v))

        for j in range(w):
            kmer = s[i + j: i + j + k]
            rckmer = screed.rc(kmer)
            u = Phi(kmer, k)
            v = Phi(rckmer, k)
            if u < v and u == m:
                M.append((m, i + j, 0, i))
            elif v < u and v == m:
                M.append((m, i + j, 1, i))
    return M



In [196]:
A = []
for i in range(2 ** 10):
    hash = InvertibleHash(i, 10)
    if hash not in A:
        A.append(hash)
    else:
        print('error', i, hash)

In [198]:
MinimizerSketch('CCAGACACCA', 3, 4)

[(27, 1, 0, 0), (27, 1, 0, 1), (40, 2, 1, 2), (52, 4, 1, 3)]

In [168]:
s = 'CATAACGATCGCCT'
len(s)

14

In [191]:
k = 10
s = 'CCAGACACCACAACCGACAACGACGAGATTGATGACA'
for i in range(len(s) - k + 1):
    kmer = s[i: i + k]
    rckmer = screed.rc(kmer)
    u = Phi(kmer, k)
    v = Phi(rckmer, k)
    print(min(u, v), i)

949498 0
498542 1
386475 2
695625 3
696478 4
701234 5
710703 6
716013 7
440092 8
108301 9
552030 10
93804 11
379815 12
139832 13
296388 14
335191 15
866938 16
217732 17
314939 18
79391 19
789978 20
19392 21
94263 22
379803 23
477188 24
515127 25
374755 26
467118 27


In [170]:
Phi('ATCG', 4)

101

In [171]:
screed.rc('ATCG')

'CGAT'

In [172]:
Phi('CGAT', 4)

44

In [173]:
NaturalHash('A', 1)

0

In [174]:
NaturalHash('C', 1)

1

In [175]:
NaturalHash('T', 1)

3