# Benchmarking Hamming distance calculation

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
import scipy.stats
import seaborn as sns
import sklearn.neighbors
import matplotlib.pyplot as plt
%matplotlib inline

import Levenshtein

import sys
sys.path.append('..')

from lib import *

In [2]:
# k-mer length: the benchmark below works on 9-mers.
k = 9
# Count the k-mers of `human` with the project helper. Neither name is defined in this
# notebook (presumably both come from `from lib import *`); the meaning of `clean=True`
# is not visible here — TODO confirm against lib.
counter9 = count_kmers_proteome(human, k, clean=True)
# Keep the distinct items obtained by iterating `counter9` (presumably the k-mer strings).
human9 = set(counter9)

In [3]:
# random.sample() requires a sequence: passing a set was deprecated in Python 3.9 and
# raises TypeError from 3.11 on. Sorting yields a sequence whose order does not depend on
# set iteration order, and the dedicated, seeded generator makes the draw repeatable
# without touching the global `random` state (sorting all 9-mers costs some time and memory).
humansample = random.Random(0).sample(sorted(human9), 100000)
# Encode each sampled 9-mer with `map_aatonumber` so the sample can be indexed by a tree.
points = np.asarray([map_aatonumber(h) for h in humansample])


In [4]:
def mindist(x, sample):
    """Return the smallest Hamming distance between `x` and any string in `sample`.

    Brute force: every element of `sample` is compared against `x` with
    `Levenshtein.hamming`, so the cost grows linearly with the size of `sample`.
    """
    distances = (Levenshtein.hamming(candidate, x) for candidate in sample)
    return min(distances)

In [5]:
mindist('AAACCCAAA', humansample)

3

In [6]:
%timeit -t mindist('AAACCCAAA', humansample)

39.8 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Index the numerically encoded sample in a ball tree. scikit-learn's 'hamming'
# metric is the fraction of positions that differ, not the raw mismatch count.
bt = sklearn.neighbors.BallTree(points, metric='hamming')

In [8]:
def mindist_sklearn(x, tree):
    """Return the Hamming distance (mismatch count) from `x` to its nearest neighbour in `tree`.

    Parameters
    ----------
    x : str
        Query sequence; it is encoded with `map_aatonumber` before the lookup.
    tree : object with a ``query`` method
        E.g. a ``sklearn.neighbors.BallTree`` built with ``metric='hamming'``
        over sequences encoded the same way.

    Returns
    -------
    int
        Number of mismatching positions to the closest indexed sequence.
    """
    # query() expects a 2-D (n_queries, n_features) array, hence the reshape.
    query = map_aatonumber(x).reshape(1, -1)
    d, _ = tree.query(query)
    # The 'hamming' metric is a fraction of mismatching positions. Extract the single
    # scalar explicitly (calling int() on an array with ndim > 0 is deprecated in NumPy)
    # and round instead of truncating: fraction * length can land just below the integer
    # (e.g. 1/49 * 49 == 0.9999999999999999), which int() alone would report one too low.
    fraction = float(np.ravel(d)[0])
    return int(round(fraction * len(x)))

In [9]:
mindist_sklearn('AAACCCAAA', bt)

3

In [10]:
%timeit -t mindist_sklearn('AAACCCAAA', bt)

6.2 ms ± 224 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## On all 9-mers

In [11]:
mindist('AAACCCAAA', human9)

2

In [12]:
%timeit -t mindist('AAACCCAAA', human9)

4.42 s ± 182 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Number of chunks the full 9-mer array is split into (one ball tree is built per chunk).
nchunks = 100

In [14]:
# Encode every distinct 9-mer with `map_aatonumber` and collect the results in one array
# (presumably one row of per-residue integer codes per 9-mer — TODO confirm against lib).
human9_number = np.asarray([map_aatonumber(h) for h in human9])

In [15]:
# np.array_split (unlike np.split) allows chunks of unequal size, so the number of
# rows does not have to be divisible by nchunks.
pointss = np.array_split(human9_number, nchunks)

In [16]:
# One ball tree per chunk. The loop variable is called `chunk` so that it does not
# read like the global `points` array built from the 100,000-element sample.
bts = [
    sklearn.neighbors.BallTree(chunk, metric='hamming')
    for chunk in pointss
]

In [17]:
def mindist_sklearn_chunked(x, trees):
    """Return the Hamming distance (mismatch count) from `x` to its nearest neighbour across `trees`.

    Parameters
    ----------
    x : str
        Query sequence; it is encoded with `map_aatonumber` before the lookups.
    trees : iterable of objects with a ``query`` method
        E.g. ``sklearn.neighbors.BallTree`` instances built with
        ``metric='hamming'``, each covering one chunk of the reference set.

    Returns
    -------
    int
        Smallest number of mismatching positions found in any tree.

    Raises
    ------
    ValueError
        If `trees` is empty (raised by ``min``).
    """
    # Encode the query once instead of once per tree; query() expects a 2-D array.
    query = map_aatonumber(x).reshape(1, -1)
    # Reduce each (1, 1) distance array to a plain float before comparing, so min()
    # works on scalars rather than relying on the truth value of size-1 arrays.
    fraction = min(float(np.ravel(tree.query(query)[0])[0]) for tree in trees)
    # 'hamming' distances are fractions of mismatching positions; round instead of
    # truncating because fraction * length can fall just below the integer
    # (e.g. 1/49 * 49 == 0.9999999999999999).
    return int(round(fraction * len(x)))

In [18]:
mindist_sklearn_chunked('AAACCCAAA', bts)

2

In [19]:
%timeit -t mindist_sklearn_chunked('AAACCCAAA', bts)

711 ms ± 44.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# `BallTreeDist` is not defined in this notebook (presumably provided by `from lib import *`).
# NOTE(review): given its arguments it looks like a packaged version of the chunked
# ball-tree search built by hand in the cells above — confirm against lib.
btdist = BallTreeDist(human9, nchunks=nchunks)

In [6]:
btdist.mindist('AAACCCAAA')

2

In [7]:
%timeit -t btdist.mindist('AAACCCAAA')

695 ms ± 45.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
