# Array classes: memory and performance profiling

In [1]:
import numpy as np
np.random.seed(42)
import sys
import cProfile
import humanize
sys.path.insert(0, '../..')
%reload_ext memory_profiler
%reload_ext autoreload
%autoreload 1
%aimport allel.model
%aimport allel.bcolz

In [2]:
def binarysize(n):
    """Render the byte count ``n`` as a human-readable string in binary units (e.g. MiB)."""
    pretty = humanize.naturalsize(n, binary=True)
    return pretty

In [3]:
# setup an array of genotype calls
# (n_variants x n_samples x ploidy, initialised to all zeros)
shape = n_variants, n_samples, ploidy = 50000, 1000, 2
data = np.zeros(shape, dtype='i1')
# simulate some mutations
n_alleles = n_variants * n_samples * ploidy
# random flat positions within a single ploidy layer; randint samples with
# replacement, so duplicate positions are possible
# NOTE(review): randint's upper bound is exclusive, so the "-1" leaves the final
# flat position unreachable - harmless here, but probably unintended
idx = np.random.randint(0, (n_alleles//2)-1, size=n_alleles//20)
# NOTE: the two assignments below only reach `data` if reshape() hands back a
# view of the strided slice rather than a copy
# set the second allele to 1 at every sampled position ...
data[:, :, 1].reshape((-1))[idx] = 1
# ... and also the first allele for the first tenth of those positions
data[:, :, 0].reshape((-1))[idx[:n_alleles//200]] = 1

## Total memory size

### Contiguous array (numpy)

In [4]:
# wrap the numpy buffer (copy=False requests no copy) and report its uncompressed size
g = allel.model.GenotypeArray(data, copy=False)
print(binarysize(g.nbytes))

95.4 MiB


### Compressed array (bcolz)

In [5]:
# build a compressed (bcolz) array from the same data; print its compressed size and chunk length
gc = allel.bcolz.GenotypeCArray(data)
print(binarysize(gc.cbytes), gc.chunklen)

16.0 MiB 262


### Compressed persistent array (bcolz)

In [6]:
# scratch directory that will hold the persistent (on-disk) array
# NOTE(review): nothing in the visible cells removes this directory again -
# delete it manually (e.g. shutil.rmtree(rootdir)) once the notebook is done
import tempfile
rootdir = tempfile.mkdtemp()
rootdir

'/tmp/tmpwkit6ul7'

In [7]:
# store the genotype data as a compressed array persisted under rootdir
gcp = allel.bcolz.GenotypeCArray(data, rootdir=rootdir, mode='w')
# flush the underlying carray so buffered data is written out before measuring
gcp.carr.flush()
# report the persistent array's own size and chunk length
# (this previously printed the figures of `gc`, the in-memory array)
print(binarysize(gcp.cbytes), gcp.chunklen)

16.0 MiB 262


### Sparse array (scipy)

In [8]:
# convert the genotypes to a sparse matrix in CSR format and show its summary repr
m = g.to_sparse(format='csr')
m

<50000x2000 sparse matrix of type '<class 'numpy.int8'>'
	with 5255839 stored elements in Compressed Sparse Row format>

In [9]:
# sparse footprint: value dtype, then the sizes of the data, indices and indptr buffers
print(m.data.dtype, binarysize(m.data.nbytes), binarysize(m.indices.nbytes), binarysize(m.indptr.nbytes))

int8 5.0 MiB 20.0 MiB 195.3 KiB


## Genotype counting

In [10]:
# sanity check: no calls were set missing above, so expect n_variants * n_samples
g.count_called()

50000000

In [11]:
# hom-ref, het and hom-alt counts (this and the next two cells) should add up to count_called()
g.count_hom_ref()

45241646

In [12]:
# number of heterozygous calls
g.count_het()

4260869

In [13]:
# number of homozygous non-reference calls
g.count_hom_alt()

497485

In [14]:
# time, then memory-profile, count_called on each backend:
# g (numpy), gc (bcolz, in memory), gcp (bcolz, persisted to disk)
%timeit n = g.count_called()
%timeit n = gc.count_called()
%timeit n = gcp.count_called()
%memit n = g.count_called()
%memit n = gc.count_called()
%memit n = gcp.count_called()

10 loops, best of 3: 148 ms per loop
1 loops, best of 3: 236 ms per loop
1 loops, best of 3: 264 ms per loop
peak memory: 296.37 MiB, increment: 47.35 MiB
peak memory: 249.15 MiB, increment: 0.32 MiB
peak memory: 249.68 MiB, increment: 0.53 MiB


In [15]:
# same three-backend comparison (g, gc, gcp) for count_het
%timeit n = g.count_het()
%timeit n = gc.count_het()
%timeit n = gcp.count_het()
%memit n = g.count_het()
%memit n = gc.count_het()
%memit n = gcp.count_het()

10 loops, best of 3: 164 ms per loop
1 loops, best of 3: 370 ms per loop
1 loops, best of 3: 449 ms per loop
peak memory: 293.17 MiB, increment: 47.57 MiB
peak memory: 245.82 MiB, increment: 0.31 MiB
peak memory: 245.82 MiB, increment: 0.00 MiB


In [16]:
# same three-backend comparison (g, gc, gcp) for counting one specific call, (0, 1)
%timeit n = g.count_call((0, 1))
%timeit n = gc.count_call((0, 1))
%timeit n = gcp.count_call((0, 1))
%memit n = g.count_call((0, 1))
%memit n = gc.count_call((0, 1))
%memit n = gcp.count_call((0, 1))

10 loops, best of 3: 146 ms per loop
1 loops, best of 3: 378 ms per loop
1 loops, best of 3: 409 ms per loop
peak memory: 293.10 MiB, increment: 47.79 MiB
peak memory: 245.93 MiB, increment: 0.52 MiB
peak memory: 245.93 MiB, increment: 0.00 MiB


## Genotype transformations

In [17]:
# three-backend comparison (g, gc, gcp) for the to_n_alt transformation
%timeit gn = g.to_n_alt()
%timeit gn = gc.to_n_alt()
%timeit gn = gcp.to_n_alt()
%memit gn = g.to_n_alt()
%memit gn = gc.to_n_alt()
%memit gn = gcp.to_n_alt()

1 loops, best of 3: 1.06 s per loop
1 loops, best of 3: 1.29 s per loop
1 loops, best of 3: 1.35 s per loop
peak memory: 401.68 MiB, increment: 142.90 MiB
peak memory: 307.04 MiB, increment: 0.57 MiB
peak memory: 272.26 MiB, increment: 12.90 MiB


In [18]:
# inspect the result on the compressed backend (type, shape and compression stats)
gc.to_n_alt()

carray((50000, 1000), int8)
  nbytes: 47.68 MB; cbytes: 13.04 MB; ratio: 3.66
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

In [19]:
# three-backend comparison (g, gc, gcp) for the to_packed transformation
%timeit x = g.to_packed()
%timeit x = gc.to_packed()
%timeit x = gcp.to_packed()
%memit x = g.to_packed()
%memit x = gc.to_packed()
%memit x = gcp.to_packed()

1 loops, best of 3: 499 ms per loop
1 loops, best of 3: 1.11 s per loop
1 loops, best of 3: 1.17 s per loop
peak memory: 333.56 MiB, increment: 47.69 MiB
peak memory: 334.00 MiB, increment: 0.44 MiB
peak memory: 299.02 MiB, increment: 12.71 MiB


In [20]:
# inspect the packed result on the compressed backend
gc.to_packed()

carray((50000, 1000), uint8)
  nbytes: 47.68 MB; cbytes: 13.21 MB; ratio: 3.61
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

In [21]:
# three-backend comparison (g, gc, gcp) for the to_allele_counts transformation
%timeit x = g.to_allele_counts()
%timeit x = gc.to_allele_counts()
%timeit x = gcp.to_allele_counts()
%memit x = g.to_allele_counts()
%memit x = gc.to_allele_counts()
%memit x = gcp.to_allele_counts()

1 loops, best of 3: 2.29 s per loop
1 loops, best of 3: 2.59 s per loop
1 loops, best of 3: 2.67 s per loop
peak memory: 506.71 MiB, increment: 190.73 MiB
peak memory: 411.50 MiB, increment: 0.15 MiB
peak memory: 319.04 MiB, increment: 2.91 MiB


In [15]:
# one [1, 0] row per variant; presumably row[k] is the new index for allele k,
# i.e. alleles 0 and 1 are swapped - TODO confirm against map_alleles docs
mapping = np.array([[1, 0]] * g.shape[0])
mapping

array([[1, 0],
       [1, 0],
       [1, 0],
       ..., 
       [1, 0],
       [1, 0],
       [1, 0]])

In [16]:
# apply the mapping with default arguments and display the remapped genotypes
g.map_alleles(mapping)

GenotypeArray((50000, 1000, 2), dtype=int8)
[[[1 1]
  [1 1]
  [1 1]
  ..., 
  [1 1]
  [1 1]
  [1 1]]

 [[1 1]
  [1 0]
  [1 1]
  ..., 
  [1 1]
  [1 1]
  [1 1]]

 [[1 1]
  [1 1]
  [1 1]
  ..., 
  [1 1]
  [1 1]
  [1 1]]

 ..., 
 [[1 1]
  [1 0]
  [1 1]
  ..., 
  [1 1]
  [1 1]
  [1 1]]

 [[1 1]
  [1 1]
  [1 0]
  ..., 
  [1 1]
  [1 1]
  [1 1]]

 [[1 1]
  [1 1]
  [1 1]
  ..., 
  [1 1]
  [1 1]
  [1 1]]]

In [17]:
# benchmark on a copy: copy=False presumably lets map_alleles modify its input
# in place (TODO confirm), so `g` itself must not be passed here
g2 = g.copy()
%timeit x = g2.map_alleles(mapping, copy=False)
%timeit x = gc.map_alleles(mapping)
%timeit x = gcp.map_alleles(mapping)
%memit x = g2.map_alleles(mapping, copy=False)
%memit x = gc.map_alleles(mapping)
%memit x = gcp.map_alleles(mapping)

10 loops, best of 3: 158 ms per loop
1 loops, best of 3: 400 ms per loop
1 loops, best of 3: 389 ms per loop
peak memory: 457.71 MiB, increment: 0.00 MiB
peak memory: 458.05 MiB, increment: 0.34 MiB
peak memory: 473.80 MiB, increment: 15.73 MiB


## Allele counting

In [10]:
# haplotype view of the numpy-backed genotypes (2-D: variants x haplotypes)
h = g.to_haplotypes()
h

HaplotypeArray((50000, 2000), dtype=int8)
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

In [11]:
# haplotype view of the compressed genotypes
hc = gc.to_haplotypes()
hc

HaplotypeCArray((50000, 2000), int8)
  nbytes: 95.37 MB; cbytes: 15.97 MB; ratio: 5.97
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

In [12]:
# time and memory-profile max() on the genotype arrays (numpy vs. compressed only)
%timeit x = g.max()
%timeit x = gc.max()
%memit x = g.max()
%memit x = gc.max()

10 loops, best of 3: 96.4 ms per loop
1 loops, best of 3: 201 ms per loop
peak memory: 264.77 MiB, increment: 0.17 MiB
peak memory: 265.39 MiB, increment: 0.61 MiB


In [13]:
# same max() comparison on the haplotype arrays
%timeit x = h.max()
%timeit x = hc.max()
%memit x = h.max()
%memit x = hc.max()

10 loops, best of 3: 96.7 ms per loop
1 loops, best of 3: 198 ms per loop
peak memory: 265.52 MiB, increment: 0.00 MiB
peak memory: 265.52 MiB, increment: 0.00 MiB


In [14]:
# allele counting on genotype arrays, numpy vs. compressed; max_allele is passed
# explicitly (results further down have 3 columns per variant for max_allele=2)
%timeit x = g.count_alleles(max_allele=2)
%timeit x = gc.count_alleles(max_allele=2)
%memit x = g.count_alleles(max_allele=2)
%memit x = gc.count_alleles(max_allele=2)

10 loops, best of 3: 184 ms per loop
1 loops, best of 3: 272 ms per loop
peak memory: 266.48 MiB, increment: 0.00 MiB
peak memory: 267.04 MiB, increment: 0.55 MiB


In [15]:
# allele counting on haplotype arrays, numpy vs. compressed
%timeit x = h.count_alleles(max_allele=2)
%timeit x = hc.count_alleles(max_allele=2)
%memit x = h.count_alleles(max_allele=2)
%memit x = hc.count_alleles(max_allele=2)

10 loops, best of 3: 186 ms per loop
1 loops, best of 3: 285 ms per loop
peak memory: 267.04 MiB, increment: 0.00 MiB
peak memory: 267.38 MiB, increment: 0.34 MiB


In [16]:
# draw a random half of the sample indices, without repeats, as a sub-population
gsub = np.random.choice(g.shape[1], size=g.shape[1] // 2, replace=False)
len(gsub)

500

In [17]:
# check that passing subpop gives the same counts as taking the sample subset first
np.array_equal(g.count_alleles(max_allele=2, subpop=gsub), 
               g.take(gsub, axis=1).count_alleles(max_allele=2))

True

In [18]:
# compare the two equivalent routes (subpop argument vs. take-then-count)
# on the numpy (g) and compressed (gc) genotype arrays
%timeit x = g.count_alleles(max_allele=2, subpop=gsub)
%timeit x = g.take(gsub, axis=1).count_alleles(max_allele=2)
%timeit x = gc.count_alleles(max_allele=2, subpop=gsub)
%timeit x = gc.take(gsub, axis=1).count_alleles(max_allele=2)
%memit x = g.count_alleles(max_allele=2, subpop=gsub)
%memit x = g.take(gsub, axis=1).count_alleles(max_allele=2)
%memit x = gc.count_alleles(max_allele=2, subpop=gsub)
%memit x = gc.take(gsub, axis=1).count_alleles(max_allele=2)

10 loops, best of 3: 105 ms per loop
10 loops, best of 3: 164 ms per loop
1 loops, best of 3: 210 ms per loop
1 loops, best of 3: 334 ms per loop
peak memory: 275.29 MiB, increment: 0.00 MiB
peak memory: 322.98 MiB, increment: 47.69 MiB
peak memory: 275.79 MiB, increment: 0.49 MiB
peak memory: 275.79 MiB, increment: 0.00 MiB


In [19]:
# expand every selected sample index into `ploidy` consecutive haplotype column
# indices (presumably matching the column layout produced by to_haplotypes)
hsub = [ploidy * sample + offset for sample in gsub for offset in range(ploidy)]
len(hsub)

1000

In [20]:
# same consistency check as for genotypes, now on the haplotype array
np.array_equal(h.count_alleles(max_allele=2, subpop=hsub), 
               h.take(hsub, axis=1).count_alleles(max_allele=2))

True

In [21]:
# compare the two equivalent routes (subpop argument vs. take-then-count)
# on the numpy (h) and compressed (hc) haplotype arrays
%timeit x = h.count_alleles(max_allele=2, subpop=hsub)
%timeit x = h.take(hsub, axis=1).count_alleles(max_allele=2)
%timeit x = hc.count_alleles(max_allele=2, subpop=hsub)
%timeit x = hc.take(hsub, axis=1).count_alleles(max_allele=2)
%memit x = h.count_alleles(max_allele=2, subpop=hsub)
%memit x = h.take(hsub, axis=1).count_alleles(max_allele=2)
%memit x = hc.count_alleles(max_allele=2, subpop=hsub)
%memit x = hc.take(hsub, axis=1).count_alleles(max_allele=2)

10 loops, best of 3: 104 ms per loop
10 loops, best of 3: 178 ms per loop
1 loops, best of 3: 217 ms per loop
1 loops, best of 3: 362 ms per loop
peak memory: 275.81 MiB, increment: 0.00 MiB
peak memory: 323.50 MiB, increment: 47.69 MiB
peak memory: 275.81 MiB, increment: 0.00 MiB
peak memory: 275.81 MiB, increment: 0.00 MiB


In [22]:
# two independently drawn sub-populations, each a quarter of the samples;
# they are sampled separately, so they may overlap
gsub1 = np.random.choice(g.shape[1], size=g.shape[1] // 4, replace=False)
gsub2 = np.random.choice(g.shape[1], size=g.shape[1] // 4, replace=False)
subpops = dict(sub1=gsub1, sub2=gsub2)

In [23]:
# compare count_alleles_subpops against calling count_alleles once per sub-population
%timeit gc.count_alleles_subpops(max_allele=2, subpops=subpops)
%timeit {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}
%memit gc.count_alleles_subpops(max_allele=2, subpops=subpops)
%memit {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}

1 loops, best of 3: 233 ms per loop
1 loops, best of 3: 331 ms per loop
peak memory: 275.93 MiB, increment: 0.00 MiB
peak memory: 275.93 MiB, increment: 0.00 MiB


In [24]:
# keep the combined result: a table with one column of counts per sub-population
x1 = gc.count_alleles_subpops(max_allele=2, subpops=subpops)
x1

ctable((50000,), [('sub1', '<u4', (3,)), ('sub2', '<u4', (3,))])
  nbytes: 1.14 MB; cbytes: 681.47 KB; ratio: 1.72
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[([458, 42, 0], [465, 35, 0]) ([472, 28, 0], [471, 29, 0])
 ([470, 30, 0], [476, 24, 0]) ..., ([473, 27, 0], [466, 34, 0])
 ([467, 33, 0], [464, 36, 0]) ([467, 33, 0], [465, 35, 0])]

In [25]:
# allele counts for the first sub-population
x1['sub1']

carray((50000, 3), uint32)
  nbytes: 585.94 KB; cbytes: 340.73 KB; ratio: 1.72
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[458  42   0]
 [472  28   0]
 [470  30   0]
 ..., 
 [473  27   0]
 [467  33   0]
 [467  33   0]]

In [26]:
# allele counts for the second sub-population
x1['sub2']

carray((50000, 3), uint32)
  nbytes: 585.94 KB; cbytes: 340.74 KB; ratio: 1.72
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[465  35   0]
 [471  29   0]
 [476  24   0]
 ..., 
 [466  34   0]
 [464  36   0]
 [465  35   0]]

In [27]:
# reference result: one separate count_alleles call per sub-population, keyed by name
x2 = {name: gc.count_alleles(max_allele=2, subpop=subpop) for name, subpop in subpops.items()}
x2

{'sub1': AlleleCountsCArray((50000, 3), uint32)
   nbytes: 585.94 KB; cbytes: 340.73 KB; ratio: 1.72
   cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
 [[458  42   0]
  [472  28   0]
  [470  30   0]
  ..., 
  [473  27   0]
  [467  33   0]
  [467  33   0]], 'sub2': AlleleCountsCArray((50000, 3), uint32)
   nbytes: 585.94 KB; cbytes: 340.74 KB; ratio: 1.72
   cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
 [[465  35   0]
  [471  29   0]
  [476  24   0]
  ..., 
  [466  34   0]
  [464  36   0]
  [465  35   0]]}

In [29]:
# verify that both approaches agree for every sub-population
# (the original check covered only 'sub2' and left 'sub1' unverified)
all(np.array_equal(x1[name], x2[name]) for name in subpops)

True

## Subset operations

In [37]:
# random boolean mask over variants (each kept with probability ~1/2); show how many are selected
variants = np.random.randint(0, 2, size=n_variants).astype(bool)
np.count_nonzero(variants)

25140

In [38]:
# random boolean mask over samples (each kept with probability ~1/2); show how many are selected
samples = np.random.randint(0, 2, size=n_samples).astype(bool)
np.count_nonzero(samples)

498

In [41]:
# boolean-mask selection along the variants axis, numpy vs. compressed
%timeit x = g.compress(variants, axis=0)
%timeit x = gc.compress(variants, axis=0)
%memit x = g.compress(variants, axis=0)
%memit x = gc.compress(variants, axis=0)

10 loops, best of 3: 47.6 ms per loop
1 loops, best of 3: 258 ms per loop
peak memory: 424.01 MiB, increment: 87.75 MiB
peak memory: 384.21 MiB, increment: 0.00 MiB


In [39]:
# same selection expressed as integer indices instead of a boolean mask
# NOTE: `indices` is rebound to sample indices in a later cell - re-run this cell
# before repeating the axis=0 timings
indices = np.nonzero(variants)[0]
%timeit x = g.take(indices, axis=0)
%timeit x = gc.take(indices, axis=0)
%memit x = g.take(indices, axis=0)
%memit x = gc.take(indices, axis=0)

10 loops, best of 3: 47.4 ms per loop
1 loops, best of 3: 259 ms per loop
peak memory: 426.03 MiB, increment: 89.78 MiB
peak memory: 384.21 MiB, increment: 0.00 MiB


In [42]:
# boolean-mask selection along the samples axis, numpy vs. compressed
%timeit x = g.compress(samples, axis=1)
%timeit x = gc.compress(samples, axis=1)
%memit x = g.compress(samples, axis=1)
%memit x = gc.compress(samples, axis=1)

10 loops, best of 3: 106 ms per loop
1 loops, best of 3: 348 ms per loop
peak memory: 431.26 MiB, increment: 95.00 MiB
peak memory: 383.76 MiB, increment: 0.00 MiB


In [43]:
# integer-index selection along the samples axis
# NOTE: this rebinds `indices`, which previously held the variant indices
indices = np.nonzero(samples)[0]
%timeit x = g.take(indices, axis=1)
%timeit x = gc.take(indices, axis=1)
%memit x = g.take(indices, axis=1)
%memit x = gc.take(indices, axis=1)

10 loops, best of 3: 106 ms per loop
1 loops, best of 3: 360 ms per loop
peak memory: 431.15 MiB, increment: 94.88 MiB
peak memory: 383.77 MiB, increment: 0.00 MiB


In [44]:
# select along both axes in one call (variant mask and sample mask), numpy vs. compressed
%timeit x = g.subset(variants, samples)
%timeit x = gc.subset(variants, samples)
%memit x = g.subset(variants, samples)
%memit x = gc.subset(variants, samples)

1 loops, best of 3: 272 ms per loop
1 loops, best of 3: 480 ms per loop
peak memory: 336.28 MiB, increment: 0.00 MiB
peak memory: 336.28 MiB, increment: 0.00 MiB


In [45]:
# inspect the subset on the compressed backend; its shape should match the two mask counts
gc.subset(variants, samples)

GenotypeCArray((25140, 498, 2), int8)
  nbytes: 23.88 MB; cbytes: 4.33 MB; ratio: 5.52
  cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[[[0 0]
  [0 0]
  [0 0]
  ..., 
  [0 0]
  [0 1]
  [0 0]]

 [[0 0]
  [0 0]
  [0 0]
  ..., 
  [0 0]
  [0 0]
  [0 0]]

 [[0 0]
  [0 1]
  [0 1]
  ..., 
  [0 0]
  [0 0]
  [0 0]]

 ..., 
 [[0 1]
  [0 0]
  [0 0]
  ..., 
  [0 0]
  [0 0]
  [0 0]]

 [[0 0]
  [0 0]
  [0 0]
  ..., 
  [0 0]
  [0 1]
  [0 0]]

 [[0 1]
  [0 0]
  [0 0]
  ..., 
  [0 0]
  [0 0]
  [0 0]]]

## Sandbox