In [1]:
import h5py
import numpy as np
import bcolz
import allel
allel.__version__

'1.0.0.dev16'

In [2]:
# data from http://www.malariagen.net/data/ag1000g-phase1-ar3
callset = h5py.File('/data/coluzzi/ag1000g/data/phase1/release/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.h5',
                    mode='r')

## Chunked arrays

In [3]:
genotype = allel.GenotypeChunkedArray(callset['3L/calldata/genotype'])
genotype

Unnamed: 0,0,1,2,3,4,...,760,761,762,763,764
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
3,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
4,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0


In [4]:
%time genotype.max()

CPU times: user 48.6 s, sys: 388 ms, total: 49 s
Wall time: 56.9 s


3

In [5]:
genotype_bcolzmem = genotype.copy(stop=1000000, storage='bcolzmem')
%time genotype_bcolzmem.max()

CPU times: user 3.04 s, sys: 16 ms, total: 3.06 s
Wall time: 2.36 s


3

In [6]:
genotype_bcolztmp = genotype.copy(stop=1000000, storage='bcolztmp', dir='/magnetic')
%time genotype_bcolztmp.max()

CPU times: user 2.96 s, sys: 116 ms, total: 3.08 s
Wall time: 2.4 s


3

In [7]:
genotype_bcolzmem_zlib1 = genotype.copy(stop=1000000, storage='bcolzmem_zlib1')
%time genotype_bcolzmem_zlib1.max()

CPU times: user 5.21 s, sys: 44 ms, total: 5.26 s
Wall time: 2.9 s


3

In [8]:
genotype_bcolztmp_zlib1 = genotype.copy(stop=1000000, storage='bcolztmp_zlib1', dir='/magnetic')
%time genotype_bcolztmp_zlib1.max()

CPU times: user 5.28 s, sys: 80 ms, total: 5.36 s
Wall time: 3.05 s


3

In [9]:
genotype_hdf5mem_zlib1 = genotype.copy(stop=1000000, storage='hdf5mem_zlib1')
%time genotype_hdf5mem_zlib1.max()

CPU times: user 4.48 s, sys: 0 ns, total: 4.48 s
Wall time: 4.47 s


3

In [10]:
genotype_hdf5tmp_zlib1 = genotype.copy(stop=1000000, storage='hdf5tmp_zlib1', dir='/magnetic')
%time genotype_hdf5tmp_zlib1.max()

CPU times: user 4.61 s, sys: 0 ns, total: 4.61 s
Wall time: 4.6 s


3

### Tune default HDF5 chunk size

In [11]:
genotype.copy(storage='bcolzmem').chunklen

2741

In [12]:
genotype_bcolzmem.chunklen

1370

In [13]:
genotype_hdf5mem_zlib1.chunks

(685, 765, 2)

In [14]:
for chunksize in 2**15, 2**16, 2**17, 2**18, 2**19, 2**20, 2**21, 2**22, 2**23:
    chunklen = chunksize // (genotype.shape[1] * genotype.shape[2])
    chunks = (chunklen,) + genotype.shape[1:]
    print(chunksize, chunklen)
    genotype_hdf5mem_zlib1_bigchunks = genotype.copy(stop=1000000, storage='hdf5mem_zlib1', chunks=chunks)
    %time genotype_hdf5mem_zlib1_bigchunks.max()

32768 21
CPU times: user 12.4 s, sys: 0 ns, total: 12.4 s
Wall time: 12.3 s
65536 42
CPU times: user 8.15 s, sys: 0 ns, total: 8.15 s
Wall time: 8.14 s
131072 85
CPU times: user 7.32 s, sys: 0 ns, total: 7.32 s
Wall time: 7.32 s
262144 171
CPU times: user 6.08 s, sys: 8 ms, total: 6.09 s
Wall time: 6.09 s
524288 342
CPU times: user 4.68 s, sys: 0 ns, total: 4.68 s
Wall time: 4.68 s
1048576 685
CPU times: user 4.91 s, sys: 0 ns, total: 4.91 s
Wall time: 4.9 s
2097152 1370
CPU times: user 4.26 s, sys: 0 ns, total: 4.26 s
Wall time: 4.26 s
4194304 2741
CPU times: user 4.18 s, sys: 4 ms, total: 4.19 s
Wall time: 4.19 s
8388608 5482
CPU times: user 3.97 s, sys: 0 ns, total: 3.97 s
Wall time: 3.96 s


### Excercise methods

In [15]:
%time genotype_bcolzmem.is_called()

CPU times: user 6.86 s, sys: 80 ms, total: 6.94 s
Wall time: 2.93 s


ChunkedArray((1000000, 765), bool, bcolz.carray_ext.carray)

In [16]:
%time genotype_hdf5mem_zlib1.is_called(storage='hdf5mem_zlib1')

CPU times: user 10.8 s, sys: 108 ms, total: 10.9 s
Wall time: 7.58 s


ChunkedArray((1000000, 765), bool, h5py._hl.dataset.Dataset)

In [17]:
%time genotype_bcolzmem.count_het()

CPU times: user 9.14 s, sys: 76 ms, total: 9.22 s
Wall time: 3.59 s


28029171

In [18]:
%time genotype_hdf5mem_zlib1.count_het()

CPU times: user 11.1 s, sys: 84 ms, total: 11.2 s
Wall time: 6.21 s


28029171

In [19]:
%time genotype_bcolzmem.count_alleles()

CPU times: user 7.48 s, sys: 100 ms, total: 7.58 s
Wall time: 6.2 s


Unnamed: 0,0,1,2,3
0,1527,3,0,0
1,1529,1,0,0
2,1528,2,0,0
3,1528,2,0,0
4,1526,4,0,0


In [20]:
%time genotype_bcolzmem_zlib1.count_alleles()

CPU times: user 11.9 s, sys: 96 ms, total: 12 s
Wall time: 7.4 s


Unnamed: 0,0,1,2,3
0,1527,3,0,0
1,1529,1,0,0
2,1528,2,0,0
3,1528,2,0,0
4,1526,4,0,0


In [21]:
%time genotype_hdf5mem_zlib1.count_alleles(storage='hdf5mem_zlib1')

CPU times: user 12.6 s, sys: 4 ms, total: 12.6 s
Wall time: 12.6 s


Unnamed: 0,0,1,2,3
0,1527,3,0,0
1,1529,1,0,0
2,1528,2,0,0
3,1528,2,0,0
4,1526,4,0,0


In [22]:
acs = genotype_bcolzmem.count_alleles_subpops(subpops={'pop1': list(range(100)),
                                                       'pop2': list(range(100, 200))},
                                              max_allele=3)
acs

pop2,pop1
[200 0 0 0],[200 0 0 0]
[200 0 0 0],[199 1 0 0]
[200 0 0 0],[200 0 0 0]
[200 0 0 0],[198 2 0 0]
[200 0 0 0],[196 4 0 0]


In [23]:
acs['pop1']

Unnamed: 0,0,1,2,3
0,200,0,0,0
1,199,1,0,0
2,200,0,0,0
3,198,2,0,0
4,196,4,0,0


In [24]:
acs = genotype_hdf5mem_zlib1.count_alleles_subpops(subpops={'pop1': list(range(100)),
                                                            'pop2': list(range(100, 200))},
                                                   max_allele=3,
                                                   storage='hdf5mem_zlib1')
acs

pop2,pop1
[200 0 0 0],[200 0 0 0]
[200 0 0 0],[199 1 0 0]
[200 0 0 0],[200 0 0 0]
[200 0 0 0],[198 2 0 0]
[200 0 0 0],[196 4 0 0]


In [25]:
acs['pop1']

Unnamed: 0,0,1,2,3
0,200,0,0,0
1,199,1,0,0
2,200,0,0,0
3,198,2,0,0
4,196,4,0,0


In [26]:
%time genotype_bcolzmem.to_haplotypes()

CPU times: user 3.12 s, sys: 116 ms, total: 3.23 s
Wall time: 1.53 s


Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529
0,0,0,0,0,0,...,0,0,0,0,0
1,0,0,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,0
4,0,0,0,0,0,...,0,0,0,0,0


In [27]:
%time genotype_bcolzmem_zlib1.to_haplotypes(storage='bcolzmem_zlib1')

CPU times: user 13.6 s, sys: 572 ms, total: 14.2 s
Wall time: 4.66 s


Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529
0,0,0,0,0,0,...,0,0,0,0,0
1,0,0,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,0
4,0,0,0,0,0,...,0,0,0,0,0


In [28]:
%time genotype_hdf5mem_zlib1.to_haplotypes(storage='hdf5mem_zlib1')

CPU times: user 9.58 s, sys: 20 ms, total: 9.6 s
Wall time: 9.6 s


Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529
0,0,0,0,0,0,...,0,0,0,0,0
1,0,0,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,0
4,0,0,0,0,0,...,0,0,0,0,0


In [29]:
ac = genotype_bcolzmem.count_alleles()
ac

Unnamed: 0,0,1,2,3
0,1527,3,0,0
1,1529,1,0,0
2,1528,2,0,0
3,1528,2,0,0
4,1526,4,0,0


In [30]:
ac.is_segregating()

ChunkedArray((1000000,), bool, bcolz.carray_ext.carray)

In [31]:
ac.max_allele()

ChunkedArray((1000000,), int8, bcolz.carray_ext.carray)

### Copy into persistent HDF5 file

In [32]:
h5f = h5py.File('/magnetic/test.h5', mode='a')

In [33]:
h5g = h5f.require_group('calldata')
h5g

<HDF5 group "/calldata" (2 members)>

In [34]:
if 'genotype' in h5g:
    del h5g['genotype']

In [35]:
gc = genotype.copy(stop=100000, storage='hdf5', group=h5g, name='genotype', compression='gzip', compression_opts=1)
gc

Unnamed: 0,0,1,2,3,4,...,760,761,762,763,764
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
3,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0
4,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0


In [36]:
h5g['genotype']

<HDF5 dataset "genotype": shape (100000, 765, 2), type "|i1">

In [37]:
ac = gc.count_alleles(storage='hdf5', group=h5g, name='allele_counts', compression='gzip', compression_opts=1)
ac

RuntimeError: Unable to create link (Name already exists)

In [None]:
h5g['allele_counts']

## Chunked tables

In [None]:
vt = allel.VariantChunkedTable(callset['3L/variants'])
vt

In [None]:
vt['CHROM']

In [None]:
vt[['CHROM', 'POS']]

In [None]:
query1 = '(AN == 1530) & (MQ > 40)'

In [None]:
%timeit vt.eval(query1, vm='numexpr')

In [None]:
%timeit vt.eval(query1, vm='python')

In [None]:
cond = vt.eval(query1, vm='numexpr')
cond.shape, cond.count_nonzero()

In [None]:
# can only be evaluated with python vm
query2 = '(AN == 1530) & (MQ > 40) & (AC[:, 1] > 5)'

In [None]:
%time vt.eval(query2, vm='python')

In [None]:
vt_bcolzmem = vt.copy(storage='bcolzmem', stop=1000000)
%timeit vt_bcolzmem.eval(query1, vm='numexpr')

In [None]:
%time vt_bcolzmem.query(query1)

In [None]:
vt_hdf5mem_zlib1 = vt.copy(storage='hdf5mem_zlib1', stop=1000000)
%timeit vt_hdf5mem_zlib1.eval(query1, vm='numexpr')

In [None]:
%time vt_hdf5mem_zlib1.query(query1)

### Copy into existing HDF5 file

In [None]:
h5g = h5f.require_group('variants')
h5g

In [None]:
for k in h5g:
    del h5g[k]

In [None]:
vtc = vt[['CHROM', 'POS', 'AN', 'AC']].copy(stop=100000, storage='hdf5_zlib1', group=h5g)
vtc

In [None]:
h5g

## Sysadmin

In [None]:
!yes | pip3 uninstall scikit-allel

In [None]:
!cd /src/github/cggh/scikit-allel/ && python3 setup.py install