In [1]:
import numpy as np
import dask.async
import dask.array as da
import os
os.chdir('../..')
import allel

In [2]:
g = allel.GenotypeArray([[[0, 0], [0, 1], [0, 1]],
                         [[0, 1], [1, 1], [1, 1]],
                         [[1, 1], [1, 1], [0, 0]],
                         [[1, 1], [1, 1], [0, 0]],
                         [[1, 1], [0, 1], [-1, -1]]], dtype='i1')
g

Unnamed: 0,0,1,2
0,0/0,0/1,0/1
1,0/1,1/1,1/1
2,1/1,1/1,0/0
3,1/1,1/1,0/0
4,1/1,0/1,./.


In [3]:
# the mapping array is effectively a look-up table, telling how to transform
# integer values in each row
mapping = np.array([[0, 1],  # no transformation
                    [1, 0],  # 0->1, 1->0
                    [0, 0],  # 0->0, 1->0
                    [1, 0],  # 0->1, 1->0
                    [0, 1]], # no transformation
                   dtype='i1')

In [4]:
# the only shape constraint is that size of first dimension must match
assert g.shape[0] == mapping.shape[0]

In [5]:
# this is the pure numpy implementation
expect = g.map_alleles(mapping)
expect

Unnamed: 0,0,1,2
0,0/0,0/1,0/1
1,1/0,0/0,0/0
2,0/0,0/0,0/0
3,0/0,0/0,1/1
4,1/1,0/1,./.


## Dask with 1D chunking

In [6]:
chunks_dim0 = 2

In [7]:
gd1 = da.from_array(g, chunks=(chunks_dim0, None, None))  # N.B., chunk across entire rows
gd1

dask.array<from-ar..., shape=(5, 3, 2), dtype=int8, chunksize=(2, 3, 2)>

In [8]:
md = da.from_array(mapping, chunks=(chunks_dim0, None))  # N.B., first dimension chunk size matches gd1
md

dask.array<from-ar..., shape=(5, 2), dtype=int8, chunksize=(2, 2)>

In [9]:
def f(block, bmapping):
    return allel.GenotypeArray(block).map_alleles(bmapping[:, 0, :])

In [10]:
gmapped1 = da.map_blocks(f, gd1, md[:, None, :])
gmapped1

dask.array<atop-3a..., shape=(5, 3, 2), dtype=None, chunksize=(2, 3, 2)>

In [11]:
allel.GenotypeArray(gmapped1.compute())

Unnamed: 0,0,1,2
0,0/0,0/1,0/1
1,1/0,0/0,0/0
2,0/0,0/0,0/0
3,0/0,0/0,1/1
4,1/1,0/1,./.


In [12]:
np.array_equal(expect, gmapped1.compute())

True

## Dask with 2D chunking

In [13]:
chunks_dim1 = 2

In [14]:
gd2 = da.from_array(g, chunks=(chunks_dim0, chunks_dim1, None))
gd2

dask.array<from-ar..., shape=(5, 3, 2), dtype=int8, chunksize=(2, 2, 2)>

In [15]:
gmapped2 = da.map_blocks(f, gd2, md[:, None, :])
gmapped2

dask.array<atop-a5..., shape=(5, 3, 2), dtype=None, chunksize=(2, 2, 2)>

In [16]:
np.array_equal(expect, gmapped2.compute())

True

It works!

## Different data

In [17]:
g = allel.GenotypeArray([
    [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1]],
    [[0, 2], [1, 1], [0, 2], [1, 1], [-1, -1]],
    [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1]],
    [[0, 2], [1, 1], [0, 2], [1, 1], [-1, -1]],
    [[1, 0], [2, 1], [1, 0], [2, 1], [-1, -1]],
    [[2, 2], [-1, -1], [2, 2], [-1, -1], [-1, -1]],
    [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]
], dtype='i1')
g

Unnamed: 0,0,1,2,3,4
0,0/0,0/1,0/0,0/1,./.
1,0/2,1/1,0/2,1/1,./.
2,0/0,0/1,0/0,0/1,./.
3,0/2,1/1,0/2,1/1,./.
4,1/0,2/1,1/0,2/1,./.


In [18]:
mapping = np.array([[0, 1, 2],
                    [2, 0, 1],
                    [1, 2, 0],
                    [2, 1, 0],
                    [1, 2, 0],
                    [2, 1, 0],
                    [2, 0, 1]], dtype=np.int8)

In [19]:
g.map_alleles(mapping)

Unnamed: 0,0,1,2,3,4
0,0/0,0/1,0/0,0/1,./.
1,2/1,0/0,2/1,0/0,./.
2,1/1,1/2,1/1,1/2,./.
3,2/0,1/1,2/0,1/1,./.
4,2/1,0/2,2/1,0/2,./.


In [20]:
gd = da.from_array(g, chunks=(2, 2, None))
gd

dask.array<from-ar..., shape=(7, 5, 2), dtype=int8, chunksize=(2, 2, 2)>

In [21]:
md = da.from_array(mapping, chunks=(2, None))
md

dask.array<from-ar..., shape=(7, 3), dtype=int8, chunksize=(2, 3)>

In [22]:
gm = da.map_blocks(f, gd, md[:, None, :])

ValueError: Shapes do not align {0: {(2,), (3,)}, 1: {(2, 2, 1), (1,)}, 2: {(2, 2, 2, 1)}}

Try something different...

In [23]:
def ff(block, bmapping):
    return allel.GenotypeArray(block[:, :, :, 0]).map_alleles(bmapping[:, 0, 0, :])

In [24]:
res = da.map_blocks(ff, gd[:, :, :, None], md[:, None, None, :], drop_dims=3)
res

dask.array<atop-29..., shape=(7, 5, 2, 3), dtype=None, chunksize=(2, 2, 2, 3)>

In [25]:
res.compute()

KeyError: ('atop-29ab5e04a84132ebc0c067349e2a91b4', 2, 1, 0, 0)