-
Notifications
You must be signed in to change notification settings - Fork 2
/
h5.py
338 lines (264 loc) · 10.6 KB
/
h5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
# -*- coding: utf-8 -*-
"""
Utility functions for working with data stored in HDF5 files.
**HDF5 file convention for variant call sets**
Note that this module assumes that data for a variant call set has been
organised in an HDF5 file following a particular convention. Briefly,
the convention is as follows.
An HDF5 file may contain one or more call sets. Each call set is stored
within a separate group. A call set may be stored within the root group.
Within each call set, data are grouped by chromosome.
Within each chromosome group, there are two subgroups,
named `variants` and `calldata`.
The `variants` group contains one or more datasets holding data on the
variants in the call set. The first dimension of all `variants` datasets
must have the same length, being the number of variants on the chromosome.
The `variants` group must contain a `POS` dataset holding the genome
positions of the variants.
The `calldata` group contains one or more datasets holding data relating to
genotype calls. The first dimension of all `calldata` datasets must have
the same length, being the number of variants on the chromosome. The second
dimension of all `calldata` datasets must have the same length,
being the number of samples in the cohort.
In addition to the above, a `samples` dataset may be stored within the
callset, providing a list of labels or identifiers for the samples in the
cohort. This `samples` dataset may be stored as a child of the callset group
and/or as a child of the chromosome groups.
So, for example, an HDF5 file containing a SNP call set for a cohort of
*Anopheles gambiae* samples with chromosomes (2R, 2L, 3R, 3L, X)
might be organised as follows::
/ [callset group]
/samples [dataset, shape (n_samples,), dtype string]
/2L [chromosome group]
/2L/variants [variants group]
/2L/variants/POS [dataset, shape (n_variants,), dtype int32]
/2L/variants/REF [dataset, shape (n_variants,), dtype S1]
/2L/variants/ALT [dataset, shape (n_variants, 3), dtype S1]
/2L/variants/MQ [dataset, shape (n_variants,), dtype f4]
/2L/variants/...
/2L/calldata [calldata group]
/2L/calldata/genotype [dataset, shape (n_variants, n_samples, ploidy), dtype int8]
/2L/calldata/DP [dataset, shape (n_variants, n_samples), dtype=int32]
/2L/calldata/...
/3L/variants/...
/3L/calldata/...
/...
""" # noqa
from __future__ import division, print_function, absolute_import
# standard library dependencies
import itertools
from anhima.compat import string_types, range, force_bytes
# third party dependencies
import numpy as np
import h5py
import numexpr
# internal dependencies
import anhima.loc
def load_region(callset, chrom, start_position=0, stop_position=None,
variants_fields=None,
calldata_fields=None,
variants_query=None,
samples=None):
"""Load data into memory from `callset` for the given region.
Parameters
----------
callset : HDF5 file or group
A file or group containing a variant call set.
chrom : string
The chromosome to extract data for.
start_position : int, optional
The start position for the region to extract data for.
stop_position : int, optional
The stop position for the region to extract data for.
variants_fields : sequence of strings, optional
Names of the variants datasets to extract.
calldata_fields : sequence of strings, optional
Names of the calldata datasets to extract.
variants_query : string, optional
A query to filter variants. Note that this query is applied
after data for the region has been loaded, so any fields
referenced in this query need to be included in `variants_fields`.
samples : sequence of strings, optional
Selected samples to extract.
Returns
-------
variants : dict
A dictionary mapping dataset identifiers to ndarrays.
calldata : dict
A dictionary mapping dataset identifiers to ndarrays.
"""
# obtain chromosome group
grp_chrom = callset[chrom]
# setup output variables
variants = dict()
calldata = dict()
# obtain variant positions
pos = grp_chrom['variants']['POS']
# select samples needs list of all samples, check one is stored in the
# callset and fail early if not
all_samples = None
if samples is not None:
# find all samples
if 'samples' in callset.keys():
all_samples = list(callset['samples'])
elif 'samples' in grp_chrom.keys():
all_samples = list(grp_chrom['samples'])
else:
raise Exception('list of all samples not found in callset')
# locate region
loc = anhima.loc.locate_interval(pos, start_position, stop_position)
# extract variants data
if variants_fields:
if isinstance(variants_fields, string_types):
variants_fields = [variants_fields]
for f in variants_fields:
variants[f] = grp_chrom['variants'][f][loc, ...]
# extract calldata
if calldata_fields:
if isinstance(calldata_fields, string_types):
calldata_fields = [calldata_fields]
for f in calldata_fields:
calldata[f] = grp_chrom['calldata'][f][loc, ...]
# select variants
if variants_query is not None:
condition = numexpr.evaluate(variants_query, local_dict=variants)
for f in variants:
variants[f] = np.compress(condition, variants[f], axis=0)
for f in calldata:
calldata[f] = np.compress(condition, calldata[f], axis=0)
# select samples
if samples is not None:
# TODO check dtype of all_samples
samples = force_bytes(samples)
sample_indices = [all_samples.index(s) for s in samples]
for f in calldata:
calldata[f] = np.take(calldata[f], sample_indices, axis=1)
return variants, calldata
def take2d_pointsel(dataset, row_indices=None, col_indices=None,
block_size=1000):
"""
Load selected rows and optionally columns from an HDF5 dataset with 2 or
more dimensions, using HDF5 point selections.
Parameters
----------
dataset : HDF5 dataset
The dataset to load data from.
row_indices : sequence of ints, optional
The indices of the selected rows. If not provided, all rows will be
returned.
col_indices : sequence of ints, optional
The indices of the selected columns. If not provided, all columns
will be returned.
block_size : int, optional
The size (in number of points) of the block of data to load and
process at a time.
Returns
-------
out : ndarray
An array containing the selected rows and columns.
See Also
--------
anhima.util.take2d
Notes
-----
This function is similar to :func:`anhima.util.take2d` but uses an HDF5
point selection under the hood. Performance characteristics will be
different and may be much better or much worse, depending on the size,
shape and configuration of the dataset, and depending on the number of
points to be selected.
"""
n_rows_in = dataset.shape[0]
if row_indices:
row_indices = sorted(row_indices)
n_rows_out = len(row_indices)
else:
# select all rows
row_indices = range(n_rows_in)
n_rows_out = n_rows_in
n_cols_in = dataset.shape[1]
if col_indices:
# select all columns
col_indices = sorted(col_indices)
n_cols_out = len(col_indices)
else:
col_indices = range(n_cols_in)
n_cols_out = n_cols_in
n_items_out = n_rows_out * n_cols_out
# initialise output array
out = np.empty((n_items_out,), dtype=dataset.dtype)
# convert indices into coordinates
coords = itertools.product(row_indices, col_indices)
# set up selection
sel = h5py._hl.selections.PointSelection(dataset.shape)
typ = h5py.h5t.py_create(dataset.dtype)
# process blocks at a time
for block_start in range(0, n_items_out, block_size):
# materialise a block of coordinates
selection = np.asarray(list(itertools.islice(coords, block_size)))
# set selection
sel.set(selection)
# read data
block_stop = block_start + len(selection)
space = h5py.h5s.create_simple(sel.mshape)
dataset.id.read(space,
sel._id,
out[block_start:block_stop],
typ)
# reshape output array
out = out.reshape(n_rows_out, n_cols_out)
return out
def save_tped(path, callset, chrom,
start_position=0,
stop_position=None,
samples=None):
"""Save genotype data from an HDF5 callset to a Plink transposed format
file (TPED).
Parameters
----------
path : string or file-like
Path of file to write, or file-like object to write to.
callset : HDF5 file or group
A file or group containing a variant call set.
chrom : string
The chromosome to extract data for.
start_position : int, optional
The start position for the region to extract data for.
stop_position : int, optional
The stop position for the region to extract data for.
samples : sequence of strings, optional
Selection of samples to extract genotypes for, defaults to all samples.
Notes
-----
Note that the current implementation loads all data from the requested
region into memory before writing out to TPED, so may not be applicable
to very large datasets.
"""
variants, calldata = load_region(callset,
chrom,
start_position,
stop_position,
variants_fields=['POS', 'REF', 'ALT'],
calldata_fields=['genotype'])
# determine samples that we will use
if samples is None:
genotypes = calldata['genotype']
else:
samples = force_bytes(samples)
h5_samples = callset[chrom]['samples'][:].tolist()
genotypes = np.take(
calldata['genotype'],
[h5_samples.index(s) for s in samples],
axis=1)
ref = variants['REF']
alt = variants['ALT']
if alt.ndim > 1:
alt = alt[:, 0]
pos = variants['POS']
anhima.io.save_tped(path,
genotypes=genotypes,
ref=ref,
alt=alt,
pos=pos,
chromosome=chrom,
identifier=None,
genetic_distance=None)