
Commit 3ad2120

Merge pull request #221 from alimanfoo/no-tabix-ultimate-redux-alimanfoo-20181113

Support unsorted VCFs; better handling for no variants
alimanfoo committed Nov 14, 2018
2 parents c0c0df4 + d1e4635 commit 3ad2120
Showing 8 changed files with 8,687 additions and 8,498 deletions.
98 changes: 65 additions & 33 deletions allel/io/vcf_read.py
@@ -238,6 +238,10 @@ def read_vcf(input,
              log=None):
     """Read data from a VCF file into NumPy arrays.
 
+    .. versionchanged:: 1.12.0
+        Now returns None if no variants are found in the VCF file or matching the
+        requested region.
+
     Parameters
     ----------
     input : string or file-like
@@ -274,7 +278,7 @@ def read_vcf(input,
     Returns
     -------
     data : dict[str, ndarray]
-        A dictionary holding arrays.
+        A dictionary holding arrays, or None if no variants were found.
 
     """
@@ -303,13 +307,13 @@ def read_vcf(input,
     # read all chunks into a list
     chunks = [d[0] for d in it]
 
-    # setup output
-    output = dict()
+    if chunks:
 
-    if len(samples) > 0 and store_samples:
-        output['samples'] = samples
+        # setup output
+        output = dict()
 
-    if chunks:
+        if len(samples) > 0 and store_samples:
+            output['samples'] = samples
 
         # find array keys
         keys = sorted(chunks[0].keys())
@@ -318,6 +322,10 @@ def read_vcf(input,
         for k in keys:
             output[k] = np.concatenate([chunk[k] for chunk in chunks], axis=0)
 
+    else:
+
+        output = None
+
     return output
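For illustration, a minimal usage sketch of the new read_vcf contract (file name and region are hypothetical; assumes scikit-allel >= 1.12.0):

    import allel

    # a query matching no variants now yields None rather than an empty dict
    callset = allel.read_vcf('data.vcf.gz', region='20:1-1000')
    if callset is None:
        print('no variants found in the requested region')
    else:
        print(sorted(callset.keys()))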


@@ -367,6 +375,10 @@ def vcf_to_npz(input, output,
                log=None):
     """Read data from a VCF file into NumPy arrays and save as a .npz file.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output file if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -421,6 +433,10 @@ def vcf_to_npz(input, output,
         transformers=transformers
     )
 
+    if data is None:
+        # no data, bail out
+        return
+
     # setup save function
     if compressed:
         savez = np.savez_compressed
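The caller-visible effect of the early return above, as a sketch (paths hypothetical; assumes empty.vcf has a valid header but no variant records):

    import os.path
    import allel

    # vcf_to_npz now returns without writing anything when no variants are found
    allel.vcf_to_npz('empty.vcf', 'variants.npz')
    assert not os.path.exists('variants.npz')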
@@ -585,6 +601,10 @@ def vcf_to_hdf5(input, output,
                 log=None):
     """Read data from a VCF file and load into an HDF5 file.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output file if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -651,28 +671,35 @@ def vcf_to_hdf5(input, output,
     # noinspection PyTypeChecker
     store_samples, fields = _prep_fields_param(fields)
 
-    with h5py.File(output, mode='a') as h5f:
+    # setup chunk iterator
+    fields, samples, headers, it = iter_vcf_chunks(
+        input, fields=fields, exclude_fields=exclude_fields, types=types,
+        numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
+        chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
+        samples=samples, transformers=transformers
+    )
 
-        # obtain root group that data will be stored into
-        root = h5f.require_group(group)
+    # handle field renaming
+    if rename_fields:
+        rename_fields, it = _do_rename(it, fields=fields,
+                                       rename_fields=rename_fields,
+                                       headers=headers)
 
-        # setup chunk iterator
-        fields, samples, headers, it = iter_vcf_chunks(
-            input, fields=fields, exclude_fields=exclude_fields, types=types,
-            numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
-            chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
-            samples=samples, transformers=transformers
-        )
+    # setup progress logging
+    if log is not None:
+        it = _chunk_iter_progress(it, log, prefix='[vcf_to_hdf5]')
 
-        # handle field renaming
-        if rename_fields:
-            rename_fields, it = _do_rename(it, fields=fields,
-                                           rename_fields=rename_fields,
-                                           headers=headers)
+    # read first chunk
+    try:
+        chunk, _, _, _ = next(it)
+    except StopIteration:
+        # no data, bail out
+        return
 
-        # setup progress logging
-        if log is not None:
-            it = _chunk_iter_progress(it, log, prefix='[vcf_to_hdf5]')
+    with h5py.File(output, mode='a') as h5f:
+
+        # obtain root group that data will be stored into
+        root = h5f.require_group(group)
 
         if len(samples) > 0 and store_samples:
             # store samples
@@ -693,9 +720,6 @@ def vcf_to_hdf5(input, output,
                 t = samples.dtype
             root.create_dataset(name, data=samples, chunks=None, dtype=t)
 
-        # read first chunk
-        chunk, _, _, _ = next(it)
-
         # setup datasets
         # noinspection PyTypeChecker
         keys = _hdf5_setup_datasets(
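The reordering above follows a general pattern: consume the first chunk before creating the output resource, so that an empty iterator triggers an early return and no file is created or touched. A generic sketch of the pattern (names here are illustrative, not library API):

    def write_chunked(chunk_iter, create_store, write_chunk):
        # pull the first chunk before creating anything on disk
        try:
            first = next(chunk_iter)
        except StopIteration:
            # no data, bail out without creating the store
            return
        store = create_store()
        write_chunk(store, first)
        # write any remaining chunks
        for chunk in chunk_iter:
            write_chunk(store, chunk)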
@@ -820,6 +844,10 @@ def vcf_to_zarr(input, output,
                 log=None):
     """Read data from a VCF file and load into a Zarr on-disk store.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output files if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -871,9 +899,6 @@ def vcf_to_zarr(input, output,
     # noinspection PyTypeChecker
     store_samples, fields = _prep_fields_param(fields)
 
-    # open root group
-    root = zarr.open_group(output, mode='a', path=group)
-
     # setup chunk iterator
     fields, samples, headers, it = iter_vcf_chunks(
         input, fields=fields, exclude_fields=exclude_fields, types=types,
@@ -910,6 +935,16 @@ def vcf_to_zarr(input, output,
     if log is not None:
         it = _chunk_iter_progress(it, log, prefix='[vcf_to_zarr]')
 
+    # read first chunk
+    try:
+        chunk, _, _, _ = next(it)
+    except StopIteration:
+        # no data, bail out
+        return
+
+    # open root group
+    root = zarr.open_group(output, mode='a', path=group)
+
     if len(samples) > 0 and store_samples:
         # store samples
         if samples.dtype.kind == 'O':
@@ -922,9 +957,6 @@ def vcf_to_zarr(input, output,
         root.create_dataset('samples', data=samples, compressor=None, overwrite=overwrite,
                             dtype=dtype)
 
-    # read first chunk
-    chunk, _, _, _ = next(it)
-
     # setup datasets
     # noinspection PyTypeChecker
     keys = _zarr_setup_datasets(
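vcf_to_zarr now follows the same open-after-first-chunk order, so an empty input leaves no Zarr store behind; a usage sketch (paths hypothetical; assumes a header-only VCF):

    import os.path
    import allel

    allel.vcf_to_zarr('empty.vcf', 'variants.zarr')
    assert not os.path.exists('variants.zarr')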
