
Commit 3ad2120

Merge pull request #221 from alimanfoo/no-tabix-ultimate-redux-alimanfoo-20181113

Support unsorted VCFs; better handling for no variants
alimanfoo committed Nov 14, 2018
2 parents c0c0df4 + d1e4635 commit 3ad2120
Showing 8 changed files with 8,687 additions and 8,498 deletions.
98 changes: 65 additions & 33 deletions allel/io/vcf_read.py
@@ -238,6 +238,10 @@ def read_vcf(input,
              log=None):
     """Read data from a VCF file into NumPy arrays.
 
+    .. versionchanged:: 1.12.0
+        Now returns None if no variants are found in the VCF file or matching the
+        requested region.
+
     Parameters
     ----------
     input : string or file-like
@@ -274,7 +278,7 @@ def read_vcf(input,
     Returns
     -------
     data : dict[str, ndarray]
-        A dictionary holding arrays.
+        A dictionary holding arrays, or None if no variants were found.
 
     """
@@ -303,13 +307,13 @@ def read_vcf(input,
     # read all chunks into a list
     chunks = [d[0] for d in it]
 
-    # setup output
-    output = dict()
+    if chunks:
 
-    if len(samples) > 0 and store_samples:
-        output['samples'] = samples
+        # setup output
+        output = dict()
 
-    if chunks:
+        if len(samples) > 0 and store_samples:
+            output['samples'] = samples
 
         # find array keys
         keys = sorted(chunks[0].keys())
@@ -318,6 +322,10 @@ def read_vcf(input,
         for k in keys:
             output[k] = np.concatenate([chunk[k] for chunk in chunks], axis=0)
 
+    else:
+
+        output = None
+
     return output
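For illustration, a minimal usage sketch of the new read_vcf contract (file name and region are hypothetical; assumes scikit-allel >= 1.12.0):

    import allel

    # a query matching no variants now yields None rather than an empty dict
    callset = allel.read_vcf('data.vcf.gz', region='20:1-1000')
    if callset is None:
        print('no variants found in the requested region')
    else:
        print(sorted(callset.keys()))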


@@ -367,6 +375,10 @@ def vcf_to_npz(input, output,
                log=None):
     """Read data from a VCF file into NumPy arrays and save as a .npz file.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output file if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -421,6 +433,10 @@ def vcf_to_npz(input, output,
         transformers=transformers
     )
 
+    if data is None:
+        # no data, bail out
+        return
+
     # setup save function
     if compressed:
         savez = np.savez_compressed
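The caller-visible effect of the early return above, as a sketch (paths hypothetical; assumes empty.vcf has a valid header but no variant records):

    import os.path
    import allel

    # vcf_to_npz now returns without writing anything when no variants are found
    allel.vcf_to_npz('empty.vcf', 'variants.npz')
    assert not os.path.exists('variants.npz')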
@@ -585,6 +601,10 @@ def vcf_to_hdf5(input, output,
                 log=None):
     """Read data from a VCF file and load into an HDF5 file.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output file if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -651,28 +671,35 @@ def vcf_to_hdf5(input, output,
     # noinspection PyTypeChecker
     store_samples, fields = _prep_fields_param(fields)
 
-    with h5py.File(output, mode='a') as h5f:
+    # setup chunk iterator
+    fields, samples, headers, it = iter_vcf_chunks(
+        input, fields=fields, exclude_fields=exclude_fields, types=types,
+        numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
+        chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
+        samples=samples, transformers=transformers
+    )
 
-        # obtain root group that data will be stored into
-        root = h5f.require_group(group)
+    # handle field renaming
+    if rename_fields:
+        rename_fields, it = _do_rename(it, fields=fields,
+                                       rename_fields=rename_fields,
+                                       headers=headers)
 
-        # setup chunk iterator
-        fields, samples, headers, it = iter_vcf_chunks(
-            input, fields=fields, exclude_fields=exclude_fields, types=types,
-            numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
-            chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
-            samples=samples, transformers=transformers
-        )
+    # setup progress logging
+    if log is not None:
+        it = _chunk_iter_progress(it, log, prefix='[vcf_to_hdf5]')
 
-        # handle field renaming
-        if rename_fields:
-            rename_fields, it = _do_rename(it, fields=fields,
-                                           rename_fields=rename_fields,
-                                           headers=headers)
+    # read first chunk
+    try:
+        chunk, _, _, _ = next(it)
+    except StopIteration:
+        # no data, bail out
+        return
 
-        # setup progress logging
-        if log is not None:
-            it = _chunk_iter_progress(it, log, prefix='[vcf_to_hdf5]')
+    with h5py.File(output, mode='a') as h5f:
+
+        # obtain root group that data will be stored into
+        root = h5f.require_group(group)
 
         if len(samples) > 0 and store_samples:
             # store samples
@@ -693,9 +720,6 @@ def vcf_to_hdf5(input, output,
                 t = samples.dtype
             root.create_dataset(name, data=samples, chunks=None, dtype=t)
 
-        # read first chunk
-        chunk, _, _, _ = next(it)
-
         # setup datasets
         # noinspection PyTypeChecker
         keys = _hdf5_setup_datasets(
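The reordering above follows a general pattern: consume the first chunk before creating the output resource, so that an empty iterator triggers an early return and no file is created or touched. A generic sketch of the pattern (names here are illustrative, not library API):

    def write_chunked(chunk_iter, create_store, write_chunk):
        # pull the first chunk before creating anything on disk
        try:
            first = next(chunk_iter)
        except StopIteration:
            # no data, bail out without creating the store
            return
        store = create_store()
        write_chunk(store, first)
        # write any remaining chunks
        for chunk in chunk_iter:
            write_chunk(store, chunk)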
@@ -820,6 +844,10 @@ def vcf_to_zarr(input, output,
                 log=None):
     """Read data from a VCF file and load into a Zarr on-disk store.
 
+    .. versionchanged:: 1.12.0
+        Now will not create any output files if no variants are found in the VCF file or
+        matching the requested region.
+
     Parameters
     ----------
     input : string
@@ -871,9 +899,6 @@ def vcf_to_zarr(input, output,
     # noinspection PyTypeChecker
     store_samples, fields = _prep_fields_param(fields)
 
-    # open root group
-    root = zarr.open_group(output, mode='a', path=group)
-
     # setup chunk iterator
     fields, samples, headers, it = iter_vcf_chunks(
         input, fields=fields, exclude_fields=exclude_fields, types=types,
@@ -910,6 +935,16 @@ def vcf_to_zarr(input, output,
     if log is not None:
         it = _chunk_iter_progress(it, log, prefix='[vcf_to_zarr]')
 
+    # read first chunk
+    try:
+        chunk, _, _, _ = next(it)
+    except StopIteration:
+        # no data, bail out
+        return
+
+    # open root group
+    root = zarr.open_group(output, mode='a', path=group)
+
     if len(samples) > 0 and store_samples:
         # store samples
         if samples.dtype.kind == 'O':
@@ -922,9 +957,6 @@ def vcf_to_zarr(input, output,
         root.create_dataset('samples', data=samples, compressor=None, overwrite=overwrite,
                             dtype=dtype)
 
-    # read first chunk
-    chunk, _, _, _ = next(it)
-
     # setup datasets
     # noinspection PyTypeChecker
     keys = _zarr_setup_datasets(
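vcf_to_zarr now follows the same open-after-first-chunk order, so an empty input leaves no Zarr store behind; a usage sketch (paths hypothetical; assumes a header-only VCF):

    import os.path
    import allel

    allel.vcf_to_zarr('empty.vcf', 'variants.zarr')
    assert not os.path.exists('variants.zarr')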
