added support for conversion of very-long reads from third-generation…

… sequencer BAM data to BED format by way of dynamic memory allocation support in bam/sam2bed routines; enforcement of minimal BED sort order in starch input
bedops · Nov 4, 2016 · 502726e · 502726e
1 parent 587cefb
commit 502726e
Show file tree

Hide file tree

Showing 6 changed files with 735 additions and 192 deletions.
diff --git a/applications/bed/conversion/src/convert2bed.c b/applications/bed/conversion/src/convert2bed.c
diff --git a/applications/bed/conversion/src/convert2bed.h b/applications/bed/conversion/src/convert2bed.h
@@ -73,6 +73,9 @@ const boolean kFalse = 0;
 #define C2B_MAX_VCF_FIELD_COUNT_VALUE 24576
 #define C2B_SAM_CIGAR_OPS_VALUE_INITIAL 1
 #define C2B_SAM_CIGAR_OPS_VALUE_INCREMENT 1000
+#define C2B_SAM_ELEMENT_FIELD_LENGTH_VALUE_INITIAL 32
+#define C2B_SAM_ELEMENT_FIELD_LENGTH_VALUE_EXTENSION 32
+#define C2B_TEST_BUFFER_SIZE 5000000
 
 extern const char *c2b_samtools;
 extern const char *c2b_sort_bed;
@@ -255,19 +258,33 @@ const char default_cigar_op_operation = '-';
 
 typedef struct sam {
     char *qname;
+    ssize_t qname_capacity;
+    char *modified_qname;
+    ssize_t modified_qname_capacity;
     int flag;
     char *strand;
+    ssize_t strand_capacity;
     char *rname;
+    ssize_t rname_capacity;
     uint64_t start;
     uint64_t stop;
     char *mapq;
+    ssize_t mapq_capacity;
     char *cigar;
+    ssize_t cigar_capacity;
     char *rnext;
+    ssize_t rnext_capacity;
     char *pnext;
+    ssize_t pnext_capacity;
     char *tlen;
+    ssize_t tlen_capacity;
     char *seq;
+    ssize_t seq_capacity;
     char *qual;
+    ssize_t qual_capacity;
     char *opt;
+    ssize_t opt_length;
+    ssize_t opt_capacity;
 } c2b_sam_t;
 
 /* 
@@ -553,6 +570,7 @@ typedef struct pipeline_stage {
     int status;
     char *description;
     pid_t pid;
+    ssize_t buffer_size;
 } c2b_pipeline_stage_t;
 
 #define PIPE4_FLAG_NONE       (0U)
@@ -1231,6 +1249,7 @@ typedef struct rmsk_state {
 typedef struct sam_state {
     char *samtools_path;
     c2b_cigar_t *cigar;
+    c2b_sam_t *element;
 } c2b_sam_state_t;
 
 typedef struct vcf_state {
@@ -1366,11 +1385,13 @@ extern "C" {
     static void              c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
     static void              c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *dest_size, char *src, ssize_t src_size); 
     static inline void       c2b_sam_cigar_str_to_ops(char *s);
+    static void              c2b_sam_init_element(c2b_sam_t **e);
+    static void              c2b_sam_delete_element(c2b_sam_t *e);
     static void              c2b_sam_init_cigar_ops(c2b_cigar_t **c, const ssize_t size);
     static void              c2b_sam_resize_cigar_ops(c2b_cigar_t **new_c, c2b_cigar_t *old_c);
     static void              c2b_sam_debug_cigar_ops(c2b_cigar_t *c);
     static void              c2b_sam_delete_cigar_ops(c2b_cigar_t *c);
-    static inline void       c2b_line_convert_sam_to_bed(c2b_sam_t s, char *dest_line, ssize_t *dest_size);
+    static inline void       c2b_line_convert_sam_ptr_to_bed(c2b_sam_t *s, char *dest_line, ssize_t *dest_size, boolean print_modified_qname);
     static void              c2b_line_convert_vcf_to_bed_unsorted(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
     static inline boolean    c2b_vcf_allele_is_id(char *s);
     static inline boolean    c2b_vcf_record_is_snv(char *ref, char *alt);

diff --git a/docs/content/revision-history.rst b/docs/content/revision-history.rst
@@ -19,24 +19,24 @@ Released: **TBD**
 
 * :ref:`bedmap <bedmap>`
 
-  * Measurement values in `bedmap` did not allow `+` in the exponent (both `-` worked and no `+` for a positive value.  Similarly, out in front of the number, `+` was previously not allowed. Shane Neph posted the report and fix.
+  * Measurement values in `bedmap` did not allow `+` in the exponent (both `-` worked and no `+` for a positive value.  Similarly, out in front of the number, `+` was previously not allowed. Shane Neph posted the report and fix.<br />
 
 
 * :ref:`bedops <bedops>`
 
-  * Fixed issue with `-chop` where complement operation could potentially be included. Shane Neph posted the fix.
+  * Fixed issue with `-chop` where complement operation could potentially be included. Shane Neph posted the fix.<br />
 
 
 * :ref:`sort-bed <sort-bed>`
 
   * Sorting of BED input now leads to unambiguous result when two or more elements have the same genomic interval (chromosome name and start and stop position), but different content in remaining columns (ID, score, etc.). 
 
-    Formerly, elements with the same genomic interval that have different content in fourth and subsequent columns could be printed in a non-consistent ordering on repeated sorts. A deterministic sort order facilitates the use of data integrity functions on sorted BED and Starch data.
+    Formerly, elements with the same genomic interval that have different content in fourth and subsequent columns could be printed in a non-consistent ordering on repeated sorts. A deterministic sort order facilitates the use of data integrity functions on sorted BED and Starch data.<br />
 
 
 * :ref:`starchcluster <starchcluster>`
 
-  * SLURM-ready version of `starchcluster` script added, to help SLURM job scheduler users with parallelizing the creation of Starch archives.
+  * SLURM-ready version of `starchcluster` script added, to help SLURM job scheduler users with parallelizing the creation of Starch archives.<br />
 
 
 * :ref:`unstarch <unstarch>`
@@ -53,7 +53,7 @@ Released: **TBD**
 
   * The output from the `--list-json` option includes a `signature` key in each chromosome record in the archive metadata, reporting the same information.
 
-  * The `--is-starch` option now quits with a non-zero exit code, if the specified input file is not a Starch archive.
+  * The `--is-starch` option now quits with a non-zero exit code, if the specified input file is not a Starch archive.<br />
 
 
 * :ref:`starch <starch>`
@@ -64,17 +64,35 @@ Released: **TBD**
 
   * Fixed `--header` transform bug reported in `Issue 161 <https://github.com/bedops/bedops/issues/161>`_. Thanks to Shane Neph for the bug report!
 
+  * Added chromosome name order test to `STARCH2_transformHeaderlessBEDInput` and `STARCH2_transformHeaderedBEDInput` functions. 
+
+    Compression with `starch` ends with a fatal error, should any of the following comparison tests fail:
+
+    1. The chromosome names are not lexicographically ordered (*e.g.*, `chr1` records coming after `chr2` records indicates the data are not correctly sorted).
+    2. The start position of an input element is less than the start position of a previous input element on the same chromosome (*e.g.*, `chr1:1000-1234` coming after `chr1:2000-2345` is not correctly sorted).
+    3. The stop positions of two or more input elements are not in ascending order when their start positions are equal (*e.g.*, `chr1:1000:1234` coming after `chr1:1000-2345` is not correctly sorted). 
+
+    If the sort order of the input data is unknown or uncertain, simply use `sort-bed <sort-bed>` to generate the correct ordering and pipe the output from that to `starch`, *e.g.* `$ cat elements.bed | sort-bed - | starch - > elements.starch`.<br />
+
 
 * :ref:`starchcat <starchcat>`
 
   * Added `--report-progress=N` option to (optionally) report compression of the Nth element of the current chromosome to standard error stream.
 
-  * As in `starch`, at the conclusion of compressing a chromosome made from one or more input Starch archives, the input Starch-transform bytes are continually run through a SHA-1 hash function. The resulting data integrity signature is stored as a Base64-encoded string in the chromosome's entry in the new archive's metadata.
+  * As in `starch`, at the conclusion of compressing a chromosome made from one or more input Starch archives, the input Starch-transform bytes are continually run through a SHA-1 hash function. The resulting data integrity signature is stored as a Base64-encoded string in the chromosome's entry in the new archive's metadata.<br />
 
 
 * :ref:`convert2bed <convert2bed>`
 
-  * Switched to handle dynamic number of CIGAR operations as reported in `Issue 157 <https://github.com/bedops/bedops/issues/157>`_.
+  * Improvements in support for BAM/SAM inputs with larger-sized reads, as would come from alignments made from data collected from third-generation sequencers. Simulated read datasets were generated using `SimLoRD <https://bitbucket.org/genomeinformatics/simlord/>`_. Tests have been performed on simulated hg19 data up to 100kb read lengths.
+
+    Improvements allow:
+
+    * conversion of dynamic number of CIGAR operations (up to system memory)
+
+    * conversion of dynamically-sized read fields (up to system memory and/or interthread buffer size limit)
+
+    These patches follow up on bug reports in `Issue 157 <https://github.com/bedops/bedops/issues/157>`_.<br />
 
 
 =================

diff --git a/interfaces/general-headers/data/starch/starchMetadataHelpers.h b/interfaces/general-headers/data/starch/starchMetadataHelpers.h
@@ -288,6 +288,9 @@ ArchiveVersion * STARCH_copyArchiveVersion(const ArchiveVersion *oav);
 int              STARCH_chromosomeInMetadataRecords(const Metadata *md, 
                                                         const char *chr);
 
+int              STARCH_chromosomePositionedBeforeExistingMetadataRecord(const Metadata *md, 
+                                                                             const char *chr);
+
 #ifdef __cplusplus
 } // namespace starch
 #endif

diff --git a/interfaces/src/data/starch/starchHelpers.c b/interfaces/src/data/starch/starchHelpers.c
@@ -1843,11 +1843,21 @@ STARCH2_transformHeaderedBEDInput(const FILE *inFp, Metadata **md, const Compres
                     if (prevChromosome) 
                     {
 #ifdef __cplusplus
-		        if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS) {
+                        if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
 #else
-		        if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS) {
+                        if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
 #endif
-	    	            fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
+                        {
+                            fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
+                            return STARCH_FATAL_ERROR;
+                        }
+#ifdef __cplusplus
+                        if (STARCH_chromosomePositionedBeforeExistingMetadataRecord(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
+#else
+                        if (STARCH_chromosomePositionedBeforeExistingMetadataRecord((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
+#endif
+                        {
+                            fprintf(stderr, "ERROR: Chromosome name not ordered lexicographically. Possible sorting issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
                             return STARCH_FATAL_ERROR;
                         }
                         sprintf(compressedFn, "%s.%s", prevChromosome, tag);
@@ -2908,13 +2918,21 @@ STARCH2_transformHeaderlessBEDInput(const FILE *inFp, Metadata **md, const Compr
                     if (prevChromosome) 
                     {
 #ifdef __cplusplus
-                        if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS) 
-                        {
+                        if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
 #else
-                        if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS) 
+                        if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
+#endif
                         {
+                            fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
+                            return STARCH_FATAL_ERROR;
+                        }
+#ifdef __cplusplus
+                        if (STARCH_chromosomePositionedBeforeExistingMetadataRecord(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
+#else
+                        if (STARCH_chromosomePositionedBeforeExistingMetadataRecord((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
 #endif
-    	    	            fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
+                        {
+                            fprintf(stderr, "ERROR: Chromosome name not ordered lexicographically. Possible sorting issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
                             return STARCH_FATAL_ERROR;
                         }
                         sprintf(compressedFn, "%s.%s", prevChromosome, tag);

diff --git a/interfaces/src/data/starch/starchMetadataHelpers.c b/interfaces/src/data/starch/starchMetadataHelpers.c
@@ -1953,6 +1953,26 @@ STARCH_chromosomeInMetadataRecords(const Metadata *md,
     return STARCH_EXIT_FAILURE;
 }
 
+int
+STARCH_chromosomePositionedBeforeExistingMetadataRecord(const Metadata *md, 
+                                                        const char *chr) {
+#ifdef DEBUG
+    fprintf(stderr, "\n--- STARCH_chromosomePositionedBeforeExistingMetadataRecord() ---\n");
+#endif
+    const Metadata *iter;
+
+    if (!md) {
+        fprintf(stderr, "ERROR: Could not list chromosomes (metadata structure is empty)\n");
+        return STARCH_EXIT_FAILURE;
+    }
+
+    for (iter = md; iter != NULL; iter = iter->next)
+        if (strcmp(chr, iter->chromosome) < 0)
+        return STARCH_EXIT_SUCCESS;
+
+    return STARCH_EXIT_FAILURE;
+}
+
 #ifdef __cplusplus
 } // namespace starch
 #endif