Skip to content

Commit

Permalink
added support for conversion of very-long reads from third-generation…
Browse files Browse the repository at this point in the history
… sequencer BAM data to BED format by way of dynamic memory allocation support in bam/sam2bed routines; enforcement of minimal BED sort order in starch input
  • Loading branch information
alexpreynolds committed Nov 4, 2016
1 parent 587cefb commit 502726e
Show file tree
Hide file tree
Showing 6 changed files with 735 additions and 192 deletions.
817 changes: 640 additions & 177 deletions applications/bed/conversion/src/convert2bed.c

Large diffs are not rendered by default.

23 changes: 22 additions & 1 deletion applications/bed/conversion/src/convert2bed.h
Expand Up @@ -73,6 +73,9 @@ const boolean kFalse = 0;
#define C2B_MAX_VCF_FIELD_COUNT_VALUE 24576
#define C2B_SAM_CIGAR_OPS_VALUE_INITIAL 1
#define C2B_SAM_CIGAR_OPS_VALUE_INCREMENT 1000
#define C2B_SAM_ELEMENT_FIELD_LENGTH_VALUE_INITIAL 32
#define C2B_SAM_ELEMENT_FIELD_LENGTH_VALUE_EXTENSION 32
#define C2B_TEST_BUFFER_SIZE 5000000

extern const char *c2b_samtools;
extern const char *c2b_sort_bed;
Expand Down Expand Up @@ -255,19 +258,33 @@ const char default_cigar_op_operation = '-';

typedef struct sam {
char *qname;
ssize_t qname_capacity;
char *modified_qname;
ssize_t modified_qname_capacity;
int flag;
char *strand;
ssize_t strand_capacity;
char *rname;
ssize_t rname_capacity;
uint64_t start;
uint64_t stop;
char *mapq;
ssize_t mapq_capacity;
char *cigar;
ssize_t cigar_capacity;
char *rnext;
ssize_t rnext_capacity;
char *pnext;
ssize_t pnext_capacity;
char *tlen;
ssize_t tlen_capacity;
char *seq;
ssize_t seq_capacity;
char *qual;
ssize_t qual_capacity;
char *opt;
ssize_t opt_length;
ssize_t opt_capacity;
} c2b_sam_t;

/*
Expand Down Expand Up @@ -553,6 +570,7 @@ typedef struct pipeline_stage {
int status;
char *description;
pid_t pid;
ssize_t buffer_size;
} c2b_pipeline_stage_t;

#define PIPE4_FLAG_NONE (0U)
Expand Down Expand Up @@ -1231,6 +1249,7 @@ typedef struct rmsk_state {
typedef struct sam_state {
char *samtools_path;
c2b_cigar_t *cigar;
c2b_sam_t *element;
} c2b_sam_state_t;

typedef struct vcf_state {
Expand Down Expand Up @@ -1366,11 +1385,13 @@ extern "C" {
static void c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
static void c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
static inline void c2b_sam_cigar_str_to_ops(char *s);
static void c2b_sam_init_element(c2b_sam_t **e);
static void c2b_sam_delete_element(c2b_sam_t *e);
static void c2b_sam_init_cigar_ops(c2b_cigar_t **c, const ssize_t size);
static void c2b_sam_resize_cigar_ops(c2b_cigar_t **new_c, c2b_cigar_t *old_c);
static void c2b_sam_debug_cigar_ops(c2b_cigar_t *c);
static void c2b_sam_delete_cigar_ops(c2b_cigar_t *c);
static inline void c2b_line_convert_sam_to_bed(c2b_sam_t s, char *dest_line, ssize_t *dest_size);
static inline void c2b_line_convert_sam_ptr_to_bed(c2b_sam_t *s, char *dest_line, ssize_t *dest_size, boolean print_modified_qname);
static void c2b_line_convert_vcf_to_bed_unsorted(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
static inline boolean c2b_vcf_allele_is_id(char *s);
static inline boolean c2b_vcf_record_is_snv(char *ref, char *alt);
Expand Down
32 changes: 25 additions & 7 deletions docs/content/revision-history.rst
Expand Up @@ -19,24 +19,24 @@ Released: **TBD**

* :ref:`bedmap <bedmap>`

* Measurement values in `bedmap` did not allow `+` in the exponent (both `-` worked and no `+` for a positive value. Similarly, out in front of the number, `+` was previously not allowed. Shane Neph posted the report and fix.
* Measurement values in `bedmap` did not allow `+` in the exponent (both `-` worked and no `+` for a positive value. Similarly, out in front of the number, `+` was previously not allowed. Shane Neph posted the report and fix.<br />


* :ref:`bedops <bedops>`

* Fixed issue with `-chop` where complement operation could potentially be included. Shane Neph posted the fix.
* Fixed issue with `-chop` where complement operation could potentially be included. Shane Neph posted the fix.<br />


* :ref:`sort-bed <sort-bed>`

* Sorting of BED input now leads to unambiguous result when two or more elements have the same genomic interval (chromosome name and start and stop position), but different content in remaining columns (ID, score, etc.).

Formerly, elements with the same genomic interval that have different content in fourth and subsequent columns could be printed in a non-consistent ordering on repeated sorts. A deterministic sort order facilitates the use of data integrity functions on sorted BED and Starch data.
Formerly, elements with the same genomic interval that have different content in fourth and subsequent columns could be printed in a non-consistent ordering on repeated sorts. A deterministic sort order facilitates the use of data integrity functions on sorted BED and Starch data.<br />


* :ref:`starchcluster <starchcluster>`

* SLURM-ready version of `starchcluster` script added, to help SLURM job scheduler users with parallelizing the creation of Starch archives.
* SLURM-ready version of `starchcluster` script added, to help SLURM job scheduler users with parallelizing the creation of Starch archives.<br />


* :ref:`unstarch <unstarch>`
Expand All @@ -53,7 +53,7 @@ Released: **TBD**

* The output from the `--list-json` option includes a `signature` key in each chromosome record in the archive metadata, reporting the same information.

* The `--is-starch` option now quits with a non-zero exit code, if the specified input file is not a Starch archive.
* The `--is-starch` option now quits with a non-zero exit code, if the specified input file is not a Starch archive.<br />


* :ref:`starch <starch>`
Expand All @@ -64,17 +64,35 @@ Released: **TBD**

* Fixed `--header` transform bug reported in `Issue 161 <https://github.com/bedops/bedops/issues/161>`_. Thanks to Shane Neph for the bug report!

* Added chromosome name order test to `STARCH2_transformHeaderlessBEDInput` and `STARCH2_transformHeaderedBEDInput` functions.

Compression with `starch` ends with a fatal error, should any of the following comparison tests fail:

1. The chromosome names are not lexicographically ordered (*e.g.*, `chr1` records coming after `chr2` records indicates the data are not correctly sorted).
2. The start position of an input element is less than the start position of a previous input element on the same chromosome (*e.g.*, `chr1:1000-1234` coming after `chr1:2000-2345` is not correctly sorted).
3. The stop positions of two or more input elements are not in ascending order when their start positions are equal (*e.g.*, `chr1:1000:1234` coming after `chr1:1000-2345` is not correctly sorted).

If the sort order of the input data is unknown or uncertain, simply use `sort-bed <sort-bed>` to generate the correct ordering and pipe the output from that to `starch`, *e.g.* `$ cat elements.bed | sort-bed - | starch - > elements.starch`.<br />


* :ref:`starchcat <starchcat>`

* Added `--report-progress=N` option to (optionally) report compression of the Nth element of the current chromosome to standard error stream.

* As in `starch`, at the conclusion of compressing a chromosome made from one or more input Starch archives, the input Starch-transform bytes are continually run through a SHA-1 hash function. The resulting data integrity signature is stored as a Base64-encoded string in the chromosome's entry in the new archive's metadata.
* As in `starch`, at the conclusion of compressing a chromosome made from one or more input Starch archives, the input Starch-transform bytes are continually run through a SHA-1 hash function. The resulting data integrity signature is stored as a Base64-encoded string in the chromosome's entry in the new archive's metadata.<br />


* :ref:`convert2bed <convert2bed>`

* Switched to handle dynamic number of CIGAR operations as reported in `Issue 157 <https://github.com/bedops/bedops/issues/157>`_.
* Improvements in support for BAM/SAM inputs with larger-sized reads, as would come from alignments made from data collected from third-generation sequencers. Simulated read datasets were generated using `SimLoRD <https://bitbucket.org/genomeinformatics/simlord/>`_. Tests have been performed on simulated hg19 data up to 100kb read lengths.

Improvements allow:

* conversion of dynamic number of CIGAR operations (up to system memory)

* conversion of dynamically-sized read fields (up to system memory and/or interthread buffer size limit)

These patches follow up on bug reports in `Issue 157 <https://github.com/bedops/bedops/issues/157>`_.<br />


=================
Expand Down
Expand Up @@ -288,6 +288,9 @@ ArchiveVersion * STARCH_copyArchiveVersion(const ArchiveVersion *oav);
int STARCH_chromosomeInMetadataRecords(const Metadata *md,
const char *chr);

int STARCH_chromosomePositionedBeforeExistingMetadataRecord(const Metadata *md,
const char *chr);

#ifdef __cplusplus
} // namespace starch
#endif
Expand Down
32 changes: 25 additions & 7 deletions interfaces/src/data/starch/starchHelpers.c
Expand Up @@ -1843,11 +1843,21 @@ STARCH2_transformHeaderedBEDInput(const FILE *inFp, Metadata **md, const Compres
if (prevChromosome)
{
#ifdef __cplusplus
if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS) {
if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
#else
if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS) {
if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
#endif
fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
{
fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
return STARCH_FATAL_ERROR;
}
#ifdef __cplusplus
if (STARCH_chromosomePositionedBeforeExistingMetadataRecord(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
#else
if (STARCH_chromosomePositionedBeforeExistingMetadataRecord((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
#endif
{
fprintf(stderr, "ERROR: Chromosome name not ordered lexicographically. Possible sorting issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
return STARCH_FATAL_ERROR;
}
sprintf(compressedFn, "%s.%s", prevChromosome, tag);
Expand Down Expand Up @@ -2908,13 +2918,21 @@ STARCH2_transformHeaderlessBEDInput(const FILE *inFp, Metadata **md, const Compr
if (prevChromosome)
{
#ifdef __cplusplus
if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
{
if (STARCH_chromosomeInMetadataRecords(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
#else
if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
if (STARCH_chromosomeInMetadataRecords((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
#endif
{
fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
return STARCH_FATAL_ERROR;
}
#ifdef __cplusplus
if (STARCH_chromosomePositionedBeforeExistingMetadataRecord(reinterpret_cast<const Metadata *>( firstRecord ), chromosome) == STARCH_EXIT_SUCCESS)
#else
if (STARCH_chromosomePositionedBeforeExistingMetadataRecord((const Metadata *)firstRecord, chromosome) == STARCH_EXIT_SUCCESS)
#endif
fprintf(stderr, "ERROR: Found same chromosome in earlier portion of file. Possible interleaving issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
{
fprintf(stderr, "ERROR: Chromosome name not ordered lexicographically. Possible sorting issue? Be sure to first sort input with sort-bed or remove --do-not-sort option from conversion script.\n");
return STARCH_FATAL_ERROR;
}
sprintf(compressedFn, "%s.%s", prevChromosome, tag);
Expand Down
20 changes: 20 additions & 0 deletions interfaces/src/data/starch/starchMetadataHelpers.c
Expand Up @@ -1953,6 +1953,26 @@ STARCH_chromosomeInMetadataRecords(const Metadata *md,
return STARCH_EXIT_FAILURE;
}

int
STARCH_chromosomePositionedBeforeExistingMetadataRecord(const Metadata *md,
const char *chr) {
#ifdef DEBUG
fprintf(stderr, "\n--- STARCH_chromosomePositionedBeforeExistingMetadataRecord() ---\n");
#endif
const Metadata *iter;

if (!md) {
fprintf(stderr, "ERROR: Could not list chromosomes (metadata structure is empty)\n");
return STARCH_EXIT_FAILURE;
}

for (iter = md; iter != NULL; iter = iter->next)
if (strcmp(chr, iter->chromosome) < 0)
return STARCH_EXIT_SUCCESS;

return STARCH_EXIT_FAILURE;
}

#ifdef __cplusplus
} // namespace starch
#endif

0 comments on commit 502726e

Please sign in to comment.