Skip to content

Commit

Permalink
switched to handle dynamic number of CIGAR operations as reported in …
Browse files Browse the repository at this point in the history
  • Loading branch information
alexpreynolds committed Oct 30, 2016
1 parent 268e4c1 commit f890352
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 37 deletions.
110 changes: 74 additions & 36 deletions applications/bed/conversion/src/convert2bed.c
Expand Up @@ -2347,8 +2347,8 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t
}
else {
/* copy header line to destination stream buffer */
char src_header_line_str[C2B_MAX_LINE_LENGTH_VALUE];
char dest_header_line_str[C2B_MAX_LINE_LENGTH_VALUE];
char src_header_line_str[C2B_MAX_LINE_LENGTH_VALUE] = {0};
char dest_header_line_str[C2B_MAX_LINE_LENGTH_VALUE] = {0};
memcpy(src_header_line_str, src, src_size);
src_header_line_str[src_size] = '\0';
sprintf(dest_header_line_str, "%s\t%u\t%u\t%s\n", c2b_header_chr_name, c2b_globals.header_line_idx, (c2b_globals.header_line_idx + 1), src_header_line_str);
Expand Down Expand Up @@ -2387,7 +2387,7 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t
*/

ssize_t cigar_size = sam_field_offsets[5] - sam_field_offsets[4];
char cigar_str[C2B_MAX_FIELD_LENGTH_VALUE];
char cigar_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(cigar_str, src + sam_field_offsets[4] + 1, cigar_size - 1);
cigar_str[cigar_size - 1] = '\0';
c2b_sam_cigar_str_to_ops(cigar_str);
Expand All @@ -2405,7 +2405,7 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t
*/

ssize_t flag_size = sam_field_offsets[1] - sam_field_offsets[0];
char flag_src_str[C2B_MAX_FIELD_LENGTH_VALUE];
char flag_src_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(flag_src_str, src + sam_field_offsets[0] + 1, flag_size);
flag_src_str[flag_size] = '\0';
int flag_val = (int) strtol(flag_src_str, NULL, 10);
Expand All @@ -2418,14 +2418,14 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t
*/

/* RNAME */
char rname_str[C2B_MAX_FIELD_LENGTH_VALUE];
char rname_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
if (is_mapped) {
ssize_t rname_size = sam_field_offsets[2] - sam_field_offsets[1] - 1;
memcpy(rname_str, src + sam_field_offsets[1] + 1, rname_size);
rname_str[rname_size] = '\0';
}
else {
char unmapped_read_chr_str[C2B_MAX_FIELD_LENGTH_VALUE];
char unmapped_read_chr_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(unmapped_read_chr_str, c2b_unmapped_read_chr_name, strlen(c2b_unmapped_read_chr_name));
unmapped_read_chr_str[strlen(c2b_unmapped_read_chr_name)] = '\t';
unmapped_read_chr_str[strlen(c2b_unmapped_read_chr_name) + 1] = '\0';
Expand All @@ -2435,14 +2435,14 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t

/* POS */
ssize_t pos_size = sam_field_offsets[3] - sam_field_offsets[2];
char pos_src_str[C2B_MAX_FIELD_LENGTH_VALUE];
char pos_src_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(pos_src_str, src + sam_field_offsets[2] + 1, pos_size - 1);
pos_src_str[pos_size - 1] = '\0';
uint64_t pos_val = strtoull(pos_src_str, NULL, 10);
uint64_t start_val = pos_val - 1; /* remember, start = POS - 1 */

/* QNAME */
char qname_str[C2B_MAX_FIELD_LENGTH_VALUE];
char qname_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t qname_size = sam_field_offsets[0];
memcpy(qname_str, src, qname_size);
qname_str[qname_size] = '\0';
Expand All @@ -2453,43 +2453,43 @@ c2b_line_convert_sam_to_bed_unsorted_without_split_operation(char *dest, ssize_t
sprintf(strand_str, "%c", (strand_val == 0x10) ? '-' : '+');

/* MAPQ */
char mapq_str[C2B_MAX_FIELD_LENGTH_VALUE];
char mapq_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t mapq_size = sam_field_offsets[4] - sam_field_offsets[3] - 1;
memcpy(mapq_str, src + sam_field_offsets[3] + 1, mapq_size);
mapq_str[mapq_size] = '\0';

/* RNEXT */
char rnext_str[C2B_MAX_FIELD_LENGTH_VALUE];
char rnext_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t rnext_size = sam_field_offsets[6] - sam_field_offsets[5] - 1;
memcpy(rnext_str, src + sam_field_offsets[5] + 1, rnext_size);
rnext_str[rnext_size] = '\0';

/* PNEXT */
char pnext_str[C2B_MAX_FIELD_LENGTH_VALUE];
char pnext_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t pnext_size = sam_field_offsets[7] - sam_field_offsets[6] - 1;
memcpy(pnext_str, src + sam_field_offsets[6] + 1, pnext_size);
pnext_str[pnext_size] = '\0';

/* TLEN */
char tlen_str[C2B_MAX_FIELD_LENGTH_VALUE];
char tlen_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t tlen_size = sam_field_offsets[8] - sam_field_offsets[7] - 1;
memcpy(tlen_str, src + sam_field_offsets[7] + 1, tlen_size);
tlen_str[tlen_size] = '\0';

/* SEQ */
char seq_str[C2B_MAX_FIELD_LENGTH_VALUE];
char seq_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t seq_size = sam_field_offsets[9] - sam_field_offsets[8] - 1;
memcpy(seq_str, src + sam_field_offsets[8] + 1, seq_size);
seq_str[seq_size] = '\0';

/* QUAL */
char qual_str[C2B_MAX_FIELD_LENGTH_VALUE];
char qual_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t qual_size = sam_field_offsets[10] - sam_field_offsets[9] - 1;
memcpy(qual_str, src + sam_field_offsets[9] + 1, qual_size);
qual_str[qual_size] = '\0';

/* Optional fields */
char opt_str[C2B_MAX_FIELD_LENGTH_VALUE];
char opt_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
if (sam_field_offsets[11] != -1) {
opt_str[0] = '\0';
for (int field_idx = 11; field_idx <= sam_field_idx; field_idx++) {
Expand Down Expand Up @@ -2566,10 +2566,10 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d
}
else {
/* copy header line to destination stream buffer */
char src_header_line_str[C2B_MAX_LINE_LENGTH_VALUE];
char dest_header_line_str[C2B_MAX_LINE_LENGTH_VALUE];
char src_header_line_str[C2B_MAX_LINE_LENGTH_VALUE] = {0};
char dest_header_line_str[C2B_MAX_LINE_LENGTH_VALUE] = {0};
memcpy(src_header_line_str, src, src_size);
src_header_line_str[src_size] = '\0';
//src_header_line_str[src_size] = '\0';
sprintf(dest_header_line_str, "%s\t%u\t%u\t%s\n", c2b_header_chr_name, c2b_globals.header_line_idx, (c2b_globals.header_line_idx + 1), src_header_line_str);
memcpy(dest + *dest_size, dest_header_line_str, strlen(dest_header_line_str));
*dest_size += strlen(dest_header_line_str);
Expand Down Expand Up @@ -2606,7 +2606,7 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d
*/

ssize_t cigar_size = sam_field_offsets[5] - sam_field_offsets[4];
char cigar_str[C2B_MAX_FIELD_LENGTH_VALUE];
char cigar_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(cigar_str, src + sam_field_offsets[4] + 1, cigar_size - 1);
cigar_str[cigar_size - 1] = '\0';
c2b_sam_cigar_str_to_ops(cigar_str);
Expand All @@ -2624,7 +2624,7 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d
*/

ssize_t flag_size = sam_field_offsets[1] - sam_field_offsets[0];
char flag_src_str[C2B_MAX_FIELD_LENGTH_VALUE];
char flag_src_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(flag_src_str, src + sam_field_offsets[0] + 1, flag_size);
flag_src_str[flag_size] = '\0';
int flag_val = (int) strtol(flag_src_str, NULL, 10);
Expand All @@ -2637,14 +2637,14 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d
*/

/* RNAME */
char rname_str[C2B_MAX_FIELD_LENGTH_VALUE];
char rname_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
if (is_mapped) {
ssize_t rname_size = sam_field_offsets[2] - sam_field_offsets[1] - 1;
memcpy(rname_str, src + sam_field_offsets[1] + 1, rname_size);
rname_str[rname_size] = '\0';
}
else {
char unmapped_read_chr_str[C2B_MAX_FIELD_LENGTH_VALUE];
char unmapped_read_chr_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(unmapped_read_chr_str, c2b_unmapped_read_chr_name, strlen(c2b_unmapped_read_chr_name));
unmapped_read_chr_str[strlen(c2b_unmapped_read_chr_name)] = '\t';
unmapped_read_chr_str[strlen(c2b_unmapped_read_chr_name) + 1] = '\0';
Expand All @@ -2654,7 +2654,7 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d

/* POS */
ssize_t pos_size = sam_field_offsets[3] - sam_field_offsets[2];
char pos_src_str[C2B_MAX_FIELD_LENGTH_VALUE];
char pos_src_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
memcpy(pos_src_str, src + sam_field_offsets[2] + 1, pos_size - 1);
pos_src_str[pos_size - 1] = '\0';
uint64_t pos_val = strtoull(pos_src_str, NULL, 10);
Expand All @@ -2673,48 +2673,48 @@ c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *d
sprintf(strand_str, "%c", (strand_val == 0x10) ? '-' : '+');

/* MAPQ */
char mapq_str[C2B_MAX_FIELD_LENGTH_VALUE];
char mapq_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t mapq_size = sam_field_offsets[4] - sam_field_offsets[3] - 1;
memcpy(mapq_str, src + sam_field_offsets[3] + 1, mapq_size);
mapq_str[mapq_size] = '\0';

/* RNEXT */
char rnext_str[C2B_MAX_FIELD_LENGTH_VALUE];
char rnext_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t rnext_size = sam_field_offsets[6] - sam_field_offsets[5] - 1;
memcpy(rnext_str, src + sam_field_offsets[5] + 1, rnext_size);
rnext_str[rnext_size] = '\0';

/* PNEXT */
char pnext_str[C2B_MAX_FIELD_LENGTH_VALUE];
char pnext_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t pnext_size = sam_field_offsets[7] - sam_field_offsets[6] - 1;
memcpy(pnext_str, src + sam_field_offsets[6] + 1, pnext_size);
pnext_str[pnext_size] = '\0';

/* TLEN */
char tlen_str[C2B_MAX_FIELD_LENGTH_VALUE];
char tlen_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t tlen_size = sam_field_offsets[8] - sam_field_offsets[7] - 1;
memcpy(tlen_str, src + sam_field_offsets[7] + 1, tlen_size);
tlen_str[tlen_size] = '\0';

/* SEQ */
char seq_str[C2B_MAX_FIELD_LENGTH_VALUE];
char seq_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t seq_size = sam_field_offsets[9] - sam_field_offsets[8] - 1;
memcpy(seq_str, src + sam_field_offsets[8] + 1, seq_size);
seq_str[seq_size] = '\0';

/* QUAL */
char qual_str[C2B_MAX_FIELD_LENGTH_VALUE];
char qual_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
ssize_t qual_size = sam_field_offsets[10] - sam_field_offsets[9] - 1;
memcpy(qual_str, src + sam_field_offsets[9] + 1, qual_size);
qual_str[qual_size] = '\0';

/* Optional fields */
char opt_str[C2B_MAX_FIELD_LENGTH_VALUE];
char opt_str[C2B_MAX_FIELD_LENGTH_VALUE] = {0};
if (sam_field_offsets[11] != -1) {
for (int field_idx = 11; field_idx <= sam_field_idx; field_idx++) {
ssize_t opt_size = sam_field_offsets[field_idx] - sam_field_offsets[field_idx - 1] - (field_idx == sam_field_idx ? 1 : 0);
memcpy(opt_str + strlen(opt_str), src + sam_field_offsets[field_idx - 1] + 1, opt_size);
opt_str[strlen(opt_str) + opt_size] = '\0';
opt_str[strlen(opt_str) + opt_size] = '\0';
}
}

Expand Down Expand Up @@ -2803,6 +2803,13 @@ c2b_sam_cigar_str_to_ops(char *s)
op_idx++;
operation_flag = kFalse;
bases_flag = kTrue;
/* if op_idx >= size property of CIGAR entity, we need to resize this entity */
if (op_idx >= c2b_globals.sam->cigar->size) {
c2b_cigar_t *resized_cigar = NULL;
c2b_sam_resize_cigar_ops(&resized_cigar, c2b_globals.sam->cigar);
c2b_sam_delete_cigar_ops(c2b_globals.sam->cigar);
c2b_globals.sam->cigar = resized_cigar;
}
}
curr_bases_field[bases_idx++] = curr_char;
curr_bases_field[bases_idx] = '\0';
Expand Down Expand Up @@ -2834,20 +2841,50 @@ c2b_sam_init_cigar_ops(c2b_cigar_t **c, const ssize_t size)
c2b_print_usage(stderr);
exit(ENOMEM); /* Not enough space (POSIX.1) */
}
(*c)->ops = malloc(size * sizeof(c2b_cigar_op_t));
(*c)->size = size;
(*c)->ops = malloc((*c)->size * sizeof(c2b_cigar_op_t));
if (!(*c)->ops) {
fprintf(stderr, "Error: Could not allocate space for CIGAR struct operation pointer\n");
fprintf(stderr, "Error: Could not allocate space for CIGAR struct malloc operation pointer\n");
c2b_print_usage(stderr);
exit(ENOMEM); /* Not enough space (POSIX.1) */
}
(*c)->size = size;
(*c)->length = 0;
for (ssize_t idx = 0; idx < size; idx++) {
(*c)->ops[idx].bases = default_cigar_op_bases;
(*c)->ops[idx].operation = default_cigar_op_operation;
}
}

static void
c2b_sam_resize_cigar_ops(c2b_cigar_t **new_c, c2b_cigar_t *old_c)
{
*new_c = malloc(sizeof(c2b_cigar_t));
if (!*new_c) {
fprintf(stderr, "Error: Could not allocate space for larger CIGAR struct pointer\n");
c2b_print_usage(stderr);
exit(ENOMEM); /* Not enough space (POSIX.1) */
}
/* we increment the size of the input CIGAR entity by C2B_SAM_CIGAR_OPS_VALUE_INCREMENT */
(*new_c)->size = old_c->size + C2B_SAM_CIGAR_OPS_VALUE_INCREMENT;
(*new_c)->ops = malloc((*new_c)->size * sizeof(c2b_cigar_op_t));
if (!(*new_c)->ops) {
fprintf(stderr, "Error: Could not allocate space for larger CIGAR struct malloc operation pointer\n");
c2b_print_usage(stderr);
exit(ENOMEM); /* Not enough space (POSIX.1) */
}
(*new_c)->length = old_c->length;
/* copy operations from input CIGAR entity to new entity */
for (ssize_t idx = 0; idx < old_c->size; idx++) {
(*new_c)->ops[idx].bases = old_c->ops[idx].bases;
(*new_c)->ops[idx].operation = old_c->ops[idx].operation;
}
/* set default base and operation values for new entity */
for (ssize_t idx = old_c->size; idx < (*new_c)->size; idx++) {
(*new_c)->ops[idx].bases = default_cigar_op_bases;
(*new_c)->ops[idx].operation = default_cigar_op_operation;
}
}

/*
specifying special attribute for c2b_sam_debug_cigar_ops() to avoid: "warning: unused
function 'c2b_sam_debug_cigar_ops' [-Wunused-function]" message during non-debug compilation
Expand Down Expand Up @@ -2875,7 +2912,8 @@ c2b_sam_delete_cigar_ops(c2b_cigar_t *c)
if (c->ops) {
free(c->ops), c->ops = NULL;
}
c->length = 0, c->size = 0;
c->length = 0;
c->size = 0;
free(c), c = NULL;
}
}
Expand Down Expand Up @@ -4751,7 +4789,7 @@ c2b_init_global_sam_state()

c2b_globals.sam->samtools_path = NULL;

c2b_globals.sam->cigar = NULL, c2b_sam_init_cigar_ops(&(c2b_globals.sam->cigar), C2B_MAX_OPERATIONS_VALUE);
c2b_globals.sam->cigar = NULL, c2b_sam_init_cigar_ops(&(c2b_globals.sam->cigar), C2B_SAM_CIGAR_OPS_VALUE_INITIAL);

#ifdef DEBUG
fprintf(stderr, "--- c2b_init_global_sam_state() - exit ---\n");
Expand Down
4 changes: 3 additions & 1 deletion applications/bed/conversion/src/convert2bed.h
Expand Up @@ -66,12 +66,13 @@ const boolean kFalse = 0;
#define C2B_MAX_LINE_LENGTH_VALUE 131072
#define C2B_MAX_LONGER_LINE_LENGTH_VALUE 1048576
#define C2B_MAX_LINES_VALUE 32
#define C2B_MAX_OPERATIONS_VALUE 1024
#define C2B_MAX_CHROMOSOME_LENGTH 32
#define C2B_MAX_PSL_BLOCKS 1024
#define C2B_MAX_PSL_BLOCK_SIZES_STRING_LENGTH 20
#define C2B_MAX_PSL_T_STARTS_STRING_LENGTH 20
#define C2B_MAX_VCF_FIELD_COUNT_VALUE 24576
#define C2B_SAM_CIGAR_OPS_VALUE_INITIAL 1
#define C2B_SAM_CIGAR_OPS_VALUE_INCREMENT 1000

extern const char *c2b_samtools;
extern const char *c2b_sort_bed;
Expand Down Expand Up @@ -1366,6 +1367,7 @@ extern "C" {
static void c2b_line_convert_sam_to_bed_unsorted_with_split_operation(char *dest, ssize_t *dest_size, char *src, ssize_t src_size);
static inline void c2b_sam_cigar_str_to_ops(char *s);
static void c2b_sam_init_cigar_ops(c2b_cigar_t **c, const ssize_t size);
static void c2b_sam_resize_cigar_ops(c2b_cigar_t **new_c, c2b_cigar_t *old_c);
static void c2b_sam_debug_cigar_ops(c2b_cigar_t *c);
static void c2b_sam_delete_cigar_ops(c2b_cigar_t *c);
static inline void c2b_line_convert_sam_to_bed(c2b_sam_t s, char *dest_line, ssize_t *dest_size);
Expand Down
10 changes: 10 additions & 0 deletions docs/content/revision-history.rst
Expand Up @@ -22,6 +22,11 @@ Released: **TBD**
* Measurement values in `bedmap` did not allow `+` in the exponent (both `-` worked and no `+` for a positive value. Similarly, out in front of the number, `+` was previously not allowed. Shane Neph posted the report and fix.


* :ref:`bedops <bedops>`

* Fixed issue with `-chop` where complement operation could potentially be included. Shane Neph posted the fix.


* :ref:`sort-bed <sort-bed>`

* Sorting of BED input now leads to unambiguous result when two or more elements have the same genomic interval (chromosome name and start and stop position), but different content in remaining columns (ID, score, etc.).
Expand Down Expand Up @@ -67,6 +72,11 @@ Released: **TBD**
* As in `starch`, at the conclusion of compressing a chromosome made from one or more input Starch archives, the input Starch-transform bytes are continually run through a SHA-1 hash function. The resulting data integrity signature is stored as a Base64-encoded string in the chromosome's entry in the new archive's metadata.


* :ref:`convert2bed <convert2bed>`

* Switched to handle dynamic number of CIGAR operations as reported in `Issue 157 <https://github.com/bedops/bedops/issues/157>`_.


=================
Previous versions
=================
Expand Down

0 comments on commit f890352

Please sign in to comment.