From fd0a990ad8a22b2c7a9b64702d67cdf8eeb7ab44 Mon Sep 17 00:00:00 2001 From: Adrian Tan Date: Fri, 6 Jan 2017 15:06:30 -0500 Subject: [PATCH] updated htslib, updated inverse quantile normalization in svm_train --- lib/htslib/Makefile | 2 +- lib/htslib/NEWS | 1 + lib/htslib/bgzip.c | 4 +- lib/htslib/cram/cram_samtools.c | 13 ++-- lib/htslib/cram/cram_samtools.h | 2 +- lib/htslib/htsfile.c | 4 +- lib/htslib/htslib/sam.h | 10 ++- lib/htslib/sam.c | 24 +++--- lib/htslib/tabix.c | 4 +- lib/htslib/test/sam.c | 64 +++++++++++++++- svm_train.cpp | 126 +++++++++++++++++++++++++++----- svm_train.h | 1 + 12 files changed, 208 insertions(+), 47 deletions(-) diff --git a/lib/htslib/Makefile b/lib/htslib/Makefile index 0cfd68b4..0fbe3f66 100644 --- a/lib/htslib/Makefile +++ b/lib/htslib/Makefile @@ -338,7 +338,7 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl check test: bgzip htsfile $(BUILT_TEST_PROGRAMS) test/fieldarith test/fieldarith.sam test/hfile - test/sam test/ce.fa test/faidx.fa + REF_PATH=: test/sam test/ce.fa test/faidx.fa test/test-regidx cd test && REF_PATH=: ./test.pl diff --git a/lib/htslib/NEWS b/lib/htslib/NEWS index 611422ea..eb970a3c 100644 --- a/lib/htslib/NEWS +++ b/lib/htslib/NEWS @@ -4,6 +4,7 @@ Noteworthy changes in release 1.4 in this release, and the shared library soversion has been bumped to 2. - bam_pileup1_t has an additional field (which holds user data) - bam1_core_t has been modified to allow for >64K CIGAR operations + and (along with bam1_t) so that CIGAR entries are aligned in memory - hopen() has vararg arguments for setting URL scheme-dependent options - the various tbx_conf_* presets are now const diff --git a/lib/htslib/bgzip.c b/lib/htslib/bgzip.c index 861f7026..a8b2953b 100644 --- a/lib/htslib/bgzip.c +++ b/lib/htslib/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2016 Genome Research Ltd. + Copyright (C) 2010, 2013-2017 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -125,7 +125,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2016 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': case '?': return bgzip_main_usage(); diff --git a/lib/htslib/cram/cram_samtools.c b/lib/htslib/cram/cram_samtools.c index 60a7b5fe..0effb020 100644 --- a/lib/htslib/cram/cram_samtools.c +++ b/lib/htslib/cram/cram_samtools.c @@ -74,11 +74,12 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, }; bam1_t *b = (bam1_t *)*bp; uint8_t *cp; - int i, bam_len; + int i, qname_nuls, bam_len; //b->l_aux = extra_len; // we fill this out later - bam_len = qname_len + 1 + ncigar*4 + (len+1)/2 + len + extra_len; + qname_nuls = 4 - qname_len%4; + bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len; if (b->m_data < bam_len) { b->m_data = bam_len; kroundup32(b->m_data); @@ -92,7 +93,8 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, b->core.pos = pos-1; b->core.bin = bam_reg2bin(pos-1, end); b->core.qual = mapq; - b->core.l_qname = qname_len+1; + b->core.l_qname = qname_len+qname_nuls; + b->core.l_extranul = qname_nuls-1; b->core.flag = flag; b->core.n_cigar = ncigar; b->core.l_qseq = len; @@ -103,8 +105,9 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, cp = b->data; strncpy((char *)cp, qname, qname_len); - cp[qname_len] = 0; - cp += qname_len+1; + for (i = 0; i < qname_nuls; i++) + cp[qname_len+i] = '\0'; + cp += qname_len+qname_nuls; memcpy(cp, cigar, ncigar*4); cp += ncigar*4; diff --git a/lib/htslib/cram/cram_samtools.h b/lib/htslib/cram/cram_samtools.h index 635e2e02..606c6015 100644 --- a/lib/htslib/cram/cram_samtools.h +++ b/lib/htslib/cram/cram_samtools.h @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define bam_flag(b) (b)->core.flag #define bam_bin(b) (b)->core.bin #define bam_map_qual(b) (b)->core.qual -#define bam_name_len(b) (b)->core.l_qname +#define bam_name_len(b) ((b)->core.l_qname - (b)->core.l_extranul) #define bam_name(b) bam_get_qname((b)) #define bam_qual(b) bam_get_qual((b)) #define bam_seq(b) bam_get_seq((b)) diff --git a/lib/htslib/htsfile.c b/lib/htslib/htsfile.c index 6224fdca..71297f78 100644 --- a/lib/htslib/htsfile.c +++ b/lib/htslib/htsfile.c @@ -1,6 +1,6 @@ /* htsfile.c -- file identifier and minimal viewer. - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: John Marshall @@ -187,7 +187,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2016 Genome Research Ltd.\n", +"Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/lib/htslib/htslib/sam.h b/lib/htslib/htslib/sam.h index d3ed7a0e..e8e580f3 100644 --- a/lib/htslib/htslib/sam.h +++ b/lib/htslib/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2016 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -141,6 +141,7 @@ typedef struct { @field qual mapping quality @field l_qname length of the query name @field flag bitwise flag + @field l_extranul length of extra NULs between qname & cigar (for alignment) @field n_cigar number of CIGAR operations @field l_qseq length of the query sequence (read) @field mtid chromosome ID of next read in template, defined by bam_hdr_t @@ -153,7 +154,8 @@ typedef struct { uint8_t qual; uint8_t l_qname; uint16_t flag; - uint16_t unused1; + uint8_t unused1; + uint8_t l_extranul; uint32_t n_cigar; int32_t l_qseq; int32_t mtid; @@ -170,7 +172,9 @@ typedef struct { @discussion Notes: - 1. qname is zero tailing and core.l_qname includes the tailing '\0'. + 1. qname is terminated by one to four NULs, so that the following + cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, + while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). 2. l_qseq is calculated from the total length of an alignment block on reading or from CIGAR. 3. cigar data is encoded 4 bytes per CIGAR operation. diff --git a/lib/htslib/sam.c b/lib/htslib/sam.c index fb3c7b25..bcc573fd 100644 --- a/lib/htslib/sam.c +++ b/lib/htslib/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2016 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2017 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -423,10 +423,11 @@ int bam_read1(BGZF *fp, bam1_t *b) } c->tid = x[0]; c->pos = x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; - b->l_data = block_len - 32; + b->l_data = block_len - 32 + c->l_extranul; if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4; if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data) return -4; @@ -437,8 +438,10 @@ int bam_read1(BGZF *fp, bam1_t *b) if (!b->data) return -4; } - if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4; - //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4; + for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0'; + c->l_qname += c->l_extranul; + if (bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return 4 + block_len; } @@ -446,7 +449,7 @@ int bam_read1(BGZF *fp, bam1_t *b) int bam_write1(BGZF *fp, const bam1_t *b) { const bam1_core_t *c = &b->core; - uint32_t x[8], block_len = b->l_data + 32, y; + uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; if (c->n_cigar >= 65536) { if (hts_verbose >= 1) @@ -456,7 +459,7 @@ int bam_write1(BGZF *fp, const bam1_t *b) } x[0] = c->tid; x[1] = c->pos; - x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); x[3] = (uint32_t)c->flag<<16 | c->n_cigar; x[4] = c->l_qseq; x[5] = c->mtid; @@ -472,7 +475,8 @@ int bam_write1(BGZF *fp, const bam1_t *b) if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); - if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0); + if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); + if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 4 + block_len : -1; } @@ -893,7 +897,9 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) _parse_warn(p - q <= 1, "empty query name"); _parse_err(p - q > 255, "query name too long"); kputsn_(q, p - q, &str); - c->l_qname = p - q; + for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++) + kputc_('\0', &str); + c->l_qname = p - q + c->l_extranul; // flag c->flag = strtol(p, &p, 0); if (*p++ != '\t') goto err_ret; // malformated flag @@ -1112,7 +1118,7 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) const bam1_core_t *c = &b->core; str->l = 0; - kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name + kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name kputw(c->flag, str); kputc('\t', str); // flag if (c->tid >= 0) { // chr kputs(h->target_name[c->tid] , str); diff --git a/lib/htslib/tabix.c b/lib/htslib/tabix.c index 55a499d6..5050adf6 100644 --- a/lib/htslib/tabix.c +++ b/lib/htslib/tabix.c @@ -1,7 +1,7 @@ /* tabix.c -- Generic indexer for TAB-delimited genome position files. Copyright (C) 2009-2011 Broad Institute. - Copyright (C) 2010-2012, 2014-2016 Genome Research Ltd. + Copyright (C) 2010-2012, 2014-2017 Genome Research Ltd. Author: Heng Li @@ -446,7 +446,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2016 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; default: return usage(); } diff --git a/lib/htslib/test/sam.c b/lib/htslib/test/sam.c index cbffd399..9ffbf0b2 100644 --- a/lib/htslib/test/sam.c +++ b/lib/htslib/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: John Marshall @@ -177,10 +177,68 @@ static void iterators1(void) hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_NONE, 0, 0)); } -static void samrecord_sizeof(void) +static void copy_check_alignment(const char *infname, const char *informat, + const char *outfname, const char *outmode, const char *outref) { + samFile *in = sam_open(infname, "r"); + samFile *out = sam_open(outfname, outmode); + bam1_t *aln = bam_init1(); + bam_hdr_t *header; + + if (outref) { + if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) + fail("setting reference %s for %s", outref, outfname); + } + + header = sam_hdr_read(in); + if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname); + + while (sam_read1(in, header, aln) >= 0) { + int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4; + if (mod4 != 0) + fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"", + informat, mod4, bam_get_qname(aln)); + + if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname); + } + + bam_destroy1(aln); + bam_hdr_destroy(header); + sam_close(in); + sam_close(out); +} + +static void samrecord_layout(void) +{ + static const char qnames[] = "data:," +"@SQ\tSN:CHROMOSOME_II\tLN:5000\n" + "a\t0\tCHROMOSOME_II\t100\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + "bc\t0\tCHROMOSOME_II\t200\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + "def\t0\tCHROMOSOME_II\t300\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" + "ghij\t0\tCHROMOSOME_II\t400\t10\t4M\t*\t0\t0\tATGC\tqqqq\n" +"klmno\t0\tCHROMOSOME_II\t500\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"; + + size_t bam1_t_size, bam1_t_size2; + + bam1_t_size = 36 + sizeof (int) + 4 + sizeof (char *); +#ifndef BAM_NO_ID + bam1_t_size += 8; +#endif + bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms + if (sizeof (bam1_core_t) != 36) fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t)); + + if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) + fail("sizeof bam1_t is %zu, expected either %zu or %zu", + sizeof(bam1_t), bam1_t_size, bam1_t_size2); + + copy_check_alignment(qnames, "SAM", + "test/sam_alignment.tmp.bam", "wb", NULL); + copy_check_alignment("test/sam_alignment.tmp.bam", "BAM", + "test/sam_alignment.tmp.cram", "wc", "test/ce.fa"); + copy_check_alignment("test/sam_alignment.tmp.cram", "CRAM", + "test/sam_alignment.tmp.sam_", "w", NULL); } static void faidx1(const char *filename) @@ -225,7 +283,7 @@ int main(int argc, char **argv) aux_fields1(); iterators1(); - samrecord_sizeof(); + samrecord_layout(); for (i = 1; i < argc; i++) faidx1(argv[i]); return status; diff --git a/svm_train.cpp b/svm_train.cpp index 2aa5b837..e8491144 100644 --- a/svm_train.cpp +++ b/svm_train.cpp @@ -78,8 +78,8 @@ class Igor : Program VTOutput my; cmd.setOutput(&my); TCLAP::ValueArg arg_intervals("i", "i", "intervals []", false, "", "str", cmd); TCLAP::ValueArg arg_interval_list("I", "I", "file containing list of intervals []", false, "", "file", cmd); - TCLAP::ValueArg arg_positive_training_set_vcf_file("p", "p", "file containing list of reference datasets []", false, "", "file", cmd); - TCLAP::ValueArg arg_negative_training_set_vcf_file("p", "p", "file containing list of reference datasets []", false, "", "file", cmd); + TCLAP::ValueArg arg_positive_training_set_vcf_file("p", "p", "positive training data set []", false, "", "file", cmd); + TCLAP::ValueArg arg_negative_training_set_vcf_file("n", "n", "negative training data set []", false, "", "file", cmd); TCLAP::ValueArg arg_fexp("f", "f", "filter expression []", false, "VTYPE==SNP", "str", cmd); TCLAP::UnlabeledValueArg arg_input_vcf_file("", "input VCF file", true, "","file", cmd); @@ -126,39 +126,127 @@ class Igor : Program //stats initialization// //////////////////////// no_snps = 0; + no_variants = 0; + } + + /** + * Inverse normalizes a set of values. + */ + void inverse_normalize(std::vector& raw, std::vector& normalized) + { + //sort vector + std::vector raw_ptr(raw.size()); + for (int32_t i=0; i current_recs; - std::vector overlaps; - Variant variant; +// std::vector current_recs; +// std::vector overlaps; +// Variant variant; +// +// while(sr->read_next_position(current_recs)) +// { +// //check first variant +// bcf1_t *v = current_recs[0]->v; +// bcf_hdr_t *h = current_recs[0]->h; +// int32_t vtype = vm->classify_variant(h, v, variant); +// +// } + + //inverse normalize covariates in memory + std::vector raw; + std::vector normalized; - while(sr->read_next_position(current_recs)) + for (int32_t i=0; i<10; ++i) { - //check first variant - bcf1_t *v = current_recs[0]->v; - bcf_hdr_t *h = current_recs[0]->h; - int32_t vtype = vm->classify_variant(h, v, variant); + raw.push_back(runif(-100, 100)); + } + inverse_normalize(raw, normalized); + + for (int32_t i=0; i<10; ++i) + { + printf("%.4f\t%.4f\n", raw[i], normalized[i]); } - //inverse normalize covariates in memory - - - + //train - - + + //output model trained - - + + //output inverse normalized features to allow for inspection - + }; void print_options() diff --git a/svm_train.h b/svm_train.h index 518da4f9..069620a6 100644 --- a/svm_train.h +++ b/svm_train.h @@ -26,6 +26,7 @@ #include "program.h" #include "libsvm/svm.h" +#include "Rmath/Rmath.h" void svm_train(int argc, char ** argv);