Skip to content

Commit

Permalink
updated htslib, updated inverse quantile normalization in svm_train
Browse files Browse the repository at this point in the history
  • Loading branch information
atks committed Jan 6, 2017
1 parent 4a0aeb0 commit fd0a990
Show file tree
Hide file tree
Showing 12 changed files with 208 additions and 47 deletions.
2 changes: 1 addition & 1 deletion lib/htslib/Makefile
Expand Up @@ -338,7 +338,7 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl
check test: bgzip htsfile $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
test/sam test/ce.fa test/faidx.fa
REF_PATH=: test/sam test/ce.fa test/faidx.fa
test/test-regidx
cd test && REF_PATH=: ./test.pl

Expand Down
1 change: 1 addition & 0 deletions lib/htslib/NEWS
Expand Up @@ -4,6 +4,7 @@ Noteworthy changes in release 1.4
in this release, and the shared library soversion has been bumped to 2.
- bam_pileup1_t has an additional field (which holds user data)
- bam1_core_t has been modified to allow for >64K CIGAR operations
and (along with bam1_t) so that CIGAR entries are aligned in memory
- hopen() has vararg arguments for setting URL scheme-dependent options
- the various tbx_conf_* presets are now const

Expand Down
4 changes: 2 additions & 2 deletions lib/htslib/bgzip.c
@@ -1,7 +1,7 @@
/* bgzip.c -- Block compression/decompression utility.
Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
Copyright (C) 2010, 2013-2016 Genome Research Ltd.
Copyright (C) 2010, 2013-2017 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -125,7 +125,7 @@ int main(int argc, char **argv)
case 1:
printf(
"bgzip (htslib) %s\n"
"Copyright (C) 2016 Genome Research Ltd.\n", hts_version());
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
return EXIT_SUCCESS;
case 'h':
case '?': return bgzip_main_usage();
Expand Down
13 changes: 8 additions & 5 deletions lib/htslib/cram/cram_samtools.c
Expand Up @@ -74,11 +74,12 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
};
bam1_t *b = (bam1_t *)*bp;
uint8_t *cp;
int i, bam_len;
int i, qname_nuls, bam_len;

//b->l_aux = extra_len; // we fill this out later

bam_len = qname_len + 1 + ncigar*4 + (len+1)/2 + len + extra_len;
qname_nuls = 4 - qname_len%4;
bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len;
if (b->m_data < bam_len) {
b->m_data = bam_len;
kroundup32(b->m_data);
Expand All @@ -92,7 +93,8 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
b->core.pos = pos-1;
b->core.bin = bam_reg2bin(pos-1, end);
b->core.qual = mapq;
b->core.l_qname = qname_len+1;
b->core.l_qname = qname_len+qname_nuls;
b->core.l_extranul = qname_nuls-1;
b->core.flag = flag;
b->core.n_cigar = ncigar;
b->core.l_qseq = len;
Expand All @@ -103,8 +105,9 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
cp = b->data;

strncpy((char *)cp, qname, qname_len);
cp[qname_len] = 0;
cp += qname_len+1;
for (i = 0; i < qname_nuls; i++)
cp[qname_len+i] = '\0';
cp += qname_len+qname_nuls;
memcpy(cp, cigar, ncigar*4);
cp += ncigar*4;

Expand Down
2 changes: 1 addition & 1 deletion lib/htslib/cram/cram_samtools.h
Expand Up @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define bam_flag(b) (b)->core.flag
#define bam_bin(b) (b)->core.bin
#define bam_map_qual(b) (b)->core.qual
#define bam_name_len(b) (b)->core.l_qname
#define bam_name_len(b) ((b)->core.l_qname - (b)->core.l_extranul)
#define bam_name(b) bam_get_qname((b))
#define bam_qual(b) bam_get_qual((b))
#define bam_seq(b) bam_get_seq((b))
Expand Down
4 changes: 2 additions & 2 deletions lib/htslib/htsfile.c
@@ -1,6 +1,6 @@
/* htsfile.c -- file identifier and minimal viewer.
Copyright (C) 2014-2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: John Marshall <jm18@sanger.ac.uk>
Expand Down Expand Up @@ -187,7 +187,7 @@ int main(int argc, char **argv)
case 1:
printf(
"htsfile (htslib) %s\n"
"Copyright (C) 2016 Genome Research Ltd.\n",
"Copyright (C) 2017 Genome Research Ltd.\n",
hts_version());
exit(EXIT_SUCCESS);
break;
Expand Down
10 changes: 7 additions & 3 deletions lib/htslib/htslib/sam.h
@@ -1,7 +1,7 @@
/// @file htslib/sam.h
/// High-level SAM/BAM/CRAM sequence file operations.
/*
Copyright (C) 2008, 2009, 2013-2016 Genome Research Ltd.
Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd.
Copyright (C) 2010, 2012, 2013 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
Expand Down Expand Up @@ -141,6 +141,7 @@ typedef struct {
@field qual mapping quality
@field l_qname length of the query name
@field flag bitwise flag
@field l_extranul length of extra NULs between qname & cigar (for alignment)
@field n_cigar number of CIGAR operations
@field l_qseq length of the query sequence (read)
@field mtid chromosome ID of next read in template, defined by bam_hdr_t
Expand All @@ -153,7 +154,8 @@ typedef struct {
uint8_t qual;
uint8_t l_qname;
uint16_t flag;
uint16_t unused1;
uint8_t unused1;
uint8_t l_extranul;
uint32_t n_cigar;
int32_t l_qseq;
int32_t mtid;
Expand All @@ -170,7 +172,9 @@ typedef struct {
@discussion Notes:
1. qname is zero tailing and core.l_qname includes the tailing '\0'.
1. qname is terminated by one to four NULs, so that the following
cigar data is 32-bit aligned; core.l_qname includes these trailing NULs,
while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3).
2. l_qseq is calculated from the total length of an alignment block
on reading or from CIGAR.
3. cigar data is encoded 4 bytes per CIGAR operation.
Expand Down
24 changes: 15 additions & 9 deletions lib/htslib/sam.c
@@ -1,6 +1,6 @@
/* sam.c -- SAM and BAM file I/O and manipulation.
Copyright (C) 2008-2010, 2012-2016 Genome Research Ltd.
Copyright (C) 2008-2010, 2012-2017 Genome Research Ltd.
Copyright (C) 2010, 2012, 2013 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
Expand Down Expand Up @@ -423,10 +423,11 @@ int bam_read1(BGZF *fp, bam1_t *b)
}
c->tid = x[0]; c->pos = x[1];
c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
c->l_qseq = x[4];
c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
b->l_data = block_len - 32;
b->l_data = block_len - 32 + c->l_extranul;
if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
return -4;
Expand All @@ -437,16 +438,18 @@ int bam_read1(BGZF *fp, bam1_t *b)
if (!b->data)
return -4;
}
if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
//b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
c->l_qname += c->l_extranul;
if (bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4;
if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
return 4 + block_len;
}

int bam_write1(BGZF *fp, const bam1_t *b)
{
const bam1_core_t *c = &b->core;
uint32_t x[8], block_len = b->l_data + 32, y;
uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
int i, ok;
if (c->n_cigar >= 65536) {
if (hts_verbose >= 1)
Expand All @@ -456,7 +459,7 @@ int bam_write1(BGZF *fp, const bam1_t *b)
}
x[0] = c->tid;
x[1] = c->pos;
x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
x[4] = c->l_qseq;
x[5] = c->mtid;
Expand All @@ -472,7 +475,8 @@ int bam_write1(BGZF *fp, const bam1_t *b)
if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
}
if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
return ok? 4 + block_len : -1;
}
Expand Down Expand Up @@ -893,7 +897,9 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
_parse_warn(p - q <= 1, "empty query name");
_parse_err(p - q > 255, "query name too long");
kputsn_(q, p - q, &str);
c->l_qname = p - q;
for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
kputc_('\0', &str);
c->l_qname = p - q + c->l_extranul;
// flag
c->flag = strtol(p, &p, 0);
if (*p++ != '\t') goto err_ret; // malformated flag
Expand Down Expand Up @@ -1112,7 +1118,7 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
const bam1_core_t *c = &b->core;

str->l = 0;
kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
kputw(c->flag, str); kputc('\t', str); // flag
if (c->tid >= 0) { // chr
kputs(h->target_name[c->tid] , str);
Expand Down
4 changes: 2 additions & 2 deletions lib/htslib/tabix.c
@@ -1,7 +1,7 @@
/* tabix.c -- Generic indexer for TAB-delimited genome position files.
Copyright (C) 2009-2011 Broad Institute.
Copyright (C) 2010-2012, 2014-2016 Genome Research Ltd.
Copyright (C) 2010-2012, 2014-2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
Expand Down Expand Up @@ -446,7 +446,7 @@ int main(int argc, char *argv[])
case 1:
printf(
"tabix (htslib) %s\n"
"Copyright (C) 2016 Genome Research Ltd.\n", hts_version());
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
return EXIT_SUCCESS;
default: return usage();
}
Expand Down
64 changes: 61 additions & 3 deletions lib/htslib/test/sam.c
@@ -1,6 +1,6 @@
/* test/sam.c -- SAM/BAM/CRAM API test cases.
Copyright (C) 2014-2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: John Marshall <jm18@sanger.ac.uk>
Expand Down Expand Up @@ -177,10 +177,68 @@ static void iterators1(void)
hts_itr_destroy(sam_itr_queryi(NULL, HTS_IDX_NONE, 0, 0));
}

static void samrecord_sizeof(void)
static void copy_check_alignment(const char *infname, const char *informat,
const char *outfname, const char *outmode, const char *outref)
{
samFile *in = sam_open(infname, "r");
samFile *out = sam_open(outfname, outmode);
bam1_t *aln = bam_init1();
bam_hdr_t *header;

if (outref) {
if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0)
fail("setting reference %s for %s", outref, outfname);
}

header = sam_hdr_read(in);
if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname);

while (sam_read1(in, header, aln) >= 0) {
int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4;
if (mod4 != 0)
fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"",
informat, mod4, bam_get_qname(aln));

if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname);
}

bam_destroy1(aln);
bam_hdr_destroy(header);
sam_close(in);
sam_close(out);
}

static void samrecord_layout(void)
{
static const char qnames[] = "data:,"
"@SQ\tSN:CHROMOSOME_II\tLN:5000\n"
"a\t0\tCHROMOSOME_II\t100\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"
"bc\t0\tCHROMOSOME_II\t200\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"
"def\t0\tCHROMOSOME_II\t300\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"
"ghij\t0\tCHROMOSOME_II\t400\t10\t4M\t*\t0\t0\tATGC\tqqqq\n"
"klmno\t0\tCHROMOSOME_II\t500\t10\t4M\t*\t0\t0\tATGC\tqqqq\n";

size_t bam1_t_size, bam1_t_size2;

bam1_t_size = 36 + sizeof (int) + 4 + sizeof (char *);
#ifndef BAM_NO_ID
bam1_t_size += 8;
#endif
bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms

if (sizeof (bam1_core_t) != 36)
fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t));

if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2)
fail("sizeof bam1_t is %zu, expected either %zu or %zu",
sizeof(bam1_t), bam1_t_size, bam1_t_size2);

copy_check_alignment(qnames, "SAM",
"test/sam_alignment.tmp.bam", "wb", NULL);
copy_check_alignment("test/sam_alignment.tmp.bam", "BAM",
"test/sam_alignment.tmp.cram", "wc", "test/ce.fa");
copy_check_alignment("test/sam_alignment.tmp.cram", "CRAM",
"test/sam_alignment.tmp.sam_", "w", NULL);
}

static void faidx1(const char *filename)
Expand Down Expand Up @@ -225,7 +283,7 @@ int main(int argc, char **argv)

aux_fields1();
iterators1();
samrecord_sizeof();
samrecord_layout();
for (i = 1; i < argc; i++) faidx1(argv[i]);

return status;
Expand Down

0 comments on commit fd0a990

Please sign in to comment.