diff --git a/gamgee/sam.cpp b/gamgee/sam.cpp index 853e91aeb..5069cb23f 100644 --- a/gamgee/sam.cpp +++ b/gamgee/sam.cpp @@ -1,13 +1,17 @@ -#include "htslib/sam.h" #include "sam.h" +#include "cigar.h" #include "utils/hts_memory.h" +#include "htslib/sam.h" #include +#include using namespace std; namespace gamgee { +constexpr auto MATE_CIGAR_TAG = "MC"; + /** * @brief creates a sam record that points to htslib memory already allocated * @@ -71,6 +75,17 @@ Sam& Sam::operator=(Sam&& other) noexcept { return *this; } +uint32_t Sam::mate_alignment_stop() const { + auto result = mate_alignment_start(); + const auto mate_cigar = string_tag(MATE_CIGAR_TAG); + stringstream cigar_stream {mate_cigar.value()}; + while (cigar_stream.peek() != std::char_traits::eof()) { + const auto element = Cigar::parse_next_cigar_element(cigar_stream); + result += bam_cigar_type(element.second)&2 ? element.first : 0; + } + return result-1; // we want the last base to be 1-based and **inclusive** +} + /** * @brief calculates the theoretical alignment start of a read that has soft/hard-clips preceding the alignment * diff --git a/gamgee/sam.h b/gamgee/sam.h index af9598036..f4bb7d6ba 100644 --- a/gamgee/sam.h +++ b/gamgee/sam.h @@ -31,14 +31,14 @@ class Sam { // getters for non-variable length fields (things outside of the data member) uint32_t chromosome() const { return uint32_t(m_body->core.tid); } ///< @brief returns the integer representation of the chromosome. Notice that chromosomes are listed in index order with regards to the header (so a 0-based number). Similar to Picards getReferenceIndex() uint32_t alignment_start() const { return uint32_t(m_body->core.pos+1); } ///< @brief returns a (1-based and inclusive) alignment start position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files. - uint32_t alignment_stop() const { return uint32_t(bam_endpos(m_body.get())+1); } ///< @brief returns a (1-based and inclusive) alignment stop position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files. + uint32_t alignment_stop() const { return uint32_t(bam_endpos(m_body.get())); } ///< @brief returns a (1-based and inclusive) alignment stop position. @note the internal encoding is 0-based to mimic that of the BAM files. uint32_t unclipped_start() const ; uint32_t unclipped_stop() const ; uint32_t mate_chromosome() const { return uint32_t(m_body->core.mtid); } ///< @brief returns the integer representation of the mate's chromosome. Notice that chromosomes are listed in index order with regards to the header (so a 0-based number). Similar to Picards getMateReferenceIndex() uint32_t mate_alignment_start() const { return uint32_t(m_body->core.mpos+1); } ///< @brief returns a (1-based and inclusive) mate's alignment start position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files. - uint32_t mate_alignment_stop() const ; ///< @brief returns a (1-based and inclusive) mate'salignment stop position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files. - uint32_t mate_unclipped_start() const { return alignment_start(); } ///< - uint32_t mate_unclipped_stop() const ; ///< + uint32_t mate_alignment_stop() const ; ///< @brief returns a (1-based and inclusive) mate'salignment stop position. @note the internal encoding is 0-based to mimic that of the BAM files. + uint32_t mate_unclipped_start() const { return alignment_start(); } ///< @warning not implemented + uint32_t mate_unclipped_stop() const ; ///< @warning not implemented // modify non-variable length fields (things outside of the data member) // TODO: provide setter for TLEN (core.isize)? diff --git a/test/sam_test.cpp b/test/sam_test.cpp index b6e983347..85b872d50 100644 --- a/test/sam_test.cpp +++ b/test/sam_test.cpp @@ -4,7 +4,6 @@ #include "sam_reader.h" #include "sam_builder.h" #include "missing.h" - #include using namespace std; @@ -391,10 +390,10 @@ BOOST_AUTO_TEST_CASE( sam_unclipped_start_and_stop ) { const auto sam2 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("5S15M").build(); const auto sam3 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("15M5S").build(); const auto sam4 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("5S10M5S").build(); - check_read_alignment_starts_and_stops(sam1, 100, 120, 100, 120); - check_read_alignment_starts_and_stops(sam2, 100, 115, 95, 115); - check_read_alignment_starts_and_stops(sam3, 100, 115, 100, 120); - check_read_alignment_starts_and_stops(sam4, 100, 110, 95, 115); + check_read_alignment_starts_and_stops(sam1, 100, 119, 100, 119); + check_read_alignment_starts_and_stops(sam2, 100, 114, 95, 114); + check_read_alignment_starts_and_stops(sam3, 100, 114, 100, 119); + check_read_alignment_starts_and_stops(sam4, 100, 109, 95, 114); } BOOST_AUTO_TEST_CASE( sam_input_vector ) { @@ -434,5 +433,16 @@ BOOST_AUTO_TEST_CASE( sam_input_vector ) { } BOOST_AUTO_TEST_CASE( sam_input_vector_too_large ) { - BOOST_CHECK_THROW((SingleSamReader {vector{"testdata/test_simple.bam","testdata/test_simple.bam"}}), std::runtime_error);} + BOOST_CHECK_THROW((SingleSamReader {vector{"testdata/test_simple.bam","testdata/test_simple.bam"}}), std::runtime_error); +} +BOOST_AUTO_TEST_CASE( sam_mate_alignment_stop ) { + const auto truth_mate_alignment_stop = vector{264,207,264,10486,10448}; + auto i = 0u; + for (const auto& record : SingleSamReader{"testdata/test_simple.sam"}) { + if (!missing(record.string_tag("MC"))) { + printf("read: %d, mate cigar: %s, astart: %d, astop: %d, mate_start: %d, mate_stop: %d\n", i, record.string_tag("MC").value().c_str(), record.alignment_start(), record.alignment_stop(), record.mate_alignment_start(), record.mate_alignment_stop()); + BOOST_CHECK_EQUAL(record.mate_alignment_stop(), truth_mate_alignment_stop[i++]); + } + } +} diff --git a/testdata/test_simple.sam b/testdata/test_simple.sam index 5256668a2..50b0a6f36 100644 --- a/testdata/test_simple.sam +++ b/testdata/test_simple.sam @@ -2,11 +2,11 @@ @SQ SN:chr1 LN:100000 AS:HG18 UR:/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta M5:9ebc6df9496613f373e73396d5b3b6b6 SP:Homo sapiens @RG ID:exampleBAM.bam PL:illumina PU:exampleBAM.bam LB:exampleBAM.bam SM:exampleBAM.bam @PG ID:0 VN:0.7.1-9 CL:/seq/software/picard/current/3rd_party/maq/maq map -D -s 0 -a 1500 -e 2100 /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.out.aln.map /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bfa /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.1.bfq /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.2.bfq 2> /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.out.map.log -30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam ZA:f:2.3 ZB:i:23 ZC:A:t -30PPJAAXX090125:1:42:512:1817#0 147 chr1 255 2 76M = 200 -130 CAACCCCAGCCCAAACCCGAACCCAAACCCGAACCCGAACCCTAACCCAACCCCCAACCCCAACCCTAACCCTAAC 2+&:A=4?'8@'%2'=>B3A3&>;A?A,@&>A?1A:AAB5=&9>7&:,@A<*70@B=9:0>@@2;=3 PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam -30PPJAAXX090125:1:60:1109:517#0 73 chr1 257 0 76M = 257 0 ACCCTAACCCTAAACATAAACCTAACCCTAACCTGAACCTTATCCAGAACCCCAACTCGAACAAAAGTCGTAACCT ===<4:67(=>@>B?>BB=;::87=?9@B?=((*>1@6B:)8'85%%=,+(-7923%7<,0)4/'-1(.('02-9' PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam -30PPJAAXX090125:1:67:744:1312#0 163 chr1 10399 0 76M = 10479 -155 GGAAATTTACAAGGAACAAATGTGAAGCACAACATTTAGGTTTTAAAAATCAAGCGAATAAATACAGAAGGTGGAG ):CCBAABBBCCBC?=-=CC=@A?ACCB==CCC:ABBBBC=B@BA=ABCCBCCACC-=CCBCCCBCB2>><22BAB PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam -30PPJAAXX090125:1:67:744:1312#0 83 chr1 10479 0 76M = 10399 -155 CTTGCTTTAGACACTGTTCATGTGAAGAAAGACCTGGAAACTTCTGTTAACTATAAGCTCAGTAGGGGCTAAAAGC @BC.>,BBAA<1>CCB=C>((C<@=@?CBB:3=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam ZA:f:2.3 ZB:i:23 ZC:A:t MC:Z:10M +30PPJAAXX090125:1:42:512:1817#0 147 chr1 255 2 76M = 200 -130 CAACCCCAGCCCAAACCCGAACCCAAACCCGAACCCGAACCCTAACCCAACCCCCAACCCCAACCCTAACCCTAAC 2+&:A=4?'8@'%2'=>B3A3&>;A?A,@&>A?1A:AAB5=&9>7&:,@A<*70@B=9:0>@@2;=3 PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2S8M +30PPJAAXX090125:1:60:1109:517#0 73 chr1 257 0 76M = 257 0 ACCCTAACCCTAAACATAAACCTAACCCTAACCTGAACCTTATCCAGAACCCCAACTCGAACAAAAGTCGTAACCT ===<4:67(=>@>B?>BB=;::87=?9@B?=((*>1@6B:)8'85%%=,+(-7923%7<,0)4/'-1(.('02-9' PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2S4M2D2M +30PPJAAXX090125:1:67:744:1312#0 163 chr1 10399 0 76M = 10479 -155 GGAAATTTACAAGGAACAAATGTGAAGCACAACATTTAGGTTTTAAAAATCAAGCGAATAAATACAGAAGGTGGAG ):CCBAABBBCCBC?=-=CC=@A?ACCB==CCC:ABBBBC=B@BA=ABCCBCCACC-=CCBCCCBCB2>><22BAB PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2H2M4I2M2D2M +30PPJAAXX090125:1:67:744:1312#0 83 chr1 10479 0 76M = 10399 -155 CTTGCTTTAGACACTGTTCATGTGAAGAAAGACCTGGAAACTTCTGTTAACTATAAGCTCAGTAGGGGCTAAAAGC @BC.>,BBAA<1>CCB=C>((C<@=@?CBB:3<>BB@<=9BA@B@AB@0@CB@BAA8':A=><73?=<=,;A>?=6=>27@=: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam 30PPJAAXX090125:1:18:14:1047#0 147 chr1 22924 0 76M = 22910 -89 ATGAGTGAATACACAAATGGTGATATGGACATACAGTGGAGTATTAGTCATAAAAAGGAAGGCAGAGCTGATCCAC A64>235B;:562<>?;?5588=6'.:09?B@@A@6>7.<:A@@/>9>>@@AB9BA==AB:3>0388<>:?BCC?B PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam 30PPJAAXX090125:1:54:1146:1815#0 163 chr1 27878 0 76M = 27982 -179 GGGACGGTGGTAGTGTTAGTCCAGTAACACAGCCTAGACCCCGGCTTCCACGCGGGTTTGACCGGCGCCTACTAAA B5*>49)&)(7A=2A2760(=0(>9>*662+2;7',0(934(.(3/90,56A))A?/,*'*('*8%)5/4'=3,0= PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam