Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Sam: implementation of mate_alignment_stop()
Browse files Browse the repository at this point in the history
  • Loading branch information
Mauricio Carneiro committed Aug 16, 2014
1 parent 44ac6d0 commit 962b32c
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 16 deletions.
17 changes: 16 additions & 1 deletion gamgee/sam.cpp
@@ -1,13 +1,17 @@
#include "htslib/sam.h"
#include "sam.h"
#include "cigar.h"
#include "utils/hts_memory.h"
#include "htslib/sam.h"

#include <iostream>
#include <string>

using namespace std;

namespace gamgee {

constexpr auto MATE_CIGAR_TAG = "MC";

/**
* @brief creates a sam record that points to htslib memory already allocated
*
Expand Down Expand Up @@ -71,6 +75,17 @@ Sam& Sam::operator=(Sam&& other) noexcept {
return *this;
}

uint32_t Sam::mate_alignment_stop() const {
auto result = mate_alignment_start();
const auto mate_cigar = string_tag(MATE_CIGAR_TAG);
stringstream cigar_stream {mate_cigar.value()};
while (cigar_stream.peek() != std::char_traits<char>::eof()) {
const auto element = Cigar::parse_next_cigar_element(cigar_stream);
result += bam_cigar_type(element.second)&2 ? element.first : 0;
}
return result-1; // we want the last base to be 1-based and **inclusive**
}

/**
* @brief calculates the theoretical alignment start of a read that has soft/hard-clips preceding the alignment
*
Expand Down
8 changes: 4 additions & 4 deletions gamgee/sam.h
Expand Up @@ -31,14 +31,14 @@ class Sam {
// getters for non-variable length fields (things outside of the data member)
uint32_t chromosome() const { return uint32_t(m_body->core.tid); } ///< @brief returns the integer representation of the chromosome. Notice that chromosomes are listed in index order with regards to the header (so a 0-based number). Similar to Picards getReferenceIndex()
uint32_t alignment_start() const { return uint32_t(m_body->core.pos+1); } ///< @brief returns a (1-based and inclusive) alignment start position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t alignment_stop() const { return uint32_t(bam_endpos(m_body.get())+1); } ///< @brief returns a (1-based and inclusive) alignment stop position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t alignment_stop() const { return uint32_t(bam_endpos(m_body.get())); } ///< @brief returns a (1-based and inclusive) alignment stop position. @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t unclipped_start() const ;
uint32_t unclipped_stop() const ;
uint32_t mate_chromosome() const { return uint32_t(m_body->core.mtid); } ///< @brief returns the integer representation of the mate's chromosome. Notice that chromosomes are listed in index order with regards to the header (so a 0-based number). Similar to Picards getMateReferenceIndex()
uint32_t mate_alignment_start() const { return uint32_t(m_body->core.mpos+1); } ///< @brief returns a (1-based and inclusive) mate's alignment start position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t mate_alignment_stop() const ; ///< @brief returns a (1-based and inclusive) mate'salignment stop position (as you would see in a Sam file). @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t mate_unclipped_start() const { return alignment_start(); } ///<
uint32_t mate_unclipped_stop() const ; ///<
uint32_t mate_alignment_stop() const ; ///< @brief returns a (1-based and inclusive) mate'salignment stop position. @note the internal encoding is 0-based to mimic that of the BAM files.
uint32_t mate_unclipped_start() const { return alignment_start(); } ///< @warning not implemented
uint32_t mate_unclipped_stop() const ; ///< @warning not implemented

// modify non-variable length fields (things outside of the data member)
// TODO: provide setter for TLEN (core.isize)?
Expand Down
22 changes: 16 additions & 6 deletions test/sam_test.cpp
Expand Up @@ -4,7 +4,6 @@
#include "sam_reader.h"
#include "sam_builder.h"
#include "missing.h"

#include <vector>

using namespace std;
Expand Down Expand Up @@ -391,10 +390,10 @@ BOOST_AUTO_TEST_CASE( sam_unclipped_start_and_stop ) {
const auto sam2 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("5S15M").build();
const auto sam3 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("15M5S").build();
const auto sam4 = builder.set_chromosome(0).set_alignment_start(100).set_cigar("5S10M5S").build();
check_read_alignment_starts_and_stops(sam1, 100, 120, 100, 120);
check_read_alignment_starts_and_stops(sam2, 100, 115, 95, 115);
check_read_alignment_starts_and_stops(sam3, 100, 115, 100, 120);
check_read_alignment_starts_and_stops(sam4, 100, 110, 95, 115);
check_read_alignment_starts_and_stops(sam1, 100, 119, 100, 119);
check_read_alignment_starts_and_stops(sam2, 100, 114, 95, 114);
check_read_alignment_starts_and_stops(sam3, 100, 114, 100, 119);
check_read_alignment_starts_and_stops(sam4, 100, 109, 95, 114);
}

BOOST_AUTO_TEST_CASE( sam_input_vector ) {
Expand Down Expand Up @@ -434,5 +433,16 @@ BOOST_AUTO_TEST_CASE( sam_input_vector ) {
}

BOOST_AUTO_TEST_CASE( sam_input_vector_too_large ) {
BOOST_CHECK_THROW((SingleSamReader {vector<string>{"testdata/test_simple.bam","testdata/test_simple.bam"}}), std::runtime_error);}
BOOST_CHECK_THROW((SingleSamReader {vector<string>{"testdata/test_simple.bam","testdata/test_simple.bam"}}), std::runtime_error);
}

BOOST_AUTO_TEST_CASE( sam_mate_alignment_stop ) {
const auto truth_mate_alignment_stop = vector<uint32_t>{264,207,264,10486,10448};
auto i = 0u;
for (const auto& record : SingleSamReader{"testdata/test_simple.sam"}) {
if (!missing(record.string_tag("MC"))) {
printf("read: %d, mate cigar: %s, astart: %d, astop: %d, mate_start: %d, mate_stop: %d\n", i, record.string_tag("MC").value().c_str(), record.alignment_start(), record.alignment_stop(), record.mate_alignment_start(), record.mate_alignment_stop());
BOOST_CHECK_EQUAL(record.mate_alignment_stop(), truth_mate_alignment_stop[i++]);
}
}
}
10 changes: 5 additions & 5 deletions testdata/test_simple.sam
Expand Up @@ -2,11 +2,11 @@
@SQ SN:chr1 LN:100000 AS:HG18 UR:/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta M5:9ebc6df9496613f373e73396d5b3b6b6 SP:Homo sapiens
@RG ID:exampleBAM.bam PL:illumina PU:exampleBAM.bam LB:exampleBAM.bam SM:exampleBAM.bam
@PG ID:0 VN:0.7.1-9 CL:/seq/software/picard/current/3rd_party/maq/maq map -D -s 0 -a 1500 -e 2100 /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.out.aln.map /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bfa /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.1.bfq /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.2.bfq 2> /seq/picard/30PPJAAXX/C1-152_2009-01-25_2009-04-07/1/Solexa-9821/30PPJAAXX.1.0.out.map.log
30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam ZA:f:2.3 ZB:i:23 ZC:A:t
30PPJAAXX090125:1:42:512:1817#0 147 chr1 255 2 76M = 200 -130 CAACCCCAGCCCAAACCCGAACCCAAACCCGAACCCGAACCCTAACCCAACCCCCAACCCCAACCCTAACCCTAAC 2+&:A=4?'8@'%2'=>B3A3<A='A=A9>&>;A?A,@&>A?1A:AAB5=&9>7&:,@A<*70@B=9:0>@@2;=3 PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:60:1109:517#0 73 chr1 257 0 76M = 257 0 ACCCTAACCCTAAACATAAACCTAACCCTAACCTGAACCTTATCCAGAACCCCAACTCGAACAAAAGTCGTAACCT ===<4:67(=>@>B?>BB=;::87=?9@B?=((*>1@6B:)8'85%%=,+(-7923%7<,0)4/'-1(.('02-9' PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:67:744:1312#0 163 chr1 10399 0 76M = 10479 -155 GGAAATTTACAAGGAACAAATGTGAAGCACAACATTTAGGTTTTAAAAATCAAGCGAATAAATACAGAAGGTGGAG ):CCBAABBBCCBC?=-=CC=@A?ACCB==CCC:ABBBBC=B@BA=ABCCBCCACC-=CCBCCCBCB2>><22BAB PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:67:744:1312#0 83 chr1 10479 0 76M = 10399 -155 CTTGCTTTAGACACTGTTCATGTGAAGAAAGACCTGGAAACTTCTGTTAACTATAAGCTCAGTAGGGGCTAAAAGC @BC<B@4=CC7>.>,BBAA<1>CCB=C>((C<@=@?CBB:3<C@CA@C=,?B@@A=A?A5B:;CCCC=7CB7<7?< PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam ZA:f:2.3 ZB:i:23 ZC:A:t MC:Z:10M
30PPJAAXX090125:1:42:512:1817#0 147 chr1 255 2 76M = 200 -130 CAACCCCAGCCCAAACCCGAACCCAAACCCGAACCCGAACCCTAACCCAACCCCCAACCCCAACCCTAACCCTAAC 2+&:A=4?'8@'%2'=>B3A3<A='A=A9>&>;A?A,@&>A?1A:AAB5=&9>7&:,@A<*70@B=9:0>@@2;=3 PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2S8M
30PPJAAXX090125:1:60:1109:517#0 73 chr1 257 0 76M = 257 0 ACCCTAACCCTAAACATAAACCTAACCCTAACCTGAACCTTATCCAGAACCCCAACTCGAACAAAAGTCGTAACCT ===<4:67(=>@>B?>BB=;::87=?9@B?=((*>1@6B:)8'85%%=,+(-7923%7<,0)4/'-1(.('02-9' PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2S4M2D2M
30PPJAAXX090125:1:67:744:1312#0 163 chr1 10399 0 76M = 10479 -155 GGAAATTTACAAGGAACAAATGTGAAGCACAACATTTAGGTTTTAAAAATCAAGCGAATAAATACAGAAGGTGGAG ):CCBAABBBCCBC?=-=CC=@A?ACCB==CCC:ABBBBC=B@BA=ABCCBCCACC-=CCBCCCBCB2>><22BAB PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2H2M4I2M2D2M
30PPJAAXX090125:1:67:744:1312#0 83 chr1 10479 0 76M = 10399 -155 CTTGCTTTAGACACTGTTCATGTGAAGAAAGACCTGGAAACTTCTGTTAACTATAAGCTCAGTAGGGGCTAAAAGC @BC<B@4=CC7>.>,BBAA<1>CCB=C>((C<@=@?CBB:3<C@CA@C=,?B@@A=A?A5B:;CCCC=7CB7<7?< PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam MC:Z:2S10M10N10M10D10I10M10H
30PPJAAXX090125:1:18:14:1047#0 99 chr1 22910 0 76M = 22924 -89 GTGTCCATAGATTAATGAGTGAATACACAAATGGTGATATGGACATACAGTGGAGTATTAGTCATAAAAAGGAAGG ?3><><B<ABBBCBA@BBB>BB@<=9BA@B@AB@0@CB@BAA8':A=><73?=<=,;A><?238A>?=6=>27@=: PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:18:14:1047#0 147 chr1 22924 0 76M = 22910 -89 ATGAGTGAATACACAAATGGTGATATGGACATACAGTGGAGTATTAGTCATAAAAAGGAAGGCAGAGCTGATCCAC A64>235B;:562<>?;?5588=6'.:09?B@@A@6>7.<:A@@/>9>>@@AB9BA==AB:3>0388<>:?BCC?B PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
30PPJAAXX090125:1:54:1146:1815#0 163 chr1 27878 0 76M = 27982 -179 GGGACGGTGGTAGTGTTAGTCCAGTAACACAGCCTAGACCCCGGCTTCCACGCGGGTTTGACCGGCGCCTACTAAA B5*>49)&)(7A=2A2760(=0(>9>*662+2;7',0(934(.(3/90,56A))A?/,*'*('*8%)5/4'=3,0= PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam
Expand Down

0 comments on commit 962b32c

Please sign in to comment.