Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Sam: implementation of mate operations
Browse files Browse the repository at this point in the history
* add mate_alignment_stop, mate_unclipped_start, mate_unclipped_stop
* add mate operations overloads for performance and tests
* fix off-by-one bug on Sam::alignment_stop (it was reporting the base after the last inclusive base, contradicting the docs)
* refactor cigar operator parsing routine from SamBuilder to Cigar class for mutual use
* add consumes_read_bases and consumes_reference_bases as static methods to the Cigar class
* add tests to the new API and all new functions

fixes #6
  • Loading branch information
Mauricio Carneiro committed Aug 21, 2014
1 parent d3d259e commit 4d76f30
Show file tree
Hide file tree
Showing 11 changed files with 368 additions and 70 deletions.
16 changes: 16 additions & 0 deletions gamgee/cigar.cpp
Expand Up @@ -14,6 +14,7 @@ namespace gamgee {
* @warning the order of these operators are matching the defines in htslib, do not change!
*/
const char Cigar::cigar_ops_as_chars[] = { 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B' };
const std::vector<int8_t> Cigar::cigar_op_parse_table = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,7,-1,-1,-1,-1,9,-1,2,-1,-1,-1,5,1,-1,-1,-1,0,3,-1,6,-1,-1,4,-1,-1,-1,-1,8,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};

/**
* @brief creates a Cigar object that points to htslib memory already allocated
Expand Down Expand Up @@ -102,5 +103,20 @@ string Cigar::to_string() const {
return stream.str();
}

CigarElement Cigar::parse_next_cigar_element (stringstream& cigar_stream) {
auto element_length = uint32_t{0};
auto element_op = uint8_t{0};
cigar_stream >> element_length;
if ( cigar_stream.fail() )
throw invalid_argument(string("Error parsing cigar string: ") + cigar_stream.str());
cigar_stream >> element_op;
if ( cigar_stream.fail() || int(element_op) >= 128 )
throw invalid_argument(string("Error parsing cigar string: ") + cigar_stream.str());
const auto encoded_op = cigar_op_parse_table[element_op];
if ( encoded_op < 0 )
throw invalid_argument(string("Unrecognized operator ") + char(element_op) + " in cigar string: " + cigar_stream.str());
return make_cigar_element(element_length, static_cast<CigarOperator>(encoded_op));
}


}
31 changes: 31 additions & 0 deletions gamgee/cigar.h
Expand Up @@ -5,6 +5,8 @@

#include <memory>
#include <string>
#include <vector>
#include <sstream>

namespace gamgee {

Expand Down Expand Up @@ -56,6 +58,35 @@ class Cigar {
inline static CigarElement make_cigar_element(const uint32_t oplen, const CigarOperator op) {
return (oplen << BAM_CIGAR_SHIFT) | static_cast<uint32_t>(op);
}

/**
* @brief Table used to parse chars representing cigar operations into their htslib encodings
*
* @note: This table should eventually be moved to htslib. Currently htslib dynamically allocates
* and fills the equivalent of this table on-demand, which makes little sense.
*/
static const std::vector<int8_t> cigar_op_parse_table;

/**
* @brief utility function to parse cigar strings (in a stringstream) one element at a time
*
* This function is useful if you are parsing a cigar string into elements and operators. A simple way to iterate over all the elements is :
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* auto cigar_stream = stringstream{cigar_string}; // requires gcc 4.9's stdlibc++ if you can't use that, don't use auto on this line
* while (cigar_stream.peek() != std::char_traits<char>::eof()) {
* const auto element = parse_next_cigar_element(cigar_stream);
* do_something_with(Cigar::cigar_op(element), Cigar::cigar_oplen(element));
* }
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* @return a CigarElement with length and operator that can be extracted using the Cigar API (Cigar::cigar_op and Cigar::cigar_oplen) to actual elements.
*/
static CigarElement parse_next_cigar_element (std::stringstream& cigar_stream);

inline static bool consumes_read_bases(const CigarOperator op) {return bam_cigar_type(static_cast<int8_t>(op))&1;} ///< returns true if operator is one of the following: Match (M), Insertion (I), Soft-Clip (S), Equal (=) or Different (X)
inline static bool consumes_reference_bases(const CigarOperator op) {return bam_cigar_type(static_cast<int8_t>(op))&2;} ///< returns true if operator is one of the following: Match (M), Deletion (D), Reference-Skip (N), Equal (=) or Different (X)



private:
std::shared_ptr<bam1_t> m_sam_record; ///< sam record containing our cigar, potentially co-owned by multiple other objects
Expand Down
97 changes: 76 additions & 21 deletions gamgee/sam.cpp
@@ -1,13 +1,18 @@
#include "htslib/sam.h"
#include "sam.h"
#include "cigar.h"
#include "utils/hts_memory.h"
#include "htslib/sam.h"
#include "missing.h"

#include <iostream>
#include <string>

using namespace std;

namespace gamgee {

constexpr auto MATE_CIGAR_TAG = "MC";

/**
* @brief creates a sam record that points to htslib memory already allocated
*
Expand Down Expand Up @@ -71,16 +76,27 @@ Sam& Sam::operator=(Sam&& other) noexcept {
return *this;
}

/**
* @brief calculates the theoretical alignment start of a read that has soft/hard-clips preceding the alignment
*
* @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read
* has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped)
* then this method will return 96.
*
* Invalid to call on an unmapped read.
* Invalid to call with cigar = null
*/
uint32_t Sam::mate_alignment_stop(const SamTag<string>& mate_cigar_tag) const {
auto result = mate_alignment_start();
stringstream cigar_stream {mate_cigar_tag.value()};
auto has_reference_bases = false;
while (cigar_stream.peek() != std::char_traits<char>::eof()) {
const auto element = Cigar::parse_next_cigar_element(cigar_stream);
if (Cigar::consumes_reference_bases(Cigar::cigar_op(element))) {
result += Cigar::cigar_oplen(element);
has_reference_bases = true;
}
}
return has_reference_bases ? result-1 : result; // we want the last base to be 1-based and **inclusive** but we only need to deduct 1 if we had any reference consuming operators in the read.
}

uint32_t Sam::mate_alignment_stop() const {
const auto mate_cigar = string_tag(MATE_CIGAR_TAG);
if (missing(mate_cigar))
throw std::invalid_argument{string{"Cannot find the mate alignment stop on a record without the tag: "} + MATE_CIGAR_TAG};
return mate_alignment_stop(mate_cigar);
}

uint32_t Sam::unclipped_start() const {
auto pos = alignment_start();
const auto* cigar = bam_get_cigar(m_body.get());
Expand All @@ -94,16 +110,6 @@ uint32_t Sam::unclipped_start() const {
return pos;
}

/**
* @brief calculates the theoretical alignment stop of a read that has soft/hard-clips preceding the alignment
*
* @return the alignment stop (1-based, inclusive) adjusted for clipped bases. For example if the read
* has an alignment stop of 100 but the last 4 bases were clipped (hard or soft clipped)
* then this method will return 104.
*
* Invalid to call on an unmapped read.
* Invalid to call with cigar = null
*/
uint32_t Sam::unclipped_stop() const {
auto pos = alignment_stop();
const auto* cigar = bam_get_cigar(m_body.get());
Expand All @@ -117,6 +123,55 @@ uint32_t Sam::unclipped_stop() const {
return pos;
}

uint32_t Sam::mate_unclipped_start() const {
const auto mate_cigar_tag = string_tag(MATE_CIGAR_TAG);
if (missing(mate_cigar_tag))
throw std::invalid_argument{string{"Cannot find the mate unclipped start on a record without the tag: "} + MATE_CIGAR_TAG};
return mate_unclipped_start(mate_cigar_tag);
}

uint32_t Sam::mate_unclipped_start(const SamTag<string>& mate_cigar_tag) const {
auto result = mate_alignment_start();
stringstream cigar_stream {mate_cigar_tag.value()};
while (cigar_stream.peek() != std::char_traits<char>::eof()) {
const auto element = Cigar::parse_next_cigar_element(cigar_stream);
const auto op = Cigar::cigar_op(element);
if (op != CigarOperator::S && op != CigarOperator::H)
break;
result -= Cigar::cigar_oplen(element);
}
return result;
}

uint32_t Sam::mate_unclipped_stop() const {
const auto mate_cigar_tag = string_tag(MATE_CIGAR_TAG);
if (missing(mate_cigar_tag))
throw std::invalid_argument{string{"Cannot find the mate unclipped stop on a record without the tag: "} + MATE_CIGAR_TAG};
return mate_unclipped_stop(mate_cigar_tag);
}

uint32_t Sam::mate_unclipped_stop(const SamTag<string>& mate_cigar_tag) const {
auto result = mate_alignment_start();
stringstream cigar_stream {mate_cigar_tag.value()};
auto end_tail = false;
auto has_reference_bases = false;
while (cigar_stream.peek() != std::char_traits<char>::eof()) {
const auto element = Cigar::parse_next_cigar_element(cigar_stream);
const auto op = Cigar::cigar_op(element);
if (!end_tail && (op == CigarOperator::S || op == CigarOperator::H))
continue;
if (!end_tail)
end_tail = true;
if (Cigar::consumes_reference_bases(op) || op == CigarOperator::S || op == CigarOperator::H) {
result += Cigar::cigar_oplen(element);
has_reference_bases = true;
}
}
return has_reference_bases ? result - 1 : result; // we only want to subtract 1 in case we actually had bases in the read that modified the alignment start. Because we are adding the length of the operators, we will end up pointing to one past the unclipped stop of the read. We want to be **inclusive**.
}



/**
* @brief retrieve an integer-valued tag by name
*
Expand Down

0 comments on commit 4d76f30

Please sign in to comment.