From 49327d75aa909fdf59a7ace03e5e759cded84853 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 14 Aug 2014 22:58:27 +0800 Subject: [PATCH] Indexed iteration of bam/cram files. #7 --- gamgee/indexed_sam_iterator.cpp | 87 +++++++++++++++++++ gamgee/indexed_sam_iterator.h | 107 +++++++++++++++++++++++ gamgee/indexed_sam_reader.h | 142 +++++++++++++++++++++++++++++++ test/indexed_sam_reader_test.cpp | 88 +++++++++++++++++++ testdata/test_simple.bam.bai | Bin 0 -> 232 bytes 5 files changed, 424 insertions(+) create mode 100644 gamgee/indexed_sam_iterator.cpp create mode 100644 gamgee/indexed_sam_iterator.h create mode 100644 gamgee/indexed_sam_reader.h create mode 100644 test/indexed_sam_reader_test.cpp create mode 100644 testdata/test_simple.bam.bai diff --git a/gamgee/indexed_sam_iterator.cpp b/gamgee/indexed_sam_iterator.cpp new file mode 100644 index 000000000..1c6853e2a --- /dev/null +++ b/gamgee/indexed_sam_iterator.cpp @@ -0,0 +1,87 @@ +#include "indexed_sam_iterator.h" +#include "sam.h" +#include "utils/hts_memory.h" + +using namespace std; + +namespace gamgee { + +IndexedSamIterator::IndexedSamIterator() : + m_sam_file_ptr {nullptr}, + m_sam_index_ptr {nullptr}, + m_sam_header_ptr {nullptr}, + m_sam_itr_ptr {nullptr}, + m_sam_record_ptr {nullptr} { +} + +IndexedSamIterator::IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr, + const std::shared_ptr& sam_header_ptr, const std::vector& interval_list) : + m_sam_file_ptr {sam_file_ptr}, + m_sam_index_ptr {sam_index_ptr}, + m_sam_header_ptr {sam_header_ptr}, + m_interval_list {interval_list}, + m_interval_iterator {m_interval_list.begin()}, + m_sam_itr_ptr {sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str())}, + m_sam_record_ptr {utils::make_shared_sam(bam_init1())}, + m_sam_record {m_sam_header_ptr, m_sam_record_ptr} { + fetch_next_record(); +} + +IndexedSamIterator::IndexedSamIterator(IndexedSamIterator&& other) : + m_sam_file_ptr {std::move(other.m_sam_file_ptr)}, + m_sam_index_ptr {std::move(other.m_sam_index_ptr)}, + m_sam_header_ptr {std::move(other.m_sam_header_ptr)}, + m_interval_list {std::move(other.m_interval_list)}, + m_interval_iterator {std::move(other.m_interval_iterator)}, + m_sam_itr_ptr {std::move(other.m_sam_itr_ptr)}, + m_sam_record_ptr {std::move(other.m_sam_record_ptr)}, + m_sam_record {std::move(other.m_sam_record)} { + other.m_sam_file_ptr = nullptr; + other.m_sam_itr_ptr = nullptr; +} + +IndexedSamIterator& IndexedSamIterator::operator=(IndexedSamIterator&& other) { + m_sam_file_ptr = std::move(other.m_sam_file_ptr); + m_sam_index_ptr = std::move(other.m_sam_index_ptr); + m_sam_header_ptr = std::move(other.m_sam_header_ptr); + m_interval_list = std::move(other.m_interval_list); + m_interval_iterator = std::move(other.m_interval_iterator); + m_sam_itr_ptr = std::move(other.m_sam_itr_ptr); + m_sam_record_ptr = std::move(other.m_sam_record_ptr); + m_sam_record = std::move(other.m_sam_record); + other.m_sam_file_ptr = nullptr; + other.m_sam_itr_ptr = nullptr; + return *this; +} + + +IndexedSamIterator::~IndexedSamIterator() { + if (m_sam_itr_ptr) + sam_itr_destroy(m_sam_itr_ptr); +} + +Sam& IndexedSamIterator::operator*() { + return m_sam_record; +} + +Sam& IndexedSamIterator::operator++() { + fetch_next_record(); + return m_sam_record; +} + +bool IndexedSamIterator::operator!=(const IndexedSamIterator& rhs) { + return m_sam_file_ptr != rhs.m_sam_file_ptr; +} + +void IndexedSamIterator::fetch_next_record() { + while (sam_itr_next(m_sam_file_ptr, m_sam_itr_ptr, m_sam_record_ptr.get()) < 0) { + ++m_interval_iterator; + if (m_interval_list.end() == m_interval_iterator) { + m_sam_file_ptr = nullptr; + return; + } + m_sam_itr_ptr = sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str()); + } +} + +} diff --git a/gamgee/indexed_sam_iterator.h b/gamgee/indexed_sam_iterator.h new file mode 100644 index 000000000..1ce82f4a3 --- /dev/null +++ b/gamgee/indexed_sam_iterator.h @@ -0,0 +1,107 @@ +#ifndef gamgee__indexed_sam_iterator__guard +#define gamgee__indexed_sam_iterator__guard + +#include "sam.h" + +#include "htslib/sam.h" + +#include +#include +#include + +namespace gamgee { + +/** + * @brief Utility class to enable for-each style iteration in the IndexedSamReader class + */ +class IndexedSamIterator { + public: + /** + * @brief creates an empty iterator (used for the end() method) + */ + IndexedSamIterator(); + + /** + * @brief initializes a new iterator based on an input file stream + * + * @param sam_file_ptr pointer to a bam/cram file opened via the sam_open() macro from htslib + * @param sam_index_ptr pointer to a bam/cram file opened via the sam_index_load() macro from htslib + * @param sam_header_ptr pointer to a bam/cram file header created with the sam_hdr_read() macro from htslib + * @param interval_list vector of intervals compatible with sam_itr_querys, hts_parse_reg, etc. + */ + IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr, + const std::shared_ptr& sam_header_ptr, const std::vector& interval_list); + + /** + * @brief simple move assignment/construction + * + * we need a custom move constructor here because we need to set the iterator pointer to nullptr + * otherwise the destructor will try to close the dangling pointers. + * + * @param other the other IndexedSamIterator to move from + */ + IndexedSamIterator(IndexedSamIterator&& other); + + /** + * @copydoc IndexedSamIterator(IndexedSamIterator&&) + */ + IndexedSamIterator& operator=(IndexedSamIterator&& other); + + /** + * @brief no copy construction/assignment allowed for readers and iterators + * + * @param other the other IndexedSamIterator to copy from + */ + IndexedSamIterator(IndexedSamIterator& other) = delete; + + /** + * @copydoc IndexedSamIterator(IndexedSamIterator&) + */ + IndexedSamIterator& operator=(IndexedSamIterator& other) = delete; + + /** + * @brief closes the iterators if there are some + */ + ~IndexedSamIterator(); + + /** + * @brief inequality operator (needed by for-each loop) + * + * @param rhs the other SamIterator to compare to + * + * @return whether or not the two iterators are the same (e.g. have the same input stream on the same + * status) + */ + bool operator!=(const IndexedSamIterator& rhs); + + /** + * @brief dereference operator (needed by for-each loop) + * + * @return a Sam object by reference, valid until the next record is fetched (the iterator re-uses memory at each iteration) + */ + Sam& operator*(); + + /** + * @brief pre-fetches the next record and tests for end of file + * + * @return a reference to the object (it can be const& because this return value should only be used + * by the for-each loop to check for the eof) + */ + Sam& operator++(); + + private: + samFile * m_sam_file_ptr; ///< pointer to the bam file + hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index + std::shared_ptr m_sam_header_ptr; ///< pointer to the bam header + std::vector m_interval_list; ///< intervals to iterate + std::vector::iterator m_interval_iterator; ///< temporary interval to hold between sam_itr_querys and serve fetch_next_record + hts_itr_t * m_sam_itr_ptr; ///< temporary iterator to hold between sam_itr_querys and serve fetch_next_record + std::shared_ptr m_sam_record_ptr; ///< pointer to the internal structure of the sam record. Useful to only allocate it once. + Sam m_sam_record; ///< temporary record to hold between fetch (operator++) and serve (operator*) + + void fetch_next_record(); ///< fetches next Sam record into existing htslib memory without making a copy +}; + +} + +#endif diff --git a/gamgee/indexed_sam_reader.h b/gamgee/indexed_sam_reader.h new file mode 100644 index 000000000..ae64d2935 --- /dev/null +++ b/gamgee/indexed_sam_reader.h @@ -0,0 +1,142 @@ +#ifndef gamgee__indexed_sam_reader__guard +#define gamgee__indexed_sam_reader__guard + +#include "indexed_sam_iterator.h" +#include "utils/hts_memory.h" + +#include "htslib/sam.h" + +#include +#include +#include +#include +#include + +namespace gamgee { + +/** + * @brief Utility class to read a BAM/CRAM file with an appropriate Sam iterator from an indexed file + * in a for-each loop. Intervals are passed in using a vector of string coordinates compatible with + * Samtools. When iteration begins, the iterations (re-)starts at the beginning of the first interval. + * + * This class is designed to parse the file in for-each loops with the following signature: + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * for (const auto& record : IndexedSamReader{filename, interval_list}) + * do_something_with_sam(record); + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Most iterators have aliases defined by this module so you can use it like so: + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * for (const auto& record : IndexedSingleSamReader{filename, interval_list}) + * do_something_with_sam(record); + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +template +class IndexedSamReader { + public: + + /** + * @brief reads through all records in a file parsing them into Sam objects + * + * @param filename the name of the bam/cram file + * @param interval_list Samtools style intervals to look for records + */ + IndexedSamReader(const std::string& filename, std::vector interval_list) : + m_sam_file_ptr {sam_open(filename.c_str(), "r")}, // TODO: Not checking for errors. + m_sam_index_ptr {sam_index_load(m_sam_file_ptr, filename.c_str())}, // TODO: Not checking for errors. + m_sam_header_ptr { utils::make_shared_sam_header(sam_hdr_read(m_sam_file_ptr)) }, // TODO: Not checking for errors. + m_interval_list {std::move(interval_list)} { + } + + /** + * @brief simple move assignment/construction + * + * we need a custom move constructor here because we need to set the file pointers to nullptr + * otherwise the destructor will try to close the dangling pointers. + * + * @param other the other IndexedSamIterator to copy from + */ + IndexedSamReader(IndexedSamReader&& other) : + m_sam_file_ptr {std::move(other.m_sam_file_ptr)}, + m_sam_index_ptr {std::move(other.m_sam_index_ptr)}, + m_sam_header_ptr {std::move(other.m_sam_header_ptr)}, + m_interval_list {std::move(other.m_interval_list)} { + other.m_sam_index_ptr = nullptr; + other.m_sam_file_ptr = nullptr; + } + + /** + * @copydoc IndexedSamReader(IndexedSamReader&&) + */ + IndexedSamReader& operator=(IndexedSamReader&& other) { + m_sam_file_ptr = std::move(other.m_sam_file_ptr); + m_sam_index_ptr = std::move(other.m_sam_index_ptr); + m_sam_header_ptr = std::move(other.m_sam_header_ptr); + m_interval_list = std::move(other.m_interval_list); + other.m_sam_index_ptr = nullptr; + other.m_sam_file_ptr = nullptr; + return *this; + } + + /** + * @brief no copy construction/assignment allowed for iterators and readers + */ + IndexedSamReader(IndexedSamReader& other) = delete; + + /** + * @copydoc IndexedSamReader(IndexedSamReader&) + */ + IndexedSamReader& operator=(IndexedSamReader& other) = delete; + + /** + * @brief closes the file streams if there are some + */ + ~IndexedSamReader() { + if (m_sam_index_ptr) + hts_idx_destroy(m_sam_index_ptr); // TODO: Copied from samtools. Is there a {bam,sam}_idx_destroy alias? Didn't find under "BAM/CRAM indexing" in sam.h + if (m_sam_file_ptr) + sam_close(m_sam_file_ptr); + } + + /** + * @brief creates a ITERATOR pointing at the start of the input stream (needed by for-each + * loop) + * + * @return a ITERATOR ready to start parsing the file + */ + ITERATOR begin() { + if (m_interval_list.empty()) + return ITERATOR{}; + else + return ITERATOR{m_sam_file_ptr, m_sam_index_ptr, m_sam_header_ptr, m_interval_list}; + } + + /** + * @brief creates a ITERATOR with a nullified input stream (needed by for-each loop) + * + * @return a ITERATOR that will match the end status of the iterator at the end of the stream + */ + ITERATOR end() { + return ITERATOR{}; + } + + /** + * @brief returns the header + * + * @return the header + */ + inline SamHeader header() { return SamHeader{m_sam_header_ptr}; } + + private: + samFile * m_sam_file_ptr; ///< pointer to the bam file + hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index + std::shared_ptr m_sam_header_ptr; ///< pointer to the bam header + std::vector m_interval_list; ///< intervals to iterate +}; + +using IndexedSingleSamReader = IndexedSamReader; + +} + +#endif diff --git a/test/indexed_sam_reader_test.cpp b/test/indexed_sam_reader_test.cpp new file mode 100644 index 000000000..72b17a280 --- /dev/null +++ b/test/indexed_sam_reader_test.cpp @@ -0,0 +1,88 @@ +#include "indexed_sam_reader.h" + +#include + +using namespace std; +using namespace gamgee; +using namespace gamgee::utils; + +BOOST_AUTO_TEST_CASE( indexed_single_readers_intervals ) +{ + for (const auto& filename : {"testdata/test_simple.bam"}) { + auto read_counter = 0u; + const auto interval_list = vector{"chr1:201-257", "chr1:30001-40000", "chr1:59601-70000", "chr1:94001"}; + + for (const auto& sam : IndexedSingleSamReader{filename, interval_list}) { + BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); + BOOST_CHECK_EQUAL(sam.chromosome(), 0); + ++read_counter; + } + BOOST_CHECK_EQUAL(read_counter, 15u); + + const auto interval_read_counts = vector{3, 4, 4, 4}; + auto interval_number = 0u; + for (const auto& interval : interval_list) { + auto interval_read_counter = 0u; + for (const auto& sam : IndexedSingleSamReader{filename, vector{interval}}) { + BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); + BOOST_CHECK_EQUAL(sam.chromosome(), 0); + ++interval_read_counter; + } + BOOST_CHECK_EQUAL(interval_read_counter, interval_read_counts[interval_number]); + ++interval_number; + } + } +} + +BOOST_AUTO_TEST_CASE( indexed_single_readers_entire_file ) +{ + const auto entire_file = vector{"."}; + for (const auto& filename : {"testdata/test_simple.bam"}) { + auto read_counter = 0u; + for (const auto& sam : IndexedSingleSamReader{filename, entire_file}) { + BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); + BOOST_CHECK_EQUAL(sam.chromosome(), 0); + ++read_counter; + } + BOOST_CHECK_EQUAL(read_counter, 33u); + } +} + +BOOST_AUTO_TEST_CASE( indexed_single_readers_unmapped ) +{ + const auto unmapped_reads = vector{"*"}; + for (const auto& filename : {"testdata/test_simple.bam"}) { + auto read_counter = 0u; + for (const auto& sam : IndexedSingleSamReader{filename, unmapped_reads}) { + BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); + BOOST_CHECK_EQUAL(sam.chromosome(), 0); + ++read_counter; + } + BOOST_CHECK_EQUAL(read_counter, 0u); + } +} + +BOOST_AUTO_TEST_CASE( indexed_single_readers_empty ) +{ + const auto empty_list = vector{}; + for (const auto& filename : {"testdata/test_simple.bam"}) { + auto read_counter = 0u; + for (const auto& sam : IndexedSingleSamReader{filename, empty_list}) { + BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); + BOOST_CHECK_EQUAL(sam.chromosome(), 0); + ++read_counter; + } + BOOST_CHECK_EQUAL(read_counter, 0u); + } +} + +BOOST_AUTO_TEST_CASE( indexed_single_readers_move_constructor_and_assignment ) { + auto reader1 = IndexedSingleSamReader{"testdata/test_simple.bam", vector{"."}}; + auto it1 = reader1.begin(); + auto reader2 = std::move(reader1); // check move constructor + reader1 = IndexedSingleSamReader{"testdata/test_simple.bam", vector{"."}}; // check move assignment + auto it2 = reader2.begin(); + auto it3 = reader1.begin(); + BOOST_CHECK_EQUAL((*it1).alignment_start(), (*it2).alignment_start()); // unlike the SingleSamReader, these should be the same because they are pointing at the exact same restarted iterator + BOOST_CHECK_EQUAL((*it1).alignment_start(), (*it3).alignment_start()); // these should be the same! both pointing at the first record +} diff --git a/testdata/test_simple.bam.bai b/testdata/test_simple.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..854ec21bad645c4fafd030c487875a73dc3c669d GIT binary patch literal 232 zcmZ>A^kigYU|?VaVoxCk1`wNpp&v}uu|Q}qs5p#&juRs84Hbv+mG~jzK2UKO-$D#( o&Ljo~CM5G282I@>Tt%=Lg7Aathv|c<1Gy31ZRmER+X*uj0Q)HtqyPW_ literal 0 HcmV?d00001