This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #196 from broadinstitute/ks_interval_sam_iteration…
…_using_indexed_files_issue_7 Indexed iteration of bam/cram files. #7
- Loading branch information
Showing
5 changed files
with
424 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#include "indexed_sam_iterator.h" | ||
#include "sam.h" | ||
#include "utils/hts_memory.h" | ||
|
||
using namespace std; | ||
|
||
namespace gamgee { | ||
|
||
IndexedSamIterator::IndexedSamIterator() : | ||
m_sam_file_ptr {nullptr}, | ||
m_sam_index_ptr {nullptr}, | ||
m_sam_header_ptr {nullptr}, | ||
m_sam_itr_ptr {nullptr}, | ||
m_sam_record_ptr {nullptr} { | ||
} | ||
|
||
IndexedSamIterator::IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr, | ||
const std::shared_ptr<bam_hdr_t>& sam_header_ptr, const std::vector<std::string>& interval_list) : | ||
m_sam_file_ptr {sam_file_ptr}, | ||
m_sam_index_ptr {sam_index_ptr}, | ||
m_sam_header_ptr {sam_header_ptr}, | ||
m_interval_list {interval_list}, | ||
m_interval_iterator {m_interval_list.begin()}, | ||
m_sam_itr_ptr {sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str())}, | ||
m_sam_record_ptr {utils::make_shared_sam(bam_init1())}, | ||
m_sam_record {m_sam_header_ptr, m_sam_record_ptr} { | ||
fetch_next_record(); | ||
} | ||
|
||
IndexedSamIterator::IndexedSamIterator(IndexedSamIterator&& other) : | ||
m_sam_file_ptr {std::move(other.m_sam_file_ptr)}, | ||
m_sam_index_ptr {std::move(other.m_sam_index_ptr)}, | ||
m_sam_header_ptr {std::move(other.m_sam_header_ptr)}, | ||
m_interval_list {std::move(other.m_interval_list)}, | ||
m_interval_iterator {std::move(other.m_interval_iterator)}, | ||
m_sam_itr_ptr {std::move(other.m_sam_itr_ptr)}, | ||
m_sam_record_ptr {std::move(other.m_sam_record_ptr)}, | ||
m_sam_record {std::move(other.m_sam_record)} { | ||
other.m_sam_file_ptr = nullptr; | ||
other.m_sam_itr_ptr = nullptr; | ||
} | ||
|
||
IndexedSamIterator& IndexedSamIterator::operator=(IndexedSamIterator&& other) { | ||
m_sam_file_ptr = std::move(other.m_sam_file_ptr); | ||
m_sam_index_ptr = std::move(other.m_sam_index_ptr); | ||
m_sam_header_ptr = std::move(other.m_sam_header_ptr); | ||
m_interval_list = std::move(other.m_interval_list); | ||
m_interval_iterator = std::move(other.m_interval_iterator); | ||
m_sam_itr_ptr = std::move(other.m_sam_itr_ptr); | ||
m_sam_record_ptr = std::move(other.m_sam_record_ptr); | ||
m_sam_record = std::move(other.m_sam_record); | ||
other.m_sam_file_ptr = nullptr; | ||
other.m_sam_itr_ptr = nullptr; | ||
return *this; | ||
} | ||
|
||
|
||
IndexedSamIterator::~IndexedSamIterator() { | ||
if (m_sam_itr_ptr) | ||
sam_itr_destroy(m_sam_itr_ptr); | ||
} | ||
|
||
Sam& IndexedSamIterator::operator*() { | ||
return m_sam_record; | ||
} | ||
|
||
Sam& IndexedSamIterator::operator++() { | ||
fetch_next_record(); | ||
return m_sam_record; | ||
} | ||
|
||
bool IndexedSamIterator::operator!=(const IndexedSamIterator& rhs) { | ||
return m_sam_file_ptr != rhs.m_sam_file_ptr; | ||
} | ||
|
||
void IndexedSamIterator::fetch_next_record() { | ||
while (sam_itr_next(m_sam_file_ptr, m_sam_itr_ptr, m_sam_record_ptr.get()) < 0) { | ||
++m_interval_iterator; | ||
if (m_interval_list.end() == m_interval_iterator) { | ||
m_sam_file_ptr = nullptr; | ||
return; | ||
} | ||
m_sam_itr_ptr = sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str()); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#ifndef gamgee__indexed_sam_iterator__guard | ||
#define gamgee__indexed_sam_iterator__guard | ||
|
||
#include "sam.h" | ||
|
||
#include "htslib/sam.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace gamgee { | ||
|
||
/** | ||
* @brief Utility class to enable for-each style iteration in the IndexedSamReader class | ||
*/ | ||
class IndexedSamIterator { | ||
public: | ||
/** | ||
* @brief creates an empty iterator (used for the end() method) | ||
*/ | ||
IndexedSamIterator(); | ||
|
||
/** | ||
* @brief initializes a new iterator based on an input file stream | ||
* | ||
* @param sam_file_ptr pointer to a bam/cram file opened via the sam_open() macro from htslib | ||
* @param sam_index_ptr pointer to a bam/cram file opened via the sam_index_load() macro from htslib | ||
* @param sam_header_ptr pointer to a bam/cram file header created with the sam_hdr_read() macro from htslib | ||
* @param interval_list vector of intervals compatible with sam_itr_querys, hts_parse_reg, etc. | ||
*/ | ||
IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr, | ||
const std::shared_ptr<bam_hdr_t>& sam_header_ptr, const std::vector<std::string>& interval_list); | ||
|
||
/** | ||
* @brief simple move assignment/construction | ||
* | ||
* we need a custom move constructor here because we need to set the iterator pointer to nullptr | ||
* otherwise the destructor will try to close the dangling pointers. | ||
* | ||
* @param other the other IndexedSamIterator to move from | ||
*/ | ||
IndexedSamIterator(IndexedSamIterator&& other); | ||
|
||
/** | ||
* @copydoc IndexedSamIterator(IndexedSamIterator&&) | ||
*/ | ||
IndexedSamIterator& operator=(IndexedSamIterator&& other); | ||
|
||
/** | ||
* @brief no copy construction/assignment allowed for readers and iterators | ||
* | ||
* @param other the other IndexedSamIterator to copy from | ||
*/ | ||
IndexedSamIterator(IndexedSamIterator& other) = delete; | ||
|
||
/** | ||
* @copydoc IndexedSamIterator(IndexedSamIterator&) | ||
*/ | ||
IndexedSamIterator& operator=(IndexedSamIterator& other) = delete; | ||
|
||
/** | ||
* @brief closes the iterators if there are some | ||
*/ | ||
~IndexedSamIterator(); | ||
|
||
/** | ||
* @brief inequality operator (needed by for-each loop) | ||
* | ||
* @param rhs the other SamIterator to compare to | ||
* | ||
* @return whether or not the two iterators are the same (e.g. have the same input stream on the same | ||
* status) | ||
*/ | ||
bool operator!=(const IndexedSamIterator& rhs); | ||
|
||
/** | ||
* @brief dereference operator (needed by for-each loop) | ||
* | ||
* @return a Sam object by reference, valid until the next record is fetched (the iterator re-uses memory at each iteration) | ||
*/ | ||
Sam& operator*(); | ||
|
||
/** | ||
* @brief pre-fetches the next record and tests for end of file | ||
* | ||
* @return a reference to the object (it can be const& because this return value should only be used | ||
* by the for-each loop to check for the eof) | ||
*/ | ||
Sam& operator++(); | ||
|
||
private: | ||
samFile * m_sam_file_ptr; ///< pointer to the bam file | ||
hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index | ||
std::shared_ptr<bam_hdr_t> m_sam_header_ptr; ///< pointer to the bam header | ||
std::vector<std::string> m_interval_list; ///< intervals to iterate | ||
std::vector<std::string>::iterator m_interval_iterator; ///< temporary interval to hold between sam_itr_querys and serve fetch_next_record | ||
hts_itr_t * m_sam_itr_ptr; ///< temporary iterator to hold between sam_itr_querys and serve fetch_next_record | ||
std::shared_ptr<bam1_t> m_sam_record_ptr; ///< pointer to the internal structure of the sam record. Useful to only allocate it once. | ||
Sam m_sam_record; ///< temporary record to hold between fetch (operator++) and serve (operator*) | ||
|
||
void fetch_next_record(); ///< fetches next Sam record into existing htslib memory without making a copy | ||
}; | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
#ifndef gamgee__indexed_sam_reader__guard | ||
#define gamgee__indexed_sam_reader__guard | ||
|
||
#include "indexed_sam_iterator.h" | ||
#include "utils/hts_memory.h" | ||
|
||
#include "htslib/sam.h" | ||
|
||
#include <string> | ||
#include <iostream> | ||
#include <fstream> | ||
#include <memory> | ||
#include <vector> | ||
|
||
namespace gamgee { | ||
|
||
/** | ||
* @brief Utility class to read a BAM/CRAM file with an appropriate Sam iterator from an indexed file | ||
* in a for-each loop. Intervals are passed in using a vector of string coordinates compatible with | ||
* Samtools. When iteration begins, the iterations (re-)starts at the beginning of the first interval. | ||
* | ||
* This class is designed to parse the file in for-each loops with the following signature: | ||
* | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* for (const auto& record : IndexedSamReader<IndexedSamIterator>{filename, interval_list}) | ||
* do_something_with_sam(record); | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* | ||
* Most iterators have aliases defined by this module so you can use it like so: | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* for (const auto& record : IndexedSingleSamReader{filename, interval_list}) | ||
* do_something_with_sam(record); | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
*/ | ||
template<class ITERATOR> | ||
class IndexedSamReader { | ||
public: | ||
|
||
/** | ||
* @brief reads through all records in a file parsing them into Sam objects | ||
* | ||
* @param filename the name of the bam/cram file | ||
* @param interval_list Samtools style intervals to look for records | ||
*/ | ||
IndexedSamReader(const std::string& filename, std::vector<std::string> interval_list) : | ||
m_sam_file_ptr {sam_open(filename.c_str(), "r")}, // TODO: Not checking for errors. | ||
m_sam_index_ptr {sam_index_load(m_sam_file_ptr, filename.c_str())}, // TODO: Not checking for errors. | ||
m_sam_header_ptr { utils::make_shared_sam_header(sam_hdr_read(m_sam_file_ptr)) }, // TODO: Not checking for errors. | ||
m_interval_list {std::move(interval_list)} { | ||
} | ||
|
||
/** | ||
* @brief simple move assignment/construction | ||
* | ||
* we need a custom move constructor here because we need to set the file pointers to nullptr | ||
* otherwise the destructor will try to close the dangling pointers. | ||
* | ||
* @param other the other IndexedSamIterator to copy from | ||
*/ | ||
IndexedSamReader(IndexedSamReader&& other) : | ||
m_sam_file_ptr {std::move(other.m_sam_file_ptr)}, | ||
m_sam_index_ptr {std::move(other.m_sam_index_ptr)}, | ||
m_sam_header_ptr {std::move(other.m_sam_header_ptr)}, | ||
m_interval_list {std::move(other.m_interval_list)} { | ||
other.m_sam_index_ptr = nullptr; | ||
other.m_sam_file_ptr = nullptr; | ||
} | ||
|
||
/** | ||
* @copydoc IndexedSamReader(IndexedSamReader&&) | ||
*/ | ||
IndexedSamReader& operator=(IndexedSamReader&& other) { | ||
m_sam_file_ptr = std::move(other.m_sam_file_ptr); | ||
m_sam_index_ptr = std::move(other.m_sam_index_ptr); | ||
m_sam_header_ptr = std::move(other.m_sam_header_ptr); | ||
m_interval_list = std::move(other.m_interval_list); | ||
other.m_sam_index_ptr = nullptr; | ||
other.m_sam_file_ptr = nullptr; | ||
return *this; | ||
} | ||
|
||
/** | ||
* @brief no copy construction/assignment allowed for iterators and readers | ||
*/ | ||
IndexedSamReader(IndexedSamReader& other) = delete; | ||
|
||
/** | ||
* @copydoc IndexedSamReader(IndexedSamReader&) | ||
*/ | ||
IndexedSamReader& operator=(IndexedSamReader& other) = delete; | ||
|
||
/** | ||
* @brief closes the file streams if there are some | ||
*/ | ||
~IndexedSamReader() { | ||
if (m_sam_index_ptr) | ||
hts_idx_destroy(m_sam_index_ptr); // TODO: Copied from samtools. Is there a {bam,sam}_idx_destroy alias? Didn't find under "BAM/CRAM indexing" in sam.h | ||
if (m_sam_file_ptr) | ||
sam_close(m_sam_file_ptr); | ||
} | ||
|
||
/** | ||
* @brief creates a ITERATOR pointing at the start of the input stream (needed by for-each | ||
* loop) | ||
* | ||
* @return a ITERATOR ready to start parsing the file | ||
*/ | ||
ITERATOR begin() { | ||
if (m_interval_list.empty()) | ||
return ITERATOR{}; | ||
else | ||
return ITERATOR{m_sam_file_ptr, m_sam_index_ptr, m_sam_header_ptr, m_interval_list}; | ||
} | ||
|
||
/** | ||
* @brief creates a ITERATOR with a nullified input stream (needed by for-each loop) | ||
* | ||
* @return a ITERATOR that will match the end status of the iterator at the end of the stream | ||
*/ | ||
ITERATOR end() { | ||
return ITERATOR{}; | ||
} | ||
|
||
/** | ||
* @brief returns the header | ||
* | ||
* @return the header | ||
*/ | ||
inline SamHeader header() { return SamHeader{m_sam_header_ptr}; } | ||
|
||
private: | ||
samFile * m_sam_file_ptr; ///< pointer to the bam file | ||
hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index | ||
std::shared_ptr<bam_hdr_t> m_sam_header_ptr; ///< pointer to the bam header | ||
std::vector<std::string> m_interval_list; ///< intervals to iterate | ||
}; | ||
|
||
using IndexedSingleSamReader = IndexedSamReader<IndexedSamIterator>; | ||
|
||
} | ||
|
||
#endif |
Oops, something went wrong.