Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Merge pull request #196 from broadinstitute/ks_interval_sam_iteration…
Browse files Browse the repository at this point in the history
…_using_indexed_files_issue_7

Indexed iteration of bam/cram files. #7
  • Loading branch information
MauricioCarneiro committed Aug 20, 2014
2 parents 2d58aec + 49327d7 commit 0bc63ee
Show file tree
Hide file tree
Showing 5 changed files with 424 additions and 0 deletions.
87 changes: 87 additions & 0 deletions gamgee/indexed_sam_iterator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#include "indexed_sam_iterator.h"
#include "sam.h"
#include "utils/hts_memory.h"

using namespace std;

namespace gamgee {

IndexedSamIterator::IndexedSamIterator() :
m_sam_file_ptr {nullptr},
m_sam_index_ptr {nullptr},
m_sam_header_ptr {nullptr},
m_sam_itr_ptr {nullptr},
m_sam_record_ptr {nullptr} {
}

IndexedSamIterator::IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr,
const std::shared_ptr<bam_hdr_t>& sam_header_ptr, const std::vector<std::string>& interval_list) :
m_sam_file_ptr {sam_file_ptr},
m_sam_index_ptr {sam_index_ptr},
m_sam_header_ptr {sam_header_ptr},
m_interval_list {interval_list},
m_interval_iterator {m_interval_list.begin()},
m_sam_itr_ptr {sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str())},
m_sam_record_ptr {utils::make_shared_sam(bam_init1())},
m_sam_record {m_sam_header_ptr, m_sam_record_ptr} {
fetch_next_record();
}

IndexedSamIterator::IndexedSamIterator(IndexedSamIterator&& other) :
m_sam_file_ptr {std::move(other.m_sam_file_ptr)},
m_sam_index_ptr {std::move(other.m_sam_index_ptr)},
m_sam_header_ptr {std::move(other.m_sam_header_ptr)},
m_interval_list {std::move(other.m_interval_list)},
m_interval_iterator {std::move(other.m_interval_iterator)},
m_sam_itr_ptr {std::move(other.m_sam_itr_ptr)},
m_sam_record_ptr {std::move(other.m_sam_record_ptr)},
m_sam_record {std::move(other.m_sam_record)} {
other.m_sam_file_ptr = nullptr;
other.m_sam_itr_ptr = nullptr;
}

IndexedSamIterator& IndexedSamIterator::operator=(IndexedSamIterator&& other) {
m_sam_file_ptr = std::move(other.m_sam_file_ptr);
m_sam_index_ptr = std::move(other.m_sam_index_ptr);
m_sam_header_ptr = std::move(other.m_sam_header_ptr);
m_interval_list = std::move(other.m_interval_list);
m_interval_iterator = std::move(other.m_interval_iterator);
m_sam_itr_ptr = std::move(other.m_sam_itr_ptr);
m_sam_record_ptr = std::move(other.m_sam_record_ptr);
m_sam_record = std::move(other.m_sam_record);
other.m_sam_file_ptr = nullptr;
other.m_sam_itr_ptr = nullptr;
return *this;
}


IndexedSamIterator::~IndexedSamIterator() {
if (m_sam_itr_ptr)
sam_itr_destroy(m_sam_itr_ptr);
}

Sam& IndexedSamIterator::operator*() {
return m_sam_record;
}

Sam& IndexedSamIterator::operator++() {
fetch_next_record();
return m_sam_record;
}

bool IndexedSamIterator::operator!=(const IndexedSamIterator& rhs) {
return m_sam_file_ptr != rhs.m_sam_file_ptr;
}

void IndexedSamIterator::fetch_next_record() {
while (sam_itr_next(m_sam_file_ptr, m_sam_itr_ptr, m_sam_record_ptr.get()) < 0) {
++m_interval_iterator;
if (m_interval_list.end() == m_interval_iterator) {
m_sam_file_ptr = nullptr;
return;
}
m_sam_itr_ptr = sam_itr_querys(m_sam_index_ptr, m_sam_header_ptr.get(), (*m_interval_iterator).c_str());
}
}

}
107 changes: 107 additions & 0 deletions gamgee/indexed_sam_iterator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#ifndef gamgee__indexed_sam_iterator__guard
#define gamgee__indexed_sam_iterator__guard

#include "sam.h"

#include "htslib/sam.h"

#include <memory>
#include <string>
#include <vector>

namespace gamgee {

/**
* @brief Utility class to enable for-each style iteration in the IndexedSamReader class
*/
class IndexedSamIterator {
public:
/**
* @brief creates an empty iterator (used for the end() method)
*/
IndexedSamIterator();

/**
* @brief initializes a new iterator based on an input file stream
*
* @param sam_file_ptr pointer to a bam/cram file opened via the sam_open() macro from htslib
* @param sam_index_ptr pointer to a bam/cram file opened via the sam_index_load() macro from htslib
* @param sam_header_ptr pointer to a bam/cram file header created with the sam_hdr_read() macro from htslib
* @param interval_list vector of intervals compatible with sam_itr_querys, hts_parse_reg, etc.
*/
IndexedSamIterator(samFile* sam_file_ptr, hts_idx_t* sam_index_ptr,
const std::shared_ptr<bam_hdr_t>& sam_header_ptr, const std::vector<std::string>& interval_list);

/**
* @brief simple move assignment/construction
*
* we need a custom move constructor here because we need to set the iterator pointer to nullptr
* otherwise the destructor will try to close the dangling pointers.
*
* @param other the other IndexedSamIterator to move from
*/
IndexedSamIterator(IndexedSamIterator&& other);

/**
* @copydoc IndexedSamIterator(IndexedSamIterator&&)
*/
IndexedSamIterator& operator=(IndexedSamIterator&& other);

/**
* @brief no copy construction/assignment allowed for readers and iterators
*
* @param other the other IndexedSamIterator to copy from
*/
IndexedSamIterator(IndexedSamIterator& other) = delete;

/**
* @copydoc IndexedSamIterator(IndexedSamIterator&)
*/
IndexedSamIterator& operator=(IndexedSamIterator& other) = delete;

/**
* @brief closes the iterators if there are some
*/
~IndexedSamIterator();

/**
* @brief inequality operator (needed by for-each loop)
*
* @param rhs the other SamIterator to compare to
*
* @return whether or not the two iterators are the same (e.g. have the same input stream on the same
* status)
*/
bool operator!=(const IndexedSamIterator& rhs);

/**
* @brief dereference operator (needed by for-each loop)
*
* @return a Sam object by reference, valid until the next record is fetched (the iterator re-uses memory at each iteration)
*/
Sam& operator*();

/**
* @brief pre-fetches the next record and tests for end of file
*
* @return a reference to the object (it can be const& because this return value should only be used
* by the for-each loop to check for the eof)
*/
Sam& operator++();

private:
samFile * m_sam_file_ptr; ///< pointer to the bam file
hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index
std::shared_ptr<bam_hdr_t> m_sam_header_ptr; ///< pointer to the bam header
std::vector<std::string> m_interval_list; ///< intervals to iterate
std::vector<std::string>::iterator m_interval_iterator; ///< temporary interval to hold between sam_itr_querys and serve fetch_next_record
hts_itr_t * m_sam_itr_ptr; ///< temporary iterator to hold between sam_itr_querys and serve fetch_next_record
std::shared_ptr<bam1_t> m_sam_record_ptr; ///< pointer to the internal structure of the sam record. Useful to only allocate it once.
Sam m_sam_record; ///< temporary record to hold between fetch (operator++) and serve (operator*)

void fetch_next_record(); ///< fetches next Sam record into existing htslib memory without making a copy
};

}

#endif
142 changes: 142 additions & 0 deletions gamgee/indexed_sam_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#ifndef gamgee__indexed_sam_reader__guard
#define gamgee__indexed_sam_reader__guard

#include "indexed_sam_iterator.h"
#include "utils/hts_memory.h"

#include "htslib/sam.h"

#include <string>
#include <iostream>
#include <fstream>
#include <memory>
#include <vector>

namespace gamgee {

/**
* @brief Utility class to read a BAM/CRAM file with an appropriate Sam iterator from an indexed file
* in a for-each loop. Intervals are passed in using a vector of string coordinates compatible with
* Samtools. When iteration begins, the iterations (re-)starts at the beginning of the first interval.
*
* This class is designed to parse the file in for-each loops with the following signature:
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* for (const auto& record : IndexedSamReader<IndexedSamIterator>{filename, interval_list})
* do_something_with_sam(record);
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Most iterators have aliases defined by this module so you can use it like so:
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* for (const auto& record : IndexedSingleSamReader{filename, interval_list})
* do_something_with_sam(record);
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
template<class ITERATOR>
class IndexedSamReader {
public:

/**
* @brief reads through all records in a file parsing them into Sam objects
*
* @param filename the name of the bam/cram file
* @param interval_list Samtools style intervals to look for records
*/
IndexedSamReader(const std::string& filename, std::vector<std::string> interval_list) :
m_sam_file_ptr {sam_open(filename.c_str(), "r")}, // TODO: Not checking for errors.
m_sam_index_ptr {sam_index_load(m_sam_file_ptr, filename.c_str())}, // TODO: Not checking for errors.
m_sam_header_ptr { utils::make_shared_sam_header(sam_hdr_read(m_sam_file_ptr)) }, // TODO: Not checking for errors.
m_interval_list {std::move(interval_list)} {
}

/**
* @brief simple move assignment/construction
*
* we need a custom move constructor here because we need to set the file pointers to nullptr
* otherwise the destructor will try to close the dangling pointers.
*
* @param other the other IndexedSamIterator to copy from
*/
IndexedSamReader(IndexedSamReader&& other) :
m_sam_file_ptr {std::move(other.m_sam_file_ptr)},
m_sam_index_ptr {std::move(other.m_sam_index_ptr)},
m_sam_header_ptr {std::move(other.m_sam_header_ptr)},
m_interval_list {std::move(other.m_interval_list)} {
other.m_sam_index_ptr = nullptr;
other.m_sam_file_ptr = nullptr;
}

/**
* @copydoc IndexedSamReader(IndexedSamReader&&)
*/
IndexedSamReader& operator=(IndexedSamReader&& other) {
m_sam_file_ptr = std::move(other.m_sam_file_ptr);
m_sam_index_ptr = std::move(other.m_sam_index_ptr);
m_sam_header_ptr = std::move(other.m_sam_header_ptr);
m_interval_list = std::move(other.m_interval_list);
other.m_sam_index_ptr = nullptr;
other.m_sam_file_ptr = nullptr;
return *this;
}

/**
* @brief no copy construction/assignment allowed for iterators and readers
*/
IndexedSamReader(IndexedSamReader& other) = delete;

/**
* @copydoc IndexedSamReader(IndexedSamReader&)
*/
IndexedSamReader& operator=(IndexedSamReader& other) = delete;

/**
* @brief closes the file streams if there are some
*/
~IndexedSamReader() {
if (m_sam_index_ptr)
hts_idx_destroy(m_sam_index_ptr); // TODO: Copied from samtools. Is there a {bam,sam}_idx_destroy alias? Didn't find under "BAM/CRAM indexing" in sam.h
if (m_sam_file_ptr)
sam_close(m_sam_file_ptr);
}

/**
* @brief creates a ITERATOR pointing at the start of the input stream (needed by for-each
* loop)
*
* @return a ITERATOR ready to start parsing the file
*/
ITERATOR begin() {
if (m_interval_list.empty())
return ITERATOR{};
else
return ITERATOR{m_sam_file_ptr, m_sam_index_ptr, m_sam_header_ptr, m_interval_list};
}

/**
* @brief creates a ITERATOR with a nullified input stream (needed by for-each loop)
*
* @return a ITERATOR that will match the end status of the iterator at the end of the stream
*/
ITERATOR end() {
return ITERATOR{};
}

/**
* @brief returns the header
*
* @return the header
*/
inline SamHeader header() { return SamHeader{m_sam_header_ptr}; }

private:
samFile * m_sam_file_ptr; ///< pointer to the bam file
hts_idx_t * m_sam_index_ptr; ///< pointer to the bam index
std::shared_ptr<bam_hdr_t> m_sam_header_ptr; ///< pointer to the bam header
std::vector<std::string> m_interval_list; ///< intervals to iterate
};

using IndexedSingleSamReader = IndexedSamReader<IndexedSamIterator>;

}

#endif

0 comments on commit 0bc63ee

Please sign in to comment.