This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- heavily based on Khalid's IndexedSamReader/Iterator
- Loading branch information
1 parent
0bc63ee
commit 332352f
Showing
12 changed files
with
358 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#include "indexed_variant_iterator.h" | ||
#include "variant_iterator.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <vector> | ||
|
||
using namespace std; | ||
|
||
namespace gamgee { | ||
/// Default interval list: the single region string ".". The interval constructor falls back to it when it is handed an empty list.
const std::vector<std::string> IndexedVariantIterator::all_intervals { "." };
|
||
IndexedVariantIterator::IndexedVariantIterator() : | ||
VariantIterator {}, | ||
m_variant_index_ptr { nullptr }, | ||
m_interval_list {}, | ||
m_interval_iter {}, | ||
m_index_iter_ptr { nullptr } | ||
{} | ||
|
||
IndexedVariantIterator::IndexedVariantIterator(vcfFile* file_ptr, hts_idx_t* index_ptr, | ||
const std::shared_ptr<bcf_hdr_t>& header_ptr, const std::vector<std::string> interval_list) : | ||
VariantIterator { file_ptr, header_ptr }, | ||
m_variant_index_ptr { index_ptr }, | ||
m_interval_list { interval_list.empty() ? all_intervals : move(interval_list) }, | ||
m_interval_iter { m_interval_list.begin() }, | ||
m_index_iter_ptr { bcf_itr_querys(m_variant_index_ptr, m_variant_header_ptr.get(), m_interval_iter->c_str()) } | ||
{ | ||
fetch_next_record(); | ||
} | ||
|
||
/**
 * @brief releases the htslib index iterator held by this object
 *
 * Only the iterator is destroyed here; the index pointer is left untouched.
 */
IndexedVariantIterator::~IndexedVariantIterator() {
  hts_itr_t* const held_iter = m_index_iter_ptr;
  bcf_itr_destroy(held_iter);
}
|
||
/**
 * @brief inequality test used to detect the end of iteration
 *
 * Two iterators are reported as different only when BOTH their file pointers and their
 * htslib iterator pointers differ; sharing either one makes them compare equal.
 * fetch_next_record() clears the file pointer once every interval is exhausted, which is
 * what makes a finished iterator compare equal to a default-constructed one.
 *
 * NOTE(review): "equal if either pointer matches" is looser than the header comment suggests;
 * confirm this is intended before tightening it.
 *
 * @param rhs the iterator to compare against
 * @return true when neither pointer is shared with rhs
 */
bool IndexedVariantIterator::operator!=(const IndexedVariantIterator& rhs) {
  const bool same_file = (m_variant_file_ptr == rhs.m_variant_file_ptr);
  const bool same_index_iter = (m_index_iter_ptr == rhs.m_index_iter_ptr);
  return !(same_file || same_index_iter);
}
|
||
/** | ||
* @brief pre-fetches the next variant record | ||
* @warning we're reusing the existing htslib memory, so users should be aware that all objects from the previous iteration are now stale unless a deep copy has been performed | ||
*/ | ||
void IndexedVariantIterator::fetch_next_record() { | ||
while (bcf_itr_next(m_variant_file_ptr, m_index_iter_ptr, m_variant_record_ptr.get()) < 0) { | ||
++m_interval_iter; | ||
if (m_interval_list.end() == m_interval_iter) { | ||
m_variant_file_ptr = nullptr; | ||
m_variant_record = Variant{}; | ||
return; | ||
} | ||
m_index_iter_ptr = bcf_itr_querys(m_variant_index_ptr, m_variant_header_ptr.get(), m_interval_iter->c_str()); | ||
} | ||
} | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#ifndef gamgee__indexed_variant_iterator__guard | ||
#define gamgee__indexed_variant_iterator__guard | ||
|
||
#include "variant_iterator.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace gamgee { | ||
|
||
class IndexedVariantIterator : public VariantIterator { | ||
public: | ||
|
||
static const std::vector<std::string> all_intervals; | ||
|
||
/** | ||
* @brief creates an empty iterator (used for the end() method) | ||
*/ | ||
IndexedVariantIterator(); | ||
|
||
/** | ||
* @brief initializes a new iterator based on a file, an index, a header, and a vector of intervals | ||
* | ||
* @param file_ptr pointer to a BCF file opened via the bcf_open() macro from htslib | ||
* @param index_ptr pointer to a BCF file index (CSI) created with the bcf_index_load() macro from htslib | ||
* @param header_ptr shared pointer to a BCF file header created with the bcf_hdr_read() macro from htslib | ||
* @param interval_list vector of intervals represented by strings | ||
*/ | ||
IndexedVariantIterator(vcfFile* file_ptr, hts_idx_t* index_ptr, | ||
const std::shared_ptr<bcf_hdr_t>& header_ptr, | ||
const std::vector<std::string> interval_list = all_intervals); | ||
|
||
virtual ~IndexedVariantIterator(); | ||
|
||
/** | ||
* @brief an IndexedVariantIterator cannot be copied safely, as it is iterating over a stream. | ||
*/ | ||
|
||
IndexedVariantIterator(IndexedVariantIterator& other) = delete; | ||
IndexedVariantIterator& operator=(IndexedVariantIterator& other) = delete; | ||
|
||
/** | ||
* @brief an IndexedVariantIterator can be moved | ||
*/ | ||
|
||
IndexedVariantIterator(IndexedVariantIterator&& other) : | ||
m_variant_index_ptr { std::move(other.m_variant_index_ptr) }, | ||
m_interval_list { std::move(other.m_interval_list) }, | ||
m_interval_iter { std::move(other.m_interval_iter) }, | ||
m_index_iter_ptr { std::move(other.m_index_iter_ptr) } | ||
{ | ||
other.m_index_iter_ptr = nullptr; | ||
} | ||
|
||
IndexedVariantIterator& operator=(IndexedVariantIterator&& other) { | ||
m_variant_index_ptr = std::move(other.m_variant_index_ptr); | ||
m_interval_list = std::move(other.m_interval_list); | ||
m_interval_iter = std::move(other.m_interval_iter); | ||
m_index_iter_ptr = std::move(other.m_index_iter_ptr); | ||
|
||
other.m_index_iter_ptr = nullptr; | ||
|
||
return *this; | ||
} | ||
|
||
/** | ||
* @brief inequality operator (needed by for-each loop) | ||
* | ||
* @param rhs the other IndexedVariantIterator to compare to | ||
* | ||
* @return whether or not the two iterators are the same (e.g. have the same input file on the same | ||
* status and the same intervals) | ||
*/ | ||
bool operator!=(const IndexedVariantIterator& rhs); | ||
|
||
protected: | ||
void fetch_next_record() override; ///< fetches next Variant record into existing htslib memory without making a copy | ||
|
||
private: | ||
hts_idx_t* m_variant_index_ptr; ///< pointer to the internal structure of the index file. NOTE: owned by IndexedVariantReader! | ||
std::vector<std::string> m_interval_list; ///< vector of intervals represented by strings | ||
std::vector<std::string>::const_iterator m_interval_iter; ///< iterator for the interval list | ||
hts_itr_t* m_index_iter_ptr; ///< htslib BCF index iterator | ||
}; | ||
|
||
} | ||
|
||
#endif /* defined(gamgee__indexed_variant_iterator__guard) */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#ifndef gamgee__indexed_variant_reader__guard | ||
#define gamgee__indexed_variant_reader__guard | ||
|
||
#include "indexed_variant_iterator.h" | ||
#include "utils/hts_memory.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace gamgee { | ||
|
||
/** | ||
* @brief Utility class to read an indexed BCF file by intervals using an appropriate Variant iterator | ||
* in a for-each loop. | ||
* | ||
* NOTE: this will only parse BCF files with CSI indices | ||
* | ||
* This class is designed to parse the file in for-each loops with the following signature: | ||
* | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
* for (auto& record : IndexedVariantReader<IndexedVariantIterator>{filename, intervals}) | ||
* do_something_with_record(record); | ||
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
*/ | ||
template<class ITERATOR> | ||
class IndexedVariantReader { | ||
public: | ||
|
||
/** | ||
* @brief reads through all records in a file matching one of the given intervals, | ||
* parsing them into Variant objects | ||
* | ||
* @param filename the name of the variant file | ||
* @param interval_list a vector of intervals represented by strings | ||
* | ||
*/ | ||
IndexedVariantReader(const std::string& filename, const std::vector<std::string> interval_list) : | ||
m_variant_file_ptr { bcf_open(filename.c_str(), "r") }, | ||
m_variant_index_ptr { bcf_index_load(filename.c_str()) }, | ||
m_variant_header_ptr { utils::make_shared_variant_header(bcf_hdr_read(m_variant_file_ptr)) }, | ||
m_interval_list { std::move(interval_list) } | ||
{} | ||
|
||
/** | ||
* @brief closes the file stream and index | ||
*/ | ||
~IndexedVariantReader() { | ||
if (m_variant_file_ptr != nullptr) | ||
bcf_close(m_variant_file_ptr); | ||
hts_idx_destroy(m_variant_index_ptr); | ||
} | ||
|
||
/** | ||
* @brief an IndexedVariantReader cannot be copied safely, as it is iterating over a stream. | ||
*/ | ||
|
||
IndexedVariantReader(IndexedVariantReader& other) = delete; | ||
IndexedVariantReader& operator=(IndexedVariantReader& other) = delete; | ||
|
||
/** | ||
* @brief an IndexedVariantReader can be moved | ||
*/ | ||
|
||
IndexedVariantReader(IndexedVariantReader&& other) : | ||
m_variant_file_ptr { std::move(other.m_variant_file_ptr) }, | ||
m_variant_index_ptr { std::move(other.m_variant_index_ptr) }, | ||
m_variant_header_ptr { std::move(other.m_variant_header_ptr) }, | ||
m_interval_list { std::move(other.m_interval_list) } | ||
{ | ||
other.m_variant_file_ptr = nullptr; | ||
other.m_variant_index_ptr = nullptr; | ||
} | ||
|
||
IndexedVariantReader& operator=(IndexedVariantReader&& other) { | ||
m_variant_file_ptr = std::move(other.m_variant_file_ptr); | ||
m_variant_index_ptr = std::move(other.m_variant_index_ptr); | ||
m_variant_header_ptr = std::move(other.m_variant_header_ptr); | ||
m_interval_list = std::move(other.m_interval_list); | ||
|
||
other.m_variant_file_ptr = nullptr; | ||
other.m_variant_index_ptr = nullptr; | ||
|
||
return *this; | ||
} | ||
|
||
ITERATOR begin() const { | ||
return ITERATOR{ m_variant_file_ptr, m_variant_index_ptr, m_variant_header_ptr, m_interval_list }; | ||
} | ||
|
||
ITERATOR end() const { | ||
return ITERATOR{}; | ||
} | ||
|
||
/** | ||
* @brief returns the variant header of the file being read | ||
*/ | ||
inline VariantHeader header() const { return VariantHeader{m_variant_header_ptr}; } | ||
|
||
private: | ||
vcfFile* m_variant_file_ptr; ///< pointer to the internal structure of the variant file | ||
hts_idx_t* m_variant_index_ptr; ///< pointer to the internal structure of the index file | ||
std::shared_ptr<bcf_hdr_t> m_variant_header_ptr; ///< pointer to the internal structure of the header file | ||
std::vector<std::string> m_interval_list; ///< vector of intervals represented by strings | ||
}; | ||
|
||
} | ||
|
||
#endif /* defined(gamgee__indexed_variant_reader__guard) */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.