Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
IndexedVariantReader/Iterator
Browse files Browse the repository at this point in the history
- heavily based on Khalid's IndexedSamReader/Iterator
  • Loading branch information
jmthibault79 committed Aug 21, 2014
1 parent 0bc63ee commit 332352f
Show file tree
Hide file tree
Showing 12 changed files with 358 additions and 3 deletions.
60 changes: 60 additions & 0 deletions gamgee/indexed_variant_iterator.cpp
@@ -0,0 +1,60 @@
#include "indexed_variant_iterator.h"
#include "variant_iterator.h"

#include "htslib/vcf.h"

#include <memory>
#include <string>
#include <vector>

using namespace std;

namespace gamgee {
const std::vector<std::string> IndexedVariantIterator::all_intervals = {"."};

IndexedVariantIterator::IndexedVariantIterator() :
VariantIterator {},
m_variant_index_ptr { nullptr },
m_interval_list {},
m_interval_iter {},
m_index_iter_ptr { nullptr }
{}

IndexedVariantIterator::IndexedVariantIterator(vcfFile* file_ptr, hts_idx_t* index_ptr,
const std::shared_ptr<bcf_hdr_t>& header_ptr, const std::vector<std::string> interval_list) :
VariantIterator { file_ptr, header_ptr },
m_variant_index_ptr { index_ptr },
m_interval_list { interval_list.empty() ? all_intervals : move(interval_list) },
m_interval_iter { m_interval_list.begin() },
m_index_iter_ptr { bcf_itr_querys(m_variant_index_ptr, m_variant_header_ptr.get(), m_interval_iter->c_str()) }
{
fetch_next_record();
}

IndexedVariantIterator::~IndexedVariantIterator() {
bcf_itr_destroy(m_index_iter_ptr);
}

bool IndexedVariantIterator::operator!=(const IndexedVariantIterator& rhs) {
return m_variant_file_ptr != rhs.m_variant_file_ptr &&
m_index_iter_ptr != rhs.m_index_iter_ptr;
}

/**
* @brief pre-fetches the next variant record
* @warning we're reusing the existing htslib memory, so users should be aware that all objects from the previous iteration are now stale unless a deep copy has been performed
*/
void IndexedVariantIterator::fetch_next_record() {
while (bcf_itr_next(m_variant_file_ptr, m_index_iter_ptr, m_variant_record_ptr.get()) < 0) {
++m_interval_iter;
if (m_interval_list.end() == m_interval_iter) {
m_variant_file_ptr = nullptr;
m_variant_record = Variant{};
return;
}
m_index_iter_ptr = bcf_itr_querys(m_variant_index_ptr, m_variant_header_ptr.get(), m_interval_iter->c_str());
}
}

}

91 changes: 91 additions & 0 deletions gamgee/indexed_variant_iterator.h
@@ -0,0 +1,91 @@
#ifndef gamgee__indexed_variant_iterator__guard
#define gamgee__indexed_variant_iterator__guard

#include "variant_iterator.h"

#include "htslib/vcf.h"

#include <memory>
#include <string>
#include <vector>

namespace gamgee {

class IndexedVariantIterator : public VariantIterator {
public:

static const std::vector<std::string> all_intervals;

/**
* @brief creates an empty iterator (used for the end() method)
*/
IndexedVariantIterator();

/**
* @brief initializes a new iterator based on a file, an index, a header, and a vector of intervals
*
* @param file_ptr pointer to a BCF file opened via the bcf_open() macro from htslib
* @param index_ptr pointer to a BCF file index (CSI) created with the bcf_index_load() macro from htslib
* @param header_ptr shared pointer to a BCF file header created with the bcf_hdr_read() macro from htslib
* @param interval_list vector of intervals represented by strings
*/
IndexedVariantIterator(vcfFile* file_ptr, hts_idx_t* index_ptr,
const std::shared_ptr<bcf_hdr_t>& header_ptr,
const std::vector<std::string> interval_list = all_intervals);

virtual ~IndexedVariantIterator();

/**
* @brief an IndexedVariantIterator cannot be copied safely, as it is iterating over a stream.
*/

IndexedVariantIterator(IndexedVariantIterator& other) = delete;
IndexedVariantIterator& operator=(IndexedVariantIterator& other) = delete;

/**
* @brief an IndexedVariantIterator can be moved
*/

IndexedVariantIterator(IndexedVariantIterator&& other) :
m_variant_index_ptr { std::move(other.m_variant_index_ptr) },
m_interval_list { std::move(other.m_interval_list) },
m_interval_iter { std::move(other.m_interval_iter) },
m_index_iter_ptr { std::move(other.m_index_iter_ptr) }
{
other.m_index_iter_ptr = nullptr;
}

IndexedVariantIterator& operator=(IndexedVariantIterator&& other) {
m_variant_index_ptr = std::move(other.m_variant_index_ptr);
m_interval_list = std::move(other.m_interval_list);
m_interval_iter = std::move(other.m_interval_iter);
m_index_iter_ptr = std::move(other.m_index_iter_ptr);

other.m_index_iter_ptr = nullptr;

return *this;
}

/**
* @brief inequality operator (needed by for-each loop)
*
* @param rhs the other IndexedVariantIterator to compare to
*
* @return whether or not the two iterators are the same (e.g. have the same input file on the same
* status and the same intervals)
*/
bool operator!=(const IndexedVariantIterator& rhs);

protected:
void fetch_next_record() override; ///< fetches next Variant record into existing htslib memory without making a copy

private:
hts_idx_t* m_variant_index_ptr; ///< pointer to the internal structure of the index file. NOTE: owned by IndexedVariantReader!
std::vector<std::string> m_interval_list; ///< vector of intervals represented by strings
std::vector<std::string>::const_iterator m_interval_iter; ///< iterator for the interval list
hts_itr_t* m_index_iter_ptr; ///< htslib BCF index iterator
};

}

#endif /* defined(gamgee__indexed_variant_iterator__guard) */
111 changes: 111 additions & 0 deletions gamgee/indexed_variant_reader.h
@@ -0,0 +1,111 @@
#ifndef gamgee__indexed_variant_reader__guard
#define gamgee__indexed_variant_reader__guard

#include "indexed_variant_iterator.h"
#include "utils/hts_memory.h"

#include "htslib/vcf.h"

#include <memory>
#include <string>
#include <vector>

namespace gamgee {

/**
* @brief Utility class to read an indexed BCF file by intervals using an appropriate Variant iterator
* in a for-each loop.
*
* NOTE: this will only parse BCF files with CSI indices
*
* This class is designed to parse the file in for-each loops with the following signature:
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* for (auto& record : IndexedVariantReader<IndexedVariantIterator>{filename, intervals})
* do_something_with_record(record);
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
template<class ITERATOR>
class IndexedVariantReader {
public:

/**
* @brief reads through all records in a file matching one of the given intervals,
* parsing them into Variant objects
*
* @param filename the name of the variant file
* @param interval_list a vector of intervals represented by strings
*
*/
IndexedVariantReader(const std::string& filename, const std::vector<std::string> interval_list) :
m_variant_file_ptr { bcf_open(filename.c_str(), "r") },
m_variant_index_ptr { bcf_index_load(filename.c_str()) },
m_variant_header_ptr { utils::make_shared_variant_header(bcf_hdr_read(m_variant_file_ptr)) },
m_interval_list { std::move(interval_list) }
{}

/**
* @brief closes the file stream and index
*/
~IndexedVariantReader() {
if (m_variant_file_ptr != nullptr)
bcf_close(m_variant_file_ptr);
hts_idx_destroy(m_variant_index_ptr);
}

/**
* @brief an IndexedVariantReader cannot be copied safely, as it is iterating over a stream.
*/

IndexedVariantReader(IndexedVariantReader& other) = delete;
IndexedVariantReader& operator=(IndexedVariantReader& other) = delete;

/**
* @brief an IndexedVariantReader can be moved
*/

IndexedVariantReader(IndexedVariantReader&& other) :
m_variant_file_ptr { std::move(other.m_variant_file_ptr) },
m_variant_index_ptr { std::move(other.m_variant_index_ptr) },
m_variant_header_ptr { std::move(other.m_variant_header_ptr) },
m_interval_list { std::move(other.m_interval_list) }
{
other.m_variant_file_ptr = nullptr;
other.m_variant_index_ptr = nullptr;
}

IndexedVariantReader& operator=(IndexedVariantReader&& other) {
m_variant_file_ptr = std::move(other.m_variant_file_ptr);
m_variant_index_ptr = std::move(other.m_variant_index_ptr);
m_variant_header_ptr = std::move(other.m_variant_header_ptr);
m_interval_list = std::move(other.m_interval_list);

other.m_variant_file_ptr = nullptr;
other.m_variant_index_ptr = nullptr;

return *this;
}

ITERATOR begin() const {
return ITERATOR{ m_variant_file_ptr, m_variant_index_ptr, m_variant_header_ptr, m_interval_list };
}

ITERATOR end() const {
return ITERATOR{};
}

/**
* @brief returns the variant header of the file being read
*/
inline VariantHeader header() const { return VariantHeader{m_variant_header_ptr}; }

private:
vcfFile* m_variant_file_ptr; ///< pointer to the internal structure of the variant file
hts_idx_t* m_variant_index_ptr; ///< pointer to the internal structure of the index file
std::shared_ptr<bcf_hdr_t> m_variant_header_ptr; ///< pointer to the internal structure of the header file
std::vector<std::string> m_interval_list; ///< vector of intervals represented by strings
};

}

#endif /* defined(gamgee__indexed_variant_reader__guard) */
2 changes: 1 addition & 1 deletion gamgee/variant_iterator.cpp
Expand Up @@ -18,7 +18,7 @@ VariantIterator::VariantIterator(vcfFile* variant_file_ptr, const std::shared_pt
m_variant_record_ptr {utils::make_shared_variant(bcf_init1())}, ///< important to initialize the record buffer in the constructor so we can reuse it across the iterator
m_variant_record {m_variant_header_ptr, m_variant_record_ptr}
{
fetch_next_record();
fetch_next_record();
}

Variant& VariantIterator::operator*() {
Expand Down
4 changes: 2 additions & 2 deletions gamgee/variant_iterator.h
Expand Up @@ -70,13 +70,13 @@ class VariantIterator {
*/
bool empty();

private:
protected:
vcfFile * m_variant_file_ptr; ///< pointer to the vcf/bcf file
std::shared_ptr<bcf_hdr_t> m_variant_header_ptr; ///< pointer to the variant header
std::shared_ptr<bcf1_t> m_variant_record_ptr; ///< pointer to the internal structure of the variant record. Useful to only allocate it once.
Variant m_variant_record; ///< temporary record to hold between fetch (operator++) and serve (operator*)

void fetch_next_record(); ///< fetches next Variant record into existing htslib memory without making a copy
virtual void fetch_next_record(); ///< fetches next Variant record into existing htslib memory without making a copy
};

} // end namespace gamgee
Expand Down
93 changes: 93 additions & 0 deletions test/variant_reader_test.cpp
Expand Up @@ -2,6 +2,8 @@
#include "variant_reader.h"
#include "multiple_variant_reader.h"
#include "multiple_variant_iterator.h"
#include "indexed_variant_reader.h"
#include "indexed_variant_iterator.h"
#include "missing.h"

#include <boost/test/unit_test.hpp>
Expand Down Expand Up @@ -538,3 +540,94 @@ BOOST_AUTO_TEST_CASE( gvcf_test ) {
++truth_index;
}
}

// TODO? update to work with VCF GZ
// const auto input_files = vector<string>{"testdata/var_idx/test_variants.bcf", "testdata/var_idx/test_variants_csi.vcf.gz", "testdata/var_idx/test_variants_tabix.vcf.gz"};
const auto indexed_variant_input_files = vector<string>{"testdata/var_idx/test_variants.bcf"};

const auto indexed_variant_chrom_full = vector<string> {"1", "20", "22"};
const auto indexed_variant_bp_full = vector<string> {"1:10000000-10000000", "20:10001000-10001000", "20:10002000-10002000", "20:10003000-10003000", "22:10004000-10004000"};
const auto indexed_variant_chrom_partial = vector<string> {"1"};
const auto indexed_variant_bp_partial = vector<string> {"20:10001000-10001000"};

// empty interval lists and full interval lists
BOOST_AUTO_TEST_CASE( indexed_variant_reader_full_test ) {
for (const auto filename : indexed_variant_input_files) {
for (const auto intervals : {vector<string>{}, IndexedVariantIterator::all_intervals, indexed_variant_chrom_full, indexed_variant_bp_full}) {
auto truth_index = 0u;
const auto reader = IndexedVariantReader<IndexedVariantIterator>{filename, intervals};
for (const auto& record : reader) {
check_variant_basic_api(record, truth_index);
check_quals_api(record, truth_index);
check_alt_api(record, truth_index);
check_filters_api(record, truth_index);
check_genotype_quals_api(record,truth_index);
check_phred_likelihoods_api(record, truth_index);
check_individual_field_api(record, truth_index);
check_shared_field_api(record, truth_index);
check_genotype_api(record, truth_index);
++truth_index;
}
BOOST_CHECK_EQUAL(truth_index, 5u);
}
}
}

// one (different) record each
BOOST_AUTO_TEST_CASE( indexed_variant_reader_partial_test ) {
for (const auto filename : indexed_variant_input_files) {

const auto intervals1 = indexed_variant_chrom_partial;
auto truth_index = 0u;
const auto reader1 = IndexedVariantReader<IndexedVariantIterator>{filename, intervals1};
for (const auto& record : reader1) {
BOOST_CHECK_EQUAL(record.ref(), "T");
BOOST_CHECK_EQUAL(record.chromosome(), 0);
BOOST_CHECK_EQUAL(record.alignment_start(), 10000000);
BOOST_CHECK_EQUAL(record.alignment_stop(), 10000000);
BOOST_CHECK_EQUAL(record.n_alleles(), 2);
BOOST_CHECK_EQUAL(record.n_samples(), 3);
BOOST_CHECK_EQUAL(record.id(), "db2342");
++truth_index;
}
BOOST_CHECK_EQUAL(truth_index, 1u);

const auto intervals2 = indexed_variant_bp_partial;
truth_index = 0u;
const auto reader2 = IndexedVariantReader<IndexedVariantIterator>{filename, intervals2};
for (const auto& record : reader2) {
BOOST_CHECK_EQUAL(record.ref(), "GG");
BOOST_CHECK_EQUAL(record.chromosome(), 1);
BOOST_CHECK_EQUAL(record.alignment_start(), 10001000);
BOOST_CHECK_EQUAL(record.alignment_stop(), 10001001);
BOOST_CHECK_EQUAL(record.n_alleles(), 2);
BOOST_CHECK_EQUAL(record.n_samples(), 3);
BOOST_CHECK_EQUAL(record.id(), "rs837472");
++truth_index;
}
BOOST_CHECK_EQUAL(truth_index, 1u);
}
}

BOOST_AUTO_TEST_CASE( indexed_variant_reader_move_test ) {
for (const auto filename : indexed_variant_input_files) {
auto reader1 = IndexedVariantReader<IndexedVariantIterator>{filename, indexed_variant_chrom_full};
auto iter1 = reader1.begin();

// move construct
auto reader2 = std::move(reader1);
auto iter2 = reader2.begin();

// move assign
reader1 = std::move(reader2);
auto iter3 = reader1.begin();

auto rec1 = *iter1;
auto rec2 = *iter2;
auto rec3 = *iter3;

BOOST_CHECK_EQUAL(rec1.alignment_start(), rec2.alignment_start());
BOOST_CHECK_EQUAL(rec1.alignment_start(), rec3.alignment_start());
}
}

Binary file added testdata/var_idx/test_variants.bcf
Binary file not shown.
Binary file added testdata/var_idx/test_variants.bcf.csi
Binary file not shown.
Binary file added testdata/var_idx/test_variants_csi.vcf.gz
Binary file not shown.
Binary file added testdata/var_idx/test_variants_csi.vcf.gz.csi
Binary file not shown.
Binary file added testdata/var_idx/test_variants_tabix.vcf.gz
Binary file not shown.
Binary file added testdata/var_idx/test_variants_tabix.vcf.gz.tbi
Binary file not shown.

0 comments on commit 332352f

Please sign in to comment.