Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Merge pull request #354 from broadinstitute/jt_kg_rbs
Browse files Browse the repository at this point in the history
ReferenceBlockSplittingVariantIterator
  • Loading branch information
jmthibault79 committed Nov 4, 2014
2 parents a9e8989 + 473a8ad commit 3631525
Show file tree
Hide file tree
Showing 18 changed files with 601 additions and 2 deletions.
2 changes: 2 additions & 0 deletions gamgee/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ set(SOURCE_FILES
read_bases.h
read_group.cpp
read_group.h
reference_block_splitting_variant_iterator.cpp
reference_block_splitting_variant_iterator.h
reference_iterator.cpp
reference_iterator.h
reference_map.cpp
Expand Down
1 change: 1 addition & 0 deletions gamgee/gamgee.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "multiple_variant_iterator.h"
#include "multiple_variant_reader.h"
#include "read_bases.h"
#include "reference_block_splitting_variant_iterator.h"
#include "reference_iterator.h"
#include "reference_map.h"
#include "sam.h"
Expand Down
124 changes: 124 additions & 0 deletions gamgee/reference_block_splitting_variant_iterator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#include "reference_block_splitting_variant_iterator.h"

namespace gamgee {

ReferenceBlockSplittingVariantIterator::ReferenceBlockSplittingVariantIterator(const std::vector<std::shared_ptr<htsFile>> variant_files, const std::vector<std::shared_ptr<bcf_hdr_t>> variant_headers) :
MultipleVariantIterator {variant_files, variant_headers},
m_pending_variants {},
m_split_variants {}
{
m_split_variants.reserve(variant_files.size());
fetch_next_split_vector();
}

std::vector<Variant>& ReferenceBlockSplittingVariantIterator::operator*() {
return m_split_variants;
}

std::vector<Variant>& ReferenceBlockSplittingVariantIterator::operator++() {
fetch_next_split_vector();
return m_split_variants;
}

// NOTE: this method does the minimal work necessary to determine that we have reached the end of iteration
// it is NOT a valid general-purpose inequality method
bool ReferenceBlockSplittingVariantIterator::operator!=(const ReferenceBlockSplittingVariantIterator& rhs) {
return !(m_pending_variants.empty() && rhs.m_pending_variants.empty()
&& m_split_variants.empty() && rhs.m_split_variants.empty());
}

void ReferenceBlockSplittingVariantIterator::populate_pending () {
for (const auto& variant : MultipleVariantIterator::operator*()) {
m_pending_min_end = std::min(m_pending_min_end, variant.alignment_stop());
m_pending_variants.push_front(std::move(variant));
}

MultipleVariantIterator::operator++();
}

void ReferenceBlockSplittingVariantIterator::populate_split_variants () {
// m_pending_chrom can only change when pending is empty
auto new_pending_start = -1;
auto new_pending_end = UINT_MAX;
//default value of reference if a REF interval is split
auto new_reference_allele = 'N';
//Try to see if the next record begins immediately after end of current interval
//If yes, then new_reference_base can be obtained from the next record
//FIXME: should remove these checks to avoid overhead every time
auto& next_pos_variant_vector = MultipleVariantIterator::operator*();
if (!next_pos_variant_vector.empty()
&& next_pos_variant_vector[0].chromosome() == m_pending_chrom
&& next_pos_variant_vector[0].alignment_start() == m_pending_min_end+1
&& !gamgee::missing(next_pos_variant_vector[0].ref()))
new_reference_allele = next_pos_variant_vector[0].ref()[0]; //only the first character is needed

// advance iter by erase() or ++ depending on if
for (auto iter = m_pending_variants.begin(); iter != m_pending_variants.end(); ) {
auto& variant = *iter;
auto var_end = variant.alignment_stop();
// don't split reference blocks which end at the correct point
// or variants with actual alt alleles
if (var_end == m_pending_min_end || variant.alt().size() > 1) {
m_split_variants.push_back(std::move(variant));
iter = m_pending_variants.erase(iter);
}
else {
// this is a reference block that extends past the desired end, so split it
auto split_variant = variant;
split_variant.set_alignment_stop(m_pending_min_end);
m_split_variants.push_back(std::move(split_variant));

new_pending_start = m_pending_min_end + 1;
new_pending_end = std::min(new_pending_end, var_end);
variant.set_alignment_start(new_pending_start);
variant.set_alignment_stop(var_end); // stop is internally an offset to start, so we need to reset it after updating stop
variant.set_reference_allele(new_reference_allele);
iter++;
}
}

if (new_pending_start != -1) {
m_pending_start = new_pending_start;
m_pending_min_end = new_pending_end;
}

}

void ReferenceBlockSplittingVariantIterator::fetch_next_split_vector() {
m_split_variants.clear();
// run until we have split variants or the incoming pre-split vector is empty, indicating iteration is done
// if the incoming vector is empty, we also end with an empty vector to indicate our iteration is done
while (m_split_variants.empty() && ! MultipleVariantIterator::operator*().empty()) {
auto& incoming = MultipleVariantIterator::operator*();
// when the pending list is empty, fill from the incoming vector and consume it
if (m_pending_variants.empty()) {
m_pending_chrom = incoming[0].chromosome();
m_pending_start = incoming[0].alignment_start();
m_pending_min_end = UINT_MAX;
populate_pending();
}
else {
// if the incoming vector has the same start location as pending, add it to pending and consume it
if (!incoming.empty()
&& incoming[0].chromosome() == m_pending_chrom
&& incoming[0].alignment_start() == m_pending_start)
populate_pending();
}

// refresh incoming because it may have advanced
incoming = MultipleVariantIterator::operator*();

// if the pending Variants' end is after the incoming vector's start, then their end is too high
if (!incoming.empty() && incoming[0].chromosome() == m_pending_chrom)
m_pending_min_end = std::min(m_pending_min_end, incoming[0].alignment_start() - 1);

populate_split_variants();
}

// we may need one final populate for the last ref block
if (m_split_variants.empty() && !m_pending_variants.empty())
populate_split_variants();
}

}

94 changes: 94 additions & 0 deletions gamgee/reference_block_splitting_variant_iterator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#ifndef __gamgee__reference_block_splitting_variant_iterator__
#define __gamgee__reference_block_splitting_variant_iterator__

#include "variant.h"
#include "multiple_variant_iterator.h"

#include <list>

namespace gamgee {

/**
* @brief Utility class to handle reference blocks while iterating over multiple variant files
*
* @warn This class is experimental/WIP
*/
class ReferenceBlockSplittingVariantIterator : public MultipleVariantIterator {
public:

/**
* @brief creates an empty iterator (used for the end() method)
*/
ReferenceBlockSplittingVariantIterator() = default;

/**
* @brief initializes a new iterator based on a vector of input files (vcf or bcf)
*
* @param variant_files vector of vcf/bcf files opened via the bcf_open() macro from htslib
* @param variant_headers vector of variant headers corresponding to these files
*/
ReferenceBlockSplittingVariantIterator(const std::vector<std::shared_ptr<htsFile>> variant_files, const std::vector<std::shared_ptr<bcf_hdr_t>> variant_headers);

/**
* @brief a ReferenceBlockSplittingVariantIterator move constructor guarantees all objects will have the same state.
*/
ReferenceBlockSplittingVariantIterator(ReferenceBlockSplittingVariantIterator&&) = default;
ReferenceBlockSplittingVariantIterator& operator=(ReferenceBlockSplittingVariantIterator&&) = default;

/**
* @brief a ReferenceBlockSplittingVariantIterator cannot be copied.
*/
ReferenceBlockSplittingVariantIterator(const ReferenceBlockSplittingVariantIterator&) = delete;
ReferenceBlockSplittingVariantIterator& operator=(const ReferenceBlockSplittingVariantIterator&) = delete;

/**
* @brief pseudo-inequality operator (needed by for-each loop)
*
* @warning this method does the minimal work necessary to determine that we have reached the end of iteration.
* it is NOT a valid general-purpose inequality method.
*
* @param rhs the other ReferenceBlockSplittingVariantIterator to compare to
*
* @return whether both iterators have entered their end states
*/
bool operator!=(const ReferenceBlockSplittingVariantIterator& rhs);

/**
* @brief dereference operator (needed by for-each loop)
*
* @return a reference to the iterator's Variant vector
*/
std::vector<Variant>& operator*();

/**
* @brief advances the iterator, fetching the next vector
*
* @return a reference to the iterator's Variant vector
*/
std::vector<Variant>& operator++();

private:
// fetches the next reference-block-split Variant vector
// calls populate_pending() and populate_split_variants() as needed
void fetch_next_split_vector();

// populates the list of pending variants from the incoming vector of pre-split reference-block variants
inline void populate_pending();

// populates the vector of split variants from the list of pending variants, modifying the pending list as well
inline void populate_split_variants();

// holds the incoming reference-block variants before and during split operations
std::list<Variant> m_pending_variants;

// caches next reference-block-split Variant vector
std::vector<Variant> m_split_variants;

unsigned int m_pending_chrom = UINT_MAX;
unsigned int m_pending_start = UINT_MAX;
unsigned int m_pending_min_end = UINT_MAX;
};

} // end namespace gamgee

#endif // __gamgee__reference_block_splitting_variant_iterator__
33 changes: 33 additions & 0 deletions gamgee/variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,39 @@ class Variant {

friend class VariantWriter;
friend class VariantBuilder; ///< builder needs access to the internals in order to build efficiently

// TODO: remove this friendship and these mutators after Issue #320 is resolved

friend class ReferenceBlockSplittingVariantIterator;

inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; }
inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; }

inline void set_reference_allele(const char* ref, const int32_t ref_length)
{
bcf_unpack(m_body.get(), BCF_UN_STR);
//Try to avoid calling bcf_update_alleles as it causes significant overhead in
//terms of memory re-allocation etc.
//If enough space is available, over-write the current reference allele value
//Seems like an un-necessary hack, but trust me every single such overhead counts
if(m_body->rlen >= ref_length)
{
memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy
m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option
m_body->d.shared_dirty |= BCF1_DIRTY_ALS;
}
else
{
//expensive - causes significant reallocs within htslib
//hacks to interpret const ptrs to non-const. Again, saves some overhead
//No need to free d.allele[0] because it points to shared string within bcf1_1.shared
m_body->d.allele[0] = const_cast<char*>(ref);
//Re-use same d.allele char** as update_alleles writes to different region of memory anyway
bcf_update_alleles(const_cast<const bcf_hdr_t*>(m_header.get()), m_body.get(), const_cast<const char**>(m_body->d.allele), m_body->n_allele);
}
}
inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast<int32_t>(strlen(ref))); }
inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); }
};

} // end of namespace
Expand Down
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(SOURCE_FILES
main.cpp
missing_test.cpp
read_group_test.cpp
reference_block_splitting_variant_reader_test.cpp
reference_test.cpp
sam_builder_test.cpp
sam_header_test.cpp
Expand Down
Loading

0 comments on commit 3631525

Please sign in to comment.