From 0e6351ca037e424c735e0f87b644c123ca817f21 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 21 Jul 2014 15:42:32 -0400 Subject: [PATCH 1/4] ReferenceBlockSplittingVariantIterator - uses htslib hack to modify Variant (TODO: use VariantBuilder) - TODO: test ref bases --- gamgee/CMakeLists.txt | 2 + gamgee/gamgee.h | 1 + ...rence_block_splitting_variant_iterator.cpp | 112 +++++++++++++++++ ...ference_block_splitting_variant_iterator.h | 94 ++++++++++++++ gamgee/variant.h | 4 + test/CMakeLists.txt | 1 + ...ce_block_splitting_variant_reader_test.cpp | 119 ++++++++++++++++++ testdata/ref_block/problem1.vcf | 21 ++++ testdata/ref_block/problem2_file1.vcf | 25 ++++ testdata/ref_block/problem2_file2.vcf | 17 +++ testdata/ref_block/problem3_file1.vcf | 24 ++++ testdata/ref_block/problem3_file2.vcf | 15 +++ testdata/ref_block/test1.vcf | 22 ++++ testdata/ref_block/test2.vcf | 22 ++++ testdata/ref_block/test3.vcf | 24 ++++ testdata/ref_block/test4.vcf | 23 ++++ testdata/ref_block/test5.vcf | 22 ++++ 17 files changed, 548 insertions(+) create mode 100644 gamgee/reference_block_splitting_variant_iterator.cpp create mode 100644 gamgee/reference_block_splitting_variant_iterator.h create mode 100644 test/reference_block_splitting_variant_reader_test.cpp create mode 100644 testdata/ref_block/problem1.vcf create mode 100644 testdata/ref_block/problem2_file1.vcf create mode 100644 testdata/ref_block/problem2_file2.vcf create mode 100644 testdata/ref_block/problem3_file1.vcf create mode 100644 testdata/ref_block/problem3_file2.vcf create mode 100644 testdata/ref_block/test1.vcf create mode 100644 testdata/ref_block/test2.vcf create mode 100644 testdata/ref_block/test3.vcf create mode 100644 testdata/ref_block/test4.vcf create mode 100644 testdata/ref_block/test5.vcf diff --git a/gamgee/CMakeLists.txt b/gamgee/CMakeLists.txt index 7413f08c9..335d75b25 100644 --- a/gamgee/CMakeLists.txt +++ b/gamgee/CMakeLists.txt @@ -33,6 +33,8 @@ set(SOURCE_FILES read_bases.h read_group.cpp read_group.h + reference_block_splitting_variant_iterator.cpp + reference_block_splitting_variant_iterator.h reference_iterator.cpp reference_iterator.h reference_map.cpp diff --git a/gamgee/gamgee.h b/gamgee/gamgee.h index 8ec1774b9..f51436774 100644 --- a/gamgee/gamgee.h +++ b/gamgee/gamgee.h @@ -21,6 +21,7 @@ #include "multiple_variant_iterator.h" #include "multiple_variant_reader.h" #include "read_bases.h" +#include "reference_block_splitting_variant_iterator.h" #include "reference_iterator.h" #include "reference_map.h" #include "sam.h" diff --git a/gamgee/reference_block_splitting_variant_iterator.cpp b/gamgee/reference_block_splitting_variant_iterator.cpp new file mode 100644 index 000000000..4484e5ab1 --- /dev/null +++ b/gamgee/reference_block_splitting_variant_iterator.cpp @@ -0,0 +1,112 @@ +#include "reference_block_splitting_variant_iterator.h" + +namespace gamgee { + +ReferenceBlockSplittingVariantIterator::ReferenceBlockSplittingVariantIterator(const std::vector> variant_files, const std::vector> variant_headers) : + MultipleVariantIterator {variant_files, variant_headers}, + m_pending_variants {}, + m_split_variants {} +{ + m_split_variants.reserve(variant_files.size()); + fetch_next_split_vector(); +} + +std::vector& ReferenceBlockSplittingVariantIterator::operator*() { + return m_split_variants; +} + +std::vector& ReferenceBlockSplittingVariantIterator::operator++() { + fetch_next_split_vector(); + return m_split_variants; +} + +// NOTE: this method does the minimal work necessary to determine that we have reached the end of iteration +// it is NOT a valid general-purpose inequality method +bool ReferenceBlockSplittingVariantIterator::operator!=(const ReferenceBlockSplittingVariantIterator& rhs) { + return !(m_pending_variants.empty() && rhs.m_pending_variants.empty() + && m_split_variants.empty() && rhs.m_split_variants.empty()); +} + +void ReferenceBlockSplittingVariantIterator::populate_pending () { + for (const auto& variant : MultipleVariantIterator::operator*()) { + m_pending_min_end = std::min(m_pending_min_end, variant.alignment_stop()); + m_pending_variants.push_front(std::move(variant)); + } + + MultipleVariantIterator::operator++(); +} + +void ReferenceBlockSplittingVariantIterator::populate_split_variants () { + // m_pending_chrom can only change when pending is empty + auto new_pending_start = -1; + auto new_pending_end = UINT_MAX; + + // advance iter by erase() or ++ depending on if + for (auto iter = m_pending_variants.begin(); iter != m_pending_variants.end(); ) { + auto& variant = *iter; + auto var_end = variant.alignment_stop(); + // don't split reference blocks which end at the correct point + // or variants with actual alt alleles + if (var_end == m_pending_min_end || variant.alt().size() > 1) { + m_split_variants.push_back(std::move(variant)); + iter = m_pending_variants.erase(iter); + } + else { + // this is a reference block that extends past the desired end, so split it + auto split_variant = variant; + split_variant.set_alignment_stop(m_pending_min_end); + m_split_variants.push_back(std::move(split_variant)); + + new_pending_start = m_pending_min_end + 1; + new_pending_end = std::min(new_pending_end, var_end); + variant.set_alignment_start(new_pending_start); + variant.set_alignment_stop(var_end); // stop is internally an offset to start, so we need to reset it after updating stop + iter++; + } + } + + if (new_pending_start != -1) { + m_pending_start = new_pending_start; + m_pending_min_end = new_pending_end; + } + +} + +void ReferenceBlockSplittingVariantIterator::fetch_next_split_vector() { + m_split_variants.clear(); + // run until we have split variants or the incoming pre-split vector is empty, indicating iteration is done + // if the incoming vector is empty, we also end with an empty vector to indicate our iteration is done + while (m_split_variants.empty() && ! MultipleVariantIterator::operator*().empty()) { + auto& incoming = MultipleVariantIterator::operator*(); + // when the pending list is empty, fill from the incoming vector and consume it + if (m_pending_variants.empty()) { + m_pending_chrom = incoming[0].chromosome(); + m_pending_start = incoming[0].alignment_start(); + m_pending_min_end = UINT_MAX; + populate_pending(); + } + else { + // if the incoming vector has the same start location as pending, add it to pending and consume it + if (!incoming.empty() + && incoming[0].chromosome() == m_pending_chrom + && incoming[0].alignment_start() == m_pending_start) + populate_pending(); + } + + // refresh incoming because it may have advanced + incoming = MultipleVariantIterator::operator*(); + + // if the pending Variants' end is after the incoming vector's start, then their end is too high + if (!incoming.empty() && incoming[0].chromosome() == m_pending_chrom) + m_pending_min_end = std::min(m_pending_min_end, incoming[0].alignment_start() - 1); + + populate_split_variants(); + } + + // we may need one final populate for the last ref block + if (m_split_variants.empty() && !m_pending_variants.empty()) + populate_split_variants(); +} + +} + diff --git a/gamgee/reference_block_splitting_variant_iterator.h b/gamgee/reference_block_splitting_variant_iterator.h new file mode 100644 index 000000000..374b40c8b --- /dev/null +++ b/gamgee/reference_block_splitting_variant_iterator.h @@ -0,0 +1,94 @@ +#ifndef __gamgee__reference_block_splitting_variant_iterator__ +#define __gamgee__reference_block_splitting_variant_iterator__ + +#include "variant.h" +#include "multiple_variant_iterator.h" + +#include + +namespace gamgee { + +/** + * @brief Utility class to handle reference blocks while iterating over multiple variant files + * + * @warn This class is experimental/WIP + */ +class ReferenceBlockSplittingVariantIterator : public MultipleVariantIterator { + public: + + /** + * @brief creates an empty iterator (used for the end() method) + */ + ReferenceBlockSplittingVariantIterator() = default; + + /** + * @brief initializes a new iterator based on a vector of input files (vcf or bcf) + * + * @param variant_files vector of vcf/bcf files opened via the bcf_open() macro from htslib + * @param variant_headers vector of variant headers corresponding to these files + */ + ReferenceBlockSplittingVariantIterator(const std::vector> variant_files, const std::vector> variant_headers); + + /** + * @brief a ReferenceBlockSplittingVariantIterator move constructor guarantees all objects will have the same state. + */ + ReferenceBlockSplittingVariantIterator(ReferenceBlockSplittingVariantIterator&&) = default; + ReferenceBlockSplittingVariantIterator& operator=(ReferenceBlockSplittingVariantIterator&&) = default; + + /** + * @brief a ReferenceBlockSplittingVariantIterator cannot be copied. + */ + ReferenceBlockSplittingVariantIterator(const ReferenceBlockSplittingVariantIterator&) = delete; + ReferenceBlockSplittingVariantIterator& operator=(const ReferenceBlockSplittingVariantIterator&) = delete; + + /** + * @brief pseudo-inequality operator (needed by for-each loop) + * + * @warning this method does the minimal work necessary to determine that we have reached the end of iteration. + * it is NOT a valid general-purpose inequality method. + * + * @param rhs the other ReferenceBlockSplittingVariantIterator to compare to + * + * @return whether both iterators have entered their end states + */ + bool operator!=(const ReferenceBlockSplittingVariantIterator& rhs); + + /** + * @brief dereference operator (needed by for-each loop) + * + * @return a reference to the iterator's Variant vector + */ + std::vector& operator*(); + + /** + * @brief advances the iterator, fetching the next vector + * + * @return a reference to the iterator's Variant vector + */ + std::vector& operator++(); + + private: + // fetches the next reference-block-split Variant vector + // calls populate_pending() and populate_split_variants() as needed + void fetch_next_split_vector(); + + // populates the list of pending variants from the incoming vector of pre-split reference-block variants + inline void populate_pending(); + + // populates the vector of split variants from the list of pending variants, modifying the pending list as well + inline void populate_split_variants(); + + // holds the incoming reference-block variants before and during split operations + std::list m_pending_variants; + + // caches next reference-block-split Variant vector + std::vector m_split_variants; + + unsigned int m_pending_chrom = UINT_MAX; + unsigned int m_pending_start = UINT_MAX; + unsigned int m_pending_min_end = UINT_MAX; +}; + +} // end namespace gamgee + +#endif // __gamgee__reference_block_splitting_variant_iterator__ diff --git a/gamgee/variant.h b/gamgee/variant.h index 0cd836464..2a84ac0e8 100644 --- a/gamgee/variant.h +++ b/gamgee/variant.h @@ -84,6 +84,10 @@ class Variant { SharedField shared_field_as_float(const int32_t index) const; ///< same as float_shared_field but will attempt to convert underlying data to float if possible. @warning creates a new object but makes no copies of the underlying values. SharedField shared_field_as_string(const int32_t index) const; ///< same as string_shared_field but will attempt to convert underlying data to string if possible. @warning creates a new object but makes no copies of the underlying values. + // nasty temporary hacks for Joel + inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; } + inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; } + /** * @brief functional-style set logic operations for variant field vectors * diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d9d0d24bd..faf8377a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,6 +8,7 @@ set(SOURCE_FILES main.cpp missing_test.cpp read_group_test.cpp + reference_block_splitting_variant_reader_test.cpp reference_test.cpp sam_builder_test.cpp sam_header_test.cpp diff --git a/test/reference_block_splitting_variant_reader_test.cpp b/test/reference_block_splitting_variant_reader_test.cpp new file mode 100644 index 000000000..422b35ffa --- /dev/null +++ b/test/reference_block_splitting_variant_reader_test.cpp @@ -0,0 +1,119 @@ +#include "multiple_variant_reader.h" +#include "reference_block_splitting_variant_iterator.h" + +#include "test_utils.h" + +#include + +using namespace std; +using namespace gamgee; + +using GVCFReader = MultipleVariantReader; + +const auto test_files = vector{"testdata/ref_block/test1.vcf", "testdata/ref_block/test2.vcf", "testdata/ref_block/test3.vcf", + "testdata/ref_block/test4.vcf", "testdata/ref_block/test5.vcf"}; +const auto truth_contigs = vector{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; +const auto truth_block_starts = vector{1, 20, 41, 42, 50, 61, 81, 101, 102, 103, 151, 201, 300, 401, 600, 650, 701, 10}; +const auto truth_block_stops = vector{19, 40, 41, 49, 60, 80, 100, 101, 102, 150, 200, 299, 400, 500, 649, 700, 750, 11}; +// note: final block is not a ref block, but the block ends at start + 1 because the reference allele length is 2 +//TODO const auto truth_refs = vector{"T", "T", "T", "T", "T", "61", "81", "T", "T", "T", "151", "201", "T", "401", "T", "T", "701", "GG"}; + +BOOST_AUTO_TEST_CASE( split_reference_blocks ) +{ + auto reader = GVCFReader{test_files, false}; + auto position_counter = 0u; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); + //TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + } + ++position_counter; + } + BOOST_CHECK_EQUAL(position_counter, 18u); +} + +BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) { + auto reader0 = MultipleVariantReader{test_files, false}; + auto iter0 = reader0.begin(); + + auto reader1 = MultipleVariantReader{test_files, false}; + auto iter1 = reader1.begin(); + auto moved = check_move_constructor(iter1); + + auto record0 = *iter0; + auto moved_record = *iter1; + const auto position_counter = 0; + for (const auto vec : {record0, moved_record}) { + for (const auto record : vec) { + BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); + //TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + } + } +} + +// test cases based on problematic regions from real 1000 Genomes data +// mainly involving long overlapping reference alleles + +const auto p1_file = "testdata/ref_block/problem1.vcf"; +const auto p1_block_starts = vector{11140505, 11140610, 11140611, 11140620, 11140621, 11140623, 11140629, 11140768}; +const auto p1_block_stops = vector{11140609, 11140610, 11140619, 11140628, 11140628, 11140628, 11140658, 11141021}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_1 ) { + auto counter = 0u; + auto reader = GVCFReader{{p1_file}}; + for (const auto& vec : reader) { + BOOST_CHECK(vec.size() == 1u); + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p1_block_starts[counter]); + BOOST_CHECK(record.alignment_stop() == p1_block_stops[counter]); + counter++; + } + } + BOOST_CHECK(counter == 8u); +} + +const auto p2_files = vector{"testdata/ref_block/problem2_file1.vcf", "testdata/ref_block/problem2_file2.vcf"}; +const auto p2_block_starts = vector{3319304, 3319319, 3319404, 3319407, 3319410, + 3319413, 3319433, 3319447, 3319558, 3319565, 3319577, 3319599, 3319600, 3319601, 3319602}; +const auto p2_block_stops = vector{3319318, 3319403, 3319406, 3319409, 3319412, + 3319432, 3319446, 3319557, 3319564, 3319576, 3319598, 3319599, 3319600, 3319601, 3319614}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_2 ) { + auto counter = 0u; + auto reader = GVCFReader{p2_files}; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p2_block_starts[counter]); + if (record.alt().size() == 1) + BOOST_CHECK(record.alignment_stop() == p2_block_stops[counter]); + } + counter++; + } + + BOOST_CHECK(counter == p2_block_starts.size()); +} + +const auto p3_files = vector{"testdata/ref_block/problem3_file1.vcf", "testdata/ref_block/problem3_file2.vcf"}; +const auto p3_block_starts = vector{3319304, 3319319, 3319404, 3319407, 3319410, + 3319413, 3319433, 3319447, 3319558, 3319565, 3319577, 3319599, 3319600}; +const auto p3_block_stops = vector{3319318, 3319403, 3319406, 3319409, 3319412, + 3319432, 3319446, 3319557, 3319564, 3319576, 3319598, 3319599, 3319600}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_3 ) { + auto counter = 0u; + auto reader = GVCFReader{p3_files}; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p3_block_starts[counter]); + if (record.alt().size() == 1) + BOOST_CHECK(record.alignment_stop() == p3_block_stops[counter]); + } + counter++; + } + + BOOST_CHECK(counter == p3_block_starts.size()); +} diff --git a/testdata/ref_block/problem1.vcf b/testdata/ref_block/problem1.vcf new file mode 100644 index 000000000..97abe862b --- /dev/null +++ b/testdata/ref_block/problem1.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE +1 11140505 . C . . END=11140609 GT:DP:GQ:MIN_DP:PL 0/0:102:99:64:0,117,1755 +1 11140610 . G . . END=11140610 GT:DP:GQ:MIN_DP:PL 0/0:65:0:65:0,0,1851 +1 11140611 . A . . END=11140619 GT:DP:GQ:MIN_DP:PL 0/0:83:69:82:0,60,900 +1 11140620 . AACACACAC A,AAC,AACACACACAC, 0.01 . . GT:AD:DP:GQ:PL:SB 0/3:20,0,0,5,0:25:9:198,444,4343,424,3388,3263,0,1715,1379,2412,9,694,634,502,457:6,14,3,2 +1 11140621 . ACACACAC A,AACACAC,AAC, 142.73 . . GT:AD:DP:GQ:PL:SB 0/2:41,0,10,2,0:53:99:204,420,4129,0,2639,2527,217,3164,2429,3061,180,2736,2400,2563,2525:18,23,2,8 +1 11140623 . ACACAC A,AAC, 0 . . GT:AD:DP:GQ:PL:SB 0/0:46,3,7,0:56:58:0,127,3660,58,2726,2688,145,2832,2670,2756:0,0,0,0 +1 11140629 . A . . END=11140658 GT:DP:GQ:MIN_DP:PL 0/0:76:99:73:0,120,1800 +1 11140768 . A . . END=11141021 GT:DP:GQ:MIN_DP:PL 0/0:127:99:93:0,120,1800 diff --git a/testdata/ref_block/problem2_file1.vcf b/testdata/ref_block/problem2_file1.vcf new file mode 100644 index 000000000..7ad227aa2 --- /dev/null +++ b/testdata/ref_block/problem2_file1.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 +1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 +1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 +1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 +1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 +1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 +1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 +1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 +1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 +1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 +1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 +1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 +1 3319601 . ACCCTCCTCTGAGTCTTCCTCCCCTTCCCGTG A, 155.74 . DP=13;MLEAC=2,0;MLEAF=1.00,0.00;MQ=61.16;MQ0=0 GT:AD:DP:GQ:PL:SB 1/1:0,4,0:4:32:198,32,0,198,32,198:0,0,0,0 diff --git a/testdata/ref_block/problem2_file2.vcf b/testdata/ref_block/problem2_file2.vcf new file mode 100644 index 000000000..f5db152c2 --- /dev/null +++ b/testdata/ref_block/problem2_file2.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE2 +1 3319304 . C . . END=3319598 GT:DP:GQ:MIN_DP:PL 0/0:151:99:66:0,63,1800 +1 3319599 . G . . END=3319599 GT:DP:GQ:MIN_DP:PL 0/0:101:52:101:0,52,2707 +1 3319600 . T . . END=3319601 GT:DP:GQ:MIN_DP:PL 0/0:100:0:87:0,0,1855 +1 3319602 . C . . END=3319614 GT:DP:GQ:MIN_DP:PL 0/0:84:99:71:0,120,1800 diff --git a/testdata/ref_block/problem3_file1.vcf b/testdata/ref_block/problem3_file1.vcf new file mode 100644 index 000000000..d6cb82a47 --- /dev/null +++ b/testdata/ref_block/problem3_file1.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 +1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 +1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 +1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 +1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 +1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 +1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 +1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 +1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 +1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 +1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 +1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 diff --git a/testdata/ref_block/problem3_file2.vcf b/testdata/ref_block/problem3_file2.vcf new file mode 100644 index 000000000..482e2eede --- /dev/null +++ b/testdata/ref_block/problem3_file2.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE2 +1 3319304 . C . . END=3319598 GT:DP:GQ:MIN_DP:PL 0/0:151:99:66:0,63,1800 +1 3319599 . G . . END=3319599 GT:DP:GQ:MIN_DP:PL 0/0:101:52:101:0,52,2707 diff --git a/testdata/ref_block/test1.vcf b/testdata/ref_block/test1.vcf new file mode 100644 index 000000000..a11d1b025 --- /dev/null +++ b/testdata/ref_block/test1.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=100;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 101 . T . . END=200;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test2.vcf b/testdata/ref_block/test2.vcf new file mode 100644 index 000000000..dd2c44b58 --- /dev/null +++ b/testdata/ref_block/test2.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 50 . T . . END=150;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 300 . T . . END=400;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test3.vcf b/testdata/ref_block/test3.vcf new file mode 100644 index 000000000..11fc2cbfb --- /dev/null +++ b/testdata/ref_block/test3.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=40;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 41 . T C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 42 . T . . END=60;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 650 . T . . END=750;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test4.vcf b/testdata/ref_block/test4.vcf new file mode 100644 index 000000000..20cfaa50c --- /dev/null +++ b/testdata/ref_block/test4.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=101;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 102 . T C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 103 . T . . END=500;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test5.vcf b/testdata/ref_block/test5.vcf new file mode 100644 index 000000000..bd98de167 --- /dev/null +++ b/testdata/ref_block/test5.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 20 . T . . END=80;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 600 . T . . END=700;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA From ec51df4be10abaa1f161323a748bb09195e48f6f Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 2 Oct 2014 16:36:32 -0400 Subject: [PATCH 2/4] MultipleVariantReader changes causes the test case for Issue #209 to pass - but the issue still exists, so add a new test case --- test/variant_reader_test.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/variant_reader_test.cpp b/test/variant_reader_test.cpp index 2ed9af686..6792f2af6 100644 --- a/test/variant_reader_test.cpp +++ b/test/variant_reader_test.cpp @@ -878,23 +878,30 @@ void single_variant_reader_sample_test(const string filename, const vector{}, true, 0); // exclude all samples (sites-only) + // Fails: Issue #209 + //single_variant_reader_sample_test("testdata/test_variants.bcf", vector{}, true, 0); // exclude all samples (sites-only) } BOOST_AUTO_TEST_CASE( single_variant_reader_include_all_samples ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{}, false, 3); // include all samples by setting include == false and passing an empty list + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{}, false, 3); // include all samples by setting include == false and passing an empty list } BOOST_AUTO_TEST_CASE( single_variant_reader_including ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12878"}, true, 1); // include only NA12878 single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12878", "NA12892"}, true, 2); // include both these samples + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12878"}, true, 1); // include only NA12878 + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12878", "NA12892"}, true, 2); // include both these samples } BOOST_AUTO_TEST_CASE( single_variant_reader_excluding ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12891"}, false, 2); // exclude only NA12891 single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12891", "NA12878"}, false, 1); // exclude both these samples + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12891"}, false, 2); // exclude only NA12891 + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12891", "NA12878"}, false, 1); // exclude both these samples } BOOST_AUTO_TEST_CASE( single_variant_reader_missing_data ) @@ -1033,12 +1040,10 @@ void multiple_variant_reader_sample_test(const vector samples, const boo BOOST_CHECK_EQUAL(reader.combined_header().n_samples(), desired_samples); } -/* TODO: Issue #209 BOOST_AUTO_TEST_CASE( multiple_variant_reader_sites_only ) { multiple_variant_reader_sample_test(vector{}, true, 0); // exclude all samples (sites-only) } -*/ BOOST_AUTO_TEST_CASE( multiple_variant_reader_include_all_samples ) { From 6a0dbf198af0b7ccbe52d31dc84a222443345660 Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Fri, 24 Oct 2014 15:43:46 -0700 Subject: [PATCH 3/4] 1. Modified ReferenceBlockSplittingVariantIterator to insert 'N' as the reference allele for a split GVCF interval. This is done by introducing a function set_reference_allele() in Variant. 2. Uncommented the test cases for ReferenceBlockSplittingVariantIteratorr reference allele 3. The problem*.vcf files failed because their headers did not declare an END field. htslib assumes that fields which are not declared are of type String. Declaring END as an integer field in these files solves the problem. --- ...rence_block_splitting_variant_iterator.cpp | 12 +++++++++ gamgee/variant.h | 26 +++++++++++++++++++ ...ce_block_splitting_variant_reader_test.cpp | 8 +++--- testdata/ref_block/problem1.vcf | 1 + testdata/ref_block/problem2_file1.vcf | 1 + testdata/ref_block/problem2_file2.vcf | 1 + testdata/ref_block/problem3_file1.vcf | 1 + testdata/ref_block/problem3_file2.vcf | 1 + 8 files changed, 47 insertions(+), 4 deletions(-) diff --git a/gamgee/reference_block_splitting_variant_iterator.cpp b/gamgee/reference_block_splitting_variant_iterator.cpp index 4484e5ab1..cc664d9cd 100644 --- a/gamgee/reference_block_splitting_variant_iterator.cpp +++ b/gamgee/reference_block_splitting_variant_iterator.cpp @@ -40,6 +40,17 @@ void ReferenceBlockSplittingVariantIterator::populate_split_variants () { // m_pending_chrom can only change when pending is empty auto new_pending_start = -1; auto new_pending_end = UINT_MAX; + //default value of reference if a REF interval is split + auto new_reference_allele = 'N'; + //Try to see if the next record begins immediately after end of current interval + //If yes, then new_reference_base can be obtained from the next record + //FIXME: should remove these checks to avoid overhead every time + auto& next_pos_variant_vector = MultipleVariantIterator::operator*(); + if (!next_pos_variant_vector.empty() + && next_pos_variant_vector[0].chromosome() == m_pending_chrom + && next_pos_variant_vector[0].alignment_start() == m_pending_min_end+1 + && !gamgee::missing(next_pos_variant_vector[0].ref())) + new_reference_allele = next_pos_variant_vector[0].ref()[0]; //only the first character is needed // advance iter by erase() or ++ depending on if for (auto iter = m_pending_variants.begin(); iter != m_pending_variants.end(); ) { @@ -61,6 +72,7 @@ void ReferenceBlockSplittingVariantIterator::populate_split_variants () { new_pending_end = std::min(new_pending_end, var_end); variant.set_alignment_start(new_pending_start); variant.set_alignment_stop(var_end); // stop is internally an offset to start, so we need to reset it after updating stop + variant.set_reference_allele(new_reference_allele); iter++; } } diff --git a/gamgee/variant.h b/gamgee/variant.h index 2a84ac0e8..c39b1a166 100644 --- a/gamgee/variant.h +++ b/gamgee/variant.h @@ -88,6 +88,32 @@ class Variant { inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; } inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; } + inline void set_reference_allele(const char* ref, const int32_t ref_length) + { + bcf_unpack(m_body.get(), BCF_UN_STR); + //Try to avoid calling bcf_update_alleles as it causes significant overhead in + //terms of memory re-allocation etc. + //If enough space is available, over-write the current reference allele value + //Seems like an un-necessary hack, but trust me every single such overhead counts + if(m_body->rlen >= ref_length) + { + memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy + m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option + m_body->d.shared_dirty |= BCF1_DIRTY_ALS; + } + else + { + //expensive - causes significant reallocs within htslib + //hacks to interpret const ptrs to non-const. Again, saves some overhead + //No need to free d.allele[0] because it points to shared string within bcf1_1.shared + m_body->d.allele[0] = const_cast(ref); + //Re-use same d.allele char** as update_alleles writes to different region of memory anyway + bcf_update_alleles(const_cast(m_header.get()), m_body.get(), const_cast(m_body->d.allele), m_body->n_allele); + } + } + inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast(strlen(ref))); } + inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); } + /** * @brief functional-style set logic operations for variant field vectors * diff --git a/test/reference_block_splitting_variant_reader_test.cpp b/test/reference_block_splitting_variant_reader_test.cpp index 422b35ffa..f2ddd39ad 100644 --- a/test/reference_block_splitting_variant_reader_test.cpp +++ b/test/reference_block_splitting_variant_reader_test.cpp @@ -16,7 +16,7 @@ const auto truth_contigs = vector{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, const auto truth_block_starts = vector{1, 20, 41, 42, 50, 61, 81, 101, 102, 103, 151, 201, 300, 401, 600, 650, 701, 10}; const auto truth_block_stops = vector{19, 40, 41, 49, 60, 80, 100, 101, 102, 150, 200, 299, 400, 500, 649, 700, 750, 11}; // note: final block is not a ref block, but the block ends at start + 1 because the reference allele length is 2 -//TODO const auto truth_refs = vector{"T", "T", "T", "T", "T", "61", "81", "T", "T", "T", "151", "201", "T", "401", "T", "T", "701", "GG"}; +const auto truth_refs = vector{"T", "T", "T", "T", "T", "N", "N", "T", "T", "T", "N", "N", "T", "N", "T", "T", "N", "GG"}; BOOST_AUTO_TEST_CASE( split_reference_blocks ) { @@ -27,11 +27,11 @@ BOOST_AUTO_TEST_CASE( split_reference_blocks ) BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); - //TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); } ++position_counter; } - BOOST_CHECK_EQUAL(position_counter, 18u); + BOOST_CHECK_EQUAL(position_counter, truth_contigs.size()); } BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) { @@ -50,7 +50,7 @@ BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) { BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); - //TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); } } } diff --git a/testdata/ref_block/problem1.vcf b/testdata/ref_block/problem1.vcf index 97abe862b..8b535c1cb 100644 --- a/testdata/ref_block/problem1.vcf +++ b/testdata/ref_block/problem1.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.1 ##ALT= ##FILTER= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/testdata/ref_block/problem2_file1.vcf b/testdata/ref_block/problem2_file1.vcf index 7ad227aa2..dba271e49 100644 --- a/testdata/ref_block/problem2_file1.vcf +++ b/testdata/ref_block/problem2_file1.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.1 ##ALT= ##FILTER= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/testdata/ref_block/problem2_file2.vcf b/testdata/ref_block/problem2_file2.vcf index f5db152c2..091417de7 100644 --- a/testdata/ref_block/problem2_file2.vcf +++ b/testdata/ref_block/problem2_file2.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.1 ##ALT= ##FILTER= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/testdata/ref_block/problem3_file1.vcf b/testdata/ref_block/problem3_file1.vcf index d6cb82a47..ee87769e1 100644 --- a/testdata/ref_block/problem3_file1.vcf +++ b/testdata/ref_block/problem3_file1.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.1 ##ALT= ##FILTER= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= diff --git a/testdata/ref_block/problem3_file2.vcf b/testdata/ref_block/problem3_file2.vcf index 482e2eede..84dfbcbb9 100644 --- a/testdata/ref_block/problem3_file2.vcf +++ b/testdata/ref_block/problem3_file2.vcf @@ -1,6 +1,7 @@ ##fileformat=VCFv4.1 ##ALT= ##FILTER= +##INFO= ##FORMAT= ##FORMAT= ##FORMAT= From 473a8adaac17788312e2a4d225041981afc91651 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 3 Nov 2014 16:16:22 -0500 Subject: [PATCH 4/4] Make temporary mutators private --- gamgee/variant.h | 63 +++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/gamgee/variant.h b/gamgee/variant.h index c39b1a166..b80eaf05e 100644 --- a/gamgee/variant.h +++ b/gamgee/variant.h @@ -84,36 +84,6 @@ class Variant { SharedField shared_field_as_float(const int32_t index) const; ///< same as float_shared_field but will attempt to convert underlying data to float if possible. @warning creates a new object but makes no copies of the underlying values. SharedField shared_field_as_string(const int32_t index) const; ///< same as string_shared_field but will attempt to convert underlying data to string if possible. @warning creates a new object but makes no copies of the underlying values. - // nasty temporary hacks for Joel - inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; } - inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; } - - inline void set_reference_allele(const char* ref, const int32_t ref_length) - { - bcf_unpack(m_body.get(), BCF_UN_STR); - //Try to avoid calling bcf_update_alleles as it causes significant overhead in - //terms of memory re-allocation etc. - //If enough space is available, over-write the current reference allele value - //Seems like an un-necessary hack, but trust me every single such overhead counts - if(m_body->rlen >= ref_length) - { - memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy - m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option - m_body->d.shared_dirty |= BCF1_DIRTY_ALS; - } - else - { - //expensive - causes significant reallocs within htslib - //hacks to interpret const ptrs to non-const. Again, saves some overhead - //No need to free d.allele[0] because it points to shared string within bcf1_1.shared - m_body->d.allele[0] = const_cast(ref); - //Re-use same d.allele char** as update_alleles writes to different region of memory anyway - bcf_update_alleles(const_cast(m_header.get()), m_body.get(), const_cast(m_body->d.allele), m_body->n_allele); - } - } - inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast(strlen(ref))); } - inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); } - /** * @brief functional-style set logic operations for variant field vectors * @@ -210,6 +180,39 @@ class Variant { friend class VariantWriter; friend class VariantBuilder; ///< builder needs access to the internals in order to build efficiently + + // TODO: remove this friendship and these mutators after Issue #320 is resolved + + friend class ReferenceBlockSplittingVariantIterator; + + inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; } + inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; } + + inline void set_reference_allele(const char* ref, const int32_t ref_length) + { + bcf_unpack(m_body.get(), BCF_UN_STR); + //Try to avoid calling bcf_update_alleles as it causes significant overhead in + //terms of memory re-allocation etc. + //If enough space is available, over-write the current reference allele value + //Seems like an un-necessary hack, but trust me every single such overhead counts + if(m_body->rlen >= ref_length) + { + memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy + m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option + m_body->d.shared_dirty |= BCF1_DIRTY_ALS; + } + else + { + //expensive - causes significant reallocs within htslib + //hacks to interpret const ptrs to non-const. Again, saves some overhead + //No need to free d.allele[0] because it points to shared string within bcf1_1.shared + m_body->d.allele[0] = const_cast(ref); + //Re-use same d.allele char** as update_alleles writes to different region of memory anyway + bcf_update_alleles(const_cast(m_header.get()), m_body.get(), const_cast(m_body->d.allele), m_body->n_allele); + } + } + inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast(strlen(ref))); } + inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); } }; } // end of namespace