diff --git a/gamgee/CMakeLists.txt b/gamgee/CMakeLists.txt index 7413f08c9..335d75b25 100644 --- a/gamgee/CMakeLists.txt +++ b/gamgee/CMakeLists.txt @@ -33,6 +33,8 @@ set(SOURCE_FILES read_bases.h read_group.cpp read_group.h + reference_block_splitting_variant_iterator.cpp + reference_block_splitting_variant_iterator.h reference_iterator.cpp reference_iterator.h reference_map.cpp diff --git a/gamgee/gamgee.h b/gamgee/gamgee.h index 8ec1774b9..f51436774 100644 --- a/gamgee/gamgee.h +++ b/gamgee/gamgee.h @@ -21,6 +21,7 @@ #include "multiple_variant_iterator.h" #include "multiple_variant_reader.h" #include "read_bases.h" +#include "reference_block_splitting_variant_iterator.h" #include "reference_iterator.h" #include "reference_map.h" #include "sam.h" diff --git a/gamgee/reference_block_splitting_variant_iterator.cpp b/gamgee/reference_block_splitting_variant_iterator.cpp new file mode 100644 index 000000000..cc664d9cd --- /dev/null +++ b/gamgee/reference_block_splitting_variant_iterator.cpp @@ -0,0 +1,124 @@ +#include "reference_block_splitting_variant_iterator.h" + +namespace gamgee { + +ReferenceBlockSplittingVariantIterator::ReferenceBlockSplittingVariantIterator(const std::vector> variant_files, const std::vector> variant_headers) : + MultipleVariantIterator {variant_files, variant_headers}, + m_pending_variants {}, + m_split_variants {} +{ + m_split_variants.reserve(variant_files.size()); + fetch_next_split_vector(); +} + +std::vector& ReferenceBlockSplittingVariantIterator::operator*() { + return m_split_variants; +} + +std::vector& ReferenceBlockSplittingVariantIterator::operator++() { + fetch_next_split_vector(); + return m_split_variants; +} + +// NOTE: this method does the minimal work necessary to determine that we have reached the end of iteration +// it is NOT a valid general-purpose inequality method +bool ReferenceBlockSplittingVariantIterator::operator!=(const ReferenceBlockSplittingVariantIterator& rhs) { + return !(m_pending_variants.empty() && rhs.m_pending_variants.empty() + && m_split_variants.empty() && rhs.m_split_variants.empty()); +} + +void ReferenceBlockSplittingVariantIterator::populate_pending () { + for (const auto& variant : MultipleVariantIterator::operator*()) { + m_pending_min_end = std::min(m_pending_min_end, variant.alignment_stop()); + m_pending_variants.push_front(std::move(variant)); + } + + MultipleVariantIterator::operator++(); +} + +void ReferenceBlockSplittingVariantIterator::populate_split_variants () { + // m_pending_chrom can only change when pending is empty + auto new_pending_start = -1; + auto new_pending_end = UINT_MAX; + //default value of reference if a REF interval is split + auto new_reference_allele = 'N'; + //Try to see if the next record begins immediately after end of current interval + //If yes, then new_reference_base can be obtained from the next record + //FIXME: should remove these checks to avoid overhead every time + auto& next_pos_variant_vector = MultipleVariantIterator::operator*(); + if (!next_pos_variant_vector.empty() + && next_pos_variant_vector[0].chromosome() == m_pending_chrom + && next_pos_variant_vector[0].alignment_start() == m_pending_min_end+1 + && !gamgee::missing(next_pos_variant_vector[0].ref())) + new_reference_allele = next_pos_variant_vector[0].ref()[0]; //only the first character is needed + + // advance iter by erase() or ++ depending on if + for (auto iter = m_pending_variants.begin(); iter != m_pending_variants.end(); ) { + auto& variant = *iter; + auto var_end = variant.alignment_stop(); + // don't split reference blocks which end at the correct point + // or variants with actual alt alleles + if (var_end == m_pending_min_end || variant.alt().size() > 1) { + m_split_variants.push_back(std::move(variant)); + iter = m_pending_variants.erase(iter); + } + else { + // this is a reference block that extends past the desired end, so split it + auto split_variant = variant; + split_variant.set_alignment_stop(m_pending_min_end); + m_split_variants.push_back(std::move(split_variant)); + + new_pending_start = m_pending_min_end + 1; + new_pending_end = std::min(new_pending_end, var_end); + variant.set_alignment_start(new_pending_start); + variant.set_alignment_stop(var_end); // stop is internally an offset to start, so we need to reset it after updating stop + variant.set_reference_allele(new_reference_allele); + iter++; + } + } + + if (new_pending_start != -1) { + m_pending_start = new_pending_start; + m_pending_min_end = new_pending_end; + } + +} + +void ReferenceBlockSplittingVariantIterator::fetch_next_split_vector() { + m_split_variants.clear(); + // run until we have split variants or the incoming pre-split vector is empty, indicating iteration is done + // if the incoming vector is empty, we also end with an empty vector to indicate our iteration is done + while (m_split_variants.empty() && ! MultipleVariantIterator::operator*().empty()) { + auto& incoming = MultipleVariantIterator::operator*(); + // when the pending list is empty, fill from the incoming vector and consume it + if (m_pending_variants.empty()) { + m_pending_chrom = incoming[0].chromosome(); + m_pending_start = incoming[0].alignment_start(); + m_pending_min_end = UINT_MAX; + populate_pending(); + } + else { + // if the incoming vector has the same start location as pending, add it to pending and consume it + if (!incoming.empty() + && incoming[0].chromosome() == m_pending_chrom + && incoming[0].alignment_start() == m_pending_start) + populate_pending(); + } + + // refresh incoming because it may have advanced + incoming = MultipleVariantIterator::operator*(); + + // if the pending Variants' end is after the incoming vector's start, then their end is too high + if (!incoming.empty() && incoming[0].chromosome() == m_pending_chrom) + m_pending_min_end = std::min(m_pending_min_end, incoming[0].alignment_start() - 1); + + populate_split_variants(); + } + + // we may need one final populate for the last ref block + if (m_split_variants.empty() && !m_pending_variants.empty()) + populate_split_variants(); +} + +} + diff --git a/gamgee/reference_block_splitting_variant_iterator.h b/gamgee/reference_block_splitting_variant_iterator.h new file mode 100644 index 000000000..374b40c8b --- /dev/null +++ b/gamgee/reference_block_splitting_variant_iterator.h @@ -0,0 +1,94 @@ +#ifndef __gamgee__reference_block_splitting_variant_iterator__ +#define __gamgee__reference_block_splitting_variant_iterator__ + +#include "variant.h" +#include "multiple_variant_iterator.h" + +#include + +namespace gamgee { + +/** + * @brief Utility class to handle reference blocks while iterating over multiple variant files + * + * @warn This class is experimental/WIP + */ +class ReferenceBlockSplittingVariantIterator : public MultipleVariantIterator { + public: + + /** + * @brief creates an empty iterator (used for the end() method) + */ + ReferenceBlockSplittingVariantIterator() = default; + + /** + * @brief initializes a new iterator based on a vector of input files (vcf or bcf) + * + * @param variant_files vector of vcf/bcf files opened via the bcf_open() macro from htslib + * @param variant_headers vector of variant headers corresponding to these files + */ + ReferenceBlockSplittingVariantIterator(const std::vector> variant_files, const std::vector> variant_headers); + + /** + * @brief a ReferenceBlockSplittingVariantIterator move constructor guarantees all objects will have the same state. + */ + ReferenceBlockSplittingVariantIterator(ReferenceBlockSplittingVariantIterator&&) = default; + ReferenceBlockSplittingVariantIterator& operator=(ReferenceBlockSplittingVariantIterator&&) = default; + + /** + * @brief a ReferenceBlockSplittingVariantIterator cannot be copied. + */ + ReferenceBlockSplittingVariantIterator(const ReferenceBlockSplittingVariantIterator&) = delete; + ReferenceBlockSplittingVariantIterator& operator=(const ReferenceBlockSplittingVariantIterator&) = delete; + + /** + * @brief pseudo-inequality operator (needed by for-each loop) + * + * @warning this method does the minimal work necessary to determine that we have reached the end of iteration. + * it is NOT a valid general-purpose inequality method. + * + * @param rhs the other ReferenceBlockSplittingVariantIterator to compare to + * + * @return whether both iterators have entered their end states + */ + bool operator!=(const ReferenceBlockSplittingVariantIterator& rhs); + + /** + * @brief dereference operator (needed by for-each loop) + * + * @return a reference to the iterator's Variant vector + */ + std::vector& operator*(); + + /** + * @brief advances the iterator, fetching the next vector + * + * @return a reference to the iterator's Variant vector + */ + std::vector& operator++(); + + private: + // fetches the next reference-block-split Variant vector + // calls populate_pending() and populate_split_variants() as needed + void fetch_next_split_vector(); + + // populates the list of pending variants from the incoming vector of pre-split reference-block variants + inline void populate_pending(); + + // populates the vector of split variants from the list of pending variants, modifying the pending list as well + inline void populate_split_variants(); + + // holds the incoming reference-block variants before and during split operations + std::list m_pending_variants; + + // caches next reference-block-split Variant vector + std::vector m_split_variants; + + unsigned int m_pending_chrom = UINT_MAX; + unsigned int m_pending_start = UINT_MAX; + unsigned int m_pending_min_end = UINT_MAX; +}; + +} // end namespace gamgee + +#endif // __gamgee__reference_block_splitting_variant_iterator__ diff --git a/gamgee/variant.h b/gamgee/variant.h index 0cd836464..b80eaf05e 100644 --- a/gamgee/variant.h +++ b/gamgee/variant.h @@ -180,6 +180,39 @@ class Variant { friend class VariantWriter; friend class VariantBuilder; ///< builder needs access to the internals in order to build efficiently + + // TODO: remove this friendship and these mutators after Issue #320 is resolved + + friend class ReferenceBlockSplittingVariantIterator; + + inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; } + inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; } + + inline void set_reference_allele(const char* ref, const int32_t ref_length) + { + bcf_unpack(m_body.get(), BCF_UN_STR); + //Try to avoid calling bcf_update_alleles as it causes significant overhead in + //terms of memory re-allocation etc. + //If enough space is available, over-write the current reference allele value + //Seems like an un-necessary hack, but trust me every single such overhead counts + if(m_body->rlen >= ref_length) + { + memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy + m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option + m_body->d.shared_dirty |= BCF1_DIRTY_ALS; + } + else + { + //expensive - causes significant reallocs within htslib + //hacks to interpret const ptrs to non-const. Again, saves some overhead + //No need to free d.allele[0] because it points to shared string within bcf1_1.shared + m_body->d.allele[0] = const_cast(ref); + //Re-use same d.allele char** as update_alleles writes to different region of memory anyway + bcf_update_alleles(const_cast(m_header.get()), m_body.get(), const_cast(m_body->d.allele), m_body->n_allele); + } + } + inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast(strlen(ref))); } + inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); } }; } // end of namespace diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d9d0d24bd..faf8377a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,6 +8,7 @@ set(SOURCE_FILES main.cpp missing_test.cpp read_group_test.cpp + reference_block_splitting_variant_reader_test.cpp reference_test.cpp sam_builder_test.cpp sam_header_test.cpp diff --git a/test/reference_block_splitting_variant_reader_test.cpp b/test/reference_block_splitting_variant_reader_test.cpp new file mode 100644 index 000000000..f2ddd39ad --- /dev/null +++ b/test/reference_block_splitting_variant_reader_test.cpp @@ -0,0 +1,119 @@ +#include "multiple_variant_reader.h" +#include "reference_block_splitting_variant_iterator.h" + +#include "test_utils.h" + +#include + +using namespace std; +using namespace gamgee; + +using GVCFReader = MultipleVariantReader; + +const auto test_files = vector{"testdata/ref_block/test1.vcf", "testdata/ref_block/test2.vcf", "testdata/ref_block/test3.vcf", + "testdata/ref_block/test4.vcf", "testdata/ref_block/test5.vcf"}; +const auto truth_contigs = vector{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; +const auto truth_block_starts = vector{1, 20, 41, 42, 50, 61, 81, 101, 102, 103, 151, 201, 300, 401, 600, 650, 701, 10}; +const auto truth_block_stops = vector{19, 40, 41, 49, 60, 80, 100, 101, 102, 150, 200, 299, 400, 500, 649, 700, 750, 11}; +// note: final block is not a ref block, but the block ends at start + 1 because the reference allele length is 2 +const auto truth_refs = vector{"T", "T", "T", "T", "T", "N", "N", "T", "T", "T", "N", "N", "T", "N", "T", "T", "N", "GG"}; + +BOOST_AUTO_TEST_CASE( split_reference_blocks ) +{ + auto reader = GVCFReader{test_files, false}; + auto position_counter = 0u; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); + BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + } + ++position_counter; + } + BOOST_CHECK_EQUAL(position_counter, truth_contigs.size()); +} + +BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) { + auto reader0 = MultipleVariantReader{test_files, false}; + auto iter0 = reader0.begin(); + + auto reader1 = MultipleVariantReader{test_files, false}; + auto iter1 = reader1.begin(); + auto moved = check_move_constructor(iter1); + + auto record0 = *iter0; + auto moved_record = *iter1; + const auto position_counter = 0; + for (const auto vec : {record0, moved_record}) { + for (const auto record : vec) { + BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]); + BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]); + BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]); + } + } +} + +// test cases based on problematic regions from real 1000 Genomes data +// mainly involving long overlapping reference alleles + +const auto p1_file = "testdata/ref_block/problem1.vcf"; +const auto p1_block_starts = vector{11140505, 11140610, 11140611, 11140620, 11140621, 11140623, 11140629, 11140768}; +const auto p1_block_stops = vector{11140609, 11140610, 11140619, 11140628, 11140628, 11140628, 11140658, 11141021}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_1 ) { + auto counter = 0u; + auto reader = GVCFReader{{p1_file}}; + for (const auto& vec : reader) { + BOOST_CHECK(vec.size() == 1u); + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p1_block_starts[counter]); + BOOST_CHECK(record.alignment_stop() == p1_block_stops[counter]); + counter++; + } + } + BOOST_CHECK(counter == 8u); +} + +const auto p2_files = vector{"testdata/ref_block/problem2_file1.vcf", "testdata/ref_block/problem2_file2.vcf"}; +const auto p2_block_starts = vector{3319304, 3319319, 3319404, 3319407, 3319410, + 3319413, 3319433, 3319447, 3319558, 3319565, 3319577, 3319599, 3319600, 3319601, 3319602}; +const auto p2_block_stops = vector{3319318, 3319403, 3319406, 3319409, 3319412, + 3319432, 3319446, 3319557, 3319564, 3319576, 3319598, 3319599, 3319600, 3319601, 3319614}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_2 ) { + auto counter = 0u; + auto reader = GVCFReader{p2_files}; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p2_block_starts[counter]); + if (record.alt().size() == 1) + BOOST_CHECK(record.alignment_stop() == p2_block_stops[counter]); + } + counter++; + } + + BOOST_CHECK(counter == p2_block_starts.size()); +} + +const auto p3_files = vector{"testdata/ref_block/problem3_file1.vcf", "testdata/ref_block/problem3_file2.vcf"}; +const auto p3_block_starts = vector{3319304, 3319319, 3319404, 3319407, 3319410, + 3319413, 3319433, 3319447, 3319558, 3319565, 3319577, 3319599, 3319600}; +const auto p3_block_stops = vector{3319318, 3319403, 3319406, 3319409, 3319412, + 3319432, 3319446, 3319557, 3319564, 3319576, 3319598, 3319599, 3319600}; + +BOOST_AUTO_TEST_CASE( reference_block_problem_region_3 ) { + auto counter = 0u; + auto reader = GVCFReader{p3_files}; + for (const auto& vec : reader) { + for (const auto& record : vec) { + BOOST_CHECK(record.alignment_start() == p3_block_starts[counter]); + if (record.alt().size() == 1) + BOOST_CHECK(record.alignment_stop() == p3_block_stops[counter]); + } + counter++; + } + + BOOST_CHECK(counter == p3_block_starts.size()); +} diff --git a/test/variant_reader_test.cpp b/test/variant_reader_test.cpp index 2ed9af686..6792f2af6 100644 --- a/test/variant_reader_test.cpp +++ b/test/variant_reader_test.cpp @@ -878,23 +878,30 @@ void single_variant_reader_sample_test(const string filename, const vector{}, true, 0); // exclude all samples (sites-only) + // Fails: Issue #209 + //single_variant_reader_sample_test("testdata/test_variants.bcf", vector{}, true, 0); // exclude all samples (sites-only) } BOOST_AUTO_TEST_CASE( single_variant_reader_include_all_samples ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{}, false, 3); // include all samples by setting include == false and passing an empty list + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{}, false, 3); // include all samples by setting include == false and passing an empty list } BOOST_AUTO_TEST_CASE( single_variant_reader_including ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12878"}, true, 1); // include only NA12878 single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12878", "NA12892"}, true, 2); // include both these samples + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12878"}, true, 1); // include only NA12878 + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12878", "NA12892"}, true, 2); // include both these samples } BOOST_AUTO_TEST_CASE( single_variant_reader_excluding ) { single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12891"}, false, 2); // exclude only NA12891 single_variant_reader_sample_test("testdata/test_variants.vcf", vector{"NA12891", "NA12878"}, false, 1); // exclude both these samples + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12891"}, false, 2); // exclude only NA12891 + single_variant_reader_sample_test("testdata/test_variants.bcf", vector{"NA12891", "NA12878"}, false, 1); // exclude both these samples } BOOST_AUTO_TEST_CASE( single_variant_reader_missing_data ) @@ -1033,12 +1040,10 @@ void multiple_variant_reader_sample_test(const vector samples, const boo BOOST_CHECK_EQUAL(reader.combined_header().n_samples(), desired_samples); } -/* TODO: Issue #209 BOOST_AUTO_TEST_CASE( multiple_variant_reader_sites_only ) { multiple_variant_reader_sample_test(vector{}, true, 0); // exclude all samples (sites-only) } -*/ BOOST_AUTO_TEST_CASE( multiple_variant_reader_include_all_samples ) { diff --git a/testdata/ref_block/problem1.vcf b/testdata/ref_block/problem1.vcf new file mode 100644 index 000000000..8b535c1cb --- /dev/null +++ b/testdata/ref_block/problem1.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE +1 11140505 . C . . END=11140609 GT:DP:GQ:MIN_DP:PL 0/0:102:99:64:0,117,1755 +1 11140610 . G . . END=11140610 GT:DP:GQ:MIN_DP:PL 0/0:65:0:65:0,0,1851 +1 11140611 . A . . END=11140619 GT:DP:GQ:MIN_DP:PL 0/0:83:69:82:0,60,900 +1 11140620 . AACACACAC A,AAC,AACACACACAC, 0.01 . . GT:AD:DP:GQ:PL:SB 0/3:20,0,0,5,0:25:9:198,444,4343,424,3388,3263,0,1715,1379,2412,9,694,634,502,457:6,14,3,2 +1 11140621 . ACACACAC A,AACACAC,AAC, 142.73 . . GT:AD:DP:GQ:PL:SB 0/2:41,0,10,2,0:53:99:204,420,4129,0,2639,2527,217,3164,2429,3061,180,2736,2400,2563,2525:18,23,2,8 +1 11140623 . ACACAC A,AAC, 0 . . GT:AD:DP:GQ:PL:SB 0/0:46,3,7,0:56:58:0,127,3660,58,2726,2688,145,2832,2670,2756:0,0,0,0 +1 11140629 . A . . END=11140658 GT:DP:GQ:MIN_DP:PL 0/0:76:99:73:0,120,1800 +1 11140768 . A . . END=11141021 GT:DP:GQ:MIN_DP:PL 0/0:127:99:93:0,120,1800 diff --git a/testdata/ref_block/problem2_file1.vcf b/testdata/ref_block/problem2_file1.vcf new file mode 100644 index 000000000..dba271e49 --- /dev/null +++ b/testdata/ref_block/problem2_file1.vcf @@ -0,0 +1,26 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 +1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 +1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 +1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 +1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 +1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 +1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 +1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 +1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 +1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 +1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 +1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 +1 3319601 . ACCCTCCTCTGAGTCTTCCTCCCCTTCCCGTG A, 155.74 . DP=13;MLEAC=2,0;MLEAF=1.00,0.00;MQ=61.16;MQ0=0 GT:AD:DP:GQ:PL:SB 1/1:0,4,0:4:32:198,32,0,198,32,198:0,0,0,0 diff --git a/testdata/ref_block/problem2_file2.vcf b/testdata/ref_block/problem2_file2.vcf new file mode 100644 index 000000000..091417de7 --- /dev/null +++ b/testdata/ref_block/problem2_file2.vcf @@ -0,0 +1,18 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE2 +1 3319304 . C . . END=3319598 GT:DP:GQ:MIN_DP:PL 0/0:151:99:66:0,63,1800 +1 3319599 . G . . END=3319599 GT:DP:GQ:MIN_DP:PL 0/0:101:52:101:0,52,2707 +1 3319600 . T . . END=3319601 GT:DP:GQ:MIN_DP:PL 0/0:100:0:87:0,0,1855 +1 3319602 . C . . END=3319614 GT:DP:GQ:MIN_DP:PL 0/0:84:99:71:0,120,1800 diff --git a/testdata/ref_block/problem3_file1.vcf b/testdata/ref_block/problem3_file1.vcf new file mode 100644 index 000000000..ee87769e1 --- /dev/null +++ b/testdata/ref_block/problem3_file1.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 +1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 +1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 +1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 +1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 +1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 +1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 +1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 +1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 +1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 +1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 +1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 diff --git a/testdata/ref_block/problem3_file2.vcf b/testdata/ref_block/problem3_file2.vcf new file mode 100644 index 000000000..84dfbcbb9 --- /dev/null +++ b/testdata/ref_block/problem3_file2.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.1 +##ALT= +##FILTER= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##reference=Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE2 +1 3319304 . C . . END=3319598 GT:DP:GQ:MIN_DP:PL 0/0:151:99:66:0,63,1800 +1 3319599 . G . . END=3319599 GT:DP:GQ:MIN_DP:PL 0/0:101:52:101:0,52,2707 diff --git a/testdata/ref_block/test1.vcf b/testdata/ref_block/test1.vcf new file mode 100644 index 000000000..a11d1b025 --- /dev/null +++ b/testdata/ref_block/test1.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=100;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 101 . T . . END=200;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test2.vcf b/testdata/ref_block/test2.vcf new file mode 100644 index 000000000..dd2c44b58 --- /dev/null +++ b/testdata/ref_block/test2.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 50 . T . . END=150;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 300 . T . . END=400;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test3.vcf b/testdata/ref_block/test3.vcf new file mode 100644 index 000000000..11fc2cbfb --- /dev/null +++ b/testdata/ref_block/test3.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=40;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 41 . T C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 42 . T . . END=60;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 650 . T . . END=750;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test4.vcf b/testdata/ref_block/test4.vcf new file mode 100644 index 000000000..20cfaa50c --- /dev/null +++ b/testdata/ref_block/test4.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 1 . T . . END=101;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 102 . T C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 103 . T . . END=500;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA diff --git a/testdata/ref_block/test5.vcf b/testdata/ref_block/test5.vcf new file mode 100644 index 000000000..bd98de167 --- /dev/null +++ b/testdata/ref_block/test5.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 +1 20 . T . . END=80;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +1 600 . T . . END=700;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA +20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA