Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
1. Modified ReferenceBlockSplittingVariantIterator to insert 'N' as the
Browse files Browse the repository at this point in the history
reference allele for a split <NON_REF> GVCF interval. This is done by
introducing a function set_reference_allele() in Variant.
2. Uncommented the test cases for
ReferenceBlockSplittingVariantIteratorr reference allele
3. The problem*.vcf files failed because their headers did not declare
an END field. htslib assumes that fields which are not declared are of
type String. Declaring END as an integer field in these files solves the
problem.
  • Loading branch information
kgururaj authored and droazen committed Oct 27, 2014
1 parent 7c35e7c commit f5ff2fa
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 4 deletions.
12 changes: 12 additions & 0 deletions gamgee/reference_block_splitting_variant_iterator.cpp
Expand Up @@ -46,6 +46,17 @@ void ReferenceBlockSplittingVariantIterator::populate_split_variants () {
// m_pending_chrom can only change when pending is empty
auto new_pending_start = -1;
auto new_pending_end = UINT_MAX;
//default value of reference if a REF interval is split
auto new_reference_allele = 'N';
//Try to see if the next record begins immediately after end of current interval
//If yes, then new_reference_base can be obtained from the next record
//FIXME: should remove these checks to avoid overhead every time
auto& next_pos_variant_vector = MultipleVariantIterator::operator*();
if (!next_pos_variant_vector.empty()
&& next_pos_variant_vector[0].chromosome() == m_pending_chrom
&& next_pos_variant_vector[0].alignment_start() == m_pending_min_end+1
&& !gamgee::missing(next_pos_variant_vector[0].ref()))
new_reference_allele = next_pos_variant_vector[0].ref()[0]; //only the first character is needed

// advance iter by erase() or ++ depending on if
for (auto iter = m_pending_variants.begin(); iter != m_pending_variants.end(); ) {
Expand All @@ -67,6 +78,7 @@ void ReferenceBlockSplittingVariantIterator::populate_split_variants () {
new_pending_end = std::min(new_pending_end, var_end);
variant.set_alignment_start(new_pending_start);
variant.set_alignment_stop(var_end); // stop is internally an offset to start, so we need to reset it after updating stop
variant.set_reference_allele(new_reference_allele);
iter++;
}
}
Expand Down
26 changes: 26 additions & 0 deletions gamgee/variant.h
Expand Up @@ -87,6 +87,32 @@ class Variant {
inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; }
inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; }

inline void set_reference_allele(const char* ref, const int32_t ref_length)
{
bcf_unpack(m_body.get(), BCF_UN_STR);
//Try to avoid calling bcf_update_alleles as it causes significant overhead in
//terms of memory re-allocation etc.
//If enough space is available, over-write the current reference allele value
//Seems like an un-necessary hack, but trust me every single such overhead counts
if(m_body->rlen >= ref_length)
{
memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy
m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option
m_body->d.shared_dirty |= BCF1_DIRTY_ALS;
}
else
{
//expensive - causes significant reallocs within htslib
//hacks to interpret const ptrs to non-const. Again, saves some overhead
//No need to free d.allele[0] because it points to shared string within bcf1_1.shared
m_body->d.allele[0] = const_cast<char*>(ref);
//Re-use same d.allele char** as update_alleles writes to different region of memory anyway
bcf_update_alleles(const_cast<const bcf_hdr_t*>(m_header.get()), m_body.get(), const_cast<const char**>(m_body->d.allele), m_body->n_allele);
}
}
inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast<int32_t>(strlen(ref))); }
inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); }

/**
* @brief functional-style set logic operations for variant field vectors
*
Expand Down
8 changes: 4 additions & 4 deletions test/reference_block_splitting_variant_reader_test.cpp
Expand Up @@ -16,7 +16,7 @@ const auto truth_contigs = vector<uint32_t>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
const auto truth_block_starts = vector<uint32_t>{1, 20, 41, 42, 50, 61, 81, 101, 102, 103, 151, 201, 300, 401, 600, 650, 701, 10};
const auto truth_block_stops = vector<uint32_t>{19, 40, 41, 49, 60, 80, 100, 101, 102, 150, 200, 299, 400, 500, 649, 700, 750, 11};
// note: final block is not a ref block, but the block ends at start + 1 because the reference allele length is 2
//TODO const auto truth_refs = vector<string>{"T", "T", "T", "T", "T", "61", "81", "T", "T", "T", "151", "201", "T", "401", "T", "T", "701", "GG"};
const auto truth_refs = vector<string>{"T", "T", "T", "T", "T", "N", "N", "T", "T", "T", "N", "N", "T", "N", "T", "T", "N", "GG"};

BOOST_AUTO_TEST_CASE( split_reference_blocks )
{
Expand All @@ -27,11 +27,11 @@ BOOST_AUTO_TEST_CASE( split_reference_blocks )
BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]);
BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]);
BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]);
//TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]);
BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]);
}
++position_counter;
}
BOOST_CHECK_EQUAL(position_counter, 18u);
BOOST_CHECK_EQUAL(position_counter, truth_contigs.size());
}

BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) {
Expand All @@ -50,7 +50,7 @@ BOOST_AUTO_TEST_CASE( reference_block_iterator_move_test ) {
BOOST_CHECK_EQUAL(record.chromosome(), truth_contigs[position_counter]);
BOOST_CHECK_EQUAL(record.alignment_start(), truth_block_starts[position_counter]);
BOOST_CHECK_EQUAL(record.alignment_stop(), truth_block_stops[position_counter]);
//TODO BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]);
BOOST_CHECK_EQUAL(record.ref(), truth_refs[position_counter]);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions testdata/ref_block/problem1.vcf
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand Down
1 change: 1 addition & 0 deletions testdata/ref_block/problem2_file1.vcf
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand Down
1 change: 1 addition & 0 deletions testdata/ref_block/problem2_file2.vcf
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand Down
1 change: 1 addition & 0 deletions testdata/ref_block/problem3_file1.vcf
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand Down
1 change: 1 addition & 0 deletions testdata/ref_block/problem3_file2.vcf
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand Down

0 comments on commit f5ff2fa

Please sign in to comment.