Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Merge pull request #369 from broadinstitute/dr_fix_genotype_encoding_…
Browse files Browse the repository at this point in the history
…of_vector_ends

Prevent vector end values from getting transformed during Genotype encoding, and detect invalid values
  • Loading branch information
Mauricio Carneiro committed Nov 8, 2014
2 parents 3067369 + 1ddb657 commit 324e7e8
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 3 deletions.
10 changes: 9 additions & 1 deletion gamgee/genotype.h
Expand Up @@ -10,6 +10,7 @@

#include <memory>
#include <utility>
#include <stdexcept>

namespace gamgee {

Expand Down Expand Up @@ -296,7 +297,14 @@ class Genotype{
*/
static inline void encode_genotype(std::vector<int32_t>& alleles, bool phase_all_alleles) {
for ( auto allele_index = 0u; allele_index < alleles.size(); ++allele_index ) {
alleles[allele_index] = (alleles[allele_index] + 1) << 1 | (phase_all_alleles && allele_index > 0u ? 1 : 0);
// Only legal value below -1 is the int32 vector end value
if ( alleles[allele_index] < -1 && alleles[allele_index] != bcf_int32_vector_end ) {
throw std::invalid_argument{"Genotype vector must consist only of allele indices, -1 for missing values, or vector end values"};
}
// Do not modify vector end values
else if ( alleles[allele_index] != bcf_int32_vector_end ) {
alleles[allele_index] = (alleles[allele_index] + 1) << 1 | (phase_all_alleles && allele_index > 0u ? 1 : 0);
}
}
}

Expand Down
23 changes: 23 additions & 0 deletions test/genotypes_test.cpp
Expand Up @@ -6,6 +6,7 @@

#include <boost/dynamic_bitset.hpp>
#include <algorithm>
#include <stdexcept>

using namespace std;
using namespace gamgee;
Expand Down Expand Up @@ -194,6 +195,12 @@ BOOST_AUTO_TEST_CASE( encode_genotype ) {
Genotype::encode_genotype(alleles);
BOOST_CHECK(equal(alleles.begin(), alleles.end(), expected.begin()));

// vector end values should not get transformed by encoding
alleles = {bcf_int32_vector_end};
expected = {bcf_int32_vector_end};
Genotype::encode_genotype(alleles);
BOOST_CHECK(equal(alleles.begin(), alleles.end(), expected.begin()));

alleles = {0, 1};
expected = {2, 4};
Genotype::encode_genotype(alleles);
Expand All @@ -209,6 +216,12 @@ BOOST_AUTO_TEST_CASE( encode_genotype ) {
Genotype::encode_genotype(alleles);
BOOST_CHECK(equal(alleles.begin(), alleles.end(), expected.begin()));

// If there are multiple ploidies and/or missing genotypes we will have vector end values mixed in with the alleles
alleles = { 1, bcf_int32_vector_end, -1, bcf_int32_vector_end, 0, 1 };
expected = { 4, bcf_int32_vector_end, 0, bcf_int32_vector_end, 2, 4 };
Genotype::encode_genotype(alleles);
BOOST_CHECK(equal(alleles.begin(), alleles.end(), expected.begin()));

// First allele should not be phased when phasing is requested
alleles = {-1};
expected = {0};
Expand Down Expand Up @@ -240,4 +253,14 @@ BOOST_AUTO_TEST_CASE( encode_genotype ) {
expected = {4, 7, 3};
Genotype::encode_genotype(alleles, true);
BOOST_CHECK(equal(alleles.begin(), alleles.end(), expected.begin()));

// Should detect invalid values: only allele indices >= 0, -1 for missing values, and vector end should be allowed
alleles = {-2};
BOOST_CHECK_THROW(Genotype::encode_genotype(alleles), std::invalid_argument);

alleles = {bcf_int32_missing};
BOOST_CHECK_THROW(Genotype::encode_genotype(alleles), std::invalid_argument);

alleles = {0, bcf_int32_vector_end + 1};
BOOST_CHECK_THROW(Genotype::encode_genotype(alleles), std::invalid_argument);
}
11 changes: 9 additions & 2 deletions test/variant_builder_test.cpp
Expand Up @@ -1112,12 +1112,19 @@ BOOST_AUTO_TEST_CASE( bulk_set_genotype_field ) {
check_genotype_field(variant, expected);

// Flat vector, varying ploidy with manual padding
flat_vector = {0, 1, 2, 1, 0, -1, 0, -1, -1}; Genotype::encode_genotype(flat_vector);
expected = {{0, 1, 2}, {1, 0, missing_values::int32}, {0, missing_values::int32, missing_values::int32}};
flat_vector = {0, 1, 2, 1, bcf_int32_vector_end, bcf_int32_vector_end, 0, 1, bcf_int32_vector_end}; Genotype::encode_genotype(flat_vector);
expected = {{0, 1, 2}, {1, bcf_int32_vector_end, bcf_int32_vector_end}, {0, 1, bcf_int32_vector_end}};
variant = perform_move ? builder.set_genotypes(move(flat_vector)).build() :
builder.set_genotypes(flat_vector).build();
check_genotype_field(variant, expected);

// Flat vector, varying ploidy with manual padding and a missing sample
flat_vector = {0, 1, 2, -1, bcf_int32_vector_end, bcf_int32_vector_end, 0, 1, bcf_int32_vector_end}; Genotype::encode_genotype(flat_vector);
expected = {{0, 1, 2}, {bcf_int32_missing, bcf_int32_vector_end, bcf_int32_vector_end}, {0, 1, bcf_int32_vector_end}};
variant = perform_move ? builder.set_genotypes(move(flat_vector)).build() :
builder.set_genotypes(flat_vector).build();
check_genotype_field(variant, expected);

// Nested vector, one allele per sample
nested_vector = {{0}, {1}, {0}}; Genotype::encode_genotypes(nested_vector);
expected = {{0}, {1}, {0}};
Expand Down

0 comments on commit 324e7e8

Please sign in to comment.