Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Modified convert_data_to_integer and convert_data_to_float so that ve…
Browse files Browse the repository at this point in the history
…ctor_end and missing values

are correctly handled for INT8 and INT16 types. Since vector_end is
(INT8_MIN+1) while missing is (INT8_MIN), check if the dereferenced
value is <= vector_end. If yes, handle as special case and return
INT32_MIN+1 or INT32_MIN.

Updated test case for checking bcf_*_missing values. Added line in vcf
and bcf files with missing values in many fields

Modified == operator in IndividualFieldValue and SharedField to handle
special floating point values (bcf_float_missing and
bcf_float_vector_end)
  • Loading branch information
kgururaj committed Sep 23, 2014
1 parent d879419 commit b7255a3
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 53 deletions.
2 changes: 1 addition & 1 deletion gamgee/individual_field_value.h
Expand Up @@ -112,7 +112,7 @@ class IndividualFieldValue {
if (size() != other.size())
return false;
for (auto i=0u; i != size(); ++i) {
if (operator[](i) != other[i])
if (!utils::bcf_check_equal_primitive(operator[](i), other[i]))
return false;
}
return true;
Expand Down
2 changes: 1 addition & 1 deletion gamgee/shared_field.h
Expand Up @@ -90,7 +90,7 @@ class SharedField {
if (size() != other.size())
return false;
for (auto i=0u; i != size(); ++i) {
if (operator[](i) != other[i])
if (!utils::bcf_check_equal_primitive(operator[](i), other[i]))
return false;
}
return true;
Expand Down
21 changes: 21 additions & 0 deletions gamgee/utils/utils.h
Expand Up @@ -5,6 +5,7 @@
#include <memory>
#include <vector>
#include <sstream>
#include "htslib/vcf.h"

namespace gamgee {

Expand Down Expand Up @@ -66,6 +67,26 @@ inline void check_max_boundary(const uint32_t index, const uint32_t size) {
}
}

/**
* @brief - Check whether two values from VCF fields of primitive types (for which the == operator is defined) * are equal
* The function template is specialized for float fields
*/
template<class TYPE>
inline bool bcf_check_equal_primitive(const TYPE x, const TYPE y) {
return (x == y);
}
/**
* @brief: Check whether two float values from VCF fields are equal
* @note: since bcf_float_missing and bcf_float_vector_end are represented as NaNs and (nan == nan) returns false,
* a special check needs to be done for checking equality of float values in VCFs
*/
template<>
inline bool bcf_check_equal_primitive<float>(const float x, const float y) {
return ((x == y) || (bcf_float_is_missing(x) && bcf_float_is_missing(y))
|| (bcf_float_is_vector_end(x) && bcf_float_is_vector_end(y)));
}


} // end utils namespace
} // end gamgee namespace

Expand Down
40 changes: 35 additions & 5 deletions gamgee/utils/variant_field_type.cpp
Expand Up @@ -9,12 +9,18 @@ namespace gamgee {
namespace utils {

int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {
auto return_val = -1;
const auto p = data_ptr + (index * num_bytes_per_value);
auto end_val = -1;
switch (type) {
case VariantFieldType::INT8:
return *(reinterpret_cast<const int8_t*>(p));
return_val = *(reinterpret_cast<const int8_t*>(p));
end_val = bcf_int8_vector_end;
break;
case VariantFieldType::INT16:
return *(reinterpret_cast<const int16_t*>(p));
return_val = *(reinterpret_cast<const int16_t*>(p));
end_val = bcf_int16_vector_end;
break;
case VariantFieldType::INT32:
return *(reinterpret_cast<const int32_t*>(p));
case VariantFieldType::FLOAT:
Expand All @@ -24,24 +30,48 @@ int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const
default:
return 0; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us
}
//diff = 0 if return_val == vector_end, 1 if return_val == missing, < 0 if return_val is 'normal'
auto diff = end_val - return_val;
if(diff >= 0)
return bcf_int32_vector_end - diff;
else
return return_val;
}

float convert_data_to_float(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {
auto return_val = -1;
auto end_val = -1;
const auto p = data_ptr + (index * num_bytes_per_value);
switch (type) {
case VariantFieldType::INT8:
return *(reinterpret_cast<const int8_t*>(p));
return_val = *(reinterpret_cast<const int8_t*>(p));
end_val = bcf_int8_vector_end;
break;
case VariantFieldType::INT16:
return *(reinterpret_cast<const int16_t*>(p));
return_val = *(reinterpret_cast<const int16_t*>(p));
end_val = bcf_int16_vector_end;
break;
case VariantFieldType::INT32:
return *(reinterpret_cast<const int32_t*>(p));
return_val = *(reinterpret_cast<const int32_t*>(p));
end_val = bcf_int32_vector_end;
break;
case VariantFieldType::FLOAT:
return *(reinterpret_cast<const float*>(p));
case VariantFieldType::STRING:
throw std::invalid_argument("user requested a float value but underlying type is actually a string");
default:
return 0.0f; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us
}
if(end_val >= return_val) //can't use diff because INT32 types are also checked here and diff will wrap-around
{
auto tmp = 0.0f; //pack bcf_float_missing or bcf_float_vector_end into tmp and return tmp
//diff = 0 if return_val == vector_end, 1 if return_val == missing,
auto diff = end_val - return_val;
bcf_float_set(&tmp, bcf_float_vector_end - diff);
return tmp;
}
else
return return_val;
}

std::string convert_data_to_string(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {
Expand Down
2 changes: 2 additions & 0 deletions test/genotypes_test.cpp
Expand Up @@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE( hom_ref_test ){
dynamic_bitset<>(string("010")),
dynamic_bitset<>(string("010")),
dynamic_bitset<>(string("010")),
dynamic_bitset<>(string("010")),
dynamic_bitset<>(string("010"))};

select_if_test(truth, diploid, hom_ref );
Expand All @@ -53,6 +54,7 @@ BOOST_AUTO_TEST_CASE( hom_var_test ){
dynamic_bitset<>(string("100")),
dynamic_bitset<>(string("100")),
dynamic_bitset<>(string("100")),
dynamic_bitset<>(string("100")),
dynamic_bitset<>(string("100"))};

select_if_test(truth, diploid, hom_var);
Expand Down
4 changes: 2 additions & 2 deletions test/select_if_test.cpp
Expand Up @@ -45,8 +45,8 @@ BOOST_AUTO_TEST_CASE( select_if_individual_fields ) {
}

BOOST_AUTO_TEST_CASE( select_if_shared_field ) {
const auto truth_an_counts = std::vector<uint32_t>{1,1,1,1,1};
const auto truth_af_counts = std::vector<uint32_t>{0,0,0,0,1};
const auto truth_an_counts = std::vector<uint32_t>{1,1,1,1,1,1};
const auto truth_af_counts = std::vector<uint32_t>{0,0,0,0,1,0};
auto truth_index = 0u;
for (const auto& record : SingleVariantReader{"testdata/test_variants.vcf"}) {
const auto an = record.integer_shared_field("AN");
Expand Down

0 comments on commit b7255a3

Please sign in to comment.