Modified convert_data_to_integer and convert_data_to_float so that ve…

…ctor_end and missing values are correctly handled for INT8 and INT16 types. Since vector_end is (INT8_MIN+1) while missing is (INT8_MIN), check if the dereferenced value is <= vector_end. If yes, handle as special case and return INT32_MIN+1 or INT32_MIN. Updated test case for checking bcf_*_missing values. Added line in vcf and bcf files with missing values in many fields Modified == operator in IndividualFieldValue and SharedField to handle special floating point values (bcf_float_missing and bcf_float_vector_end)
broadinstitute · Sep 23, 2014 · b7255a3 · b7255a3
1 parent d879419
commit b7255a3
Show file tree

Hide file tree

Showing 10 changed files with 239 additions and 53 deletions.
diff --git a/gamgee/individual_field_value.h b/gamgee/individual_field_value.h
@@ -112,7 +112,7 @@ class IndividualFieldValue {
     if (size() != other.size()) 
       return false;
     for (auto i=0u; i != size(); ++i) {
-      if (operator[](i) != other[i])
+      if (!utils::bcf_check_equal_primitive(operator[](i), other[i]))
         return false;
     }
     return true;

diff --git a/gamgee/shared_field.h b/gamgee/shared_field.h
@@ -90,7 +90,7 @@ class SharedField {
     if (size() != other.size()) 
       return false;
     for (auto i=0u; i != size(); ++i) {
-      if (operator[](i) != other[i])
+      if (!utils::bcf_check_equal_primitive(operator[](i), other[i]))
         return false;
     }
     return true;

diff --git a/gamgee/utils/utils.h b/gamgee/utils/utils.h
@@ -5,6 +5,7 @@
 #include <memory>
 #include <vector>
 #include <sstream>
+#include "htslib/vcf.h" 
 
 namespace gamgee {
 
@@ -66,6 +67,26 @@ inline void check_max_boundary(const uint32_t index, const uint32_t size) {
   }
 }
 
+/**
+ * @brief - Check whether two values from VCF fields of primitive types (for which the == operator is defined) * are equal
+ * The function template is specialized for float fields
+ */
+template<class TYPE>
+inline bool bcf_check_equal_primitive(const TYPE x, const TYPE y) {
+  return (x == y);
+}
+/**
+ * @brief: Check whether two float values from VCF fields  are equal
+ * @note: since bcf_float_missing and bcf_float_vector_end are represented as NaNs and (nan == nan) returns false,
+ * a special check needs to be done for checking equality of float values in VCFs
+ */
+template<>
+inline bool bcf_check_equal_primitive<float>(const float x, const float y) {
+  return ((x == y) || (bcf_float_is_missing(x) && bcf_float_is_missing(y))
+	|| (bcf_float_is_vector_end(x) && bcf_float_is_vector_end(y)));
+}
+
+
 } // end utils namespace
 } // end gamgee namespace
 

diff --git a/gamgee/utils/variant_field_type.cpp b/gamgee/utils/variant_field_type.cpp
@@ -9,12 +9,18 @@ namespace gamgee {
 namespace utils {
 
 int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {
+  auto return_val = -1;
   const auto p = data_ptr + (index * num_bytes_per_value);
+  auto end_val = -1;
   switch (type) {
     case VariantFieldType::INT8:
-      return *(reinterpret_cast<const int8_t*>(p));
+      return_val = *(reinterpret_cast<const int8_t*>(p));
+      end_val = bcf_int8_vector_end;
+      break;
     case VariantFieldType::INT16:
-      return *(reinterpret_cast<const int16_t*>(p));
+      return_val = *(reinterpret_cast<const int16_t*>(p));
+      end_val = bcf_int16_vector_end;
+      break;
     case VariantFieldType::INT32:
       return *(reinterpret_cast<const int32_t*>(p));
     case VariantFieldType::FLOAT:
@@ -24,24 +30,48 @@ int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const
     default:
       return 0; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us 
   }
+  //diff = 0 if return_val == vector_end, 1 if return_val == missing, < 0 if return_val is 'normal'
+  auto diff = end_val - return_val;
+  if(diff >= 0)
+    return bcf_int32_vector_end - diff;
+  else
+    return return_val;
 }
 
 float convert_data_to_float(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {
+  auto return_val = -1;
+  auto end_val = -1;
   const auto p = data_ptr + (index * num_bytes_per_value);
   switch (type) {
     case VariantFieldType::INT8:
-      return *(reinterpret_cast<const int8_t*>(p)); 
+      return_val = *(reinterpret_cast<const int8_t*>(p)); 
+      end_val = bcf_int8_vector_end;
+      break;
     case VariantFieldType::INT16:
-      return *(reinterpret_cast<const int16_t*>(p)); 
+      return_val = *(reinterpret_cast<const int16_t*>(p));
+      end_val = bcf_int16_vector_end;
+      break;
     case VariantFieldType::INT32:
-      return *(reinterpret_cast<const int32_t*>(p)); 
+      return_val = *(reinterpret_cast<const int32_t*>(p)); 
+      end_val = bcf_int32_vector_end;
+      break;
     case VariantFieldType::FLOAT:
       return *(reinterpret_cast<const float*>(p)); 
     case VariantFieldType::STRING:
       throw std::invalid_argument("user requested a float value but underlying type is actually a string");
     default:
       return 0.0f; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us 
   }
+  if(end_val >= return_val)	//can't use diff because INT32 types are also checked here and diff will wrap-around 
+  {
+    auto tmp = 0.0f;	//pack bcf_float_missing or bcf_float_vector_end into tmp and return tmp
+    //diff = 0 if return_val == vector_end, 1 if return_val == missing, 
+    auto diff = end_val - return_val;
+    bcf_float_set(&tmp, bcf_float_vector_end - diff);
+    return tmp;
+  }
+  else
+    return return_val;
 }
 
 std::string convert_data_to_string(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) {

diff --git a/test/genotypes_test.cpp b/test/genotypes_test.cpp
@@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE( hom_ref_test ){
     dynamic_bitset<>(string("010")),
     dynamic_bitset<>(string("010")),
     dynamic_bitset<>(string("010")),
+    dynamic_bitset<>(string("010")),
     dynamic_bitset<>(string("010"))};
 
   select_if_test(truth, diploid, hom_ref  );
@@ -53,6 +54,7 @@ BOOST_AUTO_TEST_CASE( hom_var_test ){
     dynamic_bitset<>(string("100")),
     dynamic_bitset<>(string("100")),
     dynamic_bitset<>(string("100")),
+    dynamic_bitset<>(string("100")),
     dynamic_bitset<>(string("100"))};
 
   select_if_test(truth, diploid, hom_var);

diff --git a/test/select_if_test.cpp b/test/select_if_test.cpp
@@ -45,8 +45,8 @@ BOOST_AUTO_TEST_CASE( select_if_individual_fields ) {
 }
 
 BOOST_AUTO_TEST_CASE( select_if_shared_field ) {
-  const auto truth_an_counts = std::vector<uint32_t>{1,1,1,1,1};
-  const auto truth_af_counts = std::vector<uint32_t>{0,0,0,0,1};
+  const auto truth_an_counts = std::vector<uint32_t>{1,1,1,1,1,1};
+  const auto truth_af_counts = std::vector<uint32_t>{0,0,0,0,1,0};
   auto truth_index = 0u;
   for (const auto& record : SingleVariantReader{"testdata/test_variants.vcf"}) {
     const auto an = record.integer_shared_field("AN");