diff --git a/gamgee/individual_field_value.h b/gamgee/individual_field_value.h index d0f95f9b3..c45a64aaf 100644 --- a/gamgee/individual_field_value.h +++ b/gamgee/individual_field_value.h @@ -112,7 +112,7 @@ class IndividualFieldValue { if (size() != other.size()) return false; for (auto i=0u; i != size(); ++i) { - if (operator[](i) != other[i]) + if (!utils::bcf_check_equal_primitive(operator[](i), other[i])) return false; } return true; diff --git a/gamgee/shared_field.h b/gamgee/shared_field.h index a5cb0cdec..a48ca2fae 100644 --- a/gamgee/shared_field.h +++ b/gamgee/shared_field.h @@ -90,7 +90,7 @@ class SharedField { if (size() != other.size()) return false; for (auto i=0u; i != size(); ++i) { - if (operator[](i) != other[i]) + if (!utils::bcf_check_equal_primitive(operator[](i), other[i])) return false; } return true; diff --git a/gamgee/utils/utils.h b/gamgee/utils/utils.h index 0a5f38bf8..a74a2bca9 100644 --- a/gamgee/utils/utils.h +++ b/gamgee/utils/utils.h @@ -5,6 +5,7 @@ #include #include #include +#include "htslib/vcf.h" namespace gamgee { @@ -66,6 +67,26 @@ inline void check_max_boundary(const uint32_t index, const uint32_t size) { } } +/** + * @brief - Check whether two values from VCF fields of primitive types (for which the == operator is defined) * are equal + * The function template is specialized for float fields + */ +template +inline bool bcf_check_equal_primitive(const TYPE x, const TYPE y) { + return (x == y); +} +/** + * @brief: Check whether two float values from VCF fields are equal + * @note: since bcf_float_missing and bcf_float_vector_end are represented as NaNs and (nan == nan) returns false, + * a special check needs to be done for checking equality of float values in VCFs + */ +template<> +inline bool bcf_check_equal_primitive(const float x, const float y) { + return ((x == y) || (bcf_float_is_missing(x) && bcf_float_is_missing(y)) + || (bcf_float_is_vector_end(x) && bcf_float_is_vector_end(y))); +} + + } // end utils namespace } // end gamgee namespace diff --git a/gamgee/utils/variant_field_type.cpp b/gamgee/utils/variant_field_type.cpp index 1d3aa3f84..9a3ea005f 100644 --- a/gamgee/utils/variant_field_type.cpp +++ b/gamgee/utils/variant_field_type.cpp @@ -9,12 +9,18 @@ namespace gamgee { namespace utils { int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) { + auto return_val = -1; const auto p = data_ptr + (index * num_bytes_per_value); + auto end_val = -1; switch (type) { case VariantFieldType::INT8: - return *(reinterpret_cast(p)); + return_val = *(reinterpret_cast(p)); + end_val = bcf_int8_vector_end; + break; case VariantFieldType::INT16: - return *(reinterpret_cast(p)); + return_val = *(reinterpret_cast(p)); + end_val = bcf_int16_vector_end; + break; case VariantFieldType::INT32: return *(reinterpret_cast(p)); case VariantFieldType::FLOAT: @@ -24,17 +30,31 @@ int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const default: return 0; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us } + //diff = 0 if return_val == vector_end, 1 if return_val == missing, < 0 if return_val is 'normal' + auto diff = end_val - return_val; + if(diff >= 0) + return bcf_int32_vector_end - diff; + else + return return_val; } float convert_data_to_float(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) { + auto return_val = -1; + auto end_val = -1; const auto p = data_ptr + (index * num_bytes_per_value); switch (type) { case VariantFieldType::INT8: - return *(reinterpret_cast(p)); + return_val = *(reinterpret_cast(p)); + end_val = bcf_int8_vector_end; + break; case VariantFieldType::INT16: - return *(reinterpret_cast(p)); + return_val = *(reinterpret_cast(p)); + end_val = bcf_int16_vector_end; + break; case VariantFieldType::INT32: - return *(reinterpret_cast(p)); + return_val = *(reinterpret_cast(p)); + end_val = bcf_int32_vector_end; + break; case VariantFieldType::FLOAT: return *(reinterpret_cast(p)); case VariantFieldType::STRING: @@ -42,6 +62,16 @@ float convert_data_to_float(const uint8_t* data_ptr, const int index, const uint default: return 0.0f; // undefined or NULL type -- impossible to happen just so the compiler doesn't warn us } + if(end_val >= return_val) //can't use diff because INT32 types are also checked here and diff will wrap-around + { + auto tmp = 0.0f; //pack bcf_float_missing or bcf_float_vector_end into tmp and return tmp + //diff = 0 if return_val == vector_end, 1 if return_val == missing, + auto diff = end_val - return_val; + bcf_float_set(&tmp, bcf_float_vector_end - diff); + return tmp; + } + else + return return_val; } std::string convert_data_to_string(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type) { diff --git a/test/genotypes_test.cpp b/test/genotypes_test.cpp index 864f3a719..017a8958c 100644 --- a/test/genotypes_test.cpp +++ b/test/genotypes_test.cpp @@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE( hom_ref_test ){ dynamic_bitset<>(string("010")), dynamic_bitset<>(string("010")), dynamic_bitset<>(string("010")), + dynamic_bitset<>(string("010")), dynamic_bitset<>(string("010"))}; select_if_test(truth, diploid, hom_ref ); @@ -53,6 +54,7 @@ BOOST_AUTO_TEST_CASE( hom_var_test ){ dynamic_bitset<>(string("100")), dynamic_bitset<>(string("100")), dynamic_bitset<>(string("100")), + dynamic_bitset<>(string("100")), dynamic_bitset<>(string("100"))}; select_if_test(truth, diploid, hom_var); diff --git a/test/select_if_test.cpp b/test/select_if_test.cpp index 93ea54573..0a4ff7dde 100644 --- a/test/select_if_test.cpp +++ b/test/select_if_test.cpp @@ -45,8 +45,8 @@ BOOST_AUTO_TEST_CASE( select_if_individual_fields ) { } BOOST_AUTO_TEST_CASE( select_if_shared_field ) { - const auto truth_an_counts = std::vector{1,1,1,1,1}; - const auto truth_af_counts = std::vector{0,0,0,0,1}; + const auto truth_an_counts = std::vector{1,1,1,1,1,1}; + const auto truth_af_counts = std::vector{0,0,0,0,1,0}; auto truth_index = 0u; for (const auto& record : SingleVariantReader{"testdata/test_variants.vcf"}) { const auto an = record.integer_shared_field("AN"); diff --git a/test/variant_reader_test.cpp b/test/variant_reader_test.cpp index 9954b7c1c..2bc1aff91 100644 --- a/test/variant_reader_test.cpp +++ b/test/variant_reader_test.cpp @@ -11,45 +11,156 @@ #include #include +#include using namespace std; using namespace gamgee; constexpr auto FLOAT_COMPARISON_THRESHOLD = 0.0001f; -const auto truth_chromosome = vector{0, 1, 1, 1, 2}; -const auto truth_alignment_starts = vector{10000000, 10001000, 10002000, 10003000, 10004000}; -const auto truth_alignment_stops = vector{10000000, 10001001, 10002006, 10003000, 10004002}; -const auto truth_n_alleles = vector{2, 2, 2, 2, 3}; -const auto truth_filter_name = vector{"PASS", "PASS", "LOW_QUAL", "NOT_DEFINED", "PASS"}; -const auto truth_filter_size = vector{1,1,1,1,2}; -const auto truth_quals = vector{80,8.4,-1,-1,-1}; -const auto truth_ref = vector{"T", "GG", "TAGTGQA", "A", "GAT"}; -const auto truth_alt = vector< vector> { { "C" } , {"AA"}, {"T"}, {"AGCT"}, {"G","GATAT"}}; +float g_bcf_float_missing = reinterpret_cast(bcf_float_missing); + +const auto truth_chromosome = vector{0, 1, 1, 1, 2, 2}; +const auto truth_alignment_starts = vector{10000000, 10001000, 10002000, 10003000, 10004000, 10005000 }; +const auto truth_alignment_stops = vector{10000000, 10001001, 10002006, 10003000, 10004002, 10005002 }; +const auto truth_n_alleles = vector{2, 2, 2, 2, 3, 3}; +const auto truth_filter_name = vector{"PASS", "PASS", "LOW_QUAL", "NOT_DEFINED", "PASS", "PASS"}; +const auto truth_filter_size = vector{1,1,1,1,2,1}; +const auto truth_quals = vector{80,8.4,-1,-1,-1,-1}; +const auto truth_ref = vector{"T", "GG", "TAGTGQA", "A", "GAT","GAT"}; +const auto truth_alt = vector< vector> { { "C" } , {"AA"}, {"T"}, {"AGCT"}, {"G","GATAT"}, {"G","GATAT"}}; const auto truth_high_quality_hets = boost::dynamic_bitset<>{std::string{"001"}}; -const auto truth_id = vector{"db2342", "rs837472", ".", ".", "."}; -const auto truth_shared_af = vector>{{0.5}, {0.5}, {0.5}, {0.5}, {0.5, 0.0}}; -const auto truth_shared_an = vector>{{6}, {6}, {6}, {6}, {6}}; -const auto truth_shared_desc = vector>{{"Test1,Test2"}, {}, {}, {}, {}}; -const auto truth_shared_validated = vector{true, false, true, false, false}; -const auto truth_gq = vector>{{25,12,650}, {35,35,35}, {35,35,35}, {35,35,35}, {35,35,35}}; -const auto truth_af = vector { 3.1,2.2 }; +const auto truth_id = vector{"db2342", "rs837472", ".", ".", ".","."}; +const auto truth_shared_af = vector>{{0.5}, {0.5}, {0.5}, {0.5}, {0.5, 0.0}, {0.5, g_bcf_float_missing}}; +const auto truth_shared_an = vector>{{6}, {6}, {6}, {6}, {6}, {6}}; +const auto truth_shared_desc = vector>{{"Test1,Test2"}, {}, {}, {}, {}, {}}; +const auto truth_shared_validated = vector{true, false, true, false, false, false}; +const auto truth_gq_bt_type = vector{BCF_BT_INT16, BCF_BT_INT8, BCF_BT_INT8, BCF_BT_INT8, BCF_BT_INT8, BCF_BT_INT8}; +const auto truth_gq = vector>{{25,12,650}, {35,35,35}, {35,35,35}, {35,35,35}, {35,35,35}, {35,bcf_int32_missing,35}}; +const auto truth_af = vector>> { + { {3.1,2.2}, {3.1,2.2}, {3.1,2.2} }, + { {3.1,2.2}, {3.1,2.2}, {3.1,2.2} }, + { {3.1,2.2}, {3.1,2.2}, {3.1,2.2} }, + { {3.1,2.2}, {3.1,2.2}, {3.1,2.2} }, + { {3.1,2.2}, {3.1,2.2}, {3.1,2.2} }, + { {3.1,g_bcf_float_missing}, {3.1,2.2}, {3.1,2.2} } +}; + +const auto truth_pl_bt_type = vector{ BCF_BT_INT8, BCF_BT_INT8, BCF_BT_INT32, BCF_BT_INT8, BCF_BT_INT8, BCF_BT_INT8 }; const auto truth_pl = vector>>{ {{10,0,100 }, {0,10,1000 }, {10,100,0} }, {{10,0,100 }, {0,10,100 }, {10,100,0} }, {{10,0,100 }, {0,10,2000000000}, {10,100,0} }, {{10,0,100 }, {0,10,100 }, {10,100,0} }, - {{10,0,100,2,4,8}, {0,10,100,2,4,8 }, {10,100,0,2,4,8}} + {{10,0,100,2,4,8}, {0,10,100,2,4,8 }, {10,100,0,2,4,8}}, + {{10,0,100,bcf_int32_missing,4,bcf_int32_missing}, {0,10,100,2,4,8 }, {10,100,0,2,4,8}} }; const auto truth_as = vector>{ {"ABA","CA","XISPAFORRO"}, {"ABA","ABA","ABA"}, {"ABA","ABA","ABA"}, {"ABA","ABA","ABA"}, - {"ABA","ABA","ABA"} + {"ABA","ABA","ABA"}, + {"ABA","ABA","."} }; +/* + * @brief given the data type (BCF_BT_*), returns the int32_t with the value of bcf_*_vector_end + * @param bcf_bt_type type (BCF_BT_*) of the data + * @return bcf_*_vector_end + */ +int32_t bcf_vector_end_val(int32_t bcf_bt_type) +{ + auto vector_end_val = 0; + switch(bcf_bt_type) + { + case BCF_BT_INT8: + vector_end_val = bcf_int8_vector_end; + break; + case BCF_BT_INT16: + vector_end_val = bcf_int16_vector_end; + break; + case BCF_BT_INT32: + vector_end_val = bcf_int32_vector_end; + break; + case BCF_BT_FLOAT: + vector_end_val = bcf_float_vector_end; + break; + default: + BOOST_CHECK_MESSAGE(0, "Unknown VCF integer type "<< bcf_bt_type); + return -1; + } + return vector_end_val; +} +/* + * @brief given the int data type (BCF_BT_INT*) and int32 representation of the datatype, returns the string corresponding to + * the data type. Note that functions like shared_field_as_integer return int32 representation of data, even though the underlying data + * type might be int16/int8 etc. Hence, shared_field_as_integer would convert bcf_int8_missing to bcf_int32_missing so that + * missing functions in the rest of gamgee work correctly. However, a string comparison check in the tests here would fail + * since to_string(bcf_int8_missing) != to_string(bcf_int32_missing). This function re-converts 'special' values back to their + * original data types and then converts to string. + * @note the function checks whether it is a special value - anything <= bcf_int32_vector_end is a special value- if yes, compute + * the real special value corresponding to the data type bcf_bt_type. The 'real' special value is computed by obtaining the + * bcf_*_vector_end value and subtracting from it 'diff', where diff is the difference between v and the int32 special value + * bcf_int32_vector_end + * @param v int32_t value obtained from *_as_integer gamgee function + * @param bcf_bt_type int type (BCF_BT_INT*) of the data + * @return string representation of value + */ +string bcf_int32_to_string(int32_t v, int32_t bcf_bt_type) +{ + if(v <= bcf_int32_vector_end) + { + auto vector_end_val = bcf_vector_end_val(bcf_bt_type); + auto diff = bcf_int32_vector_end - v; //either 0 or 1 + return to_string(vector_end_val - diff); + } + else + return to_string(v); +} +/* + * @brief given the int data type (BCF_BT_INT*) and int32 representation of the datatype, returns the float corresponding to + * the data type. Note that functions like shared_field_as_integer return int32 representation of data, even though the underlying data + * type might be int16/int8 etc. Hence, shared_field_as_integer would convert bcf_int8_missing to bcf_int32_missing so that + * missing functions in the rest of gamgee work correctly. However, a float comparison check in the tests here would fail + * since float(bcf_int8_missing) != float(bcf_int32_missing). This function re-converts 'special' values back to their + * original data types and then converts to float. + * @note the function checks whether it is a special value - anything <= bcf_int32_vector_end is a special value- if yes, compute + * the real special value corresponding to the data type bcf_bt_type. The 'real' special value is computed by obtaining the + * bcf_*_vector_end value and subtracting from it 'diff', where diff is the difference between v and the int32 special value + * bcf_int32_vector_end + * @param v int32_t value obtained from *_as_integer gamgee function + * @param bcf_bt_type type (BCF_BT_*) of the data + * @return string representation of value + */ +float bcf_int32_to_float(int32_t v, int32_t bcf_bt_type) +{ + if(v <= bcf_int32_vector_end) + { + auto diff = bcf_int32_vector_end - v; //either 0 or 1 + auto pack_value = bcf_float_vector_end - diff; + auto tmp = 0.0f; + bcf_float_set(&tmp, pack_value); + return tmp; + } + else + return float(v); +} +/* + * @brief floating point comparison for VCF fields + * v1, v2 can be NaNs since the bcf_float_missing and vector_end values are NaNs. A simple comparison + * of v1 and v2 would fail since (nan == nan) is always false. This is fixed by explicitly comparing both values to bcf_float_missing and + * vector_end i.e. v1 and v2 are equal if ANY ONE of the following conditions is true + * (a) both are bcf_float_missing OR + * (b) both are bcf_float_vector_end OR + * (c) |v1-v2| high_qual_hets(const Variant& record) { // filter all hets that have GQ > 20 @@ -190,29 +301,33 @@ void check_individual_field_api(const Variant& record, const uint32_t truth_inde for(auto i=0u; i != record.n_samples(); ++i) { BOOST_CHECK_EQUAL(gq_int[i][0], truth_gq[truth_index][i]); BOOST_CHECK_EQUAL(gq_int_idx[i][0], truth_gq[truth_index][i]); - BOOST_CHECK_CLOSE(gq_float[i][0], float(truth_gq[truth_index][i]), FLOAT_COMPARISON_THRESHOLD); - BOOST_CHECK_CLOSE(gq_float_idx[i][0], float(truth_gq[truth_index][i]), FLOAT_COMPARISON_THRESHOLD); - BOOST_CHECK_EQUAL(gq_string[i][0], to_string(truth_gq[truth_index][i])); - BOOST_CHECK_EQUAL(gq_string_idx[i][0], to_string(truth_gq[truth_index][i])); - BOOST_REQUIRE_EQUAL(af_int[i].size(), truth_af.size()); // require otherwise next line may segfault - BOOST_REQUIRE_EQUAL(af_int_idx[i].size(), truth_af.size()); // require otherwise next line may segfault + BOOST_CHECK_MESSAGE(bcf_compare_float(gq_float[i][0], + bcf_int32_to_float(truth_gq[truth_index][i], truth_gq_bt_type[truth_index]), FLOAT_COMPARISON_THRESHOLD), + "["<(), + //tuple.get<1>(), FLOAT_COMPARISON_THRESHOLD); } ); const auto af_idx = record.float_shared_field(header.field_index("AF")); // test the index based api - BOOST_CHECK_EQUAL_COLLECTIONS(af_idx.begin(), af_idx.end(), truth_shared_af[truth_index].begin(), truth_shared_af[truth_index].end()); + af_index = 0; + for(auto af_value : af_idx) + { + BOOST_CHECK_MESSAGE(bcf_compare_float(af_value, truth_shared_af[truth_index][af_index], FLOAT_COMPARISON_THRESHOLD), + "["<{false, false, true, true, true}; + const auto truth_missing = vector{false, false, true, true, true, true}; auto i = 0u; for (const auto& record : SingleVariantReader{"testdata/test_variants.vcf"}) BOOST_CHECK_EQUAL(missing(record.id()), truth_missing[i++]); // check that the missing and non-missing values in the vcf actually return the right missingness @@ -587,12 +719,12 @@ BOOST_AUTO_TEST_CASE( multiple_variant_reader_test ) { } } -const auto multi_diff_truth_record_count = vector{4, 1, 1, 1, 2, 1}; -const auto multi_diff_truth_chromosome = vector{0, 1, 1, 1, 1, 2}; -const auto multi_diff_truth_alignment_starts = vector{10000000, 10001000, 10001999, 10002000, 10003000, 10004000}; -const auto multi_diff_truth_ref = vector{"T", "GG", "TAGTGQA", "TAGTGQA", "A", "GAT"}; -const auto multi_diff_truth_n_alleles = vector{2, 2, 2, 2, 2, 3}; -const auto multi_diff_truth_id = vector{"db2342", "rs837472", ".", ".", ".", "."}; +const auto multi_diff_truth_record_count = vector{4, 1, 1, 1, 2, 1, 1}; +const auto multi_diff_truth_chromosome = vector{0, 1, 1, 1, 1, 2, 2}; +const auto multi_diff_truth_alignment_starts = vector{10000000, 10001000, 10001999, 10002000, 10003000, 10004000, 10005000}; +const auto multi_diff_truth_ref = vector{"T", "GG", "TAGTGQA", "TAGTGQA", "A", "GAT", "GAT"}; +const auto multi_diff_truth_n_alleles = vector{2, 2, 2, 2, 2, 3, 3}; +const auto multi_diff_truth_id = vector{"db2342", "rs837472", ".", ".", ".", ".", "."}; BOOST_AUTO_TEST_CASE( multiple_variant_reader_difference_test ) { auto truth_index = 0u; @@ -609,7 +741,7 @@ BOOST_AUTO_TEST_CASE( multiple_variant_reader_difference_test ) { } ++truth_index; } - BOOST_CHECK_EQUAL(truth_index, 6u); + BOOST_CHECK_EQUAL(truth_index, 7u); } void multiple_variant_reader_sample_test(const vector samples, const bool include, const int desired_samples) { diff --git a/testdata/test_variants.bcf b/testdata/test_variants.bcf index 5d752f9f7..5646a3395 100644 Binary files a/testdata/test_variants.bcf and b/testdata/test_variants.bcf differ diff --git a/testdata/test_variants.vcf b/testdata/test_variants.vcf index 20811f8d7..6386ea4ab 100644 --- a/testdata/test_variants.vcf +++ b/testdata/test_variants.vcf @@ -21,3 +21,4 @@ 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6;VALIDATED GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 10004000 . GAT G,GATAT . PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 1/2:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA +22 10005000 . GAT G,GATAT . PASS AF=0.5,.;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,.,4,.:3.1,.:ABA 0/0:.:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:. diff --git a/testdata/test_variants.vcf.gz b/testdata/test_variants.vcf.gz index 79954e7b2..0016b49ea 100644 Binary files a/testdata/test_variants.vcf.gz and b/testdata/test_variants.vcf.gz differ