diff --git a/filter.cpp b/filter.cpp index a4829469..36e012ab 100644 --- a/filter.cpp +++ b/filter.cpp @@ -372,6 +372,13 @@ void Node::evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug) b = true; } } + else if (type==VT_N_FILTER) + { + i = bcf_get_n_filter(v); + f = i; + b = true; + value_exists = true; + } else if (type==VT_INFO) //figure out type { if (!bcf_hdr_idinfo_exists(h, BCF_HL_INFO, bcf_hdr_id2int(h, BCF_DT_ID, tag.s))) @@ -796,6 +803,12 @@ std::string Node::type2string(int32_t type) s += "FILTER"; } + if (type==VT_N_FILTER) + { + s += (s==""? "" : "|"); + s += "FILTER"; + } + if (type==VT_INFO) { s += (s==""? "" : "|"); @@ -1117,6 +1130,12 @@ void Filter::parse_literal(const char* exp, int32_t len, Node * node, bool debug if (debug) std::cerr << "\tis filter_op\n"; return; } + else if (strncmp(exp, "N_FILTER", len)==0) + { + node->type = VT_N_FILTER; + if (debug) std::cerr << "\tis n_filter_op\n"; + return; + } else if (strncmp(exp, "INFO.", 5)==0) { node->type = VT_INFO; diff --git a/filter.h b/filter.h index c764f8f2..4f6d9770 100644 --- a/filter.h +++ b/filter.h @@ -67,6 +67,7 @@ #define VT_VARIANT_TYPE (37|VT_INT|VT_BCF_OP|VT_BOOL) #define VT_VARIANT_DLEN (38|VT_INT|VT_BCF_OP) #define VT_VARIANT_LEN (39|VT_INT|VT_BCF_OP) +#define VT_N_FILTER (40|VT_INT|VT_BCF_OP) #define VT_UNKNOWN -1 diff --git a/hfilter.cpp b/hfilter.cpp index 45d4fad5..158f4e43 100644 --- a/hfilter.cpp +++ b/hfilter.cpp @@ -41,8 +41,9 @@ class Igor : Program std::vector intervals; std::string interval_list; std::string regions_bed_file; - std::string REGIONS_TAG; - std::string REGIONS_TAG_DESC; + std::string filter_tag; + std::string filter_tag_desc; + bool clear_filter; /////// //i/o// @@ -56,19 +57,17 @@ class Igor : Program std::string fexp; Filter filter; bool filter_exists; - + ///////// //stats// ///////// - int32_t no_variants_annotated; + int32_t no_variants_filtered; int32_t no_variants; //////////////// //common tools// //////////////// VariantManip *vm; - OrderedRegionOverlapMatcher *orom_regions; - OrderedRegionOverlapMatcher *orom_gencode_cds; Igor(int argc, char **argv) { @@ -86,9 +85,10 @@ class Igor : Program TCLAP::ValueArg arg_intervals("i", "i", "intervals", false, "", "str", cmd); TCLAP::ValueArg arg_interval_list("I", "I", "file containing list of intervals []", false, "", "str", cmd); TCLAP::ValueArg arg_output_vcf_file("o", "o", "output VCF file [-]", false, "-", "str", cmd); - TCLAP::ValueArg arg_regions_bed_file("b", "b", "regions BED file []", true, "", "str", cmd); - TCLAP::ValueArg arg_REGIONS_TAG("t", "t", "regions tag []", true, "", "str", cmd); - TCLAP::ValueArg arg_REGIONS_TAG_DESC("d", "d", "regions tag description []", true, "", "str", cmd); + TCLAP::ValueArg arg_filter_tag("t", "t", "filter tag []", true, "", "str", cmd); + TCLAP::ValueArg arg_filter_tag_desc("d", "d", "filter tag description []", true, "", "str", cmd); + TCLAP::ValueArg arg_fexp("f", "f", "filter expression []", true, "", "str", cmd); + TCLAP::SwitchArg arg_clear_filter("x", "x", "clear filter [false]", cmd, false); TCLAP::UnlabeledValueArg arg_input_vcf_file("", "input VCF file", true, "","file", cmd); cmd.parse(argc, argv); @@ -96,9 +96,10 @@ class Igor : Program input_vcf_file = arg_input_vcf_file.getValue(); output_vcf_file = arg_output_vcf_file.getValue(); parse_intervals(intervals, arg_interval_list.getValue(), arg_intervals.getValue()); - regions_bed_file = arg_regions_bed_file.getValue(); - REGIONS_TAG = arg_REGIONS_TAG.getValue(); - REGIONS_TAG_DESC = arg_REGIONS_TAG_DESC.getValue(); + fexp = arg_fexp.getValue(); + clear_filter = arg_clear_filter.getValue(); + filter_tag = arg_filter_tag.getValue(); + filter_tag_desc = arg_filter_tag_desc.getValue(); } catch (TCLAP::ArgException &e) { @@ -118,7 +119,7 @@ class Igor : Program odw = new BCFOrderedWriter(output_vcf_file); odw->link_hdr(odr->hdr); - std::string hrec = "##INFO="; + std::string hrec = "##FILTER="; bcf_hdr_append(odw->hdr, hrec.c_str()); ///////////////////////// @@ -126,28 +127,28 @@ class Igor : Program ///////////////////////// filter.parse(fexp.c_str(), false); filter_exists = fexp=="" ? false : true; - + /////////////////////// //tool initialization// /////////////////////// - orom_regions = new OrderedRegionOverlapMatcher(regions_bed_file); + vm = new VariantManip(); //////////////////////// //stats initialization// //////////////////////// - no_variants_annotated = 0; + no_variants_filtered = 0; no_variants = 0; } void print_options() { - std::clog << "annotate_regions v" << version << "\n"; + std::clog << "hfilter v" << version << "\n"; std::clog << "\n"; std::clog << "options: input VCF file(s) " << input_vcf_file << "\n"; std::clog << " [o] output VCF file " << output_vcf_file << "\n"; - print_str_op(" [m] regions bed file ", regions_bed_file); - print_str_op(" [m] regions bed file ", regions_bed_file); - print_str_op(" [m] regions bed file ", regions_bed_file); + print_str_op(" [f] filter expression ", fexp); + print_str_op(" [t] filter tag ", filter_tag); + print_str_op(" [d] filter description ", filter_tag_desc); print_int_op(" [i] intervals ", intervals); std::clog << "\n"; } @@ -155,7 +156,7 @@ class Igor : Program void print_stats() { std::clog << "\n"; - std::cerr << "stats: no. of variants annotated " << no_variants_annotated << "\n"; + std::cerr << "stats: no. of variants filtered " << no_variants_filtered << "\n"; std::cerr << " total no. of variants " << no_variants << "\n";std::clog << "\n"; } @@ -165,32 +166,25 @@ class Igor : Program bcf1_t *v = odw->get_bcf1_from_pool(); bcf_hdr_t *h = odr->hdr; - std::vector overlaps; Variant variant; - kstring_t s = {0,0,0}; + int32_t filter_id = bcf_hdr_id2int(h, BCF_DT_ID, filter_tag.c_str()); while (odr->read(v)) { - if (filter_exists) + vm->classify_variant(h, v, variant); + if (filter.apply(h, v, &variant, false)) { - vm->classify_variant(h, v, variant); - if (!filter.apply(h, v, &variant, false)) + if (clear_filter) + { + bcf_update_filter(h, v, &filter_id, 1); + } + else { - continue; + bcf_add_filter(h, v, filter_id); } + + ++no_variants_filtered; } - bcf_unpack(v, BCF_UN_STR); - int32_t vtype = vm->classify_variant(odr->hdr, v, variant); - std::string chrom = bcf_get_chrom(odr->hdr,v); - int32_t start1 = bcf_get_pos1(v); - int32_t end1 = bcf_get_end_pos1(v); - - if (orom_regions->overlaps_with(chrom, start1, end1)) - { - bcf_update_info_flag(odr->hdr, v, REGIONS_TAG.c_str(), "", 1); - ++no_variants_annotated; - } - ++no_variants; odw->write(v); v = odw->get_bcf1_from_pool(); diff --git a/hfilter.h b/hfilter.h index bb0f6d00..b4ce6600 100644 --- a/hfilter.h +++ b/hfilter.h @@ -42,13 +42,8 @@ #include "bcf_ordered_writer.h" #include "hts_utils.h" #include "utils.h" -#include "interval_tree.h" -#include "variant_manip.h" #include "program.h" #include "variant_manip.h" -#include "log_tool.h" -#include "gencode.h" -#include "ordered_region_overlap_matcher.h" #include "filter.h" void hfilter(int argc, char ** argv); diff --git a/hts_utils.h b/hts_utils.h index fa52336c..93bd9ad5 100644 --- a/hts_utils.h +++ b/hts_utils.h @@ -452,6 +452,11 @@ void bcf_set_id(bcf1_t *v, char* id); */ #define bcf_get_qual(v) ((v)->qual) +/** + * Get n_filt of a bcf record + */ +#define bcf_get_n_filter(v) ((v)->d.n_flt) + /** * Get ith format name */ diff --git a/main.cpp b/main.cpp index 4c5dc228..ebbddd41 100644 --- a/main.cpp +++ b/main.cpp @@ -40,6 +40,7 @@ #include "estimate.h" #include "genotype2.h" #include "genotype.h" +#include "hfilter.h" #include "index.h" #include "merge_candidate_variants.h" #include "merge.h" @@ -300,6 +301,10 @@ int main(int argc, char ** argv) { consolidate_variants(argc-1, ++argv); } + else if (argc>1 && cmd=="filter") + { + hfilter(argc-1, ++argv); + } else if (argc>1 && cmd=="xcmp") { xcmp(argc-1, ++argv);