Skip to content

Loading…

Minor VCF parser improvements #85

Merged
merged 4 commits into from

2 participants

@superbobry

Hello,

  • Biocaml_vcf now supports reserved sub-fields for INFO and FORMAT.
  • Also, I've implemented basic ALT parsing; currently everything is stringly typed, so, for example, deletion will be represented as "<DEL>" instead of an appropriate ADT constructor. I'm not yet sure how to approach the types for a zillion of variations listed in VCF standard, any ideas here?

By the way, is it okay to use Hashtbl for VCF parser state? the size of VCF header is usually small, so container choice doesn't affect efficiency, but using immutable Map looks like a cleaner solution. What do you think?

@smondet smondet added a commit that referenced this pull request
@smondet smondet Merge from superbobry/v0.3-dev (PR #85) 7944033
@smondet smondet merged commit bfe73e7 into biocaml:v0.3-dev
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Showing with 205 additions and 66 deletions.
  1. +119 −27 src/lib/biocaml_vcf.ml
  2. +4 −15 src/lib/biocaml_vcf.mli
  3. +8 −0 src/tests/data/vcf_04_reserved.vcf
  4. +18 −0 src/tests/data/vcf_05_alt.vcf
  5. +56 −24 src/tests/test_vcf.ml
View
146 src/lib/biocaml_vcf.ml
@@ -70,21 +70,10 @@ let coerce_to_vcf_info_type t s =
| `flag_value -> Ok (`flag s)
| t -> coerce_to_vcf_format_type t s
-type vcf_alt_type =
- | Deletion
- | Insertion
- | Duplication
- | Inversion
- | CNV
-
-type vcf_alt_subtype = string
-
type vcf_info_meta = Info of vcf_number * vcf_info_type * vcf_description
type vcf_filter_meta = Filter of vcf_description
-type vcf_format_meta =
- Format of vcf_number * vcf_format_type * vcf_description
-type vcf_alt_meta =
- Alt of vcf_alt_type * vcf_alt_subtype list * vcf_description
+type vcf_format_meta = Format of vcf_number * vcf_format_type * vcf_description
+type vcf_alt_meta = Alt of vcf_description
type vcf_meta = {
vcfm_version : string;
@@ -92,22 +81,96 @@ type vcf_meta = {
vcfm_info : (vcf_id, vcf_info_meta) Hashtbl.t;
vcfm_filters : (vcf_id * vcf_filter_meta) list;
vcfm_format : (vcf_id, vcf_format_meta) Hashtbl.t;
- vcfm_alt : vcf_alt_meta list;
+ vcfm_alt : (string, vcf_alt_meta) Hashtbl.t;
vcfm_arbitrary : (string, string) Hashtbl.t;
vcfm_header : string list
}
+(** Note(superbobry): this is mostly taken from PyVCF module by
+ James Casbon. See https://github.com/jamescasbon/PyVCF. The
+ standard does _NOT_ specify how many arguments are allowed
+ for each reserved item, so we assume 'Unknown'. *)
+let reserved_info = Hashtbl.Poly.of_alist_exn [
+ ("AA", `string_value);
+ ("AC", `integer_value);
+ ("AF", `float_value);
+ ("AN", `integer_value);
+ ("BQ", `float_value);
+ ("CIGAR", `string_value);
+ ("DB", `flag_value);
+ ("DP", `integer_value);
+ ("H2", `flag_value);
+ ("MQ", `float_value);
+ ("MQ0", `integer_value);
+ ("NS", `integer_value);
+ ("SB", `string_value);
+ ("SOMATIC", `flag_value);
+ ("VALIDATED", `flag_value);
+
+ (* VCF 4.1 Additions *)
+ ("IMPRECISE", `flag_value);
+ ("NOVEL", `flag_value);
+ ("END", `integer_value);
+ ("SVTYPE", `string_value);
+ ("CIPOS", `integer_value);
+ ("CIEND", `integer_value);
+ ("HOMLEN", `integer_value);
+ ("HOMSEQ", `integer_value);
+ ("BKPTID", `string_value);
+ ("MEINFO", `string_value);
+ ("METRANS", `string_value);
+ ("DGVID", `string_value);
+ ("DBVARID", `string_value);
+ ("MATEID", `string_value);
+ ("PARID", `string_value);
+ ("EVENT", `string_value);
+ ("CILEN", `integer_value);
+ ("CN", `integer_value);
+ ("CNADJ", `integer_value);
+ ("CICN", `integer_value);
+ ("CICNADJ",`integer_value)
+ ]
+
+and reserved_format = Hashtbl.Poly.of_alist_exn [
+ ("GT", `string_value);
+ ("DP", `integer_value);
+ ("FT", `string_value);
+ ("GL", `float_value);
+ ("GQ", `float_value);
+ ("HQ", `float_value);
+
+ (* VCF 4.1 Additions *)
+ ("CN", `integer_value);
+ ("CNQ", `float_value);
+ ("CNL", `float_value);
+ ("NQ", `integer_value);
+ ("HAP", `integer_value);
+ ("AHAP", `integer_value)
+ ]
+
+and reserved_alt = Hashtbl.Poly.of_alist_exn [
+ ("DEL", Alt "Deletion relative to the reference");
+ ("INS", Alt "Insertion of novel sequence relative to the reference");
+ ("DUP", Alt "Region of elevated copy number relative to the reference");
+ ("INV", Alt "Inversion of reference sequence");
+ ("CNV", Alt "Copy number variable region");
+ ("DUP:TANDEM", Alt "TANDEM Tandem duplication");
+ ("DEL:ME", Alt "ME Deletion of mobile element relative to the reference");
+ ("INS:ME", Alt "ME Insertion of a mobile element relative to the reference");
+]
+
let default_meta = {
vcfm_version = "<unknown>";
vcfm_id_cache = Set.Poly.empty;
vcfm_info = Hashtbl.Poly.create ();
vcfm_filters = [];
vcfm_format = Hashtbl.Poly.create ();
- vcfm_alt = [];
+ vcfm_alt = reserved_alt;
vcfm_arbitrary = Hashtbl.Poly.create ();
vcfm_header = []
}
+
type vcf_format = [ `integer of int
| `float of float
| `character of char
@@ -120,7 +183,7 @@ type vcf_row = {
vcfr_pos : int;
vcfr_ids : string list;
vcfr_ref : string;
- vcfr_alts : string list;
+ vcfr_alts : string list; (* FIXME(superbobry): proper typing! *)
vcfr_qual : float option;
vcfr_filter : vcf_id list;
vcfr_info : (vcf_id, vcf_info list) Hashtbl.t
@@ -134,6 +197,7 @@ type vcf_parse_row_error =
| `invalid_dna of string
| `unknown_info of vcf_id
| `unknown_filter of vcf_id
+ | `unknown_alt of string
| `duplicate_ids of vcf_id list
| `invalid_arguments_length of vcf_id * int * int
| `arbitrary_width_rows_not_supported
@@ -143,7 +207,6 @@ type vcf_parse_error =
[ `malformed_meta of Pos.t * string
| `malformed_row of Pos.t * vcf_parse_row_error * string
| `malformed_header of Pos.t * string
- | `alt_parsing_not_implemented of Pos.t
| `arbitrary_width_rows_not_supported of Pos.t
| `incomplete_input of Pos.t * Biocaml_lines.item list * string option
| `not_ready
@@ -169,7 +232,7 @@ let string_to_vcfr_info { vcfm_info } s =
let open Result.Monad_infix in
let chunk_values = match Hashtbl.find vcfm_info key with
- | Some (Info (Number 0, `flag_value, _description))
+ | Some (Info (_t, `flag_value, _description))
when raw_value = "" -> return [`flag key]
| Some (Info (Number n, t, _description)) ->
string_to_t raw_value t >>= fun values ->
@@ -210,14 +273,21 @@ let string_to_vcfr_ids { vcfm_id_cache } s =
then return chunks
else fail (`duplicate_ids duplicate_ids)
-let string_to_vcfr_alts s =
+let string_to_vcfr_alts { vcfm_alt } s =
let open Result.Monad_infix in
match String.split ~on:',' s with
| ["."] -> return []
| chunks ->
- match List.find chunks ~f:(fun chunk -> not (is_valid_dna chunk)) with
- | Some invalid -> fail (`invalid_dna invalid)
- | None -> return chunks
+ let res = List.map chunks ~f:(fun chunk ->
+ let n = String.length chunk in
+ match (chunk.[0] = '<' && chunk.[n - 1] = '>', is_valid_dna chunk) with
+ | (true, _) ->
+ if Hashtbl.mem vcfm_alt (String.sub ~pos:1 ~len:(n - 2) chunk)
+ then return chunk
+ else fail (`unknown_alt chunk)
+ | (false, true) -> return chunk
+ | (false, false) -> fail (`invalid_dna chunk)) (* Impossible. *)
+ in Result.all res
let list_to_vcf_row meta chunks =
match chunks with
@@ -227,7 +297,7 @@ let list_to_vcf_row meta chunks =
Safe.int_of_string raw_pos >>= fun vcfr_pos ->
string_to_vcfr_ids meta raw_id >>= fun vcfr_ids ->
string_to_vcfr_ref raw_ref >>= fun vcfr_ref ->
- string_to_vcfr_alts raw_alt >>= fun vcfr_alts ->
+ string_to_vcfr_alts meta raw_alt >>= fun vcfr_alts ->
string_to_vcfr_info meta raw_info >>= fun vcfr_info ->
string_to_vcfr_filter meta raw_filter >>= fun vcfr_filter ->
let row = {
@@ -251,6 +321,7 @@ let parse_row_error_to_string : vcf_parse_row_error -> string = function
| `invalid_dna s -> sprintf "invalid_dna (%s)" s
| `unknown_info s -> sprintf "unknown_info (%s)" s
| `unknown_filter f -> sprintf "unknown_filter (%s)" f
+| `unknown_alt s -> sprintf "unknown_alt (%s)" s
| `duplicate_ids ids ->
sprintf "duplicate_ids (%s)" (String.concat ~sep:", " ids)
| `invalid_arguments_length (key, got, expected) ->
@@ -263,8 +334,6 @@ let parse_error_to_string : vcf_parse_error -> string =
| `malformed_row (p, err, s) ->
sprintf "malformed_row (%s, %a, %S)" (parse_row_error_to_string err) pos p s
| `malformed_header (p, s) -> sprintf "malformed_header (%a, %s)" pos p s
- | `alt_parsing_not_implemented p ->
- sprintf "alt_parsing_not_implemented (%a)" pos p
| `arbitrary_width_rows_not_supported p ->
sprintf "arbitrary_width_rows_not_supported (%a)" pos p
| _ -> sprintf "unknown_error"
@@ -307,7 +376,11 @@ module Transform = struct
in Hashtbl.set vcfm_format id format_meta);
return (`partial meta)
| Some ("ALT", v) ->
- fail (`alt_parsing_not_implemented (current_position p))
+ Scanf.sscanf v "<ID=%s@,Description=%S>"
+ (fun id description ->
+ let alt_meta = Alt description in
+ Hashtbl.set vcfm_alt id alt_meta);
+ return (`partial meta)
| Some (k, v) -> begin
Hashtbl.set meta.vcfm_arbitrary ~key:k ~data:v;
return (`partial meta)
@@ -327,7 +400,26 @@ module Transform = struct
| "FORMAT" :: _ ->
fail (`arbitrary_width_rows_not_supported (current_position p))
| _ :: _ -> fail (`malformed_header (current_position p, l))
- | [] -> return (`complete { meta with vcfm_header = chunks })
+ | [] ->
+ let merge_with_reserved ~c = Hashtbl.merge
+ ~f:(fun ~key v ->
+ match v with
+ | `Left t -> Some (c Unknown t "<reserved>")
+ | `Right parsed -> Some parsed
+ | `Both (_t, parsed) -> Some parsed)
+ in
+
+ (** Note(superbobry): merge parsed INFO and FORMAT entries with
+ predefined ones; a VCF file _may_ override any of the
+ reserved fields, in that case, default definition won't be
+ used. *)
+ let vcfm_info = merge_with_reserved reserved_info vcfm_info
+ ~c:(fun n t description -> Info (n, t, description));
+ and vcfm_format = merge_with_reserved reserved_format vcfm_format
+ ~c:(fun n t description -> Format (n, t, description));
+ in return (`complete { meta with vcfm_header = chunks;
+ vcfm_info;
+ vcfm_format })
end
| _ -> fail (`malformed_header (current_position p, l))
end
View
19 src/lib/biocaml_vcf.mli
@@ -16,21 +16,10 @@ type vcf_format_type = [ `integer_value
type vcf_info_type = [ vcf_format_type | `flag_value ]
-type vcf_alt_type =
- | Deletion
- | Insertion
- | Duplication
- | Inversion
- | CNV
-
-type vcf_alt_subtype = string
-
type vcf_info_meta = Info of vcf_number * vcf_info_type * vcf_description
type vcf_filter_meta = Filter of vcf_description
-type vcf_format_meta =
- Format of vcf_number * vcf_format_type * vcf_description
-type vcf_alt_meta =
- Alt of vcf_alt_type * vcf_alt_subtype list * vcf_description
+type vcf_format_meta = Format of vcf_number * vcf_format_type * vcf_description
+type vcf_alt_meta = Alt of vcf_description
type vcf_meta = {
vcfm_version : string;
@@ -38,7 +27,7 @@ type vcf_meta = {
vcfm_info : (vcf_id, vcf_info_meta) Hashtbl.t;
vcfm_filters : (vcf_id * vcf_filter_meta) list;
vcfm_format : (vcf_id, vcf_format_meta) Hashtbl.t;
- vcfm_alt : vcf_alt_meta list;
+ vcfm_alt : (string, vcf_alt_meta) Hashtbl.t;
vcfm_arbitrary : (string, string) Hashtbl.t;
vcfm_header : string list
}
@@ -73,6 +62,7 @@ type vcf_parse_row_error =
| `invalid_dna of string
| `unknown_info of vcf_id
| `unknown_filter of vcf_id
+ | `unknown_alt of string
| `duplicate_ids of vcf_id list
| `invalid_arguments_length of vcf_id * int * int
| `arbitrary_width_rows_not_supported
@@ -82,7 +72,6 @@ type vcf_parse_error =
[ `malformed_meta of Pos.t * string
| `malformed_row of Pos.t * vcf_parse_row_error * string
| `malformed_header of Pos.t * string
- | `alt_parsing_not_implemented of Pos.t
| `arbitrary_width_rows_not_supported of Pos.t
| `incomplete_input of Pos.t * Biocaml_lines.item list * string option
| `not_ready
View
8 src/tests/data/vcf_04_reserved.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.1
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##phasing=partial
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+#CHROM POS ID REF ALT QUAL FILTER INFO
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2
View
18 src/tests/data/vcf_05_alt.vcf
@@ -0,0 +1,18 @@
+##fileformat=VCFv4.1
+##fileDate=20100501
+##reference=1000GenomesPilot-NCBI36
+##assembly=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta
+##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">
+##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
+##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
+##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+#CHROM POS ID REF ALT QUAL FILTER INFO
+2 321682 . T <DEL> 6 PASS IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62
+2 14477084 . C <DEL:ME:ALU> 12 PASS IMPRECISE;SVTYPE=DEL;END=14477381;SVLEN=-297;MEINFO=AluYa5,5,307,+;CIPOS=-22,18;CIEND=-12,32
View
80 src/tests/test_vcf.ml
@@ -22,8 +22,7 @@ let compare_rows r1 r2 =
let make_row ~chrom ~pos ~ids ~ref ~alts ~qual ~filter ~info =
let open Vcf in
- let vcfr_info = Hashtbl.Poly.create () in
- List.iter ~f:(fun (k, v) -> Hashtbl.set vcfr_info k v) info;
+ let vcfr_info = Hashtbl.Poly.of_alist_exn info in
{
vcfr_chrom = chrom;
vcfr_pos = pos;
@@ -35,6 +34,19 @@ let make_row ~chrom ~pos ~ids ~ref ~alts ~qual ~filter ~info =
vcfr_info
}
+let test_parse_vcf_generic filename rows =
+ let s = make_stream filename in
+ List.iter rows ~f:(fun row ->
+ match Stream.next s with
+ | Some (Ok actual_row) ->
+ assert_equal ~cmp:compare_rows row actual_row
+ | Some (Error err) ->
+ let msg = Vcf.parse_error_to_string err in
+ assert_bool
+ (sprintf "%s:row *not* parsed, reason: %s" filename msg)
+ false
+ | None -> assert_bool (sprintf "%s:row missing" filename) false)
+
let test_parse_vcf_header () =
let s = make_stream "vcf_01_header_only.vcf" in
match Stream.next s with
@@ -45,8 +57,8 @@ let test_parse_vcf_header () =
assert_bool (Printf.sprintf "test_parse_vcf_header, reason: %s" msg) false
let test_parse_vcf_simple () =
- let s = make_stream "vcf_02_simple.vcf" in
- let row = make_row ~chrom:"20" ~pos:14370 ~ids:["rs6054257"]
+ test_parse_vcf_generic "vcf_02_simple.vcf" [
+ make_row ~chrom:"20" ~pos:14370 ~ids:["rs6054257"]
~ref:"G" ~alts:["A"]
~qual:(Some 29.0) ~filter:[]
~info:[("NS", [`integer 3]);
@@ -54,18 +66,10 @@ let test_parse_vcf_simple () =
("AF", [`float 0.5]);
("DB", [`flag "DB"]);
("H2", [`flag "H2"])]
- in match Stream.next s with
- | Some (Ok actual_row) -> assert_equal ~cmp:compare_rows actual_row row
- | Some (Error err) ->
- let msg = Vcf.parse_error_to_string err in
- assert_bool
- (Printf.sprintf "test_parse_vcf_simple:row *not* parsed, reason: %s" msg)
- false
- | None -> assert_bool "test_parse_vcf1000g:row missing" false
+ ]
let test_parse_vcf_1000g () =
- let s = make_stream "vcf_03_1000g.vcf" in
- let rows = [
+ test_parse_vcf_generic "vcf_03_1000g.vcf" [
make_row ~chrom:"20" ~pos:17330 ~ids:[]
~ref:"T" ~alts:["A"]
~qual:(Some 3.0) ~filter:["q10"]
@@ -92,19 +96,47 @@ let test_parse_vcf_1000g () =
~info:[("NS", [`integer 3]);
("DP", [`integer 9]);
("AA", [`string "G"])]
- ] in List.iter rows ~f:(fun row ->
- match Stream.next s with
- | Some (Ok actual_row) ->
- assert_equal ~cmp:compare_rows row actual_row
- | Some (Error err) ->
- let msg = Vcf.parse_error_to_string err in
- assert_bool
- (Printf.sprintf "test_parse_vcf1000g:row *not* parsed, reason: %s" msg)
- false
- | None -> assert_bool "test_parse_vcf1000g:row missing" false)
+ ]
+
+let test_parse_vcf_reserved () =
+ test_parse_vcf_generic "vcf_04_reserved.vcf" [
+ make_row ~chrom:"20" ~pos:14370 ~ids:["rs6054257"]
+ ~ref:"G" ~alts:["A"]
+ ~qual:(Some 29.0) ~filter:[]
+ ~info:[("NS", [`integer 3]);
+ ("DP", [`integer 14]);
+ ("AF", [`float 0.5]);
+ ("DB", [`flag "DB"]);
+ ("H2", [`flag "H2"])]
+ ]
+
+let test_parse_vcf_alt () =
+ test_parse_vcf_generic "vcf_05_alt.vcf" [
+ make_row ~chrom:"2" ~pos:321682 ~ids:[]
+ ~ref:"T" ~alts:["<DEL>"]
+ ~qual:(Some 6.0) ~filter:[]
+ ~info:[("IMPRECISE", [`flag "IMPRECISE"]);
+ ("SVTYPE", [`string "DEL"]);
+ ("END", [`integer 321887]);
+ ("SVLEN", [`integer (-105)]);
+ ("CIPOS", [`integer (-56); `integer 20]);
+ ("CIEND", [`integer (-10); `integer 62])];
+ make_row ~chrom:"2" ~pos:14477084 ~ids:[]
+ ~ref:"C" ~alts:["<DEL:ME:ALU>"]
+ ~qual:(Some 12.0) ~filter:[]
+ ~info:[("IMPRECISE", [`flag "IMPRECISE"]);
+ ("SVTYPE", [`string "DEL"]);
+ ("END", [`integer 14477381]);
+ ("SVLEN", [`integer (-297)]);
+ ("MEINFO", [`string "AluYa5"; `string "5"; `string "307"; `string "+"]);
+ ("CIPOS", [`integer (-22); `integer 18]);
+ ("CIEND", [`integer (-12); `integer 32])]
+ ]
let tests = "VCF" >::: [
"Parse VCF header" >:: test_parse_vcf_header;
"Parse simple VCF (1 row)" >:: test_parse_vcf_simple;
"Parse sample VCF from 1000g project" >:: test_parse_vcf_1000g;
+ "Parse VCF missing INFO for reserved sub-fields" >:: test_parse_vcf_reserved;
+ "Parse VCF with custom ALT" >:: test_parse_vcf_alt;
]
Something went wrong with that request. Please try again.