[BDG-FORMATS-29] Re-organize the Feature schema

This change re-organizes the Feature schema, following the discussions that we (Timothy, Uri, Matt, Frank) have had over email and on the phone. The main requirements are: (1) less file-format dependence in the choice of fields (for example, 'qValue'-like fields can be relegated to the 'attributes' field); and (2) fewer fields, to reduce the memory footprint.
bigdatagenomics · Sep 17, 2014 · c766065 · c766065
1 parent 05c8cd5
commit c766065
Showing 1 changed file with 57 additions and 57 deletions.
diff --git a/src/main/resources/avro/bdg.avdl b/src/main/resources/avro/bdg.avdl
@@ -684,68 +684,68 @@ enum Strand {
 }
 
 record Feature {
-  // identifier for the particular feature object
-  // if provided, then preferably unique within a given trackName
+  /**
+   Preferably unique ID for this Feature object
+   */
   union { null, string } featureId = null;
-
-  // the name of this feature-type/track (e.g., centipede, conservation, etc.)
-  union { null, string } trackName = null;
-
-  // list of keys into outside databases
-  array<string> dbxrefs = null;
-
-  // pointers to parent features, a la Chado feature database schema
-  // parentIds and parentdbxrefs should correspond to each other
-  // only inconsistency is Chado would treat Contig as a Feature;
-  array<string> parentIds = null;
-  array<string> parentdbxrefs = null;
-
-  // coordinate system to locate against
+
+  /**
+   The type of feature this is (aka, "track").
+   
+   Examples are "conservation", "centipede", "gene"
+   */
+  union { null, string } featureType = null;
+
+  /**
+   The original source for this feature.
+   
+   Path/filename/URL and/or the file type (e.g., BED, GFF, etc.)
+   */
+  union { null, string } source = null;
+
+  /**
+   The contig this feature is located on
+   */
   union { null, Contig } contig = null;
-
-  // position
+
+  /**
+   Start coordinate on the contig
+   */
   union { null, long } start = null;
+
+  /**
+   End coordinate on the contig
+   */
   union { null, long } end = null;
+
+  /**
+   Strand information for this feature
+   */
   union { null, Strand } strand = null;
-
-  // base observation field
-  union { null, long, double, string } value = null;
-
-  // BED format - http://genome.ucsc.edu/FAQ/FAQformat.html
-  // chrom -- parsed into contig
-  // chromStart -- parsed into start
-  // chromEnd -- parsed into end
-  // name -- parsed into trackName
-  // score -- parsed into value
-  // strand -- parsed into strand
-  union { null, long } thickStart = null;
-  union { null, long } thickEnd = null;
-  union { null, string } itemRgb = null;
-  // should these be parsed into new Features while setting their parentIds?
-  union { null, long } blockCount = null;
-  array<long> blockSizes = null;
-  array<long> blockStarts = null;
-
-  // GFF2 format - http://www.sanger.ac.uk/resources/software/gff/spec.html
-  // seqname -- parsed into contig
-  // source -- parsed into trackName
-  // feature --  parsed into trackName
-  // start -- parsed into start
-  // end -- parsed into end
-  // score -- parsed into value
-  // strand -- parsed into strand
-  union { null, long } frame = null;
-
-  // Also, parsing out 'source' and 'feature' into separate fields, since we'll
-  // need to independently sort and handle these values.
-  union { null, string } source = null;
-  union { null, string } featureType = null;
-
-  // narrowPeak format - (BED6+4)
-  union { null, double } signalValue = null;
-  union { null, double } pValue = null;
-  union { null, double } qValue = null;
-  union { null, long } peak = null;
+
+  /**
+   The value associated with this feature (if double)
+   */
+  union { null, double } value = null;
+
+  /**
+   Cross-references into other databases.
+   
+   Key is database name and value is the accession.
+   */
+  map<string> dbxrefs = null;
+
+  /**
+   List of parent featureIds, for implementing feature hierarchies/graphs.
+   */
+  array<string> parentIds = null;
+
+  /**
+   Additional feature info that doesn't fit into the standard fields above.
+   
+   They are all encoded as (string, string) key-value pairs.
+   */
+  map<string> attributes = null;
 }
 
 }