Skip to content

Commit

Permalink
[BDG-FORMATS-29] Re-organize the Feature schema
Browse files Browse the repository at this point in the history
This attempts to re-organize the Feature schema, along the lines of the
discussions that we (Timothy, Uri, Matt, Frank) have had in email and on
the phone.  The main requirements are:
- less file-format dependence in the field choice ('qValue'-like fields
  could be relegated to the 'attributes' field)
- fewer fields to improve the memory footprint
  • Loading branch information
tdanford committed Sep 17, 2014
1 parent 05c8cd5 commit c766065
Showing 1 changed file with 57 additions and 57 deletions.
114 changes: 57 additions & 57 deletions src/main/resources/avro/bdg.avdl
Original file line number Diff line number Diff line change
Expand Up @@ -684,68 +684,68 @@ enum Strand {
}

record Feature {
// NOTE(review): this listing comes from a diff view with the +/- markers
// stripped, so it appears to interleave the pre-change and post-change
// versions of the record. As shown, 'source', 'featureType', 'value',
// 'dbxrefs' and 'parentIds' are each declared twice; confirm against the
// committed file which declaration of each is the surviving one.
//
// identifier for the particular feature object
// if provided, then preferably unique within a given trackName
/**
Preferably unique ID for this Feature object
*/
union { null, string } featureId = null;

// the name of this feature-type/track (e.g., centipede, conservation, etc.)
union { null, string } trackName = null;

// list of keys into outside databases
// NOTE(review): 'null' default on a non-union array type -- verify that the
// Avro tooling in use accepts this (a union with null may be required).
array<string> dbxrefs = null;

// pointers to parent features, a la Chado feature database schema
// parentIds and parentdbxrefs should correspond to each other
// only inconsistency is Chado would treat Contig as a Feature;
array<string> parentIds = null;
array<string> parentdbxrefs = null;

// coordinate system to locate against

/**
The type of feature this is (aka, "track").
Examples are "conservation", "centipede", "gene"
*/
union { null, string } featureType = null;

/**
The original source for this feature.
Path/filename/URL and/or the file type (e.g., BED, GFF, etc.)
*/
union { null, string } source = null;

/**
The contig this feature is located on
*/
union { null, Contig } contig = null;

// position

/**
Start coordinate on the contig
*/
union { null, long } start = null;

/**
End coordinate on the contig
*/
union { null, long } end = null;

/**
Strand information for this feature
*/
union { null, Strand } strand = null;

// base observation field (may hold a long, a double, or a string)
union { null, long, double, string } value = null;

// BED format - http://genome.ucsc.edu/FAQ/FAQformat.html
// chrom -- parsed into contig
// chromStart -- parsed into start
// chromEnd -- parsed into end
// name -- parsed into trackName
// score -- parsed into value
// strand -- parsed into strand
// The remaining BED columns are kept as dedicated fields:
union { null, long } thickStart = null;
union { null, long } thickEnd = null;
union { null, string } itemRgb = null;
// should these be parsed into new Features while setting their parentIds?
union { null, long } blockCount = null;
array<long> blockSizes = null;
array<long> blockStarts = null;

// GFF2 format - http://www.sanger.ac.uk/resources/software/gff/spec.html
// seqname -- parsed into contig
// source -- parsed into trackName
// feature -- parsed into trackName
// start -- parsed into start
// end -- parsed into end
// score -- parsed into value
// strand -- parsed into strand
union { null, long } frame = null;

// Also, parsing out 'source' and 'feature' into separate fields, since we'll
// need to independently sort and handle these values.
union { null, string } source = null;
union { null, string } featureType = null;

// narrowPeak format - (BED6+4)
union { null, double } signalValue = null;
union { null, double } pValue = null;
union { null, double } qValue = null;
union { null, long } peak = null;

/**
The value associated with this feature (if double)
*/
union { null, double } value = null;

/**
Cross-references into other databases.
Key is database name and value is the accession.
*/
map<string> dbxrefs = null;

/**
List of parent featureIds, for implementing feature hierarchies/graphs.
*/
array<string> parentIds = null;

/**
Additional feature info that doesn't fit into the standard fields above.
They are all encoded as (string, string) key-value pairs.
*/
// NOTE(review): 'null' default on a non-union map type -- verify that the
// Avro tooling in use accepts this (a union with null may be required).
map<string> attributes = null;
}

}

0 comments on commit c766065

Please sign in to comment.