Add proteinId field to Feature.

heuermh committed Oct 14, 2021
1 parent 1cb9111 commit bcbc800

Showing 3 changed files with 114 additions and 36 deletions.
@@ -389,6 +389,14 @@ case class DatasetBoundFeatureDataset private[ds] (
     transformDataset(dataset => dataset.filter(dataset.col("exonId") isin (exonIds: _*)))
   }
 
+  override def filterToProtein(proteinId: String): FeatureDataset = {
+    transformDataset(dataset => dataset.filter(dataset.col("proteinId").eqNullSafe(proteinId)))
+  }
+
+  override def filterToProteins(proteinIds: Seq[String]): FeatureDataset = {
+    transformDataset(dataset => dataset.filter(dataset.col("proteinId") isin (proteinIds: _*)))
+  }
+
   override def filterByScore(minimumScore: Double): FeatureDataset = {
     transformDataset(dataset => dataset.filter(dataset.col("score").geq(minimumScore)))
   }
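Note: the dataset-bound single-id filter uses eqNullSafe, so rows whose proteinId column is null simply fail the predicate rather than evaluating to null. A hedged usage sketch of the new methods (the GFF3 path and protein ids are illustrative, not from this commit, and the import path is assumed for this era of the codebase):

import org.bdgenomics.adam.ds.ADAMContext._

// load features, then narrow to a single protein id or a set of isoform ids
val features = sc.loadFeatures("features.gff3")
val single = features.filterToProtein("O14640")
val isoforms = features.filterToProteins(Seq("O14640-2", "O14640-3"))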
@@ -654,6 +662,36 @@ sealed abstract class FeatureDataset extends AvroGenomicDataset[Feature, Feature
     transform((rdd: RDD[Feature]) => rdd.filter(f => Option(f.getExonId).exists(exonIds.contains(_))))
   }
 
+  /**
+   * Filter this FeatureDataset by protein to those that match the specified protein.
+   *
+   * @param proteinId Protein to filter by.
+   * @return FeatureDataset filtered by the specified protein.
+   */
+  def filterToProtein(proteinId: String): FeatureDataset = {
+    transform((rdd: RDD[Feature]) => rdd.filter(f => Option(f.getProteinId).exists(_.equals(proteinId))))
+  }
+
+  /**
+   * (Java-specific) Filter this FeatureDataset by protein to those that match the specified proteins.
+   *
+   * @param proteinIds List of proteins to filter by.
+   * @return FeatureDataset filtered by the specified proteins.
+   */
+  def filterToProteins(proteinIds: java.util.List[String]): FeatureDataset = {
+    filterToProteins(asScalaBuffer(proteinIds))
+  }
+
+  /**
+   * (Scala-specific) Filter this FeatureDataset by protein to those that match the specified proteins.
+   *
+   * @param proteinIds Sequence of proteins to filter by.
+   * @return FeatureDataset filtered by the specified proteins.
+   */
+  def filterToProteins(proteinIds: Seq[String]): FeatureDataset = {
+    transform((rdd: RDD[Feature]) => rdd.filter(f => Option(f.getProteinId).exists(proteinIds.contains(_))))
+  }
+
   /**
    * Filter this FeatureDataset by score.
    *
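The RDD-bound implementations wrap getProteinId in Option, so features without a protein id are dropped instead of raising a NullPointerException, and the java.util.List overload mirrors the existing (Java-specific) filters. A minimal sketch of that null-safe predicate on plain values (builder calls mirror the test suite below; illustrative only):

import org.bdgenomics.formats.avro.Feature

val proteinIds = Seq("O14640-2", "O14640-3")
val withId = Feature.newBuilder().setReferenceName("1").setStart(1L).setEnd(101L).setProteinId("O14640-2").build()
val withoutId = Feature.newBuilder().setReferenceName("1").setStart(2L).setEnd(102L).build()
// keeps only withId: the unset proteinId yields None, never an NPE
val kept = Seq(withId, withoutId).filter(f => Option(f.getProteinId).exists(proteinIds.contains(_)))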
@@ -159,6 +159,7 @@ private[feature] object Features {
        case "gene_id" => f.setGeneId(entry._2)
        case "transcript_id" => f.setTranscriptId(entry._2)
        case "exon_id" => f.setExonId(entry._2)
+       case "protein_id" => f.setProteinId(entry._2)
        // unrecognized key, save to attributes
        case _ => remaining += entry
      }
@@ -200,6 +201,7 @@ private[feature] object Features {
     Option(feature.getGeneId).foreach(attrs += Tuple2("gene_id", _))
     Option(feature.getTranscriptId).foreach(attrs += Tuple2("transcript_id", _))
     Option(feature.getExonId).foreach(attrs += Tuple2("exon_id", _))
+    Option(feature.getProteinId).foreach(attrs += Tuple2("protein_id", _))
     Option(feature.getSampleId).foreach(attrs += Tuple2("sampleId", _))
     for (alias <- feature.getAliases) attrs += Tuple2("Alias", alias)
     for (note <- feature.getNotes) attrs += Tuple2("Note", note)
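With both hunks in object Features, protein_id round-trips through GTF-style attributes: parsing populates Feature.proteinId instead of stashing the key in the generic attributes map, and gathering attributes for output emits the key again. Schematically (the attribute syntax and builder below are illustrative; the parse entry point itself is not part of this diff):

import org.bdgenomics.formats.avro.Feature

// a GTF attributes column such as
//   gene_id "DVL1"; transcript_id "ENST00000339381"; protein_id "O14640";
// now maps protein_id onto the record, and writing the record emits it back
val parsed = Feature.newBuilder()
  .setGeneId("DVL1")
  .setTranscriptId("ENST00000339381")
  .setProteinId("O14640")
  .build()
assert(parsed.getProteinId == "O14640")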
@@ -1237,118 +1237,156 @@ class FeatureDatasetSuite extends ADAMFunSuite {
 
   sparkTest("filter RDD bound features by gene") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToGene("CCDS22.1").rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by gene") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.1").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToGene("CCDS22.1").rdd.count() === 2)
   }
 
   sparkTest("filter RDD bound features by genes") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToGenes(Seq("CCDS22.1", "CCDS22.2")).rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by genes") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setGeneId("DVL1").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setGeneId("CCDS22.1").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setGeneId("CCDS22.2").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToGenes(Seq("CCDS22.1", "CCDS22.2")).rdd.count() === 2)
   }
 
   sparkTest("filter RDD bound features by transcript") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToTranscript("ENST00000445648").rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by transcript") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445648").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToTranscript("ENST00000445648").rdd.count() === 2)
   }
 
   sparkTest("filter RDD bound features by transcripts") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToTranscripts(Seq("ENST00000445648", "ENST00000445649")).rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by transcripts") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setTranscriptId("ENST00000339381").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setTranscriptId("ENST00000445648").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setTranscriptId("ENST00000445649").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToTranscripts(Seq("ENST00000445648", "ENST00000445649")).rdd.count() === 2)
   }
 
   sparkTest("filter RDD bound features by exon") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToExon("ENSE00001779983").rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by exon") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779983").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToExon("ENSE00001779983").rdd.count() === 2)
   }
 
   sparkTest("filter RDD bound features by exons") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     assert(features.filterToExons(Seq("ENSE00001779983", "ENSE00001779984")).rdd.count() === 2)
   }
 
   sparkTest("filter dataset bound features by exons") {
     val fb = Feature.newBuilder()
-    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build();
-    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build();
-    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build();
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setExonId("ENSE00001691126").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setExonId("ENSE00001779983").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setExonId("ENSE00001779984").build()
     val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
     val featuresDs = features.transformDataset(ds => ds)
     assert(features.filterToExons(Seq("ENSE00001779983", "ENSE00001779984")).rdd.count() === 2)
   }
 
+  sparkTest("filter RDD bound features by protein") {
+    val fb = Feature.newBuilder()
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setProteinId("O14640").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setProteinId("O14640-2").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setProteinId("O14640-2").build()
+    val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
+    assert(features.filterToProtein("O14640-2").rdd.count() === 2)
+  }
+
+  sparkTest("filter dataset bound features by protein") {
+    val fb = Feature.newBuilder()
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setProteinId("O14640").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setProteinId("O14640-2").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setProteinId("O14640-2").build()
+    val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
+    val featuresDs = features.transformDataset(ds => ds)
+    assert(features.filterToProtein("O14640-2").rdd.count() === 2)
+  }
+
+  sparkTest("filter RDD bound features by proteins") {
+    val fb = Feature.newBuilder()
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setProteinId("O14640").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setProteinId("O14640-2").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setProteinId("O14640-3").build()
+    val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
+    assert(features.filterToProteins(Seq("O14640-2", "O14640-3")).rdd.count() === 2)
+  }
+
+  sparkTest("filter dataset bound features by proteins") {
+    val fb = Feature.newBuilder()
+    val f1 = fb.setReferenceName("1").setStart(1L).setEnd(101L).setProteinId("O14640").build()
+    val f2 = fb.setReferenceName("1").setStart(2L).setEnd(102L).setProteinId("O14640-2").build()
+    val f3 = fb.setReferenceName("1").setStart(3L).setEnd(103L).setProteinId("O14640-3").build()
+    val features = FeatureDataset(sc.parallelize(Seq(f1, f2, f3)))
+    val featuresDs = features.transformDataset(ds => ds)
+    assert(features.filterToProteins(Seq("O14640-2", "O14640-3")).rdd.count() === 2)
+  }
+
   sparkTest("filter RDD bound features by score") {
     val features = sc.loadFeatures(testFile("dvl1.200.bed"))
     assert(features.filterByScore(10.0d).rdd.count() === 23)
