-
Notifications
You must be signed in to change notification settings - Fork 304
/
GenotypePredicatesSuite.scala
109 lines (91 loc) · 3.64 KB
/
GenotypePredicatesSuite.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/*
* Copyright (c) 2014. Mount Sinai School of Medicine
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.predicates
import org.bdgenomics.adam.util.{ ParquetLogger, SparkFunSuite }
import java.util.logging.Level
import java.io.File
import org.bdgenomics.adam.avro.{
ADAMVariant,
ADAMGenotype,
ADAMContig,
VariantCallingAnnotations
}
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.ADAMContext._
import com.google.common.io.Files
import org.apache.commons.io.FileUtils
/**
 * Tests for genotype record predicates: verifies that the PASS predicate can
 * filter genotypes both when pushed down into the Parquet load and when
 * applied to an already-loaded RDD.
 */
class GenotypePredicatesSuite extends SparkFunSuite {

  // Note: test name typo fixed ("only only" -> "only").
  sparkTest("Load only PASSing records") {
    // Silence noisy Parquet/Hadoop logging for the duration of the test.
    ParquetLogger.hadoopLoggerLevel(Level.SEVERE)

    // A single variant site shared by both genotype records.
    val v0 = ADAMVariant.newBuilder
      .setContig(ADAMContig.newBuilder.setContigName("chr11").build)
      .setPosition(17409571)
      .setReferenceAllele("T")
      .setVariantAllele("C")
      .build

    val passFilterAnnotation =
      VariantCallingAnnotations.newBuilder().setVariantIsPassing(true).build()
    val failFilterAnnotation =
      VariantCallingAnnotations.newBuilder().setVariantIsPassing(false).build()

    // One passing and one failing genotype at the same site.
    val genotypes = sc.parallelize(List(
      ADAMGenotype.newBuilder()
        .setVariant(v0)
        .setVariantCallingAnnotations(passFilterAnnotation)
        .setSampleId("NA12878")
        .build(),
      ADAMGenotype.newBuilder()
        .setVariant(v0)
        .setVariantCallingAnnotations(failFilterAnnotation)
        .setSampleId("NA12878")
        .build()))

    val genotypesParquetFile = new File(Files.createTempDir(), "genotypes")
    try {
      genotypes.adamSave(genotypesParquetFile.getAbsolutePath)

      // Pushing the PASS predicate into the Parquet read should drop the
      // failing record before it ever reaches the RDD.
      val gts1: RDD[ADAMGenotype] = sc.adamLoad(
        genotypesParquetFile.getAbsolutePath,
        predicate = Some(classOf[GenotypeRecordPASSPredicate]))
      assert(gts1.count === 1)
    } finally {
      // Clean up the temp directory even if an assertion above fails.
      FileUtils.deleteDirectory(genotypesParquetFile.getParentFile)
    }
  }

  sparkTest("Load all records and filter to only PASSing records") {
    // Silence noisy Parquet/Hadoop logging for the duration of the test.
    ParquetLogger.hadoopLoggerLevel(Level.SEVERE)

    val v0 = ADAMVariant.newBuilder
      .setContig(ADAMContig.newBuilder.setContigName("11").build)
      .setPosition(17409571)
      .setReferenceAllele("T")
      .setVariantAllele("C")
      .build

    val passFilterAnnotation =
      VariantCallingAnnotations.newBuilder().setVariantIsPassing(true).build()
    val failFilterAnnotation =
      VariantCallingAnnotations.newBuilder().setVariantIsPassing(false).build()

    // One passing and one failing genotype at the same site.
    val genotypes = sc.parallelize(List(
      ADAMGenotype.newBuilder().setVariant(v0)
        .setSampleId("ignored")
        .setVariantCallingAnnotations(passFilterAnnotation).build(),
      ADAMGenotype.newBuilder()
        .setSampleId("ignored")
        .setVariant(v0)
        .setVariantCallingAnnotations(failFilterAnnotation).build()))

    val genotypesParquetFile = new File(Files.createTempDir(), "genotypes")
    try {
      genotypes.adamSave(genotypesParquetFile.getAbsolutePath)

      // Without a pushdown predicate, both records come back...
      val gts: RDD[ADAMGenotype] = sc.adamLoad(genotypesParquetFile.getAbsolutePath)
      assert(gts.count === 2)

      // ...and applying the predicate post-load keeps only the PASSing one.
      val predicate = new GenotypeRecordPASSPredicate
      val filtered = predicate(gts)
      assert(filtered.count === 1)
    } finally {
      // Clean up the temp directory even if an assertion above fails.
      FileUtils.deleteDirectory(genotypesParquetFile.getParentFile)
    }
  }
}