Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding printAttribute methods for alignment records, features, and samples. #1982

Merged
merged 1 commit into from
Jul 4, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 182 additions & 60 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/ADAMShell.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@ import htsjdk.variant.vcf.{
}
import org.apache.spark.SparkContext
import org.bdgenomics.adam.models.VariantContext
import org.bdgenomics.adam.rdd.feature.FeatureRDD
import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
import org.bdgenomics.adam.rdd.variant.{
GenotypeRDD,
VariantRDD,
VariantContextRDD
}
import org.bdgenomics.formats.avro.{
AlignmentRecord,
Feature,
Genotype,
Sample,
Variant
}
import org.bdgenomics.utils.instrumentation.MetricsListener
Expand All @@ -43,88 +48,96 @@ import org.bdgenomics.utils.instrumentation._
*/
object ADAMShell {

/**
* Print filter values for variants in the specified rdd up to the limit.
*
* @param rdd VariantRDD.
* @param limit Number of variants to print filter values for. Defaults to 10.
*/
def printVariantFilters(rdd: VariantRDD, limit: Int = 10): Unit = {
  // Take at most `limit` variants, then hand them to the Seq-based overload
  // along with the header lines carried by the rdd.
  val firstVariants = rdd.rdd.take(limit)
  printVariantFilters(firstVariants, rdd.headerLines)
}

/** Variant headers. */
val variantHeaders = Array(
/** Alignment record headers. */
val alignmentHeaders = Array(
new ASCIITableHeader("Contig Name"),
new ASCIITableHeader("Start"),
new ASCIITableHeader("End"),
new ASCIITableHeader("Ref", Alignment.Left),
new ASCIITableHeader("Alt", Alignment.Left)
new ASCIITableHeader("Read Name"),
new ASCIITableHeader("Sample"),
new ASCIITableHeader("Read Group")
)

/**
* Print filter values for the specified variants.
* Print attribute values for alignment records in the specified rdd up to the limit.
*
* @param variants Sequence of variants to print filter values for.
* @param headerLines Sequence of VCF header lines.
* @param rdd AlignmentRecordRDD.
* @param keys Sequence of attribute keys.
* @param limit Number of alignment records to print attribute values for. Defaults to 10.
*/
def printVariantFilters(variants: Seq[Variant], headerLines: Seq[VCFHeaderLine]): Unit = {
println("Filter Header Lines")
headerLines.filter(line => line.isInstanceOf[VCFFilterHeaderLine]).foreach(println)
def printAlignmentAttributes(rdd: AlignmentRecordRDD, keys: Seq[String], limit: Int = 10): Unit = {
  // Take at most `limit` alignment records, then delegate to the Seq-based overload.
  val firstAlignments = rdd.rdd.take(limit)
  printAlignmentAttributes(firstAlignments, keys)
}

val header = variantHeaders ++ Array(
new ASCIITableHeader("Filters Applied"),
new ASCIITableHeader("Filters Passed"),
new ASCIITableHeader("Filters Failed")
)
/**
 * Find the value of the attribute tagged with the specified key.
 *
 * @param key Attribute tag to look for.
 * @param attributes Attributes string to hand to AttributeUtils.parseAttributes; may be null.
 * @return The value of the first parsed attribute whose tag equals the key, rendered
 *   as a string, or the empty string if attributes is null or no attribute matches.
 */
private def findMatchingAttribute(key: String, attributes: String): String = {
  // Guard the null case up front: callers pass the raw getter result, and a record
  // without attributes should yield an empty cell rather than a NullPointerException.
  Option(attributes)
    .flatMap(attrs => AttributeUtils.parseAttributes(attrs).find(_.tag == key))
    .fold("")(_.value.toString)
}

val rows: Array[Array[String]] = variants.map(v => Array[String](
v.getContigName(),
v.getStart().toString,
v.getEnd().toString,
v.getReferenceAllele(),
v.getAlternateAllele(),
v.getFiltersApplied().toString,
v.getFiltersPassed().toString,
v.getFiltersFailed().toString
)).toArray
/**
* Print attribute values for the specified alignment records.
*
* @param alignments Sequence of alignments.
* @param keys Sequence of attribute keys.
*/
def printAlignmentAttributes(alignments: Seq[AlignmentRecord], keys: Seq[String]): Unit = {
val header = alignmentHeaders ++ keys.map(key => new ASCIITableHeader(key))

println("\nVariant Filters\n" + new ASCIITable(header, rows).toString)
val rows: Array[Array[String]] = alignments.map(a => Array[String](
a.getContigName(),
a.getStart().toString,
a.getEnd().toString,
Option(a.getReadName()).getOrElse(""),
Option(a.getRecordGroupSample()).getOrElse(""),
Option(a.getRecordGroupName()).getOrElse("")
) ++ keys.map(key => findMatchingAttribute(key, a.getAttributes()))).toArray

println("\nAlignment Attributes\n" + new ASCIITable(header, rows).toString)
}

/** Column headers for the fixed (non-attribute) columns of feature tables. */
val featureHeaders = Array(
  "Contig Name",
  "Start",
  "End",
  "Strand",
  "Name",
  "Identifier",
  "Type",
  "Score"
).map(label => new ASCIITableHeader(label))

/**
* Print VCF INFO field attributes for variants in the specified rdd up to the limit.
* Print attribute values for features in the specified rdd up to the limit.
*
* @param rdd VariantRDD.
* @param keys Sequence of VCF INFO field attribute keys.
* @param limit Number of variants to print VCF INFO field attribute values for. Defaults to 10.
* @param rdd FeatureRDD.
* @param keys Sequence of attribute keys.
* @param limit Number of features to print attribute values for. Defaults to 10.
*/
def printInfoFields(rdd: VariantRDD, keys: Seq[String], limit: Int = 10): Unit = {
printInfoFields(rdd.rdd.take(limit), keys, rdd.headerLines)
def printFeatureAttributes(rdd: FeatureRDD, keys: Seq[String], limit: Int = 10): Unit = {
  // Take at most `limit` features, then delegate to the Seq-based overload.
  val firstFeatures = rdd.rdd.take(limit)
  printFeatureAttributes(firstFeatures, keys)
}

/**
* Print VCF INFO field attributes for the specified variants.
* Print attribute values for the specified features.
*
* @param variants Sequence of variants.
* @param keys Sequence of VCF INFO field attribute keys.
* @param headerLines Sequence of VCF header lines.
* @param features Sequence of features.
* @param keys Sequence of attribute keys.
*/
def printInfoFields(variants: Seq[Variant], keys: Seq[String], headerLines: Seq[VCFHeaderLine]): Unit = {
println("Info Header Lines")
headerLines.filter(line => (line.isInstanceOf[VCFInfoHeaderLine] && keys.contains(line.asInstanceOf[VCFInfoHeaderLine].getID()))).foreach(println)
def printFeatureAttributes(features: Seq[Feature], keys: Seq[String]): Unit = {
val header = featureHeaders ++ keys.map(key => new ASCIITableHeader(key))

val header = variantHeaders ++ keys.map(key => new ASCIITableHeader(key))
val rows: Array[Array[String]] = features.map(f => Array[String](
f.getContigName(),
f.getStart().toString,
f.getEnd().toString,
f.getStrand().toString,
Option(f.getName()).getOrElse(""),
Option(f.getFeatureId()).getOrElse(""),
Option(f.getFeatureType()).getOrElse(""),
Option(f.getScore()).fold("")(_.toString)
) ++ keys.map(key => Option(f.getAttributes().get(key)).getOrElse(""))).toArray

val rows: Array[Array[String]] = variants.map(v => Array[String](
v.getContigName(),
v.getStart().toString,
v.getEnd().toString,
v.getReferenceAllele(),
v.getAlternateAllele()
) ++ keys.map(key => Option(v.getAnnotation().getAttributes().get(key)).getOrElse(""))).toArray

println("\nVariant Info Fields\n" + new ASCIITable(header, rows).toString)
println("\nFeature Attributes\n" + new ASCIITable(header, rows).toString)
}

/**
Expand All @@ -139,7 +152,12 @@ object ADAMShell {
}

/** Genotype headers. */
val genotypeHeaders = variantHeaders ++ Array(
val genotypeHeaders = Array(
new ASCIITableHeader("Contig Name"),
new ASCIITableHeader("Start"),
new ASCIITableHeader("End"),
new ASCIITableHeader("Ref", Alignment.Left),
new ASCIITableHeader("Alt", Alignment.Left),
new ASCIITableHeader("Alleles", Alignment.Center),
new ASCIITableHeader("Sample")
)
Expand Down Expand Up @@ -212,6 +230,110 @@ object ADAMShell {
println("\nGenotype Filters\n" + new ASCIITable(header, rows).toString)
}

/**
 * Print attribute values for the specified samples.
 *
 * Each row shows the sample identifier and name, one column per requested
 * attribute key, and finally the sample's processing steps. Missing (null)
 * values are rendered as empty cells.
 *
 * @param samples Sequence of samples.
 * @param keys Sequence of attribute keys.
 */
def printSampleAttributes(samples: Seq[Sample], keys: Seq[String]): Unit = {
  val header = Array(
    new ASCIITableHeader("Identifier"),
    new ASCIITableHeader("Name")
  ) ++ keys.map(key => new ASCIITableHeader(key)) ++ Array(new ASCIITableHeader("Processing Steps"))

  val rows: Array[Array[String]] = samples.map(s => Array[String](
    // Null-guard identifier and name the same way the alignment and feature printers do.
    Option(s.getSampleId()).getOrElse(""),
    Option(s.getName()).getOrElse("")
  ) ++ keys.map(key => Option(s.getAttributes().get(key)).getOrElse("")) ++ Array(
    // Wrap the getter result itself: calling toString before Option(...) would
    // throw on a null value, so the guard would never take effect.
    Option(s.getProcessingSteps()).fold("")(_.toString)
  )).toArray

  println("\nSample Attributes\n" + new ASCIITable(header, rows).toString)
}

/**
 * Print filter values for at most the first `limit` variants of the specified rdd.
 *
 * @param rdd VariantRDD to take variants and header lines from.
 * @param limit Maximum number of variants to print filter values for. Defaults to 10.
 */
def printVariantFilters(rdd: VariantRDD, limit: Int = 10): Unit = {
  val firstVariants = rdd.rdd.take(limit)
  printVariantFilters(firstVariants, rdd.headerLines)
}

/** Column headers for the fixed columns of variant tables. */
val variantHeaders = {
  // Position columns use the default alignment; allele columns are left-aligned.
  val positionColumns = Array("Contig Name", "Start", "End")
    .map(label => new ASCIITableHeader(label))
  val alleleColumns = Array("Ref", "Alt")
    .map(label => new ASCIITableHeader(label, Alignment.Left))
  positionColumns ++ alleleColumns
}

/**
 * Print filter values for the specified variants.
 *
 * First prints the VCF FILTER header lines found in headerLines, then a table
 * with one row per variant showing its position, alleles, and filter status.
 * Unset (null) fields are rendered as empty cells.
 *
 * @param variants Sequence of variants to print filter values for.
 * @param headerLines Sequence of VCF header lines.
 */
def printVariantFilters(variants: Seq[Variant], headerLines: Seq[VCFHeaderLine]): Unit = {
  println("Filter Header Lines")
  headerLines.collect { case filterLine: VCFFilterHeaderLine => filterLine }.foreach(println)

  val header = variantHeaders ++ Array(
    new ASCIITableHeader("Filters Applied"),
    new ASCIITableHeader("Filters Passed"),
    new ASCIITableHeader("Filters Failed")
  )

  // NOTE(review): the record getters appear to return boxed/nullable values (e.g. a
  // variant whose filters were never evaluated); render null as an empty cell
  // instead of failing in toString. Confirm nullability against the record schema.
  def show(value: Any): String = Option(value).fold("")(_.toString)

  val rows: Array[Array[String]] = variants.map(v => Array[String](
    show(v.getContigName()),
    show(v.getStart()),
    show(v.getEnd()),
    show(v.getReferenceAllele()),
    show(v.getAlternateAllele()),
    show(v.getFiltersApplied()),
    show(v.getFiltersPassed()),
    show(v.getFiltersFailed())
  )).toArray

  println("\nVariant Filters\n" + new ASCIITable(header, rows).toString)
}

/**
 * Print VCF INFO field attributes for at most the first `limit` variants of the specified rdd.
 *
 * @param rdd VariantRDD to take variants and header lines from.
 * @param keys Sequence of VCF INFO field attribute keys.
 * @param limit Maximum number of variants to print VCF INFO field attribute values for. Defaults to 10.
 */
def printInfoFields(rdd: VariantRDD, keys: Seq[String], limit: Int = 10): Unit = {
  val firstVariants = rdd.rdd.take(limit)
  printInfoFields(firstVariants, keys, rdd.headerLines)
}

/**
 * Print VCF INFO field attributes for the specified variants.
 *
 * First prints the VCF INFO header lines whose ID is one of the requested keys,
 * then a table with one row per variant and one column per key. A key that is
 * absent, or a variant without an annotation, yields an empty cell.
 *
 * @param variants Sequence of variants.
 * @param keys Sequence of VCF INFO field attribute keys.
 * @param headerLines Sequence of VCF header lines.
 */
def printInfoFields(variants: Seq[Variant], keys: Seq[String], headerLines: Seq[VCFHeaderLine]): Unit = {
  println("Info Header Lines")
  headerLines.collect {
    case infoLine: VCFInfoHeaderLine if keys.contains(infoLine.getID()) => infoLine
  }.foreach(println)

  val header = variantHeaders ++ keys.map(key => new ASCIITableHeader(key))

  // Look up one INFO attribute, treating a missing annotation, a missing
  // attribute map, or a missing key as "no value" rather than dereferencing null.
  def infoValue(v: Variant, key: String): String = {
    val found = for {
      annotation <- Option(v.getAnnotation())
      attributes <- Option(annotation.getAttributes())
      value <- Option(attributes.get(key))
    } yield value
    found.getOrElse("")
  }

  val rows: Array[Array[String]] = variants.map(v => Array[String](
    v.getContigName(),
    v.getStart().toString,
    v.getEnd().toString,
    v.getReferenceAllele(),
    v.getAlternateAllele()
  ) ++ keys.map(key => infoValue(v, key))).toArray

  println("\nVariant Info Fields\n" + new ASCIITable(header, rows).toString)
}

/**
* Create and return a new metrics listener for the specified Spark context.
*
Expand Down