Skip to content

Commit

Permalink
Merge e8774e9 into d86f258
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Aug 25, 2016
2 parents d86f258 + e8774e9 commit e3194f8
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 51 deletions.
Expand Up @@ -49,8 +49,8 @@ class AlleleCountArgs extends Args4jBase with ParquetArgs {
object AlleleCountHelper extends Serializable {
def chooseAllele(x: (String, java.lang.Long, String, String, GenotypeAllele)) =
x match {
case (chr, position, refAllele, varAllele, GenotypeAllele.Ref) => Some(chr, position, refAllele)
case (chr, position, refAllele, varAllele, GenotypeAllele.Alt) => Some(chr, position, varAllele)
case (chr, position, refAllele, varAllele, GenotypeAllele.REF) => Some(chr, position, refAllele)
case (chr, position, refAllele, varAllele, GenotypeAllele.ALT) => Some(chr, position, varAllele)
case _ => None
}

Expand Down
Expand Up @@ -17,13 +17,15 @@
*/
package org.bdgenomics.adam.converters

import com.google.common.collect.ImmutableList
import htsjdk.variant.variantcontext.{
Allele,
GenotypesContext,
GenotypeLikelihoods,
VariantContext => HtsjdkVariantContext,
VariantContextBuilder
}
import htsjdk.variant.vcf.VCFConstants
import java.util.Collections
import org.bdgenomics.utils.misc.Logging
import org.bdgenomics.adam.models.{
Expand Down Expand Up @@ -72,10 +74,10 @@ private[adam] object VariantContextConverter {
* @return The Avro representation for this allele.
*/
private def convertAllele(vc: HtsjdkVariantContext, allele: Allele): GenotypeAllele = {
if (allele.isNoCall) GenotypeAllele.NoCall
else if (allele.isReference) GenotypeAllele.Ref
else if (allele == NON_REF_ALLELE || !vc.hasAlternateAllele(allele)) GenotypeAllele.OtherAlt
else GenotypeAllele.Alt
if (allele.isNoCall) GenotypeAllele.NO_CALL
else if (allele.isReference) GenotypeAllele.REF
else if (allele == NON_REF_ALLELE || !vc.hasAlternateAllele(allele)) GenotypeAllele.OTHER_ALT
else GenotypeAllele.ALT
}

/**
Expand Down Expand Up @@ -123,9 +125,9 @@ private[adam] object VariantContextConverter {
var alleles = g.getAlleles
if (alleles == null) return Collections.emptyList[Allele]
else g.getAlleles.map {
case GenotypeAllele.NoCall => Allele.NO_CALL
case GenotypeAllele.Ref | GenotypeAllele.OtherAlt => Allele.create(g.getVariant.getReferenceAllele, true)
case GenotypeAllele.Alt => Allele.create(g.getVariant.getAlternateAllele)
case GenotypeAllele.NO_CALL => Allele.NO_CALL
case GenotypeAllele.REF | GenotypeAllele.OTHER_ALT => Allele.create(g.getVariant.getReferenceAllele, true)
case GenotypeAllele.ALT => Allele.create(g.getVariant.getAlternateAllele)
}
}
}
Expand Down Expand Up @@ -312,6 +314,36 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N
contigToRefSeq.getOrElse(vc.getChr, vc.getChr)
}

/**
* Split the htsjdk variant context ID field into an array of names.
*
* @param vc htsjdk variant context
* @return Returns an Option wrapping an array of names split from the htsjdk
* variant context ID field
*/
private def splitIds(vc: HtsjdkVariantContext): Option[java.util.List[String]] = {
if (vc.hasID()) {
Some(ImmutableList.copyOf(vc.getID().split(VCFConstants.ID_FIELD_SEPARATOR)))
} else {
None
}
}

/**
* Join the array of variant names into a string for the htsjdk variant context ID field.
*
* @param variant variant
* @return Returns an Option wrapping a string for the htsjdk variant context ID field joined
* from the array of variant names
*/
private def joinNames(variant: Variant): Option[String] = {
if (variant.getNames != null && variant.getNames.length > 0) {
Some(variant.getNames.mkString(VCFConstants.ID_FIELD_SEPARATOR))
} else {
None
}
}

/**
* Builds an avro Variant for a site with a defined alt allele.
*
Expand All @@ -321,16 +353,14 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N
* @return Returns an Avro description of the genotyped site.
*/
private def createADAMVariant(vc: HtsjdkVariantContext, alt: Option[String]): Variant = {
// VCF CHROM, POS, REF and ALT
// VCF CHROM, POS, ID, REF and ALT
val builder = Variant.newBuilder
.setContigName(createContig(vc))
.setStart(vc.getStart - 1 /* ADAM is 0-indexed */ )
.setEnd(vc.getEnd /* ADAM is 0-indexed, so the 1-indexed inclusive end becomes exclusive */ )
.setReferenceAllele(vc.getReference.getBaseString)
if (vc.hasLog10PError) {
builder.setVariantErrorProbability(vc.getPhredScaledQual.intValue())
}
alt.foreach(builder.setAlternateAllele(_))
splitIds(vc).foreach(builder.setNames(_))
builder.build
}

Expand Down Expand Up @@ -392,7 +422,7 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N
.setVariantCallingAnnotations(annotations)
.setSampleId(g.getSampleName)
.setAlleles(g.getAlleles.map(VariantContextConverter.convertAllele(vc, _)))
.setIsPhased(g.isPhased)
.setPhased(g.isPhased)

if (g.hasGQ) genotype.setGenotypeQuality(g.getGQ)
if (g.hasDP) genotype.setReadDepth(g.getDP)
Expand Down Expand Up @@ -538,7 +568,10 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N
.stop(variant.getStart + variant.getReferenceAllele.length)
.alleles(VariantContextConverter.convertAlleles(variant))

vc.databases.flatMap(d => Option(d.getDbSnpId)).foreach(d => vcb.id("rs" + d))
joinNames(variant) match {
case None => vcb.noID()
case Some(s) => vcb.id(s)
}

// TODO: Extract provenance INFO fields
try {
Expand All @@ -547,7 +580,7 @@ private[adam] class VariantContextConverter(dict: Option[SequenceDictionary] = N
g.getSampleId, VariantContextConverter.convertAlleles(g)
)

Option(g.getIsPhased).foreach(gb.phased(_))
Option(g.getPhased).foreach(gb.phased(_))
Option(g.getGenotypeQuality).foreach(gb.GQ(_))
Option(g.getReadDepth).foreach(gb.DP(_))

Expand Down
Expand Up @@ -24,5 +24,5 @@ import org.bdgenomics.formats.avro.Feature
*/
object FeatureField extends FieldEnumeration(Feature.SCHEMA$) {

val featureId, name, source, featureType, contigName, start, end, strand, phase, frame, score, geneId, transcriptId, exonId, aliases, parentIds, target, gap, derivesFrom, notes, dbxrefs, ontologyTerms, isCircular, attributes = SchemaValue
val featureId, name, source, featureType, contigName, start, end, strand, phase, frame, score, geneId, transcriptId, exonId, aliases, parentIds, target, gap, derivesFrom, notes, dbxrefs, ontologyTerms, circular, attributes = SchemaValue
}
Expand Up @@ -24,5 +24,5 @@ import org.bdgenomics.formats.avro.Genotype
*/
object GenotypeField extends FieldEnumeration(Genotype.SCHEMA$) {

val variant, contigName, start, end, variantCallingAnnotations, sampleId, sampleDescription, processingDescription, alleles, referenceReadDepth, alternateReadDepth, readDepth, genotypeQuality, genotypeLikelihoods, splitFromMultiAllelic, isPhased, phaseSetId, phaseQuality = SchemaValue
val variant, contigName, start, end, variantCallingAnnotations, sampleId, sampleDescription, processingDescription, alleles, expectedAlleleDosage, referenceReadDepth, alternateReadDepth, readDepth, minReadDepth, genotypeQuality, genotypeLikelihoods, nonReferenceLikelihoods, strandBiasComponents, splitFromMultiAllelic, phased, phaseSetId, phaseQuality = SchemaValue
}
Expand Up @@ -24,5 +24,5 @@ import org.bdgenomics.formats.avro.Variant
*/
object VariantField extends FieldEnumeration(Variant.SCHEMA$) {

val contig, start, end, referenceAllele, variantAllele = SchemaValue
val contigName, start, end, names, referenceAllele, alternateAllele, somatic = SchemaValue
}
Expand Up @@ -138,7 +138,7 @@ private[features] object Features {
case "Target" => f.setTarget(entry._2)
case "Gap" => f.setGap(entry._2)
case "Derives_from" => f.setDerivesFrom(entry._2)
case "Is_circular" => f.setIsCircular(entry._2.toBoolean)
case "Is_circular" => f.setCircular(entry._2.toBoolean)
case "Alias" => aliases += entry._2
case "Note" => notes += entry._2
case "Parent" => parentIds += entry._2
Expand Down Expand Up @@ -185,7 +185,7 @@ private[features] object Features {
Option(feature.getTarget).foreach(attrs += Tuple2("Target", _))
Option(feature.getGap).foreach(attrs += Tuple2("Gap", _))
Option(feature.getDerivesFrom).foreach(attrs += Tuple2("Derives_from", _))
Option(feature.getIsCircular).foreach(addBooleanTuple)
Option(feature.getCircular).foreach(addBooleanTuple)
Option(feature.getGeneId).foreach(attrs += Tuple2("gene_id", _))
Option(feature.getTranscriptId).foreach(attrs += Tuple2("transcript_id", _))
Option(feature.getExonId).foreach(attrs += Tuple2("exon_id", _))
Expand Down
Expand Up @@ -40,26 +40,26 @@ class RichGenotype(val genotype: Genotype) {
distinctListOfAlleles match {

// If all alleles are the reference allele, the genotype is Homozygous Reference (HOM_REF)
case List(GenotypeAllele.Ref) => GenotypeType.HOM_REF
case List(GenotypeAllele.REF) => GenotypeType.HOM_REF

// If all alleles are the primary alternative allele, the genotype is Homozygous Alternative (HOM_ALT)
case List(GenotypeAllele.Alt) => GenotypeType.HOM_ALT
case List(GenotypeAllele.ALT) => GenotypeType.HOM_ALT

// If all alleles are not called, the genotype is not called (NO_CALL)
case List(GenotypeAllele.NoCall) => GenotypeType.NO_CALL
case List(GenotypeAllele.NO_CALL) => GenotypeType.NO_CALL

// If all alleles are OtherAlt.
// If genotype.getAlleles returns a single OtherAlt, the genotype is Homozygous Alternative (HOM_ALT)
// If genotype.getAlleles returns a multiple OtherAlt, the genotype is
// A) The OtherAlt alleles are the same OtherAlt alleles: Homozygous Alternative (HOM_ALT)
// B) The OtherAlt allales are different OtherAlt alleles: Heterozygous
// If all alleles are OTHER_ALT.
// If genotype.getAlleles returns a single OTHER_ALT, the genotype is Homozygous Alternative (HOM_ALT)
// If genotype.getAlleles returns a multiple OTHER_ALT, the genotype is
// A) The OTHER_ALT alleles are the same OTHER_ALT alleles: Homozygous Alternative (HOM_ALT)
// B) The OTHER_ALT alleles are different OTHER_ALT alleles: Heterozygous
// For now return NO_CALL as the genotypes, as was done in the previous getType function
// See also issue https://github.com/bigdatagenomics/adam/issues/897
case List(GenotypeAllele.OtherAlt) => GenotypeType.NO_CALL
case List(GenotypeAllele.OTHER_ALT) => GenotypeType.NO_CALL

// only the four above alleles are possible
// https://github.com/bigdatagenomics/bdg-formats/blob/master/src/main/resources/avro/bdg.avdl#L464
case _ => throw new IllegalStateException("Found single distinct allele other than the four possible alleles: Ref, Alt, NoCall and OtherAlt")
case _ => throw new IllegalStateException("Found single distinct allele other than the four possible alleles: REF, ALT, NO_CALL and OTHER_ALT")
}
} // In the case that there are multiple distinct alleles
// This should be applicable to any genome ploidy.
Expand All @@ -69,9 +69,9 @@ class RichGenotype(val genotype: Genotype) {
// IN HTS-JDK this would be GenotypeType.MIXED , this type is not available in BDG / ADAM
// https://github.com/bigdatagenomics/bdg-formats/blob/master/src/main/resources/avro/bdg.avdl#L483
// https://github.com/samtools/htsjdk/blob/master/src/java/htsjdk/variant/variantcontext/Genotype.java#L218
if (distinctListOfAlleles contains GenotypeAllele.NoCall) {
if (distinctListOfAlleles contains GenotypeAllele.NO_CALL) {
GenotypeType.NO_CALL
} // Otherwise the distinct alleles are a combination of 2 or 3 alleles from the list (GenotypeAllele.Ref, GenotypeAllele.Alt, GenotypeAllele.OtherAlt)
} // Otherwise the distinct alleles are a combination of 2 or 3 alleles from the list (GenotypeAllele.REF, GenotypeAllele.ALT, GenotypeAllele.OTHER_ALT)
// Therefore the genotype is Heterozygous HET
else {
GenotypeType.HET
Expand Down
Expand Up @@ -116,7 +116,6 @@ class ADAMKryoRegistrator extends KryoRegistrator {

kryo.register(classOf[Contig], new AvroSerializer[Contig])
kryo.register(classOf[RecordGroupMetadata], new AvroSerializer[RecordGroupMetadata])
kryo.register(classOf[StructuralVariant], new AvroSerializer[StructuralVariant])
kryo.register(classOf[VariantCallingAnnotations], new AvroSerializer[VariantCallingAnnotations])
kryo.register(classOf[TranscriptEffect], new AvroSerializer[TranscriptEffect])
kryo.register(classOf[VariantAnnotation], new AvroSerializer[VariantAnnotation])
Expand Down
Expand Up @@ -121,7 +121,7 @@ class VariantContextConverterSuite extends ADAMFunSuite {
val adamGTs = adamVCs.flatMap(_.genotypes)
assert(adamGTs.length === 1)
val adamGT = adamGTs.head
assert(adamGT.getAlleles.sameElements(List(GenotypeAllele.Ref, GenotypeAllele.Alt)))
assert(adamGT.getAlleles.sameElements(List(GenotypeAllele.REF, GenotypeAllele.ALT)))
assert(adamGT.getPhaseSetId === 1)
assert(adamGT.getPhaseQuality === 50)
}
Expand Down Expand Up @@ -182,7 +182,7 @@ class VariantContextConverterSuite extends ADAMFunSuite {
val genotype = Genotype.newBuilder
.setVariant(variant)
.setSampleId("NA12878")
.setAlleles(List(GenotypeAllele.Ref, GenotypeAllele.Alt))
.setAlleles(List(GenotypeAllele.REF, GenotypeAllele.ALT))
.setVariantCallingAnnotations(VariantCallingAnnotations.newBuilder()
.setFisherStrandBiasPValue(3.0f)
.setRmsMapQ(0.0f)
Expand Down Expand Up @@ -233,19 +233,19 @@ class VariantContextConverterSuite extends ADAMFunSuite {
val adamGT = adamVC.genotypes.head
assert(adamGT.getSplitFromMultiAllelic)
assert(adamGT.getReferenceReadDepth === 4)
assert(adamGT.getIsPhased)
assert(adamGT.getPhased)
}

val adamGT1 = adamVCs(0).genotypes.head
val adamGT2 = adamVCs(1).genotypes.head
assert(adamGT1.getAlleles.sameElements(List(GenotypeAllele.Alt, GenotypeAllele.OtherAlt)))
assert(adamGT1.getAlleles.sameElements(List(GenotypeAllele.ALT, GenotypeAllele.OTHER_ALT)))
assert(adamGT1.getAlternateReadDepth === 2)
assert(adamGT1.getGenotypeLikelihoods
.map(f => f: scala.Float)
.map(PhredUtils.logProbabilityToPhred)
.sameElements(List(59, 0, 256)))

assert(adamGT2.getAlleles.sameElements(List(GenotypeAllele.OtherAlt, GenotypeAllele.Alt)))
assert(adamGT2.getAlleles.sameElements(List(GenotypeAllele.OTHER_ALT, GenotypeAllele.ALT)))
assert(adamGT2.getAlternateReadDepth === 3)
assert(adamGT2.getGenotypeLikelihoods
.map(f => f: scala.Float)
Expand All @@ -269,7 +269,7 @@ class VariantContextConverterSuite extends ADAMFunSuite {
assert(adamGTs.length === 1)
val adamGT = adamGTs.head
assert(adamGT.getVariant.getAlternateAllele === null)
assert(adamGT.getAlleles.sameElements(List(GenotypeAllele.Ref, GenotypeAllele.Ref)))
assert(adamGT.getAlleles.sameElements(List(GenotypeAllele.REF, GenotypeAllele.REF)))
assert(adamGT.getMinReadDepth === 38)
assert(adamGT.getGenotypeLikelihoods.isEmpty)
assert(adamGT.getNonReferenceLikelihoods
Expand Down
Expand Up @@ -44,7 +44,7 @@ class FeatureRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {
a.getTarget === b.getTarget &&
a.getGap === b.getGap &&
a.getDerivesFrom === b.getDerivesFrom &&
a.getIsCircular === b.getIsCircular &&
a.getCircular === b.getCircular &&
a.getAliases === b.getAliases &&
a.getNotes === b.getNotes &&
a.getParentIds === b.getParentIds &&
Expand Down
Expand Up @@ -46,7 +46,7 @@ class VariantContextRDDSuite extends ADAMFunSuite {

val g0 = Genotype.newBuilder().setVariant(v0)
.setSampleId("NA12878")
.setAlleles(List(GenotypeAllele.Ref, GenotypeAllele.Alt))
.setAlleles(List(GenotypeAllele.REF, GenotypeAllele.ALT))
.build

VariantContextRDD(sc.parallelize(List(
Expand Down
Expand Up @@ -32,31 +32,30 @@ class RichGenotypeSuite extends FunSuite {
test("different ploidy") {
val gb = Genotype.newBuilder.setVariant(v0).setSampleId("NA12878")
for (ploidy <- 0 until 3) {
val g = gb.setAlleles(List.fill(ploidy)(GenotypeAllele.Ref)).build
val g = gb.setAlleles(List.fill(ploidy)(GenotypeAllele.REF)).build
assert(g.ploidy === ploidy)
}
}

test("all types for diploid genotype") {
val gb = Genotype.newBuilder.setVariant(v0).setSampleId("NA12878")

val hom_ref = gb.setAlleles(List(GenotypeAllele.Ref, GenotypeAllele.Ref)).build
val hom_ref = gb.setAlleles(List(GenotypeAllele.REF, GenotypeAllele.REF)).build
assert(hom_ref.getType === GenotypeType.HOM_REF)

val het1 = gb.setAlleles(List(GenotypeAllele.Ref, GenotypeAllele.Alt)).build
val het1 = gb.setAlleles(List(GenotypeAllele.REF, GenotypeAllele.ALT)).build
assert(het1.getType === GenotypeType.HET)
val het2 = gb.setAlleles(List(GenotypeAllele.Alt, GenotypeAllele.Ref)).build
val het2 = gb.setAlleles(List(GenotypeAllele.ALT, GenotypeAllele.REF)).build
assert(het2.getType === GenotypeType.HET)

val hom_alt = gb.setAlleles(List(GenotypeAllele.Alt, GenotypeAllele.Alt)).build
val hom_alt = gb.setAlleles(List(GenotypeAllele.ALT, GenotypeAllele.ALT)).build
assert(hom_alt.getType === GenotypeType.HOM_ALT)

for (a <- GenotypeAllele.values) {
val no_call1 = gb.setAlleles(List(GenotypeAllele.NoCall, a)).build
val no_call1 = gb.setAlleles(List(GenotypeAllele.NO_CALL, a)).build
assert(no_call1.getType === GenotypeType.NO_CALL)
val no_call2 = gb.setAlleles(List(a, GenotypeAllele.NoCall)).build
val no_call2 = gb.setAlleles(List(a, GenotypeAllele.NO_CALL)).build
assert(no_call2.getType === GenotypeType.NO_CALL)
}
}

}
2 changes: 1 addition & 1 deletion pom.xml
Expand Up @@ -27,7 +27,7 @@
<hadoop.version>2.6.0</hadoop.version>
<hadoop-bam.version>7.6.0</hadoop-bam.version>
<slf4j.version>1.7.21</slf4j.version>
<bdg-formats.version>0.10.0-SNAPSHOT</bdg-formats.version>
<bdg-formats.version>0.9.1-SNAPSHOT</bdg-formats.version>
<bdg-utils.version>0.2.7</bdg-utils.version>
<htsjdk.version>2.5.0</htsjdk.version>
</properties>
Expand Down

0 comments on commit e3194f8

Please sign in to comment.