Skip to content

Commit

Permalink
Add Avro-friendly ctrs to rdd.variant package.
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Sep 27, 2019
1 parent 8a16401 commit cc5b30c
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import org.bdgenomics.utils.interval.array.{ IntervalArray, IntervalArraySeriali
import org.bdgenomics.formats.avro.{
Genotype,
GenotypeAllele,
Reference,
Sample,
Variant,
VariantAnnotation
Expand Down Expand Up @@ -86,50 +87,82 @@ private[adam] class GenotypeArraySerializer extends IntervalArraySerializer[Refe
object GenotypeDataset extends Serializable {

/**
* A genomic dataset containing genotypes called in a set of samples against a given
* reference genome.
* Builds a GenotypeDataset from an RDD.
*
* @param rdd Called genotypes.
* @param sequences A dictionary describing the reference genome.
* @param samples The samples called.
* @param headerLines The VCF header lines that cover all INFO/FORMAT fields
* needed to represent this genomic dataset of Genotypes.
* @param rdd The underlying Genotype RDD.
* @param references The references for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new GenotypeDataset.
*/
def apply(rdd: RDD[Genotype],
          references: Iterable[Reference],
          samples: Iterable[Sample],
          headerLines: Seq[VCFHeaderLine]): GenotypeDataset = {
  // Convert the Avro-friendly inputs: the references become a SequenceDictionary
  // and the samples are materialized as a Seq.
  val sequences = SequenceDictionary.fromAvro(references.toSeq)
  val sampleSeq = samples.toSeq

  // The trailing None is presumably the optional partition map -- TODO confirm
  // against RDDBoundGenotypeDataset.
  RDDBoundGenotypeDataset(rdd, sequences, sampleSeq, headerLines, None)
}

/**
* Builds a GenotypeDataset from an RDD.
*
* @param rdd The underlying Genotype RDD.
* @param sequences The sequence dictionary for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new GenotypeDataset.
*/
def apply(rdd: RDD[Genotype],
          sequences: SequenceDictionary,
          samples: Iterable[Sample],
          headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): GenotypeDataset = {
  // Materialize the samples as a Seq before building the RDD-bound dataset.
  val sampleSeq = samples.toSeq

  // The trailing None is presumably the optional partition map -- TODO confirm
  // against RDDBoundGenotypeDataset.
  RDDBoundGenotypeDataset(rdd, sequences, sampleSeq, headerLines, None)
}

/**
* A genomic dataset containing genotypes called in a set of samples against a given
* reference genome, populated from a SQL Dataset.
* Builds a GenotypeDataset from a Dataset.
*
* @param ds Called genotypes.
* @param sequences A dictionary describing the reference genome.
* @param samples The samples called.
* @param headerLines The VCF header lines that cover all INFO/FORMAT fields
* needed to represent this genomic dataset of Genotypes.
* @param ds The underlying Genotype Dataset.
* @return A new GenotypeDataset.
*/
def apply(ds: Dataset[GenotypeProduct]): GenotypeDataset = {
// NOTE(review): this text comes from a rendered commit diff without +/- markers; the two
// statements below look like the before and after forms of a single line -- confirm against
// the repository before treating both as live code.
// Form 1: goes through the companion's four-argument apply with an empty sequence
// dictionary, no samples, and the default header lines.
GenotypeDataset(ds, SequenceDictionary.empty, Seq.empty, DefaultHeaderLines.allHeaderLines)
// Form 2: builds the Dataset-bound implementation directly with the same arguments.
DatasetBoundGenotypeDataset(ds, SequenceDictionary.empty, Seq.empty, DefaultHeaderLines.allHeaderLines)
}

/**
* Builds a GenotypeDataset from a Dataset.
*
* @param ds The underlying Genotype Dataset.
* @param references The references for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new GenotypeDataset.
*/

def apply(ds: Dataset[GenotypeProduct],
          references: Iterable[Reference],
          samples: Iterable[Sample],
          headerLines: Seq[VCFHeaderLine]): GenotypeDataset = {
  // Convert the Avro-friendly inputs: the references become a SequenceDictionary
  // and the samples are materialized as a Seq.
  val sequences = SequenceDictionary.fromAvro(references.toSeq)
  val sampleSeq = samples.toSeq

  DatasetBoundGenotypeDataset(ds, sequences, sampleSeq, headerLines)
}

/**
* A genomic dataset containing genotypes called in a set of samples against a given
* reference genome, populated from a SQL Dataset.
* Builds a GenotypeDataset from a Dataset.
*
* @param ds Called genotypes.
* @param sequences A dictionary describing the reference genome.
* @param samples The samples called.
* @param headerLines The VCF header lines that cover all INFO/FORMAT fields
* needed to represent this genomic dataset of Genotypes.
* @param ds The underlying Genotype Dataset.
* @param sequences The sequence dictionary for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new GenotypeDataset.
*/

def apply(ds: Dataset[GenotypeProduct],
          sequences: SequenceDictionary,
          samples: Iterable[Sample],
          headerLines: Seq[VCFHeaderLine]): GenotypeDataset = {
  // Materialize the samples as a Seq before building the Dataset-bound dataset.
  val sampleSeq = samples.toSeq

  DatasetBoundGenotypeDataset(ds, sequences, sampleSeq, headerLines)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ import org.bdgenomics.adam.rdd.{
}
import org.bdgenomics.adam.sql.{ VariantContext => VariantContextProduct }
import org.bdgenomics.adam.util.{ FileMerger, FileExtensions }
import org.bdgenomics.formats.avro.Sample
import org.bdgenomics.formats.avro.{ Reference, Sample }
import org.bdgenomics.utils.interval.array.{
IntervalArray,
IntervalArraySerializer
Expand Down Expand Up @@ -102,29 +102,41 @@ private[adam] class VariantContextArraySerializer extends IntervalArraySerialize
object VariantContextDataset extends Serializable {

/**
* Builds a VariantContextDataset without a partition map.
* Builds a VariantContextDataset from an RDD.
*
* @param rdd The underlying VariantContext RDD.
* @param sequences The sequence dictionary for the genomic dataset.
* @param references The references for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantContextDataset.
*/
def apply(rdd: RDD[VariantContext],
// NOTE(review): this text comes from a rendered commit diff without +/- markers; the
// `sequences` and `references` parameter lines look like the before and after forms of the
// second parameter -- confirm against the repository.
sequences: SequenceDictionary,
references: Iterable[Reference],
samples: Iterable[Sample],
headerLines: Seq[VCFHeaderLine]): VariantContextDataset = {
// Form taking `sequences`: passes the dictionary through unchanged, with an explicit trailing None.
RDDBoundVariantContextDataset(rdd, sequences, samples.toSeq, headerLines, None)

// Form taking `references`: converts them with SequenceDictionary.fromAvro and omits the fifth argument.
RDDBoundVariantContextDataset(rdd, SequenceDictionary.fromAvro(references.toSeq), samples.toSeq, headerLines)
}

/**
* Builds a VariantContextDataset from an RDD.
*
* @param rdd The underlying VariantContext RDD.
* @param sequences The sequence dictionary for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantContextDataset.
*/
def apply(rdd: RDD[VariantContext],
sequences: SequenceDictionary,
// NOTE(review): this text comes from a rendered commit diff without +/- markers; two
// signature tails and two bodies are interleaved below and do not form one valid
// definition as written -- confirm against the repository.
// Form A: three-parameter signature whose body passes null as the header lines.
samples: Iterable[Sample]): VariantContextDataset = {
RDDBoundVariantContextDataset(rdd, sequences, samples.toSeq, null)
// Form B: adds headerLines, defaulting to DefaultHeaderLines.allHeaderLines, and passes it
// through together with an explicit trailing None.
samples: Iterable[Sample],
headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): VariantContextDataset = {

RDDBoundVariantContextDataset(rdd, sequences, samples.toSeq, headerLines, None)
}

/**
* Builds a VariantContextDataset without a partition map.
* Builds a VariantContextDataset from a Dataset.
*
* @param ds The underlying VariantContext dataset.
* @return A new VariantContextDataset.
Expand All @@ -134,7 +146,24 @@ object VariantContextDataset extends Serializable {
}

/**
* Builds a VariantContextDataset without a partition map.
* Builds a VariantContextDataset from a Dataset.
*
* @param ds The underlying VariantContext dataset.
* @param references The references for the genomic dataset.
* @param samples The samples for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantContextDataset.
*/
def apply(ds: Dataset[VariantContextProduct],
          references: Iterable[Reference],
          samples: Iterable[Sample],
          headerLines: Seq[VCFHeaderLine]): VariantContextDataset = {
  // Convert the Avro-friendly inputs: the references become a SequenceDictionary
  // and the samples are materialized as a Seq.
  val sequences = SequenceDictionary.fromAvro(references.toSeq)
  val sampleSeq = samples.toSeq

  DatasetBoundVariantContextDataset(ds, sequences, sampleSeq, headerLines)
}

/**
* Builds a VariantContextDataset from a Dataset.
*
* @param ds The underlying VariantContext dataset.
* @param sequences The sequence dictionary for the genomic dataset.
Expand All @@ -146,6 +175,7 @@ object VariantContextDataset extends Serializable {
sequences: SequenceDictionary,
samples: Iterable[Sample],
headerLines: Seq[VCFHeaderLine]): VariantContextDataset = {

DatasetBoundVariantContextDataset(ds, sequences, samples.toSeq, headerLines)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ import org.bdgenomics.adam.rdd.{
import org.bdgenomics.adam.rich.RichVariant
import org.bdgenomics.adam.serialization.AvroSerializer
import org.bdgenomics.adam.sql.{ Variant => VariantProduct }
import org.bdgenomics.formats.avro.{ Sample, Variant }
import org.bdgenomics.formats.avro.{
Reference,
Sample,
Variant
}
import org.bdgenomics.utils.interval.array.{
IntervalArray,
IntervalArraySerializer
Expand Down Expand Up @@ -78,42 +82,73 @@ private[adam] class VariantArraySerializer extends IntervalArraySerializer[Refer
object VariantDataset extends Serializable {

/**
* Builds a VariantDataset without a partition map.
* Builds a VariantDataset from an RDD.
*
* @param rdd The underlying Variant RDD.
* @param references The references for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantDataset.
*/
def apply(rdd: RDD[Variant],
          references: Iterable[Reference],
          headerLines: Seq[VCFHeaderLine]): VariantDataset = {
  // Convert the Avro references into a SequenceDictionary first.
  val sequences = SequenceDictionary.fromAvro(references.toSeq)

  // The trailing None is presumably the optional partition map -- TODO confirm
  // against RDDBoundVariantDataset.
  RDDBoundVariantDataset(rdd, sequences, headerLines, None)
}

/**
* Builds a VariantDataset from an RDD.
*
* @param rdd The underlying Variant RDD.
* @param sequences The sequence dictionary for the RDD.
* @param headerLines The header lines for the RDD.
* @return A new Variant RDD.
* @param sequences The sequence dictionary for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantDataset.
*/
def apply(rdd: RDD[Variant],
sequences: SequenceDictionary,
headerLines: Seq[VCFHeaderLine] = DefaultHeaderLines.allHeaderLines): VariantDataset = {

// NOTE(review): this text comes from a rendered commit diff without +/- markers; the two
// statements below look like the before and after forms of a single line (with and without
// `new`) -- confirm against the repository.
new RDDBoundVariantDataset(rdd, sequences, headerLines, None)
RDDBoundVariantDataset(rdd, sequences, headerLines, None)
}

/**
* A dataset containing variants called against a given reference genome.
* Builds a VariantDataset from a Dataset.
*
* @param ds Variants.
* @param sequences A dictionary describing the reference genome.
* @param ds The underlying Variant Dataset.
* @return A new VariantDataset.
*/
def apply(ds: Dataset[VariantProduct]): VariantDataset = {
// NOTE(review): this text comes from a rendered commit diff without +/- markers; the two
// statements below look like the before and after forms of a single line -- confirm against
// the repository.
// Form 1: goes through the companion's three-argument apply with an empty sequence
// dictionary and the default header lines.
VariantDataset(ds, SequenceDictionary.empty, DefaultHeaderLines.allHeaderLines)
// Form 2: builds the Dataset-bound implementation directly with the same arguments.
DatasetBoundVariantDataset(ds, SequenceDictionary.empty, DefaultHeaderLines.allHeaderLines)
}

/**
* A dataset containing variants called against a given reference genome.
* Builds a VariantDataset from a Dataset.
*
* @param ds Variants.
* @param sequences A dictionary describing the reference genome.
* @param headerLines The VCF header lines that cover all INFO/FORMAT fields
* needed to represent this RDD of Variants.
* @param ds The underlying Variant Dataset.
* @param references The references for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantDataset.
*/
def apply(ds: Dataset[VariantProduct],
          references: Iterable[Reference],
          headerLines: Seq[VCFHeaderLine]): VariantDataset = {
  // Convert the Avro references into a SequenceDictionary first.
  val sequences = SequenceDictionary.fromAvro(references.toSeq)

  DatasetBoundVariantDataset(ds, sequences, headerLines)
}

/**
* Builds a VariantDataset from a Dataset.
*
* @param ds The underlying Variant Dataset.
* @param sequences The sequence dictionary for the genomic dataset.
* @param headerLines The header lines for the genomic dataset.
* @return A new VariantDataset.
*/
def apply(ds: Dataset[VariantProduct],
sequences: SequenceDictionary,
headerLines: Seq[VCFHeaderLine]): VariantDataset = {
// NOTE(review): this text comes from a rendered commit diff without +/- markers; the two
// statements below look like the before and after forms of a single line (with and without
// `new`) -- confirm against the repository.
new DatasetBoundVariantDataset(ds, sequences, headerLines)

DatasetBoundVariantDataset(ds, sequences, headerLines)
}
}

Expand Down

0 comments on commit cc5b30c

Please sign in to comment.