diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala
index 1684e627d6..b2d3efe793 100644
--- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala
+++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala
@@ -24,10 +24,10 @@ import org.bdgenomics.adam.rdd.{
GenomicDataset,
GenomicDatasetConversion
}
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
-import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset }
+import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset }
import org.bdgenomics.adam.rdd.variant.{
VariantDataset,
GenotypeDataset,
@@ -38,18 +38,15 @@ import org.bdgenomics.adam.sql.{
Feature => FeatureProduct,
Fragment => FragmentProduct,
Genotype => GenotypeProduct,
- NucleotideContigFragment => NucleotideContigFragmentProduct,
+ Read => ReadProduct,
+ Sequence => SequenceProduct,
+ Slice => SliceProduct,
Variant => VariantProduct,
VariantContext => VariantContextProduct
}
import org.bdgenomics.formats.avro._
import scala.reflect.runtime.universe._
-trait ToContigDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
-
- val yTag: TypeTag[NucleotideContigFragmentProduct] = typeTag[NucleotideContigFragmentProduct]
-}
-
trait ToCoverageDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Coverage, Coverage, CoverageDataset] {
val yTag: TypeTag[Coverage] = typeTag[Coverage]
@@ -75,63 +72,27 @@ trait ToGenotypeDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]]
val yTag: TypeTag[GenotypeProduct] = typeTag[GenotypeProduct]
}
-trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantDataset] {
-
- val yTag: TypeTag[VariantProduct] = typeTag[VariantProduct]
-}
-
-trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextDataset] {
-
- val yTag: TypeTag[VariantContextProduct] = typeTag[VariantContextProduct]
-}
-
-final class ContigsToCoverageDatasetConverter extends ToCoverageDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
+trait ToReadDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Read, ReadProduct, ReadDataset] {
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[Coverage]): CoverageDataset = {
- ADAMContext.contigsToCoverageDatasetConversionFn(v1, v2)
- }
+ val yTag: TypeTag[ReadProduct] = typeTag[ReadProduct]
}
-final class ContigsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
+trait ToSequenceDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Sequence, SequenceProduct, SequenceDataset] {
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[FeatureProduct]): FeatureDataset = {
- ADAMContext.contigsToFeaturesDatasetConversionFn(v1, v2)
- }
+ val yTag: TypeTag[SequenceProduct] = typeTag[SequenceProduct]
}
-final class ContigsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
+trait ToSliceDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Slice, SliceProduct, SliceDataset] {
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[FragmentProduct]): FragmentDataset = {
- ADAMContext.contigsToFragmentsDatasetConversionFn(v1, v2)
- }
-}
-
-final class ContigsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = {
- ADAMContext.contigsToAlignmentRecordsDatasetConversionFn(v1, v2)
- }
+ val yTag: TypeTag[SliceProduct] = typeTag[SliceProduct]
}
-final class ContigsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = {
- ADAMContext.contigsToGenotypesDatasetConversionFn(v1, v2)
- }
-}
-
-final class ContigsToVariantsDatasetConverter extends ToVariantDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = {
- ADAMContext.contigsToVariantsDatasetConversionFn(v1, v2)
- }
+trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantDataset] {
+
+ val yTag: TypeTag[VariantProduct] = typeTag[VariantProduct]
}
-final class CoverageToContigsDatasetConverter extends ToContigDatasetConversion[Coverage, Coverage, CoverageDataset] {
-
- def call(v1: CoverageDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.coverageToContigsDatasetConversionFn(v1, v2)
- }
+trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextDataset] {
+
+ val yTag: TypeTag[VariantContextProduct] = typeTag[VariantContextProduct]
}
final class CoverageToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Coverage, Coverage, CoverageDataset] {
@@ -162,17 +123,30 @@ final class CoverageToGenotypesDatasetConverter extends ToGenotypeDatasetConvers
}
}
-final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageDataset] {
+final class CoverageToReadsDatasetConverter extends ToReadDatasetConversion[Coverage, Coverage, CoverageDataset] {
- def call(v1: CoverageDataset, v2: Dataset[VariantProduct]): VariantDataset = {
- ADAMContext.coverageToVariantsDatasetConversionFn(v1, v2)
+ def call(v1: CoverageDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.coverageToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class CoverageToSequencesDatasetConverter extends ToSequenceDatasetConversion[Coverage, Coverage, CoverageDataset] {
+
+ def call(v1: CoverageDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.coverageToSequencesDatasetConversionFn(v1, v2)
}
}
-final class FeaturesToContigsDatasetConverter extends ToContigDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
+final class CoverageToSlicesDatasetConverter extends ToSliceDatasetConversion[Coverage, Coverage, CoverageDataset] {
- def call(v1: FeatureDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.featuresToContigsDatasetConversionFn(v1, v2)
+ def call(v1: CoverageDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.coverageToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageDataset] {
+
+ def call(v1: CoverageDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.coverageToVariantsDatasetConversionFn(v1, v2)
}
}
@@ -204,17 +178,30 @@ final class FeaturesToGenotypesDatasetConverter extends ToGenotypeDatasetConvers
}
}
-final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
+final class FeaturesToReadsDatasetConverter extends ToReadDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
- def call(v1: FeatureDataset, v2: Dataset[VariantProduct]): VariantDataset = {
- ADAMContext.featuresToVariantsDatasetConversionFn(v1, v2)
+ def call(v1: FeatureDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.featuresToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class FeaturesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
+
+ def call(v1: FeatureDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.featuresToSequencesDatasetConversionFn(v1, v2)
}
}
-final class FragmentsToContigsDatasetConverter extends ToContigDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
+final class FeaturesToSlicesDatasetConverter extends ToSliceDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
+
+ def call(v1: FeatureDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.featuresToSlicesDatasetConversionFn(v1, v2)
+ }
+}
- def call(v1: FragmentDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.fragmentsToContigsDatasetConversionFn(v1, v2)
+final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureDataset] {
+
+ def call(v1: FeatureDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.featuresToVariantsDatasetConversionFn(v1, v2)
}
}
@@ -246,22 +233,34 @@ final class FragmentsToGenotypesDatasetConverter extends ToGenotypeDatasetConver
}
}
-final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
+final class FragmentsToReadsDatasetConverter extends ToReadDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
- def call(v1: FragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = {
- ADAMContext.fragmentsToVariantsDatasetConversionFn(v1, v2)
+ def call(v1: FragmentDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.fragmentsToReadsDatasetConversionFn(v1, v2)
}
}
-final class AlignmentRecordsToContigsDatasetConverter extends ToContigDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
+final class FragmentsToSequencesDatasetConverter extends ToSequenceDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
- def call(v1: AlignmentRecordDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.alignmentRecordsToContigsDatasetConversionFn(v1, v2)
+ def call(v1: FragmentDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.fragmentsToSequencesDatasetConversionFn(v1, v2)
}
}
-final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
+final class FragmentsToSlicesDatasetConverter extends ToSliceDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
+
+ def call(v1: FragmentDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.fragmentsToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentDataset] {
+
+ def call(v1: FragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.fragmentsToVariantsDatasetConversionFn(v1, v2)
+ }
+}
+final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
def call(v1: AlignmentRecordDataset, v2: Dataset[Coverage]): CoverageDataset = {
ADAMContext.alignmentRecordsToCoverageDatasetConversionFn(v1, v2)
}
@@ -288,22 +287,34 @@ final class AlignmentRecordsToGenotypesDatasetConverter extends ToGenotypeDatase
}
}
-final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
+final class AlignmentRecordsToReadsDatasetConverter extends ToReadDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
- def call(v1: AlignmentRecordDataset, v2: Dataset[VariantProduct]): VariantDataset = {
- ADAMContext.alignmentRecordsToVariantsDatasetConversionFn(v1, v2)
+ def call(v1: AlignmentRecordDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.alignmentRecordsToReadsDatasetConversionFn(v1, v2)
}
}
-final class GenotypesToContigsDatasetConverter extends ToContigDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
+final class AlignmentRecordsToSequencesDatasetConverter extends ToSequenceDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
- def call(v1: GenotypeDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.genotypesToContigsDatasetConversionFn(v1, v2)
+ def call(v1: AlignmentRecordDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.alignmentRecordsToSequencesDatasetConversionFn(v1, v2)
}
}
-final class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
+final class AlignmentRecordsToSlicesDatasetConverter extends ToSliceDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
+
+ def call(v1: AlignmentRecordDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.alignmentRecordsToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] {
+
+ def call(v1: AlignmentRecordDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.alignmentRecordsToVariantsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
def call(v1: GenotypeDataset, v2: Dataset[Coverage]): CoverageDataset = {
ADAMContext.genotypesToCoverageDatasetConversionFn(v1, v2)
}
@@ -330,6 +341,27 @@ final class GenotypesToAlignmentRecordsDatasetConverter extends ToAlignmentRecor
}
}
+final class GenotypesToReadsDatasetConverter extends ToReadDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
+
+ def call(v1: GenotypeDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.genotypesToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
+
+ def call(v1: GenotypeDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.genotypesToSequencesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToSlicesDatasetConverter extends ToSliceDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
+
+ def call(v1: GenotypeDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.genotypesToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] {
def call(v1: GenotypeDataset, v2: Dataset[VariantProduct]): VariantDataset = {
@@ -337,15 +369,175 @@ final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversi
}
}
-final class VariantsToContigsDatasetConverter extends ToContigDatasetConversion[Variant, VariantProduct, VariantDataset] {
+final class ReadsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Read, ReadProduct, ReadDataset] {
- def call(v1: VariantDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = {
- ADAMContext.variantsToContigsDatasetConversionFn(v1, v2)
+ def call(v1: ReadDataset, v2: Dataset[Coverage]): CoverageDataset = {
+ ADAMContext.readsToCoverageDatasetConversionFn(v1, v2)
}
}
-final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantDataset] {
+final class ReadsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[FeatureProduct]): FeatureDataset = {
+ ADAMContext.readsToFeaturesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[FragmentProduct]): FragmentDataset = {
+ ADAMContext.readsToFragmentsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = {
+ ADAMContext.readsToAlignmentRecordsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = {
+ ADAMContext.readsToGenotypesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToSequencesDatasetConverter extends ToSequenceDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.readsToSequencesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToSlicesDatasetConverter extends ToSliceDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.readsToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToVariantsDatasetConverter extends ToVariantDatasetConversion[Read, ReadProduct, ReadDataset] {
+
+ def call(v1: ReadDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.readsToVariantsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[Coverage]): CoverageDataset = {
+ ADAMContext.sequencesToCoverageDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[FeatureProduct]): FeatureDataset = {
+ ADAMContext.sequencesToFeaturesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[FragmentProduct]): FragmentDataset = {
+ ADAMContext.sequencesToFragmentsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = {
+ ADAMContext.sequencesToAlignmentRecordsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = {
+ ADAMContext.sequencesToGenotypesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToReadsDatasetConverter extends ToReadDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.sequencesToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToSlicesDatasetConverter extends ToSliceDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.sequencesToSlicesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToVariantsDatasetConverter extends ToVariantDatasetConversion[Sequence, SequenceProduct, SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.sequencesToVariantsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[Coverage]): CoverageDataset = {
+ ADAMContext.slicesToCoverageDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[FeatureProduct]): FeatureDataset = {
+ ADAMContext.slicesToFeaturesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[FragmentProduct]): FragmentDataset = {
+ ADAMContext.slicesToFragmentsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = {
+ ADAMContext.slicesToAlignmentRecordsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = {
+ ADAMContext.slicesToGenotypesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToReadsDatasetConverter extends ToReadDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.slicesToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.slicesToSequencesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToVariantsDatasetConverter extends ToVariantDatasetConversion[Slice, SliceProduct, SliceDataset] {
+
+ def call(v1: SliceDataset, v2: Dataset[VariantProduct]): VariantDataset = {
+ ADAMContext.slicesToVariantsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantDataset] {
def call(v1: VariantDataset, v2: Dataset[Coverage]): CoverageDataset = {
ADAMContext.variantsToCoverageDatasetConversionFn(v1, v2)
}
@@ -378,3 +570,24 @@ final class VariantsToGenotypesDatasetConverter extends ToGenotypeDatasetConvers
ADAMContext.variantsToGenotypesDatasetConversionFn(v1, v2)
}
}
+
+final class VariantsToReadsDatasetConverter extends ToReadDatasetConversion[Variant, VariantProduct, VariantDataset] {
+
+ def call(v1: VariantDataset, v2: Dataset[ReadProduct]): ReadDataset = {
+ ADAMContext.variantsToReadsDatasetConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToSequencesDatasetConverter extends ToSequenceDatasetConversion[Variant, VariantProduct, VariantDataset] {
+
+ def call(v1: VariantDataset, v2: Dataset[SequenceProduct]): SequenceDataset = {
+ ADAMContext.variantsToSequencesDatasetConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToSlicesDatasetConverter extends ToSliceDatasetConversion[Variant, VariantProduct, VariantDataset] {
+
+ def call(v1: VariantDataset, v2: Dataset[SliceProduct]): SliceDataset = {
+ ADAMContext.variantsToSlicesDatasetConversionFn(v1, v2)
+ }
+}
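
The converter classes above exist so that JVM callers outside Scala can drive cross-type transformations without supplying Scala implicits: each final class bundles the target type's TypeTag with the matching ADAMContext conversion function behind the GenomicDatasetConversion interface. A minimal usage sketch, not part of this patch, assuming the transmuteDataset overload on GenomicDataset that pairs a Spark java.function.Function with one of these conversions; the Dataset transformation itself is left caller-supplied, and the object name is illustrative only:

import org.apache.spark.api.java.function.{ Function => JFunction }
import org.apache.spark.sql.Dataset
import org.bdgenomics.adam.api.java.CoverageToFeaturesDatasetConverter
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
import org.bdgenomics.adam.sql.{ Feature => FeatureProduct }
import org.bdgenomics.formats.avro.Coverage

object DatasetConverterUsage {
  // Re-type a CoverageDataset as a FeatureDataset; toFeatureFn holds the
  // actual Dataset[Coverage] => Dataset[FeatureProduct] logic, and the
  // converter rebuilds the FeatureDataset wrapper around its output.
  def coverageToFeatures(
    coverage: CoverageDataset,
    toFeatureFn: JFunction[Dataset[Coverage], Dataset[FeatureProduct]]): FeatureDataset = {
    coverage.transmuteDataset(toFeatureFn, new CoverageToFeaturesDatasetConverter())
  }
}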
diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala
index cb10c9460a..b9e39068b5 100644
--- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala
+++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala
@@ -24,10 +24,10 @@ import org.bdgenomics.adam.models.{
VariantContext
}
import org.bdgenomics.adam.rdd.ADAMContext
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
-import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset }
+import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset }
import org.bdgenomics.adam.rdd.variant.{
VariantDataset,
GenotypeDataset,
@@ -35,68 +35,7 @@ import org.bdgenomics.adam.rdd.variant.{
}
import org.bdgenomics.formats.avro._
-final class ContigsToContigsConverter extends Function2[NucleotideContigFragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.contigsToContigsConversionFn(v1, v2)
- }
-}
-
-final class ContigsToCoverageConverter extends Function2[NucleotideContigFragmentDataset, RDD[Coverage], CoverageDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[Coverage]): CoverageDataset = {
- ADAMContext.contigsToCoverageConversionFn(v1, v2)
- }
-}
-
-final class ContigsToFeaturesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Feature], FeatureDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[Feature]): FeatureDataset = {
- ADAMContext.contigsToFeaturesConversionFn(v1, v2)
- }
-}
-
-final class ContigsToFragmentsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Fragment], FragmentDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[Fragment]): FragmentDataset = {
- ADAMContext.contigsToFragmentsConversionFn(v1, v2)
- }
-}
-
-final class ContigsToAlignmentRecordsConverter extends Function2[NucleotideContigFragmentDataset, RDD[AlignmentRecord], AlignmentRecordDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = {
- ADAMContext.contigsToAlignmentRecordsConversionFn(v1, v2)
- }
-}
-
-final class ContigsToGenotypesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Genotype], GenotypeDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[Genotype]): GenotypeDataset = {
- ADAMContext.contigsToGenotypesConversionFn(v1, v2)
- }
-}
-
-final class ContigsToVariantsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Variant], VariantDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[Variant]): VariantDataset = {
- ADAMContext.contigsToVariantsConversionFn(v1, v2)
- }
-}
-
-final class ContigsToVariantContextsConverter extends Function2[NucleotideContigFragmentDataset, RDD[VariantContext], VariantContextDataset] {
-
- def call(v1: NucleotideContigFragmentDataset, v2: RDD[VariantContext]): VariantContextDataset = {
- ADAMContext.contigsToVariantContextConversionFn(v1, v2)
- }
-}
-
-final class CoverageToContigsConverter extends Function2[CoverageDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: CoverageDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.coverageToContigsConversionFn(v1, v2)
- }
-}
+// coverage conversion functions
final class CoverageToCoverageConverter extends Function2[CoverageDataset, RDD[Coverage], CoverageDataset] {
@@ -133,6 +72,27 @@ final class CoverageToGenotypesConverter extends Function2[CoverageDataset, RDD[
}
}
+final class CoverageToReadsConverter extends Function2[CoverageDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: CoverageDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.coverageToReadsConversionFn(v1, v2)
+ }
+}
+
+final class CoverageToSequencesConverter extends Function2[CoverageDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: CoverageDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.coverageToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class CoverageToSlicesConverter extends Function2[CoverageDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: CoverageDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.coverageToSlicesConversionFn(v1, v2)
+ }
+}
+
final class CoverageToVariantsConverter extends Function2[CoverageDataset, RDD[Variant], VariantDataset] {
def call(v1: CoverageDataset, v2: RDD[Variant]): VariantDataset = {
@@ -147,12 +107,7 @@ final class CoverageToVariantContextConverter extends Function2[CoverageDataset,
}
}
-final class FeaturesToContigsConverter extends Function2[FeatureDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: FeatureDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.featuresToContigsConversionFn(v1, v2)
- }
-}
+// features conversion functions
final class FeaturesToCoverageConverter extends Function2[FeatureDataset, RDD[Coverage], CoverageDataset] {
@@ -189,6 +144,27 @@ final class FeaturesToGenotypesConverter extends Function2[FeatureDataset, RDD[G
}
}
+final class FeaturesToReadsConverter extends Function2[FeatureDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: FeatureDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.featuresToReadsConversionFn(v1, v2)
+ }
+}
+
+final class FeaturesToSequencesConverter extends Function2[FeatureDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: FeatureDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.featuresToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class FeaturesToSlicesConverter extends Function2[FeatureDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: FeatureDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.featuresToSlicesConversionFn(v1, v2)
+ }
+}
+
final class FeaturesToVariantsConverter extends Function2[FeatureDataset, RDD[Variant], VariantDataset] {
def call(v1: FeatureDataset, v2: RDD[Variant]): VariantDataset = {
@@ -203,12 +179,7 @@ final class FeaturesToVariantContextConverter extends Function2[FeatureDataset,
}
}
-final class FragmentsToContigsConverter extends Function2[FragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: FragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.fragmentsToContigsConversionFn(v1, v2)
- }
-}
+// fragments conversion functions
final class FragmentsToCoverageConverter extends Function2[FragmentDataset, RDD[Coverage], CoverageDataset] {
@@ -245,8 +216,28 @@ final class FragmentsToGenotypesConverter extends Function2[FragmentDataset, RDD
}
}
-final class FragmentsToVariantsConverter extends Function2[FragmentDataset, RDD[Variant], VariantDataset] {
+final class FragmentsToReadsConverter extends Function2[FragmentDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: FragmentDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.fragmentsToReadsConversionFn(v1, v2)
+ }
+}
+
+final class FragmentsToSequencesConverter extends Function2[FragmentDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: FragmentDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.fragmentsToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class FragmentsToSlicesConverter extends Function2[FragmentDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: FragmentDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.fragmentsToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class FragmentsToVariantsConverter extends Function2[FragmentDataset, RDD[Variant], VariantDataset] {
def call(v1: FragmentDataset, v2: RDD[Variant]): VariantDataset = {
ADAMContext.fragmentsToVariantsConversionFn(v1, v2)
}
@@ -259,12 +250,7 @@ final class FragmentsToVariantContextConverter extends Function2[FragmentDataset
}
}
-final class AlignmentRecordsToContigsConverter extends Function2[AlignmentRecordDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: AlignmentRecordDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.alignmentRecordsToContigsConversionFn(v1, v2)
- }
-}
+// alignment records conversion functions
final class AlignmentRecordsToCoverageConverter extends Function2[AlignmentRecordDataset, RDD[Coverage], CoverageDataset] {
@@ -301,8 +287,28 @@ final class AlignmentRecordsToGenotypesConverter extends Function2[AlignmentReco
}
}
-final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordDataset, RDD[Variant], VariantDataset] {
+final class AlignmentRecordsToReadsConverter extends Function2[AlignmentRecordDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: AlignmentRecordDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.alignmentRecordsToReadsConversionFn(v1, v2)
+ }
+}
+
+final class AlignmentRecordsToSequencesConverter extends Function2[AlignmentRecordDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: AlignmentRecordDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.alignmentRecordsToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class AlignmentRecordsToSlicesConverter extends Function2[AlignmentRecordDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: AlignmentRecordDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.alignmentRecordsToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordDataset, RDD[Variant], VariantDataset] {
def call(v1: AlignmentRecordDataset, v2: RDD[Variant]): VariantDataset = {
ADAMContext.alignmentRecordsToVariantsConversionFn(v1, v2)
}
@@ -315,12 +321,7 @@ final class AlignmentRecordsToVariantContextConverter extends Function2[Alignmen
}
}
-final class GenotypesToContigsConverter extends Function2[GenotypeDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: GenotypeDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.genotypesToContigsConversionFn(v1, v2)
- }
-}
+// genotypes conversion functions
final class GenotypesToCoverageConverter extends Function2[GenotypeDataset, RDD[Coverage], CoverageDataset] {
@@ -357,8 +358,28 @@ final class GenotypesToGenotypesConverter extends Function2[GenotypeDataset, RDD
}
}
-final class GenotypesToVariantsConverter extends Function2[GenotypeDataset, RDD[Variant], VariantDataset] {
+final class GenotypesToReadsConverter extends Function2[GenotypeDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: GenotypeDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.genotypesToReadsConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToSequencesConverter extends Function2[GenotypeDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: GenotypeDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.genotypesToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToSlicesConverter extends Function2[GenotypeDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: GenotypeDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.genotypesToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class GenotypesToVariantsConverter extends Function2[GenotypeDataset, RDD[Variant], VariantDataset] {
def call(v1: GenotypeDataset, v2: RDD[Variant]): VariantDataset = {
ADAMContext.genotypesToVariantsConversionFn(v1, v2)
}
@@ -371,13 +392,224 @@ final class GenotypesToVariantContextConverter extends Function2[GenotypeDataset
}
}
-final class VariantsToContigsConverter extends Function2[VariantDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
+// reads conversion functions
+
+final class ReadsToCoverageConverter extends Function2[ReadDataset, RDD[Coverage], CoverageDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Coverage]): CoverageDataset = {
+ ADAMContext.readsToCoverageConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToFeaturesConverter extends Function2[ReadDataset, RDD[Feature], FeatureDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Feature]): FeatureDataset = {
+ ADAMContext.readsToFeaturesConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToFragmentsConverter extends Function2[ReadDataset, RDD[Fragment], FragmentDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Fragment]): FragmentDataset = {
+ ADAMContext.readsToFragmentsConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToAlignmentRecordsConverter extends Function2[ReadDataset, RDD[AlignmentRecord], AlignmentRecordDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = {
+ ADAMContext.readsToAlignmentRecordsConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToGenotypesConverter extends Function2[ReadDataset, RDD[Genotype], GenotypeDataset] {
- def call(v1: VariantDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.variantsToContigsConversionFn(v1, v2)
+ def call(v1: ReadDataset, v2: RDD[Genotype]): GenotypeDataset = {
+ ADAMContext.readsToGenotypesConversionFn(v1, v2)
}
}
+final class ReadsToReadsConverter extends Function2[ReadDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.readsToReadsConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToSequencesConverter extends Function2[ReadDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.readsToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToSlicesConverter extends Function2[ReadDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.readsToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToVariantsConverter extends Function2[ReadDataset, RDD[Variant], VariantDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[Variant]): VariantDataset = {
+ ADAMContext.readsToVariantsConversionFn(v1, v2)
+ }
+}
+
+final class ReadsToVariantContextsConverter extends Function2[ReadDataset, RDD[VariantContext], VariantContextDataset] {
+
+ def call(v1: ReadDataset, v2: RDD[VariantContext]): VariantContextDataset = {
+ ADAMContext.readsToVariantContextsConversionFn(v1, v2)
+ }
+}
+
+// sequences conversion functions
+
+final class SequencesToCoverageConverter extends Function2[SequenceDataset, RDD[Coverage], CoverageDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Coverage]): CoverageDataset = {
+ ADAMContext.sequencesToCoverageConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToFeaturesConverter extends Function2[SequenceDataset, RDD[Feature], FeatureDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Feature]): FeatureDataset = {
+ ADAMContext.sequencesToFeaturesConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToFragmentsConverter extends Function2[SequenceDataset, RDD[Fragment], FragmentDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Fragment]): FragmentDataset = {
+ ADAMContext.sequencesToFragmentsConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToAlignmentRecordsConverter extends Function2[SequenceDataset, RDD[AlignmentRecord], AlignmentRecordDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = {
+ ADAMContext.sequencesToAlignmentRecordsConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToGenotypesConverter extends Function2[SequenceDataset, RDD[Genotype], GenotypeDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Genotype]): GenotypeDataset = {
+ ADAMContext.sequencesToGenotypesConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToReadsConverter extends Function2[SequenceDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.sequencesToReadsConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToSequencesConverter extends Function2[SequenceDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.sequencesToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToSlicesConverter extends Function2[SequenceDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.sequencesToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToVariantsConverter extends Function2[SequenceDataset, RDD[Variant], VariantDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[Variant]): VariantDataset = {
+ ADAMContext.sequencesToVariantsConversionFn(v1, v2)
+ }
+}
+
+final class SequencesToVariantContextsConverter extends Function2[SequenceDataset, RDD[VariantContext], VariantContextDataset] {
+
+ def call(v1: SequenceDataset, v2: RDD[VariantContext]): VariantContextDataset = {
+ ADAMContext.sequencesToVariantContextsConversionFn(v1, v2)
+ }
+}
+
+// slices conversion functions
+
+final class SlicesToCoverageConverter extends Function2[SliceDataset, RDD[Coverage], CoverageDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Coverage]): CoverageDataset = {
+ ADAMContext.slicesToCoverageConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToFeaturesConverter extends Function2[SliceDataset, RDD[Feature], FeatureDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Feature]): FeatureDataset = {
+ ADAMContext.slicesToFeaturesConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToFragmentsConverter extends Function2[SliceDataset, RDD[Fragment], FragmentDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Fragment]): FragmentDataset = {
+ ADAMContext.slicesToFragmentsConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToAlignmentRecordsConverter extends Function2[SliceDataset, RDD[AlignmentRecord], AlignmentRecordDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = {
+ ADAMContext.slicesToAlignmentRecordsConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToGenotypesConverter extends Function2[SliceDataset, RDD[Genotype], GenotypeDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Genotype]): GenotypeDataset = {
+ ADAMContext.slicesToGenotypesConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToReadsConverter extends Function2[SliceDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.slicesToReadsConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToSequencesConverter extends Function2[SliceDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.slicesToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToSlicesConverter extends Function2[SliceDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.slicesToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToVariantsConverter extends Function2[SliceDataset, RDD[Variant], VariantDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[Variant]): VariantDataset = {
+ ADAMContext.slicesToVariantsConversionFn(v1, v2)
+ }
+}
+
+final class SlicesToVariantContextsConverter extends Function2[SliceDataset, RDD[VariantContext], VariantContextDataset] {
+
+ def call(v1: SliceDataset, v2: RDD[VariantContext]): VariantContextDataset = {
+ ADAMContext.slicesToVariantContextsConversionFn(v1, v2)
+ }
+}
+
+// variants conversion functions
+
final class VariantsToCoverageConverter extends Function2[VariantDataset, RDD[Coverage], CoverageDataset] {
def call(v1: VariantDataset, v2: RDD[Coverage]): CoverageDataset = {
@@ -413,8 +645,28 @@ final class VariantsToGenotypesConverter extends Function2[VariantDataset, RDD[G
}
}
-final class VariantsToVariantsConverter extends Function2[VariantDataset, RDD[Variant], VariantDataset] {
+final class VariantsToReadsConverter extends Function2[VariantDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: VariantDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.variantsToReadsConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToSequencesConverter extends Function2[VariantDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: VariantDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.variantsToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToSlicesConverter extends Function2[VariantDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: VariantDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.variantsToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class VariantsToVariantsConverter extends Function2[VariantDataset, RDD[Variant], VariantDataset] {
def call(v1: VariantDataset, v2: RDD[Variant]): VariantDataset = {
ADAMContext.variantsToVariantsConversionFn(v1, v2)
}
@@ -427,12 +679,7 @@ final class VariantsToVariantContextConverter extends Function2[VariantDataset,
}
}
-final class VariantContextsToContigsConverter extends Function2[VariantContextDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] {
-
- def call(v1: VariantContextDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = {
- ADAMContext.variantContextsToContigsConversionFn(v1, v2)
- }
-}
+// variant contexts conversion functions
final class VariantContextsToCoverageConverter extends Function2[VariantContextDataset, RDD[Coverage], CoverageDataset] {
@@ -469,8 +716,28 @@ final class VariantContextsToGenotypesConverter extends Function2[VariantContext
}
}
-final class VariantContextsToVariantsConverter extends Function2[VariantContextDataset, RDD[Variant], VariantDataset] {
+final class VariantContextsToReadsConverter extends Function2[VariantContextDataset, RDD[Read], ReadDataset] {
+
+ def call(v1: VariantContextDataset, v2: RDD[Read]): ReadDataset = {
+ ADAMContext.variantContextsToReadsConversionFn(v1, v2)
+ }
+}
+
+final class VariantContextsToSequencesConverter extends Function2[VariantContextDataset, RDD[Sequence], SequenceDataset] {
+
+ def call(v1: VariantContextDataset, v2: RDD[Sequence]): SequenceDataset = {
+ ADAMContext.variantContextsToSequencesConversionFn(v1, v2)
+ }
+}
+
+final class VariantContextsToSlicesConverter extends Function2[VariantContextDataset, RDD[Slice], SliceDataset] {
+
+ def call(v1: VariantContextDataset, v2: RDD[Slice]): SliceDataset = {
+ ADAMContext.variantContextsToSlicesConversionFn(v1, v2)
+ }
+}
+
+final class VariantContextsToVariantsConverter extends Function2[VariantContextDataset, RDD[Variant], VariantDataset] {
def call(v1: VariantContextDataset, v2: RDD[Variant]): VariantDataset = {
ADAMContext.variantContextsToVariantsConversionFn(v1, v2)
}
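
These Function2 classes are the RDD-level counterparts of the Dataset converters in GenomicDatasetConverters.scala. A sketch of how one might be applied, not part of this patch, assuming the transmute overload on GenomicDataset that takes a JavaRDD transformation plus one of these Function2 instances; the RDD[AlignmentRecord] => RDD[Read] mapping itself is caller-supplied:

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.java.function.{ Function => JFunction }
import org.bdgenomics.adam.api.java.AlignmentRecordsToReadsConverter
import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset }
import org.bdgenomics.formats.avro.{ AlignmentRecord, Read }

object RddConverterUsage {
  // Re-type aligned reads as plain Read records; the converter supplies
  // the ReadDataset wrapper around the transformed RDD.
  def alignmentsToReads(
    alignments: AlignmentRecordDataset,
    toReadFn: JFunction[JavaRDD[AlignmentRecord], JavaRDD[Read]]): ReadDataset = {
    alignments.transmute(toReadFn, new AlignmentRecordsToReadsConverter())
  }
}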
diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala
index 31e0e15781..14f679452b 100644
--- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala
+++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala
@@ -21,10 +21,10 @@ import htsjdk.samtools.ValidationStringency
import org.apache.spark.api.java.JavaSparkContext
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
-import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset }
+import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset }
import org.bdgenomics.adam.rdd.variant.{
GenotypeDataset,
VariantDataset
@@ -131,26 +131,6 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
ac.loadIndexedBam(pathName, viewRegions.toIterable, stringency = stringency)
}
- /**
- * (Java-specific) Load nucleotide contig fragments into a NucleotideContigFragmentDataset.
- *
- * If the path name has a .fa/.fasta extension, load as FASTA format.
- * Else, fall back to Parquet + Avro.
- *
- * For FASTA format, compressed files are supported through compression codecs configured
- * in Hadoop, which by default include .gz and .bz2, but can include more.
- *
- * @see ADAMContext#loadContigFragments
- *
- * @param pathName The path name to load nucleotide contig fragments from.
- * Globs/directories are supported, although file extension must be present
- * for FASTA format.
- * @return Returns a NucleotideContigFragmentDataset.
- */
- def loadContigFragments(pathName: java.lang.String): NucleotideContigFragmentDataset = {
- ac.loadContigFragments(pathName)
- }
-
/**
* (Java-specific) Load fragments into a FragmentDataset.
*
@@ -390,10 +370,10 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
/**
* (Java-specific) Load reference sequences into a broadcastable ReferenceFile.
*
- * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadContigFragments
+ * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices
* to load the reference as an RDD, which is then collected to the driver.
*
- * @see loadContigFragments
+ * @see ADAMContext#loadSlices
*
* @param pathName The path name to load reference sequences from.
* Globs/directories for 2bit format are not supported.
@@ -409,11 +389,11 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
/**
* (Java-specific) Load reference sequences into a broadcastable ReferenceFile.
*
- * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadContigFragments
+ * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices
* to load the reference as an RDD, which is then collected to the driver. Uses a
* maximum fragment length of 10kbp.
*
- * @see loadContigFragments
+ * @see ADAMContext#loadSlices
*
* @param pathName The path name to load reference sequences from.
* Globs/directories for 2bit format are not supported.
@@ -422,4 +402,90 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
def loadReferenceFile(pathName: java.lang.String): ReferenceFile = {
loadReferenceFile(pathName, 10000L)
}
+
+ /**
+ * (Java-specific) Load DNA sequences into a SequenceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @see ADAMContext#loadFastaDna
+ * @see ADAMContext#loadParquetSequences
+ *
+ * @param pathName The path name to load sequences from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @return Returns a SequenceDataset containing DNA sequences.
+ */
+ def loadDnaSequences(pathName: java.lang.String): SequenceDataset = {
+ ac.loadDnaSequences(pathName)
+ }
+
+ /**
+ * (Java-specific) Load protein sequences into a SequenceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @see ADAMContext#loadFastaProtein
+ * @see ADAMContext#loadParquetSequences
+ *
+ * @param pathName The path name to load sequences from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @return Returns a SequenceDataset containing protein sequences.
+ */
+ def loadProteinSequences(pathName: java.lang.String): SequenceDataset = {
+ ac.loadProteinSequences(pathName)
+ }
+
+ /**
+ * (Java-specific) Load RNA sequences into a SequenceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @see ADAMContext#loadFastaRna
+ * @see ADAMContext#loadParquetSequences
+ *
+ * @param pathName The path name to load sequences from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @return Returns a SequenceDataset containing RNA sequences.
+ */
+ def loadRnaSequences(pathName: java.lang.String): SequenceDataset = {
+ ac.loadRnaSequences(pathName)
+ }
+
+ /**
+ * (Java-specific) Load slices into a SliceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as DNA in FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @param pathName The path name to load DNA slices from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @param maximumLength Maximum fragment length. Values greater
+ * than 1e9 should be avoided.
+ * @return Returns a SliceDataset.
+ */
+ def loadSlices(
+ pathName: java.lang.String,
+ maximumLength: java.lang.Long): SliceDataset = {
+
+ ac.loadSlices(pathName, maximumLength)
+ }
}
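
The new loaders mirror the FASTA entry points on the Scala ADAMContext. A small example of the round trip that the tests below exercise, called from Scala with a hypothetical sample.fa path:

import org.bdgenomics.adam.api.java.JavaADAMContext
import org.bdgenomics.adam.rdd.ADAMContext

object SequenceLoadingExample {
  def summarize(ac: ADAMContext): Unit = {
    val jac = new JavaADAMContext(ac)
    // FASTA in, whole records out
    val sequences = jac.loadDnaSequences("sample.fa")
    // The same file loaded as slices, fragmented at a 10 kbp maximum length
    val slices = jac.loadSlices("sample.fa", 10000L)
    println(s"${sequences.jrdd.count()} sequences, ${slices.jrdd.count()} slices")
  }
}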
diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java
new file mode 100644
index 0000000000..0e88d75a1f
--- /dev/null
+++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.api.java;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.bdgenomics.adam.rdd.ADAMContext;
+import org.bdgenomics.adam.rdd.sequence.SequenceDataset;
+
+/**
+ * A simple test class for the JavaADAMRDD/Context. Writes an RDD of sequences
+ * to disk and reads it back.
+ */
+final class JavaADAMSequenceConduit {
+ public static SequenceDataset conduit(final SequenceDataset sequenceDataset,
+ final ADAMContext ac) throws IOException {
+
+ // make temp directory and save file
+ Path tempDir = Files.createTempDirectory("javaAC");
+ String fileName = tempDir.toString() + "/testRdd.sequences.adam";
+ sequenceDataset.save(fileName, true, true);
+
+ // create a new adam context and load the file
+ JavaADAMContext jac = new JavaADAMContext(ac);
+ return jac.loadDnaSequences(fileName);
+ }
+}
diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java
similarity index 70%
rename from adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java
rename to adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java
index fe732bb02c..eead1baf97 100644
--- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java
+++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java
@@ -20,24 +20,25 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
+
import org.bdgenomics.adam.rdd.ADAMContext;
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset;
+import org.bdgenomics.adam.rdd.sequence.SliceDataset;
/**
- * A simple test class for the JavaADAMRDD/Context. Writes an RDD of nucleotide
- * contig fragments to disk and reads it back.
+ * A simple test class for the JavaADAMRDD/Context. Writes an RDD of slices
+ * to disk and reads it back.
*/
-final class JavaADAMContigConduit {
- public static NucleotideContigFragmentDataset conduit(final NucleotideContigFragmentDataset recordRdd,
- final ADAMContext ac) throws IOException {
+final class JavaADAMSliceConduit {
+ public static SliceDataset conduit(final SliceDataset sliceDataset,
+ final ADAMContext ac) throws IOException {
// make temp directory and save file
Path tempDir = Files.createTempDirectory("javaAC");
- String fileName = tempDir.toString() + "/testRdd.contig.adam";
- recordRdd.save(fileName, true);
+ String fileName = tempDir.toString() + "/testRdd.slices.adam";
+ sliceDataset.save(fileName, true, true);
// create a new adam context and load the file
JavaADAMContext jac = new JavaADAMContext(ac);
- return jac.loadContigFragments(fileName);
+ return jac.loadSlices(fileName, 10000L);
}
}
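
Here the 10000L argument to loadSlices is the maximum slice length applied when re-slicing on load, the same knob exposed as -maximum_length on TransformSlices below. Under the same assumptions as the sequence sketch above:

    // sliceDataset is assumed to be in scope; 10 kbp maximum slice length.
    import org.bdgenomics.adam.rdd.ADAMContext._

    val path = java.nio.file.Files.createTempDirectory("scalaAC").toString + "/testRdd.slices.adam"
    sliceDataset.save(path, true, true)
    val reloaded = sc.loadSlices(path, maximumLength = 10000L, optPredicate = None, optProjection = None)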
diff --git a/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala b/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala
index 12dfbbecd5..f912aee6ab 100644
--- a/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala
+++ b/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala
@@ -49,16 +49,6 @@ class JavaADAMContextSuite extends ADAMFunSuite {
assert(reads.rdd.count == 2)
}
- sparkTest("can read and write a small FASTA file") {
- val path = copyResource("chr20.250k.fa.gz")
- val aRdd = jac.loadContigFragments(path)
- assert(aRdd.jrdd.count() === 26)
-
- val newRdd = JavaADAMContigConduit.conduit(aRdd, sc)
-
- assert(newRdd.jrdd.count() === 26)
- }
-
sparkTest("can read and write a small .SAM file as fragments") {
val path = copyResource("small.sam")
val aRdd = jac.loadFragments(path)
@@ -114,4 +104,24 @@ class JavaADAMContextSuite extends ADAMFunSuite {
val refFile = jac.loadReferenceFile(path)
assert(refFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) === "CATCACGATG")
}
+
+ sparkTest("can read and write .fa as sequences") {
+ val path = copyResource("trinity.fa")
+ val sequences = jac.loadDnaSequences(path)
+ assert(sequences.jrdd.count() === 5)
+
+ val newRdd = JavaADAMSequenceConduit.conduit(sequences, sc)
+
+ assert(newRdd.jrdd.count() === 5)
+ }
+
+ sparkTest("can read and write .fa as slices") {
+ val path = copyResource("trinity.fa")
+ val slices = jac.loadSlices(path, 10000L)
+ assert(slices.jrdd.count() === 5)
+
+ val newRdd = JavaADAMSliceConduit.conduit(slices, sc)
+
+ assert(newRdd.jrdd.count() === 5)
+ }
}
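
The two new tests exercise the same five-record FASTA through both models: loadDnaSequences yields one Sequence per FASTA record, while loadSlices additionally fragments any record longer than the maximum length into multiple Slices. Every record in trinity.fa is shorter than 10 kbp, so both counts come out to 5; a sketch of the distinction in Scala:

    // Sketch only: the two counts diverge once records exceed maximumLength.
    import org.bdgenomics.adam.rdd.ADAMContext._

    val sequences = sc.loadDnaSequences("trinity.fa", optPredicate = None, optProjection = None)
    val slices = sc.loadSlices("trinity.fa", maximumLength = 10000L, optPredicate = None, optProjection = None)
    assert(sequences.rdd.count == slices.rdd.count) // both 5 for trinity.fa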
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala
deleted file mode 100644
index 5af5aae2d7..0000000000
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.bdgenomics.adam.cli
-
-import grizzled.slf4j.Logging
-import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.cli.FileSystemUtils._
-import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.formats.avro.NucleotideContigFragment
-import org.bdgenomics.utils.cli._
-import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
-
-class ADAM2FastaArgs extends Args4jBase {
- @Argument(required = true, metaVar = "ADAM", usage = "The Parquet file to convert", index = 0)
- var inputPath: String = null
- @Argument(required = true, metaVar = "FASTA", usage = "Location to write the FASTA to", index = 1)
- var outputPath: String = null
- @Args4jOption(required = false, name = "-single", usage = "Saves FASTA as single file")
- var asSingleFile: Boolean = false
- @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output")
- var disableFastConcat: Boolean = false
- @Args4jOption(required = false, name = "-coalesce", usage = "Choose the number of partitions to coalesce down to.")
- var coalesce: Int = -1
- @Args4jOption(required = false, name = "-force_shuffle_coalesce", usage = "Force shuffle while partitioning, default false.")
- var forceShuffle: Boolean = false
- @Args4jOption(required = false, name = "-line_width", usage = "Hard wrap FASTA formatted sequence at line width, default 60")
- var lineWidth: Int = 60
-}
-
-object ADAM2Fasta extends BDGCommandCompanion {
- override val commandName = "adam2fasta"
- override val commandDescription = "Convert ADAM nucleotide contig fragments to FASTA files"
-
- override def apply(cmdLine: Array[String]): ADAM2Fasta =
- new ADAM2Fasta(Args4j[ADAM2FastaArgs](cmdLine))
-}
-
-class ADAM2Fasta(val args: ADAM2FastaArgs) extends BDGSparkCommand[ADAM2FastaArgs] with Logging {
- override val companion = ADAM2Fasta
-
- override def run(sc: SparkContext): Unit = {
- checkWriteablePath(args.outputPath, sc.hadoopConfiguration)
-
- info("Loading ADAM nucleotide contig fragments from disk.")
- val contigFragments = sc.loadContigFragments(args.inputPath)
-
- info("Merging fragments and writing FASTA to disk.")
- val contigs = contigFragments.mergeFragments()
-
- val cc = if (args.coalesce > 0) {
- if (args.coalesce > contigs.rdd.partitions.length || args.forceShuffle) {
- contigs.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(args.coalesce, shuffle = true))
- } else {
- contigs.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(args.coalesce, shuffle = false))
- }
- } else {
- contigs
- }
- cc.saveAsFasta(
- args.outputPath,
- args.lineWidth,
- asSingleFile = args.asSingleFile,
- disableFastConcat = args.disableFastConcat
- )
- }
-}
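
The FASTA-to-Parquet round trip served by the deleted adam2fasta and fasta2adam commands is taken over by the transformSlices and transformSequences actions added below. A rough sketch of the old adam2fasta direction on the new API (saveAsFasta as defined on SliceDataset later in this patch; paths and options illustrative):

    import org.bdgenomics.adam.rdd.ADAMContext._

    // load slices stored as Parquet, then write them back out as FASTA
    val slices = sc.loadSlices("contigs.adam", maximumLength = 10000L, optPredicate = None, optProjection = None)
    slices.saveAsFasta("contigs.fa", asSingleFile = true, lineWidth = 60)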
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala
index a816273b77..0fb688564d 100644
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala
@@ -33,10 +33,12 @@ object ADAMMain {
"ADAM ACTIONS",
List(
CountReadKmers,
- CountContigKmers,
+ CountSliceKmers,
TransformAlignments,
TransformFeatures,
TransformGenotypes,
+ TransformSequences,
+ TransformSlices,
TransformVariants,
MergeShards,
Reads2Coverage
@@ -45,8 +47,6 @@ object ADAMMain {
CommandGroup(
"CONVERSION OPERATIONS",
List(
- Fasta2ADAM,
- ADAM2Fasta,
ADAM2Fastq,
TransformFragments
)
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala
similarity index 79%
rename from adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala
rename to adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala
index b664d9c4bf..85b6061470 100644
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala
@@ -24,16 +24,16 @@ import org.bdgenomics.adam.cli.FileSystemUtils._
import org.bdgenomics.utils.cli._
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
-object CountContigKmers extends BDGCommandCompanion {
- val commandName = "countContigKmers"
- val commandDescription = "Counts the k-mers/q-mers from a read dataset."
+object CountSliceKmers extends BDGCommandCompanion {
+ val commandName = "countSliceKmers"
+ val commandDescription = "Counts the k-mers/q-mers from a slice dataset."
def apply(cmdLine: Array[String]) = {
- new CountContigKmers(Args4j[CountContigKmersArgs](cmdLine))
+ new CountSliceKmers(Args4j[CountSliceKmersArgs](cmdLine))
}
}
-class CountContigKmersArgs extends Args4jBase with ParquetArgs {
+class CountSliceKmersArgs extends Args4jBase with ParquetArgs {
@Argument(required = true, metaVar = "INPUT", usage = "The ADAM or FASTA file to count kmers from", index = 0)
var inputPath: String = null
@Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts", index = 1)
@@ -44,17 +44,17 @@ class CountContigKmersArgs extends Args4jBase with ParquetArgs {
var printHistogram: Boolean = false
}
-class CountContigKmers(protected val args: CountContigKmersArgs) extends BDGSparkCommand[CountContigKmersArgs] with Logging {
- val companion = CountContigKmers
+class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkCommand[CountSliceKmersArgs] with Logging {
+ val companion = CountSliceKmers
def run(sc: SparkContext) {
checkWriteablePath(args.outputPath, sc.hadoopConfiguration)
// read from disk
- val fragments = sc.loadContigFragments(args.inputPath)
+ val slices = sc.loadSlices(args.inputPath)
// count kmers
- val countedKmers = fragments.countKmers(args.kmerLength)
+ val countedKmers = slices.countKmers(args.kmerLength)
// print histogram, if requested
if (args.printHistogram) {
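
Beyond argument parsing, the command reduces to the loadSlices and countKmers calls above. A minimal interactive sketch (the 21-mer length and paths are illustrative, and the comma-separated text output is an assumption, not necessarily the command's exact format):

    import org.bdgenomics.adam.rdd.ADAMContext._

    val slices = sc.loadSlices("sample.fa")  // default maximum slice length
    val countedKmers = slices.countKmers(21) // RDD[(String, Long)]
    countedKmers.map(kv => kv._1 + ", " + kv._2).saveAsTextFile("sample.kmers")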
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala
deleted file mode 100644
index 36d525aab2..0000000000
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.bdgenomics.adam.cli
-
-import grizzled.slf4j.Logging
-import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.cli.FileSystemUtils._
-import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.formats.avro.NucleotideContigFragment
-import org.bdgenomics.utils.cli._
-import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
-
-object Fasta2ADAM extends BDGCommandCompanion {
- val commandName: String = "fasta2adam"
- val commandDescription: String = "Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences."
-
- def apply(cmdLine: Array[String]) = {
- new Fasta2ADAM(Args4j[Fasta2ADAMArgs](cmdLine))
- }
-}
-
-class Fasta2ADAMArgs extends Args4jBase with ParquetSaveArgs {
- @Argument(required = true, metaVar = "FASTA", usage = "The FASTA file to convert", index = 0)
- var fastaFile: String = null
- @Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM data", index = 1)
- var outputPath: String = null
- @Args4jOption(required = false, name = "-verbose", usage = "Prints enhanced debugging info, including contents of seq dict.")
- var verbose: Boolean = false
- @Args4jOption(required = false, name = "-reads", usage = "Maps contig IDs to match contig IDs of reads.")
- var reads: String = ""
- @Args4jOption(required = false, name = "-fragment_length", usage = "Sets maximum fragment length. Default value is 10,000. Values greater than 1e9 should be avoided.")
- var maximumLength: Long = 10000L
- @Args4jOption(required = false, name = "-repartition", usage = "Sets the number of output partitions to write, if desired.")
- var partitions: Int = -1
-}
-
-class Fasta2ADAM(protected val args: Fasta2ADAMArgs) extends BDGSparkCommand[Fasta2ADAMArgs] with Logging {
- val companion = Fasta2ADAM
-
- def run(sc: SparkContext) {
- checkWriteablePath(args.outputPath, sc.hadoopConfiguration)
-
- info("Loading FASTA data from disk.")
- val adamFasta = sc.loadFasta(args.fastaFile, maximumLength = args.maximumLength)
-
- if (args.verbose) {
- info("FASTA contains: %s".format(adamFasta.sequences.toString))
- }
-
- info("Writing records to disk.")
- val finalFasta = if (args.partitions > 0) {
- adamFasta.transform((rdd: RDD[NucleotideContigFragment]) => rdd.repartition(args.partitions))
- } else {
- adamFasta
- }
-
- finalFasta.saveAsParquet(args)
- }
-}
-
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala
index 88859800d2..9ddc232e1a 100644
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala
@@ -34,11 +34,11 @@ object TransformFeatures extends BDGCommandCompanion {
class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs {
@Argument(required = true, metaVar = "INPUT",
- usage = "The features file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0)
+ usage = "The feature file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0)
var featuresFile: String = _
@Argument(required = true, metaVar = "OUTPUT",
- usage = "Location to write ADAM features data. If extension is not detected, Parquet is assumed.", index = 1)
+ usage = "Location to write ADAM feature data. If extension is not detected, Parquet is assumed.", index = 1)
var outputPath: String = null
@Args4jOption(required = false, name = "-num_partitions",
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala
new file mode 100644
index 0000000000..6824973387
--- /dev/null
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala
@@ -0,0 +1,71 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.cli
+
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.formats.avro.Alphabet
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
+
+object TransformSequences extends BDGCommandCompanion {
+ val commandName = "transformSequences"
+ val commandDescription = "Convert a FASTA file as sequences into corresponding ADAM format and vice versa"
+
+ def apply(cmdLine: Array[String]) = {
+ new TransformSequences(Args4j[TransformSequencesArgs](cmdLine))
+ }
+}
+
+class TransformSequencesArgs extends Args4jBase with ParquetSaveArgs {
+ @Argument(required = true, metaVar = "INPUT",
+ usage = "The sequence file to convert (e.g., .fa, .fasta). If extension is not detected, Parquet is assumed.", index = 0)
+ var sequencesFile: String = _
+
+ @Argument(required = true, metaVar = "OUTPUT",
+ usage = "Location to write ADAM sequence data. If extension is not detected, Parquet is assumed.", index = 1)
+ var outputPath: String = null
+
+ @Args4jOption(required = false, name = "-single",
+ usage = "Save as a single file, for the text formats.")
+ var single: Boolean = false
+
+ @Args4jOption(required = false, name = "-alphabet",
+ usage = "Alphabet in which to interpret the loaded sequences { DNA, PROTEIN, RNA }. Defaults to Alphabet.DNA.")
+ var alphabet: String = "DNA"
+
+ @Args4jOption(required = false, name = "-disable_fast_concat",
+ usage = "Disables the parallel file concatenation engine.")
+ var disableFastConcat: Boolean = false
+}
+
+class TransformSequences(val args: TransformSequencesArgs)
+ extends BDGSparkCommand[TransformSequencesArgs] {
+
+ val companion = TransformSequences
+ val alphabet = Alphabet.valueOf(args.alphabet)
+
+ def run(sc: SparkContext) {
+ val sequences = alphabet match {
+ case Alphabet.DNA => sc.loadDnaSequences(args.sequencesFile, optPredicate = None, optProjection = None)
+ case Alphabet.PROTEIN => sc.loadProteinSequences(args.sequencesFile, optPredicate = None, optProjection = None)
+ case Alphabet.RNA => sc.loadRnaSequences(args.sequencesFile, optPredicate = None, optProjection = None)
+ }
+ sequences.save(args.outputPath, args.single, args.disableFastConcat)
+ }
+}
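
Invocation follows the other transform commands, for example: adam-submit transformSequences -alphabet RNA -single input.fa output.fa (adam-submit being ADAM's standard launcher). The programmatic equivalent of that RNA case is a sketch like:

    // Equivalent of "transformSequences -alphabet RNA -single"; paths illustrative.
    import org.bdgenomics.adam.rdd.ADAMContext._

    val rna = sc.loadRnaSequences("input.fa", optPredicate = None, optProjection = None)
    rna.save("output.fa", true, false) // asSingleFile = true, fast concat enabled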
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala
new file mode 100644
index 0000000000..a4d15fd1c8
--- /dev/null
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala
@@ -0,0 +1,69 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.cli
+
+import org.apache.spark.SparkContext
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.utils.cli._
+import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
+
+object TransformSlices extends BDGCommandCompanion {
+ val commandName = "transformSlices"
+ val commandDescription = "Convert a FASTA file as slices into corresponding ADAM format and vice versa"
+
+ def apply(cmdLine: Array[String]) = {
+ new TransformSlices(Args4j[TransformSlicesArgs](cmdLine))
+ }
+}
+
+class TransformSlicesArgs extends Args4jBase with ParquetSaveArgs {
+ @Argument(required = true, metaVar = "INPUT",
+ usage = "The slice file to convert (e.g., .fa, .fasta). If extension is not detected, Parquet is assumed.", index = 0)
+ var slicesFile: String = _
+
+ @Argument(required = true, metaVar = "OUTPUT",
+ usage = "Location to write ADAM slice data. If extension is not detected, Parquet is assumed.", index = 1)
+ var outputPath: String = null
+
+ @Args4jOption(required = false, name = "-maximum_length",
+ usage = "Maximum slice length. Defaults to 10000L.")
+ var maximumLength: Long = 10000L
+
+ @Args4jOption(required = false, name = "-single",
+ usage = "Save as a single file, for the text formats.")
+ var single: Boolean = false
+
+ @Args4jOption(required = false, name = "-disable_fast_concat",
+ usage = "Disables the parallel file concatenation engine.")
+ var disableFastConcat: Boolean = false
+}
+
+class TransformSlices(val args: TransformSlicesArgs)
+ extends BDGSparkCommand[TransformSlicesArgs] {
+
+ val companion = TransformSlices
+
+ def run(sc: SparkContext) {
+ sc.loadSlices(
+ args.slicesFile,
+ maximumLength = args.maximumLength,
+ optPredicate = None,
+ optProjection = None
+ ).save(args.outputPath, args.single, args.disableFastConcat)
+ }
+}
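
TransformSlices is the slice-granularity counterpart; its core, with illustrative paths and a 5 kbp maximum slice length in place of the 10 kbp default:

    import org.bdgenomics.adam.rdd.ADAMContext._

    sc.loadSlices("input.fa", maximumLength = 5000L, optPredicate = None, optProjection = None)
      .save("output.slices.adam", false, false)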
diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala
deleted file mode 100644
index fcdef5e73d..0000000000
--- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.bdgenomics.adam.cli
-
-import com.google.common.io.Files
-import java.io.File
-import org.bdgenomics.adam.util.ADAMFunSuite
-import org.bdgenomics.utils.cli._
-
-class ADAM2FastaSuite extends ADAMFunSuite {
-
- sparkTest("round trip FASTA to nucleotide contig fragments in ADAM format to FASTA") {
- val fastaFile = testFile("contigs.fa")
-
- val outputDir = Files.createTempDir()
- val outputContigFragmentsFile = outputDir.getAbsolutePath + "/contigs.adam"
- val outputFastaFile = outputDir.getAbsolutePath + "/contigs.fa"
-
- val args0: Array[String] = Array(fastaFile, outputContigFragmentsFile)
- new Fasta2ADAM(Args4j[Fasta2ADAMArgs](args0)).run(sc)
-
- val args1: Array[String] = Array(outputContigFragmentsFile, outputFastaFile)
- new ADAM2Fasta(Args4j[ADAM2FastaArgs](args1)).run(sc)
-
- val fastaLines = scala.io.Source.fromFile(new File(fastaFile)).getLines().toSeq
- val outputFastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq
-
- assert(outputFastaLines.length === fastaLines.length)
- outputFastaLines.zip(fastaLines).foreach(kv => assert(kv._1 === kv._2))
- }
-}
diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala
deleted file mode 100644
index ff5e87d4bc..0000000000
--- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.bdgenomics.adam.cli
-
-import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.util.ADAMFunSuite
-
-class Fasta2ADAMSuite extends ADAMFunSuite {
- sparkTest("can load fasta records after conversion") {
- val inputPath = copyResource("chr20.250k.fa.gz")
- val convertPath = tmpFile("chr20.contig.adam")
- val cmd = Fasta2ADAM(Array(inputPath, convertPath)).run(sc)
-
- val contigFragments = sc.loadParquetContigFragments(convertPath)
- assert(contigFragments.rdd.count() === 26)
- val first = contigFragments.rdd.first()
- assert(first.getContigName === null)
- assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
- assert(first.getIndex === 0)
- assert(first.getSequence.length === 10000)
- assert(first.getStart === 0L)
- assert(first.getEnd === 10000L)
- assert(first.getFragments === 26)
- }
-}
diff --git a/adam-core/pom.xml b/adam-core/pom.xml
index fb1ba39122..90ebb73448 100644
--- a/adam-core/pom.xml
+++ b/adam-core/pom.xml
@@ -106,10 +106,12 @@
+   * >description start-end:strand
+   * </pre>
+   *
+   * @param filePath Path to save files to.
+   * @param asSingleFile If true, saves output as a single file.
+   * @param disableFastConcat If asSingleFile is true, disables the use of the
+   *   parallel file merging engine.
+   * @param lineWidth Hard wrap FASTA formatted slice at line width, default 60.
+   */
+  def saveAsFasta(filePath: String,
+                  asSingleFile: Boolean = false,
+                  disableFastConcat: Boolean = false,
+                  lineWidth: Int = 60) {
+
+    def toFasta(slice: Slice): String = {
+      val sb = new StringBuilder()
+      sb.append(">")
+      sb.append(slice.getName)
+      Option(slice.getDescription).foreach(n => sb.append(" ").append(n))
+      sb.append(s" ${slice.getStart}-${slice.getEnd}:${slice.getStrand}")
+      slice.getSequence.grouped(lineWidth).foreach(line => {
+        sb.append("\n")
+        sb.append(line)
+      })
+      sb.toString
+    }
+
+    writeTextRdd(rdd.map(toFasta),
+      filePath,
+      asSingleFile = asSingleFile,
+      disableFastConcat = disableFastConcat)
+  }
+
+  /**
+   * Extract the specified region from this genomic dataset of slices as a string, merging
+   * slices if necessary.
+   *
+   * @param region Region to extract.
+   * @return Return the specified region from this genomic dataset of slices as a string, merging
+   *   slices if necessary.
+   */
+  def extract(region: ReferenceRegion): String = {
+    def getString(slice: (ReferenceRegion, Slice)): (ReferenceRegion, String) = {
+      val trimStart = max(0, region.start - slice._1.start).toInt
+      val trimEnd = max(0, slice._1.end - region.end).toInt
+
+      val fragmentSequence: String = slice._2.getSequence
+
+      val str = fragmentSequence.drop(trimStart)
+        .dropRight(trimEnd)
+      val reg = new ReferenceRegion(
+        slice._1.referenceName,
+        slice._1.start + trimStart,
+        slice._1.end - trimEnd
+      )
+      (reg, str)
+    }
+
+    def reducePairs(
+      kv1: (ReferenceRegion, String),
+      kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = {
+      assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " +
+        kv1 + ", " + kv2)
+
+      (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) {
+        kv1._2 + kv2._2
+      } else {
+        kv2._2 + kv1._2
+      })
+    }
+
+    try {
+      val refPairRDD: RDD[(ReferenceRegion, String)] = rdd.keyBy(ReferenceRegion(_))
+        .filter(kv => kv._1.isDefined)
+        .map(kv => (kv._1.get, kv._2))
+        .filter(kv => kv._1.overlaps(region))
+        .sortByKey()
+        .map(kv => getString(kv))
+
+      val pair: (ReferenceRegion, String) = refPairRDD.collect.reduceLeft(reducePairs)
+      assert(
+        pair._1.compareTo(region) == 0,
+        "Merging slices returned a different region than requested."
+      )
+
+      pair._2
+    } catch {
+      case (uoe: UnsupportedOperationException) =>
+        throw new UnsupportedOperationException("Could not find " + region + " in reference RDD.")
+    }
+  }
+
+  /**
+   * Extract the specified regions from this genomic dataset of slices as an RDD of (ReferenceRegion,
+   * String) tuples, merging slices if necessary.
+   *
+   * @param regions Zero or more regions to extract.
+   * @return Return the specified regions from this genomic dataset of slices as an RDD of (ReferenceRegion,
+   *   String) tuples, merging slices if necessary.
+   */
+  def extractRegions(regions: Iterable[ReferenceRegion]): RDD[(ReferenceRegion, String)] = {
+    def extractSequence(sliceRegion: ReferenceRegion, slice: Slice, region: ReferenceRegion): (ReferenceRegion, String) = {
+      val merged = sliceRegion.intersection(region)
+      val start = (merged.start - sliceRegion.start).toInt
+      val end = (merged.end - sliceRegion.start).toInt
+      val fragmentSequence: String = slice.getSequence
+      (merged, fragmentSequence.substring(start, end))
+    }
+
+    def reduceRegionSequences(
+      kv1: (ReferenceRegion, String),
+      kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = {
+      (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) {
+        kv1._2 + kv2._2
+      } else {
+        kv2._2 + kv1._2
+      })
+    }
+
+    val places = flattenRddByRegions()
+      .flatMap {
+        case (sliceRegion, slice) =>
+          regions.collect {
+            case region if sliceRegion.overlaps(region) =>
+              (region, extractSequence(sliceRegion, slice, region))
+          }
+      }.sortByKey()
+
+    places.reduceByKey(reduceRegionSequences).values
+  }
+
+  /**
+   * (Java-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
+   * slices now overlap by _n_ bases, where _n_ is the flank length.
+   *
+   * @param flankLength The length to extend adjacent slices by.
+   * @return Returns this genomic dataset, with all adjacent slices extended with flanking sequence.
+   */
+  def flankAdjacent(flankLength: java.lang.Integer): SliceDataset = {
+    val flank: Int = flankLength
+    flankAdjacent(flank)
+  }
+
+  /**
+   * (Scala-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent
+   * slices now overlap by _n_ bases, where _n_ is the flank length.
+   *
+   * @param flankLength The length to extend adjacent slices by.
+   * @return Returns this genomic dataset, with all adjacent slices extended with flanking sequence.
+   */
+  def flankAdjacent(flankLength: Int): SliceDataset = {
+    replaceRdd(FlankSlices(rdd,
+      sequences,
+      flankLength))
+  }
+
+  /**
+   * (Scala-friendly) Counts the k-mers contained in this genomic dataset of slices.
+   *
+   * @param kmerLength The length of k-mers to count.
+   * @return Returns an RDD containing k-mer/count pairs.
+   */
+  def countKmers(kmerLength: Int): RDD[(String, Long)] = {
+    flankAdjacent(kmerLength).rdd.flatMap(r => {
+      // cut each read into k-mers, and attach a count of 1L
+      r.getSequence
+        .sliding(kmerLength)
+        .map(k => (k, 1L))
+    }).reduceByKey((k1: Long, k2: Long) => k1 + k2)
+  }
+
+  /**
+   * (Java-friendly) Counts the k-mers contained in this genomic dataset of slices.
+   *
+   * @param kmerLength The length of k-mers to count.
+   * @return Returns an RDD containing k-mer/count pairs.
+   */
+  def countKmers(
+    kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = {
+    val k: Int = kmerLength
+    countKmers(k).map(p => {
+      (p._1, p._2: java.lang.Long)
+    }).toJavaRDD()
+  }
+
+  /**
+   * @param newRdd The RDD to replace the underlying RDD with.
+   * @return Returns a new SliceDataset with the underlying RDD replaced.
+   */
+  protected def replaceRdd(newRdd: RDD[Slice],
+                           newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): SliceDataset = {
+    new RDDBoundSliceDataset(newRdd, sequences, newPartitionMap)
+  }
+
+  /**
+   * @param slice Slice to extract a region from.
+   * @return Returns a reference region that covers the entirety of the slice.
+   */
+  protected def getReferenceRegions(slice: Slice): Seq[ReferenceRegion] = {
+    Seq(ReferenceRegion(slice.getName, slice.getStart, slice.getEnd, slice.getStrand))
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
index abc3163557..872f59b69c 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
@@ -167,7 +167,8 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging {
    kryo.register(classOf[org.bdgenomics.adam.algorithms.consensus.Consensus])

    // org.bdgenomics.adam.converters
-    kryo.register(classOf[org.bdgenomics.adam.converters.FastaConverter.FastaDescriptionLine])
+    kryo.register(classOf[org.bdgenomics.adam.converters.FastaSequenceConverter.FastaDescriptionLine])
+    kryo.register(classOf[org.bdgenomics.adam.converters.FastaSliceConverter.FastaDescriptionLine])
    kryo.register(classOf[org.bdgenomics.adam.converters.FragmentCollector])

    // org.bdgenomics.adam.models
@@ -203,8 +204,12 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging {
      new org.bdgenomics.adam.rdd.fragment.FragmentArraySerializer)
    kryo.register(classOf[org.bdgenomics.adam.rdd.variant.GenotypeArray],
      new org.bdgenomics.adam.rdd.variant.GenotypeArraySerializer)
-    kryo.register(classOf[org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentArray],
-      new org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentArraySerializer)
+    kryo.register(classOf[org.bdgenomics.adam.rdd.read.ReadArray],
+      new org.bdgenomics.adam.rdd.read.ReadArraySerializer)
+    kryo.register(classOf[org.bdgenomics.adam.rdd.sequence.SequenceArray],
+      new org.bdgenomics.adam.rdd.sequence.SequenceArraySerializer)
+    kryo.register(classOf[org.bdgenomics.adam.rdd.sequence.SliceArray],
+      new org.bdgenomics.adam.rdd.sequence.SliceArraySerializer)
    kryo.register(classOf[org.bdgenomics.adam.rdd.variant.VariantArray],
      new org.bdgenomics.adam.rdd.variant.VariantArraySerializer)
    kryo.register(classOf[org.bdgenomics.adam.rdd.variant.VariantContextArray],
@@ -256,8 +261,6 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging {
      new AvroSerializer[org.bdgenomics.formats.avro.Genotype])
    kryo.register(classOf[org.bdgenomics.formats.avro.GenotypeAllele])
    kryo.register(classOf[org.bdgenomics.formats.avro.GenotypeType])
-    kryo.register(classOf[org.bdgenomics.formats.avro.NucleotideContigFragment],
-      new AvroSerializer[org.bdgenomics.formats.avro.NucleotideContigFragment])
    kryo.register(classOf[org.bdgenomics.formats.avro.OntologyTerm],
      new AvroSerializer[org.bdgenomics.formats.avro.OntologyTerm])
    kryo.register(classOf[org.bdgenomics.formats.avro.ProcessingStep],
@@ -329,7 +332,6 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging {
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Genotype]])
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.GenotypeAllele]])
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.OntologyTerm]])
-    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.NucleotideContigFragment]])
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Read]])
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.ReadGroup]])
    kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Reference]])
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala
index f29905f65a..b424702c20 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala
@@ -26,15 +26,15 @@ import org.bdgenomics.adam.models.{
  SequenceRecord
}
import org.bdgenomics.adam.serialization.AvroSerializer
-import org.bdgenomics.formats.avro.NucleotideContigFragment
+import org.bdgenomics.formats.avro.Slice

/**
 * A broadcastable ReferenceFile backed by a map containing contig name ->
- * Seq[NucleotideContigFragment] pairs.
+ * Seq[Slice] pairs.
 *
- * @param contigMap a map containing a Seq of contig fragments per contig.
+ * @param contigMap a map containing a Seq of slices per contig.
 */
-case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragment]]) extends ReferenceFile {
+case class ReferenceContigMap(contigMap: Map[String, Seq[Slice]]) extends ReferenceFile {

  private def keys(): String = {
    contigMap.keys.toList.sortBy(x => x).mkString(", ")
@@ -64,7 +64,7 @@ case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragmen
          "Contig %s not found in reference map with keys: %s".format(region.referenceName, keys())
        )
      )
-      .dropWhile(f => f.getStart + f.getSequence.length < region.start)
+      .dropWhile(s => s.getStart + s.getSequence.length < region.start)
      .takeWhile(_.getStart < region.end)
      .map(
        clipFragment(_, region.start, region.end)
@@ -72,41 +72,39 @@ case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragmen
      .mkString("")
  }

-  private def clipFragment(fragment: NucleotideContigFragment, start: Long, end: Long): String = {
+  private def clipFragment(slice: Slice, start: Long, end: Long): String = {
    val min = math.max(
      0L,
-      start - fragment.getStart
+      start - slice.getStart
    ).toInt
    val max = math.min(
-      fragment.getSequence.length,
-      end - fragment.getStart
+      slice.getSequence.length,
+      end - slice.getStart
    ).toInt
-    fragment.getSequence.substring(min, max)
+    slice.getSequence.substring(min, max)
  }
}

/**
- * Companion object for creating a ReferenceContigMap from an RDD of contig
- * fragments.
+ * Companion object for creating a ReferenceContigMap from an RDD of slices.
 */
object ReferenceContigMap {

  /**
-   * Builds a ReferenceContigMap from an RDD of fragments.
+   * Builds a ReferenceContigMap from an RDD of slices.
   *
-   * @param fragments RDD of nucleotide contig fragments describing a genome
-   *   reference.
-   * @return Returns a serializable wrapper around these fragments that enables
+   * @param slices RDD of slices describing a genome reference.
+   * @return Returns a serializable wrapper around these slices that enables
   *   random access into the reference genome.
   */
-  def apply(fragments: RDD[NucleotideContigFragment]): ReferenceContigMap = {
+  def apply(slices: RDD[Slice]): ReferenceContigMap = {
    ReferenceContigMap(
-      fragments
-        .groupBy(_.getContigName)
+      slices
+        .groupBy(_.getName)
        .mapValues(_.toSeq.sortBy(_.getStart))
        .collectAsMap
        .toMap
@@ -115,30 +113,30 @@ object ReferenceContigMap {
}

class ReferenceContigMapSerializer extends Serializer[ReferenceContigMap] {
-  private val ncfSerializer = new AvroSerializer[NucleotideContigFragment]
+  private val sliceSerializer = new AvroSerializer[Slice]

  def write(kryo: Kryo, out: Output, record: ReferenceContigMap) = {
    out.writeInt(record.contigMap.size)
    record.contigMap.foreach(p => {
      out.writeString(p._1)
      out.writeInt(p._2.size)
-      p._2.foreach(ncf => {
-        ncfSerializer.write(kryo, out, ncf)
+      p._2.foreach(slice => {
+        sliceSerializer.write(kryo, out, slice)
      })
    })
  }

  def read(kryo: Kryo, in: Input, clazz: Class[ReferenceContigMap]): ReferenceContigMap = {
    val n = in.readInt()
-    val array = new Array[(String, Seq[NucleotideContigFragment])](n)
+    val array = new Array[(String, Seq[Slice])](n)
    (0 until n).foreach(idx => {
      val key = in.readString()
-      val numNcfs = in.readInt()
-      val ncfArray = new Array[NucleotideContigFragment](numNcfs)
-      (0 until numNcfs).foreach(jdx => {
-        ncfArray(jdx) = ncfSerializer.read(kryo, in, classOf[NucleotideContigFragment])
+      val numSlices = in.readInt()
+      val sliceArray = new Array[Slice](numSlices)
+      (0 until numSlices).foreach(jdx => {
+        sliceArray(jdx) = sliceSerializer.read(kryo, in, classOf[Slice])
      })
-      array(idx) = (key, ncfArray.toSeq)
+      array(idx) = (key, sliceArray.toSeq)
    })
    ReferenceContigMap(array.toMap)
  }
diff --git a/adam-core/src/test/resources/trinity.fa b/adam-core/src/test/resources/trinity.fa
new file mode 100644
index 0000000000..366819d4a4
--- /dev/null
+++ b/adam-core/src/test/resources/trinity.fa
@@ -0,0 +1,43 @@
+>000872-000883_All_comp1777_c1_seq1 len=375 ~FPKM=14.3 path=[0:0-239 240:240-374]
+CACTGCACCACCAGGGAAGCCCCAGGTGAATTTCTTACTTCCTTAAGTGCAGGACCTTGT
+TTCAGACCTCCCTGCCTTCCTATATGCTGCCTTCTGCCTGGAAACCCCTCCCTCCTTTCT
+CCTCTGCACCAACTCCTATCCACCGTTTGAAACTTGCTTCATGTCTCCTTTTATAGGAGG
+ACTTCTCTGATTCCCCAATTTGTTTTTTTTCCACTGATCTGTTTTGTTATTTTAATTGAA
+TGAATGATTCTTTAAATTCTATAGTGCTTTACAATTTTCAAAAGTTTCACACACATGATC
+TCATATAATCCCATAGCAACCCTTTTTCTTCCTCCTACCCCTATATTGCCCCTCTCCCCA
+CTGGTAGCCACTAGG
+>000872-000883_All_comp1777_c1_seq2 len=344 ~FPKM=5.9 path=[1135:0-208 240:209-343]
+TGTCTTACAAACATATGCCGGTGCCTGGAAAAAAAGAATTTCAAAAGTAAAAAATTAAGG
+TCATTCCCATCTCAAACATACCTAAAATACATAATGATAAGTAACTTAGCACAGGGAATA
+AAGTTGTTACTCAATAAATATTTACTTAATTAGACATGGCAGGAATACAGATATTTGTCC
+TGAAGACTTTTTGTAGTTTATTTTTTTATTGAATGATTCTTTAAATTCTATAGTGCTTTA
+CAATTTTCAAAAGTTTCACACACATGATCTCATATAATCCCATAGCAACCCTTTTTCTTC
+CTCCTACCCCTATATTGCCCCTCTCCCCACTGGTAGCCACTAGG
+>000872-000883_All_comp1777_c1_seq3 len=265 ~FPKM=14.1 path=[240:0-134 375:135-143 2488:144-150 2495:151-158 2503:159-167 408:168-264]
+TGAATGATTCTTTAAATTCTATAGTGCTTTACAATTTTCAAAAGTTTCACACACATGATC
+TCATATAATCCCATAGCAACCCTTTTTCTTCCTCCTACCCCTATATTGCCCCTCTCCCCA
+CTGGTAGCCACTAGGTTGTTCTCTATATCTGTGAGTCTGCTTCTTTTTGTTTTATTCACC
+AGTTTGTTGTAATTTTTACATTCCACATACAAGTGATAACATATAGTATGTCTTTGACAT
+TTCACTTAGTATAATGCCCTCCAAT
+>000872-000883_All_comp1777_c3_seq1 len=221 ~FPKM=1.1 path=[1966:0-220]
+TATACGCTGATAGGTCACAACTTGCTTTTTAAAAAAAAGTTTTTTTAATCATTGAACAAA
+GCTATCTTTAAATTTTATAGTACTTTACAGTTTTCACAAGTTTCACACACGATTTCATAT
+AATCCTATAATAACCCTTTTTTATCCCCTACTCCTATATTGCCCTTCCCCCTTCCCTGTC
+CCCAATGATAACTACTAGTTCTATCTGTGAGTCTGCTTTTA
+>000872-000883_All_comp1852_c0_seq1 len=902 ~FPKM=5.6 path=[0:0-901]
+ACTGCGCCACCAGGGAAGCCCCAGCATACCTATTTTTGGTACTGATAAGCTGCTGCCATC
+CCAACCAGTGATGTAGCCAATCTTCTCAAATTTTTGGCCCATAGGAGTGAATAGATTCTT
+GATCCTTTGCTTCCCACAATGCATAGAAGGGACAGCCAAATTAGTCCCTCACTGCTGGAC
+AACGCTGATGGAACTAAAAATGGGTTTGTAGCCCAAGAAAATGAACCCCAGGGCTGTCCA
+GAGTACTTTCCTGATACCTGGTGAGAGATTAAGATGCCAGTCACGTATATGCCCCGCAGG
+TTCAGTTCTGGGAAGGCCTTCAACATACCATTCTGTGATTTTCCCGAATAAAATGACAGA
+CTTCGGTTATGCCTAAGGCTGATTCCTAGATTTTTTTTTTTTCCTGTGTTCACAGCATCT
+GGTACATTTTATTTTGTCTAACCAAAATAAAATGTTTATTAGCTTCATTTATGAAAACAA
+ACTCACTTAACATTTTAATTGGAAACAGTTCATATGGTCTGTTCAGTGGAAGCTCAGATA
+GGCAAAGTGCAATAAGCAGAATGAAATATCACACATATGTCTCTGCATAAATCAACATGA
+AAAAGTATTTAAGATGTTTCAAATGGAAAAAGCAAGCTACAAAATAATAAGTACAGTGAG
+AGTTCTCTGTTAAACATTTTTTCCTAAACTATTTATGTAGTATTATTATTTTTGCAAAAG
+TTATTGCTTTTGTAATACAAAACAGCAAAATTAACAAATACAACAAATACAATGTTTAAT
+GAGAAACCTGTGGTATACAGTGTAATTGTCTCTGGACAAAAGTCTTCTTTACAAGTCAAG
+GAACAAAACTATTTCTTTTAATTAAATGTAGGTTTTAGAAAAACATTCATTACACATTAC
+TA
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala
deleted file mode 100644
index aba853accf..0000000000
--- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala
+++ /dev/null
@@ -1,221 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.bdgenomics.adam.converters
-
-import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.util.ADAMFunSuite
-import java.io.File
-
-class FastaConverterSuite extends ADAMFunSuite {
-
-  val converter = new FastaConverter(1000)
-
-  sparkTest("find contig index") {
-    val headerLines = sc.parallelize(Seq(
-      (0L, ">1 dna:chromosome chromosome:GRCh37:1:1:249250621:1"),
-      (252366306L, ">2 dna:chromosome chromosome:GRCh37:2:1:243199373:1"),
-      (699103487L, ">4 dna:chromosome chromosome:GRCh37:4:1:191154276:1"),
-      (892647244L, ">5 dna:chromosome chromosome:GRCh37:5:1:180915260:1"),
-      (498605724L, ">3 dna:chromosome chromosome:GRCh37:3:1:198022430:1")))
-    val descLines = FastaConverter.getDescriptionLines(headerLines)
-    val headerIndices: List[Long] = descLines.keys.toList
-
-    assert(0 === FastaConverter.findReferenceIndex(252366300L, headerIndices))
-    assert(892647244L === FastaConverter.findReferenceIndex(892647249L, headerIndices))
-    assert(252366306L === FastaConverter.findReferenceIndex(498605720L, headerIndices))
-  }
-
-  test("convert a single record without naming information") {
-    val contig = converter.convert(None, 0, Seq("AAATTTGCGC"), None)
-
-    assert(contig.head.getSequence.map(_.toString).reduce(_ + _) === "AAATTTGCGC")
-    assert(contig.head.getContigLength === 10)
-    assert(contig.head.getContigName === null)
-    assert(contig.head.getDescription === null)
-  }
-
-  test("convert a single record with naming information") {
-    val contig = converter.convert(Some("chr2"), 1, Seq("NNNN"), Some("hg19"))
-
-    assert(contig.head.getSequence.map(_.toString).reduce(_ + _) === "NNNN")
-    assert(contig.head.getContigLength === 4)
-    assert(contig.head.getContigName === "chr2")
-    assert(contig.head.getDescription === "hg19")
-  }
-
-  sparkTest("convert single fasta sequence") {
-    val fasta = List((0L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"),
-      (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"))
-    val rdd = sc.parallelize(fasta.toSeq)
-
-    val adamFasta = FastaConverter(rdd)
-    assert(adamFasta.count === 1)
-
-    val fastaElement = adamFasta.first()
-    val fastaSequence = fasta.map(_._2).reduce(_ + _)
-    val convertedSequence = fastaElement.getSequence.map(_.toString).reduce(_ + _)
-
-    assert(convertedSequence === fastaSequence)
-    assert(fastaElement.getContigLength() == fastaSequence.length)
-    assert(fastaElement.getContigName === null)
-    assert(fastaElement.getDescription === null)
-  }
-
-  sparkTest("convert fasta with multiple sequences") {
-    val fasta1 = List((0L, ">chr1"),
-      (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"),
-      (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
-      (16L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"))
-    val fasta2 = List((17L, ">chr2"),
-      (18L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTCCCCCCCCCCTTTTTTTTTTCCCCCC"),
-      (19L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (20L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (21L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (22L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (23L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (24L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (25L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (26L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (27L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (28L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (29L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (30L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (31L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (32L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"),
-      (33L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"))
-    val fasta = fasta1 ::: fasta2
-    val rdd = sc.parallelize(fasta.toSeq)
-
-    val adamFasta = FastaConverter(rdd)
-    assert(adamFasta.count === 2)
-
-    val fastaElement1 = adamFasta.filter(_.getContigName == "chr1").first()
-    val fastaSequence1 = fasta1.drop(1).map(_._2).reduce(_ + _)
-    val convertedSequence1 = fastaElement1.getSequence.map(_.toString).reduce(_ + _)
-
-    assert(convertedSequence1 === fastaSequence1)
-    assert(fastaElement1.getContigLength() == fastaSequence1.length)
-    assert(fastaElement1.getContigName().toString === "chr1")
-    assert(fastaElement1.getDescription === null)
-
"chr2").first() - val fastaSequence2 = fasta2.drop(1).map(_._2).reduce(_ + _) - val convertedSequence2 = fastaElement2.getSequence.map(_.toString).reduce(_ + _) - - assert(convertedSequence2 === fastaSequence2) - assert(fastaElement2.getContigLength() == fastaSequence2.length) - assert(fastaElement2.getContigName().toString === "chr2") - assert(fastaElement2.getDescription === null) - } - - sparkTest("convert fasta with multiple sequences; short fragment") { - val fasta1 = List((0L, ">chr1"), - (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"), - (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (16L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")) - val fasta2 = List((17L, ">chr2"), - (18L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTCCCCCCCCCCTTTTTTTTTTCCCCCC"), - (19L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (20L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (21L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (22L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (23L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (24L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (25L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (26L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (27L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (28L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (29L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (30L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (31L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (32L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (33L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")) - val fasta = fasta1 ::: fasta2 - val rdd = sc.parallelize(fasta.toSeq) - - val adamFasta = FastaConverter(rdd, maximumLength = 35) - assert(adamFasta.count === 64) - - val fastaElement1 = adamFasta.filter(_.getContigName == "chr1").collect() - val fastaSequence1 = fasta1.drop(1).map(_._2).mkString - val seqs = fastaElement1.sortBy(_.getIndex) - val convertedSequence1 = 
-    val convertedSequence1 = fastaElement1.sortBy(_.getIndex).map(_.getSequence.toString).mkString
-    assert(seqs != null)
-    assert(convertedSequence1 === fastaSequence1)
-
-    val fastaElement2 = adamFasta.filter(_.getContigName == "chr2").collect()
-    val fastaSequence2 = fasta2.drop(1).map(_._2).mkString
-    val convertedSequence2 = fastaElement2.sortBy(_.getIndex).map(_.getSequence.toString).mkString
-
-    assert(convertedSequence2 === fastaSequence2)
-  }
-
-  val chr1File = testFile("human_g1k_v37_chr1_59kb.fasta")
-
-  sparkTest("convert reference fasta file") {
-    //Loading "human_g1k_v37_chr1_59kb.fasta"
-    val referenceSequences = sc.loadContigFragments(chr1File, maximumLength = 10).rdd.collect()
-    assert(referenceSequences.forall(_.getContigName.toString == "1"))
-    assert(referenceSequences.slice(0, referenceSequences.length - 2).forall(_.getSequence.length == 10))
-
-    val reassembledSequence = referenceSequences.sortBy(_.getIndex).map(_.getSequence).mkString
-    val originalSequence = scala.io.Source.fromFile(new File(chr1File)).getLines().filter(!_.startsWith(">")).mkString
-
-    assert(reassembledSequence === originalSequence)
-  }
-}
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala
index 7e85087dda..c3afc8bba2 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala
@@ -24,8 +24,8 @@ import org.bdgenomics.formats.avro._

class FragmentConverterSuite extends ADAMFunSuite {
  test("build a fragment collector and convert to a read") {
-    val fcOpt = FragmentCollector(NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+    val fcOpt = FragmentCollector(Slice.newBuilder()
+      .setName("ctg")
      .setSequence("ACACACAC")
      .setStart(0L)
      .setEnd(8L)
@@ -50,18 +50,18 @@ class FragmentConverterSuite extends ADAMFunSuite {
  }

  test("if a fragment isn't associated with a contig, don't get a fragment collector") {
-    val fcOpt = FragmentCollector(NucleotideContigFragment.newBuilder().build())
+    val fcOpt = FragmentCollector(Slice.newBuilder().build())

    assert(fcOpt.isEmpty)
  }

  sparkTest("convert an rdd of discontinuous fragments, all from the same contig") {
-    val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+    val rdd = sc.parallelize(Seq(Slice.newBuilder()
+      .setName("ctg")
      .setSequence("ACACACAC")
      .setStart(0L)
      .setEnd(8L)
-      .build(), NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+      .build(), Slice.newBuilder()
+      .setName("ctg")
      .setSequence("AATTCCGGCCTTAA")
      .setStart(14L)
      .setEnd(28L)
@@ -85,18 +85,18 @@ class FragmentConverterSuite extends ADAMFunSuite {
  }

  sparkTest("convert an rdd of contiguous fragments, all from the same contig") {
-    val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+    val rdd = sc.parallelize(Seq(Slice.newBuilder()
+      .setName("ctg")
      .setSequence("ACACACAC")
      .setStart(0L)
      .setEnd(8L)
-      .build(), NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+      .build(), Slice.newBuilder()
+      .setName("ctg")
      .setSequence("TGTGTG")
      .setStart(8L)
      .setEnd(14L)
-      .build(), NucleotideContigFragment.newBuilder()
-      .setContigName("ctg")
+      .build(), Slice.newBuilder()
+      .setName("ctg")
      .setSequence("AATTCCGGCCTTAA")
      .setStart(14L)
      .setEnd(28L)
@@ -114,33 +114,33 @@ class FragmentConverterSuite extends ADAMFunSuite {
  }

multiple contigs") { - val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + val rdd = sc.parallelize(Seq(Slice.newBuilder() + .setName("ctg1") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + .build(), Slice.newBuilder() + .setName("ctg1") .setSequence("TGTGTG") .setStart(8L) .setEnd(14L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + .build(), Slice.newBuilder() + .setName("ctg1") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg2") + .build(), Slice.newBuilder() + .setName("ctg2") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg2") + .build(), Slice.newBuilder() + .setName("ctg2") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg3") + .build(), Slice.newBuilder() + .setName("ctg3") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index 160b153de7..c3da738862 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -403,32 +403,36 @@ class ADAMContextSuite extends ADAMFunSuite { sparkTest("read a HLA fasta from GRCh38") { val inputPath = testFile("HLA_DQB1_05_01_01_02.fa") - val gDataset = sc.loadFasta(inputPath, 10000L) - assert(gDataset.sequences.records.size === 1) - assert(gDataset.sequences.records.head.name === "HLA-DQB1*05:01:01:02") - val fragments = gDataset.rdd.collect - assert(fragments.size === 1) - assert(fragments.head.getContigName === "HLA-DQB1*05:01:01:02") + val gRdd = sc.loadFastaDna(inputPath) + + // see https://github.com/bigdatagenomics/adam/issues/1894 + val withSequenceDictionary = gRdd.createSequenceDictionary() + assert(withSequenceDictionary.sequences.records.size === 1) + assert(withSequenceDictionary.sequences.records.head.name === "HLA-DQB1*05:01:01:02") + + val sequences = gRdd.rdd.collect + assert(sequences.size === 1) + assert(sequences.head.getName === "HLA-DQB1*05:01:01:02") } sparkTest("read a gzipped fasta file") { val inputPath = testFile("chr20.250k.fa.gz") - val contigFragments = sc.loadFasta(inputPath, 10000L) - .transform((rdd: RDD[NucleotideContigFragment]) => { - rdd.sortBy(_.getIndex.toInt) - }) - assert(contigFragments.rdd.count() === 26) - val first: NucleotideContigFragment = contigFragments.rdd.first() - assert(first.getContigName === null) + val slices = sc.loadFastaDna(inputPath, 10000L) + .rdd + .sortBy(_.getIndex.toInt) + + assert(slices.count() === 26) + val first = slices.first() + assert(first.getName === null) assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") assert(first.getIndex === 0) assert(first.getSequence.length === 10000) assert(first.getStart === 0L) assert(first.getEnd === 10000L) - assert(first.getFragments === 26) + assert(first.getSlices === 26) // 250k file actually has 251930 bases - val last: NucleotideContigFragment = contigFragments.rdd.collect().last + val last = slices.collect().last assert(last.getIndex === 25) assert(last.getStart === 250000L) assert(last.getEnd === 251930L) @@ -992,4 
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
index 160b153de7..c3da738862 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala
@@ -403,32 +403,36 @@ class ADAMContextSuite extends ADAMFunSuite {
 
   sparkTest("read a HLA fasta from GRCh38") {
     val inputPath = testFile("HLA_DQB1_05_01_01_02.fa")
-    val gDataset = sc.loadFasta(inputPath, 10000L)
-    assert(gDataset.sequences.records.size === 1)
-    assert(gDataset.sequences.records.head.name === "HLA-DQB1*05:01:01:02")
-    val fragments = gDataset.rdd.collect
-    assert(fragments.size === 1)
-    assert(fragments.head.getContigName === "HLA-DQB1*05:01:01:02")
+    val gRdd = sc.loadFastaDna(inputPath)
+
+    // see https://github.com/bigdatagenomics/adam/issues/1894
+    val withSequenceDictionary = gRdd.createSequenceDictionary()
+    assert(withSequenceDictionary.sequences.records.size === 1)
+    assert(withSequenceDictionary.sequences.records.head.name === "HLA-DQB1*05:01:01:02")
+
+    val sequences = gRdd.rdd.collect
+    assert(sequences.size === 1)
+    assert(sequences.head.getName === "HLA-DQB1*05:01:01:02")
   }
 
   sparkTest("read a gzipped fasta file") {
     val inputPath = testFile("chr20.250k.fa.gz")
-    val contigFragments = sc.loadFasta(inputPath, 10000L)
-      .transform((rdd: RDD[NucleotideContigFragment]) => {
-        rdd.sortBy(_.getIndex.toInt)
-      })
-    assert(contigFragments.rdd.count() === 26)
-    val first: NucleotideContigFragment = contigFragments.rdd.first()
-    assert(first.getContigName === null)
+    val slices = sc.loadFastaDna(inputPath, 10000L)
+      .rdd
+      .sortBy(_.getIndex.toInt)
+
+    assert(slices.count() === 26)
+    val first = slices.first()
+    assert(first.getName === null)
     assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
     assert(first.getIndex === 0)
     assert(first.getSequence.length === 10000)
     assert(first.getStart === 0L)
     assert(first.getEnd === 10000L)
-    assert(first.getFragments === 26)
+    assert(first.getSlices === 26)
 
     // 250k file actually has 251930 bases
-    val last: NucleotideContigFragment = contigFragments.rdd.collect().last
+    val last = slices.collect().last
     assert(last.getIndex === 25)
     assert(last.getStart === 250000L)
     assert(last.getEnd === 251930L)
@@ -992,4 +996,60 @@ class ADAMContextSuite extends ADAMFunSuite {
     assert(reloaded.headerLines.toSet == variants.headerLines.toSet)
     assert(reloaded.rdd.collect().deep == variants.rdd.collect().deep)
   }
+
+  sparkTest("read a fasta file with short sequences as sequences") {
+    val inputPath = testFile("trinity.fa")
+    val sequences = sc.loadFastaDna(inputPath)
+    assert(sequences.rdd.count === 5)
+  }
+
+  sparkTest("read a fasta file with long sequences as sequences") {
+    val inputPath = testFile("chr20.250k.fa.gz")
+    val sequences = sc.loadFastaDna(inputPath)
+    assert(sequences.rdd.count === 1)
+    val sequence = sequences.rdd.first()
+    assert(sequence.getName === null)
+    assert(sequence.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
+    assert(sequence.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA)
+    assert(sequence.getSequence.length === 251930)
+    assert(sequence.getLength === 251930L)
+  }
+
+  sparkTest("read a fasta file with short sequences as slices") {
+    val inputPath = testFile("trinity.fa")
+    val slices = sc.loadFastaDna(inputPath, 10000L)
+    assert(slices.rdd.count === 5)
+  }
+
+  sparkTest("read a fasta file with long sequences as slices") {
+    val inputPath = testFile("chr20.250k.fa.gz")
+    val slices = sc.loadFastaDna(inputPath, 10000L)
+      .transform(rdd => rdd.sortBy(_.getIndex.toInt))
+    assert(slices.rdd.count() === 26)
+
+    val first = slices.rdd.first()
+    assert(first.getName === null)
+    assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
+    assert(first.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA)
+    assert(first.getSequence.length === 10000)
+    assert(first.getLength === 10000L)
+    assert(first.getStart === 0L)
+    assert(first.getEnd === 10000L)
+    assert(first.getIndex === 0)
+    assert(first.getSlices === 26)
+    assert(first.getTotalLength === 251930L)
+
+    // 250k file actually has 251930 bases
+    val last = slices.rdd.collect().last
+    assert(last.getName === null)
+    assert(last.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly")
+    assert(last.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA)
+    assert(last.getSequence.length === 1930)
+    assert(last.getLength === 1930L)
+    assert(last.getStart === 250000L)
+    assert(last.getEnd === 251930L)
+    assert(last.getIndex === 25)
+    assert(last.getSlices === 26)
+    assert(last.getTotalLength === 251930L)
+  }
 }
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala
deleted file mode 100644
index 3e7047055a..0000000000
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala
+++ /dev/null
@@ -1,922 +0,0 @@
-/**
- * Licensed to Big Data Genomics (BDG) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The BDG licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.contig - -import java.io.File -import java.lang.{ Long => JLong } - -import com.google.common.io.Files -import org.apache.parquet.filter2.predicate.Operators.{ BinaryColumn, LongColumn } -import org.apache.parquet.filter2.predicate.{ FilterApi, FilterPredicate } -import org.apache.parquet.io.api.Binary -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ Dataset, SQLContext } -import org.bdgenomics.adam.models._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } -import org.bdgenomics.adam.rdd.fragment.FragmentDataset -import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset -import org.bdgenomics.adam.rdd.variant.{ - GenotypeDataset, - VariantDataset, - VariantContextDataset -} -import org.bdgenomics.adam.sql.{ - AlignmentRecord => AlignmentRecordProduct, - Feature => FeatureProduct, - Fragment => FragmentProduct, - Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, - Variant => VariantProduct, - VariantContext => VariantContextProduct -} -import org.bdgenomics.adam.util.ADAMFunSuite -import org.bdgenomics.formats.avro._ -import scala.collection.mutable.ListBuffer - -object NucleotideContigFragmentDatasetSuite extends Serializable { - - def covFn(ncf: NucleotideContigFragment): Coverage = { - Coverage(ncf.getContigName, - ncf.getStart, - ncf.getEnd, - 1) - } - - def featFn(ncf: NucleotideContigFragment): Feature = { - Feature.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def fragFn(ncf: NucleotideContigFragment): Fragment = { - Fragment.newBuilder - .setName(ncf.getContigName) - .build - } - - def genFn(ncf: NucleotideContigFragment): Genotype = { - Genotype.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def readFn(ncf: NucleotideContigFragment): AlignmentRecord = { - AlignmentRecord.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def varFn(ncf: NucleotideContigFragment): Variant = { - Variant.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def vcFn(ncf: NucleotideContigFragment): VariantContext = { - VariantContext(Variant.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build) - } -} - -class NucleotideContigFragmentDatasetSuite extends ADAMFunSuite { - - sparkTest("union two ncf genomic datasets together") { - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 10000L) - val fragments2 = sc.loadFasta(testFile("artificial.fa")) - val union = fragments1.union(fragments2) - assert(union.rdd.count === (fragments1.rdd.count + fragments2.rdd.count)) - assert(union.sequences.size === 2) - } - - sparkTest("round trip a ncf to parquet") { - def testMetadata(fRdd: NucleotideContigFragmentDataset) { - val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) - 
assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - } - - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - assert(fragments1.rdd.count === 8L) - assert(fragments1.dataset.count === 8L) - testMetadata(fragments1) - - // save using dataset path - val output1 = tmpFile("ctg.adam") - val dsBound = fragments1.transformDataset(ds => ds) - testMetadata(dsBound) - dsBound.saveAsParquet(output1) - val fragments2 = sc.loadContigFragments(output1) - testMetadata(fragments2) - assert(fragments2.rdd.count === 8L) - assert(fragments2.dataset.count === 8L) - - // save using rdd path - val output2 = tmpFile("ctg.adam") - val rddBound = fragments2.transform((rdd: RDD[NucleotideContigFragment]) => rdd) - testMetadata(rddBound) - rddBound.saveAsParquet(output2) - val fragments3 = sc.loadContigFragments(output2) - assert(fragments3.rdd.count === 8L) - assert(fragments3.dataset.count === 8L) - } - - sparkTest("round trip a ncf to partitioned parquet") { - def testMetadata(fRdd: NucleotideContigFragmentDataset) { - val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) - assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - } - - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - assert(fragments1.rdd.count === 8L) - assert(fragments1.dataset.count === 8L) - testMetadata(fragments1) - - // save using dataset path - val output1 = tmpFile("ctg.adam") - val dsBound = fragments1.transformDataset(ds => ds) - testMetadata(dsBound) - dsBound.saveAsPartitionedParquet(output1) - val fragments2 = sc.loadPartitionedParquetContigFragments(output1) - testMetadata(fragments2) - assert(fragments2.rdd.count === 8L) - assert(fragments2.dataset.count === 8L) - - // save using rdd path - val output2 = tmpFile("ctg.adam") - val rddBound = fragments2.transform((rdd: RDD[NucleotideContigFragment]) => rdd) - testMetadata(rddBound) - rddBound.saveAsPartitionedParquet(output2) - val fragments3 = sc.loadPartitionedParquetContigFragments(output2) - assert(fragments3.rdd.count === 8L) - assert(fragments3.dataset.count === 8L) - } - - sparkTest("save fasta back as a single file") { - val origFasta = testFile("artificial.fa") - val tmpFasta = tmpFile("test.fa") - sc.loadFasta(origFasta) - .saveAsFasta(tmpFasta, asSingleFile = true, lineWidth = 70) - checkFiles(origFasta, tmpFasta) - } - - sparkTest("generate sequence dict from fasta") { - - val ctg0 = NucleotideContigFragment.newBuilder() - .setContigName("chr0") - .setContigLength(1000L) - .build() - val ctg1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(900L) - .build() - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(ctg0, ctg1))) - - assert(rdd.sequences.containsReferenceName("chr0")) - val chr0 = rdd.sequences("chr0").get - assert(chr0.length === 1000L) - assert(rdd.sequences.containsReferenceName("chr1")) - val chr1 = rdd.sequences("chr1").get - assert(chr1.length === 900L) - } - - sparkTest("recover reference string from a single contig fragment") { - - val sequence = "ACTGTAC" - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val region = ReferenceRegion(fragment).get - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - assert(rdd.extract(region) === "ACTGTAC") - } - - sparkTest("recover trimmed reference string from a single contig fragment") { - - val 
sequence = "ACTGTAC" - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val region = new ReferenceRegion("chr1", 1L, 6L) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - assert(rdd.extract(region) === "CTGTA") - } - - sparkTest("recover reference string from multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(5L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(12L) - .setFragments(2) - .build() - val region0 = ReferenceRegion(fragment0).get - val region1 = ReferenceRegion(fragment1).get.merge(ReferenceRegion(fragment2).get) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - assert(rdd.extract(region0) === "ACTGTAC") - assert(rdd.extract(region1) === "GTACTCTCATG") - } - - sparkTest("extract sequences based on the list of reference regions") { - val test = "test" - - def dnas2fragments(dnas: Seq[String]): List[NucleotideContigFragment] = { - val (_, frags) = dnas.foldLeft((0L, List.empty[NucleotideContigFragment])) { - case ((start, acc), str) => - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("test") - .setStart(start) - .setLength(str.length: Long) - .setSequence(str) - .setEnd(start + str.length) - .build() - (start + str.length, fragment :: acc) - } - frags.reverse - } - - val dnas: Seq[String] = Vector( - "ACAGCTGATCTCCAGATATGACCATGGGTT", - "CAGCTGATCTCCAGATATGACCATGGGTTT", - "CCAGAAGTTTGAGCCACAAACCCATGGTCA" - ) - - val merged = dnas.reduce(_ + _) - - val record = SequenceRecord("test", merged.length) - - val dic = new SequenceDictionary(Vector(record)) - val frags = sc.parallelize(dnas2fragments(dnas)) - val fragments = NucleotideContigFragmentDataset(frags, dic) - - val byRegion = fragments.rdd.keyBy(ReferenceRegion(_)) - - val regions = List( - new ReferenceRegion(test, 0, 5), - new ReferenceRegion(test, 25, 35), - new ReferenceRegion(test, 40, 50), - new ReferenceRegion(test, 50, 70) - ) - - val results: Set[(ReferenceRegion, String)] = fragments.extractRegions(regions).collect().toSet - val seqs = regions.zip(List("ACAGC", "GGGTTCAGCT", "CCAGATATGA", "CCATGGGTTTCCAGAAGTTT")).toSet - assert(seqs === results) - } - - sparkTest("recover trimmed reference string from multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - 
.setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(5L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(11L) - .setFragments(2) - .build() - val region0 = new ReferenceRegion("chr1", 1L, 6L) - val region1 = new ReferenceRegion("chr2", 3L, 9L) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - assert(rdd.extract(region0) === "CTGTA") - assert(rdd.extract(region1) === "CTCTCA") - } - - sparkTest("testing nondeterminism from reduce when recovering referencestring") { - - var fragments: ListBuffer[NucleotideContigFragment] = new ListBuffer[NucleotideContigFragment]() - for (a <- 0L to 1000L) { - val seq = "A" - val frag = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(1000L) - .setStart(a) - .setEnd(a + 1L) - .setSequence(seq) - .build() - fragments += frag - } - var passed = true - val rdd = NucleotideContigFragmentDataset(sc.parallelize(fragments.toList)) - try { - val result = rdd.extract(new ReferenceRegion("chr1", 0L, 1000L)) - } catch { - case e: AssertionError => passed = false - } - assert(passed == true) - } - - sparkTest("save single contig fragment as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with description as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setDescription("description") - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1 description") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null fields as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + 
"/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null fragment number as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null number of fragments in contig as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - def validate(fileName: String) { - val fastaLines = scala.io.Source.fromFile(new File(fileName + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - val outputFastaFile = tmpFile("test.fa") - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - validate(outputFastaFile) - - val outputFastaFile2 = tmpFile("test2.fa") - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile2) - validate(outputFastaFile2) - } - - sparkTest("save multiple contig fragments from same contig as FASTA text file") { - - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(3) - .build - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("GCATATC") - .setIndex(1) - .setFragments(3) - .build - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("CTGATCG") - .setIndex(2) - .setFragments(3) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 6) - assert(fastaLines(0) === ">chr1 fragment 1 of 3") - assert(fastaLines(1) === "ACTGTAC") - assert(fastaLines(2) === ">chr1 fragment 2 of 3") - assert(fastaLines(3) === "GCATATC") - assert(fastaLines(4) === ">chr1 fragment 3 of 3") - assert(fastaLines(5) === "CTGATCG") - } - - sparkTest("save multiple contig fragments with description from same contig as FASTA text file") { - - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("ACTGTAC") 
- .setIndex(0) - .setFragments(3) - .build - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("GCATATC") - .setIndex(1) - .setFragments(3) - .build - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("CTGATCG") - .setIndex(2) - .setFragments(3) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 6) - assert(fastaLines(0) === ">chr1 description fragment 1 of 3") - assert(fastaLines(1) === "ACTGTAC") - assert(fastaLines(2) === ">chr1 description fragment 2 of 3") - assert(fastaLines(3) === "GCATATC") - assert(fastaLines(4) === ">chr1 description fragment 3 of 3") - assert(fastaLines(5) === "CTGATCG") - } - - sparkTest("merge single contig fragment null fragment number") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 1L) - assert(merged.rdd.first.getSequence() === "ACTGTAC") - } - - sparkTest("merge single contig fragment number zero") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 1L) - assert(merged.rdd.first.getSequence() === "ACTGTAC") - } - - sparkTest("merge multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(sequence0.length - 1L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(sequence1.length - 1L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(sequence2.length - 1L) - .setFragments(2) - .build() - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment2, - fragment1, - fragment0))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 2L) - - val collect = merged.rdd.collect - assert(collect(0).getSequence() === "ACTGTAC") - assert(collect(1).getSequence() === "GTACTCTCATG") - } - - sparkTest("save as parquet and apply predicate pushdown") { - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - 
assert(fragments1.rdd.count === 8) - val output = tmpFile("contigs.adam") - fragments1.saveAsParquet(output) - val fragments2 = sc.loadContigFragments(output) - assert(fragments2.rdd.count === 8) - val fragments3 = sc.loadContigFragments(output, - optPredicate = Some( - // ReferenceRegion.toPredicate uses referenceName instead of contigName - FilterApi.and( - FilterApi.and( - FilterApi.eq[Binary, BinaryColumn]( - FilterApi.binaryColumn("contigName"), - Binary.fromString("HLA-DQB1*05:01:01:02")), - FilterApi.gt[JLong, LongColumn](FilterApi.longColumn("end"), 500L)), - FilterApi.ltEq[JLong, LongColumn](FilterApi.longColumn("start"), 1500L)) - ) - ) - assert(fragments3.rdd.count === 2) - } - - sparkTest("load fasta sequences from GFF3 file") { - val sequences = sc.loadFasta(testFile("ctg123.fasta.gff3")) - assert(sequences.rdd.count() === 4) - } - - sparkTest("transform contigs to coverage genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(coverage: CoverageDataset) { - val tempPath = tmpLocation(".bed") - coverage.save(tempPath, false, false) - - assert(sc.loadCoverage(tempPath).rdd.count === 8) - } - - val coverage = contigs.transmute[Coverage, Coverage, CoverageDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.covFn) - }) - - checkSave(coverage) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val coverageDs: CoverageDataset = contigs.transmuteDataset[Coverage, Coverage, CoverageDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => NucleotideContigFragmentDatasetSuite.covFn(r.toAvro)) - }) - - checkSave(coverageDs) - } - - sparkTest("transform contigs to feature genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(features: FeatureDataset) { - val tempPath = tmpLocation(".bed") - features.saveAsBed(tempPath) - - assert(sc.loadFeatures(tempPath).rdd.count === 8) - } - - val features: FeatureDataset = contigs.transmute[Feature, FeatureProduct, FeatureDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.featFn) - }) - - checkSave(features) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val featuresDs: FeatureDataset = contigs.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - FeatureProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.featFn(r.toAvro)) - }) - }) - - checkSave(featuresDs) - } - - sparkTest("transform contigs to fragment genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(fragments: FragmentDataset) { - val tempPath = tmpLocation(".adam") - fragments.saveAsParquet(tempPath) - - assert(sc.loadFragments(tempPath).rdd.count === 8) - } - - val fragments: FragmentDataset = contigs.transmute[Fragment, FragmentProduct, FragmentDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.fragFn) - }) - - checkSave(fragments) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val fragmentsDs: FragmentDataset = contigs.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - FragmentProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.fragFn(r.toAvro)) - }) - }) - - checkSave(fragmentsDs) 
- } - - sparkTest("transform contigs to read genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(reads: AlignmentRecordDataset) { - val tempPath = tmpLocation(".adam") - reads.saveAsParquet(tempPath) - - assert(sc.loadAlignments(tempPath).rdd.count === 8) - } - - val reads: AlignmentRecordDataset = contigs.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.readFn) - }) - - checkSave(reads) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val readsDs: AlignmentRecordDataset = contigs.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - AlignmentRecordProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.readFn(r.toAvro)) - }) - }) - - checkSave(readsDs) - } - - sparkTest("transform contigs to genotype genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(genotypes: GenotypeDataset) { - val tempPath = tmpLocation(".adam") - genotypes.saveAsParquet(tempPath) - - assert(sc.loadGenotypes(tempPath).rdd.count === 8) - } - - val genotypes: GenotypeDataset = contigs.transmute[Genotype, GenotypeProduct, GenotypeDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.genFn) - }) - - checkSave(genotypes) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val genotypesDs: GenotypeDataset = contigs.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - GenotypeProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.genFn(r.toAvro)) - }) - }) - - checkSave(genotypesDs) - } - - sparkTest("transform contigs to variant genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(variants: VariantDataset) { - val tempPath = tmpLocation(".adam") - variants.saveAsParquet(tempPath) - - assert(sc.loadVariants(tempPath).rdd.count === 8) - } - - val variants: VariantDataset = contigs.transmute[Variant, VariantProduct, VariantDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.varFn) - }) - - checkSave(variants) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val variantsDs: VariantDataset = contigs.transmuteDataset[Variant, VariantProduct, VariantDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - VariantProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.varFn(r.toAvro)) - }) - }) - - checkSave(variantsDs) - } - - sparkTest("transform contigs to variant context genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(variantContexts: VariantContextDataset) { - assert(variantContexts.rdd.count === 8) - } - - val variantContexts: VariantContextDataset = contigs.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.vcFn) - }) - - checkSave(variantContexts) - } -} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala index 
fcc42a2077..6dd1fc651b 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala @@ -28,9 +28,9 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -41,7 +41,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -50,11 +50,9 @@ import org.bdgenomics.formats.avro._ object CoverageDatasetSuite extends Serializable { - def ncfFn(cov: Coverage): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(cov.referenceName) - .setStart(cov.start) - .setEnd(cov.end) + def sliceFn(cov: Coverage): Slice = { + Slice.newBuilder + .setName(cov.referenceName) .build } @@ -289,38 +287,38 @@ class CoverageDatasetSuite extends ADAMFunSuite { assert(collapsed.rdd.count == 8) } - sparkTest("transform coverage to contig rdd") { + sparkTest("transform coverage to slice genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 3) + assert(sc.loadSlices(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentDataset = coverage.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = coverage.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageDatasetSuite.ncfFn) + rdd.map(CoverageDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = coverage.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = coverage.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - CoverageDatasetSuite.ncfFn(r)) + SliceProduct.fromAvro( + CoverageDatasetSuite.sliceFn(r)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } - sparkTest("transform coverage to feature rdd") { + sparkTest("transform coverage to feature genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(features: FeatureDataset) { @@ -351,7 +349,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(featuresDs) } - sparkTest("transform coverage to fragment rdd") { + sparkTest("transform coverage to fragment genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(fragments: FragmentDataset) { @@ -382,7 +380,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(fragmentsDs) } - sparkTest("transform coverage to read rdd") { + 
sparkTest("transform coverage to read genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(reads: AlignmentRecordDataset) { @@ -413,7 +411,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(readsDs) } - sparkTest("transform coverage to genotype rdd") { + sparkTest("transform coverage to genotype genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(genotypes: GenotypeDataset) { @@ -444,7 +442,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(genotypesDs) } - sparkTest("transform coverage to variant rdd") { + sparkTest("transform coverage to variant genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(variants: VariantDataset) { @@ -475,7 +473,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(variantsDs) } - sparkTest("transform coverage to variant context rdd") { + sparkTest("transform coverage to variant context genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(variantContexts: VariantContextDataset) { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala index 14678051bb..cc8561f198 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala @@ -28,9 +28,9 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -41,7 +41,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -72,9 +72,9 @@ object FeatureDatasetSuite extends Serializable { .build } - def ncfFn(f: Feature): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(f.getReferenceName) + def sliceFn(f: Feature): Slice = { + Slice.newBuilder + .setName(f.getReferenceName) .build } @@ -1002,38 +1002,38 @@ class FeatureDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 4) } - sparkTest("transform features to contig rdd") { + sparkTest("transform features to slice genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 3) + assert(sc.loadSlices(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentDataset = features.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = features.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureDatasetSuite.ncfFn) + rdd.map(FeatureDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val 
sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = features.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = features.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - FeatureDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + FeatureDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } - sparkTest("transform features to coverage rdd") { + sparkTest("transform features to coverage genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(coverage: CoverageDataset) { @@ -1061,7 +1061,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(coverageDs) } - sparkTest("transform features to fragment rdd") { + sparkTest("transform features to fragment genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(fragments: FragmentDataset) { @@ -1092,7 +1092,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(fragmentsDs) } - sparkTest("transform features to read rdd") { + sparkTest("transform features to read genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(reads: AlignmentRecordDataset) { @@ -1123,7 +1123,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(readsDs) } - sparkTest("transform features to genotype rdd") { + sparkTest("transform features to genotype genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(genotypes: GenotypeDataset) { @@ -1154,7 +1154,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(genotypesDs) } - sparkTest("transform features to variant rdd") { + sparkTest("transform features to variant genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(variants: VariantDataset) { @@ -1185,7 +1185,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(variantsDs) } - sparkTest("transform features to variant context rdd") { + sparkTest("transform features to variant context genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(variantContexts: VariantContextDataset) { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala index 75e05fc8c8..5dfa0c3b7f 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala @@ -27,7 +27,6 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, @@ -35,6 +34,7 @@ import org.bdgenomics.adam.rdd.read.{ AnySAMOutFormatter, QualityScoreBin } +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -45,7 +45,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => 
NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -380,35 +380,35 @@ class FragmentDatasetSuite extends ADAMFunSuite { assert(rdd4.dataset.count === 20) } - sparkTest("transform fragments to contig genomic dataset") { + sparkTest("transform fragments to slice genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentDataset) { + def checkSave(sliceRdd: SliceDataset) { val tempPath = tmpLocation(".fa") - ncRdd.saveAsFasta(tempPath) + sliceRdd.saveAsFasta(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) + assert(sc.loadSlices(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentDataset = fragments.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = fragments.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordDatasetSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.sliceFn) }) - checkSave(features) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentDataset = fragments.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = fragments.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + AlignmentRecordDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(featuresDs) + checkSave(slicesDs) } sparkTest("transform fragments to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala index 64f2892e25..6accf636dc 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala @@ -41,9 +41,9 @@ import org.bdgenomics.adam.rdd.{ ADAMContext, TestSaveArgs } -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -55,7 +55,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -89,15 +89,15 @@ object AlignmentRecordDatasetSuite extends Serializable { f.getAlignments().get(0) } - def ncfFn(r: AlignmentRecord): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(r.getReferenceName) + def sliceFn(r: AlignmentRecord): Slice = { + Slice.newBuilder + .setName(r.getReferenceName) .setSequence(r.getSequence) .build } - def ncfFn(f: Fragment): NucleotideContigFragment = { - ncfFn(fragToRead(f)) + def sliceFn(f: Fragment): Slice = { + sliceFn(fragToRead(f)) } def covFn(r: AlignmentRecord): Coverage = { @@ -1440,35 +1440,35 @@ class 
AlignmentRecordDatasetSuite extends ADAMFunSuite { assert(kmerCounts.toDF().where($"kmer" === "CCAAGA" && $"count" === 3).count === 1) } - sparkTest("transform reads to contig genomic dataset") { + sparkTest("transform reads to slice genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentDataset) { + def checkSave(sliceRdd: SliceDataset) { val tempPath = tmpLocation(".fa") - ncRdd.saveAsFasta(tempPath) + sliceRdd.saveAsFasta(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) + assert(sc.loadSlices(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentDataset = reads.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = reads.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordDatasetSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.sliceFn) }) - checkSave(features) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentDataset = reads.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = reads.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + AlignmentRecordDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(featuresDs) + checkSave(slicesDs) } sparkTest("transform reads to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala index 4512e1974d..bcfa046080 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read import htsjdk.samtools.ValidationStringency import org.apache.spark.rdd.RDD import org.bdgenomics.adam.util.{ ADAMFunSuite, ReferenceContigMap } -import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment, Reference } +import org.bdgenomics.formats.avro.{ AlignmentRecord, Reference, Slice } class MDTaggingSuite extends ADAMFunSuite { val chr1 = @@ -36,14 +36,14 @@ class MDTaggingSuite extends ADAMFunSuite { .setLength(100L) .build() - def makeFrags(frags: (Reference, Int, String)*): RDD[NucleotideContigFragment] = + def makeFrags(frags: (Reference, Int, String)*): RDD[Slice] = sc.parallelize( for { (reference, start, seq) <- frags } yield ( - NucleotideContigFragment.newBuilder - .setContigLength(reference.getLength) - .setContigName(reference.getName) + Slice.newBuilder() + .setTotalLength(reference.getLength) + .setName(reference.getName) .setStart(start.toLong) .setEnd(start.toLong + seq.length) .setSequence(seq).build() diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala new file mode 100644 index 0000000000..c709ff70db --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala @@ -0,0 +1,189 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.read + +import java.io.File + +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + ReferenceRegion, + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.rdd.feature.FeatureDataset +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Feature, + Read, + Strand +} + +class ReadDatasetSuite extends ADAMFunSuite { + + val r1 = Read.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setLength(4L) + .setSequence("actg") + .setQualityScores("9999") + .build + + val r2 = Read.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setLength(4L) + .setSequence("actg") + .setQualityScores("9999") + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + def tempLocation(suffix: String = ".adam"): String = { + val tempFile = File.createTempFile("ReadDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("create a new read genomic dataset") { + val reads: RDD[Read] = sc.parallelize(Seq(r1, r2)) + assert(ReadDataset(reads).rdd.count === 2) + } + + sparkTest("create a new read genomic dataset with sequence dictionary") { + val reads: RDD[Read] = sc.parallelize(Seq(r1, r2)) + assert(ReadDataset(reads, sd).rdd.count === 2) + } + + sparkTest("save as parquet") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".adam") + reads.save(outputPath, asSingleFile = false) + } + + sparkTest("round trip as parquet") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".adam") + reads.saveAsParquet(outputPath) + + val parquetReads = sc.loadParquetReads(outputPath) + assert(parquetReads.rdd.count === 2) + } + + sparkTest("save as fastq") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".fastq") + reads.save(outputPath, asSingleFile = false) + } + + sparkTest("save as single file fastq") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".fastq") + reads.save(outputPath, asSingleFile = true) + } + + sparkTest("filter read genomic dataset by reference region") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val filtered = reads.filterByOverlappingRegion(ReferenceRegion.all("name1")) + assert(filtered.rdd.count() === 1) + } + + sparkTest("broadcast region join reads and features") { + val feature = Feature.newBuilder() + .setReferenceName("name2") + .setStart(0L) + .setEnd(3L) + .build + + val reads: ReadDataset = 
ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val features: FeatureDataset = FeatureDataset(sc.parallelize(Seq(feature)))
+
+    val kv = reads.broadcastRegionJoin(features).rdd.first
+    assert(kv._1 === r2)
+    assert(kv._2 === feature)
+  }
+
+  sparkTest("shuffle region join reads and features") {
+    val feature = Feature.newBuilder()
+      .setReferenceName("name1")
+      .setStart(0L)
+      .setEnd(3L)
+      .build
+
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val features: FeatureDataset = FeatureDataset(sc.parallelize(Seq(feature)))
+
+    val kv = reads.shuffleRegionJoin(features).rdd.first
+    assert(kv._1 === r1)
+    assert(kv._2 === feature)
+  }
+
+  sparkTest("convert reads to sequences") {
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val sequences = reads.toSequences.rdd.collect()
+    assert(sequences.length === 2)
+
+    val s1 = sequences(0)
+    assert(s1.getName === "name1")
+    assert(s1.getDescription === "description")
+    assert(s1.getAlphabet === Alphabet.DNA)
+    assert(s1.getLength === 4L)
+    assert(s1.getSequence === "actg")
+
+    val s2 = sequences(1)
+    assert(s2.getName === "name2")
+    assert(s2.getDescription === "description")
+    assert(s2.getAlphabet === Alphabet.DNA)
+    assert(s2.getLength === 4L)
+    assert(s2.getSequence === "actg")
+  }
+
+  sparkTest("convert reads to slices") {
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val slices = reads.toSlices.rdd.collect()
+    assert(slices.length === 2)
+
+    val s1 = slices(0)
+    assert(s1.getName === "name1")
+    assert(s1.getDescription === "description")
+    assert(s1.getAlphabet === Alphabet.DNA)
+    assert(s1.getLength === 4L)
+    assert(s1.getTotalLength === 4L)
+    assert(s1.getSequence === "actg")
+    assert(s1.getStart === 0L)
+    assert(s1.getEnd === 4L)
+    assert(s1.getStrand === Strand.INDEPENDENT)
+
+    val s2 = slices(1)
+    assert(s2.getName === "name2")
+    assert(s2.getDescription === "description")
+    assert(s2.getAlphabet === Alphabet.DNA)
+    assert(s2.getLength === 4L)
+    assert(s2.getTotalLength === 4L)
+    assert(s2.getSequence === "actg")
+    assert(s2.getStart === 0L)
+    assert(s2.getEnd === 4L)
+    assert(s2.getStrand === Strand.INDEPENDENT)
+  }
+}
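Note: the two conversion tests above pin down how ReadDataset round-trips into the other new types: toSequences drops only the quality scores, while toSlices additionally fills start = 0, end = length, totalLength = length, and strand = INDEPENDENT. A condensed usage sketch, reusing the suite's r1/r2 fixtures and an active SparkContext sc:

    import org.bdgenomics.adam.rdd.read.ReadDataset

    val reads = ReadDataset(sc.parallelize(Seq(r1, r2)))
    val sequences = reads.toSequences  // name, description, alphabet, sequence, length
    val slices = reads.toSlices        // adds start/end/strand and totalLength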
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala similarity index 50% rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala index 6bbf5bd314..6d453dfb5d 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala @@ -15,73 +15,73 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.bdgenomics.adam.rdd.contig +package org.bdgenomics.adam.rdd.sequence import org.bdgenomics.adam.models.ReferenceRegion -import org.bdgenomics.formats.avro.NucleotideContigFragment +import org.bdgenomics.formats.avro.Slice import org.scalatest.FunSuite -class FlankReferenceFragmentsSuite extends FunSuite { +class FlankSlicesSuite extends FunSuite { - test("don't put flanks on non-adjacent fragments") { + test("don't put flanks on non-adjacent slices") { val testIter = Iterator((ReferenceRegion("chr1", 0L, 10L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("AAAAATTTTT") .setStart(0L) .setEnd(9L) .build()), (ReferenceRegion("chr1", 20L, 30L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("CCCCCGGGGG") .setStart(20L) .setEnd(29L) .build())) - val fragments = FlankReferenceFragments.flank(testIter, 5).toSeq + val slices = FlankSlices.flank(testIter, 5).toSeq - assert(fragments.size === 2) - fragments.foreach(_.getSequence.length === 10) - assert(fragments(0).getSequence === "AAAAATTTTT") - assert(fragments(0).getStart === 0L) - assert(fragments(0).getEnd === 9L) - assert(fragments(1).getSequence === "CCCCCGGGGG") - assert(fragments(1).getStart === 20L) - assert(fragments(1).getEnd === 29L) + assert(slices.size === 2) + slices.foreach(s => assert(s.getSequence.length === 10)) + assert(slices(0).getSequence === "AAAAATTTTT") + assert(slices(0).getStart === 0L) + assert(slices(0).getEnd === 9L) + assert(slices(1).getSequence === "CCCCCGGGGG") + assert(slices(1).getStart === 20L) + assert(slices(1).getEnd === 29L) } - test("put flanks on adjacent fragments") { + test("put flanks on adjacent slices") { val testIter = Iterator((ReferenceRegion("chr1", 0L, 10L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("AAAAATTTTT") .setStart(0L) .setEnd(9L) .build()), (ReferenceRegion("chr1", 10L, 20L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("NNNNNUUUUU") .setStart(10L) .setEnd(19L) .build()), (ReferenceRegion("chr1", 20L, 30L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("CCCCCGGGGG") .setStart(20L) .setEnd(29L) .build())) - val fragments = FlankReferenceFragments.flank(testIter, 5).toSeq + val slices = FlankSlices.flank(testIter, 5).toSeq - assert(fragments.size === 3) - assert(fragments(0).getSequence === "AAAAATTTTTNNNNN") - assert(fragments(0).getStart === 0L) - assert(fragments(0).getEnd === 14L) - assert(fragments(1).getSequence === "TTTTTNNNNNUUUUUCCCCC") - assert(fragments(1).getStart === 5L) - assert(fragments(1).getEnd === 24L) - assert(fragments(2).getSequence === "UUUUUCCCCCGGGGG") - assert(fragments(2).getStart === 15L) - assert(fragments(2).getEnd === 29L) + assert(slices.size === 3) + assert(slices(0).getSequence === "AAAAATTTTTNNNNN") + assert(slices(0).getStart === 0L) + assert(slices(0).getEnd === 14L) + assert(slices(1).getSequence === "TTTTTNNNNNUUUUUCCCCC") + assert(slices(1).getStart === 5L) + assert(slices(1).getEnd === 24L) + assert(slices(2).getSequence === "UUUUUCCCCCGGGGG") + assert(slices(2).getStart === 15L) + assert(slices(2).getEnd === 29L) } }
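These tests pin down the flanking semantics: adjacent slices borrow flankLength bases from their neighbors and widen their start/end coordinates to match, while non-adjacent slices pass through unchanged. FlankSlices itself is an internal kernel; the operation surfaces on SliceDataset as flankAdjacentFragments, the method the Python and R bindings below delegate to. A minimal sketch of the public entry point, assuming the Scala signature mirrors those bindings:

    import org.bdgenomics.adam.rdd.sequence.SliceDataset

    // Extend each slice with up to 5 bases borrowed from adjacent slices.
    // Per the second test above, chr1 slices [0,9] "AAAAATTTTT" and
    // [10,19] "NNNNNUUUUU" become [0,14] "AAAAATTTTTNNNNN" and
    // [5,24] "TTTTTNNNNNUUUUUCCCCC", so windowed analyses such as k-mer
    // counting can see across the original slice boundaries.
    def withFlanks(slices: SliceDataset): SliceDataset = {
      slices.flankAdjacentFragments(5)
    }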
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala new file mode 100644 index 0000000000..27e636fcec --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala @@ -0,0 +1,398 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.sequence + +import com.google.common.collect.ComparisonChain +import java.io.File +import java.util.Comparator +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + ReferenceRegion, + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Sequence, + Strand +} + +class SequenceDatasetSuite extends ADAMFunSuite { + + val s1 = Sequence.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setLength(4L) + .build + + val s2 = Sequence.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setLength(4L) + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + def tempLocation(suffix: String = ".adam"): String = { + val tempFile = File.createTempFile("SequenceDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("create a new sequence genomic dataset") { + val sequences: RDD[Sequence] = sc.parallelize(Seq(s1, s2)) + assert(SequenceDataset(sequences).rdd.count === 2) + } + + sparkTest("create a new sequence genomic dataset with sequence dictionary") { + val sequences: RDD[Sequence] = sc.parallelize(Seq(s1, s2)) + assert(SequenceDataset(sequences, sd).rdd.count === 2) + } + + sparkTest("save as parquet") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".adam") + sequences.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("round trip as parquet") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".adam") + sequences.saveAsParquet(outputPath) + + val parquetSequences = sc.loadParquetSequences(outputPath) + assert(parquetSequences.rdd.count === 2) + } + + sparkTest("save as fasta") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".fasta") + sequences.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("save as single file fasta") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".fasta") +
sequences.save(outputPath, asSingleFile = true, disableFastConcat = false) + } + + sparkTest("convert sequences to reads") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val reads = sequences.toReads.rdd.collect() + assert(reads.length === 2) + + val r1 = reads(0) + assert(r1.getName === "name1") + assert(r1.getDescription === "description") + assert(r1.getAlphabet === Alphabet.DNA) + assert(r1.getLength === 4L) + assert(r1.getSequence === "actg") + assert(r1.getQualityScores === "BBBB") + + val r2 = reads(1) + assert(r2.getName === "name2") + assert(r2.getDescription === "description") + assert(r2.getAlphabet === Alphabet.DNA) + assert(r2.getLength === 4L) + assert(r2.getSequence === "actg") + assert(r2.getQualityScores === "BBBB") + } + + sparkTest("convert sequences to slices") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.toSlices.rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences to a maximum length") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val collected = sequences.slice(3L).rdd.collect() + assert(collected.length === 4) + + val slices = collected.sortWith((v1, v2) => ComparisonChain.start() + .compare(v1.getName, v2.getName) + .compare(v1.getStart, v2.getStart) + .result() < 0 + ) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 3L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "act") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + assert(slice1.getIndex === 0) + assert(slice1.getSlices === 2) + + val slice2 = slices(1) + assert(slice2.getName === "name1") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 1L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "g") + assert(slice2.getStart === 3L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + assert(slice2.getIndex === 1) + assert(slice2.getSlices === 2) + + val slice3 = slices(2) + assert(slice3.getName === "name2") + assert(slice3.getDescription === "description") + assert(slice3.getAlphabet === Alphabet.DNA) + assert(slice3.getLength === 3L) + assert(slice3.getTotalLength === 4L) + assert(slice3.getSequence === "act") + assert(slice3.getStart === 0L) + assert(slice3.getEnd === 3L) + assert(slice3.getStrand === Strand.INDEPENDENT) + assert(slice3.getIndex === 0) + assert(slice3.getSlices === 2) + + val slice4 = slices(3) + assert(slice4.getName === "name2") +
assert(slice4.getDescription === "description") + assert(slice4.getAlphabet === Alphabet.DNA) + assert(slice4.getLength === 1L) + assert(slice4.getTotalLength === 4L) + assert(slice4.getSequence === "g") + assert(slice4.getStart === 3L) + assert(slice4.getEnd === 4L) + assert(slice4.getStrand === Strand.INDEPENDENT) + assert(slice4.getIndex === 1) + assert(slice4.getSlices === 2) + } + + sparkTest("slice sequences shorter than maximum length") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(10L).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + assert(slice1.getIndex === 0) + assert(slice1.getSlices === 1) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + assert(slice2.getIndex === 0) + assert(slice2.getSlices === 1) + } + + sparkTest("filter sequences by overlapping region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val filtered = sequences.filterByOverlappingRegion(ReferenceRegion("name1", 1L, 3L)).rdd.collect() + assert(filtered.length == 1) + + val sequence1 = filtered(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + } + + sparkTest("filter sequences failing to overlap region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + assert(sequences.filterByOverlappingRegion(ReferenceRegion("name1", 99L, 101L)).rdd.isEmpty) + } + + sparkTest("filter sequences by overlapping regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 1L, 3L), ReferenceRegion("name2", 1L, 3L)) + val filtered = sequences.filterByOverlappingRegions(regions).rdd.collect() + assert(filtered.length == 2) + + val sequence1 = filtered(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + + val sequence2 = filtered(1) + assert(sequence2.getName === "name2") + assert(sequence2.getDescription === "description") + assert(sequence2.getAlphabet === Alphabet.DNA) + assert(sequence2.getLength === 4L) + assert(sequence2.getSequence === "actg") + } + + sparkTest("filter sequences failing to overlap regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 99L, 101L), ReferenceRegion("name2", 99L, 101L)) + assert(sequences.filterByOverlappingRegions(regions).rdd.isEmpty) + } + + sparkTest("slice sequences overlapping a smaller region") { + val sequences: SequenceDataset = 
SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 1L, 3L)).rdd.collect() + assert(slices.length === 1) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 2L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "ct") + assert(slice1.getStart === 1L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences overlapping a larger region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 0L, 99L)).rdd.collect() + assert(slices.length === 1) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences failing to overlap a region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 99L, 101L)).rdd.collect() + assert(slices.length === 0) + } + + sparkTest("slice sequences overlapping smaller regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 1L, 3L), ReferenceRegion("name2", 1L, 3L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 2L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "ct") + assert(slice1.getStart === 1L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 2L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "ct") + assert(slice2.getStart === 1L) + assert(slice2.getEnd === 3L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences overlapping larger regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 0L, 99L), ReferenceRegion("name2", 0L, 99L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + 
assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences failing to overlap regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 99L, 101L), ReferenceRegion("name2", 99L, 101L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 0) + } +} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala new file mode 100644 index 0000000000..8f910f9920 --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala @@ -0,0 +1,173 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.sequence + +import java.io.File + +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Slice, + Strand +} + +class SliceDatasetSuite extends ADAMFunSuite { + + val s1 = Slice.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setStart(0L) + .setEnd(3L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val s2 = Slice.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("aatt") + .setStart(0L) + .setEnd(3L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val s3 = Slice.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("ccgg") + .setStart(4L) + .setEnd(7L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + sparkTest("create a new slice genomic dataset") { + val slices: RDD[Slice] = sc.parallelize(Seq(s1, s2, s3)) + assert(SliceDataset(slices).rdd.count === 3) + } + + sparkTest("create a new slice genomic dataset with sequence dictionary") { + val slices: RDD[Slice] = sc.parallelize(Seq(s1, s2, s3)) + assert(SliceDataset(slices, sd).rdd.count === 3) + } + + sparkTest("merge slices into a sequence genomic dataset") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val sequences = slices.merge() + assert(sequences.rdd.count === 2) + + val seqs = sequences.rdd.collect + val seq1 = seqs(0) + val seq2 = seqs(1) + + assert(seq1.getLength === 4L) + assert(seq2.getLength === 8L) + assert(seq2.getSequence === "aattccgg") + } + + def tempLocation(suffix: String = ".adam"): String = { + val 
tempFile = File.createTempFile("SliceDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("save as parquet") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".adam") + slices.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("round trip as parquet") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".adam") + slices.saveAsParquet(outputPath) + + val parquetSlices = sc.loadParquetSlices(outputPath) + assert(parquetSlices.rdd.count === 3) + } + + sparkTest("save as fasta") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".fasta") + slices.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("save as single file fasta") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".fasta") + slices.save(outputPath, asSingleFile = true, disableFastConcat = false) + } + + sparkTest("convert slices to reads") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2))) + val reads = slices.toReads.rdd.collect() + assert(reads.length === 2) + + val r1 = reads(0) + assert(r1.getName === "name1") + assert(r1.getDescription === "description") + assert(r1.getAlphabet === Alphabet.DNA) + assert(r1.getLength === 4L) + assert(r1.getSequence === "actg") + assert(r1.getQualityScores === "BBBB") + + val r2 = reads(1) + assert(r2.getName === "name2") + assert(r2.getDescription === "description") + assert(r2.getAlphabet === Alphabet.DNA) + assert(r2.getLength === 4L) + assert(r2.getSequence === "aatt") + assert(r2.getQualityScores === "BBBB") + } + + sparkTest("convert slices to sequences") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2))) + val sequences = slices.toSequences.rdd.collect() + assert(sequences.length === 2) + + val sequence1 = sequences(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + + val sequence2 = sequences(1) + assert(sequence2.getName === "name2") + assert(sequence2.getDescription === "description") + assert(sequence2.getAlphabet === Alphabet.DNA) + assert(sequence2.getLength === 4L) + assert(sequence2.getSequence === "aatt") + } +} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala index 446c754ae5..87c243dfe1 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala @@ -32,16 +32,16 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => 
GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -71,9 +71,9 @@ object GenotypeDatasetSuite extends Serializable { .build } - def ncfFn(g: Genotype): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(g.getReferenceName) + def sliceFn(g: Genotype): Slice = { + Slice.newBuilder + .setName(g.getReferenceName) .build } @@ -386,35 +386,35 @@ class GenotypeDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 18) } - sparkTest("transform genotypes to contig genomic dataset") { + sparkTest("transform genotypes to slice genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 18) + assert(sc.loadSlices(tempPath).rdd.count === 18) } - val contigs: NucleotideContigFragmentDataset = genotypes.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = genotypes.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeDatasetSuite.ncfFn) + rdd.map(GenotypeDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = genotypes.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = genotypes.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - GenotypeDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + GenotypeDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } sparkTest("transform genotypes to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala index 16f4ce328f..90aff20e27 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala @@ -40,16 +40,16 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.TestSaveArgs -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -395,22 +395,22 @@ class VariantContextDatasetSuite extends ADAMFunSuite { } } - sparkTest("transform variant contexts to contig genomic dataset") { + sparkTest("transform variant contexts to slice genomic 
dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 6) + assert(sc.loadSlices(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentDataset = variantContexts.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = variantContexts.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantDatasetSuite.ncfFn) + rdd.map(VariantDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) } sparkTest("transform variant contexts to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala index 4a461d517f..23a1af2416 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala @@ -28,16 +28,16 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -79,14 +79,14 @@ object VariantDatasetSuite extends Serializable { fragFn(vc.variant.variant) } - def ncfFn(v: Variant): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(v.getReferenceName) + def sliceFn(v: Variant): Slice = { + Slice.newBuilder + .setName(v.getReferenceName) .build } - def ncfFn(vc: VariantContext): NucleotideContigFragment = { - ncfFn(vc.variant.variant) + def sliceFn(vc: VariantContext): Slice = { + sliceFn(vc.variant.variant) } def readFn(v: Variant): AlignmentRecord = { @@ -402,35 +402,35 @@ class VariantDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 6) } - sparkTest("transform variants to contig genomic dataset") { + sparkTest("transform variants to slice genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 6) + assert(sc.loadSlices(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentDataset = variants.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = variants.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantDatasetSuite.ncfFn) + rdd.map(VariantDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = 
SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = variants.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = variants.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - VariantDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + VariantDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } sparkTest("transform variants to coverage genomic dataset") { diff --git a/adam-python/bdgenomics/adam/adamContext.py b/adam-python/bdgenomics/adam/adamContext.py index fb8e3f142b..5f9ddefbc1 100644 --- a/adam-python/bdgenomics/adam/adamContext.py +++ b/adam-python/bdgenomics/adam/adamContext.py @@ -31,8 +31,10 @@ FeatureDataset, \ FragmentDataset, \ GenotypeDataset, \ - NucleotideContigFragmentDataset, \ + SequenceDataset, \ + SliceDataset, \ VariantDataset + from bdgenomics.adam.stringency import STRICT, _toJava @@ -147,26 +149,6 @@ def loadCoverage(self, filePath, return CoverageDataset(adamRdd, self._sc) - def loadContigFragments(self, filePath): - """ - Load nucleotide contig fragments into a NucleotideContigFragmentDataset. - - If the path name has a .fa/.fasta extension, load as FASTA format. - Else, fall back to Parquet + Avro. - - For FASTA format, compressed files are supported through compression codecs configured - in Hadoop, which by default include .gz and .bz2, but can include more. - - :param str filePath: The path to load the file from. - :return: Returns a genomic dataset containing sequence fragments. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset - """ - - adamRdd = self.__jac.loadContigFragments(filePath) - - return NucleotideContigFragmentDataset(adamRdd, self._sc) - - def loadFragments(self, filePath, stringency=STRICT): """ Load fragments into a FragmentDataset. @@ -255,3 +237,73 @@ def loadVariants(self, filePath, stringency=STRICT): _toJava(stringency, self._jvm)) return VariantDataset(adamRdd, self._sc) + + + def loadDnaSequences(self, filePath): + """ + Load DNA sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing DNA sequences. + :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadDnaSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadProteinSequences(self, filePath): + """ + Load protein sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing protein sequences. + :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadProteinSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadRnaSequences(self, filePath): + """ + Load RNA sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing RNA sequences. 
+ :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadRnaSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadSlices(self, filePath, maximumLength): + """ + Load slices into a SliceDataset. + + If the path name has a .fa/.fasta extension, load as DNA in FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :param long maximumLength: Maximum slice length. + :return: Returns a genomic dataset containing sequence slices. + :rtype: bdgenomics.adam.rdd.SliceDataset + """ + + adamRdd = self.__jac.loadSlices(filePath, maximumLength) + + return SliceDataset(adamRdd, self._sc) +
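Each of these Python loaders is a thin Py4j shim over the corresponding JVM ADAMContext method, so the same entry points are available from Scala (see the API documentation changes below). A sketch of the equivalent Scala calls, assuming loadSlices takes the maximum slice length as a Long, as the Python and R bindings suggest:

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def loadExamples(sc: SparkContext): Unit = {
      // FASTA or Parquet, autodetected from the file extension.
      val dna = sc.loadDnaSequences("dna.fa")
      val protein = sc.loadProteinSequences("protein.fa")
      val rna = sc.loadRnaSequences("rna.fa")

      // Slices are sharded to at most 10,000 bases apiece.
      val slices = sc.loadSlices("dna.fa", 10000L)
      println((dna.rdd.count, protein.rdd.count, rna.rdd.count, slices.rdd.count))
    }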
diff --git a/adam-python/bdgenomics/adam/rdd.py b/adam-python/bdgenomics/adam/rdd.py index ae6709bede..20dd8dc728 100644 --- a/adam-python/bdgenomics/adam/rdd.py +++ b/adam-python/bdgenomics/adam/rdd.py @@ -30,7 +30,8 @@ FeatureDataset FragmentDataset GenotypeDataset - NucleotideContigFragmentDataset + SequenceDataset + SliceDataset VariantDataset VariantContextDataset """ @@ -228,9 +229,7 @@ def _inferConversionFn(self, destClass): def _destClassSuffix(self, destClass): - if destClass is NucleotideContigFragmentDataset: - return "ContigsDatasetConverter" - elif destClass is CoverageDataset: + if destClass is CoverageDataset: return "CoverageDatasetConverter" elif destClass is FeatureDataset: return "FeaturesDatasetConverter" @@ -242,6 +241,12 @@ def _destClassSuffix(self, destClass): return "GenotypeDatasetConverter" elif destClass is VariantDataset: return "VariantDatasetConverter" + elif destClass is ReadDataset: + return "ReadDatasetConverter" + elif destClass is SequenceDataset: + return "SequenceDatasetConverter" + elif destClass is SliceDataset: + return "SliceDatasetConverter" else: raise ValueError("No conversion method known for %s." % destClass) @@ -1484,23 +1489,21 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.GenotypesTo%s" % self._destClassSuffix(destClass) -class NucleotideContigFragmentDataset(GenomicDataset): - """ - Wraps an GenomicDataset with Nucleotide Contig Fragment metadata and functions. - """ +class SliceDataset(GenomicDataset): + def _replaceRdd(self, newRdd): - return NucleotideContigFragmentDataset(newRdd, self.sc) + return SliceDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python NucleotideContigFragmentDataset from a JVM - NucleotideContigFragmentDataset. Should not be called from user code; + Constructs a Python SliceDataset from a JVM + SliceDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM NucleotideContigFragmentDataset. + :param jvmRdd: Py4j handle to the underlying JVM SliceDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1509,9 +1512,9 @@ def __init__(self, jvmRdd, sc): def save(self, fileName): """ - Save nucleotide contig fragments as Parquet or FASTA. + Save slices as Parquet or FASTA. - If filename ends in .fa or .fasta, saves as Fasta. If not, saves + If filename ends in .fa or .fasta, saves as FASTA. If not, saves fragments to Parquet. Defaults to 60 character line length, if saving to FASTA. :param str fileName: Path to save to. @@ -1528,17 +1531,18 @@ def flankAdjacentFragments(self, flankLength): length. :param int flankLength: The length to extend adjacent records by. - :return: Returns the genomic dataset, with all adjacent fragments extended with + :return: Returns the genomic dataset, with all adjacent slices extended with flanking sequence. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset + :rtype: bdgenomics.adam.rdd.SliceDataset """ - return NucleotideContigFragmentDataset(self._jvmRdd.flankAdjacentFragments(flankLength), self.sc) + return SliceDataset(self._jvmRdd.flankAdjacentFragments(flankLength), + self.sc) def countKmers(self, kmerLength): """ - Counts the k-mers contained in a FASTA contig. + Counts the k-mers contained in a slice. :param int kmerLength: The value of _k_ to use for cutting _k_-mers. :return: Returns an RDD containing k-mer/count pairs. @@ -1550,7 +1554,7 @@ def _inferConversionFn(self, destClass): - return "org.bdgenomics.adam.api.java.ContigsTo%s" % self._destClassSuffix(destClass) + return "org.bdgenomics.adam.api.java.SlicesTo%s" % self._destClassSuffix(destClass) class VariantDataset(VCFSupportingGenomicDataset): @@ -1648,3 +1652,85 @@ def saveAsVcf(self, deferMerging, disableFastConcat, _toJava(stringency, self.sc._jvm)) + + +class ReadDataset(GenomicDataset): + + + def _replaceRdd(self, newRdd): + + return ReadDataset(newRdd, self.sc) + + + def __init__(self, jvmRdd, sc): + """ + Constructs a Python ReadDataset from a JVM + ReadDataset. Should not be called from user code; + instead, go through bdgenomics.adamContext.ADAMContext. + + :param jvmRdd: Py4j handle to the underlying JVM ReadDataset. + :param pyspark.context.SparkContext sc: Active Spark Context. + """ + + GenomicDataset.__init__(self, jvmRdd, sc) + + + def save(self, fileName): + """ + Save reads as Parquet or FASTQ. + + If filename ends in .fq or .fastq, saves as FASTQ. If not, saves + reads to Parquet. + + :param str fileName: Path to save to. + """ + + self._jvmRdd.save(fileName) + + + def _inferConversionFn(self, destClass): + + return "org.bdgenomics.adam.api.java.ReadsTo%s" % self._destClassSuffix(destClass) + + +class SequenceDataset(GenomicDataset): + + + def _replaceRdd(self, newRdd): + + return SequenceDataset(newRdd, self.sc) + + + def __init__(self, jvmRdd, sc): + """ + Constructs a Python SequenceDataset from a JVM + SequenceDataset. Should not be called from user code; + instead, go through bdgenomics.adamContext.ADAMContext. + + :param jvmRdd: Py4j handle to the underlying JVM SequenceDataset. + :param pyspark.context.SparkContext sc: Active Spark Context. + """ + + GenomicDataset.__init__(self, jvmRdd, sc) + +# slice(maximumLength) +# slice(region) +# slice(regions) + + def save(self, fileName): + """ + Save sequences as Parquet or FASTA. + + If filename ends in .fa or .fasta, saves as FASTA. If not, saves + sequences to Parquet. Defaults to 60 character line length, if saving to + FASTA. + + :param str fileName: Path to save to.
+ """ + + self._jvmRdd.save(fileName) + + + def _inferConversionFn(self, destClass): + + return "org.bdgenomics.adam.api.java.SequencesTo%s" % self._destClassSuffix(destClass) diff --git a/adam-python/bdgenomics/adam/test/adamContext_test.py b/adam-python/bdgenomics/adam/test/adamContext_test.py index d8612cc658..cef2bf46c1 100644 --- a/adam-python/bdgenomics/adam/test/adamContext_test.py +++ b/adam-python/bdgenomics/adam/test/adamContext_test.py @@ -128,13 +128,25 @@ def test_load_variants(self): self.assertEqual(reads._jvmRdd.jrdd().count(), 6) - def test_load_contig_fragments(self): + def test_load_slices(self): testFile = self.resourceFile("HLA_DQB1_05_01_01_02.fa") ac = ADAMContext(self.ss) - reads = ac.loadContigFragments(testFile) + slices = ac.loadSlices(testFile, 10000) - self.assertEqual(reads.toDF().count(), 1) - self.assertEqual(reads._jvmRdd.jrdd().count(), 1) + self.assertEqual(slices.toDF().count(), 1) + self.assertEqual(slices._jvmRdd.jrdd().count(), 1) + + + def test_load_dna_sequences(self): + + + testFile = self.resourceFile("HLA_DQB1_05_01_01_02.fa") + ac = ADAMContext(self.ss) + + sequences = ac.loadDnaSequences(testFile) + + self.assertEqual(sequences.toDF().count(), 1) + self.assertEqual(sequences._jvmRdd.jrdd().count(), 1) diff --git a/adam-r/bdgenomics.adam/R/adam-context.R b/adam-r/bdgenomics.adam/R/adam-context.R index b9640be486..f47220a63d 100644 --- a/adam-r/bdgenomics.adam/R/adam-context.R +++ b/adam-r/bdgenomics.adam/R/adam-context.R @@ -98,7 +98,7 @@ setMethod("loadAlignments", AlignmentRecordDataset(jrdd) }) -#' Load nucleotide contig fragments into a NucleotideContigFragmentDataset. +#' Load DNA sequences into a SequenceDataset. #' #' If the path name has a .fa/.fasta extension, load as FASTA format. #' Else, fall back to Parquet + Avro. @@ -108,16 +108,83 @@ setMethod("loadAlignments", #' #' @param ac The ADAMContext. #' @param filePath The path to load the file from. -#' @return Returns a genomic dataset containing nucleotide contig fragments. +#' @return Returns a genomic dataset containing DNA sequences. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("loadContigFragments", +setMethod("loadDnaSequences", signature(ac = "ADAMContext", filePath = "character"), function(ac, filePath) { - jrdd <- sparkR.callJMethod(ac@jac, "loadContigFragments", filePath) - NucleotideContigFragmentDataset(jrdd) + jrdd <- sparkR.callJMethod(ac@jac, "loadDnaSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load protein sequences into a SequenceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as FASTA format. +#' Else, fall back to Parquet + Avro. +#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @return Returns a genomic dataset containing protein sequences. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadProteinSequences", + signature(ac = "ADAMContext", filePath = "character"), + function(ac, filePath) { + jrdd <- sparkR.callJMethod(ac@jac, "loadProteinSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load RNA sequences into a SequenceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as FASTA format. +#' Else, fall back to Parquet + Avro. 
+#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @return Returns a genomic dataset containing RNA sequences. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadRnaSequences", + signature(ac = "ADAMContext", filePath = "character"), + function(ac, filePath) { + jrdd <- sparkR.callJMethod(ac@jac, "loadRnaSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load slices into a SliceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as DNA in FASTA format. +#' Else, fall back to Parquet + Avro. +#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @param maximumLength Maximum slice length. +#' @return Returns a genomic dataset containing slices. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadSlices", + signature(ac = "ADAMContext", filePath = "character", maximumLength = "integer"), + function(ac, filePath, maximumLength) { + jrdd <- sparkR.callJMethod(ac@jac, "loadSlices", filePath, maximumLength) + SliceDataset(jrdd) }) #' Load fragments into a FragmentDataset. diff --git a/adam-r/bdgenomics.adam/R/generics.R b/adam-r/bdgenomics.adam/R/generics.R index 2b61383820..d78cf06bff 100644 --- a/adam-r/bdgenomics.adam/R/generics.R +++ b/adam-r/bdgenomics.adam/R/generics.R @@ -33,8 +33,23 @@ setGeneric("loadAlignments", #' @rdname ADAMContext #' @export -setGeneric("loadContigFragments", - function(ac, filePath) { standardGeneric("loadContigFragments") }) +setGeneric("loadDnaSequences", + function(ac, filePath) { standardGeneric("loadDnaSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadProteinSequences", + function(ac, filePath) { standardGeneric("loadProteinSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadRnaSequences", + function(ac, filePath) { standardGeneric("loadRnaSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadSlices", + function(ac, filePath, maximumLength) { standardGeneric("loadSlices") }) #' @rdname ADAMContext #' @export @@ -380,15 +395,15 @@ setGeneric("toVariantContexts", setGeneric("toVariants", function(ardd, ...) { standardGeneric("toVariants") }) -#### NucleotideContigFragment operations #### +#### Slice operations #### -#' The NucleotideContigFragmentDataset class is used to manipulate contigs. +#' The SliceDataset class is used to manipulate slices. #' -#' @name NucleotideContigFragmentDataset +#' @name SliceDataset NULL -#' @rdname NucleotideContigFragmentDataset -#' @param ardd The genomic dataset to apply this to. +#' @rdname SliceDataset +#' @param ardd The genomic dataset to apply this to. #' @param flankLength The length to extend adjacent records by. #' @export setGeneric("flankAdjacentFragments", diff --git a/adam-r/bdgenomics.adam/R/rdd.R b/adam-r/bdgenomics.adam/R/rdd.R index be2e7ec98f..3fe7044454 100644 --- a/adam-r/bdgenomics.adam/R/rdd.R +++ b/adam-r/bdgenomics.adam/R/rdd.R @@ -107,19 +107,34 @@ GenotypeDataset <- function(jrdd) { new("GenotypeDataset", jrdd = jrdd) } -#' A class that wraps an RDD of contigs with helpful metadata. +#' A class that wraps an RDD of sequences with helpful metadata.
#' -#' @rdname NucleotideContigFragmentDataset -#' @slot jrdd The Java RDD of contigs that this class wraps. +#' @rdname SequenceDataset +#' @slot jrdd The Java RDD of sequences that this class wraps. #' #' @export -setClass("NucleotideContigFragmentDataset", +setClass("SequenceDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -NucleotideContigFragmentDataset <- function(jrdd) { - new("NucleotideContigFragmentDataset", jrdd = jrdd) +SequenceDataset <- function(jrdd) { + new("SequenceDataset", jrdd = jrdd) +} + +#' A class that wraps an RDD of slices with helpful metadata. +#' +#' @rdname SliceDataset +#' @slot jrdd The Java RDD of slices that this class wraps. +#' +#' @export +setClass("SliceDataset", + slots = list(jrdd = "jobj"), + contains = "GenomicDataset") + +#' @importFrom methods new +SliceDataset <- function(jrdd) { + new("SliceDataset", jrdd = jrdd) } #' A class that wraps an RDD of variants with helpful metadata. @@ -373,9 +388,7 @@ setMethod("inferConversionFn", setMethod("destClassSuffix", signature(destClass = "character"), function(destClass) { - if (destClass == "NucleotideContigFragmentDataset") { - "ContigsDatasetConverter" - } else if (destClass == "CoverageDataset") { + if (destClass == "CoverageDataset") { "CoverageDatasetConverter" } else if (destClass == "FeatureDataset") { "FeaturesDatasetConverter" @@ -387,6 +400,12 @@ setMethod("destClassSuffix", "GenotypeDatasetConverter" } else if (destClass == "VariantDataset") { "VariantDatasetConverter" + } else if (destClass == "ReadDataset") { + "ReadDatasetConverter" + } else if (destClass == "SequenceDataset") { + "SequenceDatasetConverter" + } else if (destClass == "SliceDataset") { + "SliceDatasetConverter" } else { stop(paste("No conversion method known for", destClass)) @@ -1272,23 +1291,39 @@ setMethod("toVariantContexts", signature(ardd = "GenotypeDataset"), }) setMethod("inferConversionFn", - signature(ardd = "NucleotideContigFragmentDataset", + signature(ardd = "SliceDataset", destClass = "character"), function(ardd, destClass) { - paste0("org.bdgenomics.adam.api.java.ContigsTo", + paste0("org.bdgenomics.adam.api.java.SlicesTo", destClassSuffix(destClass)) }) setMethod("replaceRdd", - signature(ardd = "NucleotideContigFragmentDataset", + signature(ardd = "SliceDataset", rdd = "jobj"), function(ardd, rdd) { - NucleotideContigFragmentDataset(rdd) + SliceDataset(rdd) + }) + +#' Save sequences as Parquet or FASTA. +#' +#' If filename ends in .fa or .fasta, saves as FASTA. If not, saves sequences to +#' Parquet. Defaults to 60 character line length, if saving as FASTA. +#' +#' @param ardd The genomic dataset to apply this to. +#' @param filePath Path to save to. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("save", signature(ardd = "SequenceDataset", filePath = "character"), + function(ardd, filePath) { + invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) -#' Save nucleotide contig fragments as Parquet or FASTA. +#' Save slices as Parquet or FASTA. #' -#' If filename ends in .fa or .fasta, saves as Fasta. If not, saves fragments to +#' If filename ends in .fa or .fasta, saves as FASTA. If not, saves slices to #' Parquet. Defaults to 60 character line length, if saving as FASTA. #' #' @param ardd The genomic dataset to apply this to.
@@ -1297,7 +1332,7 @@ setMethod("replaceRdd", #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("save", signature(ardd = "NucleotideContigFragmentDataset", filePath = "character"), +setMethod("save", signature(ardd = "SliceDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) @@ -1314,11 +1349,11 @@ setMethod("save", signature(ardd = "NucleotideContigFragmentDataset", filePath = #' #' @export setMethod("flankAdjacentFragments", - signature(ardd = "NucleotideContigFragmentDataset", flankLength = "numeric"), + signature(ardd = "SliceDataset", flankLength = "numeric"), function(ardd, flankLength) { - NucleotideContigFragmentDataset(sparkR.callJMethod(ardd@jrdd, - "flankAdjacentFragments", - flankLength)) + SliceDataset(sparkR.callJMethod(ardd@jrdd, + "flankAdjacentFragments", + flankLength)) }) setMethod("inferConversionFn", diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R b/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R index 8582117c02..0e75fb92f4 100644 --- a/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R +++ b/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R @@ -71,8 +71,14 @@ test_that("load variants from vcf", { expect_equal(count(variantDf), 6) }) -test_that("load fasta", { - ncfs <- loadContigFragments(ac, resourceFile("HLA_DQB1_05_01_01_02.fa")) - ncfDf <- toDF(ncfs) - expect_equal(count(ncfDf), 1) +test_that("load fasta sequences", { + sequences <- loadDnaSequences(ac, resourceFile("HLA_DQB1_05_01_01_02.fa")) + sequencesDf <- toDF(sequences) + expect_equal(count(sequencesDf), 1) +}) + +test_that("load fasta slices", { + slices <- loadSlices(ac, resourceFile("HLA_DQB1_05_01_01_02.fa"), 10000) + slicesDf <- toDF(slices) + expect_equal(count(slicesDf), 1) }) diff --git a/docs/api/adamContext.rst b/docs/api/adamContext.rst index a13fbde68a..84ac22915d 100644 --- a/docs/api/adamContext.rst +++ b/docs/api/adamContext.rst @@ -100,12 +100,17 @@ With an ``ADAMContext``, you can load: - From partitioned Parquet using ``loadPartitionedParquetFeatures`` (Scala only) - Autodetected from any of the above using ``loadFeatures`` (Scala, Java, Python, and R) -- Fragmented contig sequence as a ``NucleotideContigFragmentDataset``: - - From FASTA with ``loadFasta`` (Scala only) - - From Parquet with ``loadParquetContigFragments`` (Scala only) - - From partitioned Parquet with ``loadPartitionedParquetContigFragments`` (Scala only) - - Autodetected from either of the above using ``loadSequences`` (Scala, Java, Python, and R) +- Sequences as a ``SequenceDataset``: + + - From FASTA with ``loadFastaDna``, ``loadFastaProtein``, ``loadFastaRna`` (Scala only) + - From Parquet with ``loadParquetSequences`` (Scala only) + - Autodetected from either of the above using ``loadDnaSequences``, ``loadProteinSequences``, ``loadRnaSequences`` (Scala, Java, Python, and R) + +- Sequence slices as a ``SliceDataset``: + + - From FASTA with ``loadFastaDna`` (Scala only) + - From Parquet with ``loadParquetSlices`` (Scala only) + - Autodetected from either of the above using ``loadSlices`` (Scala, Java, Python, and R) - Coverage data as a ``CoverageDataset``: diff --git a/docs/architecture/schemas.rst b/docs/architecture/schemas.rst index beae36f339..c95e5437d2 100644 --- a/docs/architecture/schemas.rst +++ b/docs/architecture/schemas.rst @@ -19,8 +19,8 @@ schemas: from a single sequenced fragment.
- The *Genotype* schema represents a genotype call, along with annotations about the quality/read support of the called genotype. -- The *NucleotideContigFragment* schema represents a section of a - contig's sequence. +- The *Sequence* and *Slice* schemas represent sequences and slices of + sequences, respectively. - The *Variant* schema represents a sequence variant, along with statistics about that variant's support across a group of samples, and annotations about the effect of the variant. diff --git a/docs/cli/actions.rst b/docs/cli/actions.rst index 5cb867e5d9..10e70e6e88 100644 --- a/docs/cli/actions.rst +++ b/docs/cli/actions.rst @@ -162,7 +162,7 @@ fall into several general categories: - ``mismatchingPositions`` tagging options: We can recompute the ``mismatchingPositions`` field of an AlignmentRecord (SAM "MD" tag) with the ``-add_md_tags`` flag. This flag takes a path to a reference - file in either FASTA or Parquet ``NucleotideContigFragment`` format. + file in either FASTA or Parquet ``Sequence`` format. Additionally, this engine takes the following options: - ``-md_tag_fragment_size``: If loading from FASTA, sets the size of diff --git a/docs/cli/conversions.rst b/docs/cli/conversions.rst index 70ab009921..37003cece4 100644 --- a/docs/cli/conversions.rst +++ b/docs/cli/conversions.rst @@ -4,53 +4,6 @@ Conversion tools These tools convert data between a legacy genomic file format and using ADAM's schemas to store data in Parquet. -fasta2adam and adam2fasta -~~~~~~~~~~~~~~~~~~~~~~~~~ - -These commands convert between FASTA and Parquet files storing -assemblies using the NucleotideContigFragment schema. - -``fasta2adam`` takes two required arguments: - -1. ``FASTA``: The input FASTA file to convert. -2. ``ADAM``: The path to save the Parquet formatted - NucleotideContigFragments to. - -``fasta2adam`` supports the full set of `default -options <#default-args>`__, as well as the following options: - -- ``-fragment_length``: The fragment length to shard a given contig - into. Defaults to 10,000bp. -- ``-reads``: Path to a set of reads that includes sequence info. This - read path is used to obtain the sequence indices for ordering the - contigs from the FASTA file. -- ``-repartition``: The number of partitions to save the data to. If - provided, forces a shuffle. -- ``-verbose``: If given, enables additional logging where the sequence - dictionary is printed. - -``adam2fasta`` takes two required arguments: - -1. ``ADAM``: The path to a Parquet file containing - NucleotideContigFragments. -2. ``FASTA``: The path to save the FASTA file to. - -``adam2fasta`` only supports the ``-print_metrics`` option from the -`default options <#default-args>`__. Additionally, ``adam2fasta`` takes -the following options: - -- ``-line_width``: The line width in characters to use for breaking - FASTA lines. Defaults to 60 characters. -- ``-coalesce``: Sets the number of partitions to coalesce the output - to. If ``-force_shuffle_coalesce`` is not provided, the Spark engine - may ignore the coalesce directive. -- ``-force_shuffle_coalesce``: Forces a shuffle that leads to the - output being saved with the number of partitions requested by - ``-coalesce``. This is necessary if the ``-coalesce`` would increase - the number of partitions, or if it would reduce the number of - partitions to fewer than the number of Spark executors. This may have - a substantial performance cost, and will invalidate any sort order.
- adam2fastq ~~~~~~~~~~ @@ -125,4 +78,3 @@ Additionally, ``transformFragments`` takes the following options: - ``-sort_lexicographically``: Sorts reads by alignment position. Unmapped reads are placed at the end of all reads. Contigs are ordered lexicographically. -
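With fasta2adam and adam2fasta gone, the same FASTA-to-Parquet round trip is written directly against the ADAMContext loaders documented above. A minimal sketch, assuming a SparkContext with the ADAMContext implicits in scope and a Long maximum-slice-length argument to loadSlices, mirroring the Python and R bindings; path names are illustrative:

    import org.apache.spark.SparkContext
    import org.bdgenomics.adam.rdd.ADAMContext._

    def fastaRoundTrip(sc: SparkContext): Unit = {
      // Replacement for fasta2adam: shard the FASTA into slices of at
      // most 10 kbp and persist them using the Slice schema.
      val slices = sc.loadSlices("sample.fa", 10000L)
      slices.saveAsParquet("sample.slices.adam")

      // Replacement for adam2fasta: reload and write a single FASTA file.
      sc.loadParquetSlices("sample.slices.adam")
        .save("sample.out.fa", asSingleFile = true, disableFastConcat = false)
    }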