diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala index 1684e627d6..b2d3efe793 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicDatasetConverters.scala @@ -24,10 +24,10 @@ import org.bdgenomics.adam.rdd.{ GenomicDataset, GenomicDatasetConversion } -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset -import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset } +import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset } import org.bdgenomics.adam.rdd.variant.{ VariantDataset, GenotypeDataset, @@ -38,18 +38,15 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Read => ReadProduct, + Sequence => SequenceProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } import org.bdgenomics.formats.avro._ import scala.reflect.runtime.universe._ -trait ToContigDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - val yTag: TypeTag[NucleotideContigFragmentProduct] = typeTag[NucleotideContigFragmentProduct] -} - trait ToCoverageDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Coverage, Coverage, CoverageDataset] { val yTag: TypeTag[Coverage] = typeTag[Coverage] @@ -75,63 +72,27 @@ trait ToGenotypeDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] val yTag: TypeTag[GenotypeProduct] = typeTag[GenotypeProduct] } -trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantDataset] { - - val yTag: TypeTag[VariantProduct] = typeTag[VariantProduct] -} - -trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextDataset] { - - val yTag: TypeTag[VariantContextProduct] = typeTag[VariantContextProduct] -} - -final class ContigsToCoverageDatasetConverter extends ToCoverageDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { +trait ToReadDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Read, ReadProduct, ReadDataset] { - def call(v1: NucleotideContigFragmentDataset, v2: Dataset[Coverage]): CoverageDataset = { - ADAMContext.contigsToCoverageDatasetConversionFn(v1, v2) - } + val yTag: TypeTag[ReadProduct] = typeTag[ReadProduct] } -final class ContigsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { +trait ToSequenceDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Sequence, SequenceProduct, SequenceDataset] { - def call(v1: 
NucleotideContigFragmentDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { - ADAMContext.contigsToFeaturesDatasetConversionFn(v1, v2) - } + val yTag: TypeTag[SequenceProduct] = typeTag[SequenceProduct] } -final class ContigsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { +trait ToSliceDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Slice, SliceProduct, SliceDataset] { - def call(v1: NucleotideContigFragmentDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { - ADAMContext.contigsToFragmentsDatasetConversionFn(v1, v2) - } -} - -final class ContigsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { - ADAMContext.contigsToAlignmentRecordsDatasetConversionFn(v1, v2) - } + val yTag: TypeTag[SliceProduct] = typeTag[SliceProduct] } -final class ContigsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { - ADAMContext.contigsToGenotypesDatasetConversionFn(v1, v2) - } -} - -final class ContigsToVariantsDatasetConverter extends ToVariantDatasetConversion[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = { - ADAMContext.contigsToVariantsDatasetConversionFn(v1, v2) - } +trait ToVariantDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, Variant, VariantProduct, VariantDataset] { + val yTag: TypeTag[VariantProduct] = typeTag[VariantProduct] } -final class CoverageToContigsDatasetConverter extends ToContigDatasetConversion[Coverage, Coverage, CoverageDataset] { - - def call(v1: CoverageDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.coverageToContigsDatasetConversionFn(v1, v2) - } +trait ToVariantContextDatasetConversion[T, U <: Product, V <: GenomicDataset[T, U, V]] extends GenomicDatasetConversion[T, U, V, VariantContext, VariantContextProduct, VariantContextDataset] { + val yTag: TypeTag[VariantContextProduct] = typeTag[VariantContextProduct] } final class CoverageToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Coverage, Coverage, CoverageDataset] { @@ -162,17 +123,30 @@ final class CoverageToGenotypesDatasetConverter extends ToGenotypeDatasetConvers } } -final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageDataset] { +final class CoverageToReadsDatasetConverter extends ToReadDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: CoverageDataset, v2: Dataset[VariantProduct]): VariantDataset = { - ADAMContext.coverageToVariantsDatasetConversionFn(v1, v2) + def call(v1: CoverageDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.coverageToReadsDatasetConversionFn(v1, v2) + } +} + +final class CoverageToSequencesDatasetConverter extends ToSequenceDatasetConversion[Coverage, Coverage, CoverageDataset] { + + def call(v1: CoverageDataset, v2: 
Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.coverageToSequencesDatasetConversionFn(v1, v2) } } -final class FeaturesToContigsDatasetConverter extends ToContigDatasetConversion[Feature, FeatureProduct, FeatureDataset] { +final class CoverageToSlicesDatasetConverter extends ToSliceDatasetConversion[Coverage, Coverage, CoverageDataset] { - def call(v1: FeatureDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.featuresToContigsDatasetConversionFn(v1, v2) + def call(v1: CoverageDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.coverageToSlicesDatasetConversionFn(v1, v2) + } +} + +final class CoverageToVariantsDatasetConverter extends ToVariantDatasetConversion[Coverage, Coverage, CoverageDataset] { + def call(v1: CoverageDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.coverageToVariantsDatasetConversionFn(v1, v2) } } @@ -204,17 +178,30 @@ final class FeaturesToGenotypesDatasetConverter extends ToGenotypeDatasetConvers } } -final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureDataset] { +final class FeaturesToReadsDatasetConverter extends ToReadDatasetConversion[Feature, FeatureProduct, FeatureDataset] { - def call(v1: FeatureDataset, v2: Dataset[VariantProduct]): VariantDataset = { - ADAMContext.featuresToVariantsDatasetConversionFn(v1, v2) + def call(v1: FeatureDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.featuresToReadsDatasetConversionFn(v1, v2) + } +} + +final class FeaturesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Feature, FeatureProduct, FeatureDataset] { + + def call(v1: FeatureDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.featuresToSequencesDatasetConversionFn(v1, v2) } } -final class FragmentsToContigsDatasetConverter extends ToContigDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { +final class FeaturesToSlicesDatasetConverter extends ToSliceDatasetConversion[Feature, FeatureProduct, FeatureDataset] { + + def call(v1: FeatureDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.featuresToSlicesDatasetConversionFn(v1, v2) + } +} - def call(v1: FragmentDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.fragmentsToContigsDatasetConversionFn(v1, v2) +final class FeaturesToVariantsDatasetConverter extends ToVariantDatasetConversion[Feature, FeatureProduct, FeatureDataset] { + def call(v1: FeatureDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.featuresToVariantsDatasetConversionFn(v1, v2) } } @@ -246,22 +233,34 @@ final class FragmentsToGenotypesDatasetConverter extends ToGenotypeDatasetConver } } -final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { +final class FragmentsToReadsDatasetConverter extends ToReadDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: FragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = { - ADAMContext.fragmentsToVariantsDatasetConversionFn(v1, v2) + def call(v1: FragmentDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.fragmentsToReadsDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToContigsDatasetConverter extends ToContigDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { +final class FragmentsToSequencesDatasetConverter extends 
ToSequenceDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { - def call(v1: AlignmentRecordDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.alignmentRecordsToContigsDatasetConversionFn(v1, v2) + def call(v1: FragmentDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.fragmentsToSequencesDatasetConversionFn(v1, v2) } } -final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { +final class FragmentsToSlicesDatasetConverter extends ToSliceDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { + + def call(v1: FragmentDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.fragmentsToSlicesDatasetConversionFn(v1, v2) + } +} + +final class FragmentsToVariantsDatasetConverter extends ToVariantDatasetConversion[Fragment, FragmentProduct, FragmentDataset] { + def call(v1: FragmentDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.fragmentsToVariantsDatasetConversionFn(v1, v2) + } +} +final class AlignmentRecordsToCoverageDatasetConverter extends ToCoverageDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { def call(v1: AlignmentRecordDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.alignmentRecordsToCoverageDatasetConversionFn(v1, v2) } @@ -288,22 +287,34 @@ final class AlignmentRecordsToGenotypesDatasetConverter extends ToGenotypeDatase } } -final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { +final class AlignmentRecordsToReadsDatasetConverter extends ToReadDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: AlignmentRecordDataset, v2: Dataset[VariantProduct]): VariantDataset = { - ADAMContext.alignmentRecordsToVariantsDatasetConversionFn(v1, v2) + def call(v1: AlignmentRecordDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.alignmentRecordsToReadsDatasetConversionFn(v1, v2) } } -final class GenotypesToContigsDatasetConverter extends ToContigDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { +final class AlignmentRecordsToSequencesDatasetConverter extends ToSequenceDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { - def call(v1: GenotypeDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.genotypesToContigsDatasetConversionFn(v1, v2) + def call(v1: AlignmentRecordDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.alignmentRecordsToSequencesDatasetConversionFn(v1, v2) } } -final class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { +final class AlignmentRecordsToSlicesDatasetConverter extends ToSliceDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { + def call(v1: AlignmentRecordDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.alignmentRecordsToSlicesDatasetConversionFn(v1, v2) + } +} + +final class AlignmentRecordsToVariantsDatasetConverter extends ToVariantDatasetConversion[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset] { + def call(v1: AlignmentRecordDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.alignmentRecordsToVariantsDatasetConversionFn(v1, v2) + } +} + +final 
class GenotypesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { def call(v1: GenotypeDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.genotypesToCoverageDatasetConversionFn(v1, v2) } @@ -330,6 +341,27 @@ final class GenotypesToAlignmentRecordsDatasetConverter extends ToAlignmentRecor } } +final class GenotypesToReadsDatasetConverter extends ToReadDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { + + def call(v1: GenotypeDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.genotypesToReadsDatasetConversionFn(v1, v2) + } +} + +final class GenotypesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { + + def call(v1: GenotypeDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.genotypesToSequencesDatasetConversionFn(v1, v2) + } +} + +final class GenotypesToSlicesDatasetConverter extends ToSliceDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { + + def call(v1: GenotypeDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.genotypesToSlicesDatasetConversionFn(v1, v2) + } +} + final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversion[Genotype, GenotypeProduct, GenotypeDataset] { def call(v1: GenotypeDataset, v2: Dataset[VariantProduct]): VariantDataset = { @@ -337,15 +369,175 @@ final class GenotypesToVariantsDatasetConverter extends ToVariantDatasetConversi } } -final class VariantsToContigsDatasetConverter extends ToContigDatasetConversion[Variant, VariantProduct, VariantDataset] { +final class ReadsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Read, ReadProduct, ReadDataset] { - def call(v1: VariantDataset, v2: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - ADAMContext.variantsToContigsDatasetConversionFn(v1, v2) + def call(v1: ReadDataset, v2: Dataset[Coverage]): CoverageDataset = { + ADAMContext.readsToCoverageDatasetConversionFn(v1, v2) } } -final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantDataset] { +final class ReadsToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { + ADAMContext.readsToFeaturesDatasetConversionFn(v1, v2) + } +} +final class ReadsToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { + ADAMContext.readsToFragmentsDatasetConversionFn(v1, v2) + } +} + +final class ReadsToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + ADAMContext.readsToAlignmentRecordsDatasetConversionFn(v1, v2) + } +} + +final class ReadsToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { + ADAMContext.readsToGenotypesDatasetConversionFn(v1, v2) + } +} + +final class ReadsToSequencesDatasetConverter extends ToSequenceDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.readsToSequencesDatasetConversionFn(v1, v2) + } +} + +final class 
ReadsToSlicesDatasetConverter extends ToSliceDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.readsToSlicesDatasetConversionFn(v1, v2) + } +} + +final class ReadsToVariantsDatasetConverter extends ToVariantDatasetConversion[Read, ReadProduct, ReadDataset] { + + def call(v1: ReadDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.readsToVariantsDatasetConversionFn(v1, v2) + } +} + +final class SequencesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[Coverage]): CoverageDataset = { + ADAMContext.sequencesToCoverageDatasetConversionFn(v1, v2) + } +} + +final class SequencesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { + ADAMContext.sequencesToFeaturesDatasetConversionFn(v1, v2) + } +} + +final class SequencesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { + ADAMContext.sequencesToFragmentsDatasetConversionFn(v1, v2) + } +} + +final class SequencesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + ADAMContext.sequencesToAlignmentRecordsDatasetConversionFn(v1, v2) + } +} + +final class SequencesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { + ADAMContext.sequencesToGenotypesDatasetConversionFn(v1, v2) + } +} + +final class SequencesToReadsDatasetConverter extends ToReadDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.sequencesToReadsDatasetConversionFn(v1, v2) + } +} + +final class SequencesToSlicesDatasetConverter extends ToSliceDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.sequencesToSlicesDatasetConversionFn(v1, v2) + } +} + +final class SequencesToVariantsDatasetConverter extends ToVariantDatasetConversion[Sequence, SequenceProduct, SequenceDataset] { + + def call(v1: SequenceDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.sequencesToVariantsDatasetConversionFn(v1, v2) + } +} + +final class SlicesToCoverageDatasetConverter extends ToCoverageDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[Coverage]): CoverageDataset = { + ADAMContext.slicesToCoverageDatasetConversionFn(v1, v2) + } +} + +final class SlicesToFeaturesDatasetConverter extends ToFeatureDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[FeatureProduct]): FeatureDataset = { + ADAMContext.slicesToFeaturesDatasetConversionFn(v1, v2) + } +} + +final class SlicesToFragmentsDatasetConverter extends ToFragmentDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[FragmentProduct]): FragmentDataset = { + 
ADAMContext.slicesToFragmentsDatasetConversionFn(v1, v2) + } +} + +final class SlicesToAlignmentRecordsDatasetConverter extends ToAlignmentRecordDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + ADAMContext.slicesToAlignmentRecordsDatasetConversionFn(v1, v2) + } +} + +final class SlicesToGenotypesDatasetConverter extends ToGenotypeDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[GenotypeProduct]): GenotypeDataset = { + ADAMContext.slicesToGenotypesDatasetConversionFn(v1, v2) + } +} + +final class SlicesToReadsDatasetConverter extends ToReadDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.slicesToReadsDatasetConversionFn(v1, v2) + } +} + +final class SlicesToSequencesDatasetConverter extends ToSequenceDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.slicesToSequencesDatasetConversionFn(v1, v2) + } +} + +final class SlicesToVariantsDatasetConverter extends ToVariantDatasetConversion[Slice, SliceProduct, SliceDataset] { + + def call(v1: SliceDataset, v2: Dataset[VariantProduct]): VariantDataset = { + ADAMContext.slicesToVariantsDatasetConversionFn(v1, v2) + } +} + +final class VariantsToCoverageDatasetConverter extends ToCoverageDatasetConversion[Variant, VariantProduct, VariantDataset] { def call(v1: VariantDataset, v2: Dataset[Coverage]): CoverageDataset = { ADAMContext.variantsToCoverageDatasetConversionFn(v1, v2) } @@ -378,3 +570,24 @@ final class VariantsToGenotypesDatasetConverter extends ToGenotypeDatasetConvers ADAMContext.variantsToGenotypesDatasetConversionFn(v1, v2) } } + +final class VariantsToReadsDatasetConverter extends ToReadDatasetConversion[Variant, VariantProduct, VariantDataset] { + + def call(v1: VariantDataset, v2: Dataset[ReadProduct]): ReadDataset = { + ADAMContext.variantsToReadsDatasetConversionFn(v1, v2) + } +} + +final class VariantsToSequencesDatasetConverter extends ToSequenceDatasetConversion[Variant, VariantProduct, VariantDataset] { + + def call(v1: VariantDataset, v2: Dataset[SequenceProduct]): SequenceDataset = { + ADAMContext.variantsToSequencesDatasetConversionFn(v1, v2) + } +} + +final class VariantsToSlicesDatasetConverter extends ToSliceDatasetConversion[Variant, VariantProduct, VariantDataset] { + + def call(v1: VariantDataset, v2: Dataset[SliceProduct]): SliceDataset = { + ADAMContext.variantsToSlicesDatasetConversionFn(v1, v2) + } +} diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala index cb10c9460a..b9e39068b5 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/GenomicRDDConverters.scala @@ -24,10 +24,10 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset -import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset } +import org.bdgenomics.adam.rdd.sequence.{ 
SequenceDataset, SliceDataset } import org.bdgenomics.adam.rdd.variant.{ VariantDataset, GenotypeDataset, @@ -35,68 +35,7 @@ import org.bdgenomics.adam.rdd.variant.{ } import org.bdgenomics.formats.avro._ -final class ContigsToContigsConverter extends Function2[NucleotideContigFragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.contigsToContigsConversionFn(v1, v2) - } -} - -final class ContigsToCoverageConverter extends Function2[NucleotideContigFragmentDataset, RDD[Coverage], CoverageDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[Coverage]): CoverageDataset = { - ADAMContext.contigsToCoverageConversionFn(v1, v2) - } -} - -final class ContigsToFeaturesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Feature], FeatureDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[Feature]): FeatureDataset = { - ADAMContext.contigsToFeaturesConversionFn(v1, v2) - } -} - -final class ContigsToFragmentsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Fragment], FragmentDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[Fragment]): FragmentDataset = { - ADAMContext.contigsToFragmentsConversionFn(v1, v2) - } -} - -final class ContigsToAlignmentRecordsConverter extends Function2[NucleotideContigFragmentDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { - ADAMContext.contigsToAlignmentRecordsConversionFn(v1, v2) - } -} - -final class ContigsToGenotypesConverter extends Function2[NucleotideContigFragmentDataset, RDD[Genotype], GenotypeDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[Genotype]): GenotypeDataset = { - ADAMContext.contigsToGenotypesConversionFn(v1, v2) - } -} - -final class ContigsToVariantsConverter extends Function2[NucleotideContigFragmentDataset, RDD[Variant], VariantDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[Variant]): VariantDataset = { - ADAMContext.contigsToVariantsConversionFn(v1, v2) - } -} - -final class ContigsToVariantContextsConverter extends Function2[NucleotideContigFragmentDataset, RDD[VariantContext], VariantContextDataset] { - - def call(v1: NucleotideContigFragmentDataset, v2: RDD[VariantContext]): VariantContextDataset = { - ADAMContext.contigsToVariantContextConversionFn(v1, v2) - } -} - -final class CoverageToContigsConverter extends Function2[CoverageDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: CoverageDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.coverageToContigsConversionFn(v1, v2) - } -} +// coverage conversion functions final class CoverageToCoverageConverter extends Function2[CoverageDataset, RDD[Coverage], CoverageDataset] { @@ -133,6 +72,27 @@ final class CoverageToGenotypesConverter extends Function2[CoverageDataset, RDD[ } } +final class CoverageToReadsConverter extends Function2[CoverageDataset, RDD[Read], ReadDataset] { + + def call(v1: CoverageDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.coverageToReadsConversionFn(v1, v2) + } +} + +final class CoverageToSequencesConverter extends Function2[CoverageDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: CoverageDataset, v2: RDD[Sequence]): SequenceDataset = { + 
ADAMContext.coverageToSequencesConversionFn(v1, v2) + } +} + +final class CoverageToSlicesConverter extends Function2[CoverageDataset, RDD[Slice], SliceDataset] { + + def call(v1: CoverageDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.coverageToSlicesConversionFn(v1, v2) + } +} + final class CoverageToVariantsConverter extends Function2[CoverageDataset, RDD[Variant], VariantDataset] { def call(v1: CoverageDataset, v2: RDD[Variant]): VariantDataset = { @@ -147,12 +107,7 @@ final class CoverageToVariantContextConverter extends Function2[CoverageDataset, } } -final class FeaturesToContigsConverter extends Function2[FeatureDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: FeatureDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.featuresToContigsConversionFn(v1, v2) - } -} +// features conversion functions final class FeaturesToCoverageConverter extends Function2[FeatureDataset, RDD[Coverage], CoverageDataset] { @@ -189,6 +144,27 @@ final class FeaturesToGenotypesConverter extends Function2[FeatureDataset, RDD[G } } +final class FeaturesToReadsConverter extends Function2[FeatureDataset, RDD[Read], ReadDataset] { + + def call(v1: FeatureDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.featuresToReadsConversionFn(v1, v2) + } +} + +final class FeaturesToSequencesConverter extends Function2[FeatureDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: FeatureDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.featuresToSequencesConversionFn(v1, v2) + } +} + +final class FeaturesToSlicesConverter extends Function2[FeatureDataset, RDD[Slice], SliceDataset] { + + def call(v1: FeatureDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.featuresToSlicesConversionFn(v1, v2) + } +} + final class FeaturesToVariantsConverter extends Function2[FeatureDataset, RDD[Variant], VariantDataset] { def call(v1: FeatureDataset, v2: RDD[Variant]): VariantDataset = { @@ -203,12 +179,7 @@ final class FeaturesToVariantContextConverter extends Function2[FeatureDataset, } } -final class FragmentsToContigsConverter extends Function2[FragmentDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: FragmentDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.fragmentsToContigsConversionFn(v1, v2) - } -} +// fragments conversion functions final class FragmentsToCoverageConverter extends Function2[FragmentDataset, RDD[Coverage], CoverageDataset] { @@ -245,8 +216,28 @@ final class FragmentsToGenotypesConverter extends Function2[FragmentDataset, RDD } } -final class FragmentsToVariantsConverter extends Function2[FragmentDataset, RDD[Variant], VariantDataset] { +final class FragmentsToReadsConverter extends Function2[FragmentDataset, RDD[Read], ReadDataset] { + + def call(v1: FragmentDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.fragmentsToReadsConversionFn(v1, v2) + } +} + +final class FragmentsToSequencesConverter extends Function2[FragmentDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: FragmentDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.fragmentsToSequencesConversionFn(v1, v2) + } +} + +final class FragmentsToSlicesConverter extends Function2[FragmentDataset, RDD[Slice], SliceDataset] { + def call(v1: FragmentDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.fragmentsToSlicesConversionFn(v1, v2) + } +} + +final class FragmentsToVariantsConverter extends Function2[FragmentDataset, 
RDD[Variant], VariantDataset] { def call(v1: FragmentDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.fragmentsToVariantsConversionFn(v1, v2) } @@ -259,12 +250,7 @@ final class FragmentsToVariantContextConverter extends Function2[FragmentDataset } } -final class AlignmentRecordsToContigsConverter extends Function2[AlignmentRecordDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: AlignmentRecordDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.alignmentRecordsToContigsConversionFn(v1, v2) - } -} +// alignment records conversion functions final class AlignmentRecordsToCoverageConverter extends Function2[AlignmentRecordDataset, RDD[Coverage], CoverageDataset] { @@ -301,8 +287,28 @@ final class AlignmentRecordsToGenotypesConverter extends Function2[AlignmentReco } } -final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordDataset, RDD[Variant], VariantDataset] { +final class AlignmentRecordsToReadsConverter extends Function2[AlignmentRecordDataset, RDD[Read], ReadDataset] { + def call(v1: AlignmentRecordDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.alignmentRecordsToReadsConversionFn(v1, v2) + } +} + +final class AlignmentRecordsToSequencesConverter extends Function2[AlignmentRecordDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: AlignmentRecordDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.alignmentRecordsToSequencesConversionFn(v1, v2) + } +} + +final class AlignmentRecordsToSlicesConverter extends Function2[AlignmentRecordDataset, RDD[Slice], SliceDataset] { + + def call(v1: AlignmentRecordDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.alignmentRecordsToSlicesConversionFn(v1, v2) + } +} + +final class AlignmentRecordsToVariantsConverter extends Function2[AlignmentRecordDataset, RDD[Variant], VariantDataset] { def call(v1: AlignmentRecordDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.alignmentRecordsToVariantsConversionFn(v1, v2) } @@ -315,12 +321,7 @@ final class AlignmentRecordsToVariantContextConverter extends Function2[Alignmen } } -final class GenotypesToContigsConverter extends Function2[GenotypeDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: GenotypeDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.genotypesToContigsConversionFn(v1, v2) - } -} +// genotypes conversion functions final class GenotypesToCoverageConverter extends Function2[GenotypeDataset, RDD[Coverage], CoverageDataset] { @@ -357,8 +358,28 @@ final class GenotypesToGenotypesConverter extends Function2[GenotypeDataset, RDD } } -final class GenotypesToVariantsConverter extends Function2[GenotypeDataset, RDD[Variant], VariantDataset] { +final class GenotypesToReadsConverter extends Function2[GenotypeDataset, RDD[Read], ReadDataset] { + def call(v1: GenotypeDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.genotypesToReadsConversionFn(v1, v2) + } +} + +final class GenotypesToSequencesConverter extends Function2[GenotypeDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: GenotypeDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.genotypesToSequencesConversionFn(v1, v2) + } +} + +final class GenotypesToSlicesConverter extends Function2[GenotypeDataset, RDD[Slice], SliceDataset] { + + def call(v1: GenotypeDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.genotypesToSlicesConversionFn(v1, v2) + } +} + +final class 
GenotypesToVariantsConverter extends Function2[GenotypeDataset, RDD[Variant], VariantDataset] { def call(v1: GenotypeDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.genotypesToVariantsConversionFn(v1, v2) } @@ -371,13 +392,224 @@ final class GenotypesToVariantContextConverter extends Function2[GenotypeDataset } } -final class VariantsToContigsConverter extends Function2[VariantDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { +// reads conversion functions + +final class ReadsToCoverageConverter extends Function2[ReadDataset, RDD[Coverage], CoverageDataset] { + + def call(v1: ReadDataset, v2: RDD[Coverage]): CoverageDataset = { + ADAMContext.readsToCoverageConversionFn(v1, v2) + } +} + +final class ReadsToFeaturesConverter extends Function2[ReadDataset, RDD[Feature], FeatureDataset] { + + def call(v1: ReadDataset, v2: RDD[Feature]): FeatureDataset = { + ADAMContext.readsToFeaturesConversionFn(v1, v2) + } +} + +final class ReadsToFragmentsConverter extends Function2[ReadDataset, RDD[Fragment], FragmentDataset] { + + def call(v1: ReadDataset, v2: RDD[Fragment]): FragmentDataset = { + ADAMContext.readsToFragmentsConversionFn(v1, v2) + } +} + +final class ReadsToAlignmentRecordsConverter extends Function2[ReadDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { + + def call(v1: ReadDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { + ADAMContext.readsToAlignmentRecordsConversionFn(v1, v2) + } +} + +final class ReadsToGenotypesConverter extends Function2[ReadDataset, RDD[Genotype], GenotypeDataset] { - def call(v1: VariantDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.variantsToContigsConversionFn(v1, v2) + def call(v1: ReadDataset, v2: RDD[Genotype]): GenotypeDataset = { + ADAMContext.readsToGenotypesConversionFn(v1, v2) } } +final class ReadsToReadsConverter extends Function2[ReadDataset, RDD[Read], ReadDataset] { + + def call(v1: ReadDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.readsToReadsConversionFn(v1, v2) + } +} + +final class ReadsToSequencesConverter extends Function2[ReadDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: ReadDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.readsToSequencesConversionFn(v1, v2) + } +} + +final class ReadsToSlicesConverter extends Function2[ReadDataset, RDD[Slice], SliceDataset] { + + def call(v1: ReadDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.readsToSlicesConversionFn(v1, v2) + } +} + +final class ReadsToVariantsConverter extends Function2[ReadDataset, RDD[Variant], VariantDataset] { + + def call(v1: ReadDataset, v2: RDD[Variant]): VariantDataset = { + ADAMContext.readsToVariantsConversionFn(v1, v2) + } +} + +final class ReadsToVariantContextsConverter extends Function2[ReadDataset, RDD[VariantContext], VariantContextDataset] { + + def call(v1: ReadDataset, v2: RDD[VariantContext]): VariantContextDataset = { + ADAMContext.readsToVariantContextsConversionFn(v1, v2) + } +} + +// sequence conversion functions + +final class SequencesToCoverageConverter extends Function2[SequenceDataset, RDD[Coverage], CoverageDataset] { + + def call(v1: SequenceDataset, v2: RDD[Coverage]): CoverageDataset = { + ADAMContext.sequencesToCoverageConversionFn(v1, v2) + } +} + +final class SequencesToFeaturesConverter extends Function2[SequenceDataset, RDD[Feature], FeatureDataset] { + + def call(v1: SequenceDataset, v2: RDD[Feature]): FeatureDataset = { + ADAMContext.sequencesToFeaturesConversionFn(v1, v2) + } +} + +final class 
SequencesToFragmentsConverter extends Function2[SequenceDataset, RDD[Fragment], FragmentDataset] { + + def call(v1: SequenceDataset, v2: RDD[Fragment]): FragmentDataset = { + ADAMContext.sequencesToFragmentsConversionFn(v1, v2) + } +} + +final class SequencesToAlignmentRecordsConverter extends Function2[SequenceDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { + + def call(v1: SequenceDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { + ADAMContext.sequencesToAlignmentRecordsConversionFn(v1, v2) + } +} + +final class SequencesToGenotypesConverter extends Function2[SequenceDataset, RDD[Genotype], GenotypeDataset] { + + def call(v1: SequenceDataset, v2: RDD[Genotype]): GenotypeDataset = { + ADAMContext.sequencesToGenotypesConversionFn(v1, v2) + } +} + +final class SequencesToReadsConverter extends Function2[SequenceDataset, RDD[Read], ReadDataset] { + + def call(v1: SequenceDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.sequencesToReadsConversionFn(v1, v2) + } +} + +final class SequencesToSequencesConverter extends Function2[SequenceDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: SequenceDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.sequencesToSequencesConversionFn(v1, v2) + } +} + +final class SequencesToSlicesConverter extends Function2[SequenceDataset, RDD[Slice], SliceDataset] { + + def call(v1: SequenceDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.sequencesToSlicesConversionFn(v1, v2) + } +} + +final class SequencesToVariantsConverter extends Function2[SequenceDataset, RDD[Variant], VariantDataset] { + + def call(v1: SequenceDataset, v2: RDD[Variant]): VariantDataset = { + ADAMContext.sequencesToVariantsConversionFn(v1, v2) + } +} + +final class SequencesToVariantContextsConverter extends Function2[SequenceDataset, RDD[VariantContext], VariantContextDataset] { + + def call(v1: SequenceDataset, v2: RDD[VariantContext]): VariantContextDataset = { + ADAMContext.sequencesToVariantContextsConversionFn(v1, v2) + } +} + +// slice conversion functions + +final class SlicesToCoverageConverter extends Function2[SliceDataset, RDD[Coverage], CoverageDataset] { + + def call(v1: SliceDataset, v2: RDD[Coverage]): CoverageDataset = { + ADAMContext.slicesToCoverageConversionFn(v1, v2) + } +} + +final class SlicesToFeaturesConverter extends Function2[SliceDataset, RDD[Feature], FeatureDataset] { + + def call(v1: SliceDataset, v2: RDD[Feature]): FeatureDataset = { + ADAMContext.slicesToFeaturesConversionFn(v1, v2) + } +} + +final class SlicesToFragmentsConverter extends Function2[SliceDataset, RDD[Fragment], FragmentDataset] { + + def call(v1: SliceDataset, v2: RDD[Fragment]): FragmentDataset = { + ADAMContext.slicesToFragmentsConversionFn(v1, v2) + } +} + +final class SlicesToAlignmentRecordsConverter extends Function2[SliceDataset, RDD[AlignmentRecord], AlignmentRecordDataset] { + + def call(v1: SliceDataset, v2: RDD[AlignmentRecord]): AlignmentRecordDataset = { + ADAMContext.slicesToAlignmentRecordsConversionFn(v1, v2) + } +} + +final class SlicesToGenotypesConverter extends Function2[SliceDataset, RDD[Genotype], GenotypeDataset] { + + def call(v1: SliceDataset, v2: RDD[Genotype]): GenotypeDataset = { + ADAMContext.slicesToGenotypesConversionFn(v1, v2) + } +} + +final class SlicesToReadsConverter extends Function2[SliceDataset, RDD[Read], ReadDataset] { + + def call(v1: SliceDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.slicesToReadsConversionFn(v1, v2) + } +} + +final class SlicesToSequencesConverter extends 
Function2[SliceDataset, RDD[Sequence], SequenceDataset] { + + def call(v1: SliceDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.slicesToSequencesConversionFn(v1, v2) + } +} + +final class SlicesToSlicesConverter extends Function2[SliceDataset, RDD[Slice], SliceDataset] { + + def call(v1: SliceDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.slicesToSlicesConversionFn(v1, v2) + } +} + +final class SlicesToVariantsConverter extends Function2[SliceDataset, RDD[Variant], VariantDataset] { + + def call(v1: SliceDataset, v2: RDD[Variant]): VariantDataset = { + ADAMContext.slicesToVariantsConversionFn(v1, v2) + } +} + +final class SlicesToVariantContextsConverter extends Function2[SliceDataset, RDD[VariantContext], VariantContextDataset] { + + def call(v1: SliceDataset, v2: RDD[VariantContext]): VariantContextDataset = { + ADAMContext.slicesToVariantContextsConversionFn(v1, v2) + } +} + +// variants conversion functions + final class VariantsToCoverageConverter extends Function2[VariantDataset, RDD[Coverage], CoverageDataset] { def call(v1: VariantDataset, v2: RDD[Coverage]): CoverageDataset = { @@ -413,8 +645,28 @@ final class VariantsToGenotypesConverter extends Function2[VariantDataset, RDD[G } } -final class VariantsToVariantsConverter extends Function2[VariantDataset, RDD[Variant], VariantDataset] { +final class VariantsToReadsConverter extends Function2[VariantDataset, RDD[Read], ReadDataset] { + + def call(v1: VariantDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.variantsToReadsConversionFn(v1, v2) + } +} + +final class VariantsToSequencesConverter extends Function2[VariantDataset, RDD[Sequence], SequenceDataset] { + def call(v1: VariantDataset, v2: RDD[Sequence]): SequenceDataset = { + ADAMContext.variantsToSequencesConversionFn(v1, v2) + } +} + +final class VariantsToSlicesConverter extends Function2[VariantDataset, RDD[Slice], SliceDataset] { + + def call(v1: VariantDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.variantsToSlicesConversionFn(v1, v2) + } +} + +final class VariantsToVariantsConverter extends Function2[VariantDataset, RDD[Variant], VariantDataset] { def call(v1: VariantDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.variantsToVariantsConversionFn(v1, v2) } @@ -427,12 +679,7 @@ final class VariantsToVariantContextConverter extends Function2[VariantDataset, } } -final class VariantContextsToContigsConverter extends Function2[VariantContextDataset, RDD[NucleotideContigFragment], NucleotideContigFragmentDataset] { - - def call(v1: VariantContextDataset, v2: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - ADAMContext.variantContextsToContigsConversionFn(v1, v2) - } -} +// variant contexts conversion functions final class VariantContextsToCoverageConverter extends Function2[VariantContextDataset, RDD[Coverage], CoverageDataset] { @@ -469,8 +716,28 @@ final class VariantContextsToGenotypesConverter extends Function2[VariantContext } } -final class VariantContextsToVariantsConverter extends Function2[VariantContextDataset, RDD[Variant], VariantDataset] { +final class VariantContextsToReadsConverter extends Function2[VariantContextDataset, RDD[Read], ReadDataset] { + + def call(v1: VariantContextDataset, v2: RDD[Read]): ReadDataset = { + ADAMContext.variantContextsToReadsConversionFn(v1, v2) + } +} + +final class VariantContextsToSequencesConverter extends Function2[VariantContextDataset, RDD[Sequence], SequenceDataset] { + def call(v1: VariantContextDataset, v2: RDD[Sequence]): SequenceDataset = { + 
ADAMContext.variantContextsToSequencesConversionFn(v1, v2) + } +} + +final class VariantContextsToSlicesConverter extends Function2[VariantContextDataset, RDD[Slice], SliceDataset] { + + def call(v1: VariantContextDataset, v2: RDD[Slice]): SliceDataset = { + ADAMContext.variantContextsToSlicesConversionFn(v1, v2) + } +} + +final class VariantContextsToVariantsConverter extends Function2[VariantContextDataset, RDD[Variant], VariantDataset] { def call(v1: VariantContextDataset, v2: RDD[Variant]): VariantDataset = { ADAMContext.variantContextsToVariantsConversionFn(v1, v2) } diff --git a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala index 31e0e15781..14f679452b 100644 --- a/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala +++ b/adam-apis/src/main/scala/org/bdgenomics/adam/api/java/JavaADAMContext.scala @@ -21,10 +21,10 @@ import htsjdk.samtools.ValidationStringency import org.apache.spark.api.java.JavaSparkContext import org.bdgenomics.adam.models.ReferenceRegion import org.bdgenomics.adam.rdd.ADAMContext -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset -import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, ReadDataset } +import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset } import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset @@ -131,26 +131,6 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { ac.loadIndexedBam(pathName, viewRegions.toIterable, stringency = stringency) } - /** - * (Java-specific) Load nucleotide contig fragments into a NucleotideContigFragmentDataset. - * - * If the path name has a .fa/.fasta extension, load as FASTA format. - * Else, fall back to Parquet + Avro. - * - * For FASTA format, compressed files are supported through compression codecs configured - * in Hadoop, which by default include .gz and .bz2, but can include more. - * - * @see ADAMContext#loadContigFragments - * - * @param pathName The path name to load nucleotide contig fragments from. - * Globs/directories are supported, although file extension must be present - * for FASTA format. - * @return Returns a NucleotideContigFragmentDataset. - */ - def loadContigFragments(pathName: java.lang.String): NucleotideContigFragmentDataset = { - ac.loadContigFragments(pathName) - } - /** * (Java-specific) Load fragments into a FragmentDataset. * @@ -390,10 +370,10 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { /** * (Java-specific) Load reference sequences into a broadcastable ReferenceFile. * - * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadContigFragments + * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices * to load the reference as an RDD, which is then collected to the driver. * - * @see loadContigFragments + * @see ADAMContext#loadSlices * * @param pathName The path name to load reference sequences from. * Globs/directories for 2bit format are not supported. @@ -409,11 +389,11 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { /** * (Java-specific) Load reference sequences into a broadcastable ReferenceFile. * - * If the path name has a .2bit extension, loads a 2bit file. 
Else, uses loadContigFragments + * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices * to load the reference as an RDD, which is then collected to the driver. Uses a * maximum fragment length of 10kbp. * - * @see loadContigFragments + * @see ADAMContext#loadSlices * * @param pathName The path name to load reference sequences from. * Globs/directories for 2bit format are not supported. @@ -422,4 +402,90 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable { def loadReferenceFile(pathName: java.lang.String): ReferenceFile = { loadReferenceFile(pathName, 10000L) } + + /** + * (Java-specific) Load DNA sequences into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see ADAMContext#loadFastaDna + * @see ADAMContext#loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @return Returns a SequenceDataset containing DNA sequences. + */ + def loadDnaSequences(pathName: java.lang.String): SequenceDataset = { + ac.loadDnaSequences(pathName) + } + + /** + * (Java-specific) Load protein sequences into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see ADAMContext#loadFastaProtein + * @see ADAMContext#loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @return Returns a SequenceDataset containing protein sequences. + */ + def loadProteinSequences(pathName: java.lang.String): SequenceDataset = { + ac.loadProteinSequences(pathName) + } + + /** + * (Java-specific) Load RNA sequences into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see ADAMContext#loadFastaRna + * @see ADAMContext#loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @return Returns a SequenceDataset containing RNA sequences. + */ + def loadRnaSequences(pathName: java.lang.String): SequenceDataset = { + ac.loadRnaSequences(pathName) + } + + /** + * (Java-specific) Load slices into a SliceDataset. + * + * If the path name has a .fa/.fasta extension, load as DNA in FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @param pathName The path name to load DNA slices from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @param maximumLength Maximum fragment length. 
Values greater + * than 1e9 should be avoided. + * @return Returns a SliceDataset. + */ + def loadSlices( + pathName: java.lang.String, + maximumLength: java.lang.Long): SliceDataset = { + + ac.loadSlices(pathName, maximumLength) + } } diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java new file mode 100644 index 0000000000..0e88d75a1f --- /dev/null +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSequenceConduit.java @@ -0,0 +1,44 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.api.java; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.bdgenomics.adam.rdd.ADAMContext; +import org.bdgenomics.adam.rdd.sequence.SequenceDataset; + +/** + * A simple test class for the JavaADAMRDD/Context. Writes an RDD of sequences + * to disk and reads it back. + */ +final class JavaADAMSequenceConduit { + public static SequenceDataset conduit(final SequenceDataset sequenceDataset, + final ADAMContext ac) throws IOException { + + // make temp directory and save file + Path tempDir = Files.createTempDirectory("javaAC"); + String fileName = tempDir.toString() + "/testRdd.sequences.adam"; + sequenceDataset.save(fileName, true, true); + + // create a new adam context and load the file + JavaADAMContext jac = new JavaADAMContext(ac); + return jac.loadDnaSequences(fileName); + } +} diff --git a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java similarity index 70% rename from adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java rename to adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java index fe732bb02c..eead1baf97 100644 --- a/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMContigConduit.java +++ b/adam-apis/src/test/java/org/bdgenomics/adam/api/java/JavaADAMSliceConduit.java @@ -20,24 +20,25 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.bdgenomics.adam.rdd.ADAMContext; -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset; +import org.bdgenomics.adam.rdd.sequence.SliceDataset; /** - * A simple test class for the JavaADAMRDD/Context. Writes an RDD of nucleotide - * contig fragments to disk and reads it back. + * A simple test class for the JavaADAMRDD/Context. Writes an RDD of slices + * to disk and reads it back. 
*/ -final class JavaADAMContigConduit { - public static NucleotideContigFragmentDataset conduit(final NucleotideContigFragmentDataset recordRdd, - final ADAMContext ac) throws IOException { +final class JavaADAMSliceConduit { + public static SliceDataset conduit(final SliceDataset sliceDataset, + final ADAMContext ac) throws IOException { // make temp directory and save file Path tempDir = Files.createTempDirectory("javaAC"); - String fileName = tempDir.toString() + "/testRdd.contig.adam"; - recordRdd.save(fileName, true); + String fileName = tempDir.toString() + "/testRdd.slices.adam"; + sliceDataset.save(fileName, true, true); // create a new adam context and load the file JavaADAMContext jac = new JavaADAMContext(ac); - return jac.loadContigFragments(fileName); + return jac.loadSlices(fileName, 10000L); } } diff --git a/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala b/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala index 12dfbbecd5..f912aee6ab 100644 --- a/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala +++ b/adam-apis/src/test/scala/org/bdgenomics/adam/api/java/JavaADAMContextSuite.scala @@ -49,16 +49,6 @@ class JavaADAMContextSuite extends ADAMFunSuite { assert(reads.rdd.count == 2) } - sparkTest("can read and write a small FASTA file") { - val path = copyResource("chr20.250k.fa.gz") - val aRdd = jac.loadContigFragments(path) - assert(aRdd.jrdd.count() === 26) - - val newRdd = JavaADAMContigConduit.conduit(aRdd, sc) - - assert(newRdd.jrdd.count() === 26) - } - sparkTest("can read and write a small .SAM file as fragments") { val path = copyResource("small.sam") val aRdd = jac.loadFragments(path) @@ -114,4 +104,24 @@ class JavaADAMContextSuite extends ADAMFunSuite { val refFile = jac.loadReferenceFile(path) assert(refFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) === "CATCACGATG") } + + sparkTest("can read and write .fa as sequences") { + val path = copyResource("trinity.fa") + val sequences = jac.loadDnaSequences(path) + assert(sequences.jrdd.count() === 5) + + val newRdd = JavaADAMSequenceConduit.conduit(sequences, sc) + + assert(newRdd.jrdd.count() === 5) + } + + sparkTest("can read and write .fa as slices") { + val path = copyResource("trinity.fa") + val slices = jac.loadSlices(path, 10000L) + assert(slices.jrdd.count() === 5) + + val newRdd = JavaADAMSliceConduit.conduit(slices, sc) + + assert(newRdd.jrdd.count() === 5) + } } diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala deleted file mode 100644 index 5af5aae2d7..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import grizzled.slf4j.Logging -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.cli.FileSystemUtils._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.NucleotideContigFragment -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.{ Argument, Option => Args4jOption } - -class ADAM2FastaArgs extends Args4jBase { - @Argument(required = true, metaVar = "ADAM", usage = "The Parquet file to convert", index = 0) - var inputPath: String = null - @Argument(required = true, metaVar = "FASTA", usage = "Location to write the FASTA to", index = 1) - var outputPath: String = null - @Args4jOption(required = false, name = "-single", usage = "Saves FASTA as single file") - var asSingleFile: Boolean = false - @Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output") - var disableFastConcat: Boolean = false - @Args4jOption(required = false, name = "-coalesce", usage = "Choose the number of partitions to coalesce down to.") - var coalesce: Int = -1 - @Args4jOption(required = false, name = "-force_shuffle_coalesce", usage = "Force shuffle while partitioning, default false.") - var forceShuffle: Boolean = false - @Args4jOption(required = false, name = "-line_width", usage = "Hard wrap FASTA formatted sequence at line width, default 60") - var lineWidth: Int = 60 -} - -object ADAM2Fasta extends BDGCommandCompanion { - override val commandName = "adam2fasta" - override val commandDescription = "Convert ADAM nucleotide contig fragments to FASTA files" - - override def apply(cmdLine: Array[String]): ADAM2Fasta = - new ADAM2Fasta(Args4j[ADAM2FastaArgs](cmdLine)) -} - -class ADAM2Fasta(val args: ADAM2FastaArgs) extends BDGSparkCommand[ADAM2FastaArgs] with Logging { - override val companion = ADAM2Fasta - - override def run(sc: SparkContext): Unit = { - checkWriteablePath(args.outputPath, sc.hadoopConfiguration) - - info("Loading ADAM nucleotide contig fragments from disk.") - val contigFragments = sc.loadContigFragments(args.inputPath) - - info("Merging fragments and writing FASTA to disk.") - val contigs = contigFragments.mergeFragments() - - val cc = if (args.coalesce > 0) { - if (args.coalesce > contigs.rdd.partitions.length || args.forceShuffle) { - contigs.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(args.coalesce, shuffle = true)) - } else { - contigs.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(args.coalesce, shuffle = false)) - } - } else { - contigs - } - cc.saveAsFasta( - args.outputPath, - args.lineWidth, - asSingleFile = args.asSingleFile, - disableFastConcat = args.disableFastConcat - ) - } -} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala index a816273b77..0fb688564d 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAMMain.scala @@ -33,10 +33,12 @@ object ADAMMain { "ADAM ACTIONS", List( CountReadKmers, - CountContigKmers, + CountSliceKmers, TransformAlignments, TransformFeatures, TransformGenotypes, + TransformSequences, + TransformSlices, TransformVariants, MergeShards, Reads2Coverage @@ -45,8 +47,6 @@ object ADAMMain { CommandGroup( "CONVERSION OPERATIONS", List( - Fasta2ADAM, - ADAM2Fasta, ADAM2Fastq, 
TransformFragments ) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala similarity index 79% rename from adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala rename to adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala index b664d9c4bf..85b6061470 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountContigKmers.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CountSliceKmers.scala @@ -24,16 +24,16 @@ import org.bdgenomics.adam.cli.FileSystemUtils._ import org.bdgenomics.utils.cli._ import org.kohsuke.args4j.{ Argument, Option => Args4jOption } -object CountContigKmers extends BDGCommandCompanion { - val commandName = "countContigKmers" - val commandDescription = "Counts the k-mers/q-mers from a read dataset." +object CountSliceKmers extends BDGCommandCompanion { + val commandName = "countSliceKmers" + val commandDescription = "Counts the k-mers/q-mers from a slice dataset." def apply(cmdLine: Array[String]) = { - new CountContigKmers(Args4j[CountContigKmersArgs](cmdLine)) + new CountSliceKmers(Args4j[CountSliceKmersArgs](cmdLine)) } } -class CountContigKmersArgs extends Args4jBase with ParquetArgs { +class CountSliceKmersArgs extends Args4jBase with ParquetArgs { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM or FASTA file to count kmers from", index = 0) var inputPath: String = null @Argument(required = true, metaVar = "OUTPUT", usage = "Location for storing k-mer counts", index = 1) @@ -44,17 +44,17 @@ class CountContigKmersArgs extends Args4jBase with ParquetArgs { var printHistogram: Boolean = false } -class CountContigKmers(protected val args: CountContigKmersArgs) extends BDGSparkCommand[CountContigKmersArgs] with Logging { - val companion = CountContigKmers +class CountSliceKmers(protected val args: CountSliceKmersArgs) extends BDGSparkCommand[CountSliceKmersArgs] with Logging { + val companion = CountSliceKmers def run(sc: SparkContext) { checkWriteablePath(args.outputPath, sc.hadoopConfiguration) // read from disk - val fragments = sc.loadContigFragments(args.inputPath) + val slices = sc.loadSlices(args.inputPath) // count kmers - val countedKmers = fragments.countKmers(args.kmerLength) + val countedKmers = slices.countKmers(args.kmerLength) // print histogram, if requested if (args.printHistogram) { diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala deleted file mode 100644 index 36d525aab2..0000000000 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Fasta2ADAM.scala +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.cli - -import grizzled.slf4j.Logging -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.cli.FileSystemUtils._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.formats.avro.NucleotideContigFragment -import org.bdgenomics.utils.cli._ -import org.kohsuke.args4j.{ Argument, Option => Args4jOption } - -object Fasta2ADAM extends BDGCommandCompanion { - val commandName: String = "fasta2adam" - val commandDescription: String = "Converts a text FASTA sequence file into an ADAMNucleotideContig Parquet file which represents assembled sequences." - - def apply(cmdLine: Array[String]) = { - new Fasta2ADAM(Args4j[Fasta2ADAMArgs](cmdLine)) - } -} - -class Fasta2ADAMArgs extends Args4jBase with ParquetSaveArgs { - @Argument(required = true, metaVar = "FASTA", usage = "The FASTA file to convert", index = 0) - var fastaFile: String = null - @Argument(required = true, metaVar = "ADAM", usage = "Location to write ADAM data", index = 1) - var outputPath: String = null - @Args4jOption(required = false, name = "-verbose", usage = "Prints enhanced debugging info, including contents of seq dict.") - var verbose: Boolean = false - @Args4jOption(required = false, name = "-reads", usage = "Maps contig IDs to match contig IDs of reads.") - var reads: String = "" - @Args4jOption(required = false, name = "-fragment_length", usage = "Sets maximum fragment length. Default value is 10,000. Values greater than 1e9 should be avoided.") - var maximumLength: Long = 10000L - @Args4jOption(required = false, name = "-repartition", usage = "Sets the number of output partitions to write, if desired.") - var partitions: Int = -1 -} - -class Fasta2ADAM(protected val args: Fasta2ADAMArgs) extends BDGSparkCommand[Fasta2ADAMArgs] with Logging { - val companion = Fasta2ADAM - - def run(sc: SparkContext) { - checkWriteablePath(args.outputPath, sc.hadoopConfiguration) - - info("Loading FASTA data from disk.") - val adamFasta = sc.loadFasta(args.fastaFile, maximumLength = args.maximumLength) - - if (args.verbose) { - info("FASTA contains: %s".format(adamFasta.sequences.toString)) - } - - info("Writing records to disk.") - val finalFasta = if (args.partitions > 0) { - adamFasta.transform((rdd: RDD[NucleotideContigFragment]) => rdd.repartition(args.partitions)) - } else { - adamFasta - } - - finalFasta.saveAsParquet(args) - } -} - diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala index 88859800d2..9ddc232e1a 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformFeatures.scala @@ -34,11 +34,11 @@ object TransformFeatures extends BDGCommandCompanion { class TransformFeaturesArgs extends Args4jBase with ParquetSaveArgs { @Argument(required = true, metaVar = "INPUT", - usage = "The features file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0) + usage = "The feature file to convert (e.g., .bed, .gff/.gtf, .gff3, .interval_list, .narrowPeak). If extension is not detected, Parquet is assumed.", index = 0) var featuresFile: String = _ @Argument(required = true, metaVar = "OUTPUT", - usage = "Location to write ADAM features data. If extension is not detected, Parquet is assumed.", index = 1) + usage = "Location to write ADAM feature data. 
If extension is not detected, Parquet is assumed.", index = 1) var outputPath: String = null @Args4jOption(required = false, name = "-num_partitions", diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala new file mode 100644 index 0000000000..6824973387 --- /dev/null +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSequences.scala @@ -0,0 +1,71 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.cli + +import org.apache.spark.SparkContext +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.formats.avro.Alphabet; +import org.bdgenomics.utils.cli._ +import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption } + +object TransformSequences extends BDGCommandCompanion { + val commandName = "transformSequences" + val commandDescription = "Convert a FASTA file as sequences into corresponding ADAM format and vice versa" + + def apply(cmdLine: Array[String]) = { + new TransformSequences(Args4j[TransformSequencesArgs](cmdLine)) + } +} + +class TransformSequencesArgs extends Args4jBase with ParquetSaveArgs { + @Argument(required = true, metaVar = "INPUT", + usage = "The sequence file to convert (e.g., .fa, .fasta). If extension is not detected, Parquet is assumed.", index = 0) + var sequencesFile: String = _ + + @Argument(required = true, metaVar = "OUTPUT", + usage = "Location to write ADAM sequence data. If extension is not detected, Parquet is assumed.", index = 1) + var outputPath: String = null + + @Args4jOption(required = false, name = "-single", + usage = "Save as a single file, for the text formats.") + var single: Boolean = false + + @Args4jOption(required = false, name = "-alphabet", + usage = "Alphabet in which to interpret the loaded sequences { DNA, PROTEIN, RNA }. 
Defaults to Alphabet.DNA.") + var alphabet: String = "DNA" + + @Args4jOption(required = false, name = "-disable_fast_concat", + usage = "Disables the parallel file concatenation engine.") + var disableFastConcat: Boolean = false +} + +class TransformSequences(val args: TransformSequencesArgs) + extends BDGSparkCommand[TransformSequencesArgs] { + + val companion = TransformSequences + val alphabet = Alphabet.valueOf(args.alphabet) + + def run(sc: SparkContext) { + val sequences = alphabet match { + case Alphabet.DNA => sc.loadDnaSequences(args.sequencesFile, optPredicate = None, optProjection = None) + case Alphabet.PROTEIN => sc.loadProteinSequences(args.sequencesFile, optPredicate = None, optProjection = None) + case Alphabet.RNA => sc.loadRnaSequences(args.sequencesFile, optPredicate = None, optProjection = None) + } + sequences.save(args.outputPath, args.single, args.disableFastConcat) + } +} diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala new file mode 100644 index 0000000000..a4d15fd1c8 --- /dev/null +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/TransformSlices.scala @@ -0,0 +1,69 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.cli + +import org.apache.spark.SparkContext +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.utils.cli._ +import org.kohsuke.args4j.{ Argument, Option ⇒ Args4jOption } + +object TransformSlices extends BDGCommandCompanion { + val commandName = "transformSlices" + val commandDescription = "Convert a FASTA file as slices into corresponding ADAM format and vice versa" + + def apply(cmdLine: Array[String]) = { + new TransformSlices(Args4j[TransformSlicesArgs](cmdLine)) + } +} + +class TransformSlicesArgs extends Args4jBase with ParquetSaveArgs { + @Argument(required = true, metaVar = "INPUT", + usage = "The slice file to convert (e.g., .fa, .fasta). If extension is not detected, Parquet is assumed.", index = 0) + var slicesFile: String = _ + + @Argument(required = true, metaVar = "OUTPUT", + usage = "Location to write ADAM slice data. If extension is not detected, Parquet is assumed.", index = 1) + var outputPath: String = null + + @Args4jOption(required = false, name = "-maximum_length", + usage = "Maximum slice length. 
Defaults to 10000L.") + var maximumLength: Long = 10000L + + @Args4jOption(required = false, name = "-single", + usage = "Save as a single file, for the text formats.") + var single: Boolean = false + + @Args4jOption(required = false, name = "-disable_fast_concat", + usage = "Disables the parallel file concatenation engine.") + var disableFastConcat: Boolean = false +} + +class TransformSlices(val args: TransformSlicesArgs) + extends BDGSparkCommand[TransformSlicesArgs] { + + val companion = TransformSlices + + def run(sc: SparkContext) { + sc.loadSlices( + args.slicesFile, + maximumLength = args.maximumLength, + optPredicate = None, + optProjection = None + ).save(args.outputPath, args.single, args.disableFastConcat) + } +} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala deleted file mode 100644 index fcdef5e73d..0000000000 --- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/ADAM2FastaSuite.scala +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import com.google.common.io.Files -import java.io.File -import org.bdgenomics.adam.util.ADAMFunSuite -import org.bdgenomics.utils.cli._ - -class ADAM2FastaSuite extends ADAMFunSuite { - - sparkTest("round trip FASTA to nucleotide contig fragments in ADAM format to FASTA") { - val fastaFile = testFile("contigs.fa") - - val outputDir = Files.createTempDir() - val outputContigFragmentsFile = outputDir.getAbsolutePath + "/contigs.adam" - val outputFastaFile = outputDir.getAbsolutePath + "/contigs.fa" - - val args0: Array[String] = Array(fastaFile, outputContigFragmentsFile) - new Fasta2ADAM(Args4j[Fasta2ADAMArgs](args0)).run(sc) - - val args1: Array[String] = Array(outputContigFragmentsFile, outputFastaFile) - new ADAM2Fasta(Args4j[ADAM2FastaArgs](args1)).run(sc) - - val fastaLines = scala.io.Source.fromFile(new File(fastaFile)).getLines().toSeq - val outputFastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(outputFastaLines.length === fastaLines.length) - outputFastaLines.zip(fastaLines).foreach(kv => assert(kv._1 === kv._2)) - } -} diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala b/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala deleted file mode 100644 index ff5e87d4bc..0000000000 --- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/Fasta2ADAMSuite.scala +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.cli - -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.ADAMFunSuite - -class Fasta2ADAMSuite extends ADAMFunSuite { - sparkTest("can load fasta records after conversion") { - val inputPath = copyResource("chr20.250k.fa.gz") - val convertPath = tmpFile("chr20.contig.adam") - val cmd = Fasta2ADAM(Array(inputPath, convertPath)).run(sc) - - val contigFragments = sc.loadParquetContigFragments(convertPath) - assert(contigFragments.rdd.count() === 26) - val first = contigFragments.rdd.first() - assert(first.getContigName === null) - assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") - assert(first.getIndex === 0) - assert(first.getSequence.length === 10000) - assert(first.getStart === 0L) - assert(first.getEnd === 10000L) - assert(first.getFragments === 26) - } -} diff --git a/adam-core/pom.xml b/adam-core/pom.xml index fb1ba39122..90ebb73448 100644 --- a/adam-core/pom.xml +++ b/adam-core/pom.xml @@ -106,10 +106,12 @@ org.bdgenomics.formats.avro.Feature org.bdgenomics.formats.avro.Fragment org.bdgenomics.formats.avro.Genotype - org.bdgenomics.formats.avro.NucleotideContigFragment org.bdgenomics.formats.avro.OntologyTerm org.bdgenomics.formats.avro.ProcessingStep org.bdgenomics.formats.avro.Reference + org.bdgenomics.formats.avro.Read + org.bdgenomics.formats.avro.Sequence + org.bdgenomics.formats.avro.Slice org.bdgenomics.formats.avro.TranscriptEffect org.bdgenomics.formats.avro.Variant org.bdgenomics.formats.avro.VariantAnnotation @@ -134,7 +136,6 @@ org.bdgenomics.formats.avro.Feature org.bdgenomics.formats.avro.Fragment org.bdgenomics.formats.avro.Genotype - org.bdgenomics.formats.avro.NucleotideContigFragment org.bdgenomics.formats.avro.OntologyTerm org.bdgenomics.formats.avro.Read org.bdgenomics.formats.avro.ReadGroup diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSequenceConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSequenceConverter.scala new file mode 100644 index 0000000000..0a5353fd24 --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSequenceConverter.scala @@ -0,0 +1,236 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.converters + +import org.apache.spark.rdd.RDD +import org.bdgenomics.formats.avro.{ Alphabet, Sequence } +import scala.collection.mutable + +/** + * Object for converting an RDD containing FASTA sequence data into ADAM FASTA data. + */ +private[adam] object FastaSequenceConverter { + + /** + * Case class that describes a line in FASTA that begins with a ">". + * + * In FASTA, a sequence starts with a line that begins with a ">" and that + * gives the sequence name, and optionally, miscellaneous information about + * the sequence. If the file contains a single line, this description line + * can be omitted. + * + * @param fileIndex The line number where this line was seen in the file. + * @param seqId The index of this sequence in the file. + * @param descriptionLine An optional string that describes the FASTA line. + */ + case class FastaDescriptionLine(fileIndex: Long = -1L, + seqId: Int = 0, + descriptionLine: Option[String] = None) { + /** + * The contig name and description that was parsed out of this description line. + */ + val (contigName, contigDescription) = parseDescriptionLine(descriptionLine, fileIndex) + + /** + * Parses the text of a given line. + * + * Assumes that the line contains the contig name followed by an optional + * description of the contig, with the two separated by a space. + * + * @throws IllegalArgumentException if there is no name in the line and the + * line is not the only record in a file (i.e., the file contains multiple + * contigs). + * @param descriptionLine The optional string describing the contig. If this + * is not set and this isn't the only line in the file, we throw. + * @param id The index of this contig in the file. + * @return Returns a tuple containing (the optional contig name, and the + * optional contig description). + */ + private def parseDescriptionLine(descriptionLine: Option[String], + id: Long): (Option[String], Option[String]) = { + descriptionLine.fold { + require(id == -1L, "Cannot have a headerless line in a file with more than one sequence.") + (None: Option[String], None: Option[String]) + } { (dL) => + // fasta description line splits on whitespace + val splitIndex = dL.indexWhere(c => c.isWhitespace) + if (splitIndex >= 0) { + val split = dL.splitAt(splitIndex) + + // is this description metadata or not? if it is metadata, it will contain "|" + if (split._1.contains('|')) { + (None, Some(dL.stripPrefix(">").trim)) + } else { + val contigName: String = split._1.stripPrefix(">").trim + val contigDescription: String = split._2.trim + + (Some(contigName), Some(contigDescription)) + } + } else { + (Some(dL.stripPrefix(">").trim), None) + } + } + } + } + + /** + * Converts an RDD containing ints and strings into an RDD containing ADAM sequences. + * + * @note Input dataset is assumed to have come in from a Hadoop TextInputFormat reader. This sets + * a specific format for the RDD's Key-Value pairs. + * @throws AssertionError Thrown if there appear to be multiple sequences in a single file + * that do not have descriptions. + * @throws IllegalArgumentException Thrown if a sequence does not have sequence data. + * @param alphabet Alphabet in which to interpret the sequences to convert. + * @param rdd RDD containing Long,String tuples, where the Long corresponds to the number + * of the file line, and the String is the line of the file. + * @return An RDD of ADAM FASTA data. 
+ */ + def apply(alphabet: Alphabet, rdd: RDD[(Long, String)]): RDD[Sequence] = { + val filtered = rdd.map(kv => (kv._1, kv._2.trim())) + .filter((kv: (Long, String)) => !kv._2.startsWith(";")) + + val descriptionLines: Map[Long, FastaDescriptionLine] = getDescriptionLines(filtered) + val indexToContigDescription = rdd.context.broadcast(descriptionLines) + + val sequenceLines = filtered.filter(kv => !isDescriptionLine(kv._2)) + + val keyedSequences = + if (indexToContigDescription.value.isEmpty) { + sequenceLines.keyBy(kv => -1L) + } else { + sequenceLines.keyBy(row => findContigIndex(row._1, indexToContigDescription.value.keys.toList)) + } + + val groupedContigs = keyedSequences.groupByKey() + + val converter = new FastaSequenceConverter(alphabet) + + groupedContigs.flatMap { + case (id, lines) => + + val descriptionLine = indexToContigDescription.value.getOrElse(id, FastaDescriptionLine()) + assert(lines.nonEmpty, s"Sequence ${descriptionLine.seqId} has no sequence data.") + + val sequence: Seq[String] = lines.toSeq.sortBy(_._1).map(kv => cleanSequence(kv._2)) + converter.convert( + descriptionLine.contigName, + descriptionLine.seqId, + sequence, + descriptionLine.contigDescription + ) + } + } + + /** + * Cleans up a sequence by stripping asterisks at the end of the sequence. + * + * To be consistent with a legacy database, some FASTA sequences end in a "*" + * suffix. This method strips that suffix from the end of the sequence. + * + * @param sequence The sequence to clean. + * @return Sequence minus "*" suffix. + */ + private def cleanSequence(sequence: String): String = { + sequence.stripSuffix("*") + } + + /** + * A FASTA line starting with ">" is a description line. + * + * @param line The line to check. + * @return True if the line starts with ">" and is thus a description line. + */ + private def isDescriptionLine(line: String): Boolean = { + line.startsWith(">") + } + + /** + * Gets the description lines in a FASTA file. + * + * Filters an input RDD that contains (line number, line) pairs and returns + * all lines that are descriptions of a sequence. + * + * @param rdd RDD of (line number, line string) pairs to filter. + * @return Returns a map that maps sequence IDs to description lines. + */ + private[converters] def getDescriptionLines(rdd: RDD[(Long, String)]): Map[Long, FastaDescriptionLine] = { + + rdd.filter(kv => isDescriptionLine(kv._2)) + .collect() + .zipWithIndex + .map(kv => (kv._1._1, FastaDescriptionLine(kv._1._1, kv._2, Some(kv._1._2)))) + .toMap + } + + /** + * Finds the index of a contig. + * + * The index of a contig is the highest index below the index of our row. + * Here, we define the index as the row number of the description line that + * describes this contig. + * + * @param rowIdx The row number of the contig row to check. + * @param indices A list containing the row numbers of all description lines. + * @return Returns the row index of the description line that describes this + * sequence line. + */ + private[converters] def findContigIndex(rowIdx: Long, indices: List[Long]): Long = { + val idx = indices.filter(_ <= rowIdx) + idx.max + } +} + +/** + * Conversion methods for single FASTA sequences into ADAM FASTA data. + * + * @param alphabet Alphabet in which to interpret the sequences to convert. + */ +private[converters] class FastaSequenceConverter(alphabet: Alphabet) extends Serializable { + + /** + * Converts a single FASTA sequence into an ADAM sequence. + * + * @throws IllegalArgumentException Thrown if sequence contains an illegal character. 
+ * @param name String option for the sequence name. + * @param id Numerical identifier for the sequence. + * @param sequenceLines Nucleotide sequence lines. + * @param description Optional description of the sequence. + * @return The converted ADAM FASTA sequence. + */ + def convert( + name: Option[String], + id: Int, + sequenceLines: Seq[String], + description: Option[String]): Seq[Sequence] = { + + val sequence = sequenceLines.mkString + val length = sequence.length() + + val builder = Sequence.newBuilder() + .setAlphabet(alphabet) + .setSequence(sequence) + .setLength(length) + + // map over optional fields + name.foreach(builder.setName(_)) + description.foreach(builder.setDescription(_)) + + Seq(builder.build()) + } +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSliceConverter.scala similarity index 64% rename from adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSliceConverter.scala index 73b9e380b8..0b464f1bd1 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FastaSliceConverter.scala @@ -18,16 +18,13 @@ package org.bdgenomics.adam.converters import org.apache.spark.rdd.RDD -import org.bdgenomics.formats.avro.{ NucleotideContigFragment, Reference } +import org.bdgenomics.formats.avro.{ Alphabet, Slice } import scala.collection.mutable /** * Object for converting an RDD containing FASTA sequence data into ADAM FASTA data. */ -private[adam] object FastaConverter { - - // nucleotide + amino acid + ambiguity codes covers a-z - private val fastaRegex = "^[a-zA-Z]+$".r +private[adam] object FastaSliceConverter { /** * Case class that describes a line in FASTA that begins with a ">". @@ -45,30 +42,29 @@ private[adam] object FastaConverter { seqId: Int = 0, descriptionLine: Option[String] = None) { /** - * The reference name and description that was parsed out of this description line. + * The contig name and description that was parsed out of this description line. */ - val (referenceName, referenceDescription) = parseDescriptionLine(descriptionLine, fileIndex) + val (contigName, contigDescription) = parseDescriptionLine(descriptionLine, fileIndex) /** * Parses the text of a given line. * - * Assumes that the line contains the reference name followed by an optional - * description of the reference, with the two separated by a space. + * Assumes that the line contains the contig name followed by an optional + * description of the contig, with the two separated by a space. * * @throws IllegalArgumentException if there is no name in the line and the * line is not the only record in a file (i.e., the file contains multiple - * reference sequences). - * - * @param descriptionLine The optional string describing the reference. If this + * contigs). + * @param descriptionLine The optional string describing the contig. If this * is not set and this isn't the only line in the file, we throw. - * @param id The index of this reference in the file. - * @return Returns a tuple containing (the optional reference name, and the - * optional reference description). + * @param id The index of this contig in the file. + * @return Returns a tuple containing (the optional contig name, and the + * optional contig description). 
*/ private def parseDescriptionLine(descriptionLine: Option[String], id: Long): (Option[String], Option[String]) = { descriptionLine.fold { - require(id == -1L, "Cannot have a headerless line in a file with more than one fragment.") + require(id == -1L, "Cannot have a headerless line in a file with more than one sequence.") (None: Option[String], None: Option[String]) } { (dL) => // fasta description line splits on whitespace @@ -80,10 +76,10 @@ private[adam] object FastaConverter { if (split._1.contains('|')) { (None, Some(dL.stripPrefix(">").trim)) } else { - val referenceName: String = split._1.stripPrefix(">").trim - val referenceDescription: String = split._2.trim + val contigName: String = split._1.stripPrefix(">").trim + val contigDescription: String = split._2.trim - (Some(referenceName), Some(referenceDescription)) + (Some(contigName), Some(contigDescription)) } } else { (Some(dL.stripPrefix(">").trim), None) @@ -93,61 +89,53 @@ private[adam] object FastaConverter { } /** - * Converts an RDD containing ints and strings into an RDD containing ADAM nucleotide - * contig fragments. + * Converts an RDD containing ints and strings into an RDD containing ADAM slices. * * @note Input dataset is assumed to have come in from a Hadoop TextInputFormat reader. This sets * a specific format for the RDD's Key-Value pairs. - * * @throws AssertionError Thrown if there appear to be multiple sequences in a single file * that do not have descriptions. * @throws IllegalArgumentException Thrown if a sequence does not have sequence data. - * * @param rdd RDD containing Long,String tuples, where the Long corresponds to the number * of the file line, and the String is the line of the file. - * @param maximumLength Maximum fragment length. Defaults to 10000L. Values greater + * @param maximumLength Maximum slice length. Defaults to 10000L. Values greater * than 1e9 should be avoided. * @return An RDD of ADAM FASTA data. 
*/ def apply( rdd: RDD[(Long, String)], - maximumLength: Long = 10000L): RDD[NucleotideContigFragment] = { - - def isFasta(line: String): Boolean = { - line.startsWith(">") || fastaRegex.pattern.matcher(line).matches() - } - + maximumLength: Long = 10000L): RDD[Slice] = { val filtered = rdd.map(kv => (kv._1, kv._2.trim())) - .filter((kv: (Long, String)) => isFasta(kv._2)) + .filter((kv: (Long, String)) => !kv._2.startsWith(";")) val descriptionLines: Map[Long, FastaDescriptionLine] = getDescriptionLines(filtered) - val indexToReferenceDescription = rdd.context.broadcast(descriptionLines) + val indexToContigDescription = rdd.context.broadcast(descriptionLines) val sequenceLines = filtered.filter(kv => !isDescriptionLine(kv._2)) val keyedSequences = - if (indexToReferenceDescription.value.isEmpty) { + if (indexToContigDescription.value.isEmpty) { sequenceLines.keyBy(kv => -1L) } else { - sequenceLines.keyBy(row => findReferenceIndex(row._1, indexToReferenceDescription.value.keys.toList)) + sequenceLines.keyBy(row => findContigIndex(row._1, indexToContigDescription.value.keys.toList)) } - val groupedReferences = keyedSequences.groupByKey() + val groupedContigs = keyedSequences.groupByKey() - val converter = new FastaConverter(maximumLength) + val converter = new FastaSliceConverter(maximumLength) - groupedReferences.flatMap { + groupedContigs.flatMap { case (id, lines) => - val descriptionLine = indexToReferenceDescription.value.getOrElse(id, FastaDescriptionLine()) + val descriptionLine = indexToContigDescription.value.getOrElse(id, FastaDescriptionLine()) assert(lines.nonEmpty, s"Sequence ${descriptionLine.seqId} has no sequence data.") val sequence: Seq[String] = lines.toSeq.sortBy(_._1).map(kv => cleanSequence(kv._2)) converter.convert( - descriptionLine.referenceName, + descriptionLine.contigName, descriptionLine.seqId, sequence, - descriptionLine.referenceDescription + descriptionLine.contigDescription ) } } @@ -194,18 +182,18 @@ private[adam] object FastaConverter { } /** - * Finds the index of a reference sequence. + * Finds the index of a contig. * - * The index of a reference is the highest index below the index of our row. + * The index of a contig is the highest index below the index of our row. * Here, we define the index as the row number of the description line that - * describes this reference sequence. + * describes this contig. * - * @param rowIdx The row number of the reference sequence row to check. + * @param rowIdx The row number of the contig row to check. * @param indices A list containing the row numbers of all description lines. * @return Returns the row index of the description line that describes this * sequence line. */ - private[converters] def findReferenceIndex(rowIdx: Long, indices: List[Long]): Long = { + private[converters] def findContigIndex(rowIdx: Long, indices: List[Long]): Long = { val idx = indices.filter(_ <= rowIdx) idx.max } @@ -214,95 +202,95 @@ private[adam] object FastaConverter { /** * Conversion methods for single FASTA sequences into ADAM FASTA data. */ -private[converters] class FastaConverter(fragmentLength: Long) extends Serializable { +private[converters] class FastaSliceConverter(sliceLength: Long) extends Serializable { /** - * Remaps the fragments that we get coming in into our expected fragment size. + * Remaps the sequences that we get coming in into our expected slice size. * - * @param sequences Fragments coming in. - * @return A sequence of strings "recut" to the proper fragment size. + * @param sequences Sequences coming in. 
+ * @return A sequence of strings "recut" to the proper slice size. */ - def mapFragments(sequences: Seq[String]): Seq[String] = { + def mapSlices(sequences: Seq[String]): Seq[String] = { // internal "fsm" variables var sequence: StringBuilder = new StringBuilder var sequenceSeq: mutable.MutableList[String] = mutable.MutableList() /** - * Adds a string fragment to our accumulator. If this string fragment causes the accumulator - * to grow longer than our max fragment size, we split the accumulator and add it to the end - * of our list of fragments. + * Adds a string slice to our accumulator. If this string slice causes the accumulator + * to grow longer than our max slice size, we split the accumulator and add it to the end + * of our list of slices. + * - * @param seq Fragment string to add. + * @param seq Slice string to add. */ - def addFragment(seq: String) { + def addSlice(seq: String) { sequence.append(seq) - while (sequence.length > fragmentLength) { - sequenceSeq += sequence.take(fragmentLength.toInt).toString() - sequence = sequence.drop(fragmentLength.toInt) + while (sequence.length > sliceLength) { + sequenceSeq += sequence.take(sliceLength.toInt).toString() + sequence = sequence.drop(sliceLength.toInt) } } - // run addFragment on all fragments - sequences.foreach(addFragment) + // run addSlice on all slices + sequences.foreach(addSlice) - // if we still have a remaining sequence that is not part of a fragment, add it + // if we still have a remaining sequence that is not part of a slice, add it if (sequence.nonEmpty) { sequenceSeq += sequence.toString() } - // return our fragments + // return our slices sequenceSeq.toSeq } /** - * Converts a single FASTA sequence into an ADAM FASTA contig. + * Converts a single FASTA sequence into an ADAM slice. * * @throws IllegalArgumentException Thrown if sequence contains an illegal character. - * * @param name String option for the sequence name. * @param id Numerical identifier for the sequence. * @param sequence Nucleotide sequence. * @param description Optional description of the sequence. - * @return The converted ADAM FASTA contig. + * @return The converted ADAM FASTA slice. 
*/ def convert( name: Option[String], id: Int, sequence: Seq[String], - description: Option[String]): Seq[NucleotideContigFragment] = { + description: Option[String]): Seq[Slice] = { // get sequence length val sequenceLength = sequence.map(_.length).sum - // map sequences into fragments - val sequencesAsFragments = mapFragments(sequence) + // map sequences into slices + val sequencesAsFragments = mapSlices(sequence) - // get number of fragments - val fragmentCount = sequencesAsFragments.length + // get number of slices + val sliceCount = sequencesAsFragments.length // make new builder and set up non-optional fields - val fragments = sequencesAsFragments.zipWithIndex + val slices = sequencesAsFragments.zipWithIndex .map(si => { val (bases, index) = si - val builder = NucleotideContigFragment.newBuilder() + val builder = Slice.newBuilder() + .setAlphabet(Alphabet.DNA) .setSequence(bases) .setIndex(index) - .setStart(index * fragmentLength) - .setEnd(index * fragmentLength + bases.length) - .setFragments(fragmentCount) - .setLength(bases.length) - .setContigLength(sequenceLength) + .setStart(index * sliceLength) + .setEnd(index * sliceLength + bases.length) + .setSlices(sliceCount) + .setLength(bases.length.toLong) + .setTotalLength(sequenceLength.toLong) // map over optional fields - name.foreach(builder.setContigName(_)) + name.foreach(builder.setName(_)) description.foreach(builder.setDescription(_)) // build and return builder.build() }) - fragments + slices } } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala index 1be4e1516e..a499fa17eb 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/FragmentConverter.scala @@ -30,14 +30,14 @@ private object FragmentCollector extends Serializable { /** * Apply method to create a fragment collector that is keyed by the contig. * - * @param fragment Fragment of a reference/assembled sequence to wrap. + * @param slice Slice of a reference/assembled sequence to wrap. * @return Returns key value pair where the key is the contig metadata and the * value is a Fragment Collector object. */ - def apply(fragment: NucleotideContigFragment): Option[(String, FragmentCollector)] = { - ReferenceRegion(fragment).map(rr => { - (fragment.getContigName, - FragmentCollector(Seq((rr, fragment.getSequence)))) + def apply(slice: Slice): Option[(String, FragmentCollector)] = { + ReferenceRegion(slice).map(rr => { + (slice.getName, + FragmentCollector(Seq((rr, slice.getSequence)))) }) } } @@ -142,16 +142,16 @@ private[adam] object FragmentConverter extends Serializable { } /** - * Converts an RDD of NucleotideContigFragments into AlignmentRecords. + * Converts an RDD of Slices into AlignmentRecords. * * Produces one alignment record per contiguous sequence contained in the - * input RDD. Fragments are merged down to the longest contiguous chunks + * input RDD. Slices are merged down to the longest contiguous chunks * possible. * * @param rdd RDD of assembled sequences. * @return Returns an RDD of reads that represent aligned contigs. 
*/ - def convertRdd(rdd: RDD[NucleotideContigFragment]): RDD[AlignmentRecord] = { + def convertRdd(rdd: RDD[Slice]): RDD[AlignmentRecord] = { rdd.flatMap(FragmentCollector(_)) .reduceByKey(mergeFragments) .flatMap(convertFragment) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala b/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala index 3453516695..add9566f48 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/instrumentation/Timers.scala @@ -33,12 +33,16 @@ object Timers extends Metrics { val LoadGenotypes = timer("Load Genotypes") val LoadReferenceFile = timer("Load ReferenceFile") val LoadSequenceDictionary = timer("Load SequenceDictionary") + val LoadSequences = timer("Load Sequences") + val LoadSlices = timer("Load Slices") val LoadVariants = timer("Load Variants") // Format specific load methods val LoadBam = timer("Load BAM/CRAM/SAM format") val LoadBed = timer("Load BED6/12 format") val LoadFasta = timer("Load FASTA format") + val LoadFastaSequences = timer("Load FASTA format as Sequences") + val LoadFastaSlices = timer("Load FASTA format as Slices") val LoadFastq = timer("Load FASTQ format") val LoadGff3 = timer("Load GFF3 format") val LoadGtf = timer("Load GTF/GFF2 format") @@ -139,4 +143,6 @@ object Timers extends Metrics { val FullOuterShuffleJoin = timer("Full outer shuffle region join") val ShuffleJoinAndGroupByLeft = timer("Shuffle join followed by group-by on left") val RightOuterShuffleJoinAndGroupByLeft = timer("Right outer shuffle join followed by group-by on left") + + val CreateSequenceDictionary = timer("Create sequence dictionary") } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index a71f2599da..d665f3300d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -295,19 +295,37 @@ object ReferenceRegion { } /** - * Generates a reference region from assembly data. Returns None if the assembly does not - * have an ID or a start position. - * - * @param fragment Assembly fragment from which to generate data. - * @return Region corresponding to inclusive region of contig fragment. - */ - def apply(fragment: NucleotideContigFragment): Option[ReferenceRegion] = { - if (fragment.getContigName != null && - fragment.getStart != null && - fragment.getEnd != null) { - Some(ReferenceRegion(fragment.getContigName, - fragment.getStart, - fragment.getEnd)) + * Generates a reference region from a sequence. Returns None if the sequence does not + * have a name or a length. + * + * @param sequence Sequence from which to generate data. + * @return Region corresponding to inclusive region of the specified sequence. + */ + def apply(sequence: Sequence): Option[ReferenceRegion] = { + if (sequence.getName != null && + sequence.getLength != null) { + Some(ReferenceRegion(sequence.getName, + 0L, + sequence.getLength)) + } else { + None + } + } + + /** + * Generates a reference region from a slice. Returns None if the slice does not + * have a name, a start position, or an end position. + * + * @param slice Slice from which to generate data. + * @return Region corresponding to inclusive region of the specified slice. 
+ */ + def apply(slice: Slice): Option[ReferenceRegion] = { + if (slice.getName != null && + slice.getStart != null && + slice.getEnd != null) { + Some(ReferenceRegion(slice.getName, + slice.getStart, + slice.getEnd)) } else { None } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala index b0fa221595..effdd7fd8c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/SequenceDictionary.scala @@ -19,7 +19,7 @@ package org.bdgenomics.adam.models import htsjdk.samtools.{ SAMFileHeader, SAMSequenceDictionary, SAMSequenceRecord } import htsjdk.variant.vcf.VCFHeader -import org.bdgenomics.formats.avro.{ NucleotideContigFragment, Reference } +import org.bdgenomics.formats.avro.{ Reference, Sequence, Slice } import scala.collection.JavaConversions.{ asScalaIterator, seqAsJavaList } import scala.collection.JavaConverters._ import scala.collection._ @@ -496,14 +496,28 @@ object SequenceRecord { } /** - * Extracts the contig metadata from a nucleotide fragment. + * Builds a sequence record from a sequence. * - * @param fragment The assembly fragment to extract a SequenceRecord from. - * @return The sequence record metadata from a single assembly fragment. + * @param sequence Sequence to build from. + * @return The specified sequence as a SequenceRecord. */ - def fromADAMContigFragment(fragment: NucleotideContigFragment): SequenceRecord = { - SequenceRecord(fragment.getContigName, - fragment.getContigLength) + def fromSequence(sequence: Sequence): SequenceRecord = { + SequenceRecord( + sequence.getName, + Option(sequence.getLength).map(l => l: Long).getOrElse(Long.MaxValue) + ) } -} + /** + * Builds a sequence record from a slice. + * + * @param slice Slice to build from. + * @return The specified slice as a SequenceRecord. 
+ */ + def fromSlice(slice: Slice): SequenceRecord = { + SequenceRecord( + slice.getName, + Option(slice.getTotalLength).map(l => l: Long).getOrElse(Long.MaxValue) + ) + } +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index cda8e32a0c..1f1b793f89 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -58,26 +58,10 @@ import org.bdgenomics.adam.projections.{ FeatureField, Projection } -import org.bdgenomics.adam.rdd.contig.{ - DatasetBoundNucleotideContigFragmentDataset, - NucleotideContigFragmentDataset, - ParquetUnboundNucleotideContigFragmentDataset, - RDDBoundNucleotideContigFragmentDataset -} import org.bdgenomics.adam.rdd.feature._ -import org.bdgenomics.adam.rdd.fragment.{ - DatasetBoundFragmentDataset, - FragmentDataset, - ParquetUnboundFragmentDataset, - RDDBoundFragmentDataset -} -import org.bdgenomics.adam.rdd.read.{ - AlignmentRecordDataset, - DatasetBoundAlignmentRecordDataset, - RepairPartitions, - ParquetUnboundAlignmentRecordDataset, - RDDBoundAlignmentRecordDataset -} +import org.bdgenomics.adam.rdd.fragment._ +import org.bdgenomics.adam.rdd.read._ +import org.bdgenomics.adam.rdd.sequence._ import org.bdgenomics.adam.rdd.variant._ import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.adam.sql.{ @@ -85,7 +69,9 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Read => ReadProduct, + Sequence => SequenceProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -99,14 +85,17 @@ import org.bdgenomics.adam.util.{ } import org.bdgenomics.formats.avro.{ AlignmentRecord, + Alphabet, Feature, Fragment, Genotype, - NucleotideContigFragment, ProcessingStep, + Read, ReadGroup => ReadGroupMetadata, Reference, Sample, + Sequence, + Slice, Variant } import org.bdgenomics.utils.instrumentation.Metrics @@ -150,131 +139,7 @@ private case class LocatableReferenceRegion(rr: ReferenceRegion) extends Locatab */ object ADAMContext { - // conversion functions for pipes - implicit def contigsToContigsConversionFn(gDataset: NucleotideContigFragmentDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - // hijack the transform function to discard the old RDD - gDataset.transform(oldRdd => rdd) - } - - implicit def contigsToCoverageConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[Coverage]): CoverageDataset = { - new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) - } - - implicit def contigsToCoverageDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[Coverage]): CoverageDataset = { - new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) - } - - implicit def contigsToFeaturesConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[Feature]): FeatureDataset = { - new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) - } - - implicit def contigsToFeaturesDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[FeatureProduct]): FeatureDataset = { - new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty[Sample]) - } - - implicit def contigsToFragmentsConversionFn( - gDataset: NucleotideContigFragmentDataset, - 
rdd: RDD[Fragment]): FragmentDataset = { - new RDDBoundFragmentDataset(rdd, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty, - None) - } - - implicit def contigsToFragmentsDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[FragmentProduct]): FragmentDataset = { - new DatasetBoundFragmentDataset(ds, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty) - } - - implicit def contigsToAlignmentRecordsConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { - new RDDBoundAlignmentRecordDataset(rdd, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty, - None) - } - - implicit def contigsToAlignmentRecordsDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { - new DatasetBoundAlignmentRecordDataset(ds, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty) - } - - implicit def contigsToGenotypesConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[Genotype]): GenotypeDataset = { - new RDDBoundGenotypeDataset(rdd, - gDataset.sequences, - Seq.empty, - DefaultHeaderLines.allHeaderLines, - None) - } - - implicit def contigsToGenotypesDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[GenotypeProduct]): GenotypeDataset = { - new DatasetBoundGenotypeDataset(ds, - gDataset.sequences, - Seq.empty, - DefaultHeaderLines.allHeaderLines) - } - - implicit def contigsToVariantsConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[Variant]): VariantDataset = { - new RDDBoundVariantDataset(rdd, - gDataset.sequences, - DefaultHeaderLines.allHeaderLines, - None) - } - - implicit def contigsToVariantsDatasetConversionFn( - gDataset: NucleotideContigFragmentDataset, - ds: Dataset[VariantProduct]): VariantDataset = { - new DatasetBoundVariantDataset(ds, - gDataset.sequences, - DefaultHeaderLines.allHeaderLines) - } - - implicit def contigsToVariantContextConversionFn( - gDataset: NucleotideContigFragmentDataset, - rdd: RDD[VariantContext]): VariantContextDataset = { - VariantContextDataset(rdd, - gDataset.sequences, - Seq.empty, - DefaultHeaderLines.allHeaderLines) - } - - implicit def coverageToContigsConversionFn( - gDataset: CoverageDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } - - implicit def coverageToContigsDatasetConversionFn( - gDataset: CoverageDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) - } + // coverage conversion functions implicit def coverageToCoverageConversionFn(gDataset: CoverageDataset, rdd: RDD[Coverage]): CoverageDataset = { @@ -351,6 +216,42 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } + implicit def coverageToReadsConversionFn( + gDataset: CoverageDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def coverageToReadsDatasetConversionFn( + gDataset: CoverageDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def coverageToSequencesConversionFn( + gDataset: CoverageDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def coverageToSequencesDatasetConversionFn( + 
gDataset: CoverageDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def coverageToSlicesConversionFn( + gDataset: CoverageDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def coverageToSlicesDatasetConversionFn( + gDataset: CoverageDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + implicit def coverageToVariantsConversionFn( gDataset: CoverageDataset, rdd: RDD[Variant]): VariantDataset = { @@ -377,17 +278,7 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } - implicit def featuresToContigsConversionFn( - gDataset: FeatureDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } - - implicit def featuresToContigsDatasetConversionFn( - gDataset: FeatureDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) - } + // features conversion functions implicit def featuresToCoverageConversionFn( gDataset: FeatureDataset, @@ -464,6 +355,42 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } + implicit def featuresToReadsConversionFn( + gDataset: FeatureDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def featuresToReadsDatasetConversionFn( + gDataset: FeatureDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def featuresToSequencesConversionFn( + gDataset: FeatureDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def featuresToSequencesDatasetConversionFn( + gDataset: FeatureDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def featuresToSlicesConversionFn( + gDataset: FeatureDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def featuresToSlicesDatasetConversionFn( + gDataset: FeatureDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + implicit def featuresToVariantsConversionFn( gDataset: FeatureDataset, rdd: RDD[Variant]): VariantDataset = { @@ -490,17 +417,7 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } - implicit def fragmentsToContigsConversionFn( - gDataset: FragmentDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } - - implicit def fragmentsToContigsDatasetConversionFn( - gDataset: FragmentDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) - } + // fragments conversion functions implicit def fragmentsToCoverageConversionFn( gDataset: FragmentDataset, @@ -570,6 +487,42 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } + implicit def fragmentsToReadsConversionFn( + gDataset: FragmentDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def fragmentsToReadsDatasetConversionFn( + gDataset: 
FragmentDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def fragmentsToSequencesConversionFn( + gDataset: FragmentDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def fragmentsToSequencesDatasetConversionFn( + gDataset: FragmentDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def fragmentsToSlicesConversionFn( + gDataset: FragmentDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def fragmentsToSlicesDatasetConversionFn( + gDataset: FragmentDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + implicit def fragmentsToVariantsConversionFn( gDataset: FragmentDataset, rdd: RDD[Variant]): VariantDataset = { @@ -596,11 +549,7 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } - implicit def genericToContigsConversionFn[Y <: GenericGenomicDataset[_, _]]( - gDataset: Y, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } + // generic conversion functions implicit def genericToCoverageConversionFn[Y <: GenericGenomicDataset[_, _]]( gDataset: Y, @@ -644,6 +593,24 @@ object ADAMContext { None) } + implicit def genericToReadsConversionFn[Y <: GenericGenomicDataset[_, _]]( + gDataset: Y, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def genericToSequencesConversionFn[Y <: GenericGenomicDataset[_, _]]( + gDataset: Y, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def genericToSlicesConversionFn[Y <: GenericGenomicDataset[_, _]]( + gDataset: Y, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + implicit def genericToVariantsConversionFn[Y <: GenericGenomicDataset[_, _]]( gDataset: Y, rdd: RDD[Variant]): VariantDataset = { @@ -663,17 +630,7 @@ object ADAMContext { None) } - implicit def alignmentRecordsToContigsConversionFn( - gDataset: AlignmentRecordDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } - - implicit def alignmentRecordsToContigsDatasetConversionFn( - gDataset: AlignmentRecordDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) - } + // alignment records conversion functions implicit def alignmentRecordsToCoverageConversionFn( gDataset: AlignmentRecordDataset, @@ -743,6 +700,42 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } + implicit def alignmentRecordsToReadsConversionFn( + gDataset: AlignmentRecordDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def alignmentRecordsToReadsDatasetConversionFn( + gDataset: AlignmentRecordDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def alignmentRecordsToSequencesConversionFn( + gDataset: AlignmentRecordDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, 
gDataset.sequences, None) + } + + implicit def alignmentRecordsToSequencesDatasetConversionFn( + gDataset: AlignmentRecordDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def alignmentRecordsToSlicesConversionFn( + gDataset: AlignmentRecordDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def alignmentRecordsToSlicesDatasetConversionFn( + gDataset: AlignmentRecordDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + implicit def alignmentRecordsToVariantsConversionFn( gDataset: AlignmentRecordDataset, rdd: RDD[Variant]): VariantDataset = { @@ -769,16 +762,23 @@ object ADAMContext { DefaultHeaderLines.allHeaderLines) } - implicit def genotypesToContigsConversionFn( + implicit def genotypesToAlignmentRecordsConversionFn( gDataset: GenotypeDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) } - implicit def genotypesToContigsDatasetConversionFn( + implicit def genotypesToAlignmentRecordsDatasetConversionFn( gDataset: GenotypeDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) } implicit def genotypesToCoverageConversionFn( @@ -824,46 +824,59 @@ object ADAMContext { Seq.empty) } - implicit def genotypesToAlignmentRecordsConversionFn( + implicit def genotypesToGenotypesConversionFn( gDataset: GenotypeDataset, - rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { - new RDDBoundAlignmentRecordDataset(rdd, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty, - None) + rdd: RDD[Genotype]): GenotypeDataset = { + // hijack the transform function to discard the old RDD + gDataset.transform(oldRdd => rdd) } - implicit def genotypesToAlignmentRecordsDatasetConversionFn( + implicit def genotypesToReadsConversionFn( gDataset: GenotypeDataset, - ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { - new DatasetBoundAlignmentRecordDataset(ds, - gDataset.sequences, - ReadGroupDictionary.empty, - Seq.empty) + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) } - implicit def genotypesToGenotypesConversionFn(gDataset: GenotypeDataset, - rdd: RDD[Genotype]): GenotypeDataset = { - // hijack the transform function to discard the old RDD - gDataset.transform(oldRdd => rdd) + implicit def genotypesToReadsDatasetConversionFn( + gDataset: GenotypeDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def genotypesToSequencesConversionFn( + gDataset: GenotypeDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def genotypesToSequencesDatasetConversionFn( + gDataset: GenotypeDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def 
genotypesToSlicesConversionFn( + gDataset: GenotypeDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def genotypesToSlicesDatasetConversionFn( + gDataset: GenotypeDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) } implicit def genotypesToVariantsConversionFn( gDataset: GenotypeDataset, rdd: RDD[Variant]): VariantDataset = { - new RDDBoundVariantDataset(rdd, - gDataset.sequences, - gDataset.headerLines, - None) + new RDDBoundVariantDataset(rdd, gDataset.sequences, gDataset.headerLines, None) } implicit def genotypesToVariantsDatasetConversionFn( gDataset: GenotypeDataset, ds: Dataset[VariantProduct]): VariantDataset = { - new DatasetBoundVariantDataset(ds, - gDataset.sequences, - gDataset.headerLines) + new DatasetBoundVariantDataset(ds, gDataset.sequences, gDataset.headerLines) } implicit def genotypesToVariantContextConversionFn( @@ -875,34 +888,441 @@ object ADAMContext { gDataset.headerLines) } - implicit def variantsToContigsConversionFn( - gDataset: VariantDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } - - implicit def variantsToContigsDatasetConversionFn( - gDataset: VariantDataset, - ds: Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - new DatasetBoundNucleotideContigFragmentDataset(ds, gDataset.sequences) - } + // reads conversion functions - implicit def variantsToCoverageConversionFn( - gDataset: VariantDataset, + implicit def readsToCoverageConversionFn( + gDataset: ReadDataset, rdd: RDD[Coverage]): CoverageDataset = { - new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty, None) } - implicit def variantsToCoverageDatasetConversionFn( - gDataset: VariantDataset, + implicit def readsToCoverageDatasetConversionFn( + gDataset: ReadDataset, ds: Dataset[Coverage]): CoverageDataset = { - new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty) } - implicit def variantsToFeaturesConversionFn( - gDataset: VariantDataset, + implicit def readsToFeaturesConversionFn( + gDataset: ReadDataset, rdd: RDD[Feature]): FeatureDataset = { - new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty, None) + } + + implicit def readsToFeaturesDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty) + } + + implicit def readsToFragmentsConversionFn( + gDataset: ReadDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def readsToFragmentsDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def readsToAlignmentRecordsConversionFn( + gDataset: ReadDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def 
readsToAlignmentRecordsDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def readsToGenotypesConversionFn( + gDataset: ReadDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def readsToGenotypesDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + implicit def readsToReadsConversionFn(gDataset: ReadDataset, + rdd: RDD[Read]): ReadDataset = { + // hijack the transform function to discard the old RDD + gDataset.transform(oldRdd => rdd) + } + + implicit def readsToSequencesConversionFn( + gDataset: ReadDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def readsToSequencesDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def readsToSlicesConversionFn( + gDataset: ReadDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def readsToSlicesDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + + implicit def readsToVariantsConversionFn( + gDataset: ReadDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def readsToVariantsDatasetConversionFn( + gDataset: ReadDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines) + } + + implicit def readsToVariantContextsConversionFn( + gDataset: ReadDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + // sequences conversion functions + + implicit def sequencesToCoverageConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty, None) + } + + implicit def sequencesToCoverageDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty) + } + + implicit def sequencesToFeaturesConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty, None) + } + + implicit def sequencesToFeaturesDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty) + } + + implicit def sequencesToFragmentsConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def sequencesToFragmentsDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { 
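+ // only the sequence dictionary carries over from the source sequence dataset;
+ // read group and processing step metadata is not available here, so empty
+ // defaults are supplied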
+ new DatasetBoundFragmentDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def sequencesToAlignmentRecordsConversionFn( + gDataset: SequenceDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def sequencesToAlignmentRecordsDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def sequencesToGenotypesConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def sequencesToGenotypesDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + implicit def sequencesToReadsConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def sequencesToReadsDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def sequencesToSequencesConversionFn(gDataset: SequenceDataset, + rdd: RDD[Sequence]): SequenceDataset = { + // hijack the transform function to discard the old RDD + gDataset.transform(oldRdd => rdd) + } + + implicit def sequencesToSlicesConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def sequencesToSlicesDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + + implicit def sequencesToVariantsConversionFn( + gDataset: SequenceDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def sequencesToVariantsDatasetConversionFn( + gDataset: SequenceDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines) + } + + implicit def sequencesToVariantContextsConversionFn( + gDataset: SequenceDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + // slices conversion functions + + implicit def slicesToCoverageConversionFn( + gDataset: SliceDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty, None) + } + + implicit def slicesToCoverageDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty) + } + + implicit def slicesToFeaturesConversionFn( + gDataset: SliceDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty, None) + } + + implicit def slicesToFeaturesDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[FeatureProduct]): FeatureDataset = { + new 
DatasetBoundFeatureDataset(ds, gDataset.sequences, Seq.empty) + } + + implicit def slicesToFragmentsConversionFn( + gDataset: SliceDataset, + rdd: RDD[Fragment]): FragmentDataset = { + new RDDBoundFragmentDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def slicesToFragmentsDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[FragmentProduct]): FragmentDataset = { + new DatasetBoundFragmentDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def slicesToAlignmentRecordsConversionFn( + gDataset: SliceDataset, + rdd: RDD[AlignmentRecord]): AlignmentRecordDataset = { + new RDDBoundAlignmentRecordDataset(rdd, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty, + None) + } + + implicit def slicesToAlignmentRecordsDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[AlignmentRecordProduct]): AlignmentRecordDataset = { + new DatasetBoundAlignmentRecordDataset(ds, + gDataset.sequences, + ReadGroupDictionary.empty, + Seq.empty) + } + + implicit def slicesToGenotypesConversionFn( + gDataset: SliceDataset, + rdd: RDD[Genotype]): GenotypeDataset = { + new RDDBoundGenotypeDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def slicesToGenotypesDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[GenotypeProduct]): GenotypeDataset = { + new DatasetBoundGenotypeDataset(ds, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + implicit def slicesToReadsConversionFn( + gDataset: SliceDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def slicesToReadsDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def slicesToSequencesConversionFn( + gDataset: SliceDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def slicesToSequencesDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def slicesToSlicesConversionFn(gDataset: SliceDataset, + rdd: RDD[Slice]): SliceDataset = { + // hijack the transform function to discard the old RDD + gDataset.transform(oldRdd => rdd) + } + + implicit def slicesToVariantsConversionFn( + gDataset: SliceDataset, + rdd: RDD[Variant]): VariantDataset = { + new RDDBoundVariantDataset(rdd, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines, + None) + } + + implicit def slicesToVariantsDatasetConversionFn( + gDataset: SliceDataset, + ds: Dataset[VariantProduct]): VariantDataset = { + new DatasetBoundVariantDataset(ds, + gDataset.sequences, + DefaultHeaderLines.allHeaderLines) + } + + implicit def slicesToVariantContextsConversionFn( + gDataset: SliceDataset, + rdd: RDD[VariantContext]): VariantContextDataset = { + VariantContextDataset(rdd, + gDataset.sequences, + Seq.empty, + DefaultHeaderLines.allHeaderLines) + } + + // variants conversion functions + + implicit def variantsToCoverageConversionFn( + gDataset: VariantDataset, + rdd: RDD[Coverage]): CoverageDataset = { + new RDDBoundCoverageDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) + } + + implicit def variantsToCoverageDatasetConversionFn( + gDataset: VariantDataset, + ds: Dataset[Coverage]): CoverageDataset = { + new 
DatasetBoundCoverageDataset(ds, gDataset.sequences, Seq.empty[Sample]) + } + + implicit def variantsToFeaturesConversionFn( + gDataset: VariantDataset, + rdd: RDD[Feature]): FeatureDataset = { + new RDDBoundFeatureDataset(rdd, gDataset.sequences, Seq.empty[Sample], None) } implicit def variantsToFeaturesDatasetConversionFn( @@ -968,6 +1388,42 @@ object ADAMContext { gDataset.headerLines) } + implicit def variantsToReadsConversionFn( + gDataset: VariantDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def variantsToReadsDatasetConversionFn( + gDataset: VariantDataset, + ds: Dataset[ReadProduct]): ReadDataset = { + new DatasetBoundReadDataset(ds, gDataset.sequences) + } + + implicit def variantsToSequencesConversionFn( + gDataset: VariantDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def variantsToSequencesDatasetConversionFn( + gDataset: VariantDataset, + ds: Dataset[SequenceProduct]): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, gDataset.sequences) + } + + implicit def variantsToSlicesConversionFn( + gDataset: VariantDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + + implicit def variantsToSlicesDatasetConversionFn( + gDataset: VariantDataset, + ds: Dataset[SliceProduct]): SliceDataset = { + new DatasetBoundSliceDataset(ds, gDataset.sequences) + } + implicit def variantsToVariantsConversionFn(gDataset: VariantDataset, rdd: RDD[Variant]): VariantDataset = { // hijack the transform function to discard the old RDD @@ -983,11 +1439,7 @@ object ADAMContext { gDataset.headerLines) } - implicit def variantContextsToContigsConversionFn( - gDataset: VariantContextDataset, - rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(rdd, gDataset.sequences, None) - } + // variant contexts conversion functions implicit def variantContextsToCoverageConversionFn( gDataset: VariantContextDataset, @@ -1031,6 +1483,24 @@ object ADAMContext { None) } + implicit def variantContextsToReadsConversionFn( + gDataset: VariantContextDataset, + rdd: RDD[Read]): ReadDataset = { + new RDDBoundReadDataset(rdd, gDataset.sequences, None) + } + + implicit def variantContextsToSequencesConversionFn( + gDataset: VariantContextDataset, + rdd: RDD[Sequence]): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, gDataset.sequences, None) + } + + implicit def variantContextsToSlicesConversionFn( + gDataset: VariantContextDataset, + rdd: RDD[Slice]): SliceDataset = { + new RDDBoundSliceDataset(rdd, gDataset.sequences, None) + } + implicit def variantContextsToVariantsConversionFn( gDataset: VariantContextDataset, rdd: RDD[Variant]): VariantDataset = { @@ -2502,36 +2972,6 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (regions.nonEmpty) variantsDatasetBound.filterByOverlappingRegions(regions) else variantsDatasetBound } - /** - * Load nucleotide contig fragments from FASTA into a NucleotideContigFragmentDataset. - * - * @param pathName The path name to load nucleotide contig fragments from. - * Globs/directories are supported. - * @param maximumLength Maximum fragment length. Defaults to 10000L. Values greater - * than 1e9 should be avoided. - * @return Returns a NucleotideContigFragmentDataset. 
- */ - def loadFasta( - pathName: String, - maximumLength: Long = 10000L): NucleotideContigFragmentDataset = LoadFasta.time { - - val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile( - pathName, - classOf[TextInputFormat], - classOf[LongWritable], - classOf[Text] - ) - if (Metrics.isRecording) fastaData.instrument() else fastaData - - val remapData = fastaData.map(kv => (kv._1.get, kv._2.toString)) - - // convert rdd and cache - val fragmentRdd = FastaConverter(remapData, maximumLength) - .cache() - - NucleotideContigFragmentDataset(fragmentRdd) - } - /** * Load paired unaligned alignment records grouped by sequencing fragment * from interleaved FASTQ into an FragmentDataset. @@ -2879,65 +3319,6 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (regions.nonEmpty) featureDatasetBound.filterByOverlappingRegions(regions) else featureDatasetBound } - /** - * Load a path name in Parquet + Avro format into a NucleotideContigFragmentDataset. - * - * @param pathName The path name to load nucleotide contig fragments from. - * Globs/directories are supported. - * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. - * Defaults to None. - * @param optProjection An option projection schema to use when reading Parquet + Avro. - * Defaults to None. - * @return Returns a NucleotideContigFragmentDataset. - */ - def loadParquetContigFragments( - pathName: String, - optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): NucleotideContigFragmentDataset = { - - val sd = loadAvroSequenceDictionary(pathName) - - (optPredicate, optProjection) match { - case (None, None) => { - ParquetUnboundNucleotideContigFragmentDataset( - sc, pathName, sd) - } - case (_, _) => { - val rdd = loadParquet[NucleotideContigFragment](pathName, optPredicate, optProjection) - new RDDBoundNucleotideContigFragmentDataset(rdd, - sd, - optPartitionMap = extractPartitionMap(pathName)) - } - } - } - - /** - * Load a path name with range binned partitioned Parquet format into a NucleotideContigFragmentDataset. - * - * @param pathName The path name to load alignment records from. - * Globs/directories are supported. - * @param regions Optional list of genomic regions to load. - * @param optLookbackPartitions Number of partitions to lookback to find beginning of an overlapping - * region when using the filterByOverlappingRegions function on the returned dataset. - * Defaults to one partition. - * @return Returns a NucleotideContigFragmentDataset. - */ - def loadPartitionedParquetContigFragments(pathName: String, - regions: Iterable[ReferenceRegion] = Iterable.empty, - optLookbackPartitions: Option[Int] = Some(1)): NucleotideContigFragmentDataset = { - - val partitionedBinSize = getPartitionBinSize(pathName) - val contigs = loadParquetContigFragments(pathName) - val contigsDatasetBound = DatasetBoundNucleotideContigFragmentDataset(contigs.dataset, - contigs.sequences, - isPartitioned = true, - Some(partitionedBinSize), - optLookbackPartitions - ) - - if (regions.nonEmpty) contigsDatasetBound.filterByOverlappingRegions(regions) else contigsDatasetBound - } - /** * Load a path name in Parquet + Avro format into a FragmentDataset. * @@ -3067,10 +3448,10 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log /** * Load reference sequences into a broadcastable ReferenceFile. * - * If the path name has a .2bit extension, loads a 2bit file. 
Else, uses loadContigFragments + * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices * to load the reference as an RDD, which is then collected to the driver. * - * @see loadContigFragments + * @see loadSlices * * @param pathName The path name to load reference sequences from. * Globs/directories for 2bit format are not supported. @@ -3085,7 +3466,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log if (is2BitExt(pathName)) { new TwoBitFile(new LocalFileByteAccess(new File(pathName))) } else { - ReferenceContigMap(loadContigFragments(pathName, maximumLength = maximumLength).rdd) + ReferenceContigMap(loadSlices(pathName, maximumLength = maximumLength).rdd) } } @@ -3103,63 +3484,21 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * @param pathName The path name to load a sequence dictionary from. * @return Returns a sequence dictionary. * @throws IllegalArgumentException if pathName file extension not one of .dict, - * .genome, or .txt - */ - def loadSequenceDictionary(pathName: String): SequenceDictionary = LoadSequenceDictionary.time { - val trimmedPathName = trimExtensionIfCompressed(pathName) - if (isDictExt(trimmedPathName)) { - info(s"Loading $pathName as HTSJDK sequence dictionary.") - SequenceDictionaryReader(pathName, sc) - } else if (isGenomeExt(trimmedPathName)) { - info(s"Loading $pathName as Bedtools genome file sequence dictionary.") - GenomeFileReader(pathName, sc) - } else if (isTextExt(trimmedPathName)) { - info(s"Loading $pathName as UCSC Genome Browser chromInfo file sequence dictionary.") - GenomeFileReader(pathName, sc) - } else { - throw new IllegalArgumentException("Path name file extension must be one of .dict, .genome, or .txt") - } - } - - /** - * Load nucleotide contig fragments into a NucleotideContigFragmentDataset. - * - * If the path name has a .fa/.fasta extension, load as FASTA format. - * Else, fall back to Parquet + Avro. - * - * For FASTA format, compressed files are supported through compression codecs configured - * in Hadoop, which by default include .gz and .bz2, but can include more. - * - * @see loadFasta - * @see loadParquetContigFragments - * - * @param pathName The path name to load nucleotide contig fragments from. - * Globs/directories are supported, although file extension must be present - * for FASTA format. - * @param maximumLength Maximum fragment length. Defaults to 10000L. Values greater - * than 1e9 should be avoided. - * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. - * Defaults to None. - * @param optProjection An option projection schema to use when reading Parquet + Avro. - * Defaults to None. - * @return Returns a NucleotideContigFragmentDataset. 
+ * .genome, or .txt */ - def loadContigFragments( - pathName: String, - maximumLength: Long = 10000L, - optPredicate: Option[FilterPredicate] = None, - optProjection: Option[Schema] = None): NucleotideContigFragmentDataset = LoadContigFragments.time { - + def loadSequenceDictionary(pathName: String): SequenceDictionary = LoadSequenceDictionary.time { val trimmedPathName = trimExtensionIfCompressed(pathName) - if (isFastaExt(trimmedPathName)) { - info(s"Loading $pathName as FASTA and converting to NucleotideContigFragment.") - loadFasta( - pathName, - maximumLength - ) + if (isDictExt(trimmedPathName)) { + info(s"Loading $pathName as HTSJDK sequence dictionary.") + SequenceDictionaryReader(pathName, sc) + } else if (isGenomeExt(trimmedPathName)) { + info(s"Loading $pathName as Bedtools genome file sequence dictionary.") + GenomeFileReader(pathName, sc) + } else if (isTextExt(trimmedPathName)) { + info(s"Loading $pathName as UCSC Genome Browser chromInfo file sequence dictionary.") + GenomeFileReader(pathName, sc) } else { - info(s"Loading $pathName as Parquet containing NucleotideContigFragments.") - loadParquetContigFragments(pathName, optPredicate = optPredicate, optProjection = optProjection) + throw new IllegalArgumentException("Path name file extension must be one of .dict, .genome, or .txt") } } @@ -3249,7 +3588,7 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log * * @see loadBam * @see loadFastq - * @see loadFasta + * @see loadFastaDna(String, Long) * @see loadInterleavedFastq * @see loadParquetAlignments * @@ -3294,8 +3633,8 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log info(s"Loading $pathName as unpaired FASTQ and converting to AlignmentRecords.") loadFastq(pathName, optPathName2, optReadGroup, stringency) } else if (isFastaExt(trimmedPathName)) { - info(s"Loading $pathName as FASTA and converting to AlignmentRecords.") - AlignmentRecordDataset.unaligned(loadFasta(pathName, maximumLength = 10000L).toReads) + info(s"Loading $pathName as FASTA DNA and converting to AlignmentRecords.") + AlignmentRecordDataset.unaligned(FragmentConverter.convertRdd(loadFastaDna(pathName, maximumLength = 10000L).rdd)) } else { info(s"Loading $pathName as Parquet of AlignmentRecords.") loadParquetAlignments(pathName, optPredicate = optPredicate, optProjection = optProjection) @@ -3400,6 +3739,354 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log } } + /** + * Load a path name in Parquet + Avro format into a ReadDataset. + * + * @param pathName The path name to load reads from. + * Globs/directories are supported. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. + * @return Returns a ReadDataset. + */ + def loadParquetReads( + pathName: String, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): ReadDataset = { + + val sd = loadAvroSequenceDictionary(pathName) + + (optPredicate, optProjection) match { + case (None, None) => { + ParquetUnboundReadDataset( + sc, pathName, sd) + } + case (_, _) => { + val rdd = loadParquet[Read](pathName, optPredicate, optProjection) + new RDDBoundReadDataset(rdd, + sd, + optPartitionMap = extractPartitionMap(pathName)) + } + } + } + + /** + * Load a path name in Parquet + Avro format into a SequenceDataset. 
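+ *
+ * A minimal usage sketch, assuming an ADAMContext `ac` is in scope and that
+ * `"sequences.adam"` is a hypothetical path written earlier with `saveAsParquet`:
+ * {{{
+ * val sequences = ac.loadParquetSequences("sequences.adam")
+ * println(sequences.rdd.count())
+ * }}}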
+ * + * @param pathName The path name to load sequences from. + * Globs/directories are supported. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. + * @return Returns a SequenceDataset. + */ + def loadParquetSequences( + pathName: String, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): SequenceDataset = { + + val sd = loadAvroSequenceDictionary(pathName) + + (optPredicate, optProjection) match { + case (None, None) => { + ParquetUnboundSequenceDataset( + sc, pathName, sd) + } + case (_, _) => { + val rdd = loadParquet[Sequence](pathName, optPredicate, optProjection) + new RDDBoundSequenceDataset(rdd, + sd, + optPartitionMap = extractPartitionMap(pathName)) + } + } + } + + /** + * Load a path name in Parquet + Avro format into a SliceDataset. + * + * @param pathName The path name to load slices from. + * Globs/directories are supported. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. + * @return Returns a SliceDataset. + */ + def loadParquetSlices( + pathName: String, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): SliceDataset = { + + val sd = loadAvroSequenceDictionary(pathName) + + (optPredicate, optProjection) match { + case (None, None) => { + ParquetUnboundSliceDataset( + sc, pathName, sd) + } + case (_, _) => { + val rdd = loadParquet[Slice](pathName, optPredicate, optProjection) + new RDDBoundSliceDataset(rdd, + sd, + optPartitionMap = extractPartitionMap(pathName)) + } + } + } + + /** + * Load sequences from a specified alphabet from FASTA into a SequenceDataset. + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported. + * @param alphabet Alphabet in which to interpret the loaded sequences. + * @return Returns a SequenceDataset. + */ + private def loadFastaSequences( + pathName: String, + alphabet: Alphabet): SequenceDataset = LoadFastaSequences.time { + + val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile( + pathName, + classOf[TextInputFormat], + classOf[LongWritable], + classOf[Text] + ) + if (Metrics.isRecording) fastaData.instrument() else fastaData + + val remapData = fastaData.map(kv => (kv._1.get, kv._2.toString)) + SequenceDataset(FastaSequenceConverter(alphabet, remapData)) + } + + /** + * Load DNA sequences from FASTA into a SequenceDataset. + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported. + * @return Returns a SequenceDataset containing DNA sequences. + */ + def loadFastaDna(pathName: String): SequenceDataset = { + loadFastaSequences(pathName, Alphabet.DNA) + } + + /** + * Load protein sequences from FASTA into a SequenceDataset. + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported. + * @return Returns a SequenceDataset containing protein sequences. + */ + def loadFastaProtein(pathName: String): SequenceDataset = { + loadFastaSequences(pathName, Alphabet.PROTEIN) + } + + /** + * Load RNA sequences from FASTA into a SequenceDataset. + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported. + * @return Returns a SequenceDataset containing RNA sequences. 
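+ * @note A minimal sketch of the FASTA sequence loaders, assuming an ADAMContext
+ * `ac` is in scope; the file names are hypothetical:
+ * {{{
+ * val dna = ac.loadFastaDna("contigs.fa")
+ * val protein = ac.loadFastaProtein("proteins.fa")
+ * val rna = ac.loadFastaRna("transcripts.fa")
+ * }}}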
+ */ + def loadFastaRna(pathName: String): SequenceDataset = { + loadFastaSequences(pathName, Alphabet.RNA) + } + + /** + * Load sequences from a specified alphabet into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see loadFastaDna + * @see loadFastaProtein + * @see loadFastaRna + * @see loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @param alphabet Alphabet in which to interpret the loaded sequences. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. + * @return Returns a SequenceDataset. + */ + private def loadSequences( + pathName: String, + alphabet: Alphabet, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): SequenceDataset = LoadSequences.time { + + val trimmedPathName = trimExtensionIfCompressed(pathName) + if (isFastaExt(trimmedPathName)) { + info(s"Loading $pathName as FASTA $alphabet and converting to Sequences.") + loadFastaSequences(pathName, alphabet) + } else { + info(s"Loading $pathName as Parquet containing $alphabet Sequences.") + loadParquetSequences(pathName, optPredicate = optPredicate, optProjection = optProjection) + } + } + + /** + * Load DNA sequences into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see loadFastaDna + * @see loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. + * @return Returns a SequenceDataset containing DNA sequences. + */ + def loadDnaSequences( + pathName: String, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): SequenceDataset = { + + loadSequences(pathName, Alphabet.DNA, optPredicate, optProjection) + } + + /** + * Load protein sequences into a SequenceDataset. + * + * If the path name has a .fa/.fasta extension, load as FASTA format. + * Else, fall back to Parquet + Avro. + * + * For FASTA format, compressed files are supported through compression codecs configured + * in Hadoop, which by default include .gz and .bz2, but can include more. + * + * @see loadFastaProtein + * @see loadParquetSequences + * + * @param pathName The path name to load sequences from. + * Globs/directories are supported, although file extension must be present + * for FASTA format. + * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro. + * Defaults to None. + * @param optProjection An optional projection schema to use when reading Parquet + Avro. + * Defaults to None. 
+ * @return Returns a SequenceDataset containing protein sequences.
+ */
+ def loadProteinSequences(
+ pathName: String,
+ optPredicate: Option[FilterPredicate] = None,
+ optProjection: Option[Schema] = None): SequenceDataset = {
+
+ loadSequences(pathName, Alphabet.PROTEIN, optPredicate, optProjection)
+ }
+
+ /**
+ * Load RNA sequences into a SequenceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @see loadFastaRna
+ * @see loadParquetSequences
+ *
+ * @param pathName The path name to load sequences from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro.
+ * Defaults to None.
+ * @param optProjection An optional projection schema to use when reading Parquet + Avro.
+ * Defaults to None.
+ * @return Returns a SequenceDataset containing RNA sequences.
+ */
+ def loadRnaSequences(
+ pathName: String,
+ optPredicate: Option[FilterPredicate] = None,
+ optProjection: Option[Schema] = None): SequenceDataset = {
+
+ loadSequences(pathName, Alphabet.RNA, optPredicate, optProjection)
+ }
+
+ /**
+ * Load DNA slices from FASTA into a SliceDataset.
+ *
+ * @param pathName The path name to load slices from.
+ * Globs/directories are supported.
+ * @param maximumLength Maximum slice length. Defaults to 10000L. Values greater
+ * than 1e9 should be avoided.
+ * @return Returns a SliceDataset containing DNA slices.
+ */
+ def loadFastaDna(
+ pathName: String,
+ maximumLength: Long = 10000L): SliceDataset = LoadFastaSlices.time {
+
+ val fastaData: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile(
+ pathName,
+ classOf[TextInputFormat],
+ classOf[LongWritable],
+ classOf[Text]
+ )
+ if (Metrics.isRecording) fastaData.instrument() else fastaData
+
+ val remapData = fastaData.map(kv => (kv._1.get, kv._2.toString))
+
+ SliceDataset(FastaSliceConverter(remapData, maximumLength))
+ }
+
+ /**
+ * Load slices into a SliceDataset.
+ *
+ * If the path name has a .fa/.fasta extension, load as DNA in FASTA format.
+ * Else, fall back to Parquet + Avro.
+ *
+ * For FASTA format, compressed files are supported through compression codecs configured
+ * in Hadoop, which by default include .gz and .bz2, but can include more.
+ *
+ * @see loadFastaDna(String, Long)
+ * @see loadParquetSlices
+ *
+ * @param pathName The path name to load DNA slices from.
+ * Globs/directories are supported, although file extension must be present
+ * for FASTA format.
+ * @param maximumLength Maximum slice length. Defaults to 10000L. Values greater
+ * than 1e9 should be avoided.
+ * @param optPredicate An optional pushdown predicate to use when reading Parquet + Avro.
+ * Defaults to None.
+ * @param optProjection An optional projection schema to use when reading Parquet + Avro.
+ * Defaults to None.
+ * @return Returns a SliceDataset.
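+ * @note A minimal sketch, assuming an ADAMContext `ac` is in scope and a
+ * hypothetical FASTA path; loadSlices fills the role of the removed
+ * loadContigFragments, and loadReferenceFile now builds on it:
+ * {{{
+ * val slices = ac.loadSlices("sample.fa", maximumLength = 10000L)
+ * val referenceMap = ReferenceContigMap(slices.rdd)
+ * }}}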
+ */ + def loadSlices( + pathName: String, + maximumLength: Long = 10000L, + optPredicate: Option[FilterPredicate] = None, + optProjection: Option[Schema] = None): SliceDataset = LoadSlices.time { + + val trimmedPathName = trimExtensionIfCompressed(pathName) + if (isFastaExt(trimmedPathName)) { + info(s"Loading $pathName as FASTA and converting to Slices.") + loadFastaDna( + pathName, + maximumLength + ) + } else { + info(s"Loading $pathName as Parquet containing Slices.") + loadParquetSlices(pathName, optPredicate = optPredicate, optProjection = optProjection) + } + } + // alignments /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala deleted file mode 100644 index dfb9768c71..0000000000 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDataset.scala +++ /dev/null @@ -1,526 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.rdd.contig - -import com.google.common.base.Splitter -import org.apache.parquet.hadoop.metadata.CompressionCodecName -import org.apache.spark.SparkContext -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.api.java.function.{ Function => JFunction } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{ Dataset, SQLContext } -import org.bdgenomics.adam.converters.FragmentConverter -import org.bdgenomics.adam.models.{ - ReferenceRegion, - ReferenceRegionSerializer, - SequenceRecord, - SequenceDictionary -} -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.{ - DatasetBoundGenomicDataset, - AvroGenomicDataset, - JavaSaveArgs -} -import org.bdgenomics.adam.serialization.AvroSerializer -import org.bdgenomics.adam.sql.{ NucleotideContigFragment => NucleotideContigFragmentProduct } -import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment } -import org.bdgenomics.utils.interval.array.{ - IntervalArray, - IntervalArraySerializer -} -import scala.collection.JavaConverters._ -import scala.math.max -import scala.reflect.ClassTag -import scala.reflect.runtime.universe._ - -private[adam] case class NucleotideContigFragmentArray( - array: Array[(ReferenceRegion, NucleotideContigFragment)], - maxIntervalWidth: Long) extends IntervalArray[ReferenceRegion, NucleotideContigFragment] { - - def duplicate(): IntervalArray[ReferenceRegion, NucleotideContigFragment] = { - copy() - } - - protected def replace(arr: Array[(ReferenceRegion, NucleotideContigFragment)], - maxWidth: Long): IntervalArray[ReferenceRegion, NucleotideContigFragment] = { - NucleotideContigFragmentArray(arr, maxWidth) - } -} - -private[adam] class NucleotideContigFragmentArraySerializer extends IntervalArraySerializer[ReferenceRegion, NucleotideContigFragment, NucleotideContigFragmentArray] { - - protected val kSerializer = new ReferenceRegionSerializer - protected val tSerializer = new AvroSerializer[NucleotideContigFragment] - - protected def builder(arr: Array[(ReferenceRegion, NucleotideContigFragment)], - maxIntervalWidth: Long): NucleotideContigFragmentArray = { - NucleotideContigFragmentArray(arr, maxIntervalWidth) - } -} - -object NucleotideContigFragmentDataset extends Serializable { - - /** - * Builds a NucleotideContigFragmentDataset when no sequence dictionary is given. - * - * @param rdd Underlying RDD. We recompute the sequence dictionary from - * this RDD. - * @return Returns a new NucleotideContigFragmentDataset. - */ - private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentDataset = { - - // get sequence dictionary - val sd = new SequenceDictionary(rdd.flatMap(ncf => { - if (ncf.getContigName != null) { - Some(SequenceRecord.fromADAMContigFragment(ncf)) - } else { - None - } - }).distinct - .collect - .toVector) - - NucleotideContigFragmentDataset(rdd, sd) - } - - /** - * Builds a NucleotideContigFragmentDataset without a partition map. - * - * @param rdd The underlying NucleotideContigFragment RDD. - * @param sequences The sequence dictionary for the RDD. - * @return A new NucleotideContigFragmentDataset. 
- */ - def apply(rdd: RDD[NucleotideContigFragment], - sequences: SequenceDictionary): NucleotideContigFragmentDataset = { - - RDDBoundNucleotideContigFragmentDataset(rdd, sequences, None) - } -} - -case class ParquetUnboundNucleotideContigFragmentDataset private[rdd] ( - @transient private val sc: SparkContext, - private val parquetFilename: String, - sequences: SequenceDictionary) extends NucleotideContigFragmentDataset { - - protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) - - lazy val rdd: RDD[NucleotideContigFragment] = { - sc.loadParquet(parquetFilename) - } - - lazy val dataset = { - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - sqlContext.read.parquet(parquetFilename).withColumnRenamed("referenceName", "contigName").as[NucleotideContigFragmentProduct] - } - - def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { - copy(sequences = newSequences) - } -} - -case class DatasetBoundNucleotideContigFragmentDataset private[rdd] ( - dataset: Dataset[NucleotideContigFragmentProduct], - sequences: SequenceDictionary, - override val isPartitioned: Boolean = true, - override val optPartitionBinSize: Option[Int] = Some(1000000), - override val optLookbackPartitions: Option[Int] = Some(1)) extends NucleotideContigFragmentDataset - with DatasetBoundGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - lazy val rdd: RDD[NucleotideContigFragment] = dataset.rdd.map(_.toAvro) - - protected lazy val optPartitionMap = None - - override def saveAsParquet(filePath: String, - blockSize: Int = 128 * 1024 * 1024, - pageSize: Int = 1 * 1024 * 1024, - compressCodec: CompressionCodecName = CompressionCodecName.GZIP, - disableDictionaryEncoding: Boolean = false) { - info("Saving directly as Parquet from SQL. Options other than compression codec are ignored.") - dataset.toDF() - .write - .format("parquet") - .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) - .save(filePath) - saveMetadata(filePath) - } - - def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { - copy(sequences = newSequences) - } -} - -/** - * A wrapper class for RDD[NucleotideContigFragment]. - * - * @param rdd Underlying RDD - * @param sequences Sequence dictionary computed from rdd - */ -case class RDDBoundNucleotideContigFragmentDataset private[rdd] ( - rdd: RDD[NucleotideContigFragment], - sequences: SequenceDictionary, - optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends NucleotideContigFragmentDataset { - - /** - * A SQL Dataset of contig fragments. 
- */ - lazy val dataset: Dataset[NucleotideContigFragmentProduct] = { - val sqlContext = SQLContext.getOrCreate(rdd.context) - import sqlContext.implicits._ - sqlContext.createDataset(rdd.map(NucleotideContigFragmentProduct.fromAvro)) - } - - def replaceSequences( - newSequences: SequenceDictionary): NucleotideContigFragmentDataset = { - copy(sequences = newSequences) - } -} - -sealed abstract class NucleotideContigFragmentDataset extends AvroGenomicDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset] { - - protected val productFn = NucleotideContigFragmentProduct.fromAvro(_) - protected val unproductFn = (c: NucleotideContigFragmentProduct) => c.toAvro - - @transient val uTag: TypeTag[NucleotideContigFragmentProduct] = typeTag[NucleotideContigFragmentProduct] - - protected def buildTree(rdd: RDD[(ReferenceRegion, NucleotideContigFragment)])( - implicit tTag: ClassTag[NucleotideContigFragment]): IntervalArray[ReferenceRegion, NucleotideContigFragment] = { - IntervalArray(rdd, NucleotideContigFragmentArray.apply(_, _)) - } - - /** - * Converts an RDD of nucleotide contig fragments into reads. Adjacent contig fragments are - * combined. - * - * @return Returns an RDD of reads. - */ - def toReads: RDD[AlignmentRecord] = { - FragmentConverter.convertRdd(rdd) - } - - def union(datasets: NucleotideContigFragmentDataset*): NucleotideContigFragmentDataset = { - val iterableDatasets = datasets.toSeq - NucleotideContigFragmentDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), - iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _)) - } - - /** - * Replaces the underlying RDD with a new RDD. - * - * @param newRdd The RDD to use for the new NucleotideContigFragmentDataset. - * @return Returns a new NucleotideContigFragmentDataset where the underlying RDD - * has been replaced. - */ - protected def replaceRdd(newRdd: RDD[NucleotideContigFragment], - newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): NucleotideContigFragmentDataset = { - new RDDBoundNucleotideContigFragmentDataset(newRdd, sequences, newPartitionMap) - } - - /** - * @param elem Fragment to extract a region from. - * @return If a fragment is aligned to a reference location, returns a single - * reference region. If the fragment start position and name is not defined, - * returns no regions. - */ - protected def getReferenceRegions(elem: NucleotideContigFragment): Seq[ReferenceRegion] = { - ReferenceRegion(elem).toSeq - } - - override def transformDataset( - tFn: Dataset[NucleotideContigFragmentProduct] => Dataset[NucleotideContigFragmentProduct]): NucleotideContigFragmentDataset = { - DatasetBoundNucleotideContigFragmentDataset(tFn(dataset), sequences) - } - - override def transformDataset( - tFn: JFunction[Dataset[NucleotideContigFragmentProduct], Dataset[NucleotideContigFragmentProduct]]): NucleotideContigFragmentDataset = { - DatasetBoundNucleotideContigFragmentDataset(tFn.call(dataset), sequences) - } - - override def saveAsPartitionedParquet(pathName: String, - compressCodec: CompressionCodecName = CompressionCodecName.GZIP, - partitionSize: Int = 1000000) { - info("Saving directly as Hive-partitioned Parquet from SQL. 
" + - "Options other than compression codec are ignored.") - val df = toDF() - .withColumnRenamed("contigName", "referenceName") - df.withColumn("positionBin", floor(df("start") / partitionSize)) - .write - .partitionBy("referenceName", "positionBin") - .format("parquet") - .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) - .save(pathName) - writePartitionedParquetFlag(pathName, partitionSize) - saveMetadata(pathName) - } - - /** - * Save nucleotide contig fragments as Parquet or FASTA. - * - * If filename ends in .fa or .fasta, saves as Fasta. If not, saves fragments - * to Parquet. Defaults to 60 character line length, if saving to FASTA. - * - * @param fileName file name - * @param asSingleFile If false, writes file to disk as shards with - * one shard per partition. If true, we save the file to disk as a single - * file by merging the shards. - */ - def save(fileName: java.lang.String, - asSingleFile: java.lang.Boolean) { - if (fileName.endsWith(".fa") || fileName.endsWith(".fasta")) { - saveAsFasta(fileName, asSingleFile = asSingleFile) - } else { - saveAsParquet(new JavaSaveArgs(fileName)) - } - } - - /** - * Save nucleotide contig fragments in FASTA format. - * - * @param fileName file name - * @param lineWidth hard wrap FASTA formatted sequence at line width, default 60 - * @param asSingleFile By default (false), writes file to disk as shards with - * one shard per partition. If true, we save the file to disk as a single - * file by merging the shards. - * @param disableFastConcat If asSingleFile is true, disables the use of the - * parallel file merging engine. - */ - def saveAsFasta(fileName: String, - lineWidth: Int = 60, - asSingleFile: Boolean = false, - disableFastConcat: Boolean = false) { - - def isFragment(record: NucleotideContigFragment): Boolean = { - Option(record.getIndex).isDefined && Option(record.getFragments).fold(false)(_ > 1) - } - - def toFasta(record: NucleotideContigFragment): String = { - val sb = new StringBuilder() - sb.append(">") - sb.append(record.getContigName) - Option(record.getDescription).foreach(n => sb.append(" ").append(n)) - if (isFragment(record)) { - sb.append(s" fragment ${record.getIndex + 1} of ${record.getFragments}") - } - for (line <- Splitter.fixedLength(lineWidth).split(record.getSequence).asScala) { - sb.append("\n") - sb.append(line) - } - sb.toString - } - - val asFasta = rdd.map(toFasta) - - writeTextRdd(asFasta, - fileName, - asSingleFile, - disableFastConcat) - } - - /** - * Merge fragments by contig name. - * - * @return Returns a NucleotideContigFragmentDataset containing a single fragment - * per contig. - */ - def mergeFragments(): NucleotideContigFragmentDataset = { - - def merge(first: NucleotideContigFragment, second: NucleotideContigFragment): NucleotideContigFragment = { - val merged = NucleotideContigFragment.newBuilder(first) - .setIndex(null) - .setStart(null) - .setFragments(null) - .setSequence(first.getSequence + second.getSequence) - .build - - merged - } - - replaceRdd(rdd.sortBy(fragment => (fragment.getContigName, - Option(fragment.getIndex).map(_.toInt) - .getOrElse(-1))) - .map(fragment => (fragment.getContigName, fragment)) - .reduceByKey(merge) - .values) - } - - /** - * From a set of contigs, returns the base sequence that corresponds to a region of the reference. - * - * @throws UnsupportedOperationException Throws exception if query region is not found. - * @param region Reference region over which to get sequence. 
- * @return String of bases corresponding to reference sequence. - */ - def extract(region: ReferenceRegion): String = { - def getString(fragment: (ReferenceRegion, NucleotideContigFragment)): (ReferenceRegion, String) = { - val trimStart = max(0, region.start - fragment._1.start).toInt - val trimEnd = max(0, fragment._1.end - region.end).toInt - - val fragmentSequence: String = fragment._2.getSequence - - val str = fragmentSequence.drop(trimStart) - .dropRight(trimEnd) - val reg = new ReferenceRegion( - fragment._1.referenceName, - fragment._1.start + trimStart, - fragment._1.end - trimEnd - ) - (reg, str) - } - - def reducePairs( - kv1: (ReferenceRegion, String), - kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = { - assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " + - kv1 + ", " + kv2) - - (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) { - kv1._2 + kv2._2 - } else { - kv2._2 + kv1._2 - }) - } - - try { - val refPairRDD: RDD[(ReferenceRegion, String)] = rdd.keyBy(ReferenceRegion(_)) - .filter(kv => kv._1.isDefined) - .map(kv => (kv._1.get, kv._2)) - .filter(kv => kv._1.overlaps(region)) - .sortByKey() - .map(kv => getString(kv)) - - val pair: (ReferenceRegion, String) = refPairRDD.collect.reduceLeft(reducePairs) - assert( - pair._1.compareTo(region) == 0, - "Merging fragments returned a different region than requested." - ) - - pair._2 - } catch { - case (uoe: UnsupportedOperationException) => - throw new UnsupportedOperationException("Could not find " + region + "in reference RDD.") - } - } - - /** - * (Java-specific) From a set of contigs, returns a list of sequences based on reference regions provided. - * - * @param regions List of Reference regions over which to get sequences. - * @return JavaRDD[(ReferenceRegion, String)] of region -> sequence pairs. - */ - def extractRegions(regions: java.util.List[ReferenceRegion]): JavaRDD[(ReferenceRegion, String)] = { - extractRegions(regions.asScala).toJavaRDD() - } - - /** - * (Scala-specific) From a set of contigs, returns a list of sequences based on reference regions provided. - * - * @param regions Reference regions over which to get sequences. - * @return RDD[(ReferenceRegion, String)] of region -> sequence pairs. - */ - def extractRegions(regions: Iterable[ReferenceRegion]): RDD[(ReferenceRegion, String)] = { - - def extractSequence(fragmentRegion: ReferenceRegion, fragment: NucleotideContigFragment, region: ReferenceRegion): (ReferenceRegion, String) = { - val merged = fragmentRegion.intersection(region) - val start = (merged.start - fragmentRegion.start).toInt - val end = (merged.end - fragmentRegion.start).toInt - val fragmentSequence: String = fragment.getSequence - (merged, fragmentSequence.substring(start, end)) - } - - def reduceRegionSequences( - kv1: (ReferenceRegion, String), - kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = { - (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) { - kv1._2 + kv2._2 - } else { - kv2._2 + kv1._2 - }) - } - - val places = flattenRddByRegions() - .flatMap { - case (fragmentRegion, fragment) => - regions.collect { - case region if fragmentRegion.overlaps(region) => - (region, extractSequence(fragmentRegion, fragment, region)) - } - }.sortByKey() - - places.reduceByKey(reduceRegionSequences).values - } - - /** - * (Java-specific) For all adjacent records in the genomic dataset, we extend the records so that the adjacent - * records now overlap by _n_ bases, where _n_ is the flank length. 
- * - * @param flankLength The length to extend adjacent records by. - * @return Returns the genomic dataset, with all adjacent fragments extended with flanking sequence. - */ - def flankAdjacentFragments( - flankLength: java.lang.Integer): NucleotideContigFragmentDataset = { - val flank: Int = flankLength - flankAdjacentFragments(flank) - } - - /** - * (Scala-specific) For all adjacent records in the genomic dataset, we extend the records so that the adjacent - * records now overlap by _n_ bases, where _n_ is the flank length. - * - * @param flankLength The length to extend adjacent records by. - * @return Returns the genomic dataset, with all adjacent fragments extended with flanking sequence. - */ - def flankAdjacentFragments( - flankLength: Int): NucleotideContigFragmentDataset = { - replaceRdd(FlankReferenceFragments(rdd, - sequences, - flankLength)) - } - - /** - * (Scala-specific) Counts the k-mers contained in a FASTA contig. - * - * @param kmerLength The length of k-mers to count. - * @return Returns an RDD containing k-mer/count pairs. - */ - def countKmers(kmerLength: Int): RDD[(String, Long)] = { - flankAdjacentFragments(kmerLength).rdd.flatMap(r => { - // cut each read into k-mers, and attach a count of 1L - r.getSequence - .sliding(kmerLength) - .map(k => (k, 1L)) - }).reduceByKey((k1: Long, k2: Long) => k1 + k2) - } - - /** - * (Java-specific) Counts the k-mers contained in a FASTA contig. - * - * @param kmerLength The length of k-mers to count. - * @return Returns an RDD containing k-mer/count pairs. - */ - def countKmers( - kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = { - val k: Int = kmerLength - countKmers(k).map(p => { - (p._1, p._2: java.lang.Long) - }).toJavaRDD() - } -} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReadDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReadDataset.scala new file mode 100644 index 0000000000..219613a080 --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ReadDataset.scala @@ -0,0 +1,323 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.bdgenomics.adam.rdd.read + +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.spark.SparkContext +import org.apache.spark.api.java.function.{ Function => JFunction } +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ Dataset, SQLContext } +import org.bdgenomics.adam.models._ +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset } +import org.bdgenomics.adam.rdd.{ + DatasetBoundGenomicDataset, + AvroGenomicDataset, + JavaSaveArgs +} +import org.bdgenomics.adam.serialization.AvroSerializer +import org.bdgenomics.adam.sql.{ Read => ReadProduct } +import org.bdgenomics.formats.avro.{ + Read, + Sequence, + Slice, + Strand +} +import org.bdgenomics.utils.interval.array.{ IntervalArray, IntervalArraySerializer } +import scala.reflect.ClassTag +import scala.reflect.runtime.universe._ + +private[adam] case class ReadArray( + array: Array[(ReferenceRegion, Read)], + maxIntervalWidth: Long) extends IntervalArray[ReferenceRegion, Read] { + + def duplicate(): IntervalArray[ReferenceRegion, Read] = { + copy() + } + + protected def replace(arr: Array[(ReferenceRegion, Read)], + maxWidth: Long): IntervalArray[ReferenceRegion, Read] = { + ReadArray(arr, maxWidth) + } +} + +private[adam] class ReadArraySerializer extends IntervalArraySerializer[ReferenceRegion, Read, ReadArray] { + + protected val kSerializer = new ReferenceRegionSerializer + protected val tSerializer = new AvroSerializer[Read] + + protected def builder(arr: Array[(ReferenceRegion, Read)], + maxIntervalWidth: Long): ReadArray = { + ReadArray(arr, maxIntervalWidth) + } +} + +object ReadDataset { + + /** + * A genomic dataset that wraps a dataset of Read data. + * + * @param ds A Dataset of genomic Reads. + * @param sequences The reference genome these data are aligned to. + */ + def apply(ds: Dataset[ReadProduct], + sequences: SequenceDictionary): ReadDataset = { + new DatasetBoundReadDataset(ds, sequences) + } + + /** + * Builds a ReadDataset with an empty sequence dictionary. + * + * @param rdd The underlying Read RDD to build from. + * @return Returns a new ReadDataset. + */ + def apply(rdd: RDD[Read]): ReadDataset = { + ReadDataset(rdd, SequenceDictionary.empty) + } + + /** + * Builds a ReadDataset given a sequence dictionary. + * + * @param rdd The underlying Read RDD to build from. + * @param sd The sequence dictionary for this ReadDataset. + * @return Returns a new ReadDataset. 
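+   *
+   * @note A minimal construction sketch (illustrative only; the SparkContext
+   *       named sc and the field values below are assumptions, not part of
+   *       this API):
+   *       {{{
+   *       val read = Read.newBuilder()
+   *         .setName("read1")
+   *         .setSequence("ACTG")
+   *         .setLength(4L)
+   *         .build()
+   *       val readDataset = ReadDataset(sc.parallelize(Seq(read)), SequenceDictionary.empty)
+   *       }}}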
+ */ + def apply(rdd: RDD[Read], sd: SequenceDictionary): ReadDataset = { + new RDDBoundReadDataset(rdd, sd, None) + } +} + +case class ParquetUnboundReadDataset private[rdd] ( + @transient private val sc: SparkContext, + private val parquetFilename: String, + sequences: SequenceDictionary) extends ReadDataset { + + lazy val rdd: RDD[Read] = { + sc.loadParquet(parquetFilename) + } + + protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) + + lazy val dataset = { + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + sqlContext.read.parquet(parquetFilename).as[ReadProduct] + } + + def replaceSequences(newSequences: SequenceDictionary): ReadDataset = { + copy(sequences = newSequences) + } +} + +case class DatasetBoundReadDataset private[rdd] ( + dataset: Dataset[ReadProduct], + sequences: SequenceDictionary, + override val isPartitioned: Boolean = true, + override val optPartitionBinSize: Option[Int] = Some(1000000), + override val optLookbackPartitions: Option[Int] = Some(1)) extends ReadDataset + with DatasetBoundGenomicDataset[Read, ReadProduct, ReadDataset] { + + lazy val rdd = dataset.rdd.map(_.toAvro) + protected lazy val optPartitionMap = None + + override def saveAsParquet(filePath: String, + blockSize: Int = 128 * 1024 * 1024, + pageSize: Int = 1 * 1024 * 1024, + compressCodec: CompressionCodecName = CompressionCodecName.GZIP, + disableDictionaryEncoding: Boolean = false) { + warn("Saving directly as Parquet from SQL. Options other than compression codec are ignored.") + dataset.toDF() + .write + .format("parquet") + .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) + .save(filePath) + saveMetadata(filePath) + } + + override def transformDataset( + tFn: Dataset[ReadProduct] => Dataset[ReadProduct]): ReadDataset = { + copy(dataset = tFn(dataset)) + } + + override def transformDataset( + tFn: JFunction[Dataset[ReadProduct], Dataset[ReadProduct]]): ReadDataset = { + copy(dataset = tFn.call(dataset)) + } + + def replaceSequences(newSequences: SequenceDictionary): ReadDataset = { + copy(sequences = newSequences) + } +} + +case class RDDBoundReadDataset private[rdd] ( + rdd: RDD[Read], + sequences: SequenceDictionary, + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends ReadDataset { + + /** + * A SQL Dataset of reads. 
+ */ + lazy val dataset: Dataset[ReadProduct] = { + val sqlContext = SQLContext.getOrCreate(rdd.context) + import sqlContext.implicits._ + sqlContext.createDataset(rdd.map(ReadProduct.fromAvro)) + } + + def replaceSequences(newSequences: SequenceDictionary): ReadDataset = { + copy(sequences = newSequences) + } +} + +sealed abstract class ReadDataset extends AvroGenomicDataset[Read, ReadProduct, ReadDataset] { + + protected val productFn = ReadProduct.fromAvro(_) + protected val unproductFn = (r: ReadProduct) => r.toAvro + + @transient val uTag: TypeTag[ReadProduct] = typeTag[ReadProduct] + + protected def buildTree(rdd: RDD[(ReferenceRegion, Read)])( + implicit tTag: ClassTag[Read]): IntervalArray[ReferenceRegion, Read] = { + IntervalArray(rdd, ReadArray.apply(_, _)) + } + + def union(datasets: ReadDataset*): ReadDataset = { + val iterableDatasets = datasets.toSeq + ReadDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _)) + } + + override def transformDataset( + tFn: Dataset[ReadProduct] => Dataset[ReadProduct]): ReadDataset = { + DatasetBoundReadDataset(tFn(dataset), sequences) + } + + override def transformDataset( + tFn: JFunction[Dataset[ReadProduct], Dataset[ReadProduct]]): ReadDataset = { + DatasetBoundReadDataset(tFn.call(dataset), sequences) + } + + /** + * Convert this genomic dataset of reads into sequences. + * + * @return Returns a new SequenceDataset converted from this genomic dataset of reads. + */ + def toSequences: SequenceDataset = { + def toSequence(read: Read): Sequence = { + Sequence.newBuilder() + .setName(read.getName) + .setDescription(read.getDescription) + .setAlphabet(read.getAlphabet) + .setSequence(read.getSequence) + .setLength(read.getLength) + .setAttributes(read.getAttributes) + .build() + } + SequenceDataset(rdd.map(toSequence), sequences) + } + + /** + * Convert this genomic dataset of reads into slices. + * + * @return Returns a new SliceDataset converted from this genomic dataset of reads. + */ + def toSlices: SliceDataset = { + def toSlice(read: Read): Slice = { + Slice.newBuilder() + .setName(read.getName) + .setDescription(read.getDescription) + .setAlphabet(read.getAlphabet) + .setSequence(read.getSequence) + .setLength(read.getLength) + .setTotalLength(read.getLength) + .setStart(0L) + .setEnd(read.getLength) + .setStrand(Strand.INDEPENDENT) + .setAttributes(read.getAttributes) + .build() + } + SliceDataset(rdd.map(toSlice), sequences) + } + + /** + * Save reads as Parquet or FASTQ. + * + * If filename ends in .fq or .fastq, saves as FASTQ. If not, saves reads + * to Parquet. + * + * @param filePath Path to save files to. + * @param asSingleFile If true, saves output as a single file. + */ + def save(filePath: java.lang.String, asSingleFile: java.lang.Boolean) { + if (filePath.endsWith(".fq") || filePath.endsWith(".fastq")) { + saveAsFastq(filePath, asSingleFile = asSingleFile) + } else { + if (asSingleFile) { + warn("asSingleFile = true ignored when saving as Parquet.") + } + saveAsParquet(new JavaSaveArgs(filePath)) + } + } + + /** + * Save reads in FASTQ format. + * + * @param filePath Path to save files to. + * @param disableFastConcat If asSingleFile is true, disables the use of the + * parallel file merging engine. + * @param asSingleFile If true, saves output as a single file. 
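+   *
+   * @note A hedged usage sketch (the readDataset value and the output path are
+   *       assumptions):
+   *       {{{
+   *       readDataset.saveAsFastq("sample.fastq", asSingleFile = true)
+   *       }}}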
+ */ + def saveAsFastq(filePath: String, + asSingleFile: Boolean = false, + disableFastConcat: Boolean = false) { + + def toFastq(read: Read): String = { + val sb = new StringBuilder() + sb.append("@") + sb.append(read.getName) + Option(read.getDescription).foreach(n => sb.append(" ").append(n)) + sb.append("\n") + sb.append(read.getSequence) + sb.append("\n+\n") + sb.append(read.getQualityScores) + sb.append("\n") + sb.toString + } + + writeTextRdd(rdd.map(toFastq), + filePath, + asSingleFile = asSingleFile, + disableFastConcat = disableFastConcat) + } + + /** + * @param newRdd The RDD to replace the underlying RDD with. + * @return Returns a new ReadDataset with the underlying RDD replaced. + */ + protected def replaceRdd(newRdd: RDD[Read], + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): ReadDataset = { + new RDDBoundReadDataset(newRdd, sequences, newPartitionMap) + } + + /** + * @param read Read to extract a region from. + * @return Returns a reference region that covers the entirety of the read. + */ + protected def getReferenceRegions(read: Read): Seq[ReferenceRegion] = { + Seq(ReferenceRegion(read.getName, 0L, read.getLength)) + } +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FlankSlices.scala similarity index 60% rename from adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala rename to adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FlankSlices.scala index c0749e2905..bdf6ea011c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FlankSlices.scala @@ -15,59 +15,59 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.bdgenomics.adam.rdd.contig +package org.bdgenomics.adam.rdd.sequence import org.apache.spark.rdd.RDD import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary } import org.bdgenomics.adam.rdd.ReferencePartitioner -import org.bdgenomics.formats.avro.NucleotideContigFragment +import org.bdgenomics.formats.avro.Slice /** - * Object that extends all of the fragments in an RDD of contig fragments - * with the sequence flanking said fragment. + * Object that extends all of the slices in an RDD of slices with + * sequence flanking said slice. */ -private[contig] object FlankReferenceFragments extends Serializable { +private[sequence] object FlankSlices extends Serializable { /** - * Adds flanks to sequence fragments in an RDD. + * Adds flanking sequence to slices in an RDD. * - * Assumes that after sorting, all fragments are contiguous. + * Assumes that after sorting, all slices are contiguous. * * @param rdd The RDD to flank. * @param sd The sequence dictionary describing all contigs in this sequence * dictionary. - * @param flankSize The size of flanking sequence to add to each fragment. - * @return Returns a new RDD where each fragment has been extended with + * @param flankSize The size of flanking sequence to add to each slice. + * @return Returns a new RDD where each slice has been extended with * flanking sequence. 
*/ def apply( - rdd: RDD[NucleotideContigFragment], + rdd: RDD[Slice], sd: SequenceDictionary, - flankSize: Int): RDD[NucleotideContigFragment] = { - rdd.keyBy(ctg => ReferenceRegion(ctg).get) + flankSize: Int): RDD[Slice] = { + rdd.keyBy(slice => ReferenceRegion(slice).get) .repartitionAndSortWithinPartitions(ReferencePartitioner(sd)) .mapPartitions(flank(_, flankSize)) } def flank( - iter: Iterator[(ReferenceRegion, NucleotideContigFragment)], - flankSize: Int): Iterator[NucleotideContigFragment] = { + iter: Iterator[(ReferenceRegion, Slice)], + flankSize: Int): Iterator[Slice] = { // we need to have at least one element in the iterator if (iter.hasNext) { // now, we apply a window and flank adjacent segments - var lastFragment = iter.next + var lastSlice = iter.next iter.map(f => { // grab temp copy; we will overwrite later - val copyLastFragment = lastFragment + val copyLastSlice = lastSlice - // are the two fragments adjacent? if so, we must add the flanking sequences - if (copyLastFragment._1.isAdjacent(f._1)) { - val lastSequence = copyLastFragment._2.getSequence + // are the two slices adjacent? if so, we must add the flanking sequences + if (copyLastSlice._1.isAdjacent(f._1)) { + val lastSequence = copyLastSlice._2.getSequence val currSequence = f._2.getSequence // update fragments with flanking sequences - copyLastFragment._2.setSequence(lastSequence + currSequence.take(flankSize)) - copyLastFragment._2.setDescription(Option(copyLastFragment._2.getDescription) + copyLastSlice._2.setSequence(lastSequence + currSequence.take(flankSize)) + copyLastSlice._2.setDescription(Option(copyLastSlice._2.getDescription) .fold("rr")(_ + "rr")) f._2.setSequence(lastSequence.takeRight(flankSize) + currSequence) f._2.setDescription("f") @@ -75,16 +75,16 @@ private[contig] object FlankReferenceFragments extends Serializable { // we must change the start position of the fragment we are prepending to f._2.setStart(f._2.getStart - flankSize.toLong) // and the end position of the fragment we are appending to - copyLastFragment._2.setEnd( - copyLastFragment._2.getStart + copyLastFragment._2.getSequence.length - 1L) + copyLastSlice._2.setEnd( + copyLastSlice._2.getStart + copyLastSlice._2.getSequence.length - 1L) } // overwrite last fragment - lastFragment = f + lastSlice = f // emit updated last fragment - copyLastFragment._2 - }) ++ Iterator(lastFragment._2) + copyLastSlice._2 + }) ++ Iterator(lastSlice._2) } else { Iterator() } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala new file mode 100644 index 0000000000..5444d5e28e --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala @@ -0,0 +1,456 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.sequence + +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.spark.SparkContext +import org.apache.spark.api.java.function.{ Function => JFunction } +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ Dataset, SQLContext } +import org.bdgenomics.adam.instrumentation.Timers._ +import org.bdgenomics.adam.models._ +import org.bdgenomics.adam.rdd.read.ReadDataset +import org.bdgenomics.adam.rdd.{ + DatasetBoundGenomicDataset, + AvroGenomicDataset, + JavaSaveArgs +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.serialization.AvroSerializer +import org.bdgenomics.adam.sql.{ Sequence => SequenceProduct } +import org.bdgenomics.formats.avro.{ + Read, + Sequence, + Slice, + Strand +} +import org.bdgenomics.utils.interval.array.{ + IntervalArray, + IntervalArraySerializer +} +import scala.collection.mutable.MutableList +import scala.reflect.ClassTag +import scala.reflect.runtime.universe._ + +private[adam] case class SequenceArray( + array: Array[(ReferenceRegion, Sequence)], + maxIntervalWidth: Long) extends IntervalArray[ReferenceRegion, Sequence] { + + def duplicate(): IntervalArray[ReferenceRegion, Sequence] = { + copy() + } + + protected def replace(arr: Array[(ReferenceRegion, Sequence)], + maxWidth: Long): IntervalArray[ReferenceRegion, Sequence] = { + SequenceArray(arr, maxWidth) + } +} + +private[adam] class SequenceArraySerializer extends IntervalArraySerializer[ReferenceRegion, Sequence, SequenceArray] { + + protected val kSerializer = new ReferenceRegionSerializer + protected val tSerializer = new AvroSerializer[Sequence] + + protected def builder(arr: Array[(ReferenceRegion, Sequence)], + maxIntervalWidth: Long): SequenceArray = { + SequenceArray(arr, maxIntervalWidth) + } +} + +object SequenceDataset { + + /** + * A genomic dataset that wraps a dataset of Sequence data. + * + * @param ds A Dataset of sequences. + * @param sequences The reference genome these data are aligned to. + */ + def apply(ds: Dataset[SequenceProduct], + sequences: SequenceDictionary): SequenceDataset = { + new DatasetBoundSequenceDataset(ds, sequences) + } + + /** + * Builds a SequenceDataset with an empty sequence dictionary. + * + * @param rdd The underlying Sequence RDD to build from. + * @return Returns a new SequenceDataset. + */ + def apply(rdd: RDD[Sequence]): SequenceDataset = { + SequenceDataset(rdd, SequenceDictionary.empty) + } + + /** + * Builds a SequenceDataset given a sequence dictionary. + * + * @param rdd The underlying Sequence RDD to build from. + * @param sd The sequence dictionary for this SequenceDataset. + * @return Returns a new SequenceDataset. 
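+   *
+   * @note A minimal construction and slicing sketch (illustrative only; the
+   *       SparkContext named sc and the field values are assumptions):
+   *       {{{
+   *       val sequence = Sequence.newBuilder()
+   *         .setName("chr1")
+   *         .setSequence("ACTGACTGAC")
+   *         .setLength(10L)
+   *         .build()
+   *       val sequences = SequenceDataset(sc.parallelize(Seq(sequence)), SequenceDictionary.empty)
+   *       val slices = sequences.slice(maximumLength = 4L)
+   *       }}}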
+ */ + def apply(rdd: RDD[Sequence], sd: SequenceDictionary): SequenceDataset = { + new RDDBoundSequenceDataset(rdd, sd, None) + } +} + +case class ParquetUnboundSequenceDataset private[rdd] ( + @transient private val sc: SparkContext, + private val parquetFilename: String, + sequences: SequenceDictionary) extends SequenceDataset { + + lazy val rdd: RDD[Sequence] = { + sc.loadParquet(parquetFilename) + } + + protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) + + lazy val dataset = { + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + sqlContext.read.parquet(parquetFilename).as[SequenceProduct] + } + + def replaceSequences(newSequences: SequenceDictionary): SequenceDataset = { + copy(sequences = newSequences) + } +} + +case class DatasetBoundSequenceDataset private[rdd] ( + dataset: Dataset[SequenceProduct], + sequences: SequenceDictionary, + override val isPartitioned: Boolean = true, + override val optPartitionBinSize: Option[Int] = Some(1000000), + override val optLookbackPartitions: Option[Int] = Some(1)) extends SequenceDataset + with DatasetBoundGenomicDataset[Sequence, SequenceProduct, SequenceDataset] { + + lazy val rdd = dataset.rdd.map(_.toAvro) + protected lazy val optPartitionMap = None + + override def saveAsParquet(filePath: String, + blockSize: Int = 128 * 1024 * 1024, + pageSize: Int = 1 * 1024 * 1024, + compressCodec: CompressionCodecName = CompressionCodecName.GZIP, + disableDictionaryEncoding: Boolean = false) { + warn("Saving directly as Parquet from SQL. Options other than compression codec are ignored.") + dataset.toDF() + .write + .format("parquet") + .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) + .save(filePath) + saveMetadata(filePath) + } + + override def transformDataset( + tFn: Dataset[SequenceProduct] => Dataset[SequenceProduct]): SequenceDataset = { + copy(dataset = tFn(dataset)) + } + + override def transformDataset( + tFn: JFunction[Dataset[SequenceProduct], Dataset[SequenceProduct]]): SequenceDataset = { + copy(dataset = tFn.call(dataset)) + } + + def replaceSequences(newSequences: SequenceDictionary): SequenceDataset = { + copy(sequences = newSequences) + } +} + +case class RDDBoundSequenceDataset private[rdd] ( + rdd: RDD[Sequence], + sequences: SequenceDictionary, + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends SequenceDataset { + + /** + * A SQL Dataset of sequences. 
+ */ + lazy val dataset: Dataset[SequenceProduct] = { + val sqlContext = SQLContext.getOrCreate(rdd.context) + import sqlContext.implicits._ + sqlContext.createDataset(rdd.map(SequenceProduct.fromAvro)) + } + + def replaceSequences(newSequences: SequenceDictionary): SequenceDataset = { + copy(sequences = newSequences) + } +} + +sealed abstract class SequenceDataset extends AvroGenomicDataset[Sequence, SequenceProduct, SequenceDataset] { + + protected val productFn = SequenceProduct.fromAvro(_) + protected val unproductFn = (s: SequenceProduct) => s.toAvro + + @transient val uTag: TypeTag[SequenceProduct] = typeTag[SequenceProduct] + + protected def buildTree(rdd: RDD[(ReferenceRegion, Sequence)])( + implicit tTag: ClassTag[Sequence]): IntervalArray[ReferenceRegion, Sequence] = { + IntervalArray(rdd, SequenceArray.apply(_, _)) + } + + def union(datasets: SequenceDataset*): SequenceDataset = { + val iterableDatasets = datasets.toSeq + SequenceDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _)) + } + + override def transformDataset( + tFn: Dataset[SequenceProduct] => Dataset[SequenceProduct]): SequenceDataset = { + DatasetBoundSequenceDataset(tFn(dataset), sequences) + } + + override def transformDataset( + tFn: JFunction[Dataset[SequenceProduct], Dataset[SequenceProduct]]): SequenceDataset = { + DatasetBoundSequenceDataset(tFn.call(dataset), sequences) + } + + /** + * Slice the sequences in this genomic dataset to the specified maximum length. + * + * @param maximumLength Maximum length. + * @return Returns a new SliceDataset from the sequences in this genomic dataset sliced + * to the specified maximum length. + */ + def slice(maximumLength: Long): SliceDataset = { + def sliceSequence(sequence: Sequence): Seq[Slice] = { + val slices: MutableList[Slice] = MutableList() + + val sb = Slice.newBuilder + .setName(sequence.getName) + .setDescription(sequence.getDescription) + .setAlphabet(sequence.getAlphabet) + .setSequence(sequence.getSequence) + .setStrand(Strand.INDEPENDENT) + .setTotalLength(sequence.getLength) + .setAttributes(sequence.getAttributes) + + var index = 0 + var count = (sequence.getLength / maximumLength).toInt + if (sequence.getLength % maximumLength != 0) count += 1 + for (start <- 0L until sequence.getLength by maximumLength) { + val end = math.min(sequence.getLength, start + maximumLength) + slices += sb + .setStart(start) + .setEnd(end) + .setLength(end - start) + .setSequence(sequence.getSequence.substring(start.toInt, end.toInt)) + .setIndex(index) + .setSlices(count) + .build() + index += 1 + } + slices + } + SliceDataset(rdd.flatMap(sliceSequence), sequences) + } + + /** + * Slice the specified sequence overlapping the specified region. + * + * @param region Region to overlap. + * @return Returns a new Slice from the sequence overlapping the specified region. + */ + private def slice(sequence: Sequence, region: ReferenceRegion): Slice = { + // region may be open-ended + val end = math.min(sequence.getLength, region.end) + Slice.newBuilder() + .setName(sequence.getName) + .setDescription(sequence.getDescription) + .setAlphabet(sequence.getAlphabet) + .setSequence(sequence.getSequence.substring(region.start.toInt, end.toInt)) + .setLength(end - region.start) + .setTotalLength(sequence.getLength) + .setStart(region.start) + .setEnd(end) + .setStrand(region.strand) // perhaps Sequence should have strand? 
+ .setAttributes(sequence.getAttributes) + .build() + } + + /** + * Slice the sequences in this genomic dataset overlapping the specified region. + * + * @param region Region to overlap. + * @return Returns a new SliceDataset from the sequences in this genomic dataset sliced + * to overlap the specified region. + */ + def slice(region: ReferenceRegion): SliceDataset = { + SliceDataset(filterByOverlappingRegion(region).rdd.map(sequence => slice(sequence, region))) + } + + /** + * Slice the specified sequence overlapping the specified regions. + * + * @param regions Regions to overlap. + * @return Returns one or more slices from the sequence overlapping the specified regions. + */ + private def slice(sequence: Sequence, regions: Iterable[ReferenceRegion]): Iterable[Slice] = { + val sequenceRegion = ReferenceRegion(sequence).get + regions.map(region => + if (region.covers(sequenceRegion)) { + Some(slice(sequence, region)) + } else { + None + }).flatten + } + + /** + * Slice the sequences in this genomic dataset overlapping the specified regions. + * + * @param regions Regions to overlap. + * @return Returns a new SliceDataset from the sequences in this genomic dataset sliced + * to overlap the specified regions. + */ + def slice(regions: Iterable[ReferenceRegion]): SliceDataset = { + SliceDataset(filterByOverlappingRegions(regions).rdd.flatMap(sequence => slice(sequence, regions))) + } + + /** + * Convert this genomic dataset of sequences into reads. + * + * @return Returns a new ReadRDD converted from this genomic dataset of sequences. + */ + def toReads: ReadDataset = { + def toRead(sequence: Sequence): Read = { + Read.newBuilder() + .setName(sequence.getName) + .setDescription(sequence.getDescription) + .setAlphabet(sequence.getAlphabet) + .setSequence(sequence.getSequence) + .setLength(sequence.getLength) + .setQualityScores("B" * (if (sequence.getLength == null) 0 else sequence.getLength.toInt)) + .setAttributes(sequence.getAttributes) + .build() + } + ReadDataset(rdd.map(toRead), sequences) + } + + /** + * Convert this genomic dataset of sequences into slices. + * + * @return Returns a new SliceDataset converted from this genomic dataset of sequences. + */ + def toSlices: SliceDataset = { + def toSlice(sequence: Sequence): Slice = { + Slice.newBuilder() + .setName(sequence.getName) + .setDescription(sequence.getDescription) + .setAlphabet(sequence.getAlphabet) + .setSequence(sequence.getSequence) + .setLength(sequence.getLength) + .setTotalLength(sequence.getLength) + .setStart(0L) + .setEnd(sequence.getLength) + .setStrand(Strand.INDEPENDENT) + .setAttributes(sequence.getAttributes) + .build() + } + SliceDataset(rdd.map(toSlice), sequences) + } + + /** + * Replace the sequence dictionary for this SequenceDataset with one + * created from the sequences in this SequenceDataset. + * + * @return Returns a new SequenceDataset with the sequence dictionary replaced. + */ + def createSequenceDictionary(): SequenceDataset = CreateSequenceDictionary.time { + val sd = new SequenceDictionary(rdd.flatMap(sequence => { + if (sequence.getName != null) { + Some(SequenceRecord.fromSequence(sequence)) + } else { + None + } + }).distinct + .collect + .toVector) + + replaceSequences(sd) + } + + /** + * Save sequences as Parquet or FASTA. + * + * If filename ends in .fa or .fasta, saves as FASTA. If not, saves fragments + * to Parquet. Defaults to 60 character line length, if saving to FASTA. + * + * @param filePath Path to save files to. + * @param asSingleFile If true, saves output as a single file. 
+ * @param disableFastConcat If asSingleFile is true, disables the use of the + * parallel file merging engine. + */ + def save( + filePath: java.lang.String, + asSingleFile: java.lang.Boolean, + disableFastConcat: java.lang.Boolean) { + if (filePath.endsWith(".fa") || filePath.endsWith(".fasta")) { + saveAsFasta(filePath, asSingleFile = asSingleFile, disableFastConcat = disableFastConcat) + } else { + if (asSingleFile) { + warn("asSingleFile = true ignored when saving as Parquet.") + } + saveAsParquet(new JavaSaveArgs(filePath)) + } + } + + /** + * Save sequences in FASTA format. + * + * @param filePath Path to save files to. + * @param asSingleFile If true, saves output as a single file. + * @param disableFastConcat If asSingleFile is true, disables the use of the + * parallel file merging engine. + * @param lineWidth Hard wrap FASTA formatted sequence at line width, default 60. + */ + def saveAsFasta(filePath: String, + asSingleFile: Boolean = false, + disableFastConcat: Boolean = false, + lineWidth: Int = 60) { + + def toFasta(sequence: Sequence): String = { + val sb = new StringBuilder() + sb.append(">") + sb.append(sequence.getName) + Option(sequence.getDescription).foreach(n => sb.append(" ").append(n)) + sequence.getSequence.grouped(lineWidth).foreach(line => { + sb.append("\n") + sb.append(line) + }) + sb.toString + } + + writeTextRdd(rdd.map(toFasta), + filePath, + asSingleFile = asSingleFile, + disableFastConcat = disableFastConcat) + } + + /** + * @param newRdd The RDD to replace the underlying RDD with. + * @return Returns a new SequenceRDD with the underlying RDD replaced. + */ + protected def replaceRdd(newRdd: RDD[Sequence], + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): SequenceDataset = { + new RDDBoundSequenceDataset(newRdd, sequences, newPartitionMap) + } + + /** + * @param sequence Sequence to extract a region from. + * @return Returns a reference region that covers the entirety of the sequence. + */ + protected def getReferenceRegions(sequence: Sequence): Seq[ReferenceRegion] = { + Seq(ReferenceRegion(sequence).get) + } +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SliceDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SliceDataset.scala new file mode 100644 index 0000000000..81874728e3 --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SliceDataset.scala @@ -0,0 +1,565 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.bdgenomics.adam.rdd.sequence + +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.spark.SparkContext +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.function.{ Function => JFunction } +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{ Dataset, SQLContext } +import org.bdgenomics.adam.instrumentation.Timers._ +import org.bdgenomics.adam.models._ +import org.bdgenomics.adam.rdd.read.{ + AlignmentRecordDataset, + ReadDataset +} +import org.bdgenomics.adam.rdd.{ + AvroGenomicDataset, + DatasetBoundGenomicDataset, + JavaSaveArgs +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.serialization.AvroSerializer +import org.bdgenomics.adam.sql.{ Slice => SliceProduct } +import org.bdgenomics.formats.avro.{ + AlignmentRecord, + Read, + Sequence, + Slice +} +import org.bdgenomics.utils.interval.array.{ + IntervalArray, + IntervalArraySerializer +} +import scala.collection.JavaConversions._ +import scala.math._ +import scala.reflect.ClassTag +import scala.reflect.runtime.universe._ + +private[adam] case class SliceArray( + array: Array[(ReferenceRegion, Slice)], + maxIntervalWidth: Long) extends IntervalArray[ReferenceRegion, Slice] { + + def duplicate(): IntervalArray[ReferenceRegion, Slice] = { + copy() + } + + protected def replace(arr: Array[(ReferenceRegion, Slice)], + maxWidth: Long): IntervalArray[ReferenceRegion, Slice] = { + SliceArray(arr, maxWidth) + } +} + +private[adam] class SliceArraySerializer extends IntervalArraySerializer[ReferenceRegion, Slice, SliceArray] { + + protected val kSerializer = new ReferenceRegionSerializer + protected val tSerializer = new AvroSerializer[Slice] + + protected def builder(arr: Array[(ReferenceRegion, Slice)], + maxIntervalWidth: Long): SliceArray = { + SliceArray(arr, maxIntervalWidth) + } +} + +object SliceDataset { + + /** + * A genomic dataset that wraps a dataset of Slice data. + * + * @param ds A Dataset of slices. + * @param sequences The reference genome these data are aligned to. + */ + def apply(ds: Dataset[SliceProduct], + sequences: SequenceDictionary): SliceDataset = { + new DatasetBoundSliceDataset(ds, sequences) + } + + /** + * Builds a SliceDataset with an empty sequence dictionary. + * + * @param rdd The underlying Slice RDD to build from. + * @return Returns a new SliceDataset. + */ + def apply(rdd: RDD[Slice]): SliceDataset = { + SliceDataset(rdd, SequenceDictionary.empty) + } + + /** + * Builds a SliceDataset given a sequence dictionary. + * + * @param rdd The underlying Slice RDD to build from. + * @param sd The sequence dictionary for this SliceDataset. + * @return Returns a new SliceDataset. 
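+   *
+   * @note A minimal usage sketch (illustrative only; the SparkContext named sc
+   *       and the field values are assumptions):
+   *       {{{
+   *       val slice = Slice.newBuilder()
+   *         .setName("chr1")
+   *         .setStart(0L)
+   *         .setEnd(8L)
+   *         .setLength(8L)
+   *         .setSequence("ACTGACTG")
+   *         .build()
+   *       val slices = SliceDataset(sc.parallelize(Seq(slice)))
+   *       val sequences = slices.merge()
+   *       }}}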
+ */ + def apply(rdd: RDD[Slice], sd: SequenceDictionary): SliceDataset = { + new RDDBoundSliceDataset(rdd, sd, None) + } +} + +case class ParquetUnboundSliceDataset private[rdd] ( + @transient private val sc: SparkContext, + private val parquetFilename: String, + sequences: SequenceDictionary) extends SliceDataset { + + lazy val rdd: RDD[Slice] = { + sc.loadParquet(parquetFilename) + } + + protected lazy val optPartitionMap = sc.extractPartitionMap(parquetFilename) + + lazy val dataset = { + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + sqlContext.read.parquet(parquetFilename).as[SliceProduct] + } + + def replaceSequences(newSequences: SequenceDictionary): SliceDataset = { + copy(sequences = newSequences) + } +} + +case class DatasetBoundSliceDataset private[rdd] ( + dataset: Dataset[SliceProduct], + sequences: SequenceDictionary, + override val isPartitioned: Boolean = true, + override val optPartitionBinSize: Option[Int] = Some(1000000), + override val optLookbackPartitions: Option[Int] = Some(1)) extends SliceDataset + with DatasetBoundGenomicDataset[Slice, SliceProduct, SliceDataset] { + + lazy val rdd = dataset.rdd.map(_.toAvro) + protected lazy val optPartitionMap = None + + override def saveAsParquet(filePath: String, + blockSize: Int = 128 * 1024 * 1024, + pageSize: Int = 1 * 1024 * 1024, + compressCodec: CompressionCodecName = CompressionCodecName.GZIP, + disableDictionaryEncoding: Boolean = false) { + warn("Saving directly as Parquet from SQL. Options other than compression codec are ignored.") + dataset.toDF() + .write + .format("parquet") + .option("spark.sql.parquet.compression.codec", compressCodec.toString.toLowerCase()) + .save(filePath) + saveMetadata(filePath) + } + + override def transformDataset( + tFn: Dataset[SliceProduct] => Dataset[SliceProduct]): SliceDataset = { + copy(dataset = tFn(dataset)) + } + + override def transformDataset( + tFn: JFunction[Dataset[SliceProduct], Dataset[SliceProduct]]): SliceDataset = { + copy(dataset = tFn.call(dataset)) + } + + def replaceSequences(newSequences: SequenceDictionary): SliceDataset = { + copy(sequences = newSequences) + } +} + +case class RDDBoundSliceDataset private[rdd] ( + rdd: RDD[Slice], + sequences: SequenceDictionary, + optPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]]) extends SliceDataset { + + /** + * A SQL Dataset of slices. 
+ */ + lazy val dataset: Dataset[SliceProduct] = { + val sqlContext = SQLContext.getOrCreate(rdd.context) + import sqlContext.implicits._ + sqlContext.createDataset(rdd.map(SliceProduct.fromAvro)) + } + + def replaceSequences(newSequences: SequenceDictionary): SliceDataset = { + copy(sequences = newSequences) + } +} + +sealed abstract class SliceDataset extends AvroGenomicDataset[Slice, SliceProduct, SliceDataset] { + + protected val productFn = SliceProduct.fromAvro(_) + protected val unproductFn = (s: SliceProduct) => s.toAvro + + @transient val uTag: TypeTag[SliceProduct] = typeTag[SliceProduct] + + protected def buildTree(rdd: RDD[(ReferenceRegion, Slice)])( + implicit tTag: ClassTag[Slice]): IntervalArray[ReferenceRegion, Slice] = { + IntervalArray(rdd, SliceArray.apply(_, _)) + } + + def union(datasets: SliceDataset*): SliceDataset = { + val iterableDatasets = datasets.toSeq + SliceDataset(rdd.context.union(rdd, iterableDatasets.map(_.rdd): _*), + iterableDatasets.map(_.sequences).fold(sequences)(_ ++ _)) + } + + override def transformDataset( + tFn: Dataset[SliceProduct] => Dataset[SliceProduct]): SliceDataset = { + DatasetBoundSliceDataset(tFn(dataset), sequences) + } + + override def transformDataset( + tFn: JFunction[Dataset[SliceProduct], Dataset[SliceProduct]]): SliceDataset = { + DatasetBoundSliceDataset(tFn.call(dataset), sequences) + } + + /** + * Merge slices into sequences. + * + * @return Returns a SequenceDataset containing merged slices. + */ + def merge(): SequenceDataset = { + def toSequence(slice: Slice): Sequence = { + Sequence.newBuilder() + .setName(slice.getName) + .setDescription(slice.getDescription) + .setAlphabet(slice.getAlphabet) + .setSequence(slice.getSequence) + .setLength(slice.getLength) + .setAttributes(slice.getAttributes) + .build + } + + def mergeSequences(first: Sequence, second: Sequence): Sequence = { + Sequence.newBuilder(first) + .setLength(first.getLength + second.getLength) + .setSequence(first.getSequence + second.getSequence) + .setAttributes(first.getAttributes ++ second.getAttributes) + .build + } + + val merged: RDD[Sequence] = rdd + .sortBy(slice => (slice.getName, slice.getStart)) + .map(slice => (slice.getName, toSequence(slice))) + .reduceByKey(mergeSequences) + .values + + SequenceDataset(merged) + } + + /** + * Convert this genomic dataset of slices into reads. + * + * @return Returns a new ReadDataset converted from this genomic dataset of slices. + */ + def toReads: ReadDataset = { + def toRead(slice: Slice): Read = { + Read.newBuilder() + .setName(slice.getName) + .setDescription(slice.getDescription) + .setAlphabet(slice.getAlphabet) + .setSequence(slice.getSequence) + .setLength(slice.getLength) + .setQualityScores("B" * (if (slice.getLength == null) 0 else slice.getLength.toInt)) + .setAttributes(slice.getAttributes) + .build() + } + ReadDataset(rdd.map(toRead), sequences) + } + + /** + * Convert this genomic dataset of slices into sequences. + * + * @return Returns a new SequenceDataset converted from this genomic dataset of slices. + */ + def toSequences: SequenceDataset = { + def toSequence(slice: Slice): Sequence = { + Sequence.newBuilder() + .setName(slice.getName) + .setDescription(slice.getDescription) + .setAlphabet(slice.getAlphabet) + .setSequence(slice.getSequence) + .setLength(slice.getLength) + .setAttributes(slice.getAttributes) + .build() + } + SequenceDataset(rdd.map(toSequence), sequences) + } + + /** + * Convert this genomic dataset of slices into alignments. 
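+   * Only the reference name, start, and end of each slice are copied onto the
+   * resulting alignments; the remaining alignment fields keep their defaults.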
+ * + * @return Returns a new AlignmentRecordDataset converted from this genomic dataset of slices. + */ + def toAlignments: AlignmentRecordDataset = { + def toAlignments(slice: Slice): AlignmentRecord = { + AlignmentRecord.newBuilder() + .setReferenceName(slice.getName) + .setStart(slice.getStart) + .setEnd(slice.getEnd) + .build() + } + AlignmentRecordDataset(rdd.map(toAlignments), sequences, ReadGroupDictionary.empty, Seq.empty) + } + + /** + * Replace the sequence dictionary for this SliceDataset with one + * created from the slices in this SliceDataset. + * + * @return Returns a new SliceDataset with the sequence dictionary replaced. + */ + def createSequenceDictionary(): SliceDataset = CreateSequenceDictionary.time { + val sd = new SequenceDictionary(rdd.flatMap(slice => { + if (slice.getName != null) { + Some(SequenceRecord.fromSlice(slice)) + } else { + None + } + }).distinct + .collect + .toVector) + + replaceSequences(sd) + } + + /** + * Save slices as Parquet or FASTA. + * + * If filename ends in .fa or .fasta, saves as FASTA. If not, saves slices + * to Parquet. Defaults to 60 character line length, if saving to FASTA. + * + * @param filePath Path to save files to. + * @param asSingleFile If true, saves output as a single file. + * @param disableFastConcat If asSingleFile is true, disables the use of the + * parallel file merging engine. + */ + def save( + filePath: java.lang.String, + asSingleFile: java.lang.Boolean, + disableFastConcat: java.lang.Boolean) { + if (filePath.endsWith(".fa") || filePath.endsWith(".fasta")) { + saveAsFasta(filePath, asSingleFile = asSingleFile, disableFastConcat = disableFastConcat) + } else { + if (asSingleFile) { + warn("asSingleFile = true ignored when saving as Parquet.") + } + saveAsParquet(new JavaSaveArgs(filePath)) + } + } + + /** + * Save slices in FASTA format. + * + * The coordinate fields for this slice are appended to the description field + * for the FASTA description line: + *
+   * <pre>
+   * >name description start-end:strand
+   * </pre>
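+   *
+   * For example, a hedged sketch (the slices value and the output path are
+   * assumptions, not fixed by this API):
+   * {{{
+   * slices.saveAsFasta("contigs.fasta", asSingleFile = true, lineWidth = 80)
+   * }}}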
+   *
+   * @param filePath Path to save files to.
+   * @param asSingleFile If true, saves output as a single file.
+   * @param disableFastConcat If asSingleFile is true, disables the use of the
+   *   parallel file merging engine.
+   * @param lineWidth Hard wrap FASTA formatted slice at line width, default 60.
+   */
+  def saveAsFasta(filePath: String,
+                  asSingleFile: Boolean = false,
+                  disableFastConcat: Boolean = false,
+                  lineWidth: Int = 60) {
+
+    def toFasta(slice: Slice): String = {
+      val sb = new StringBuilder()
+      sb.append(">")
+      sb.append(slice.getName)
+      Option(slice.getDescription).foreach(n => sb.append(" ").append(n))
+      sb.append(s" ${slice.getStart}-${slice.getEnd}:${slice.getStrand}")
+      slice.getSequence.grouped(lineWidth).foreach(line => {
+        sb.append("\n")
+        sb.append(line)
+      })
+      sb.toString
+    }
+
+    writeTextRdd(rdd.map(toFasta),
+      filePath,
+      asSingleFile = asSingleFile,
+      disableFastConcat = disableFastConcat)
+  }
+
+  /**
+   * Extract the specified region from this genomic dataset of slices as a string, merging
+   * slices if necessary.
+   *
+   * @param region Region to extract.
+   * @return Returns the specified region from this genomic dataset of slices as a string, merging
+   *   slices if necessary.
+   */
+  def extract(region: ReferenceRegion): String = {
+    def getString(slice: (ReferenceRegion, Slice)): (ReferenceRegion, String) = {
+      val trimStart = max(0, region.start - slice._1.start).toInt
+      val trimEnd = max(0, slice._1.end - region.end).toInt
+
+      val fragmentSequence: String = slice._2.getSequence
+
+      val str = fragmentSequence.drop(trimStart)
+        .dropRight(trimEnd)
+      val reg = new ReferenceRegion(
+        slice._1.referenceName,
+        slice._1.start + trimStart,
+        slice._1.end - trimEnd
+      )
+      (reg, str)
+    }
+
+    def reducePairs(
+      kv1: (ReferenceRegion, String),
+      kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = {
+      assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " +
+        kv1 + ", " + kv2)
+
+      (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) {
+        kv1._2 + kv2._2
+      } else {
+        kv2._2 + kv1._2
+      })
+    }
+
+    try {
+      val refPairRDD: RDD[(ReferenceRegion, String)] = rdd.keyBy(ReferenceRegion(_))
+        .filter(kv => kv._1.isDefined)
+        .map(kv => (kv._1.get, kv._2))
+        .filter(kv => kv._1.overlaps(region))
+        .sortByKey()
+        .map(kv => getString(kv))
+
+      val pair: (ReferenceRegion, String) = refPairRDD.collect.reduceLeft(reducePairs)
+      assert(
+        pair._1.compareTo(region) == 0,
+        "Merging slices returned a different region than requested."
+      )
+
+      pair._2
+    } catch {
+      case (uoe: UnsupportedOperationException) =>
+        throw new UnsupportedOperationException("Could not find " + region + " in reference RDD.")
+    }
+  }
+
+  /**
+   * Extract the specified regions from this genomic dataset of slices as an RDD of (ReferenceRegion,
+   * String) tuples, merging slices if necessary.
+   *
+   * @param regions Zero or more regions to extract.
+   * @return Returns the specified regions from this genomic dataset of slices as an RDD of (ReferenceRegion,
+   *   String) tuples, merging slices if necessary.
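+   * @note A hypothetical call sketch, assuming a SliceDataset named slices
+   *       with data overlapping chr1:
+   *       {{{
+   *       slices.extractRegions(Seq(ReferenceRegion("chr1", 0L, 20L)))
+   *       }}}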
+ */ + def extractRegions(regions: Iterable[ReferenceRegion]): RDD[(ReferenceRegion, String)] = { + def extractSequence(sliceRegion: ReferenceRegion, slice: Slice, region: ReferenceRegion): (ReferenceRegion, String) = { + val merged = sliceRegion.intersection(region) + val start = (merged.start - sliceRegion.start).toInt + val end = (merged.end - sliceRegion.start).toInt + val fragmentSequence: String = slice.getSequence + (merged, fragmentSequence.substring(start, end)) + } + + def reduceRegionSequences( + kv1: (ReferenceRegion, String), + kv2: (ReferenceRegion, String)): (ReferenceRegion, String) = { + (kv1._1.merge(kv2._1), if (kv1._1.compareTo(kv2._1) <= 0) { + kv1._2 + kv2._2 + } else { + kv2._2 + kv1._2 + }) + } + + val places = flattenRddByRegions() + .flatMap { + case (sliceRegion, slice) => + regions.collect { + case region if sliceRegion.overlaps(region) => + (region, extractSequence(sliceRegion, slice, region)) + } + }.sortByKey() + + places.reduceByKey(reduceRegionSequences).values + } + + /** + * (Java-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent + * slices now overlap by _n_ bases, where _n_ is the flank length. + * + * @param flankLength The length to extend adjacent slices by. + * @return Returns this genomic dataset, with all adjacent slices extended with flanking sequence. + */ + def flankAdjacent(flankLength: java.lang.Integer): SliceDataset = { + val flank: Int = flankLength + flankAdjacent(flank) + } + + /** + * (Scala-friendly) For all adjacent slices in this genomic dataset, we extend the slices so that the adjacent + * slices now overlap by _n_ bases, where _n_ is the flank length. + * + * @param flankLength The length to extend adjacent slices by. + * @return Returns this genomic dataset, with all adjacent slices extended with flanking sequence. + */ + def flankAdjacent(flankLength: Int): SliceDataset = { + replaceRdd(FlankSlices(rdd, + sequences, + flankLength)) + } + + /** + * (Scala-friendly) Counts the k-mers contained in this genomic dataset of slices. + * + * @param kmerLength The length of k-mers to count. + * @return Returns an RDD containing k-mer/count pairs. + */ + def countKmers(kmerLength: Int): RDD[(String, Long)] = { + flankAdjacent(kmerLength).rdd.flatMap(r => { + // cut each read into k-mers, and attach a count of 1L + r.getSequence + .sliding(kmerLength) + .map(k => (k, 1L)) + }).reduceByKey((k1: Long, k2: Long) => k1 + k2) + } + + /** + * (Java-friendly) Counts the k-mers contained in this genomic dataset of slices. + * + * @param kmerLength The length of k-mers to count. + * @return Returns an RDD containing k-mer/count pairs. + */ + def countKmers( + kmerLength: java.lang.Integer): JavaRDD[(String, java.lang.Long)] = { + val k: Int = kmerLength + countKmers(k).map(p => { + (p._1, p._2: java.lang.Long) + }).toJavaRDD() + } + + /** + * @param newRdd The RDD to replace the underlying RDD with. + * @return Returns a new SliceDataset with the underlying RDD replaced. + */ + protected def replaceRdd(newRdd: RDD[Slice], + newPartitionMap: Option[Array[Option[(ReferenceRegion, ReferenceRegion)]]] = None): SliceDataset = { + new RDDBoundSliceDataset(newRdd, sequences, newPartitionMap) + } + + /** + * @param slice Slice to extract a region from. + * @return Returns a reference region that covers the entirety of the slice. 
+ */ + protected def getReferenceRegions(slice: Slice): Seq[ReferenceRegion] = { + Seq(ReferenceRegion(slice.getName, slice.getStart, slice.getEnd, slice.getStrand)) + } +} diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala index abc3163557..872f59b69c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala @@ -167,7 +167,8 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging { kryo.register(classOf[org.bdgenomics.adam.algorithms.consensus.Consensus]) // org.bdgenomics.adam.converters - kryo.register(classOf[org.bdgenomics.adam.converters.FastaConverter.FastaDescriptionLine]) + kryo.register(classOf[org.bdgenomics.adam.converters.FastaSequenceConverter.FastaDescriptionLine]) + kryo.register(classOf[org.bdgenomics.adam.converters.FastaSliceConverter.FastaDescriptionLine]) kryo.register(classOf[org.bdgenomics.adam.converters.FragmentCollector]) // org.bdgenomics.adam.models @@ -203,8 +204,12 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging { new org.bdgenomics.adam.rdd.fragment.FragmentArraySerializer) kryo.register(classOf[org.bdgenomics.adam.rdd.variant.GenotypeArray], new org.bdgenomics.adam.rdd.variant.GenotypeArraySerializer) - kryo.register(classOf[org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentArray], - new org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentArraySerializer) + kryo.register(classOf[org.bdgenomics.adam.rdd.read.ReadArray], + new org.bdgenomics.adam.rdd.read.ReadArraySerializer) + kryo.register(classOf[org.bdgenomics.adam.rdd.sequence.SequenceArray], + new org.bdgenomics.adam.rdd.sequence.SequenceArraySerializer) + kryo.register(classOf[org.bdgenomics.adam.rdd.sequence.SliceArray], + new org.bdgenomics.adam.rdd.sequence.SliceArraySerializer) kryo.register(classOf[org.bdgenomics.adam.rdd.variant.VariantArray], new org.bdgenomics.adam.rdd.variant.VariantArraySerializer) kryo.register(classOf[org.bdgenomics.adam.rdd.variant.VariantContextArray], @@ -256,8 +261,6 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging { new AvroSerializer[org.bdgenomics.formats.avro.Genotype]) kryo.register(classOf[org.bdgenomics.formats.avro.GenotypeAllele]) kryo.register(classOf[org.bdgenomics.formats.avro.GenotypeType]) - kryo.register(classOf[org.bdgenomics.formats.avro.NucleotideContigFragment], - new AvroSerializer[org.bdgenomics.formats.avro.NucleotideContigFragment]) kryo.register(classOf[org.bdgenomics.formats.avro.OntologyTerm], new AvroSerializer[org.bdgenomics.formats.avro.OntologyTerm]) kryo.register(classOf[org.bdgenomics.formats.avro.ProcessingStep], @@ -329,7 +332,6 @@ class ADAMKryoRegistrator extends KryoRegistrator with Logging { kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Genotype]]) kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.GenotypeAllele]]) kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.OntologyTerm]]) - kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.NucleotideContigFragment]]) kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Read]]) kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.ReadGroup]]) kryo.register(classOf[scala.Array[org.bdgenomics.formats.avro.Reference]]) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala 
b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala index f29905f65a..b424702c20 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/ReferenceContigMap.scala @@ -26,15 +26,15 @@ import org.bdgenomics.adam.models.{ SequenceRecord } import org.bdgenomics.adam.serialization.AvroSerializer -import org.bdgenomics.formats.avro.NucleotideContigFragment +import org.bdgenomics.formats.avro.Slice /** * A broadcastable ReferenceFile backed by a map containing contig name -> - * Seq[NucleotideContigFragment] pairs. + * Seq[Slice] pairs. * - * @param contigMap a map containing a Seq of contig fragments per contig. + * @param contigMap a map containing a Seq of slices per contig. */ -case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragment]]) extends ReferenceFile { +case class ReferenceContigMap(contigMap: Map[String, Seq[Slice]]) extends ReferenceFile { private def keys(): String = { contigMap.keys.toList.sortBy(x => x).mkString(", ") @@ -64,7 +64,7 @@ case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragmen "Contig %s not found in reference map with keys: %s".format(region.referenceName, keys()) ) ) - .dropWhile(f => f.getStart + f.getSequence.length < region.start) + .dropWhile(s => s.getStart + s.getSequence.length < region.start) .takeWhile(_.getStart < region.end) .map( clipFragment(_, region.start, region.end) @@ -72,41 +72,39 @@ case class ReferenceContigMap(contigMap: Map[String, Seq[NucleotideContigFragmen .mkString("") } - private def clipFragment(fragment: NucleotideContigFragment, start: Long, end: Long): String = { + private def clipFragment(slice: Slice, start: Long, end: Long): String = { val min = math.max( 0L, - start - fragment.getStart + start - slice.getStart ).toInt val max = math.min( - fragment.getSequence.length, - end - fragment.getStart + slice.getSequence.length, + end - slice.getStart ).toInt - fragment.getSequence.substring(min, max) + slice.getSequence.substring(min, max) } } /** - * Companion object for creating a ReferenceContigMap from an RDD of contig - * fragments. + * Companion object for creating a ReferenceContigMap from an RDD of slices. */ object ReferenceContigMap { /** - * Builds a ReferenceContigMap from an RDD of fragments. + * Builds a ReferenceContigMap from an RDD of slices. * - * @param fragments RDD of nucleotide contig fragments describing a genome - * reference. - * @return Returns a serializable wrapper around these fragments that enables + * @param slices RDD of slices describing a genome reference. + * @return Returns a serializable wrapper around these slices that enables * random access into the reference genome. 
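+ *
+ * A usage sketch, not part of this patch: it assumes slices loaded with loadFastaDna
+ * (hypothetical "reference.fa" path) and the extract(region) contract of ReferenceFile.
+ * {{{
+ * val slices = sc.loadFastaDna("reference.fa", 10000L)
+ * val referenceFile = ReferenceContigMap(slices.rdd)
+ * val dna = referenceFile.extract(new ReferenceRegion("chr1", 0L, 100L))
+ * }}}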
*/ - def apply(fragments: RDD[NucleotideContigFragment]): ReferenceContigMap = { + def apply(slices: RDD[Slice]): ReferenceContigMap = { ReferenceContigMap( - fragments - .groupBy(_.getContigName) + slices + .groupBy(_.getName) .mapValues(_.toSeq.sortBy(_.getStart)) .collectAsMap .toMap @@ -115,30 +113,30 @@ object ReferenceContigMap { } class ReferenceContigMapSerializer extends Serializer[ReferenceContigMap] { - private val ncfSerializer = new AvroSerializer[NucleotideContigFragment] + private val sliceSerializer = new AvroSerializer[Slice] def write(kryo: Kryo, out: Output, record: ReferenceContigMap) = { out.writeInt(record.contigMap.size) record.contigMap.foreach(p => { out.writeString(p._1) out.writeInt(p._2.size) - p._2.foreach(ncf => { - ncfSerializer.write(kryo, out, ncf) + p._2.foreach(slice => { + sliceSerializer.write(kryo, out, slice) }) }) } def read(kryo: Kryo, in: Input, clazz: Class[ReferenceContigMap]): ReferenceContigMap = { val n = in.readInt() - val array = new Array[(String, Seq[NucleotideContigFragment])](n) + val array = new Array[(String, Seq[Slice])](n) (0 until n).foreach(idx => { val key = in.readString() - val numNcfs = in.readInt() - val ncfArray = new Array[NucleotideContigFragment](numNcfs) - (0 until numNcfs).foreach(jdx => { - ncfArray(jdx) = ncfSerializer.read(kryo, in, classOf[NucleotideContigFragment]) + val numSlices = in.readInt() + val sliceArray = new Array[Slice](numSlices) + (0 until numSlices).foreach(jdx => { + sliceArray(jdx) = sliceSerializer.read(kryo, in, classOf[Slice]) }) - array(idx) = (key, ncfArray.toSeq) + array(idx) = (key, sliceArray.toSeq) }) ReferenceContigMap(array.toMap) } diff --git a/adam-core/src/test/resources/trinity.fa b/adam-core/src/test/resources/trinity.fa new file mode 100644 index 0000000000..366819d4a4 --- /dev/null +++ b/adam-core/src/test/resources/trinity.fa @@ -0,0 +1,43 @@ +>000872-000883_All_comp1777_c1_seq1 len=375 ~FPKM=14.3 path=[0:0-239 240:240-374] +CACTGCACCACCAGGGAAGCCCCAGGTGAATTTCTTACTTCCTTAAGTGCAGGACCTTGT +TTCAGACCTCCCTGCCTTCCTATATGCTGCCTTCTGCCTGGAAACCCCTCCCTCCTTTCT +CCTCTGCACCAACTCCTATCCACCGTTTGAAACTTGCTTCATGTCTCCTTTTATAGGAGG +ACTTCTCTGATTCCCCAATTTGTTTTTTTTCCACTGATCTGTTTTGTTATTTTAATTGAA +TGAATGATTCTTTAAATTCTATAGTGCTTTACAATTTTCAAAAGTTTCACACACATGATC +TCATATAATCCCATAGCAACCCTTTTTCTTCCTCCTACCCCTATATTGCCCCTCTCCCCA +CTGGTAGCCACTAGG +>000872-000883_All_comp1777_c1_seq2 len=344 ~FPKM=5.9 path=[1135:0-208 240:209-343] +TGTCTTACAAACATATGCCGGTGCCTGGAAAAAAAGAATTTCAAAAGTAAAAAATTAAGG +TCATTCCCATCTCAAACATACCTAAAATACATAATGATAAGTAACTTAGCACAGGGAATA +AAGTTGTTACTCAATAAATATTTACTTAATTAGACATGGCAGGAATACAGATATTTGTCC +TGAAGACTTTTTGTAGTTTATTTTTTTATTGAATGATTCTTTAAATTCTATAGTGCTTTA +CAATTTTCAAAAGTTTCACACACATGATCTCATATAATCCCATAGCAACCCTTTTTCTTC +CTCCTACCCCTATATTGCCCCTCTCCCCACTGGTAGCCACTAGG +>000872-000883_All_comp1777_c1_seq3 len=265 ~FPKM=14.1 path=[240:0-134 375:135-143 2488:144-150 2495:151-158 2503:159-167 408:168-264] +TGAATGATTCTTTAAATTCTATAGTGCTTTACAATTTTCAAAAGTTTCACACACATGATC +TCATATAATCCCATAGCAACCCTTTTTCTTCCTCCTACCCCTATATTGCCCCTCTCCCCA +CTGGTAGCCACTAGGTTGTTCTCTATATCTGTGAGTCTGCTTCTTTTTGTTTTATTCACC +AGTTTGTTGTAATTTTTACATTCCACATACAAGTGATAACATATAGTATGTCTTTGACAT +TTCACTTAGTATAATGCCCTCCAAT +>000872-000883_All_comp1777_c3_seq1 len=221 ~FPKM=1.1 path=[1966:0-220] +TATACGCTGATAGGTCACAACTTGCTTTTTAAAAAAAAGTTTTTTTAATCATTGAACAAA +GCTATCTTTAAATTTTATAGTACTTTACAGTTTTCACAAGTTTCACACACGATTTCATAT +AATCCTATAATAACCCTTTTTTATCCCCTACTCCTATATTGCCCTTCCCCCTTCCCTGTC +CCCAATGATAACTACTAGTTCTATCTGTGAGTCTGCTTTTA +>000872-000883_All_comp1852_c0_seq1 len=902 
~FPKM=5.6 path=[0:0-901] +ACTGCGCCACCAGGGAAGCCCCAGCATACCTATTTTTGGTACTGATAAGCTGCTGCCATC +CCAACCAGTGATGTAGCCAATCTTCTCAAATTTTTGGCCCATAGGAGTGAATAGATTCTT +GATCCTTTGCTTCCCACAATGCATAGAAGGGACAGCCAAATTAGTCCCTCACTGCTGGAC +AACGCTGATGGAACTAAAAATGGGTTTGTAGCCCAAGAAAATGAACCCCAGGGCTGTCCA +GAGTACTTTCCTGATACCTGGTGAGAGATTAAGATGCCAGTCACGTATATGCCCCGCAGG +TTCAGTTCTGGGAAGGCCTTCAACATACCATTCTGTGATTTTCCCGAATAAAATGACAGA +CTTCGGTTATGCCTAAGGCTGATTCCTAGATTTTTTTTTTTTCCTGTGTTCACAGCATCT +GGTACATTTTATTTTGTCTAACCAAAATAAAATGTTTATTAGCTTCATTTATGAAAACAA +ACTCACTTAACATTTTAATTGGAAACAGTTCATATGGTCTGTTCAGTGGAAGCTCAGATA +GGCAAAGTGCAATAAGCAGAATGAAATATCACACATATGTCTCTGCATAAATCAACATGA +AAAAGTATTTAAGATGTTTCAAATGGAAAAAGCAAGCTACAAAATAATAAGTACAGTGAG +AGTTCTCTGTTAAACATTTTTTCCTAAACTATTTATGTAGTATTATTATTTTTGCAAAAG +TTATTGCTTTTGTAATACAAAACAGCAAAATTAACAAATACAACAAATACAATGTTTAAT +GAGAAACCTGTGGTATACAGTGTAATTGTCTCTGGACAAAAGTCTTCTTTACAAGTCAAG +GAACAAAACTATTTCTTTTAATTAAATGTAGGTTTTAGAAAAACATTCATTACACATTAC +TA diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala deleted file mode 100644 index aba853accf..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FastaConverterSuite.scala +++ /dev/null @@ -1,221 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.bdgenomics.adam.converters - -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.util.ADAMFunSuite -import java.io.File - -class FastaConverterSuite extends ADAMFunSuite { - - val converter = new FastaConverter(1000) - - sparkTest("find contig index") { - val headerLines = sc.parallelize(Seq( - (0L, ">1 dna:chromosome chromosome:GRCh37:1:1:249250621:1"), - (252366306L, ">2 dna:chromosome chromosome:GRCh37:2:1:243199373:1"), - (699103487L, ">4 dna:chromosome chromosome:GRCh37:4:1:191154276:1"), - (892647244L, ">5 dna:chromosome chromosome:GRCh37:5:1:180915260:1"), - (498605724L, ">3 dna:chromosome chromosome:GRCh37:3:1:198022430:1"))) - val descLines = FastaConverter.getDescriptionLines(headerLines) - val headerIndices: List[Long] = descLines.keys.toList - - assert(0 === FastaConverter.findReferenceIndex(252366300L, headerIndices)) - assert(892647244L === FastaConverter.findReferenceIndex(892647249L, headerIndices)) - assert(252366306L === FastaConverter.findReferenceIndex(498605720L, headerIndices)) - } - - test("convert a single record without naming information") { - val contig = converter.convert(None, 0, Seq("AAATTTGCGC"), None) - - assert(contig.head.getSequence.map(_.toString).reduce(_ + _) === "AAATTTGCGC") - assert(contig.head.getContigLength === 10) - assert(contig.head.getContigName === null) - assert(contig.head.getDescription === null) - } - - test("convert a single record with naming information") { - val contig = converter.convert(Some("chr2"), 1, Seq("NNNN"), Some("hg19")) - - assert(contig.head.getSequence.map(_.toString).reduce(_ + _) === "NNNN") - assert(contig.head.getContigLength === 4) - assert(contig.head.getContigName === "chr2") - assert(contig.head.getDescription === "hg19") - } - - sparkTest("convert single fasta sequence") { - val fasta = List((0L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"), - (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")) - val rdd = sc.parallelize(fasta.toSeq) - - val adamFasta = FastaConverter(rdd) - assert(adamFasta.count === 1) - - val fastaElement = adamFasta.first() - val fastaSequence = fasta.map(_._2).reduce(_ + _) - val convertedSequence = fastaElement.getSequence.map(_.toString).reduce(_ + _) - - assert(convertedSequence === fastaSequence) - assert(fastaElement.getContigLength() == 
fastaSequence.length) - assert(fastaElement.getContigName === null) - assert(fastaElement.getDescription === null) - } - - sparkTest("convert fasta with multiple sequences") { - val fasta1 = List((0L, ">chr1"), - (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"), - (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (16L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")) - val fasta2 = List((17L, ">chr2"), - (18L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTCCCCCCCCCCTTTTTTTTTTCCCCCC"), - (19L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (20L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (21L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (22L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (23L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (24L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (25L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (26L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (27L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (28L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (29L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (30L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (31L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (32L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (33L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")) - val fasta = fasta1 ::: fasta2 - val rdd = sc.parallelize(fasta.toSeq) - - val adamFasta = FastaConverter(rdd) - assert(adamFasta.count === 2) - - val fastaElement1 = adamFasta.filter(_.getContigName == "chr1").first() - val fastaSequence1 = fasta1.drop(1).map(_._2).reduce(_ + _) - val convertedSequence1 = fastaElement1.getSequence.map(_.toString).reduce(_ + _) - - assert(convertedSequence1 === fastaSequence1) - assert(fastaElement1.getContigLength() == fastaSequence1.length) - assert(fastaElement1.getContigName().toString === "chr1") - assert(fastaElement1.getDescription === null) - - val fastaElement2 = adamFasta.filter(_.getContigName == 
"chr2").first() - val fastaSequence2 = fasta2.drop(1).map(_._2).reduce(_ + _) - val convertedSequence2 = fastaElement2.getSequence.map(_.toString).reduce(_ + _) - - assert(convertedSequence2 === fastaSequence2) - assert(fastaElement2.getContigLength() == fastaSequence2.length) - assert(fastaElement2.getContigName().toString === "chr2") - assert(fastaElement2.getDescription === null) - } - - sparkTest("convert fasta with multiple sequences; short fragment") { - val fasta1 = List((0L, ">chr1"), - (1L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGGGGGGAAAAAAAAAAGGGGGGGGGGAAAAAA"), - (2L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (3L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (4L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (5L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (6L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (7L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (8L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (9L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (10L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (11L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (12L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (13L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (14L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (15L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), - (16L, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")) - val fasta2 = List((17L, ">chr2"), - (18L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTCCCCCCCCCCTTTTTTTTTTCCCCCC"), - (19L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (20L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (21L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (22L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (23L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (24L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (25L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (26L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (27L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (28L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (29L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (30L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (31L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (32L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"), - (33L, "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")) - val fasta = fasta1 ::: fasta2 - val rdd = sc.parallelize(fasta.toSeq) - - val adamFasta = FastaConverter(rdd, maximumLength = 35) - assert(adamFasta.count === 64) - - val fastaElement1 = adamFasta.filter(_.getContigName == "chr1").collect() - val fastaSequence1 = fasta1.drop(1).map(_._2).mkString - val seqs = fastaElement1.sortBy(_.getIndex) - val convertedSequence1 = 
fastaElement1.sortBy(_.getIndex).map(_.getSequence.toString).mkString - assert(seqs != null) - assert(convertedSequence1 === fastaSequence1) - - val fastaElement2 = adamFasta.filter(_.getContigName == "chr2").collect() - val fastaSequence2 = fasta2.drop(1).map(_._2).mkString - val convertedSequence2 = fastaElement2.sortBy(_.getIndex).map(_.getSequence.toString).mkString - - assert(convertedSequence2 === fastaSequence2) - } - - val chr1File = testFile("human_g1k_v37_chr1_59kb.fasta") - - sparkTest("convert reference fasta file") { - //Loading "human_g1k_v37_chr1_59kb.fasta" - val referenceSequences = sc.loadContigFragments(chr1File, maximumLength = 10).rdd.collect() - assert(referenceSequences.forall(_.getContigName.toString == "1")) - assert(referenceSequences.slice(0, referenceSequences.length - 2).forall(_.getSequence.length == 10)) - - val reassembledSequence = referenceSequences.sortBy(_.getIndex).map(_.getSequence).mkString - val originalSequence = scala.io.Source.fromFile(new File(chr1File)).getLines().filter(!_.startsWith(">")).mkString - - assert(reassembledSequence === originalSequence) - } -} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala index 7e85087dda..c3afc8bba2 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/converters/FragmentConverterSuite.scala @@ -24,8 +24,8 @@ import org.bdgenomics.formats.avro._ class FragmentConverterSuite extends ADAMFunSuite { test("build a fragment collector and convert to a read") { - val fcOpt = FragmentCollector(NucleotideContigFragment.newBuilder() - .setContigName("ctg") + val fcOpt = FragmentCollector(Slice.newBuilder() + .setName("ctg") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) @@ -50,18 +50,18 @@ class FragmentConverterSuite extends ADAMFunSuite { } test("if a fragment isn't associated with a contig, don't get a fragment collector") { - val fcOpt = FragmentCollector(NucleotideContigFragment.newBuilder().build()) + val fcOpt = FragmentCollector(Slice.newBuilder().build()) assert(fcOpt.isEmpty) } sparkTest("convert an rdd of discontinuous fragments, all from the same contig") { - val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder() - .setContigName("ctg") + val rdd = sc.parallelize(Seq(Slice.newBuilder() + .setName("ctg") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg") + .build(), Slice.newBuilder() + .setName("ctg") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) @@ -85,18 +85,18 @@ class FragmentConverterSuite extends ADAMFunSuite { } sparkTest("convert an rdd of contiguous fragments, all from the same contig") { - val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder() - .setContigName("ctg") + val rdd = sc.parallelize(Seq(Slice.newBuilder() + .setName("ctg") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg") + .build(), Slice.newBuilder() + .setName("ctg") .setSequence("TGTGTG") .setStart(8L) .setEnd(14L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg") + .build(), Slice.newBuilder() + .setName("ctg") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) @@ -114,33 +114,33 @@ class FragmentConverterSuite extends ADAMFunSuite { } sparkTest("convert an rdd of varied fragments from 
multiple contigs") { - val rdd = sc.parallelize(Seq(NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + val rdd = sc.parallelize(Seq(Slice.newBuilder() + .setName("ctg1") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + .build(), Slice.newBuilder() + .setName("ctg1") .setSequence("TGTGTG") .setStart(8L) .setEnd(14L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg1") + .build(), Slice.newBuilder() + .setName("ctg1") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg2") + .build(), Slice.newBuilder() + .setName("ctg2") .setSequence("ACACACAC") .setStart(0L) .setEnd(8L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg2") + .build(), Slice.newBuilder() + .setName("ctg2") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) - .build(), NucleotideContigFragment.newBuilder() - .setContigName("ctg3") + .build(), Slice.newBuilder() + .setName("ctg3") .setSequence("AATTCCGGCCTTAA") .setStart(14L) .setEnd(28L) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala index 160b153de7..c3da738862 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/ADAMContextSuite.scala @@ -403,32 +403,36 @@ class ADAMContextSuite extends ADAMFunSuite { sparkTest("read a HLA fasta from GRCh38") { val inputPath = testFile("HLA_DQB1_05_01_01_02.fa") - val gDataset = sc.loadFasta(inputPath, 10000L) - assert(gDataset.sequences.records.size === 1) - assert(gDataset.sequences.records.head.name === "HLA-DQB1*05:01:01:02") - val fragments = gDataset.rdd.collect - assert(fragments.size === 1) - assert(fragments.head.getContigName === "HLA-DQB1*05:01:01:02") + val gRdd = sc.loadFastaDna(inputPath) + + // see https://github.com/bigdatagenomics/adam/issues/1894 + val withSequenceDictionary = gRdd.createSequenceDictionary() + assert(withSequenceDictionary.sequences.records.size === 1) + assert(withSequenceDictionary.sequences.records.head.name === "HLA-DQB1*05:01:01:02") + + val sequences = gRdd.rdd.collect + assert(sequences.size === 1) + assert(sequences.head.getName === "HLA-DQB1*05:01:01:02") } sparkTest("read a gzipped fasta file") { val inputPath = testFile("chr20.250k.fa.gz") - val contigFragments = sc.loadFasta(inputPath, 10000L) - .transform((rdd: RDD[NucleotideContigFragment]) => { - rdd.sortBy(_.getIndex.toInt) - }) - assert(contigFragments.rdd.count() === 26) - val first: NucleotideContigFragment = contigFragments.rdd.first() - assert(first.getContigName === null) + val slices = sc.loadFastaDna(inputPath, 10000L) + .rdd + .sortBy(_.getIndex.toInt) + + assert(slices.count() === 26) + val first = slices.first() + assert(first.getName === null) assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") assert(first.getIndex === 0) assert(first.getSequence.length === 10000) assert(first.getStart === 0L) assert(first.getEnd === 10000L) - assert(first.getFragments === 26) + assert(first.getSlices === 26) // 250k file actually has 251930 bases - val last: NucleotideContigFragment = contigFragments.rdd.collect().last + val last = slices.collect().last assert(last.getIndex === 25) assert(last.getStart === 250000L) assert(last.getEnd === 251930L) @@ -992,4 
+996,60 @@ class ADAMContextSuite extends ADAMFunSuite { assert(reloaded.headerLines.toSet == variants.headerLines.toSet) assert(reloaded.rdd.collect().deep == variants.rdd.collect().deep) } + + sparkTest("read a fasta file with short sequences as sequences") { + val inputPath = testFile("trinity.fa") + val sequences = sc.loadFastaDna(inputPath) + assert(sequences.rdd.count === 5) + } + + sparkTest("read a fasta file with long sequences as sequences") { + val inputPath = testFile("chr20.250k.fa.gz") + val sequences = sc.loadFastaDna(inputPath) + assert(sequences.rdd.count === 1) + val sequence = sequences.rdd.first() + assert(sequence.getName === null) + assert(sequence.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") + assert(sequence.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA) + assert(sequence.getSequence.length === 251930) + assert(sequence.getLength === 251930L) + } + + sparkTest("read a fasta file with short sequences as slices") { + val inputPath = testFile("trinity.fa") + val slices = sc.loadFastaDna(inputPath, 10000L) + assert(slices.rdd.count === 5) + } + + sparkTest("read a fasta file with long sequences as slices") { + val inputPath = testFile("chr20.250k.fa.gz") + val slices = sc.loadFastaDna(inputPath, 10000L) + slices.transform(rdd => rdd.sortBy(_.getIndex.toInt)) + assert(slices.rdd.count() === 26) + + val first = slices.rdd.first() + assert(first.getName === null) + assert(first.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") + assert(first.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA) + assert(first.getSequence.length === 10000) + assert(first.getLength === 10000L) + assert(first.getStart === 0L) + assert(first.getEnd === 10000L) + assert(first.getIndex === 0) + assert(first.getSlices === 26) + assert(first.getTotalLength === 251930L) + + // 250k file actually has 251930 bases + val last = slices.rdd.collect().last + assert(last.getName === null) + assert(last.getDescription === "gi|224384749|gb|CM000682.1| Homo sapiens chromosome 20, GRCh37 primary reference assembly") + assert(last.getAlphabet === org.bdgenomics.formats.avro.Alphabet.DNA) + assert(last.getSequence.length === 1930) + assert(last.getLength === 1930L) + assert(last.getStart === 250000L) + assert(last.getEnd === 251930L) + assert(last.getIndex === 25) + assert(last.getSlices === 26) + assert(last.getTotalLength === 251930L) + } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala deleted file mode 100644 index 3e7047055a..0000000000 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentDatasetSuite.scala +++ /dev/null @@ -1,922 +0,0 @@ -/** - * Licensed to Big Data Genomics (BDG) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The BDG licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.bdgenomics.adam.rdd.contig - -import java.io.File -import java.lang.{ Long => JLong } - -import com.google.common.io.Files -import org.apache.parquet.filter2.predicate.Operators.{ BinaryColumn, LongColumn } -import org.apache.parquet.filter2.predicate.{ FilterApi, FilterPredicate } -import org.apache.parquet.io.api.Binary -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{ Dataset, SQLContext } -import org.bdgenomics.adam.models._ -import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } -import org.bdgenomics.adam.rdd.fragment.FragmentDataset -import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset -import org.bdgenomics.adam.rdd.variant.{ - GenotypeDataset, - VariantDataset, - VariantContextDataset -} -import org.bdgenomics.adam.sql.{ - AlignmentRecord => AlignmentRecordProduct, - Feature => FeatureProduct, - Fragment => FragmentProduct, - Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, - Variant => VariantProduct, - VariantContext => VariantContextProduct -} -import org.bdgenomics.adam.util.ADAMFunSuite -import org.bdgenomics.formats.avro._ -import scala.collection.mutable.ListBuffer - -object NucleotideContigFragmentDatasetSuite extends Serializable { - - def covFn(ncf: NucleotideContigFragment): Coverage = { - Coverage(ncf.getContigName, - ncf.getStart, - ncf.getEnd, - 1) - } - - def featFn(ncf: NucleotideContigFragment): Feature = { - Feature.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def fragFn(ncf: NucleotideContigFragment): Fragment = { - Fragment.newBuilder - .setName(ncf.getContigName) - .build - } - - def genFn(ncf: NucleotideContigFragment): Genotype = { - Genotype.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def readFn(ncf: NucleotideContigFragment): AlignmentRecord = { - AlignmentRecord.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def varFn(ncf: NucleotideContigFragment): Variant = { - Variant.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build - } - - def vcFn(ncf: NucleotideContigFragment): VariantContext = { - VariantContext(Variant.newBuilder - .setReferenceName(ncf.getContigName) - .setStart(ncf.getStart) - .setEnd(ncf.getEnd) - .build) - } -} - -class NucleotideContigFragmentDatasetSuite extends ADAMFunSuite { - - sparkTest("union two ncf genomic datasets together") { - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 10000L) - val fragments2 = sc.loadFasta(testFile("artificial.fa")) - val union = fragments1.union(fragments2) - assert(union.rdd.count === (fragments1.rdd.count + fragments2.rdd.count)) - assert(union.sequences.size === 2) - } - - sparkTest("round trip a ncf to parquet") { - def testMetadata(fRdd: NucleotideContigFragmentDataset) { - val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) - 
assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - } - - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - assert(fragments1.rdd.count === 8L) - assert(fragments1.dataset.count === 8L) - testMetadata(fragments1) - - // save using dataset path - val output1 = tmpFile("ctg.adam") - val dsBound = fragments1.transformDataset(ds => ds) - testMetadata(dsBound) - dsBound.saveAsParquet(output1) - val fragments2 = sc.loadContigFragments(output1) - testMetadata(fragments2) - assert(fragments2.rdd.count === 8L) - assert(fragments2.dataset.count === 8L) - - // save using rdd path - val output2 = tmpFile("ctg.adam") - val rddBound = fragments2.transform((rdd: RDD[NucleotideContigFragment]) => rdd) - testMetadata(rddBound) - rddBound.saveAsParquet(output2) - val fragments3 = sc.loadContigFragments(output2) - assert(fragments3.rdd.count === 8L) - assert(fragments3.dataset.count === 8L) - } - - sparkTest("round trip a ncf to partitioned parquet") { - def testMetadata(fRdd: NucleotideContigFragmentDataset) { - val sequenceRdd = fRdd.addSequence(SequenceRecord("aSequence", 1000L)) - assert(sequenceRdd.sequences.containsReferenceName("aSequence")) - } - - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - assert(fragments1.rdd.count === 8L) - assert(fragments1.dataset.count === 8L) - testMetadata(fragments1) - - // save using dataset path - val output1 = tmpFile("ctg.adam") - val dsBound = fragments1.transformDataset(ds => ds) - testMetadata(dsBound) - dsBound.saveAsPartitionedParquet(output1) - val fragments2 = sc.loadPartitionedParquetContigFragments(output1) - testMetadata(fragments2) - assert(fragments2.rdd.count === 8L) - assert(fragments2.dataset.count === 8L) - - // save using rdd path - val output2 = tmpFile("ctg.adam") - val rddBound = fragments2.transform((rdd: RDD[NucleotideContigFragment]) => rdd) - testMetadata(rddBound) - rddBound.saveAsPartitionedParquet(output2) - val fragments3 = sc.loadPartitionedParquetContigFragments(output2) - assert(fragments3.rdd.count === 8L) - assert(fragments3.dataset.count === 8L) - } - - sparkTest("save fasta back as a single file") { - val origFasta = testFile("artificial.fa") - val tmpFasta = tmpFile("test.fa") - sc.loadFasta(origFasta) - .saveAsFasta(tmpFasta, asSingleFile = true, lineWidth = 70) - checkFiles(origFasta, tmpFasta) - } - - sparkTest("generate sequence dict from fasta") { - - val ctg0 = NucleotideContigFragment.newBuilder() - .setContigName("chr0") - .setContigLength(1000L) - .build() - val ctg1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(900L) - .build() - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(ctg0, ctg1))) - - assert(rdd.sequences.containsReferenceName("chr0")) - val chr0 = rdd.sequences("chr0").get - assert(chr0.length === 1000L) - assert(rdd.sequences.containsReferenceName("chr1")) - val chr1 = rdd.sequences("chr1").get - assert(chr1.length === 900L) - } - - sparkTest("recover reference string from a single contig fragment") { - - val sequence = "ACTGTAC" - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val region = ReferenceRegion(fragment).get - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - assert(rdd.extract(region) === "ACTGTAC") - } - - sparkTest("recover trimmed reference string from a single contig fragment") { - - val 
sequence = "ACTGTAC" - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val region = new ReferenceRegion("chr1", 1L, 6L) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - assert(rdd.extract(region) === "CTGTA") - } - - sparkTest("recover reference string from multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(5L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(12L) - .setFragments(2) - .build() - val region0 = ReferenceRegion(fragment0).get - val region1 = ReferenceRegion(fragment1).get.merge(ReferenceRegion(fragment2).get) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - assert(rdd.extract(region0) === "ACTGTAC") - assert(rdd.extract(region1) === "GTACTCTCATG") - } - - sparkTest("extract sequences based on the list of reference regions") { - val test = "test" - - def dnas2fragments(dnas: Seq[String]): List[NucleotideContigFragment] = { - val (_, frags) = dnas.foldLeft((0L, List.empty[NucleotideContigFragment])) { - case ((start, acc), str) => - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("test") - .setStart(start) - .setLength(str.length: Long) - .setSequence(str) - .setEnd(start + str.length) - .build() - (start + str.length, fragment :: acc) - } - frags.reverse - } - - val dnas: Seq[String] = Vector( - "ACAGCTGATCTCCAGATATGACCATGGGTT", - "CAGCTGATCTCCAGATATGACCATGGGTTT", - "CCAGAAGTTTGAGCCACAAACCCATGGTCA" - ) - - val merged = dnas.reduce(_ + _) - - val record = SequenceRecord("test", merged.length) - - val dic = new SequenceDictionary(Vector(record)) - val frags = sc.parallelize(dnas2fragments(dnas)) - val fragments = NucleotideContigFragmentDataset(frags, dic) - - val byRegion = fragments.rdd.keyBy(ReferenceRegion(_)) - - val regions = List( - new ReferenceRegion(test, 0, 5), - new ReferenceRegion(test, 25, 35), - new ReferenceRegion(test, 40, 50), - new ReferenceRegion(test, 50, 70) - ) - - val results: Set[(ReferenceRegion, String)] = fragments.extractRegions(regions).collect().toSet - val seqs = regions.zip(List("ACAGC", "GGGTTCAGCT", "CCAGATATGA", "CCATGGGTTTCCAGAAGTTT")).toSet - assert(seqs === results) - } - - sparkTest("recover trimmed reference string from multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - 
.setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(5L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(11L) - .setFragments(2) - .build() - val region0 = new ReferenceRegion("chr1", 1L, 6L) - val region1 = new ReferenceRegion("chr2", 3L, 9L) - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - assert(rdd.extract(region0) === "CTGTA") - assert(rdd.extract(region1) === "CTCTCA") - } - - sparkTest("testing nondeterminism from reduce when recovering referencestring") { - - var fragments: ListBuffer[NucleotideContigFragment] = new ListBuffer[NucleotideContigFragment]() - for (a <- 0L to 1000L) { - val seq = "A" - val frag = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(1000L) - .setStart(a) - .setEnd(a + 1L) - .setSequence(seq) - .build() - fragments += frag - } - var passed = true - val rdd = NucleotideContigFragmentDataset(sc.parallelize(fragments.toList)) - try { - val result = rdd.extract(new ReferenceRegion("chr1", 0L, 1000L)) - } catch { - case e: AssertionError => passed = false - } - assert(passed == true) - } - - sparkTest("save single contig fragment as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with description as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setDescription("description") - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1 description") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null fields as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + 
"/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null fragment number as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - sparkTest("save single contig fragment with null number of fragments in contig as FASTA text file") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - - def validate(fileName: String) { - val fastaLines = scala.io.Source.fromFile(new File(fileName + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 2) - assert(fastaLines(0) === ">chr1") - assert(fastaLines(1) === "ACTGTAC") - } - - val outputFastaFile = tmpFile("test.fa") - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - validate(outputFastaFile) - - val outputFastaFile2 = tmpFile("test2.fa") - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile2) - validate(outputFastaFile2) - } - - sparkTest("save multiple contig fragments from same contig as FASTA text file") { - - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("ACTGTAC") - .setIndex(0) - .setFragments(3) - .build - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("GCATATC") - .setIndex(1) - .setFragments(3) - .build - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setSequence("CTGATCG") - .setIndex(2) - .setFragments(3) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, fragment1, fragment2))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 6) - assert(fastaLines(0) === ">chr1 fragment 1 of 3") - assert(fastaLines(1) === "ACTGTAC") - assert(fastaLines(2) === ">chr1 fragment 2 of 3") - assert(fastaLines(3) === "GCATATC") - assert(fastaLines(4) === ">chr1 fragment 3 of 3") - assert(fastaLines(5) === "CTGATCG") - } - - sparkTest("save multiple contig fragments with description from same contig as FASTA text file") { - - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("ACTGTAC") 
- .setIndex(0) - .setFragments(3) - .build - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("GCATATC") - .setIndex(1) - .setFragments(3) - .build - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(21L) - .setDescription("description") - .setSequence("CTGATCG") - .setIndex(2) - .setFragments(3) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment0, - fragment1, - fragment2))) - - val outputDir = Files.createTempDir() - val outputFastaFile = outputDir.getAbsolutePath + "/test.fa" - rdd.transform((rdd: RDD[NucleotideContigFragment]) => rdd.coalesce(1)).saveAsFasta(outputFastaFile) - val fastaLines = scala.io.Source.fromFile(new File(outputFastaFile + "/part-00000")).getLines().toSeq - - assert(fastaLines.length === 6) - assert(fastaLines(0) === ">chr1 description fragment 1 of 3") - assert(fastaLines(1) === "ACTGTAC") - assert(fastaLines(2) === ">chr1 description fragment 2 of 3") - assert(fastaLines(3) === "GCATATC") - assert(fastaLines(4) === ">chr1 description fragment 3 of 3") - assert(fastaLines(5) === "CTGATCG") - } - - sparkTest("merge single contig fragment null fragment number") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(null) - .setStart(null) - .setEnd(null) - .setFragments(null) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 1L) - assert(merged.rdd.first.getSequence() === "ACTGTAC") - } - - sparkTest("merge single contig fragment number zero") { - - val fragment = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence("ACTGTAC") - .setIndex(0) - .setStart(0L) - .setEnd(7L) - .setFragments(1) - .build - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 1L) - assert(merged.rdd.first.getSequence() === "ACTGTAC") - } - - sparkTest("merge multiple contig fragments") { - - val sequence = "ACTGTACTC" - val sequence0 = sequence.take(7) // ACTGTAC - val sequence1 = sequence.drop(3).take(5) // GTACT - val sequence2 = sequence.takeRight(6).reverse // CTCATG - val fragment0 = NucleotideContigFragment.newBuilder() - .setContigName("chr1") - .setContigLength(7L) - .setSequence(sequence0) - .setIndex(0) - .setStart(0L) - .setEnd(sequence0.length - 1L) - .setFragments(1) - .build() - val fragment1 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence1) - .setIndex(0) - .setStart(0L) - .setEnd(sequence1.length - 1L) - .setFragments(2) - .build() - val fragment2 = NucleotideContigFragment.newBuilder() - .setContigName("chr2") - .setContigLength(11L) - .setSequence(sequence2) - .setIndex(1) - .setStart(5L) - .setEnd(sequence2.length - 1L) - .setFragments(2) - .build() - - val rdd = NucleotideContigFragmentDataset(sc.parallelize(List(fragment2, - fragment1, - fragment0))) - val merged = rdd.mergeFragments() - - assert(merged.rdd.count == 2L) - - val collect = merged.rdd.collect - assert(collect(0).getSequence() === "ACTGTAC") - assert(collect(1).getSequence() === "GTACTCTCATG") - } - - sparkTest("save as parquet and apply predicate pushdown") { - val fragments1 = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - 
assert(fragments1.rdd.count === 8) - val output = tmpFile("contigs.adam") - fragments1.saveAsParquet(output) - val fragments2 = sc.loadContigFragments(output) - assert(fragments2.rdd.count === 8) - val fragments3 = sc.loadContigFragments(output, - optPredicate = Some( - // ReferenceRegion.toPredicate uses referenceName instead of contigName - FilterApi.and( - FilterApi.and( - FilterApi.eq[Binary, BinaryColumn]( - FilterApi.binaryColumn("contigName"), - Binary.fromString("HLA-DQB1*05:01:01:02")), - FilterApi.gt[JLong, LongColumn](FilterApi.longColumn("end"), 500L)), - FilterApi.ltEq[JLong, LongColumn](FilterApi.longColumn("start"), 1500L)) - ) - ) - assert(fragments3.rdd.count === 2) - } - - sparkTest("load fasta sequences from GFF3 file") { - val sequences = sc.loadFasta(testFile("ctg123.fasta.gff3")) - assert(sequences.rdd.count() === 4) - } - - sparkTest("transform contigs to coverage genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(coverage: CoverageDataset) { - val tempPath = tmpLocation(".bed") - coverage.save(tempPath, false, false) - - assert(sc.loadCoverage(tempPath).rdd.count === 8) - } - - val coverage = contigs.transmute[Coverage, Coverage, CoverageDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.covFn) - }) - - checkSave(coverage) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val coverageDs: CoverageDataset = contigs.transmuteDataset[Coverage, Coverage, CoverageDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => NucleotideContigFragmentDatasetSuite.covFn(r.toAvro)) - }) - - checkSave(coverageDs) - } - - sparkTest("transform contigs to feature genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(features: FeatureDataset) { - val tempPath = tmpLocation(".bed") - features.saveAsBed(tempPath) - - assert(sc.loadFeatures(tempPath).rdd.count === 8) - } - - val features: FeatureDataset = contigs.transmute[Feature, FeatureProduct, FeatureDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.featFn) - }) - - checkSave(features) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val featuresDs: FeatureDataset = contigs.transmuteDataset[Feature, FeatureProduct, FeatureDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - FeatureProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.featFn(r.toAvro)) - }) - }) - - checkSave(featuresDs) - } - - sparkTest("transform contigs to fragment genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(fragments: FragmentDataset) { - val tempPath = tmpLocation(".adam") - fragments.saveAsParquet(tempPath) - - assert(sc.loadFragments(tempPath).rdd.count === 8) - } - - val fragments: FragmentDataset = contigs.transmute[Fragment, FragmentProduct, FragmentDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.fragFn) - }) - - checkSave(fragments) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val fragmentsDs: FragmentDataset = contigs.transmuteDataset[Fragment, FragmentProduct, FragmentDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - FragmentProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.fragFn(r.toAvro)) - }) - }) - - checkSave(fragmentsDs) 
- } - - sparkTest("transform contigs to read genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(reads: AlignmentRecordDataset) { - val tempPath = tmpLocation(".adam") - reads.saveAsParquet(tempPath) - - assert(sc.loadAlignments(tempPath).rdd.count === 8) - } - - val reads: AlignmentRecordDataset = contigs.transmute[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.readFn) - }) - - checkSave(reads) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val readsDs: AlignmentRecordDataset = contigs.transmuteDataset[AlignmentRecord, AlignmentRecordProduct, AlignmentRecordDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - AlignmentRecordProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.readFn(r.toAvro)) - }) - }) - - checkSave(readsDs) - } - - sparkTest("transform contigs to genotype genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(genotypes: GenotypeDataset) { - val tempPath = tmpLocation(".adam") - genotypes.saveAsParquet(tempPath) - - assert(sc.loadGenotypes(tempPath).rdd.count === 8) - } - - val genotypes: GenotypeDataset = contigs.transmute[Genotype, GenotypeProduct, GenotypeDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.genFn) - }) - - checkSave(genotypes) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val genotypesDs: GenotypeDataset = contigs.transmuteDataset[Genotype, GenotypeProduct, GenotypeDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - GenotypeProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.genFn(r.toAvro)) - }) - }) - - checkSave(genotypesDs) - } - - sparkTest("transform contigs to variant genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(variants: VariantDataset) { - val tempPath = tmpLocation(".adam") - variants.saveAsParquet(tempPath) - - assert(sc.loadVariants(tempPath).rdd.count === 8) - } - - val variants: VariantDataset = contigs.transmute[Variant, VariantProduct, VariantDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.varFn) - }) - - checkSave(variants) - - val sqlContext = SQLContext.getOrCreate(sc) - import sqlContext.implicits._ - - val variantsDs: VariantDataset = contigs.transmuteDataset[Variant, VariantProduct, VariantDataset]( - (ds: Dataset[NucleotideContigFragmentProduct]) => { - ds.map(r => { - VariantProduct.fromAvro( - NucleotideContigFragmentDatasetSuite.varFn(r.toAvro)) - }) - }) - - checkSave(variantsDs) - } - - sparkTest("transform contigs to variant context genomic dataset") { - val contigs = sc.loadFasta(testFile("HLA_DQB1_05_01_01_02.fa"), 1000L) - - def checkSave(variantContexts: VariantContextDataset) { - assert(variantContexts.rdd.count === 8) - } - - val variantContexts: VariantContextDataset = contigs.transmute[VariantContext, VariantContextProduct, VariantContextDataset]( - (rdd: RDD[NucleotideContigFragment]) => { - rdd.map(NucleotideContigFragmentDatasetSuite.vcFn) - }) - - checkSave(variantContexts) - } -} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala index 
fcc42a2077..6dd1fc651b 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/CoverageDatasetSuite.scala @@ -28,9 +28,9 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -41,7 +41,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -50,11 +50,9 @@ import org.bdgenomics.formats.avro._ object CoverageDatasetSuite extends Serializable { - def ncfFn(cov: Coverage): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(cov.referenceName) - .setStart(cov.start) - .setEnd(cov.end) + def sliceFn(cov: Coverage): Slice = { + Slice.newBuilder + .setName(cov.referenceName) .build } @@ -289,38 +287,38 @@ class CoverageDatasetSuite extends ADAMFunSuite { assert(collapsed.rdd.count == 8) } - sparkTest("transform coverage to contig rdd") { + sparkTest("transform coverage to slice genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 3) + assert(sc.loadSlices(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentDataset = coverage.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = coverage.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Coverage]) => { - rdd.map(CoverageDatasetSuite.ncfFn) + rdd.map(CoverageDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = coverage.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = coverage.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[Coverage]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - CoverageDatasetSuite.ncfFn(r)) + SliceProduct.fromAvro( + CoverageDatasetSuite.sliceFn(r)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } - sparkTest("transform coverage to feature rdd") { + sparkTest("transform coverage to feature genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(features: FeatureDataset) { @@ -351,7 +349,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(featuresDs) } - sparkTest("transform coverage to fragment rdd") { + sparkTest("transform coverage to fragment genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(fragments: FragmentDataset) { @@ -382,7 +380,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(fragmentsDs) } - sparkTest("transform coverage to read rdd") { + 
sparkTest("transform coverage to read genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(reads: AlignmentRecordDataset) { @@ -413,7 +411,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(readsDs) } - sparkTest("transform coverage to genotype rdd") { + sparkTest("transform coverage to genotype genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(genotypes: GenotypeDataset) { @@ -444,7 +442,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(genotypesDs) } - sparkTest("transform coverage to variant rdd") { + sparkTest("transform coverage to variant genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(variants: VariantDataset) { @@ -475,7 +473,7 @@ class CoverageDatasetSuite extends ADAMFunSuite { checkSave(variantsDs) } - sparkTest("transform coverage to variant context rdd") { + sparkTest("transform coverage to variant context genomic dataset") { val coverage = sc.loadCoverage(testFile("sample_coverage.bed")) def checkSave(variantContexts: VariantContextDataset) { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala index 14678051bb..cc8561f198 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/feature/FeatureDatasetSuite.scala @@ -28,9 +28,9 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -41,7 +41,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -72,9 +72,9 @@ object FeatureDatasetSuite extends Serializable { .build } - def ncfFn(f: Feature): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(f.getReferenceName) + def sliceFn(f: Feature): Slice = { + Slice.newBuilder + .setName(f.getReferenceName) .build } @@ -1002,38 +1002,38 @@ class FeatureDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 4) } - sparkTest("transform features to contig rdd") { + sparkTest("transform features to slice genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 3) + assert(sc.loadSlices(tempPath).rdd.count === 3) } - val contigs: NucleotideContigFragmentDataset = features.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = features.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Feature]) => { - rdd.map(FeatureDatasetSuite.ncfFn) + rdd.map(FeatureDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val 
sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = features.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = features.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[FeatureProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - FeatureDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + FeatureDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } - sparkTest("transform features to coverage rdd") { + sparkTest("transform features to coverage genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(coverage: CoverageDataset) { @@ -1061,7 +1061,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(coverageDs) } - sparkTest("transform features to fragment rdd") { + sparkTest("transform features to fragment genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(fragments: FragmentDataset) { @@ -1092,7 +1092,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(fragmentsDs) } - sparkTest("transform features to read rdd") { + sparkTest("transform features to read genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(reads: AlignmentRecordDataset) { @@ -1123,7 +1123,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(readsDs) } - sparkTest("transform features to genotype rdd") { + sparkTest("transform features to genotype genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(genotypes: GenotypeDataset) { @@ -1154,7 +1154,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(genotypesDs) } - sparkTest("transform features to variant rdd") { + sparkTest("transform features to variant genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(variants: VariantDataset) { @@ -1185,7 +1185,7 @@ class FeatureDatasetSuite extends ADAMFunSuite { checkSave(variantsDs) } - sparkTest("transform features to variant context rdd") { + sparkTest("transform features to variant context genomic dataset") { val features = sc.loadFeatures(testFile("sample_coverage.bed")) def checkSave(variantContexts: VariantContextDataset) { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala index 75e05fc8c8..5dfa0c3b7f 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentDatasetSuite.scala @@ -27,7 +27,6 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, @@ -35,6 +34,7 @@ import org.bdgenomics.adam.rdd.read.{ AnySAMOutFormatter, QualityScoreBin } +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -45,7 +45,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => 
NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -380,35 +380,35 @@ class FragmentDatasetSuite extends ADAMFunSuite { assert(rdd4.dataset.count === 20) } - sparkTest("transform fragments to contig genomic dataset") { + sparkTest("transform fragments to slice genomic dataset") { val fragments = sc.loadFragments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentDataset) { + def checkSave(sliceRdd: SliceDataset) { val tempPath = tmpLocation(".fa") - ncRdd.saveAsFasta(tempPath) + sliceRdd.saveAsFasta(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) + assert(sc.loadSlices(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentDataset = fragments.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = fragments.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Fragment]) => { - rdd.map(AlignmentRecordDatasetSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.sliceFn) }) - checkSave(features) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentDataset = fragments.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = fragments.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[FragmentProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + AlignmentRecordDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(featuresDs) + checkSave(slicesDs) } sparkTest("transform fragments to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala index 64f2892e25..6accf636dc 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordDatasetSuite.scala @@ -41,9 +41,9 @@ import org.bdgenomics.adam.rdd.{ ADAMContext, TestSaveArgs } -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.rdd.variant.{ GenotypeDataset, VariantDataset, @@ -55,7 +55,7 @@ import org.bdgenomics.adam.sql.{ Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -89,15 +89,15 @@ object AlignmentRecordDatasetSuite extends Serializable { f.getAlignments().get(0) } - def ncfFn(r: AlignmentRecord): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(r.getReferenceName) + def sliceFn(r: AlignmentRecord): Slice = { + Slice.newBuilder + .setName(r.getReferenceName) .setSequence(r.getSequence) .build } - def ncfFn(f: Fragment): NucleotideContigFragment = { - ncfFn(fragToRead(f)) + def sliceFn(f: Fragment): Slice = { + sliceFn(fragToRead(f)) } def covFn(r: AlignmentRecord): Coverage = { @@ -1440,35 +1440,35 @@ class 
AlignmentRecordDatasetSuite extends ADAMFunSuite { assert(kmerCounts.toDF().where($"kmer" === "CCAAGA" && $"count" === 3).count === 1) } - sparkTest("transform reads to contig genomic dataset") { + sparkTest("transform reads to slice genomic dataset") { val reads = sc.loadAlignments(testFile("small.sam")) - def checkSave(ncRdd: NucleotideContigFragmentDataset) { + def checkSave(sliceRdd: SliceDataset) { val tempPath = tmpLocation(".fa") - ncRdd.saveAsFasta(tempPath) + sliceRdd.saveAsFasta(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count.toInt === 20) + assert(sc.loadSlices(tempPath).rdd.count.toInt === 20) } - val features: NucleotideContigFragmentDataset = reads.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = reads.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[AlignmentRecord]) => { - rdd.map(AlignmentRecordDatasetSuite.ncfFn) + rdd.map(AlignmentRecordDatasetSuite.sliceFn) }) - checkSave(features) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val featuresDs: NucleotideContigFragmentDataset = reads.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = reads.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[AlignmentRecordProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - AlignmentRecordDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + AlignmentRecordDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(featuresDs) + checkSave(slicesDs) } sparkTest("transform reads to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala index 4512e1974d..bcfa046080 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read import htsjdk.samtools.ValidationStringency import org.apache.spark.rdd.RDD import org.bdgenomics.adam.util.{ ADAMFunSuite, ReferenceContigMap } -import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment, Reference } +import org.bdgenomics.formats.avro.{ AlignmentRecord, Reference, Slice } class MDTaggingSuite extends ADAMFunSuite { val chr1 = @@ -36,14 +36,14 @@ class MDTaggingSuite extends ADAMFunSuite { .setLength(100L) .build() - def makeFrags(frags: (Reference, Int, String)*): RDD[NucleotideContigFragment] = + def makeFrags(frags: (Reference, Int, String)*): RDD[Slice] = sc.parallelize( for { (reference, start, seq) <- frags } yield ( - NucleotideContigFragment.newBuilder - .setContigLength(reference.getLength) - .setContigName(reference.getName) + Slice.newBuilder() + .setTotalLength(reference.getLength) + .setName(reference.getName) .setStart(start.toLong) .setEnd(start.toLong + seq.length) .setSequence(seq).build() diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala new file mode 100644 index 0000000000..c709ff70db --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/ReadDatasetSuite.scala @@ -0,0 +1,189 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.read + +import java.io.File + +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + ReferenceRegion, + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.rdd.feature.FeatureDataset +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Feature, + Read, + Strand +} + +class ReadDatasetSuite extends ADAMFunSuite { + + val r1 = Read.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setLength(4L) + .setSequence("actg") + .setQualityScores("9999") + .build + + val r2 = Read.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setLength(4L) + .setSequence("actg") + .setQualityScores("9999") + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + def tempLocation(suffix: String = ".adam"): String = { + val tempFile = File.createTempFile("ReadDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("create a new read genomic dataset") { + val reads: RDD[Read] = sc.parallelize(Seq(r1, r2)) + assert(ReadDataset(reads).rdd.count === 2) + } + + sparkTest("create a new read genomic dataset with sequence dictionary") { + val reads: RDD[Read] = sc.parallelize(Seq(r1, r2)) + assert(ReadDataset(reads, sd).rdd.count === 2) + } + + sparkTest("save as parquet") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".adam") + reads.save(outputPath, asSingleFile = false) + } + + sparkTest("round trip as parquet") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".adam") + reads.saveAsParquet(outputPath) + + val parquetReads = sc.loadParquetReads(outputPath) + assert(parquetReads.rdd.count === 2) + } + + sparkTest("save as fastq") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".fastq") + reads.save(outputPath, asSingleFile = false) + } + + sparkTest("save as single file fastq") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val outputPath = tempLocation(".fastq") + reads.save(outputPath, asSingleFile = true) + } + + sparkTest("filter read genomic dataset by reference region") { + val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2))) + val filtered = reads.filterByOverlappingRegion(ReferenceRegion.all("name1")) + assert(filtered.rdd.count() === 1) + } + + sparkTest("broadcast region join reads and features") { + val feature = Feature.newBuilder() + .setReferenceName("name2") + .setStart(0L) + .setEnd(3L) + .build + + val reads: ReadDataset = 
ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val features: FeatureDataset = FeatureDataset(sc.parallelize(Seq(feature)))
+
+    val kv = reads.broadcastRegionJoin(features).rdd.first
+    assert(kv._1 === r2)
+    assert(kv._2 === feature)
+  }
+
+  sparkTest("shuffle region join reads and features") {
+    val feature = Feature.newBuilder()
+      .setReferenceName("name1")
+      .setStart(0L)
+      .setEnd(3L)
+      .build
+
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val features: FeatureDataset = FeatureDataset(sc.parallelize(Seq(feature)))
+
+    val kv = reads.shuffleRegionJoin(features).rdd.first
+    assert(kv._1 === r1)
+    assert(kv._2 === feature)
+  }
+
+  sparkTest("convert reads to sequences") {
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val sequences = reads.toSequences.rdd.collect()
+    assert(sequences.length === 2)
+
+    val s1 = sequences(0)
+    assert(s1.getName === "name1")
+    assert(s1.getDescription === "description")
+    assert(s1.getAlphabet === Alphabet.DNA)
+    assert(s1.getLength === 4L)
+    assert(s1.getSequence === "actg")
+
+    val s2 = sequences(1)
+    assert(s2.getName === "name2")
+    assert(s2.getDescription === "description")
+    assert(s2.getAlphabet === Alphabet.DNA)
+    assert(s2.getLength === 4L)
+    assert(s2.getSequence === "actg")
+  }
+
+  sparkTest("convert reads to slices") {
+    val reads: ReadDataset = ReadDataset(sc.parallelize(Seq(r1, r2)))
+    val slices = reads.toSlices.rdd.collect()
+    assert(slices.length === 2)
+
+    val s1 = slices(0)
+    assert(s1.getName === "name1")
+    assert(s1.getDescription === "description")
+    assert(s1.getAlphabet === Alphabet.DNA)
+    assert(s1.getLength === 4L)
+    assert(s1.getTotalLength === 4L)
+    assert(s1.getSequence === "actg")
+    assert(s1.getStart === 0L)
+    assert(s1.getEnd === 4L)
+    assert(s1.getStrand === Strand.INDEPENDENT)
+
+    val s2 = slices(1)
+    assert(s2.getName === "name2")
+    assert(s2.getDescription === "description")
+    assert(s2.getAlphabet === Alphabet.DNA)
+    assert(s2.getLength === 4L)
+    assert(s2.getTotalLength === 4L)
+    assert(s2.getSequence === "actg")
+    assert(s2.getStart === 0L)
+    assert(s2.getEnd === 4L)
+    assert(s2.getStrand === Strand.INDEPENDENT)
+  }
+}
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala
similarity index 50%
rename from adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala
rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala
index 6bbf5bd314..6d453dfb5d 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragmentsSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/FlankSlicesSuite.scala
@@ -15,73 +15,73 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ -package org.bdgenomics.adam.rdd.contig +package org.bdgenomics.adam.rdd.sequence import org.bdgenomics.adam.models.ReferenceRegion -import org.bdgenomics.formats.avro.NucleotideContigFragment +import org.bdgenomics.formats.avro.Slice import org.scalatest.FunSuite -class FlankReferenceFragmentsSuite extends FunSuite { +class FlankSlicesSuite extends FunSuite { - test("don't put flanks on non-adjacent fragments") { + test("don't put flanks on non-adjacent slices") { val testIter = Iterator((ReferenceRegion("chr1", 0L, 10L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("AAAAATTTTT") .setStart(0L) .setEnd(9L) .build()), (ReferenceRegion("chr1", 20L, 30L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("CCCCCGGGGG") .setStart(20L) .setEnd(29L) .build())) - val fragments = FlankReferenceFragments.flank(testIter, 5).toSeq + val slices = FlankSlices.flank(testIter, 5).toSeq - assert(fragments.size === 2) - fragments.foreach(_.getSequence.length === 10) - assert(fragments(0).getSequence === "AAAAATTTTT") - assert(fragments(0).getStart === 0L) - assert(fragments(0).getEnd === 9L) - assert(fragments(1).getSequence === "CCCCCGGGGG") - assert(fragments(1).getStart === 20L) - assert(fragments(1).getEnd === 29L) + assert(slices.size === 2) + slices.foreach(_.getSequence.length === 10) + assert(slices(0).getSequence === "AAAAATTTTT") + assert(slices(0).getStart === 0L) + assert(slices(0).getEnd === 9L) + assert(slices(1).getSequence === "CCCCCGGGGG") + assert(slices(1).getStart === 20L) + assert(slices(1).getEnd === 29L) } - test("put flanks on adjacent fragments") { + test("put flanks on adjacent slices") { val testIter = Iterator((ReferenceRegion("chr1", 0L, 10L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("AAAAATTTTT") .setStart(0L) .setEnd(9L) .build()), (ReferenceRegion("chr1", 10L, 20L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("NNNNNUUUUU") .setStart(10L) .setEnd(19L) .build()), (ReferenceRegion("chr1", 20L, 30L), - NucleotideContigFragment.newBuilder() - .setContigName("chr1") + Slice.newBuilder() + .setName("chr1") .setSequence("CCCCCGGGGG") .setStart(20L) .setEnd(29L) .build())) - val fragments = FlankReferenceFragments.flank(testIter, 5).toSeq + val slices = FlankSlices.flank(testIter, 5).toSeq - assert(fragments.size === 3) - assert(fragments(0).getSequence === "AAAAATTTTTNNNNN") - assert(fragments(0).getStart === 0L) - assert(fragments(0).getEnd === 14L) - assert(fragments(1).getSequence === "TTTTTNNNNNUUUUUCCCCC") - assert(fragments(1).getStart === 5L) - assert(fragments(1).getEnd === 24L) - assert(fragments(2).getSequence === "UUUUUCCCCCGGGGG") - assert(fragments(2).getStart === 15L) - assert(fragments(2).getEnd === 29L) + assert(slices.size === 3) + assert(slices(0).getSequence === "AAAAATTTTTNNNNN") + assert(slices(0).getStart === 0L) + assert(slices(0).getEnd === 14L) + assert(slices(1).getSequence === "TTTTTNNNNNUUUUUCCCCC") + assert(slices(1).getStart === 5L) + assert(slices(1).getEnd === 24L) + assert(slices(2).getSequence === "UUUUUCCCCCGGGGG") + assert(slices(2).getStart === 15L) + assert(slices(2).getEnd === 29L) } } diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala 
b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala new file mode 100644 index 0000000000..27e636fcec --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SequenceDatasetSuite.scala @@ -0,0 +1,398 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.sequence + +import com.google.common.collect.ComparisonChain +import java.io.File +import java.util.Comparator +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + ReferenceRegion, + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Sequence, + Strand +} + +class SequenceDatasetSuite extends ADAMFunSuite { + + val s1 = Sequence.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setLength(4L) + .build + + val s2 = Sequence.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setLength(4L) + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + def tempLocation(suffix: String = ".adam"): String = { + val tempFile = File.createTempFile("SequenceDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("create a new sequence genomic dataset") { + val sequences: RDD[Sequence] = sc.parallelize(Seq(s1, s2)) + assert(SequenceDataset(sequences).rdd.count === 2) + } + + sparkTest("create a new sequence genomic dataset with sequence dictionary") { + val sequences: RDD[Sequence] = sc.parallelize(Seq(s1, s2)) + assert(SequenceDataset(sequences, sd).rdd.count === 2) + } + + sparkTest("save as parquet") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".adam") + sequences.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("round trip as parquet") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".adam") + sequences.saveAsParquet(outputPath) + + val parquetSequences = sc.loadParquetSequences(outputPath) + assert(parquetSequences.rdd.count === 2) + } + + sparkTest("save as fasta") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".fasta") + sequences.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("save as single file fasta") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val outputPath = tempLocation(".fasta") + 
sequences.save(outputPath, asSingleFile = true, disableFastConcat = false) + } + + sparkTest("convert sequences to reads") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val reads = sequences.toReads.rdd.collect() + assert(reads.length === 2) + + val r1 = reads(0) + assert(r1.getName === "name1") + assert(r1.getDescription === "description") + assert(r1.getAlphabet === Alphabet.DNA) + assert(r1.getLength === 4L) + assert(r1.getSequence === "actg") + assert(r1.getQualityScores === "BBBB") + + val r2 = reads(1) + assert(r2.getName === "name2") + assert(r2.getDescription === "description") + assert(r2.getAlphabet === Alphabet.DNA) + assert(r2.getLength === 4L) + assert(r2.getSequence === "actg") + assert(r2.getQualityScores === "BBBB") + } + + sparkTest("convert sequences to slices") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.toSlices.rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences to a maximum length") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(3L).rdd.collect() + assert(slices.length === 4) + + slices.sortWith((v1, v2) => ComparisonChain.start() + .compare(v1.getName, v2.getName) + .compare(v1.getStart, v2.getStart) + .result() < 0 + ) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 3L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "act") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + assert(slice1.getIndex === 0) + assert(slice1.getSlices === 2) + + val slice2 = slices(1) + assert(slice2.getName === "name1") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 1L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "g") + assert(slice2.getStart === 3L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + assert(slice2.getIndex === 1) + assert(slice2.getSlices === 2) + + val slice3 = slices(2) + assert(slice3.getName === "name2") + assert(slice3.getDescription === "description") + assert(slice3.getAlphabet === Alphabet.DNA) + assert(slice3.getLength === 3L) + assert(slice3.getTotalLength === 4L) + assert(slice3.getSequence === "act") + assert(slice3.getStart === 0L) + assert(slice3.getEnd === 3L) + assert(slice3.getStrand === Strand.INDEPENDENT) + assert(slice3.getIndex === 0) + assert(slice3.getSlices === 2) + + val slice4 = slices(3) + assert(slice4.getName === "name2") + 
assert(slice4.getDescription === "description") + assert(slice4.getAlphabet === Alphabet.DNA) + assert(slice4.getLength === 1L) + assert(slice4.getTotalLength === 4L) + assert(slice4.getSequence === "g") + assert(slice4.getStart === 3L) + assert(slice4.getEnd === 4L) + assert(slice4.getStrand === Strand.INDEPENDENT) + assert(slice4.getIndex === 1) + assert(slice4.getSlices === 2) + } + + sparkTest("slice sequences shorter than maximum length") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(10L).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + assert(slice1.getIndex === 0) + assert(slice1.getSlices === 1) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + assert(slice2.getIndex === 0) + assert(slice2.getSlices === 1) + } + + sparkTest("filter sequences by overlapping region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val filtered = sequences.filterByOverlappingRegion(ReferenceRegion("name1", 1L, 3L)).rdd.collect() + assert(filtered.length == 1) + + val sequence1 = filtered(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + } + + sparkTest("filter sequences failing to overlap region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + assert(sequences.filterByOverlappingRegion(ReferenceRegion("name1", 99L, 101L)).rdd.isEmpty) + } + + sparkTest("filter sequences by overlapping regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 1L, 3L), ReferenceRegion("name2", 1L, 3L)) + val filtered = sequences.filterByOverlappingRegions(regions).rdd.collect() + assert(filtered.length == 2) + + val sequence1 = filtered(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + + val sequence2 = filtered(1) + assert(sequence2.getName === "name2") + assert(sequence2.getDescription === "description") + assert(sequence2.getAlphabet === Alphabet.DNA) + assert(sequence2.getLength === 4L) + assert(sequence2.getSequence === "actg") + } + + sparkTest("filter sequences failing to overlap regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 99L, 101L), ReferenceRegion("name2", 99L, 101L)) + assert(sequences.filterByOverlappingRegions(regions).rdd.isEmpty) + } + + sparkTest("slice sequences overlapping a smaller region") { + val sequences: SequenceDataset = 
SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 1L, 3L)).rdd.collect() + assert(slices.length === 1) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 2L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "ct") + assert(slice1.getStart === 1L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences overlapping a larger region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 0L, 99L)).rdd.collect() + assert(slices.length === 1) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences failing to overlap a region") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val slices = sequences.slice(ReferenceRegion("name1", 99L, 101L)).rdd.collect() + assert(slices.length === 0) + } + + sparkTest("slice sequences overlapping smaller regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 1L, 3L), ReferenceRegion("name2", 1L, 3L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 2L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "ct") + assert(slice1.getStart === 1L) + assert(slice1.getEnd === 3L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 2L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "ct") + assert(slice2.getStart === 1L) + assert(slice2.getEnd === 3L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences overlapping larger regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 0L, 99L), ReferenceRegion("name2", 0L, 99L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 2) + + val slice1 = slices(0) + assert(slice1.getName === "name1") + assert(slice1.getDescription === "description") + assert(slice1.getAlphabet === Alphabet.DNA) + assert(slice1.getLength === 4L) + assert(slice1.getTotalLength === 4L) + assert(slice1.getSequence === "actg") + assert(slice1.getStart === 0L) + assert(slice1.getEnd === 4L) + assert(slice1.getStrand === Strand.INDEPENDENT) + + val slice2 = slices(1) + assert(slice2.getName === "name2") + assert(slice2.getDescription === "description") + assert(slice2.getAlphabet === Alphabet.DNA) + assert(slice2.getLength === 4L) + assert(slice2.getTotalLength === 4L) + assert(slice2.getSequence === "actg") + assert(slice2.getStart === 0L) + 
assert(slice2.getEnd === 4L) + assert(slice2.getStrand === Strand.INDEPENDENT) + } + + sparkTest("slice sequences failing to overlap regions") { + val sequences: SequenceDataset = SequenceDataset(sc.parallelize(Seq(s1, s2))) + val regions = List(ReferenceRegion("name1", 99L, 101L), ReferenceRegion("name2", 99L, 101L)) + val slices = sequences.slice(regions).rdd.collect() + assert(slices.length === 0) + } +} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala new file mode 100644 index 0000000000..8f910f9920 --- /dev/null +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/sequence/SliceDatasetSuite.scala @@ -0,0 +1,173 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.sequence + +import java.io.File + +import org.apache.spark.rdd.RDD +import org.bdgenomics.adam.models.{ + SequenceDictionary, + SequenceRecord +} +import org.bdgenomics.adam.rdd.ADAMContext._ +import org.bdgenomics.adam.util.ADAMFunSuite +import org.bdgenomics.formats.avro.{ + Alphabet, + Slice, + Strand +} + +class SliceDatasetSuite extends ADAMFunSuite { + + val s1 = Slice.newBuilder() + .setName("name1") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("actg") + .setStart(0L) + .setEnd(3L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val s2 = Slice.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("aatt") + .setStart(0L) + .setEnd(3L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val s3 = Slice.newBuilder() + .setName("name2") + .setDescription("description") + .setAlphabet(Alphabet.DNA) + .setSequence("ccgg") + .setStart(4L) + .setEnd(7L) + .setStrand(Strand.INDEPENDENT) + .setLength(4L) + .build + + val sd = SequenceDictionary( + SequenceRecord("name1", 4), + SequenceRecord("name2", 4) + ) + + sparkTest("create a new slice genomic dataset") { + val slices: RDD[Slice] = sc.parallelize(Seq(s1, s2, s3)) + assert(SliceDataset(slices).rdd.count === 3) + } + + sparkTest("create a new slice genomic dataset with sequence dictionary") { + val slices: RDD[Slice] = sc.parallelize(Seq(s1, s2, s3)) + assert(SliceDataset(slices, sd).rdd.count === 3) + } + + sparkTest("merge slices into a sequence genomic dataset") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val sequences = slices.merge() + assert(sequences.rdd.count === 2) + + val seqs = sequences.rdd.collect + val seq1 = seqs(0) + val seq2 = seqs(1) + + assert(seq1.getLength === 4L) + assert(seq2.getLength === 8L) + assert(seq2.getSequence === "aattccgg") + } + + def tempLocation(suffix: String = ".adam"): String = { + val 
tempFile = File.createTempFile("SliceDatasetSuite", "") + val tempDir = tempFile.getParentFile + new File(tempDir, tempFile.getName + suffix).getAbsolutePath + } + + sparkTest("save as parquet") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".adam") + slices.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("round trip as parquet") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".adam") + slices.saveAsParquet(outputPath) + + val parquetSlices = sc.loadParquetSlices(outputPath) + assert(parquetSlices.rdd.count === 3) + } + + sparkTest("save as fasta") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".fasta") + slices.save(outputPath, asSingleFile = false, disableFastConcat = false) + } + + sparkTest("save as single file fasta") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2, s3))) + val outputPath = tempLocation(".fasta") + slices.save(outputPath, asSingleFile = true, disableFastConcat = false) + } + + sparkTest("convert slices to reads") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2))) + val reads = slices.toReads.rdd.collect() + assert(reads.length === 2) + + val r1 = reads(0) + assert(r1.getName === "name1") + assert(r1.getDescription === "description") + assert(r1.getAlphabet === Alphabet.DNA) + assert(r1.getLength === 4L) + assert(r1.getSequence === "actg") + assert(r1.getQualityScores === "BBBB") + + val r2 = reads(1) + assert(r2.getName === "name2") + assert(r2.getDescription === "description") + assert(r2.getAlphabet === Alphabet.DNA) + assert(r2.getLength === 4L) + assert(r2.getSequence === "aatt") + assert(r2.getQualityScores === "BBBB") + } + + sparkTest("convert slices to sequences") { + val slices: SliceDataset = SliceDataset(sc.parallelize(Seq(s1, s2))) + val sequences = slices.toSequences.rdd.collect() + assert(sequences.length === 2) + + val sequence1 = sequences(0) + assert(sequence1.getName === "name1") + assert(sequence1.getDescription === "description") + assert(sequence1.getAlphabet === Alphabet.DNA) + assert(sequence1.getLength === 4L) + assert(sequence1.getSequence === "actg") + + val sequence2 = sequences(1) + assert(sequence2.getName === "name2") + assert(sequence2.getDescription === "description") + assert(sequence2.getAlphabet === Alphabet.DNA) + assert(sequence2.getLength === 4L) + assert(sequence2.getSequence === "aatt") + } +} diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala index 446c754ae5..87c243dfe1 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/GenotypeDatasetSuite.scala @@ -32,16 +32,16 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => 
GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -71,9 +71,9 @@ object GenotypeDatasetSuite extends Serializable { .build } - def ncfFn(g: Genotype): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(g.getReferenceName) + def sliceFn(g: Genotype): Slice = { + Slice.newBuilder + .setName(g.getReferenceName) .build } @@ -386,35 +386,35 @@ class GenotypeDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 18) } - sparkTest("transform genotypes to contig genomic dataset") { + sparkTest("transform genotypes to slice genomic dataset") { val genotypes = sc.loadGenotypes(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 18) + assert(sc.loadSlices(tempPath).rdd.count === 18) } - val contigs: NucleotideContigFragmentDataset = genotypes.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = genotypes.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Genotype]) => { - rdd.map(GenotypeDatasetSuite.ncfFn) + rdd.map(GenotypeDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = genotypes.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = genotypes.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[GenotypeProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - GenotypeDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + GenotypeDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } sparkTest("transform genotypes to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala index 16f4ce328f..90aff20e27 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantContextDatasetSuite.scala @@ -40,16 +40,16 @@ import org.bdgenomics.adam.models.{ } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rdd.TestSaveArgs -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -395,22 +395,22 @@ class VariantContextDatasetSuite extends ADAMFunSuite { } } - sparkTest("transform variant contexts to contig genomic dataset") { + sparkTest("transform variant contexts to slice genomic 
dataset") { val variantContexts = sc.loadVcf(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 6) + assert(sc.loadSlices(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentDataset = variantContexts.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = variantContexts.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[VariantContext]) => { - rdd.map(VariantDatasetSuite.ncfFn) + rdd.map(VariantDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) } sparkTest("transform variant contexts to coverage genomic dataset") { diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala index 4a461d517f..23a1af2416 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/variant/VariantDatasetSuite.scala @@ -28,16 +28,16 @@ import org.bdgenomics.adam.models.{ VariantContext } import org.bdgenomics.adam.rdd.ADAMContext._ -import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset } import org.bdgenomics.adam.rdd.fragment.FragmentDataset import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset +import org.bdgenomics.adam.rdd.sequence.SliceDataset import org.bdgenomics.adam.sql.{ AlignmentRecord => AlignmentRecordProduct, Feature => FeatureProduct, Fragment => FragmentProduct, Genotype => GenotypeProduct, - NucleotideContigFragment => NucleotideContigFragmentProduct, + Slice => SliceProduct, Variant => VariantProduct, VariantContext => VariantContextProduct } @@ -79,14 +79,14 @@ object VariantDatasetSuite extends Serializable { fragFn(vc.variant.variant) } - def ncfFn(v: Variant): NucleotideContigFragment = { - NucleotideContigFragment.newBuilder - .setContigName(v.getReferenceName) + def sliceFn(v: Variant): Slice = { + Slice.newBuilder + .setName(v.getReferenceName) .build } - def ncfFn(vc: VariantContext): NucleotideContigFragment = { - ncfFn(vc.variant.variant) + def sliceFn(vc: VariantContext): Slice = { + sliceFn(vc.variant.variant) } def readFn(v: Variant): AlignmentRecord = { @@ -402,35 +402,35 @@ class VariantDatasetSuite extends ADAMFunSuite { assert(rdd3.dataset.count === 6) } - sparkTest("transform variants to contig genomic dataset") { + sparkTest("transform variants to slice genomic dataset") { val variants = sc.loadVariants(testFile("small.vcf")) - def checkSave(contigs: NucleotideContigFragmentDataset) { + def checkSave(slices: SliceDataset) { val tempPath = tmpLocation(".adam") - contigs.saveAsParquet(tempPath) + slices.saveAsParquet(tempPath) - assert(sc.loadContigFragments(tempPath).rdd.count === 6) + assert(sc.loadSlices(tempPath).rdd.count === 6) } - val contigs: NucleotideContigFragmentDataset = variants.transmute[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slices: SliceDataset = variants.transmute[Slice, SliceProduct, SliceDataset]( (rdd: RDD[Variant]) => { - rdd.map(VariantDatasetSuite.ncfFn) + rdd.map(VariantDatasetSuite.sliceFn) }) - checkSave(contigs) + checkSave(slices) val sqlContext = 
SQLContext.getOrCreate(sc) import sqlContext.implicits._ - val contigsDs: NucleotideContigFragmentDataset = variants.transmuteDataset[NucleotideContigFragment, NucleotideContigFragmentProduct, NucleotideContigFragmentDataset]( + val slicesDs: SliceDataset = variants.transmuteDataset[Slice, SliceProduct, SliceDataset]( (ds: Dataset[VariantProduct]) => { ds.map(r => { - NucleotideContigFragmentProduct.fromAvro( - VariantDatasetSuite.ncfFn(r.toAvro)) + SliceProduct.fromAvro( + VariantDatasetSuite.sliceFn(r.toAvro)) }) }) - checkSave(contigsDs) + checkSave(slicesDs) } sparkTest("transform variants to coverage genomic dataset") { diff --git a/adam-python/bdgenomics/adam/adamContext.py b/adam-python/bdgenomics/adam/adamContext.py index fb8e3f142b..5f9ddefbc1 100644 --- a/adam-python/bdgenomics/adam/adamContext.py +++ b/adam-python/bdgenomics/adam/adamContext.py @@ -31,8 +31,10 @@ FeatureDataset, \ FragmentDataset, \ GenotypeDataset, \ - NucleotideContigFragmentDataset, \ + SequenceDataset, \ + SliceDataset, \ VariantDataset + from bdgenomics.adam.stringency import STRICT, _toJava @@ -147,26 +149,6 @@ def loadCoverage(self, filePath, return CoverageDataset(adamRdd, self._sc) - def loadContigFragments(self, filePath): - """ - Load nucleotide contig fragments into a NucleotideContigFragmentDataset. - - If the path name has a .fa/.fasta extension, load as FASTA format. - Else, fall back to Parquet + Avro. - - For FASTA format, compressed files are supported through compression codecs configured - in Hadoop, which by default include .gz and .bz2, but can include more. - - :param str filePath: The path to load the file from. - :return: Returns a genomic dataset containing sequence fragments. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset - """ - - adamRdd = self.__jac.loadContigFragments(filePath) - - return NucleotideContigFragmentDataset(adamRdd, self._sc) - - def loadFragments(self, filePath, stringency=STRICT): """ Load fragments into a FragmentDataset. @@ -255,3 +237,73 @@ def loadVariants(self, filePath, stringency=STRICT): _toJava(stringency, self._jvm)) return VariantDataset(adamRdd, self._sc) + + + def loadDnaSequences(self, filePath): + """ + Load DNA sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing DNA sequences. + :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadDnaSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadProteinSequences(self, filePath): + """ + Load protein sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing protein sequences. + :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadProteinSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadRnaSequences(self, filePath): + """ + Load RNA sequences into a SequenceDataset. + + If the path name has a .fa/.fasta extension, load as FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :return: Returns a genomic dataset containing RNA sequences. 
+ :rtype: bdgenomics.adam.rdd.SequenceDataset + """ + + adamRdd = self.__jac.loadRnaSequences(filePath) + + return SequenceDataset(adamRdd, self._sc) + + + def loadSlices(self, filePath, maximumLength): + """ + Load slices into a SliceDataset. + + If the path name has a .fa/.fasta extension, load as DNA in FASTA format. + Else, fall back to Parquet + Avro. + + :param str filePath: The path to load the file from. + :param long maximumLength: Maximum slice length. + :return: Returns a genomic dataset containing sequence slices. + :rtype: bdgenomics.adam.rdd.SliceDataset + """ + + adamRdd = self.__jac.loadSlices(filePath, maximumLength) + + return SliceDataset(adamRdd, self._sc) + diff --git a/adam-python/bdgenomics/adam/rdd.py b/adam-python/bdgenomics/adam/rdd.py index ae6709bede..20dd8dc728 100644 --- a/adam-python/bdgenomics/adam/rdd.py +++ b/adam-python/bdgenomics/adam/rdd.py @@ -30,7 +30,8 @@ FeatureDataset FragmentDataset GenotypeDataset - NucleotideContigFragmentDataset + SequenceDataset + SliceDataset VariantDataset VariantContextDataset """ @@ -228,9 +229,7 @@ def _inferConversionFn(self, destClass): def _destClassSuffix(self, destClass): - if destClass is NucleotideContigFragmentDataset: - return "ContigsDatasetConverter" - elif destClass is CoverageDataset: + if destClass is CoverageDataset: return "CoverageDatasetConverter" elif destClass is FeatureDataset: return "FeaturesDatasetConverter" @@ -242,6 +241,12 @@ def _destClassSuffix(self, destClass): return "GenotypeDatasetConverter" elif destClass is VariantDataset: return "VariantDatasetConverter" + elif destClass is ReadRDD: + return "ReadDatasetConverter" + elif destClass is SequenceDataset: + return "SequenceDatasetConverter" + elif destClass is SliceDataset: + return "SliceDatasetConverter" else: raise ValueError("No conversion method known for %s." % destClass) @@ -1484,23 +1489,21 @@ def _inferConversionFn(self, destClass): return "org.bdgenomics.adam.api.java.GenotypesTo%s" % self._destClassSuffix(destClass) -class NucleotideContigFragmentDataset(GenomicDataset): - """ - Wraps an GenomicDataset with Nucleotide Contig Fragment metadata and functions. - """ +class SliceDataset(GenomicDataset): + def _replaceRdd(self, newRdd): - return NucleotideContigFragmentDataset(newRdd, self.sc) + return SliceDataset(newRdd, self.sc) def __init__(self, jvmRdd, sc): """ - Constructs a Python NucleotideContigFragmentDataset from a JVM - NucleotideContigFragmentDataset. Should not be called from user code; + Constructs a Python SliceDataset from a JVM + SliceDataset. Should not be called from user code; instead, go through bdgenomics.adamContext.ADAMContext. - :param jvmRdd: Py4j handle to the underlying JVM NucleotideContigFragmentDataset. + :param jvmRdd: Py4j handle to the underlying JVM SliceDataset. :param pyspark.context.SparkContext sc: Active Spark Context. """ @@ -1509,9 +1512,9 @@ def __init__(self, jvmRdd, sc): def save(self, fileName): """ - Save nucleotide contig fragments as Parquet or FASTA. + Save slices as Parquet or FASTA. - If filename ends in .fa or .fasta, saves as Fasta. If not, saves + If filename ends in .fa or .fasta, saves as FASTA. If not, saves fragments to Parquet. Defaults to 60 character line length, if saving to FASTA. @@ -1528,17 +1531,18 @@ def flankAdjacentFragments(self, flankLength): length. :param int flankLength: The length to extend adjacent records by. 
- :return: Returns the genomic dataset, with all adjacent fragments extended with + :return: Returns the genomic dataset, with all adjacent slices extended with flanking sequence. - :rtype: bdgenomics.adam.rdd.NucleotideContigFragmentDataset + :rtype: bdgenomics.adam.rdd.SliceDataset """ - return NucleotideContigFragmentDataset(self._jvmRdd.flankAdjacentFragments(flankLength), self.sc) + return SliceDataset(self._jvmRdd.flankAdjacentFragments(flankLength), + self.sc) def countKmers(self, kmerLength): """ - Counts the k-mers contained in a FASTA contig. + Counts the k-mers contained in a slice. :param int kmerLength: The value of _k_ to use for cutting _k_-mers. :return: Returns an RDD containing k-mer/count pairs. @@ -1550,7 +1554,7 @@ def _inferConversionFn(self, destClass): - return "org.bdgenomics.adam.api.java.ContigsTo%s" % self._destClassSuffix(destClass) + return "org.bdgenomics.adam.api.java.SlicesTo%s" % self._destClassSuffix(destClass) class VariantDataset(VCFSupportingGenomicDataset): @@ -1648,3 +1652,85 @@ def saveAsVcf(self, deferMerging, disableFastConcat, _toJava(stringency, self.sc._jvm)) + + +class ReadRDD(GenomicDataset): + + + def _replaceRdd(self, newRdd): + + return ReadRDD(newRdd, self.sc) + + + def __init__(self, jvmRdd, sc): + """ + Constructs a Python ReadRDD from a JVM + ReadRDD. Should not be called from user code; + instead, go through bdgenomics.adamContext.ADAMContext. + + :param jvmRdd: Py4j handle to the underlying JVM ReadRDD. + :param pyspark.context.SparkContext sc: Active Spark Context. + """ + + GenomicDataset.__init__(self, jvmRdd, sc) + + + def save(self, fileName): + """ + Save reads as Parquet or FASTQ. + + If filename ends in .fq or .fastq, saves as FASTQ. If not, saves + reads to Parquet. + + :param str fileName: Path to save to. + """ + + self._jvmRdd.save(fileName) + + + def _inferConversionFn(self, destClass): + + return "org.bdgenomics.adam.api.java.ReadsTo%s" % self._destClassSuffix(destClass) + + +class SequenceDataset(GenomicDataset): + + + def _replaceRdd(self, newRdd): + + return SequenceDataset(newRdd, self.sc) + + + def __init__(self, jvmRdd, sc): + """ + Constructs a Python SequenceDataset from a JVM + SequenceDataset. Should not be called from user code; + instead, go through bdgenomics.adamContext.ADAMContext. + + :param jvmRdd: Py4j handle to the underlying JVM SequenceDataset. + :param pyspark.context.SparkContext sc: Active Spark Context. + """ + + GenomicDataset.__init__(self, jvmRdd, sc) + +# slice(maximumLength) +# slice(region) +# slice(regions) + + def save(self, fileName): + """ + Save sequences as Parquet or FASTA. + + If filename ends in .fa or .fasta, saves as FASTA. If not, saves + sequences to Parquet. Defaults to 60 character line length, if saving to + FASTA. + + :param str fileName: Path to save to. 
+ """ + + self._jvmRdd.save(fileName) + + + def _inferConversionFn(self, destClass): + + return "org.bdgenomics.adam.api.java.SequencesTo%s" % self._destClassSuffix(destClass) diff --git a/adam-python/bdgenomics/adam/test/adamContext_test.py b/adam-python/bdgenomics/adam/test/adamContext_test.py index d8612cc658..cef2bf46c1 100644 --- a/adam-python/bdgenomics/adam/test/adamContext_test.py +++ b/adam-python/bdgenomics/adam/test/adamContext_test.py @@ -128,13 +128,25 @@ def test_load_variants(self): self.assertEqual(reads._jvmRdd.jrdd().count(), 6) - def test_load_contig_fragments(self): + def test_load_slices(self): testFile = self.resourceFile("HLA_DQB1_05_01_01_02.fa") ac = ADAMContext(self.ss) - reads = ac.loadContigFragments(testFile) + slices = ac.loadSlices(testFile, 10000) - self.assertEqual(reads.toDF().count(), 1) - self.assertEqual(reads._jvmRdd.jrdd().count(), 1) + self.assertEqual(slices.toDF().count(), 1) + self.assertEqual(slices._jvmRdd.jrdd().count(), 1) + + + def test_load_dna_sequences(self): + + + testFile = self.resourceFile("HLA_DQB1_05_01_01_02.fa") + ac = ADAMContext(self.ss) + + sequences = ac.loadDnaSequences(testFile) + + self.assertEqual(sequences.toDF().count(), 1) + self.assertEqual(sequences._jvmRdd.jrdd().count(), 1) diff --git a/adam-r/bdgenomics.adam/R/adam-context.R b/adam-r/bdgenomics.adam/R/adam-context.R index b9640be486..f47220a63d 100644 --- a/adam-r/bdgenomics.adam/R/adam-context.R +++ b/adam-r/bdgenomics.adam/R/adam-context.R @@ -98,7 +98,7 @@ setMethod("loadAlignments", AlignmentRecordDataset(jrdd) }) -#' Load nucleotide contig fragments into a NucleotideContigFragmentDataset. +#' Load DNA sequences into a SequenceDataset. #' #' If the path name has a .fa/.fasta extension, load as FASTA format. #' Else, fall back to Parquet + Avro. @@ -108,16 +108,83 @@ setMethod("loadAlignments", #' #' @param ac The ADAMContext. #' @param filePath The path to load the file from. -#' @return Returns a genomic dataset containing nucleotide contig fragments. +#' @return Returns a genomic dataset containing DNA sequences. #' #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("loadContigFragments", +setMethod("loadDnaSequences", signature(ac = "ADAMContext", filePath = "character"), function(ac, filePath) { - jrdd <- sparkR.callJMethod(ac@jac, "loadContigFragments", filePath) - NucleotideContigFragmentDataset(jrdd) + jrdd <- sparkR.callJMethod(ac@jac, "loadDnaSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load protein sequences into a SequenceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as FASTA format. +#' Else, fall back to Parquet + Avro. +#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @return Returns a genomic dataset containing protein sequences. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadProteinSequences", + signature(ac = "ADAMContext", filePath = "character"), + function(ac, filePath) { + jrdd <- sparkR.callJMethod(ac@jac, "loadProteinSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load RNA sequences into a SequenceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as FASTA format. +#' Else, fall back to Parquet + Avro. 
+#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @return Returns a genomic dataset containing RNA sequences. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadRnaSequences", + signature(ac = "ADAMContext", filePath = "character"), + function(ac, filePath) { + jrdd <- sparkR.callJMethod(ac@jac, "loadRnaSequences", filePath) + SequenceDataset(jrdd) + }) + +#' Load slices into a SliceDataset. +#' +#' If the path name has a .fa/.fasta extension, load as DNA in FASTA format. +#' Else, fall back to Parquet + Avro. +#' +#' For FASTA format, compressed files are supported through compression codecs configured +#' in Hadoop, which by default include .gz and .bz2, but can include more. +#' +#' @param ac The ADAMContext. +#' @param filePath The path to load the file from. +#' @param maximumLength Maximum slice length. +#' @return Returns a genomic dataset containing slices. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("loadSlices", + signature(ac = "ADAMContext", filePath = "character", maximumLength = "numeric"), + function(ac, filePath, maximumLength) { + jrdd <- sparkR.callJMethod(ac@jac, "loadSlices", filePath, maximumLength) + SliceDataset(jrdd) }) #' Load fragments into a FragmentDataset. diff --git a/adam-r/bdgenomics.adam/R/generics.R b/adam-r/bdgenomics.adam/R/generics.R index 2b61383820..d78cf06bff 100644 --- a/adam-r/bdgenomics.adam/R/generics.R +++ b/adam-r/bdgenomics.adam/R/generics.R @@ -33,8 +33,23 @@ setGeneric("loadAlignments", #' @rdname ADAMContext #' @export -setGeneric("loadContigFragments", - function(ac, filePath) { standardGeneric("loadContigFragments") }) +setGeneric("loadDnaSequences", + function(ac, filePath) { standardGeneric("loadDnaSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadProteinSequences", + function(ac, filePath) { standardGeneric("loadProteinSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadRnaSequences", + function(ac, filePath) { standardGeneric("loadRnaSequences") }) + +#' @rdname ADAMContext +#' @export +setGeneric("loadSlices", + function(ac, filePath, maximumLength) { standardGeneric("loadSlices") }) #' @rdname ADAMContext #' @export @@ -380,15 +395,15 @@ setGeneric("toVariantContexts", setGeneric("toVariants", function(ardd, ...) { standardGeneric("toVariants") }) -#### NucleotideContigFragment operations #### +#### Slice operations #### -#' The NucleotideContigFragmentDataset class is used to manipulate contigs. +#' The SliceDataset class is used to manipulate slices. #' -#' @name NucleotideContigFragmentDataset +#' @name SliceDataset NULL -#' @rdname NucleotideContigFragmentDataset +#' @rdname SliceDataset #' @param ardd The genomic dataset to apply this to. #' @param flankLength The length to extend adjacent records by. #' @export setGeneric("flankAdjacentFragments", diff --git a/adam-r/bdgenomics.adam/R/rdd.R b/adam-r/bdgenomics.adam/R/rdd.R index be2e7ec98f..3fe7044454 100644 --- a/adam-r/bdgenomics.adam/R/rdd.R +++ b/adam-r/bdgenomics.adam/R/rdd.R @@ -107,19 +107,34 @@ GenotypeDataset <- function(jrdd) { new("GenotypeDataset", jrdd = jrdd) } -#' A class that wraps an RDD of contigs with helpful metadata. +#' A class that wraps an RDD of sequences with helpful metadata. 
#' -#' @rdname NucleotideContigFragmentDataset -#' @slot jrdd The Java RDD of contigs that this class wraps. +#' @rdname SequenceDataset +#' @slot jrdd The Java RDD of sequences that this class wraps. #' #' @export -setClass("NucleotideContigFragmentDataset", +setClass("SequenceDataset", slots = list(jrdd = "jobj"), contains = "GenomicDataset") #' @importFrom methods new -NucleotideContigFragmentDataset <- function(jrdd) { - new("NucleotideContigFragmentDataset", jrdd = jrdd) +SequenceDataset <- function(jrdd) { + new("SequenceDataset", jrdd = jrdd) +} + +#' A class that wraps an RDD of slices with helpful metadata. +#' +#' @rdname SliceDataset +#' @slot jrdd The Java RDD of slices that this class wraps. +#' +#' @export +setClass("SliceDataset", + slots = list(jrdd = "jobj"), + contains = "GenomicDataset") + +#' @importFrom methods new +SliceDataset <- function(jrdd) { + new("SliceDataset", jrdd = jrdd) } #' A class that wraps an RDD of variants with helpful metadata. @@ -373,9 +388,7 @@ setMethod("inferConversionFn", setMethod("destClassSuffix", signature(destClass = "character"), function(destClass) { - if (destClass == "NucleotideContigFragmentDataset") { - "ContigsDatasetConverter" - } else if (destClass == "CoverageDataset") { + if (destClass == "CoverageDataset") { "CoverageDatasetConverter" } else if (destClass == "FeatureDataset") { "FeaturesDatasetConverter" @@ -387,6 +400,12 @@ setMethod("destClassSuffix", "GenotypeDatasetConverter" } else if (destClass == "VariantDataset") { "VariantDatasetConverter" + } else if (destClass == "ReadDataset") { + "ReadDatasetConverter" + } else if (destClass == "SequenceDataset") { + "SequenceDatasetConverter" + } else if (destClass == "SliceDataset") { + "SliceDatasetConverter" } else { stop(paste("No conversion method known for", destClass)) @@ -1272,23 +1291,39 @@ setMethod("toVariantContexts", signature(ardd = "GenotypeDataset"), }) setMethod("inferConversionFn", - signature(ardd = "NucleotideContigFragmentDataset", + signature(ardd = "SliceDataset", destClass = "character"), function(ardd, destClass) { - paste0("org.bdgenomics.adam.api.java.ContigsTo", + paste0("org.bdgenomics.adam.api.java.SlicesTo", destClassSuffix(destClass)) }) setMethod("replaceRdd", - signature(ardd = "NucleotideContigFragmentDataset", + signature(ardd = "SliceDataset", rdd = "jobj"), function(ardd, rdd) { - NucleotideContigFragmentDataset(rdd) + SliceDataset(rdd) + }) + +#' Save sequences as Parquet or FASTA. +#' +#' If filename ends in .fa or .fasta, saves as FASTA. If not, saves sequences to +#' Parquet. Defaults to 60 character line length, if saving as FASTA. +#' +#' @param ardd The genomic dataset to apply this to. +#' @param filePath Path to save to. +#' +#' @importFrom SparkR sparkR.callJMethod +#' +#' @export +setMethod("save", signature(ardd = "SequenceDataset", filePath = "character"), + function(ardd, filePath) { + invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) -#' Save nucleotide contig fragments as Parquet or FASTA. +#' Save slices as Parquet or FASTA. #' -#' If filename ends in .fa or .fasta, saves as Fasta. If not, saves fragments to +#' If filename ends in .fa or .fasta, saves as FASTA. If not, saves slices to #' Parquet. Defaults to 60 character line length, if saving as FASTA. #' #' @param ardd The genomic dataset to apply this to. 
@@ -1297,7 +1332,7 @@ setMethod("replaceRdd", #' @importFrom SparkR sparkR.callJMethod #' #' @export -setMethod("save", signature(ardd = "NucleotideContigFragmentDataset", filePath = "character"), +setMethod("save", signature(ardd = "SliceDataset", filePath = "character"), function(ardd, filePath) { invisible(sparkR.callJMethod(ardd@jrdd, "save", filePath)) }) @@ -1314,11 +1349,11 @@ setMethod("save", signature(ardd = "NucleotideContigFragmentDataset", filePath = #' #' @export setMethod("flankAdjacentFragments", - signature(ardd = "NucleotideContigFragmentDataset", flankLength = "numeric"), + signature(ardd = "SliceDataset", flankLength = "numeric"), function(ardd, flankLength) { - NucleotideContigFragmentDataset(sparkR.callJMethod(ardd@jrdd, - "flankAdjacentFragments", - flankLength)) + SliceDataset(sparkR.callJMethod(ardd@jrdd, + "flankAdjacentFragments", + flankLength)) }) setMethod("inferConversionFn", diff --git a/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R b/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R index 8582117c02..0e75fb92f4 100644 --- a/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R +++ b/adam-r/bdgenomics.adam/tests/testthat/test_adamContext.R @@ -71,8 +71,14 @@ test_that("load variants from vcf", { expect_equal(count(variantDf), 6) }) -test_that("load fasta", { - ncfs <- loadContigFragments(ac, resourceFile("HLA_DQB1_05_01_01_02.fa")) - ncfDf <- toDF(ncfs) - expect_equal(count(ncfDf), 1) +test_that("load fasta sequences", { + sequences <- loadDnaSequences(ac, resourceFile("HLA_DQB1_05_01_01_02.fa")) + sequencesDf <- toDF(sequences) + expect_equal(count(sequencesDf), 1) +}) + +test_that("load fasta slices", { + slices <- loadSlices(ac, resourceFile("HLA_DQB1_05_01_01_02.fa"), 10000) + slicesDf <- toDF(slices) + expect_equal(count(slicesDf), 1) }) diff --git a/docs/api/adamContext.rst b/docs/api/adamContext.rst index a13fbde68a..84ac22915d 100644 --- a/docs/api/adamContext.rst +++ b/docs/api/adamContext.rst @@ -100,12 +100,17 @@ With an ``ADAMContext``, you can load: - From partitioned Parquet using ``loadPartitionedParquetFeatures`` (Scala only) - Autodetected from any of the above using ``loadFeatures`` (Scala, Java, Python, and R) -- Fragmented contig sequence as a ``NucleotideContigFragmentDataset``: - - From FASTA with ``loadFasta`` (Scala only) - - From Parquet with ``loadParquetContigFragments`` (Scala only) - - From partitioned Parquet with ``loadPartitionedParquetContigFragments`` (Scala only) - - Autodetected from either of the above using ``loadSequences`` (Scala, Java, Python, and R) +- Sequences as a ``SequenceDataset``: + + - From FASTA with ``loadFastaDna``, ``loadFastaProtein``, ``loadFastaRna`` (Scala only) + - From Parquet with ``loadParquetSequences`` (Scala only) + - Autodetected from either of the above using ``loadDnaSequences``, ``loadProteinSequences``, ``loadRnaSequences`` (Scala, Java, Python, and R) + +- Sequence slices as a ``SliceDataset``: + + - From FASTA with ``loadFastaDna`` (Scala only) + - From Parquet with ``loadParquetSlices`` (Scala only) + - Autodetected from either of the above using ``loadSlices`` (Scala, Java, Python, and R) - Coverage data as a ``CoverageDataset``: diff --git a/docs/architecture/schemas.rst b/docs/architecture/schemas.rst index beae36f339..c95e5437d2 100644 --- a/docs/architecture/schemas.rst +++ b/docs/architecture/schemas.rst @@ -19,8 +19,8 @@ schemas: from a single sequenced fragment. 
- The *Genotype* schema represents a genotype call, along with annotations about the quality/read support of the called genotype. -- The *NucleotideContigFragment* schema represents a section of a - contig's sequence. +- The *Sequence* and *Slice* schemas represent sequences and slices of + sequences, respectively. - The *Variant* schema represents a sequence variant, along with statistics about that variant's support across a group of samples, and annotations about the effect of the variant. diff --git a/docs/cli/actions.rst b/docs/cli/actions.rst index 5cb867e5d9..10e70e6e88 100644 --- a/docs/cli/actions.rst +++ b/docs/cli/actions.rst @@ -162,7 +162,7 @@ fall into several general categories: - ``mismatchingPositions`` tagging options: We can recompute the ``mismatchingPositions`` field of an AlignmentRecord (SAM "MD" tag) with the ``-add_md_tags`` flag. This flag takes a path to a reference - file in either FASTA or Parquet ``NucleotideContigFragment`` format. + file in either FASTA or Parquet ``Sequence`` format. Additionally, this engine takes the following options: - ``-md_tag_fragment_size``: If loading from FASTA, sets the size of diff --git a/docs/cli/conversions.rst b/docs/cli/conversions.rst index 70ab009921..37003cece4 100644 --- a/docs/cli/conversions.rst +++ b/docs/cli/conversions.rst @@ -4,53 +4,6 @@ Conversion tools These tools convert data between a legacy genomic file format and using ADAM's schemas to store data in Parquet. -fasta2adam and adam2fasta -~~~~~~~~~~~~~~~~~~~~~~~~~ - -These commands convert between FASTA and Parquet files storing -assemblies using the NucleotideContigFragment schema. - -``fasta2adam`` takes two required arguments: - -1. ``FASTA``: The input FASTA file to convert. -2. ``ADAM``: The path to save the Parquet formatted - NucleotideContigFragments to. - -``fasta2adam`` supports the full set of `default -options <#default-args>`__, as well as the following options: - -- ``-fragment_length``: The fragment length to shard a given contig - into. Defaults to 10,000bp. -- ``-reads``: Path to a set of reads that includes sequence info. This - read path is used to obtain the sequence indices for ordering the - contigs from the FASTA file. -- ``-repartition``: The number of partitions to save the data to. If - provided, forces a shuffle. -- ``-verbose``: If given, enables additional logging where the sequence - dictionary is printed. - -``adam2fasta`` takes two required arguments: - -1. ``ADAM``: The path to a Parquet file containing - NucleotideContigFragments. -2. ``FASTA``: The path to save the FASTA file to. - -``adam2fasta`` only supports the ``-print_metrics`` option from the -`default options <#default-args>`__. Additionally, ``adam2fasta`` takes -the following options: - -- ``-line_width``: The line width in characters to use for breaking - FASTA lines. Defaults to 60 characters. -- ``-coalesce``: Sets the number of partitions to coalesce the output - to. If ``-force_shuffle_coalesce`` is not provided, the Spark engine - may ignore the coalesce directive. -- ``-force_shuffle_coalesce``: Forces a shuffle that leads to the - output being saved with the number of partitions requested by - ``-coalesce``. This is necessary if the ``-coalesce`` would increase - the number of partitions, or if it would reduce the number of - partitions to fewer than the number of Spark executors. This may have - a substantial performance cost, and will invalidate any sort order. 
- adam2fastq ~~~~~~~~~~ @@ -125,4 +78,3 @@ Additionally, ``transformFragments`` takes the following options: - ``-sort_lexicographically``: Sorts reads by alignment position. Unmapped reads are placed at the end of all reads. Contigs are ordered lexicographically. -
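For reference, a minimal usage sketch of the Python loaders introduced by this patch, strung together end to end. It is not part of the change itself: the SparkSession setup and the sample.fa input path are assumptions, while ADAMContext, loadDnaSequences, loadSlices, toDF, and save come from the adam-python files modified above.

# Hypothetical sketch of the new sequence/slice loaders; the Spark session
# and the "sample.fa" path are assumptions, not part of this patch.
from pyspark.sql import SparkSession
from bdgenomics.adam.adamContext import ADAMContext

spark = SparkSession.builder.appName("adam-sequence-example").getOrCreate()
ac = ADAMContext(spark)

# Whole DNA sequences from FASTA (Parquet + Avro inputs are autodetected too).
sequences = ac.loadDnaSequences("sample.fa")

# The same input cut into slices of at most 10 kbp.
slices = ac.loadSlices("sample.fa", 10000)

print(sequences.toDF().count())
print(slices.toDF().count())

# Saves as FASTA for .fa/.fasta extensions, otherwise as Parquet.
slices.save("sample.slices.adam")

spark.stop()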