Adding ReadRDD, SequenceRDD, and SliceRDD.
heuermh committed Jun 12, 2019
1 parent a512113 commit 38973ae
Showing 61 changed files with 6,350 additions and 3,878 deletions.


@@ -21,10 +21,10 @@ import htsjdk.samtools.ValidationStringency
import org.apache.spark.api.java.JavaSparkContext
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset
import org.bdgenomics.adam.rdd.feature.{ CoverageDataset, FeatureDataset }
import org.bdgenomics.adam.rdd.fragment.FragmentDataset
import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.sequence.{ SequenceDataset, SliceDataset }
import org.bdgenomics.adam.rdd.variant.{
GenotypeDataset,
VariantDataset
@@ -131,26 +131,6 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
ac.loadIndexedBam(pathName, viewRegions.toIterable, stringency = stringency)
}

-/**
- * (Java-specific) Load nucleotide contig fragments into a NucleotideContigFragmentDataset.
- *
- * If the path name has a .fa/.fasta extension, load as FASTA format.
- * Else, fall back to Parquet + Avro.
- *
- * For FASTA format, compressed files are supported through compression codecs configured
- * in Hadoop, which by default include .gz and .bz2, but can include more.
- *
- * @see ADAMContext#loadContigFragments
- *
- * @param pathName The path name to load nucleotide contig fragments from.
- *   Globs/directories are supported, although file extension must be present
- *   for FASTA format.
- * @return Returns a NucleotideContigFragmentDataset.
- */
-def loadContigFragments(pathName: java.lang.String): NucleotideContigFragmentDataset = {
-  ac.loadContigFragments(pathName)
-}

/**
* (Java-specific) Load fragments into a FragmentDataset.
*
@@ -390,10 +370,10 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
/**
* (Java-specific) Load reference sequences into a broadcastable ReferenceFile.
*
- * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadContigFragments
+ * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices
* to load the reference as an RDD, which is then collected to the driver.
*
- * @see loadContigFragments
+ * @see ADAMContext#loadSlices
*
* @param pathName The path name to load reference sequences from.
*   Globs/directories for 2bit format are not supported.
@@ -409,11 +389,11 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
/**
* (Java-specific) Load reference sequences into a broadcastable ReferenceFile.
*
- * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadContigFragments
+ * If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices
* to load the reference as an RDD, which is then collected to the driver. Uses a
* maximum fragment length of 10kbp.
*
- * @see loadContigFragments
+ * @see ADAMContext#loadSlices
*
* @param pathName The path name to load reference sequences from.
*   Globs/directories for 2bit format are not supported.
@@ -422,4 +402,113 @@ class JavaADAMContext(val ac: ADAMContext) extends Serializable {
def loadReferenceFile(pathName: java.lang.String): ReferenceFile = {
loadReferenceFile(pathName, 10000L)
}
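
A minimal usage sketch for the two loadReferenceFile overloads above (editorial, not from the commit itself), assuming a live SparkContext named sc; "chrM.2bit" is a hypothetical path, and the extracted region mirrors the existing test near the end of this diff:

import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext

// Build the Java-facing context and pull ten bases from a 2bit reference.
val jac = new JavaADAMContext(new ADAMContext(sc))
val ref = jac.loadReferenceFile("chrM.2bit") // default 10 kbp maximum length
val bases = ref.extract(ReferenceRegion("hg19_chrM", 16561, 16571))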

/**
* (Java-specific) Load DNA sequences into a SequenceDataset.
*
* If the path name has a .fa/.fasta extension, load as FASTA format.
* Else, fall back to Parquet + Avro.
*
* For FASTA format, compressed files are supported through compression codecs configured
* in Hadoop, which by default include .gz and .bz2, but can include more.
*
* @see ADAMContext#loadFastaDna
* @see ADAMContext#loadParquetSequences
*
* @param pathName The path name to load sequences from.
*   Globs/directories are supported, although file extension must be present
*   for FASTA format.
* @return Returns a SequenceDataset containing DNA sequences.
*/
def loadDnaSequences(pathName: java.lang.String): SequenceDataset = {
ac.loadDnaSequences(pathName)
}

/**
* (Java-specific) Load protein sequences into a SequenceDataset.
*
* If the path name has a .fa/.fasta extension, load as FASTA format.
* Else, fall back to Parquet + Avro.
*
* For FASTA format, compressed files are supported through compression codecs configured
* in Hadoop, which by default include .gz and .bz2, but can include more.
*
* @see ADAMContext#loadFastaProtein
* @see ADAMContext#loadParquetSequences
*
* @param pathName The path name to load sequences from.
*   Globs/directories are supported, although file extension must be present
*   for FASTA format.
* @return Returns a SequenceDataset containing protein sequences.
*/
def loadProteinSequences(pathName: java.lang.String): SequenceDataset = {
ac.loadProteinSequences(pathName)
}

/**
* (Java-specific) Load RNA sequences into a SequenceDataset.
*
* If the path name has a .fa/.fasta extension, load as FASTA format.
* Else, fall back to Parquet + Avro.
*
* For FASTA format, compressed files are supported through compression codecs configured
* in Hadoop, which by default include .gz and .bz2, but can include more.
*
* @see ADAMContext#loadFastaRna
* @see ADAMContext#loadParquetSequences
*
* @param pathName The path name to load sequences from.
*   Globs/directories are supported, although file extension must be present
*   for FASTA format.
* @return Returns a SequenceDataset containing RNA sequences.
*/
def loadRnaSequences(pathName: java.lang.String): SequenceDataset = {
ac.loadRnaSequences(pathName)
}
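
A short sketch of the three alphabet-specific loaders above, with jac as in the earlier sketch; trinity.fa is the resource used by the test suite at the end of this diff, while the other two paths are hypothetical. Judging by the doc comments, the three methods presumably share the same FASTA parsing path and differ only in the alphabet of the resulting sequences:

// Same call shape for each alphabet (DNA, protein, RNA).
val dna = jac.loadDnaSequences("trinity.fa")
val protein = jac.loadProteinSequences("proteins.fa") // hypothetical path
val rna = jac.loadRnaSequences("transcripts.fa") // hypothetical path
println(dna.jrdd.count()) // 5 for trinity.fa, per the suite below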

/**
* (Java/Python-specific) Load slices into a SliceDataset.
*
* If the path name has a .fa/.fasta extension, load as DNA in FASTA format.
* Else, fall back to Parquet + Avro.
*
* For FASTA format, compressed files are supported through compression codecs configured
* in Hadoop, which by default include .gz and .bz2, but can include more.
*
* @param pathName The path name to load DNA slices from.
*   Globs/directories are supported, although file extension must be present
*   for FASTA format.
* @param maximumLength Maximum slice length, reduced to Integer data type to support
*   dispatch from Python.
* @return Returns a SliceDataset.
*/
def loadSlices(
pathName: java.lang.String,
maximumLength: java.lang.Integer): SliceDataset = {

ac.loadSlices(pathName, maximumLength.toLong)
}

/**
* (R-specific) Load slices into a SliceDataset.
*
* If the path name has a .fa/.fasta extension, load as DNA in FASTA format.
* Else, fall back to Parquet + Avro.
*
* For FASTA format, compressed files are supported through compression codecs configured
* in Hadoop, which by default include .gz and .bz2, but can include more.
*
* @param pathName The path name to load DNA slices from.
*   Globs/directories are supported, although file extension must be present
*   for FASTA format.
* @param maximumLength Maximum slice length, in Double data type to support
*   dispatch from SparkR.
* @return Returns a SliceDataset.
*/
def loadSlices(
pathName: java.lang.String,
maximumLength: java.lang.Double): SliceDataset = {

ac.loadSlices(pathName, maximumLength.toLong)
}
}
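
The two loadSlices overloads exist to give Python and SparkR an unambiguous dispatch target; both delegate to the same Long-based ADAMContext.loadSlices. A sketch of calling them from Scala with explicit boxing (editorial, not from the commit; jac as above):

// Boxing the length selects the overload explicitly.
val viaInteger = jac.loadSlices("trinity.fa", Integer.valueOf(10000)) // Java/Python overload
val viaDouble = jac.loadSlices("trinity.fa", java.lang.Double.valueOf(10000.0)) // SparkR overload

In the JavaADAMSliceConduit test below, the bare int literal 10000 autoboxes to java.lang.Integer and selects the first overload.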
@@ -0,0 +1,44 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.api.java;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.bdgenomics.adam.rdd.ADAMContext;
import org.bdgenomics.adam.rdd.sequence.SequenceDataset;

/**
* A simple test class for the JavaADAMRDD/Context. Writes an RDD of sequences
* to disk and reads it back.
*/
final class JavaADAMSequenceConduit {
public static SequenceDataset conduit(final SequenceDataset sequenceDataset,
final ADAMContext ac) throws IOException {

// make temp directory and save file
Path tempDir = Files.createTempDirectory("javaAC");
String fileName = tempDir.toString() + "/testRdd.sequences.adam";
sequenceDataset.save(fileName, true, true);

// create a new adam context and load the file
JavaADAMContext jac = new JavaADAMContext(ac);
return jac.loadDnaSequences(fileName);
}
}
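
The new suite at the end of this diff drives this conduit roughly as follows (a condensed sketch; copyResource and sc come from the ADAMFunSuite harness, and the implicit SparkContext-to-ADAMContext conversion from ADAMContext._ is assumed to be in scope):

import org.bdgenomics.adam.rdd.ADAMContext._

val sequences = jac.loadDnaSequences(copyResource("trinity.fa"))
val roundTripped = JavaADAMSequenceConduit.conduit(sequences, sc)
assert(roundTripped.jrdd.count() == sequences.jrdd.count())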
@@ -20,24 +20,25 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.bdgenomics.adam.rdd.ADAMContext;
-import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentDataset;
+import org.bdgenomics.adam.rdd.sequence.SliceDataset;

/**
- * A simple test class for the JavaADAMRDD/Context. Writes an RDD of nucleotide
- * contig fragments to disk and reads it back.
+ * A simple test class for the JavaADAMRDD/Context. Writes an RDD of slices
+ * to disk and reads it back.
*/
-final class JavaADAMContigConduit {
-  public static NucleotideContigFragmentDataset conduit(final NucleotideContigFragmentDataset recordRdd,
-                                                        final ADAMContext ac) throws IOException {
+final class JavaADAMSliceConduit {
+  public static SliceDataset conduit(final SliceDataset sliceDataset,
+                                     final ADAMContext ac) throws IOException {

// make temp directory and save file
Path tempDir = Files.createTempDirectory("javaAC");
-    String fileName = tempDir.toString() + "/testRdd.contig.adam";
-    recordRdd.save(fileName, true);
+    String fileName = tempDir.toString() + "/testRdd.slices.adam";
+    sliceDataset.save(fileName, true, true);

// create a new adam context and load the file
JavaADAMContext jac = new JavaADAMContext(ac);
-    return jac.loadContigFragments(fileName);
+    return jac.loadSlices(fileName, 10000);
}
}
@@ -49,16 +49,6 @@ class JavaADAMContextSuite extends ADAMFunSuite {
assert(reads.rdd.count == 2)
}

sparkTest("can read and write a small FASTA file") {
val path = copyResource("chr20.250k.fa.gz")
val aRdd = jac.loadContigFragments(path)
assert(aRdd.jrdd.count() === 26)

val newRdd = JavaADAMContigConduit.conduit(aRdd, sc)

assert(newRdd.jrdd.count() === 26)
}

sparkTest("can read and write a small .SAM file as fragments") {
val path = copyResource("small.sam")
val aRdd = jac.loadFragments(path)
@@ -114,4 +104,24 @@ class JavaADAMContextSuite extends ADAMFunSuite {
val refFile = jac.loadReferenceFile(path)
assert(refFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) === "CATCACGATG")
}

sparkTest("can read and write .fa as sequences") {
val path = copyResource("trinity.fa")
val sequences = jac.loadDnaSequences(path)
assert(sequences.jrdd.count() === 5)

val newRdd = JavaADAMSequenceConduit.conduit(sequences, sc)

assert(newRdd.jrdd.count() === 5)
}

sparkTest("can read and write .fa as slices") {
val path = copyResource("trinity.fa")
val slices = jac.loadSlices(path, 10000L)
assert(slices.jrdd.count() === 5)

val newRdd = JavaADAMSliceConduit.conduit(slices, sc)

assert(newRdd.jrdd.count() === 5)
}
}
82 changes: 0 additions & 82 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fasta.scala

This file was deleted.
