Navigation Menu

Skip to content

Commit

Permalink
Merge 11a89a2 into 6f30ec9
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Jan 30, 2017
2 parents 6f30ec9 + 11a89a2 commit 6b78a58
Show file tree
Hide file tree
Showing 11 changed files with 571 additions and 15 deletions.
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a BEDInFormatter to write features in BED format to a pipe.
 */
object BEDInFormatter extends InFormatterCompanion[Feature, FeatureRDD, BEDInFormatter] {

  /**
   * Builds a BEDInFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new BEDInFormatter.
   */
  def apply(fRdd: FeatureRDD): BEDInFormatter = new BEDInFormatter()
}

case class BEDInFormatter private () extends InFormatter[Feature, FeatureRDD, BEDInFormatter] {
  protected val companion = BEDInFormatter

  /**
   * Writes features to an output stream in BED format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toBed(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}
@@ -0,0 +1,63 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import htsjdk.samtools.ValidationStringency
import htsjdk.tribble.readers.{
AsciiLineReader,
AsciiLineReaderIterator
}
import java.io.InputStream
import org.bdgenomics.adam.rdd.OutFormatter
import org.bdgenomics.formats.avro.Feature
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

/**
 * OutFormatter that reads streaming BED format.
 */
case class BEDOutFormatter() extends OutFormatter[Feature] {
  // parses individual BED lines into features
  val bedParser = new BEDParser

  // validation level handed to the parser for each line
  val stringency = ValidationStringency.STRICT

  /**
   * Reads features from an input stream in BED format.
   *
   * @param is An InputStream connected to the process we are piping from.
   * @return Returns an iterator of Features read from the stream.
   */
  def read(is: InputStream): Iterator[Feature] = {

    // wrap the stream so it can be consumed line by line
    val lineIterator = new AsciiLineReaderIterator(new AsciiLineReader(is))

    // drain every line, keeping only lines the parser converts to a
    // feature (parse yields an Option; empty results are dropped)
    val parsed = ListBuffer.empty[Feature]
    while (lineIterator.hasNext) {
      bedParser.parse(lineIterator.next, stringency).foreach(parsed += _)
    }

    // release the reader before handing back the results
    lineIterator.close()
    parsed.toIterator
  }
}
Expand Up @@ -101,14 +101,6 @@ private object FeatureOrdering extends FeatureOrdering[Feature] {}

object FeatureRDD {

/**
* @param elem The feature to extract a sequence record from.
* @return Gets the SequenceRecord for this feature.
*/
private def getSequenceRecord(elem: Feature): SequenceRecord = {
SequenceRecord(elem.getContigName, 1L)
}

/**
* Builds a FeatureRDD without SequenceDictionary information by running an
* aggregate to rebuild the SequenceDictionary.
Expand All @@ -121,11 +113,13 @@ object FeatureRDD {
// cache the rdd, since we're making multiple passes
rdd.cache()

// aggregate to create the sequence dictionary
val sd = new SequenceDictionary(rdd.map(getSequenceRecord)
.distinct
.collect
.toVector)
// create sequence records based on largest end coordinate
val featuresByContigName = rdd.keyBy(_.getContigName)
val featureEndsByContigName: RDD[(String, Long)] = featuresByContigName.map(kv => (kv._1, kv._2.getEnd))
val maxFeatureEndsByContigName = featureEndsByContigName.reduceByKey(math.max(_, _))
val sequenceRecords = maxFeatureEndsByContigName.map(kv => SequenceRecord(kv._1, kv._2))

val sd = new SequenceDictionary(sequenceRecords.collect.toVector)

FeatureRDD(rdd, sd)
}
Expand Down
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a GFF3InFormatter to write features in GFF3 format to a pipe.
 */
object GFF3InFormatter extends InFormatterCompanion[Feature, FeatureRDD, GFF3InFormatter] {

  /**
   * Builds a GFF3InFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new GFF3InFormatter.
   */
  def apply(fRdd: FeatureRDD): GFF3InFormatter = new GFF3InFormatter()
}

case class GFF3InFormatter private () extends InFormatter[Feature, FeatureRDD, GFF3InFormatter] {
  protected val companion = GFF3InFormatter

  /**
   * Writes features to an output stream in GFF3 format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toGff3(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}
@@ -0,0 +1,63 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import htsjdk.samtools.ValidationStringency
import htsjdk.tribble.readers.{
AsciiLineReader,
AsciiLineReaderIterator
}
import java.io.InputStream
import org.bdgenomics.adam.rdd.OutFormatter
import org.bdgenomics.formats.avro.Feature
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

/**
 * OutFormatter that reads streaming GFF3 format.
 */
case class GFF3OutFormatter() extends OutFormatter[Feature] {
  // parses individual GFF3 lines into features
  val gff3Parser = new GFF3Parser

  // validation level handed to the parser for each line
  val stringency = ValidationStringency.STRICT

  /**
   * Reads features from an input stream in GFF3 format.
   *
   * @param is An InputStream connected to the process we are piping from.
   * @return Returns an iterator of Features read from the stream.
   */
  def read(is: InputStream): Iterator[Feature] = {

    // wrap the stream so it can be consumed line by line
    val lineIterator = new AsciiLineReaderIterator(new AsciiLineReader(is))

    // drain every line, keeping only lines the parser converts to a
    // feature (parse yields an Option; empty results are dropped)
    val parsed = ListBuffer.empty[Feature]
    while (lineIterator.hasNext) {
      gff3Parser.parse(lineIterator.next, stringency).foreach(parsed += _)
    }

    // release the reader before handing back the results
    lineIterator.close()
    parsed.toIterator
  }
}
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a GTFInFormatter to write features in GTF format to a pipe.
 */
object GTFInFormatter extends InFormatterCompanion[Feature, FeatureRDD, GTFInFormatter] {

  /**
   * Builds a GTFInFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new GTFInFormatter.
   */
  def apply(fRdd: FeatureRDD): GTFInFormatter = new GTFInFormatter()
}

case class GTFInFormatter private () extends InFormatter[Feature, FeatureRDD, GTFInFormatter] {
  protected val companion = GTFInFormatter

  /**
   * Writes features to an output stream in GTF format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toGtf(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}

0 comments on commit 6b78a58

Please sign in to comment.