This repository has been archived by the owner on Nov 28, 2020. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* depth app initial version * bringing back sbt plugins * adding step to build docker images * sample -> sampleId * using spark 2.4.1 image * doc updates * Testing BLOCKS_NEEDED_FOR_GUESS param * Fixing too many open files * Trying to close streams * Trying to close streams * Trying to close streams * Trying to close streams * Trying to close Wrapstream seekable * Long reads coverage unit test for multiple partitions and logging * Updated unit tests * First draft of generalized algorithm * loaaaads of debugging * more debugging * cleaning up unit tests * Changing logging level from warn to debug in coverage * Documentation updates * Fixing sbt stats * Fixing building docker image * script polishing * fixing sortOrder and printlns
- Loading branch information
1 parent
20771a9
commit 9880159
Showing
8 changed files
with
136 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash

# Wrapper for the SeQuiLa DepthOfCoverage Spark application.
#
# Usage: <spark-submit options> -- <application options> <reads file>
# Everything before the first " -- " separator is forwarded to spark-submit;
# everything after it is forwarded to the application itself.

# Split the argument list on the "-- " separator.
# The "-F SAF" format option is stripped for now (temporarily unsupported).
appParams=$(echo "$@" | sed 's/-- /|/g' | cut -f2 -d'|' | sed 's/-F SAF//g')
sparkParams=$(echo "$@" | sed 's/-- /|/g' | cut -f1 -d'|')
# Last positional argument is the reads (BAM) file.
readsFile="${!#}"

# Default to local mode when the caller did not specify a --master option.
if [[ "$sparkParams" == *"master"* ]]; then
  echo "Master specified"
else
  echo "Master not specified, adding --master=local[*]"
  sparkParams="$sparkParams --master local[*] "
fi

# Extract the output directory from the application's "-o <dir>" option.
outfile=$(echo "$appParams" | sed -n "s/^.*-o \([^ ]*\) .*$/\1/p")

echo "Checking output directory " "$outfile"

# Refuse to clobber an existing output directory.
if [ -e "$outfile" ]; then
  echo "Output directory already exists, please remove"
  exit 1
fi

echo '
_____ ____ _ __ ____ ______
/ ___/___ / __ \__ __(_) / ____ _ / __ \____ / ____/
\__ \/ _ \/ / / / / / / / / / __ `/_____/ / / / __ \/ /
___/ / __/ /_/ / /_/ / / /___/ /_/ /_____/ /_/ / /_/ / /___
/____/\___/\___\_\__,_/_/_____/\__,_/ /_____/\____/\____/
'
echo "$BDG_VERSION"
echo -e "\n"
echo "Running with the following arguments: $appParams"
echo "Arguments passed to Apache Spark: $sparkParams"
echo -e "\n"
# Intentionally unquoted: both variables hold multiple whitespace-separated
# options that must undergo word splitting here.
spark-submit ${sparkParams} --class org.biodatageeks.apps.DepthOfCoverage /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
// sbt plugins for the SeQuiLa build.

// Builds the fat (assembly) jar that is deployed into the Docker image.
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")

//addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")

// Source-code statistics (lines of code, file counts) via the `stats` task.
addSbtPlugin("com.orrsella" % "sbt-stats" % "1.0.7")
75 changes: 75 additions & 0 deletions
75
src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package org.biodatageeks.apps | ||
|
||
import htsjdk.samtools.ValidationStringency | ||
import org.apache.hadoop.io.LongWritable | ||
import org.apache.spark.sql.SparkSession | ||
import org.biodatageeks.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim | ||
import org.rogach.scallop.ScallopConf | ||
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable} | ||
import org.seqdoop.hadoop_bam.util.SAMHeaderReader | ||
|
||
import org.apache.spark.sql.SequilaSession | ||
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams} | ||
|
||
|
||
|
||
|
||
|
||
/**
 * Spark application computing per-base depth of coverage for a BAM file
 * via SeQuiLa's `bdg_coverage` table-valued function, writing the result
 * as a single tab-separated CSV file.
 */
object DepthOfCoverage {

  /** Genomic interval. Not used in this object; presumably kept for callers — TODO confirm. */
  case class Region(contigName: String, start: Int, end: Int)

  /**
   * Command-line options (all required):
   *   --output  output directory for the coverage CSV
   *   --reads   path to the input BAM file
   *   --format  coverage output format passed to bdg_coverage
   */
  class RunConf(args: Array[String]) extends ScallopConf(args) {
    val output = opt[String](required = true)
    val reads = opt[String](required = true)
    val format = opt[String](required = true)
    verify()
  }

  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)

    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-DoC")
      .getOrCreate()

    spark
      .sparkContext
      .setLogLevel("WARN")

    // Tolerate malformed BAM records instead of failing the whole job.
    spark
      .sparkContext
      .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

    // SequilaSession adds the interval-join strategy and SQL extensions
    // (bdg_coverage, BAMDataSource) on top of the plain SparkSession.
    val ss = SequilaSession(spark)
    SequilaRegister.register(ss)

    // Expose the BAM file as a SQL table named `reads`.
    ss.sql(s"""CREATE TABLE IF NOT EXISTS reads USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '${runConf.reads()}')""")

    // Assumes a single-sample BAM: only the first distinct sampleId is used.
    val sample = ss.sql(s"SELECT DISTINCT (sampleId) from reads").first().get(0)
    println(s"Input file: ${runConf.reads()}")
    println(s"Format: ${runConf.format()}")
    println(s"Sample: $sample")

    // String interpolation (was java-style .format) for consistency with
    // the rest of this file; produces an identical query string.
    val query = s"SELECT * FROM bdg_coverage('reads', '$sample', '${runConf.format()}')"

    ss.sql(query)
      .orderBy("contigName", "start")
      .coalesce(1) // collapse to a single partition => one output CSV file
      .write
      .mode("overwrite")
      .option("header", "true")
      .option("delimiter", "\t")
      .csv(runConf.output())

    println(s"Coverage for $sample stored in ${runConf.output()}")

    // Release cluster resources; the original leaked the session on exit.
    spark.stop()
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters