From 9880159c39de2c5e305fae36d9360159b4952189 Mon Sep 17 00:00:00 2001 From: agaszmurlo Date: Sat, 11 May 2019 21:51:10 +0200 Subject: [PATCH] depth app initial version (#144) * depth app initial version * bringing back sbt plugins * adding step to build docker images * sample -> sampleId * using spark 2.4.1 image * doc updates * Testing BLOCKS_NEEDED_FOR_GUESS param * Fixing too many open files * Trying to close streams * Trying to close streams * Trying to close streams * Trying to close streams * Trying to close Wrapstream seekable * Long reads coverage unit test for multiple partitions and logging * Updated unit tests * First draft of generalized algorithm * loaaaads of debugging * more debugging * cleaning up unit tests * Changing logging level from warn to debug in coverage * Documentation updates * Fixing sbt stats * Fixing building docker image * script polishing * fixing sortOrder and printlns --- Docker/bdg-sequila/Dockerfile | 7 +- Docker/bdg-sequila/bin/depthOfCoverage.sh | 43 +++++++++++ Docker/bdg-sequila/bin/featureCounts.sh | 2 +- build.sbt | 2 + build.sh | 8 +- project/plugins.sbt | 7 +- .../biodatageeks/apps/DepthOfCoverage.scala | 75 +++++++++++++++++++ .../biodatageeks/utils/BGDTableFuncs.scala | 2 +- 8 files changed, 136 insertions(+), 10 deletions(-) create mode 100644 Docker/bdg-sequila/bin/depthOfCoverage.sh create mode 100644 src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala diff --git a/Docker/bdg-sequila/Dockerfile b/Docker/bdg-sequila/Dockerfile index aa47ca6b..196e1331 100644 --- a/Docker/bdg-sequila/Dockerfile +++ b/Docker/bdg-sequila/Dockerfile @@ -1,4 +1,4 @@ -FROM biodatageeks/bdg-spark +FROM biodatageeks/bdg-spark:2.4.1 MAINTAINER biodatageeks RUN apt-get update && apt-get install --yes git sudo curl @@ -19,7 +19,6 @@ ARG BDG_VERSION ENV BDG_VERSION=$BDG_VERSION - RUN apt-get update && apt-get install --yes git sudo curl libssl-dev libxml2-dev @@ -43,11 +42,15 @@ COPY bin/bdg-sequilaR.sh /tmp/bdg-toolset/bdg-sequilaR 
#featureCounts scripts COPY bin/featureCounts.sh /tmp/bdg-toolset/featureCounts +RUN echo $BDG_VERSION RUN bash -c " if [[ $BDG_VERSION =~ SNAPSHOT ]]; then \ wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \ else wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-releases/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \ fi" + +COPY bin/depthOfCoverage.sh /tmp/bdg-toolset/depthOfCoverage + #copy test data COPY NA12878.slice.bam /tmp/NA12878.slice.bam COPY unittest.scala /tmp/unittest.scala diff --git a/Docker/bdg-sequila/bin/depthOfCoverage.sh b/Docker/bdg-sequila/bin/depthOfCoverage.sh new file mode 100644 index 00000000..c1048d20 --- /dev/null +++ b/Docker/bdg-sequila/bin/depthOfCoverage.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +appParams=`echo $@ | sed 's/-- /|/g' | cut -f2 -d'|'| sed 's/-F SAF//g'` ###temp remove format option +sparkParams=`echo $@ | sed 's/-- /|/g' | cut -f1 -d'|'` +#iappParams=`echo $@ | sed 's/ -- /|/g' | cut -f2 -d'|'` +readsFile=`echo ${@:$#}` + +substr="master" +if case ${sparkParams} in *"${substr}"*) true;; *) false;; esac; then + echo "Master specified" + else + echo "Master not specified, adding --master=local[*]" + master=" --master local[*] " + sparkParams=$sparkParams$master + fi + + +outfile=`echo $appParams | sed -n "s/^.*-o \([^ ]*\) .*$/\1/p"` + +echo "Checking output directory " $outfile + +if [ -e "$outfile" ] +then + echo "Output directory already exists, please remove" + exit 1; +fi + + +echo ' + + _____ ____ _ __ ____ ______ + / ___/___ / __ \__ __(_) / ____ _ / __ \____ / ____/ + \__ \/ _ \/ / / / / / / / / / __ `/_____/ / / / __ \/ / + ___/ / __/ /_/ / /_/ / / /___/ /_/ /_____/ /_/ / /_/ / /___ +/____/\___/\___\_\__,_/_/_____/\__,_/ 
/_____/\____/\____/ + + ' +echo $BDG_VERSION +echo -e "\n" +echo "Running with the following arguments: $appParams" +echo "Arguments passed to Apache Spark: $sparkParams" +echo -e "\n" +spark-submit ${sparkParams} --class org.biodatageeks.apps.DepthOfCoverage /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams diff --git a/Docker/bdg-sequila/bin/featureCounts.sh b/Docker/bdg-sequila/bin/featureCounts.sh index 607dbeb4..8a23e76f 100644 --- a/Docker/bdg-sequila/bin/featureCounts.sh +++ b/Docker/bdg-sequila/bin/featureCounts.sh @@ -40,4 +40,4 @@ echo -e "\n" echo "Running with the following arguments: $appParams" echo "Arguments passed to Apache Spark: $sparkParams" echo -e "\n" -spark-submit ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams +spark-submit -v ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams diff --git a/build.sbt b/build.sbt index ce4f2323..35c6753d 100644 --- a/build.sbt +++ b/build.sbt @@ -144,11 +144,13 @@ artifact in (Compile, assembly) := { val art = (artifact in (Compile, assembly)).value art.withClassifier(Some("assembly")) } + addArtifact(artifact in (Compile, assembly), assembly) publishConfiguration := publishConfiguration.value.withOverwrite(true) credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") + publishTo := { val nexus = "http://zsibio.ii.pw.edu.pl/nexus/repository/" if (isSnapshot.value) diff --git a/build.sh b/build.sh index 46c58d92..6ce57198 100755 --- a/build.sh +++ b/build.sh @@ -22,6 +22,7 @@ bump_version () { find Docker -name "Dockerfile" | sed 's/\/Dockerfile//' |grep "$IMAGE_TO_BUILD"| while read dir; do + echo $version image=`echo $dir| sed 's/^Docker/biodatageeks/'` #version=`if [ ! 
-e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi` #if [ -e $dir/version ]; then @@ -35,11 +36,12 @@ do echo "Building image ${image}..." #diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` #if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then + if [ $image == "biodatageeks/bdg-sequila" ]; then cd $dir if [[ ${BUILD_MODE} != "local" ]]; then - docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version . + docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version . else - docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version . + docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version . fi docker build -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then @@ -56,6 +58,6 @@ do docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | grep -v ""| xargs -i docker rmi {} cd ../.. - #fi + fi done diff --git a/project/plugins.sbt b/project/plugins.sbt index ddf270e1..534e11fa 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,6 @@ -//addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") -// +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") + //addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") -// + + addSbtPlugin("com.orrsella" % "sbt-stats" % "1.0.7") \ No newline at end of file diff --git a/src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala b/src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala new file mode 100644 index 00000000..93b3838e --- /dev/null +++ b/src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala @@ -0,0 +1,75 @@ +package org.biodatageeks.apps + +import htsjdk.samtools.ValidationStringency +import org.apache.hadoop.io.LongWritable +import org.apache.spark.sql.SparkSession +import org.biodatageeks.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim +import org.rogach.scallop.ScallopConf +import org.seqdoop.hadoop_bam.{BAMInputFormat, 
SAMRecordWritable} +import org.seqdoop.hadoop_bam.util.SAMHeaderReader + +import org.apache.spark.sql.SequilaSession +import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams} + + + + + +object DepthOfCoverage { + + case class Region(contigName:String, start:Int, end:Int) + + class RunConf(args:Array[String]) extends ScallopConf(args){ + + val output = opt[String](required = true) + val reads = opt[String](required = true) + val format = opt[String](required = true) + verify() + } + + + def main(args: Array[String]): Unit = { + val runConf = new RunConf(args) + val spark = SparkSession + .builder() + .appName("SeQuiLa-DoC") + .getOrCreate() + + + spark + .sparkContext + .setLogLevel("WARN") + + spark + .sparkContext + .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) + + + val ss = SequilaSession(spark) + SequilaRegister.register(ss) + + + ss.sql(s"""CREATE TABLE IF NOT EXISTS reads USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '${runConf.reads()}')""") + + val sample = ss.sql(s"SELECT DISTINCT (sampleId) from reads").first().get(0) + println(s"Input file: ${runConf.reads()}") + println(s"Format: ${runConf.format()}") + println(s"Sample: $sample") + + + val query = "SELECT * FROM bdg_coverage('reads', '%s', '%s')".format(sample, runConf.format()) + + + ss.sql(query) + .orderBy("contigName", "start") + .coalesce(1) + .write + .mode("overwrite") + .option("header", "true") + .option("delimiter", "\t") + .csv(runConf.output()) + + println(s"Coverage for $sample stored in ${runConf.output()}") + } + +} diff --git a/src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala b/src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala index 41de6b83..e0bcb40a 100644 --- a/src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala +++ b/src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala @@ -34,7 +34,7 @@ object BDGTableFuncs{ def getAllSamples(spark: SparkSession, 
path:String) = { val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path)) - println(statuses.length) + //println(statuses.length) statuses .map(_.getPath.toString.split('/').takeRight(1).head.split('.').take(1).head) }