Skip to content
This repository has been archived by the owner on Nov 28, 2020. It is now read-only.

Commit

Permalink
depth app initial version (#144)
Browse files Browse the repository at this point in the history
* depth app initial  version

* bringing back sbt plugins

* adding step to build docker images

* sample -> sampleId

* using spark 2.4.1 image

* doc updates

* Testing BLOCKS_NEEDED_FOR_GUESS param

* Fixing too many open files

* Trying to close streams

* Trying to close streams

* Trying to close streams

* Trying to close streams

* Trying to close Wrapstream seekable

* Long reads coverage unit test for multiple partitions and logging

* Updated unit tests

* First draft of generalized algorithm

* loaaaads of debugging

* more debugging

* cleaning up unit tests

* Changing logging level from warn to debug in coverage

* Documentation updates

* Fixing sbt stats

* Fixing building docker image

* script polishing

* fixing sortOrder and printlns
  • Loading branch information
agaszmurlo authored and mwiewior committed May 11, 2019
1 parent 20771a9 commit 9880159
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 10 deletions.
7 changes: 5 additions & 2 deletions Docker/bdg-sequila/Dockerfile
@@ -1,4 +1,4 @@
FROM biodatageeks/bdg-spark
FROM biodatageeks/bdg-spark:2.4.1
MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>

RUN apt-get update && apt-get install --yes git sudo curl
Expand All @@ -19,7 +19,6 @@ ARG BDG_VERSION
ENV BDG_VERSION=$BDG_VERSION



RUN apt-get update && apt-get install --yes git sudo curl libssl-dev libxml2-dev


Expand All @@ -43,11 +42,15 @@ COPY bin/bdg-sequilaR.sh /tmp/bdg-toolset/bdg-sequilaR

#featureCounts scripts
COPY bin/featureCounts.sh /tmp/bdg-toolset/featureCounts
RUN echo $BDG_VERSION
RUN bash -c " if [[ $BDG_VERSION =~ SNAPSHOT ]]; then \
wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \
else wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-releases/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \
fi"


COPY bin/depthOfCoverage.sh /tmp/bdg-toolset/depthOfCoverage

#copy test data
COPY NA12878.slice.bam /tmp/NA12878.slice.bam
COPY unittest.scala /tmp/unittest.scala
Expand Down
43 changes: 43 additions & 0 deletions Docker/bdg-sequila/bin/depthOfCoverage.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# Wrapper launching the SeQuiLa DepthOfCoverage Spark application.
# Usage: depthOfCoverage [spark-submit options] -- [app options] <reads file>
# Everything left of the first " -- " separator is passed to spark-submit,
# everything right of it to the application itself.

# Application arguments (right of " -- "); "-F SAF" is stripped for now
# (temporary removal of the format option).
appParams=$(echo $@ | sed 's/-- /|/g' | cut -f2 -d'|'| sed 's/-F SAF//g')
# spark-submit arguments (left of " -- ").
sparkParams=$(echo $@ | sed 's/-- /|/g' | cut -f1 -d'|')
#iappParams=$(echo $@ | sed 's/ -- /|/g' | cut -f2 -d'|')
# Last positional argument: the input reads file.
readsFile=$(echo ${@:$#})

# Default to a local master when the caller did not specify one.
if [[ "${sparkParams}" == *"master"* ]]; then
    echo "Master specified"
else
    echo "Master not specified, adding --master=local[*]"
    master=" --master local[*] "
    sparkParams=$sparkParams$master
fi


# Extract the value following "-o" from the app arguments.
# Fix: the previous pattern ("-o \([^ ]*\) .*$") required a space AFTER the
# value, so "-o <dir>" as the last option matched nothing and the
# existing-output guard below was silently skipped.
outfile=$(echo $appParams | sed -n "s/^.*-o \([^ ]*\).*$/\1/p")

echo "Checking output directory " $outfile

# Refuse to clobber an existing output path (Spark would fail later anyway).
if [ -e "$outfile" ]
then
    echo "Output directory already exists, please remove"
    exit 1;
fi


echo '
   _____      ____        _ __    ____        ______
  / ___/___  / __ \__  __(_) /   / __ \____  / ____/
  \__ \/ _ \/ / / / / / / / /   / / / / __ \/ /
 ___/ /  __/ /_/ / /_/ / / /___/ /_/ / /_/ / /___
/____/\___/\___\_\__,_/_/_____/\__,_/ /_____/\____/\____/
'
echo $BDG_VERSION
echo -e "\n"
echo "Running with the following arguments: $appParams"
echo "Arguments passed to Apache Spark: $sparkParams"
echo -e "\n"
# Launch the DepthOfCoverage app from the versioned assembly jar.
spark-submit ${sparkParams} --class org.biodatageeks.apps.DepthOfCoverage /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
2 changes: 1 addition & 1 deletion Docker/bdg-sequila/bin/featureCounts.sh
Expand Up @@ -40,4 +40,4 @@ echo -e "\n"
echo "Running with the following arguments: $appParams"
echo "Arguments passed to Apache Spark: $sparkParams"
echo -e "\n"
spark-submit ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
spark-submit -v ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
2 changes: 2 additions & 0 deletions build.sbt
Expand Up @@ -144,11 +144,13 @@ artifact in (Compile, assembly) := {
val art = (artifact in (Compile, assembly)).value
art.withClassifier(Some("assembly"))
}

addArtifact(artifact in (Compile, assembly), assembly)

publishConfiguration := publishConfiguration.value.withOverwrite(true)

credentials += Credentials(Path.userHome / ".ivy2" / ".credentials")

publishTo := {
val nexus = "http://zsibio.ii.pw.edu.pl/nexus/repository/"
if (isSnapshot.value)
Expand Down
8 changes: 5 additions & 3 deletions build.sh
Expand Up @@ -22,6 +22,7 @@ bump_version () {
find Docker -name "Dockerfile" | sed 's/\/Dockerfile//' |grep "$IMAGE_TO_BUILD"| while read dir;
do

echo $version
image=`echo $dir| sed 's/^Docker/biodatageeks/'`
#version=`if [ ! -e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi`
#if [ -e $dir/version ]; then
Expand All @@ -35,11 +36,12 @@ do
echo "Building image ${image}..."
#diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
#if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
if [ $image == "biodatageeks/bdg-sequila" ]; then
cd $dir
if [[ ${BUILD_MODE} != "local" ]]; then
docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version .
docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version .
else
docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version .
docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version .
fi
docker build -t $image:latest .
if [[ ${BUILD_MODE} != "local" ]]; then
Expand All @@ -56,6 +58,6 @@ do
docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | grep -v "<none>"| xargs -i docker rmi {}

cd ../..
#fi
fi

done
7 changes: 4 additions & 3 deletions project/plugins.sbt
@@ -1,5 +1,6 @@
//addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
//
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")

//addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")
//


addSbtPlugin("com.orrsella" % "sbt-stats" % "1.0.7")
75 changes: 75 additions & 0 deletions src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala
@@ -0,0 +1,75 @@
package org.biodatageeks.apps

import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.SparkSession
import org.biodatageeks.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.rogach.scallop.ScallopConf
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}





/**
  * Command-line Spark application computing depth-of-coverage for a BAM
  * input via SeQuiLa's `bdg_coverage` table-valued function and writing the
  * result as a single tab-separated, headered CSV file.
  */
object DepthOfCoverage {

  // Genomic interval (contig name plus start/end positions).
  // NOTE(review): not referenced within this object — kept for source
  // compatibility; confirm no external callers before removing.
  case class Region(contigName:String, start:Int, end:Int)

  /**
    * CLI options (all required):
    *  --output  directory the coverage CSV is written to
    *  --reads   path to the input BAM file(s)
    *  --format  coverage format name forwarded to bdg_coverage
    */
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val output = opt[String](required = true)
    val reads = opt[String](required = true)
    val format = opt[String](required = true)
    verify()
  }


  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-DoC")
      .getOrCreate()

    // Fix: always stop the SparkSession, releasing cluster/application
    // resources even when the job fails part-way through.
    try {
      spark
        .sparkContext
        .setLogLevel("WARN")

      // Do not fail on records violating strict SAM/BAM validation.
      spark
        .sparkContext
        .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

      val ss = SequilaSession(spark)
      SequilaRegister.register(ss)

      // Expose the BAM input as a SQL table named "reads".
      ss.sql(s"""CREATE TABLE IF NOT EXISTS reads USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '${runConf.reads()}')""")

      // Assumes the input holds (at least) one sample; `.first()` throws on
      // an empty table — TODO(review): fail with a clearer message then.
      val sample = ss.sql(s"SELECT DISTINCT (sampleId) from reads").first().get(0)
      println(s"Input file: ${runConf.reads()}")
      println(s"Format: ${runConf.format()}")
      println(s"Sample: $sample")


      val query = "SELECT * FROM bdg_coverage('reads', '%s', '%s')".format(sample, runConf.format())

      // coalesce(1) so the output directory holds a single CSV part file.
      ss.sql(query)
        .orderBy("contigName", "start")
        .coalesce(1)
        .write
        .mode("overwrite")
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.output())

      println(s"Coverage for $sample stored in ${runConf.output()}")
    } finally {
      spark.stop()
    }
  }

}
2 changes: 1 addition & 1 deletion src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala
Expand Up @@ -34,7 +34,7 @@ object BDGTableFuncs{
def getAllSamples(spark: SparkSession, path:String) = {
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path))
println(statuses.length)
//println(statuses.length)
statuses
.map(_.getPath.toString.split('/').takeRight(1).head.split('.').take(1).head)
}
Expand Down

0 comments on commit 9880159

Please sign in to comment.