Skip to content
This repository has been archived by the owner on Nov 28, 2020. It is now read-only.

Commit

Permalink
depth app initial version (#144)
Browse files Browse the repository at this point in the history
* depth app initial  version

* bringing back sbt plugins

* adding step to build docker images

* sample -> sampleId

* using spark 2.4.1 image

* doc updates

* Testing BLOCKS_NEEDED_FOR_GUESS param

* Fixing too many open files

* Trying to close streams

* Trying to close streams

* Trying to close streams

* Trying to close streams

* Trying to close Wrapstream seekable

* Long reads coverage unit test for multiple partitions and logging

* Updated unit tests

* First draft of generalized algorithm

* loaaaads of debugging

* more debugging

* cleaning up unit tests

* Changing logging level from warn to debug in coverage

* Documentation updates

* Fixing sbt stats

* Fixing building docker image

* script polishing

* fixing sortOrder and printlns
  • Loading branch information
agaszmurlo authored and mwiewior committed May 11, 2019
1 parent 20771a9 commit 9880159
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 10 deletions.
7 changes: 5 additions & 2 deletions Docker/bdg-sequila/Dockerfile
@@ -1,4 +1,4 @@
FROM biodatageeks/bdg-spark
FROM biodatageeks/bdg-spark:2.4.1
MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>

RUN apt-get update && apt-get install --yes git sudo curl
Expand All @@ -19,7 +19,6 @@ ARG BDG_VERSION
ENV BDG_VERSION=$BDG_VERSION



RUN apt-get update && apt-get install --yes git sudo curl libssl-dev libxml2-dev


Expand All @@ -43,11 +42,15 @@ COPY bin/bdg-sequilaR.sh /tmp/bdg-toolset/bdg-sequilaR

#featureCounts scripts
COPY bin/featureCounts.sh /tmp/bdg-toolset/featureCounts
RUN echo $BDG_VERSION
RUN bash -c " if [[ $BDG_VERSION =~ SNAPSHOT ]]; then \
wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \
else wget https://zsibio.ii.pw.edu.pl/nexus/repository/maven-releases/org/biodatageeks/bdg-sequila_2.11/${BDG_VERSION}/bdg-sequila_2.11-${BDG_VERSION}-assembly.jar -O /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar ; \
fi"


COPY bin/depthOfCoverage.sh /tmp/bdg-toolset/depthOfCoverage

#copy test data
COPY NA12878.slice.bam /tmp/NA12878.slice.bam
COPY unittest.scala /tmp/unittest.scala
Expand Down
43 changes: 43 additions & 0 deletions Docker/bdg-sequila/bin/depthOfCoverage.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# Wrapper launching the SeQuiLa DepthOfCoverage Spark application.
# Usage: depthOfCoverage [spark-submit options] -- [app options] <reads file>
# Everything left of the first " -- " separator is passed to spark-submit,
# everything right of it to the application itself.

# Application arguments (right of " -- "); "-F SAF" is stripped for now
# (temporary removal of the format option).
appParams=$(echo $@ | sed 's/-- /|/g' | cut -f2 -d'|'| sed 's/-F SAF//g')
# spark-submit arguments (left of " -- ").
sparkParams=$(echo $@ | sed 's/-- /|/g' | cut -f1 -d'|')
#iappParams=$(echo $@ | sed 's/ -- /|/g' | cut -f2 -d'|')
# Last positional argument: the input reads file.
readsFile=$(echo ${@:$#})

# Default to a local master when the caller did not specify one.
if [[ "${sparkParams}" == *"master"* ]]; then
    echo "Master specified"
else
    echo "Master not specified, adding --master=local[*]"
    master=" --master local[*] "
    sparkParams=$sparkParams$master
fi


# Extract the value following "-o" from the app arguments.
# Fix: the previous pattern ("-o \([^ ]*\) .*$") required a space AFTER the
# value, so "-o <dir>" as the last option matched nothing and the
# existing-output guard below was silently skipped.
outfile=$(echo $appParams | sed -n "s/^.*-o \([^ ]*\).*$/\1/p")

echo "Checking output directory " $outfile

# Refuse to clobber an existing output path (Spark would fail later anyway).
if [ -e "$outfile" ]
then
    echo "Output directory already exists, please remove"
    exit 1;
fi


echo '
   _____      ____        _ __    ____        ______
  / ___/___  / __ \__  __(_) /   / __ \____  / ____/
  \__ \/ _ \/ / / / / / / / /   / / / / __ \/ /
 ___/ /  __/ /_/ / /_/ / / /___/ /_/ / /_/ / /___
/____/\___/\___\_\__,_/_/_____/\__,_/ /_____/\____/\____/
'
echo $BDG_VERSION
echo -e "\n"
echo "Running with the following arguments: $appParams"
echo "Arguments passed to Apache Spark: $sparkParams"
echo -e "\n"
# Launch the DepthOfCoverage app from the versioned assembly jar.
spark-submit ${sparkParams} --class org.biodatageeks.apps.DepthOfCoverage /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
2 changes: 1 addition & 1 deletion Docker/bdg-sequila/bin/featureCounts.sh
Expand Up @@ -40,4 +40,4 @@ echo -e "\n"
echo "Running with the following arguments: $appParams"
echo "Arguments passed to Apache Spark: $sparkParams"
echo -e "\n"
spark-submit ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
spark-submit -v ${sparkParams} --class org.biodatageeks.apps.FeatureCounts /tmp/bdg-toolset/bdg-sequila-assembly-${BDG_VERSION}.jar $appParams
2 changes: 2 additions & 0 deletions build.sbt
Expand Up @@ -144,11 +144,13 @@ artifact in (Compile, assembly) := {
val art = (artifact in (Compile, assembly)).value
art.withClassifier(Some("assembly"))
}

addArtifact(artifact in (Compile, assembly), assembly)

publishConfiguration := publishConfiguration.value.withOverwrite(true)

credentials += Credentials(Path.userHome / ".ivy2" / ".credentials")

publishTo := {
val nexus = "http://zsibio.ii.pw.edu.pl/nexus/repository/"
if (isSnapshot.value)
Expand Down
8 changes: 5 additions & 3 deletions build.sh
Expand Up @@ -22,6 +22,7 @@ bump_version () {
find Docker -name "Dockerfile" | sed 's/\/Dockerfile//' |grep "$IMAGE_TO_BUILD"| while read dir;
do

echo $version
image=`echo $dir| sed 's/^Docker/biodatageeks/'`
#version=`if [ ! -e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi`
#if [ -e $dir/version ]; then
Expand All @@ -35,11 +36,12 @@ do
echo "Building image ${image}..."
#diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
#if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
if [ $image == "biodatageeks/bdg-sequila" ]; then
cd $dir
if [[ ${BUILD_MODE} != "local" ]]; then
docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version .
docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version .
else
docker build --no-cache --build-arg BDG_VERSION=${version} -t $image:$version .
docker build --no-cache --build-arg BDG_VERSION=$version -t $image:$version .
fi
docker build -t $image:latest .
if [[ ${BUILD_MODE} != "local" ]]; then
Expand All @@ -56,6 +58,6 @@ do
docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | grep -v "<none>"| xargs -i docker rmi {}

cd ../..
#fi
fi

done
7 changes: 4 additions & 3 deletions project/plugins.sbt
@@ -1,5 +1,6 @@
//addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
//
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")

//addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")
//


addSbtPlugin("com.orrsella" % "sbt-stats" % "1.0.7")
75 changes: 75 additions & 0 deletions src/main/scala/org/biodatageeks/apps/DepthOfCoverage.scala
@@ -0,0 +1,75 @@
package org.biodatageeks.apps

import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.SparkSession
import org.biodatageeks.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.rogach.scallop.ScallopConf
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister,BDGInternalParams}





/**
  * Command-line Spark application computing depth-of-coverage for a BAM
  * input via SeQuiLa's `bdg_coverage` table-valued function and writing the
  * result as a single tab-separated, headered CSV file.
  */
object DepthOfCoverage {

  // Genomic interval (contig name plus start/end positions).
  // NOTE(review): not referenced within this object — kept for source
  // compatibility; confirm no external callers before removing.
  case class Region(contigName:String, start:Int, end:Int)

  /**
    * CLI options (all required):
    *  --output  directory the coverage CSV is written to
    *  --reads   path to the input BAM file(s)
    *  --format  coverage format name forwarded to bdg_coverage
    */
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val output = opt[String](required = true)
    val reads = opt[String](required = true)
    val format = opt[String](required = true)
    verify()
  }


  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-DoC")
      .getOrCreate()

    // Fix: always stop the SparkSession, releasing cluster/application
    // resources even when the job fails part-way through.
    try {
      spark
        .sparkContext
        .setLogLevel("WARN")

      // Do not fail on records violating strict SAM/BAM validation.
      spark
        .sparkContext
        .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

      val ss = SequilaSession(spark)
      SequilaRegister.register(ss)

      // Expose the BAM input as a SQL table named "reads".
      ss.sql(s"""CREATE TABLE IF NOT EXISTS reads USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '${runConf.reads()}')""")

      // Assumes the input holds (at least) one sample; `.first()` throws on
      // an empty table — TODO(review): fail with a clearer message then.
      val sample = ss.sql(s"SELECT DISTINCT (sampleId) from reads").first().get(0)
      println(s"Input file: ${runConf.reads()}")
      println(s"Format: ${runConf.format()}")
      println(s"Sample: $sample")


      val query = "SELECT * FROM bdg_coverage('reads', '%s', '%s')".format(sample, runConf.format())

      // coalesce(1) so the output directory holds a single CSV part file.
      ss.sql(query)
        .orderBy("contigName", "start")
        .coalesce(1)
        .write
        .mode("overwrite")
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.output())

      println(s"Coverage for $sample stored in ${runConf.output()}")
    } finally {
      spark.stop()
    }
  }

}
2 changes: 1 addition & 1 deletion src/main/scala/org/biodatageeks/utils/BGDTableFuncs.scala
Expand Up @@ -34,7 +34,7 @@ object BDGTableFuncs{
def getAllSamples(spark: SparkSession, path:String) = {
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val statuses = fs.globStatus(new org.apache.hadoop.fs.Path(path))
println(statuses.length)
//println(statuses.length)
statuses
.map(_.getPath.toString.split('/').takeRight(1).head.split('.').take(1).head)
}
Expand Down

0 comments on commit 9880159

Please sign in to comment.