🚧 WIP: Batch run

ababaian · May 25, 2020 · a448262 · a448262
1 parent ab448ed
commit a448262
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 0 deletions.
diff --git a/containers/serratus-batch/Dockerfile b/containers/serratus-batch/Dockerfile
@@ -0,0 +1,64 @@
+# Build stage: downloads bowtie2 and the SRA toolkit on top of the project
+# base image so the runtime stage can copy out only the binaries it needs.
+FROM serratus-base:latest AS build_base
+
+# Versions of the third-party tools fetched below
+ENV SRATOOLKITVERSION='2.10.4'
+ENV BOWTIEVERSION='2.4.1'
+
+# Bowtie2 - download and install
+# The two files are listed explicitly rather than with {a,b} brace expansion:
+# RUN executes under /bin/sh, where brace expansion is not guaranteed.
+RUN wget -O bowtie2.zip --quiet "https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIEVERSION}/bowtie2-${BOWTIEVERSION}-linux-x86_64.zip" \
+ && unzip bowtie2.zip \
+ && rm bowtie2.zip \
+ && mkdir -p /opt/bowtie2-align \
+ && mv bowtie2*/bowtie2 bowtie2*/bowtie2-align-s /opt/bowtie2-align \
+ && rm -rf bowtie2*
+
+## SRAToolkit
+# Keep sratools grouped together in /opt/sratools, so it's easy to copy them
+# all out into the runtime; they are also installed into /usr/local/bin for
+# the rest of the build.
+RUN wget --quiet "https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRATOOLKITVERSION}/sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+ && tar xzf "sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+ && rm -f "sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+ && mkdir -p /opt/sratools \
+ && sra_bin="./sratoolkit.${SRATOOLKITVERSION}-centos_linux64/bin" \
+ && mv \
+      "${sra_bin}"/vdb-config* \
+      "${sra_bin}"/prefetch* \
+      "${sra_bin}"/fastq-dump* \
+      "${sra_bin}"/fasterq-dump* \
+      "${sra_bin}"/sratools* \
+      /opt/sratools \
+ && cp -r /opt/sratools/* /usr/local/bin \
+ && mkdir -p /etc/ncbi
+
+# Copy in config for sra tools
+COPY serratus-dl/VDB_user-settings.mkfg /root/.ncbi/user-settings.mkfg
+# Turn on vdb-config's report-cloud-identity setting; the /root/.ncbi
+# directory is copied into the runtime stage afterwards.
+RUN vdb-config --report-cloud-identity yes
+
+# Runtime stage: Amazon Linux image that carries only the tools copied out of
+# build_base plus the AWS CLI and the batch run script.
+FROM amazonlinux:2 AS runtime
+
+LABEL container.description="serratus: end-to-end batch container"
+LABEL software.license="GPLv3"
+LABEL tags="aws-cli, samtools, bowtie2, sratoolkit"
+
+# aws cli, plus dependencies
+# -merge has its own python dependency, so do a full python/pip install
+# (perl is installed alongside python3; the bowtie2 launcher is presumably a
+# perl script - confirm before removing it).
+# curl -f makes an HTTP error fail the build instead of saving an error page
+# as get-pip.py; pip and yum caches are dropped in the same layer so they do
+# not end up in the image. The last two commands are the aws configuration.
+RUN yum -y install python3 perl \
+ && curl -fsSLO https://bootstrap.pypa.io/get-pip.py \
+ && python3 get-pip.py \
+ && rm get-pip.py \
+ && python3 -m pip install --no-cache-dir boto3 awscli \
+ && yum clean all \
+ && rm -rf /var/cache/yum \
+ && aws configure set default.s3.multipart_threshold 4GB \
+ && aws configure set default.s3.multipart_chunksize 4GB
+
+# bowtie2
+COPY --from=build_base /opt/bowtie2-align/* /usr/local/bin/
+
+# samtools
+# (not installed in this file; expected to come from the serratus-base image)
+COPY --from=build_base /usr/local/bin/samtools /usr/local/bin/
+
+# sratools: the vdb-config settings written in the build stage, then the binaries
+COPY --from=build_base /root/.ncbi /root/.ncbi
+COPY --from=build_base /opt/sratools/ /usr/local/bin/
+
+# run script
+WORKDIR /home/serratus
+COPY ./serratus-batch/run /usr/local/bin/
+COPY ./serratus-merge/serratus_summarizer.py ./summarizer.py
+
+# With no arguments `run` only prints its usage; pass `run <SRA> <Genome>`
+# to docker run to do real work.
+CMD ["run"]
diff --git a/containers/serratus-batch/run b/containers/serratus-batch/run
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <SRA> <Genome>"
+    echo 'Run the container with docker run <img> run <SRA> <Genome>'
+    exit 1
+fi
+
+SRA=$1
+GENOME=$2
+
+# May be overridden via environment
+FQMAX=${FQMAX:-100000000}
+
+aws s3 cp aws s3 cp --recursive s3://serratus-public/seq/$GENOME/ .
+
+# Prefetch the data before processing
+# This should be VERY fast, and will cause fastq-dump to have
+# smoother CPU usage in the end.
+prefetch $SRA
+
+# Create some named pipes for fastq-dump to put its data into.
+fastq-dump -X $FQMAX -Z | bowtie2 --very-sensitive-local -U /dev/stdi 
+fastq-dump -X -100000000 -Z $SRA | bowtie2 -x $GENOME --very-sensitive-local --no-unal -U /dev/stdin | python3 summarizer.py /dev/stdin cov3a.sumzer.tsv $SRA.summary /dev/stdout | samtools view -b > out.bam
+
+# Stream both bowtie flavors into s3
+S3_OUT="serratus-batch-$(date +%s).bam"
+aws s3 cp s3://public-data/testing-batch-outs/${S3_OUT} ./out.bam