From a4482624ff1acfbde1e3ed3e657ce78496a728b3 Mon Sep 17 00:00:00 2001
From: Kyl Wellman
Date: Sun, 24 May 2020 17:06:17 -0600
Subject: [PATCH] :construction: WIP: Batch run

---
 containers/serratus-batch/Dockerfile | 64 ++++++++++++++++++++++++++++
 containers/serratus-batch/run        | 29 +++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 containers/serratus-batch/Dockerfile
 create mode 100755 containers/serratus-batch/run

diff --git a/containers/serratus-batch/Dockerfile b/containers/serratus-batch/Dockerfile
new file mode 100644
index 0000000..b99a323
--- /dev/null
+++ b/containers/serratus-batch/Dockerfile
@@ -0,0 +1,64 @@
+FROM serratus-base:latest AS build_base
+
+# Versions
+ENV SRATOOLKITVERSION='2.10.4'
+ENV BOWTIEVERSION='2.4.1'
+
+# Bowtie2 - download and install
+RUN wget -O bowtie2.zip --quiet "https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIEVERSION}/bowtie2-${BOWTIEVERSION}-linux-x86_64.zip" \
+    && unzip bowtie2.zip \
+    && rm bowtie2.zip \
+    && mkdir /opt/bowtie2-align \
+    && mv bowtie2*/{bowtie2,bowtie2-align-s} /opt/bowtie2-align \
+    && rm -rf bowtie2*
+
+## SRAToolkit
+RUN wget --quiet "https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/${SRATOOLKITVERSION}/sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+    && tar xzf "sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+    && rm -f "sratoolkit.${SRATOOLKITVERSION}-centos_linux64.tar.gz" \
+    && mkdir -p /opt/sratools \
+    # Keep sratools grouped together, so it's easy to copy them all out into the runtime
+    && mv ./sratoolkit.${SRATOOLKITVERSION}-centos_linux64/bin/{vdb-config*,prefetch*,fastq-dump*,fasterq-dump*,sratools*} /opt/sratools \
+    # Install into /usr/local/bin for the rest of the build
+    && cp -r /opt/sratools/* /usr/local/bin \
+    && mkdir /etc/ncbi
+
+# Copy in config for sra tools
+COPY serratus-dl/VDB_user-settings.mkfg /root/.ncbi/user-settings.mkfg
+RUN vdb-config --report-cloud-identity yes
+
+FROM amazonlinux:2 AS runtime
+
+LABEL container.description="serratus: end-to-end batch container"
+LABEL software.license="GPLv3"
+LABEL tags="aws-cli, samtools, bowtie2, sratoolkit"
+
+# aws cli, plus dependencies
+# -merge has its own python dependency, so do a full python/pip install
+RUN yum -y install python3 perl \
+    && curl -O https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py \
+    && rm get-pip.py \
+    # --no-cache-dir keeps pip's download cache out of the image layer
+    && pip install --no-cache-dir boto3 awscli \
+    && yum clean all \
+    # aws configuration
+    && aws configure set default.s3.multipart_threshold 4GB \
+    && aws configure set default.s3.multipart_chunksize 4GB
+
+# bowtie2
+COPY --from=build_base /opt/bowtie2-align/* /usr/local/bin/
+
+# samtools
+COPY --from=build_base /usr/local/bin/samtools /usr/local/bin/
+
+# sratools
+COPY --from=build_base /root/.ncbi /root/.ncbi
+COPY --from=build_base /opt/sratools/ /usr/local/bin/
+
+# run script
+WORKDIR /home/serratus
+COPY ./serratus-batch/run /usr/local/bin/
+COPY ./serratus-merge/serratus_summarizer.py ./summarizer.py
+
+CMD ["run"]
diff --git a/containers/serratus-batch/run b/containers/serratus-batch/run
new file mode 100755
index 0000000..a7314d9
--- /dev/null
+++ b/containers/serratus-batch/run
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [[ $# -ne 2 ]]; then
+  echo "Usage: $0 <SRA> <GENOME>"
+  echo 'Run the container with docker run <image> run <SRA> <GENOME>'
+  exit 1
+fi
+
+SRA=$1
+GENOME=$2
+
+# May be overridden via environment
+FQMAX=${FQMAX:-100000000}
+
+aws s3 cp --recursive "s3://serratus-public/seq/${GENOME}/" .
+
+# Prefetch the data before processing
+# This should be VERY fast, and will cause fastq-dump to have
+# smoother CPU usage in the end.
+prefetch "$SRA"
+
+# Stream reads from fastq-dump through bowtie2 and summarizer.py, then
+# let samtools encode whatever the summarizer writes to /dev/stdout as out.bam.
+fastq-dump -X "$FQMAX" -Z "$SRA" | bowtie2 -x "$GENOME" --very-sensitive-local --no-unal -U /dev/stdin | python3 summarizer.py /dev/stdin cov3a.sumzer.tsv "${SRA}.summary" /dev/stdout | samtools view -b > out.bam
+
+# Upload the BAM to S3 under a timestamped key
+S3_OUT="serratus-batch-$(date +%s).bam"
+aws s3 cp ./out.bam "s3://public-data/testing-batch-outs/${S3_OUT}"