Permalink
Browse files

Committed main bismark-based pipeline script for padlock alignment

  • Loading branch information...
1 parent 293069b commit 9b0eea7c40a868fbbf4738e3a128805cc93196bb @buske committed Nov 8, 2014
Showing with 291 additions and 0 deletions.
  1. +291 −0 src/bis_seq/run_bismark.sh
View
@@ -0,0 +1,291 @@
+#!/usr/bin/env bash
+
+set -eu
+set -o pipefail
+
+
+##### Global parameters #####
+suffix=.fastq.gz
+thisdir="$(cd "$(dirname $0)"; pwd)"
+srcdir="$(pwd)/src"
+refdir="$(pwd)/sequences"
+trimmomatic_jar="$(pwd)/lib/Trimmomatic-0.32/trimmomatic-0.32.jar"
+trimmomatic="java -Xmx2048m -jar ${trimmomatic_jar}"
+bismarkdir="$(pwd)/lib/bismark_v0.12.3"
+bowtiedir="$(pwd)/lib/bowtie-1.1.1"
+adapters="${refdir}/adapters.fa"
+
+# DISPATCH: method for running SGE-compatible subscripts
+DISPATCH="${DISPATCH:-qsub}"
+
+##### Parse command-line arguments and ensure dependencies exist #####
+function usage {
+ cat <<EOF
+Usage: $0 ASSEMBLY DIR
+
+Script must be run from the root of the padlock-pipeline repo.
+
+ASSEMBLY: the assembly to which the sample will be aligned. This must be a
+ subdirectory of ${refdir}/bismark, already prepared with
+ "bismark_genome_preparation".
+
+DIR: a sample directory, containing read files with names of the format
+ "<SAMPLE>_L001_R1_001${suffix}".
+
+- ${refdir}/bismark/phiX must already exist and be prepared with bismark.
+- All reads from the same strand will be merged across lanes and file parts.
+- The script creates SGE-compatible bash scripts within the sample directory.
+- Depending on the value of the DISPATCH environment variable, scripts will
+ be run over SGE, run on the local machine, or not run:
+ 'qsub': scripts will be automatically dispatched
+ 'bash': scripts will be in serial on the local machine
+ 'echo': scripts will not be run (their paths will be echoed
+EOF
+ exit 1
+}
+
+function require_files {
+ while [[ $# -gt 0 ]]; do
+ file="$1"
+ shift
+ if [[ ! -e "${file}" ]]; then
+ echo "Unable to find required file: ${file}" >&2
+ exit 1
+ fi
+ done
+}
+
+function require_dirs {
+ while [[ $# -gt 0 ]]; do
+ dir="$1"
+ shift
+ if [[ ! -d "${dir}" ]]; then
+ echo "Unable to find required directory: ${dir}" >&2
+ exit 1
+ fi
+ done
+}
+
+require_files "${trimmomatic_jar}" "${adapters}"
+require_dirs "${srcdir}" "${refdir}" "${bismarkdir}" "${bowtiedir}"
+
+if [[ DISPATCH = "qsub" ]]; then
+ which qsub > /dev/null
+elif [[ "${DISPATCH}" = "echo" ]]; then
+ require_dir "${TMPDIR}"
+fi
+
+if [[ $# -eq 0 ]]; then
+ usage
+elif [[ $# -ne 2 ]]; then
+ echo -e "Error: unexpected number of arguments\n" >&2
+ usage
+fi
+
+
+export PATH="${bismarkdir}:${bowtiedir}:${PATH}"
+
+
+##### Define functions #####
+
+function trim_sample {
+ local lanes=( $(cd "${dir}" && ls *"${suffix}" | perl -pe 's/.*(L\d+).*/\1/' | sort -u) )
+
+ if [[ "${#lanes[@]}" -lt 1 ]]; then
+ echo "Unable to find lanes for ${name}" >&2
+ return 1
+ fi
+
+ if [[ ! -s "${dir}/${name}.R1P.fq.gz" ]]; then
+ local script="${dir}/pipeline-trim.sh"
+ cat > "${script}" <<EOF
+#!/usr/bin/env bash
+
+#$ -pe parallel 2
+#$ -l h_vmem=6G
+#$ -V
+#$ -S /bin/bash
+#$ -N "${name}.trim"
+#$ -e "${logdir}"
+#$ -o "${logdir}"
+#$ -l hostname="supa*"
+
+set -eu
+
+cd "\${TMPDIR}"
+
+
+# Paired-end trimming
+for lane in ${lanes[@]}; do
+ r1="\$(ls "${dir}"/*_\${lane}_R1_*${suffix})"
+ r2="\$(ls "${dir}"/*_\${lane}_R2_*${suffix})"
+ test -s "\${r1}"
+ test -s "\${r2}"
+
+ ${trimmomatic} PE -threads 2 -phred33 "\${r1}" "\${r2}" \
+ "${name}.\${lane}.R1P.fq.gz" "${name}.\${lane}.R1U.fq.gz" "${name}.\${lane}.R2P.fq.gz" "${name}.\${lane}.R2U.fq.gz" \
+ ILLUMINACLIP:"${adapters}":2:25:7:1 LEADING:10 TRAILING:10 SLIDINGWINDOW:3:15 MINLEN:75
+ cat "${name}.\${lane}.R1P.fq.gz" >> "${name}.R1P.fq.gz"
+ cat "${name}.\${lane}.R2P.fq.gz" >> "${name}.R2P.fq.gz"
+ cat "${name}.\${lane}.R1U.fq.gz" >> "${name}.R1U.fq.gz"
+ cat "${name}.\${lane}.R2U.fq.gz" >> "${name}.R2U.fq.gz"
+done
+
+rm -f "${name}".*.*.fq.gz
+
+mv *.fq.gz "${dir}/"
+EOF
+
+ ${DISPATCH} "${script}"
+ fi
+}
+
+function unspike {
+ local assembly=phiX
+ local ref="${refdir}/bismark/$assembly"
+
+ if [[ ! -e "${dir}/${name}.${assembly}.paired_pe.sam" || ! -e "${dir}/${name}.${assembly}.unpaired_r1.sam" || ! -e "${dir}/${name}.${assembly}.unpaired_r2.sam" ]]; then
+ local script="${dir}/pipeline-align-${assembly}.sh"
+ cat > "${script}" <<EOF
+#!/usr/bin/env bash
+
+#$ -l h_vmem=16G
+#$ -V
+#$ -S /bin/bash
+#$ -hold_jid "${name}.trim"
+#$ -N "${name}.unspike"
+#$ -e "${logdir}"
+#$ -o "${logdir}"
+#$ -l hostname="supa*"
+
+set -eu
+
+cd "\${TMPDIR}"
+
+if [[ ! -e "${dir}/${name}.${assembly}.paired_pe.sam" ]]; then
+ "${bismarkdir}/bismark" --path_to_bowtie "${bowtiedir}/" --output_dir . --temp_dir . --basename "${name}.${assembly}.paired" --unmapped -n 1 --non_directional -X 1000 "${ref}" -1 "${dir}/${name}.R1P.fq.gz" -2 "${dir}/${name}.R2P.fq.gz"
+fi
+
+for r in 1 2; do
+ if [[ ! -e "${dir}/${name}.${assembly}.unpaired_r\${r}.sam" ]]; then
+ "${bismarkdir}/bismark" --path_to_bowtie "${bowtiedir}/" --output_dir . --temp_dir . --basename "${name}.${assembly}.unpaired_r\${r}" --unmapped -n 1 --non_directional "${ref}" "${dir}/${name}.R\${r}U.fq.gz"
+ fi
+done
+
+mv "${name}.${assembly}".* "${dir}/"
+EOF
+ ${DISPATCH} "${script}"
+ fi
+}
+
+
+function align {
+ if [[ ! -e "${dir}/${name}.${assembly}.paired_pe.sam" || ! -e "${dir}/${name}.${assembly}.unpaired_r1.sam" || ! -e "${dir}/${name}.${assembly}.unpaired_r2.sam" ]]; then
+ local script="${dir}/pipeline-align-${assembly}.sh"
+ cat > "${script}" <<EOF
+#!/usr/bin/env bash
+
+#$ -l h_vmem=16G
+#$ -V
+#$ -S /bin/bash
+#$ -hold_jid "${name}.unspike"
+#$ -N "${name}.align.${assembly}"
+#$ -e "${logdir}"
+#$ -o "${logdir}"
+#$ -l hostname="supa*"
+
+set -eu
+
+cd "\${TMPDIR}"
+
+if [[ -e "${dir}/${name}.phiX.paired_unmapped_reads_1.fq" && -e "${dir}/${name}.phiX.paired_unmapped_reads_2.fq" && ! -e "${dir}/${name}.${assembly}.paired_pe.sam" ]]; then
+ "${bismarkdir}/bismark" --path_to_bowtie "${bowtiedir}/" --output_dir . --temp_dir . --basename "${name}.${assembly}.paired" -n 1 --non_directional -X 1000 "${ref}" -1 "${dir}/${name}.phiX.paired_unmapped_reads_1.fq" -2 "${dir}/${name}.phiX.paired_unmapped_reads_2.fq"
+fi
+
+for r in r1 r2; do
+ if [[ -e "${dir}/${name}.phiX.unpaired_\${r}_unmapped_reads.fq" && ! -e "${dir}/${name}.${assembly}.unpaired_\${r}.sam" ]]; then
+ "${bismarkdir}/bismark" --path_to_bowtie "${bowtiedir}/" --output_dir . --temp_dir . --basename "${name}.${assembly}.unpaired_\${r}" -n 1 --non_directional "${ref}" "${dir}/${name}.phiX.unpaired_\${r}_unmapped_reads.fq"
+ fi
+done
+
+mv "${name}.${assembly}".* "${dir}/"
+EOF
+ ${DISPATCH} "${script}"
+ fi
+}
+
+function coverage {
+ if [[ ! -s "${dir}/${name}.${assembly}.merged.cov" ]]; then
+ local script="${dir}/pipeline-coverage-${assembly}.sh"
+ cat > "${script}" <<EOF
+#!/usr/bin/env bash
+
+#$ -l h_vmem=8G
+#$ -V
+#$ -S /bin/bash
+#$ -hold_jid "${name}.align.${assembly}"
+#$ -N "${name}.cov.${assembly}"
+#$ -e "${logdir}"
+#$ -o "${logdir}"
+#$ -l hostname="supa*"
+
+set -eu
+set -o pipefail
+
+cd "\${TMPDIR}"
+
+# Generate any missing coverage files
+for pairing in paired_pe unpaired_r1 unpaired_r2; do
+ if [[ ! -s "${dir}/${name}.${assembly}.\${pairing}.bismark.cov" ]]; then
+ if [[ "\${pairing}" = "paired_pe" ]]; then
+ extra_args="-p --no_overlap"
+ else
+ extra_args=""
+ fi
+
+ "${bismarkdir}/bismark_methylation_extractor" \${extra_args} --comprehensive --merge_non_CpG --report "${dir}/${name}.${assembly}.\${pairing}.sam"
+ "${srcdir}/mybismark2bedGraph" --CX_context --buffer_size 4G -o "${name}.${assembly}.\${pairing}.bedGraph" {,Non_}"CpG_context_${name}.${assembly}.\${pairing}.txt"
+ mv -v "${name}.${assembly}.\${pairing}.bedGraph" "${dir}/"
+ mv -v "${name}.${assembly}.\${pairing}.bismark.cov" "${dir}/"
+ fi
+done
+
+# Sum coverages for paired and unpaired reads
+"${srcdir}/merge_bismark_coverage.py" --aggregate "${dir}/${name}.${assembly}".{paired_pe,unpaired_r1,unpaired_r2}.bismark.cov > "${name}.${assembly}.merged.cov"
+
+mv -v "${name}.${assembly}.merged.cov" "${dir}/"
+
+EOF
+ ${DISPATCH} "${script}"
+ fi
+}
+
+function process_dir {
+ # Set some globals that most functions will need to use
+ assembly="$1"
+ dir="$2"
+
+ name="$(basename "${dir}")"
+ name="${name#Sample_}"
+
+ logdir="${dir}/log"
+ mkdir -pv "${logdir}"
+
+ ref="${refdir}/bismark/${assembly}"
+
+ # Step 1a: trim
+ trim_sample
+
+ # Step 1b: remove phiX
+ unspike
+
+ # Step 2: align with bismark
+ align
+
+ # Step 3: measure methylation coverage
+ coverage
+}
+
+
+##### Run program #####
+process_dir "$1" "$2"

0 comments on commit 9b0eea7

Please sign in to comment.