Skip to content
Permalink
Browse files

Update workflows

  • Loading branch information...
mjsteinbaugh committed Jul 26, 2019
1 parent 49b4c7d commit dc833e0f8215d6f465cc0b085b53c89935bc86c7
@@ -1,11 +1,11 @@
#!/bin/sh

# Prepare bcbio samples.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Prepare bcbio samples.
## Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=bcbio # Job name.
#SBATCH --partition=medium # Partition (queue).
@@ -18,8 +18,8 @@
#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID.
#SBATCH --mail-type=ALL # Type of email notification (BEGIN, END, FAIL, ALL).

# This script requires sratoolkit (fastq-dump).
# Match the number of cores to the number of samples.
## This script requires sratoolkit (fastq-dump).
## Match the number of cores to the number of samples.

bcbio_prepare_samples.py \
--csv "samples.csv" \
@@ -1,14 +1,14 @@
#!/bin/sh

# Install Ensembl genome for bcbio.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Install Ensembl genome for bcbio.
## Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

# Use `highmem` partition instead if there are memory issues with HISAT2.
# `medium` partition has a memory limit of 250 GB.
## Use `highmem` partition instead if there are memory issues with HISAT2.
## `medium` partition has a memory limit of 250 GB.

#SBATCH --job-name=bcbio_genome # Job name.
#SBATCH --partition=medium # Partition name.
@@ -1,11 +1,11 @@
#!/bin/sh

# Run bcbio fast RNA-seq pipeline.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Run bcbio fast RNA-seq pipeline.
## Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=fastrnaseq # Job name.
#SBATCH --partition=medium # Partition name.
@@ -1,11 +1,11 @@
#!/bin/sh

# Run bcbio RNA-seq pipeline.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Run bcbio RNA-seq pipeline.
## Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=rnaseq # Job name.
#SBATCH --partition=medium # Partition name.
@@ -1,11 +1,11 @@
#!/bin/sh

# Run bcbio single-cell RNA-seq pipeline.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Run bcbio single-cell RNA-seq pipeline.
## Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=scrnaseq # Job name.
#SBATCH --partition=medium # Partition name.
@@ -1,27 +1,27 @@
#!/bin/sh

# Start bcbio run on HMS O2 cluster using IPython and slurm.
# Modified 2019-07-09.
## Start bcbio run on HMS O2 cluster using IPython and slurm.
## Updated 2019-07-09.

# Configure the bcbio run.
# A directory named "bcbio" will be created.
# SC2035: Use ./*glob* or -- *glob* so names with dashes won't become options.
## Configure the bcbio run.
## A directory named "bcbio" will be created.
## SC2035: Use ./*glob* or -- *glob* so names with dashes won't become options.
bcbio_nextgen.py -w template bcbio.yaml bcbio.csv ./*.fastq.gz

# Traverse into the work directory.
## Traverse into the work directory.
(
    ## Run inside a subshell so the caller's working directory is unchanged.
    ## Abort the subshell if the work directory is missing.
    cd bcbio/work || exit 1
    ## Name of the sbatch script, defined once so that the symlink created
    ## here and the file submitted below always refer to the same name.
    sbatch_script="sbatch-bcbio.sh"
    ## Symlink our sbatch script from two directory levels up.
    ln -s "../../${sbatch_script}" .
    ## Now ready to start the run using slurm.
    sbatch "$sbatch_script"
)

# This will check the run status.
## This will check the run status.
squeue -u "$USER"
sshare -U

# Check the job status.
# > sprio -j JOBID
# > less *.err
# > tree
## Check the job status.
## > sprio -j JOBID
## > less *.err
## > tree
@@ -1,5 +1,5 @@
#!/bin/sh

# Assuming a local run on multi-core VM.
# Use n - 2 cores.
## Assuming a local run on multi-core VM.
## Use n - 2 cores.
## CPU_COUNT must be provided by the environment. Fail early with a clear
## message instead of passing a meaningless core count (e.g. -2) to bcbio.
: "${CPU_COUNT:?CPU_COUNT must be set to the number of available cores}"
## Run bcbio locally, leaving 2 cores free.
bcbio_nextgen.py ../config/bcbio.yaml -t local -n "$((CPU_COUNT - 2))"
@@ -1,6 +1,6 @@
# Template for fast RNA-seq using Illumina prepared samples.
# This will only run salmon and not perform any alignment or quality control.
# Modified 2019-06-21.
## Template for fast RNA-seq using Illumina prepared samples.
## This will only run salmon and not perform any alignment or quality control.
## Updated 2019-07-26.
---
details:
- analysis: fastrna-seq
@@ -1,75 +1,76 @@
# Template for RNA-seq using Illumina prepared samples.
# Modified 2019-06-21.
## Template for RNA-seq using Illumina prepared samples.
## Updated 2019-07-26.
---
details:
- analysis: RNA-seq
genome_build: hg38
algorithm:
# Recommending HISAT2 instead of STAR for GRCh38/hg38.
# STAR still has issues with ALT contigs.
# https://github.com/alexdobin/STAR/issues/39
#
# Recommending STAR instead of HISAT2 for GRCh37/hg19.
# Note that bcbio only supports a single aligner per run.
## Recommending HISAT2 instead of STAR for GRCh38/hg38.
## STAR still has issues with ALT contigs.
## https://github.com/alexdobin/STAR/issues/39
## Recommending STAR instead of HISAT2 for GRCh37/hg19.
## Note that bcbio only supports a single aligner per run.
aligner: hisat2

# Recommend using salmon/kallisto over HISAT2/STAR for counts.
# Refer to the bcbioRNASeq R package documentation for more information.
# Salmon is currently run by default.
# Note that this setting will run both salmon and kallisto.
## Recommend using salmon/kallisto over HISAT2/STAR for counts.
## Refer to the bcbioRNASeq R package documentation for more information.
## Salmon is currently run by default.
## Note that this setting will run both salmon and kallisto.
expression_caller: [salmon, kallisto]

# Stranded library configuration.
# This setting depends on your library preparation method.
# Supported: "unstranded", "firststrand", "secondstrand".
# Unstranded is the current default.
#
# TruSeq Stranded mRNA is commonly used.
# For that kit, use "firststrand" (fr-firststrand).
# Note that for HISAT2, fr-firststrand corresponds to `--rna-strandedness RF`.
#
# Unsure about the library preparation?
# Run the pipeline on a single sample and check the BAM.
#
# See also:
# - https://ccb.jhu.edu/software/hisat2/manual.shtml
# - https://salmon.readthedocs.io/en/latest/library_type.html
# - https://github.com/bcbio/bcbio-nextgen/blob/master/bcbio/ngsalign/hisat2.py
# - https://github.com/bcbio/bcbio-nextgen/blob/master/bcbio/rnaseq/salmon.py
# - https://www.biostars.org/p/262027/
# - https://www.biostars.org/p/297399/
# - https://support.illumina.com/sequencing/sequencing_kits/truseq-stranded-mrna-workflow.html
# - http://seqanswers.com/forums/showthread.php?t=29542
# - https://www.illumina.com/documents/products/technotes/RNASeqAnalysisTopHat.pdf
## Stranded library configuration.
## This setting depends on your library preparation method.
##
## Supported: "unstranded", "firststrand", "secondstrand".
## Unstranded is the current default.
##
## TruSeq Stranded mRNA is commonly used.
## For that kit, use "firststrand" (fr-firststrand).
##
## HISAT2: `fr-firststrand` corresponds to `--rna-strandedness RF`.
##
## Unsure about the library preparation?
## Run the pipeline on a single sample and check the BAM.
##
## See also:
## - https://ccb.jhu.edu/software/hisat2/manual.shtml
## - https://salmon.readthedocs.io/en/latest/library_type.html
## - https://github.com/bcbio/bcbio-nextgen/blob/master/bcbio/ngsalign/hisat2.py
## - https://github.com/bcbio/bcbio-nextgen/blob/master/bcbio/rnaseq/salmon.py
## - https://www.biostars.org/p/262027/
## - https://www.biostars.org/p/297399/
## - https://support.illumina.com/sequencing/sequencing_kits/truseq-stranded-mrna-workflow.html
## - http://seqanswers.com/forums/showthread.php?t=29542
## - https://www.illumina.com/documents/products/technotes/RNASeqAnalysisTopHat.pdf
strandedness: unstranded

# `quality_format` tells bcbio what quality format is used in the FASTQ files.
# This is set to "standard" by default.
# If your samples were sequenced any time past 2009 or so, use standard.
## Quality format used in the FASTQ files. This is set to "standard" by
## default. If you are unsure and your samples were sequenced any time
## past 2009 or so, use "standard".
quality_format: standard

# Note that adapter trimming is no longer recommended, and is disabled by default.
## Adapter trimming is no longer recommended, and is disabled by default.
trim_reads: False

# Uncomment these if adapter trimming needs to be performed.
# trim_reads: read_through
# adapters: [truseq, polya]
## Uncomment these if adapter trimming needs to be performed.
## > trim_reads: read_through
## > adapters: [truseq, polya]

# Uncomment if you want to provide a custom transcriptome FASTA file.
# transcriptome_fasta: transcriptome.fa
## Uncomment if you want to provide a custom transcriptome FASTA file.
## > transcriptome_fasta: transcriptome.fa

# Uncomment if you want to provide a custom transcriptome GTF file.
# transcriptome_gtf: transcriptome.gtf
## Uncomment if you want to provide a custom transcriptome GTF file.
## > transcriptome_gtf: transcriptome.gtf

# Uncomment if you want to provide a FASTA file of spike-ins to quantify.
# spikein_fasta: spikeins.fa
## Uncomment if you want to provide a FASTA file of spike-ins.
## > spikein_fasta: spikeins.fa

# Automatically save a bcbioRNASeq S4 object for R.
# tools_on: [bcbiornaseq]
# bcbiornaseq:
# organism: homo sapiens
# interesting_groups: [treatment, genotype]
## Automatically save a bcbioRNASeq S4 object for R.
## > tools_on: [bcbiornaseq]
## > bcbiornaseq:
## > organism: homo sapiens
## > interesting_groups: [treatment, genotype]

# Where to upload (save) the final output.
## Where to upload (save) the final output.
upload:
dir: ../final
@@ -1,39 +1,36 @@
# Template for single-cell RNA-seq using Illumina prepared samples.
#
# Sample barcodes:
# Use the reverse complement (revcomp) with bcbio.
# https://github.com/steinbaugh/koopa/blob/develop/workflows/bcbio/data/umis
## Template for single-cell RNA-seq using Illumina prepared samples.
## Updated 2019-07-26.
---
details:
- analysis: scRNA-seq
genome_build: hg38
algorithm:
# Note that bcbio uses "indrop" instead of "indrops".
## Note that bcbio uses "indrop" instead of "indrops".
umi_type: harvard-indrop-v3

# Specify sample barcodes for multiplexed FASTQs.
# Note that you need to define the reverse complement sequences here.
# inDrops v3: https://github.com/steinbaugh/koopa/blob/master/workflows/indrops/harvard_v3_sample_barcodes.csv
# For bcbioSingleCell R package, use the forward sequences instead.
# This is clearer in reports and what NIH prefers for GEO submissions.
## Specify sample barcodes for multiplexed FASTQs.
## Note that you need to define the reverse complement sequences here.
## inDrops v3: https://github.com/steinbaugh/koopa/blob/master/workflows/indrops/harvard_v3_sample_barcodes.csv
## For bcbioSingleCell R package, use the forward sequences instead.
## This is clearer in reports and what NIH prefers for GEO submissions.
sample_barcodes: sample_barcodes_reverse_complement.txt

# RapMap is the default quantifier.
# Note that specifying this can cause some older bcbio releases to error.
## RapMap is the default quantifier.
## Note that specifying this can cause some older bcbio releases to error.
singlecell_quantifier: rapmap

# Barcode correction is enabled by default, and is recommended.
## Barcode correction is enabled by default, and is recommended.
cellular_barcode_correction: 1

# Note that the bcbio documentation suggests using a cutoff of 10,000.
# This is often too high for inDrops data sets.
# I recommend setting a lower threshold and filtering instead in R.
## Note that the bcbio documentation suggests using a cutoff of 10,000.
## This is often too high for inDrops data sets.
## I recommend setting a lower threshold and filtering instead in R.
minimum_barcode_depth: 1000

# Uncomment if you want to use a custom FASTA/GTF combo.
# transcriptome_fasta: Homo_sapiens.GRCh38.cdna.all.fa
# transcriptome_gtf: Homo_sapiens.GRCh38.95.gtf
## Uncomment if you want to use a custom FASTA/GTF combo.
## > transcriptome_fasta: Homo_sapiens.GRCh38.cdna.all.fa
## > transcriptome_gtf: Homo_sapiens.GRCh38.95.gtf

# Where to upload (save) the final output.
## Where to upload (save) the final output.
upload:
dir: ../final
@@ -1,12 +1,12 @@
#!/bin/sh

# Run Cell Ranger.
# Harvard O2 cluster.
# Modified 2019-06-21.
## Run Cell Ranger.
## Harvard O2 cluster.
## Updated 2019-06-21.

# See also:
# - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
# - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count
## See also:
## - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq
## - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count

module load bcl2fastq/2.20.0.422
module load cellranger/2.1.1
@@ -1,10 +1,10 @@
#!/bin/bash

# Run Cell Ranger on Harvard O2 cluster.
# Modified 2019-06-21.
## Run Cell Ranger on Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=cellranger # Job name.
#SBATCH --partition=medium # Partition name.
@@ -1,10 +1,10 @@
#!/bin/bash

# Run R script on Harvard O2 cluster.
# Modified 2019-06-21.
## Run R script on Harvard O2 cluster.
## Updated 2019-06-21.

# SLURM
# https://slurm.schedmd.com/sbatch.html
## SLURM
## https://slurm.schedmd.com/sbatch.html

#SBATCH --job-name=rscript # Job name.
#SBATCH --partition=priority # Partition name.

0 comments on commit dc833e0

Please sign in to comment.
You can’t perform that action at this time.