/
Optimus.wdl
260 lines (229 loc) · 10.2 KB
/
Optimus.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
version 1.0
import "../../../tasks/skylab/FastqProcessing.wdl" as FastqProcessing
import "../../../tasks/skylab/StarAlign.wdl" as StarAlign
import "../../../tasks/skylab/Metrics.wdl" as Metrics
import "../../../tasks/skylab/RunEmptyDrops.wdl" as RunEmptyDrops
import "../../../tasks/skylab/CheckInputs.wdl" as OptimusInputChecks
import "../../../tasks/skylab/MergeSortBam.wdl" as Merge
import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils
workflow Optimus {
meta {
description: "The optimus 3' pipeline processes 10x genomics sequencing data based on the v2 chemistry. It corrects cell barcodes and UMIs, aligns reads, marks duplicates, and returns data as alignments in BAM format and as counts in sparse matrix exchange format."
}
input {
# Mode for counting either "sc_rna" or "sn_rna"
String counting_mode = "sc_rna"
# Sequencing data inputs
Array[File] r1_fastq
Array[File] r2_fastq
Array[File]? i1_fastq
String input_id
String output_bam_basename = input_id
String? input_name
String? input_id_metadata_field
String? input_name_metadata_field
# organism reference parameters
File tar_star_reference
File annotations_gtf
File? mt_genes
String? soloMultiMappers = "Uniform"
# Chemistry options include: 2 or 3
Int tenx_chemistry_version
# Whitelist is selected based on the input tenx_chemistry_version
File whitelist = checkOptimusInput.whitelist_out
# read_structure is based on v2 or v3 chemistry
String read_struct = checkOptimusInput.read_struct_out
# Emptydrops lower cutoff
Int emptydrops_lower = 100
# Set to true to override input checks and allow pipeline to proceed with invalid input
Boolean force_no_check = false
# Check that tenx_chemistry_version matches the length of the read 1 fastq;
# Set to true if you expect that r1_read_length does not match length of UMIs/barcodes for 10x chemistry v2 (26 bp) or v3 (28 bp).
Boolean ignore_r1_read_length = false
# Set to Forward, Reverse, or Unstranded to account for stranded library preparations (per STARsolo documentation)
String star_strand_mode = "Forward"
# Set to true to count reads aligned to exonic regions in sn_rna mode
Boolean count_exons = false
# this pipeline does not set any preemptible varibles and only relies on the task-level preemptible settings
# you could override the tasklevel preemptible settings by passing it as one of the workflows inputs
# for example: `"Optimus.StarAlign.preemptible": 3` will let the StarAlign task, which by default disables the
# usage of preemptible machines, attempt to request for preemptible instance up to 3 times.
}
# version of this pipeline
String pipeline_version = "6.6.2"
# this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
Array[Int] indices = range(length(r1_fastq))
# 10x parameters
File whitelist_v2 = "gs://gcp-public-data--broad-references/RNA/resources/737k-august-2016.txt"
File whitelist_v3 = "gs://gcp-public-data--broad-references/RNA/resources/3M-febrary-2018.txt"
# Takes the first read1 FASTQ from the inputs to check for chemistry match
File r1_single_fastq = r1_fastq[0]
parameter_meta {
r1_fastq: "forward read, contains cell barcodes and molecule barcodes"
r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA"
i1_fastq: "(optional) index read, for demultiplexing of multiple samples on one flow cell."
input_id: "name of sample matching this file, inserted into read group header"
input_id_metadata_field: "String that describes the metadata field containing the input_id"
input_name: "User provided sample name or cell_names"
input_name_metadata_field: "String that describes the metadata field containing the input_name"
tar_star_reference: "star genome reference"
annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)"
whitelist: "10x genomics cell barcode whitelist"
tenx_chemistry_version: "10X Genomics v2 (10 bp UMI) or v3 chemistry (12bp UMI)"
force_no_check: "Set to true to override input checks and allow pipeline to proceed with invalid input"
star_strand_mode: "STAR mode for handling stranded reads. Options are 'Forward', 'Reverse, or 'Unstranded.' Default is Forward."
}
call OptimusInputChecks.checkOptimusInput {
input:
force_no_check = force_no_check,
counting_mode = counting_mode,
count_exons = count_exons,
whitelist_v2 = whitelist_v2,
whitelist_v3 = whitelist_v3,
tenx_chemistry_version = tenx_chemistry_version,
r1_fastq = r1_single_fastq,
ignore_r1_read_length = ignore_r1_read_length
}
call StarAlign.STARGenomeRefVersion as ReferenceCheck {
input:
tar_star_reference = tar_star_reference
}
call FastqProcessing.FastqProcessing as SplitFastq {
input:
i1_fastq = i1_fastq,
r1_fastq = r1_fastq,
r2_fastq = r2_fastq,
whitelist = whitelist,
chemistry = tenx_chemistry_version,
sample_id = input_id,
read_struct = read_struct
}
scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) {
call StarAlign.STARsoloFastq as STARsoloFastq {
input:
r1_fastq = [SplitFastq.fastq_R1_output_array[idx]],
r2_fastq = [SplitFastq.fastq_R2_output_array[idx]],
star_strand_mode = star_strand_mode,
white_list = whitelist,
tar_star_reference = tar_star_reference,
chemistry = tenx_chemistry_version,
counting_mode = counting_mode,
count_exons = count_exons,
output_bam_basename = output_bam_basename + "_" + idx,
soloMultiMappers = soloMultiMappers
}
}
call Merge.MergeSortBamFiles as MergeBam {
input:
bam_inputs = STARsoloFastq.bam_output,
output_bam_filename = output_bam_basename + ".bam",
sort_order = "coordinate"
}
call Metrics.CalculateGeneMetrics as GeneMetrics {
input:
bam_input = MergeBam.output_bam,
mt_genes = mt_genes,
original_gtf = annotations_gtf,
input_id = input_id
}
call Metrics.CalculateCellMetrics as CellMetrics {
input:
bam_input = MergeBam.output_bam,
mt_genes = mt_genes,
original_gtf = annotations_gtf,
input_id = input_id
}
call StarAlign.MergeStarOutput as MergeStarOutputs {
input:
barcodes = STARsoloFastq.barcodes,
features = STARsoloFastq.features,
matrix = STARsoloFastq.matrix,
cell_reads = STARsoloFastq.cell_reads,
summary = STARsoloFastq.summary,
align_features = STARsoloFastq.align_features,
umipercell = STARsoloFastq.umipercell,
input_id = input_id,
counting_mode = counting_mode
}
if (counting_mode == "sc_rna"){
call RunEmptyDrops.RunEmptyDrops {
input:
sparse_count_matrix = MergeStarOutputs.sparse_counts,
row_index = MergeStarOutputs.row_index,
col_index = MergeStarOutputs.col_index,
emptydrops_lower = emptydrops_lower
}
}
if (!count_exons) {
call H5adUtils.OptimusH5adGeneration{
input:
input_id = input_id,
input_name = input_name,
input_id_metadata_field = input_id_metadata_field,
input_name_metadata_field = input_name_metadata_field,
annotation_file = annotations_gtf,
cell_metrics = CellMetrics.cell_metrics,
gene_metrics = GeneMetrics.gene_metrics,
sparse_count_matrix = MergeStarOutputs.sparse_counts,
cell_id = MergeStarOutputs.row_index,
gene_id = MergeStarOutputs.col_index,
empty_drops_result = RunEmptyDrops.empty_drops_result,
counting_mode = counting_mode,
pipeline_version = "Optimus_v~{pipeline_version}"
}
}
if (count_exons && counting_mode=="sn_rna") {
call StarAlign.MergeStarOutput as MergeStarOutputsExons {
input:
barcodes = STARsoloFastq.barcodes_sn_rna,
features = STARsoloFastq.features_sn_rna,
matrix = STARsoloFastq.matrix_sn_rna,
cell_reads = STARsoloFastq.cell_reads_sn_rna,
input_id = input_id,
counting_mode = "sc_rna",
summary = STARsoloFastq.summary_sn_rna,
align_features = STARsoloFastq.align_features_sn_rna,
umipercell = STARsoloFastq.umipercell_sn_rna,
input_id = input_id
}
call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{
input:
input_id = input_id,
input_name = input_name,
input_id_metadata_field = input_id_metadata_field,
input_name_metadata_field = input_name_metadata_field,
annotation_file = annotations_gtf,
cell_metrics = CellMetrics.cell_metrics,
gene_metrics = GeneMetrics.gene_metrics,
sparse_count_matrix = MergeStarOutputs.sparse_counts,
cell_id = MergeStarOutputs.row_index,
gene_id = MergeStarOutputs.col_index,
sparse_count_matrix_exon = MergeStarOutputsExons.sparse_counts,
cell_id_exon = MergeStarOutputsExons.row_index,
gene_id_exon = MergeStarOutputsExons.col_index,
pipeline_version = "Optimus_v~{pipeline_version}"
}
}
File final_h5ad_output = select_first([OptimusH5adGenerationWithExons.h5ad_output, OptimusH5adGeneration.h5ad_output])
output {
# version of this pipeline
String pipeline_version_out = pipeline_version
File genomic_reference_version = ReferenceCheck.genomic_ref_version
File bam = MergeBam.output_bam
File matrix = MergeStarOutputs.sparse_counts
File matrix_row_index = MergeStarOutputs.row_index
File matrix_col_index = MergeStarOutputs.col_index
File cell_metrics = CellMetrics.cell_metrics
File gene_metrics = GeneMetrics.gene_metrics
File? cell_calls = RunEmptyDrops.empty_drops_result
File? aligner_metrics = MergeStarOutputs.cell_reads_out
File? library_metrics = MergeStarOutputs.library_metrics
File? mtx_files = MergeStarOutputs.mtx_files
Array[File?] multimappers_EM_matrix = STARsoloFastq.multimappers_EM_matrix
Array[File?] multimappers_Uniform_matrix = STARsoloFastq.multimappers_Uniform_matrix
Array[File?] multimappers_Rescue_matrix = STARsoloFastq.multimappers_Rescue_matrix
Array[File?] multimappers_PropUnique_matrix = STARsoloFastq.multimappers_PropUnique_matrix
# h5ad
File h5ad_output_file = final_h5ad_output
}
}