/
GermlineCNVCaller.java
447 lines (405 loc) · 25.3 KB
/
GermlineCNVCaller.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
package org.broadinstitute.hellbender.tools.copynumber;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection;
import org.broadinstitute.hellbender.cmdline.argumentcollections.OptionalIntervalArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.*;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;
import java.io.File;
import java.util.*;
import java.util.stream.Collectors;
/**
* Calls copy-number variants in germline samples given their counts and the corresponding output of
* {@link DetermineGermlineContigPloidy}. The former should be either HDF5 or TSV count files generated by
* {@link CollectReadCounts}.
*
* <h3>Introduction</h3>
*
* <p>Reliable detection of copy-number variation (CNV) from read-depth ("coverage" or "counts") data such as whole
* exome sequencing (WES), whole genome sequencing (WGS), and custom gene sequencing panels requires a comprehensive
* model to account for technical variation in library preparation and sequencing. The Bayesian model and the associated
* inference scheme implemented in {@link GermlineCNVCaller} includes provisions for inferring and explaining away much
* of the technical variation. Furthermore, CNV calling confidence is automatically adjusted for each sample and
* genomic interval.
*
* <p>The parameters of the probabilistic model for read-depth bias and variance estimation (hereafter, "the coverage
* model") can be automatically inferred by {@link GermlineCNVCaller} by providing a cohort of germline samples
* sequenced using the same sequencing platform and library preparation protocol (in case of WES, the same capture
* kit). We refer to this run mode of the tool as the <b>COHORT</b> mode. The number of samples required for the
* COHORT mode depends on several factors such as the sequencing depth, tissue type/quality and similarity in the cohort,
* and the stringency of following the library preparation and sequencing protocols. For WES and WGS samples, we
* recommend including at least 30 samples.</p>
*
* <p>The parametrized coverage model can be used for CNV calling on future case samples provided that they are
* strictly compatible with the cohort used to generate the model parameters (in terms of tissue type(s), library
* preparation and sequencing protocols). We refer to this mode as the <b>CASE</b> run mode. There is no lower
* limit on the number of samples for running {@link GermlineCNVCaller} in the case mode.</p>
*
* <p>In both tool run modes, {@link GermlineCNVCaller} requires karyotyping and global read-depth information for
* all samples. Such information can be automatically generated by running {@link DetermineGermlineContigPloidy}
* on all samples, and passed on to {@link GermlineCNVCaller} by providing the ploidy output calls using the argument
* {@code contig-ploidy-calls}. The ploidy state of a contig is used as the <em>baseline</em>
* ("default") copy-number state of all intervals contained in that contig for the corresponding sample. All other
* copy-number states are treated as alternative states and get equal shares from the total alternative state
* probability (set using the {@code p-alt} argument).</p>
*
* <h3>Python environment setup</h3>
*
* <p>The computation done by this tool, aside from input data parsing and validation, is performed outside of the Java
* Virtual Machine and using the <em>gCNV computational python module</em>, namely {@code gcnvkernel}. It is crucial that
* the user has properly set up a python conda environment with {@code gcnvkernel} and its dependencies
* installed. If the user intends to run {@link GermlineCNVCaller} using one of the official GATK Docker images,
* the python environment is already set up. Otherwise, the environment must be created and activated as described in the
* main GATK README.md file.</p>
*
* <h3>Tool run modes</h3>
* <dl>
* <dt>COHORT mode:</dt>
* <dd><p>The tool will be run in the COHORT mode using the argument {@code run-mode COHORT}.
* In this mode, coverage model parameters are inferred simultaneously with the CNV states. Depending on
* available memory, it may be necessary to run the tool over a subset of all intervals, which can be specified
* by -L; this can be used to pass a filtered interval list produced by {@link FilterIntervals} to mask
* intervals from modeling. The specified intervals must be present in all of the input count files. The output
* will contain two subdirectories, one ending with "-model" and the other with "-calls".</p>
*
* <p>The model subdirectory contains a snapshot of the inferred parameters of the coverage model, which may be
* used later for CNV calling in one or more similarly-sequenced samples as mentioned earlier. Optionally, the path
* to a previously obtained coverage model parametrization can be provided via the {@code model} argument
* in the COHORT mode, in which case, the provided parameters will be only used for model initialization and
* a new parametrization will be generated based on the input count files. Furthermore, the genomic intervals are
* set to those used in creating the previous parameterization and interval-related arguments will be ignored.
* Note that the newly obtained parametrization ultimately reflects the input count files from the last run,
* regardless of whether or not an initialization parameter set is given. If the users wishes to model coverage
* data from two or more cohorts simultaneously, all of the input counts files must be given to the tool at once.
*
* <p>The calls subdirectory contains sequentially-ordered subdirectories for each sample, each listing various
* sample-specific estimated quantities such as the probability of various copy-number states for each interval,
* a parametrization of the GC curve, sample-specific unexplained variance, read-depth, and loadings of
* coverage bias factors.</p></dd>
*
* <dt>CASE mode:</dt>
* <dd><p>The tool will be run in the CASE mode using the argument {@code run-mode CASE}. The path to a previously
* obtained model directory must be provided via the {@code model} argument in this mode. The modeled intervals are
* then specified by a file contained in the model directory, all interval-related arguments are ignored in this
* mode, and all model intervals must be present in all of the input count files. The tool output in the CASE mode
* is only the "-calls" subdirectory and is organized similarly to the COHORT mode.</p>
*
* <p>Note that at the moment, this tool does not automatically verify the compatibility of the provided parametrization
* with the provided count files. Model compatibility may be assessed a posteriori by inspecting the magnitude of
* sample-specific unexplained variance of each sample, and asserting that they lie within the same range as
* those obtained from the cohort used to generate the parametrization. This manual step is expected to be made
* automatic in the future.</p>
* </dl>
*
* <h3>Important Remarks</h3>
* <dl>
* <dt>Choice of hyperparameters:</dt>
* <dd><p>The quality of coverage model parametrization and the sensitivity/precision of germline CNV calling are
* sensitive to the choice of model hyperparameters, including the prior probability of alternative copy-number states
* (set using the {@code p-alt} argument), prevalence of active (i.e. CNV-rich) intervals (set via the
* {@code p-active} argument), the coherence length of CNV events and active/silent domains
* across intervals (set using the {@code cnv-coherence-length} and {@code class-coherence-length} arguments,
* respectively), and the typical scale of interval- and sample-specific unexplained variance
* (set using the {@code interval-psi-scale} and {@code sample-psi-scale} arguments, respectively). It is crucial
* to note that these hyperparameters are <em>not</em> universal and must be tuned for each sequencing protocol
* and properly set at runtime.</p></dd>
*
* <dt>Running {@link GermlineCNVCaller} on a subset of intervals:</dt>
* <dd><p>As mentioned earlier, it may be necessary to run the tool over a subset of all intervals depending on
* available memory. The number of intervals must be large enough to include a genomically diverse set of regions
* for reliable inference of the GC bias curve, as well as other bias factors. For WES and WGS, we recommend no less
* than 10000 consecutive intervals spanning at least 10 - 50 mb.</p></dd>
*
* <dt>Memory requirements for the python subprocess ("gcnvkernel"):</dt>
* <dd><p>The computation done by this tool, for the most part, is performed outside of JVM and via a spawned
* python subprocess. The Java heap memory is only used for loading sample counts and preparing raw data for the
* python subprocess. The user must ensure that the machine has enough free physical memory for spawning and executing
* the python subprocess. Generally speaking, the resource requirements of this tool scale linearly with each of the
* number of samples, the number of modeled intervals, the highest copy number state, the number of bias factors, and
* the number of knobs on the GC curve. For example, the python subprocess requires approximately 16GB of physical memory
* for modeling 10000 intervals for 100 samples, with 16 maximum bias factors, maximum copy-number state of 10, and
* explicit GC bias modeling.</p></dd>
* </dl>
*
* <h3>Usage examples</h3>
*
* <p>COHORT mode:</p>
* <pre>
* gatk GermlineCNVCaller \
* --run-mode COHORT \
* -L intervals.interval_list \
* --interval-merging-rule OVERLAPPING_ONLY \
* --contig-ploidy-calls path_to_contig_ploidy_calls \
* --input normal_1.counts.hdf5 \
* --input normal_2.counts.hdf5 \
* ... \
* --output output_dir \
* --output-prefix normal_cohort_run
* </pre>
*
* <p>CASE mode:</p>
* <pre>
* gatk GermlineCNVCaller \
* --run-mode CASE \
* --contig-ploidy-calls path_to_contig_ploidy_calls \
* --model previous_model_path \
* --input normal_1.counts.hdf5 \
* ... \
* --output output_dir \
* --output-prefix normal_case_run
* </pre>
*
* @author Mehrtash Babadi <mehrtash@broadinstitute.org>
* @author Samuel Lee <slee@broadinstitute.org>
*/
@CommandLineProgramProperties(
summary = "Calls copy-number variants in germline samples given their counts and " +
"the output of DetermineGermlineContigPloidy",
oneLineSummary = "Calls copy-number variants in germline samples given their counts and " +
"the output of DetermineGermlineContigPloidy",
programGroup = CopyNumberProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public final class GermlineCNVCaller extends CommandLineProgram {
public enum RunMode {
COHORT, CASE
}
public static final String COHORT_DENOISING_CALLING_PYTHON_SCRIPT = "cohort_denoising_calling.py";
public static final String CASE_SAMPLE_CALLING_PYTHON_SCRIPT = "case_denoising_calling.py";
//name of the interval file output by the python code in the model directory
public static final String INPUT_MODEL_INTERVAL_FILE = "interval_list.tsv";
public static final String MODEL_PATH_SUFFIX = "-model";
public static final String CALLS_PATH_SUFFIX = "-calls";
public static final String TRACKING_PATH_SUFFIX = "-tracking";
public static final String CONTIG_PLOIDY_CALLS_DIRECTORY_LONG_NAME = "contig-ploidy-calls";
public static final String RUN_MODE_LONG_NAME = "run-mode";
@Argument(
doc = "Input read-count files containing integer read counts in genomic intervals for all samples. " +
"All intervals specified via -L must be contained; " +
"if none are specified, then intervals must be identical and in the same order for all samples.",
fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
minElements = 1
)
private List<File> inputReadCountFiles = new ArrayList<>();
@Argument(
doc = "Tool run-mode.",
fullName = RUN_MODE_LONG_NAME
)
private RunMode runMode;
@Argument(
doc = "Input contig-ploidy calls directory (output of DetermineGermlineContigPloidy).",
fullName = CONTIG_PLOIDY_CALLS_DIRECTORY_LONG_NAME
)
private String inputContigPloidyCallsDir;
@Argument(
doc = "Input denoising-model directory. In the COHORT mode, this argument is optional and if provided," +
"a new model will be built using this input model to initialize. In the CASE mode, the denoising " +
"model parameters set to this input model and therefore, this argument is required.",
fullName = CopyNumberStandardArgument.MODEL_LONG_NAME,
optional = true
)
private String inputModelDir = null;
@Argument(
doc = "Input annotated-interval file containing annotations for GC content in genomic intervals " +
"(output of AnnotateIntervals). All intervals specified via -L must be contained. " +
"This input should not be provided if an input denoising-model directory is given (the latter " +
"already contains the annotated-interval file).",
fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME,
optional = true
)
private File inputAnnotatedIntervalsFile = null;
@Argument(
doc = "Prefix for output filenames.",
fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME
)
private String outputPrefix;
@Argument(
doc = "Output directory.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
)
private String outputDir;
@ArgumentCollection
protected IntervalArgumentCollection intervalArgumentCollection
= new OptionalIntervalArgumentCollection();
@Advanced
@ArgumentCollection
private GermlineDenoisingModelArgumentCollection germlineDenoisingModelArgumentCollection =
new GermlineDenoisingModelArgumentCollection();
@Advanced
@ArgumentCollection
private GermlineCallingArgumentCollection germlineCallingArgumentCollection
= new GermlineCallingArgumentCollection();
@Advanced
@ArgumentCollection
private GermlineCNVHybridADVIArgumentCollection germlineCNVHybridADVIArgumentCollection
= new GermlineCNVHybridADVIArgumentCollection();
private SimpleIntervalCollection specifiedIntervals;
private File specifiedIntervalsFile;
@Override
protected void onStartup() {
/* check for successful import of gcnvkernel */
PythonScriptExecutor.checkPythonEnvironmentForPackage("gcnvkernel");
}
@Override
protected Object doWork() {
validateArguments();
//read in count files, validate they contain specified subset of intervals, and output
//count files for these intervals to temporary files
final List<File> intervalSubsetReadCountFiles = writeIntervalSubsetReadCountFiles();
//call python inference code
final boolean pythonReturnCode = executeGermlineCNVCallerPythonScript(intervalSubsetReadCountFiles);
if (!pythonReturnCode) {
throw new UserException("Python return code was non-zero.");
}
logger.info("Germline denoising and CNV calling complete.");
return "SUCCESS";
}
private void validateArguments() {
inputReadCountFiles.forEach(IOUtils::canReadFile);
Utils.validateArg(inputReadCountFiles.size() == new HashSet<>(inputReadCountFiles).size(),
"List of input read-count files cannot contain duplicates.");
if (inputModelDir != null) {
Utils.validateArg(new File(inputModelDir).exists(),
String.format("Input denoising-model directory %s does not exist.", inputModelDir));
}
if (inputModelDir != null) {
//intervals are retrieved from the input model directory
specifiedIntervalsFile = new File(inputModelDir, INPUT_MODEL_INTERVAL_FILE);
IOUtils.canReadFile(specifiedIntervalsFile);
specifiedIntervals = new SimpleIntervalCollection(specifiedIntervalsFile);
} else {
//get sequence dictionary and intervals from the first read-count file to use to validate remaining files
//(this first file is read again below, which is slightly inefficient but is probably not worth the extra code)
final File firstReadCountFile = inputReadCountFiles.get(0);
specifiedIntervals = CopyNumberArgumentValidationUtils.resolveIntervals(
firstReadCountFile, intervalArgumentCollection, logger);
//in cohort mode, intervals are specified via -L; we write them to a temporary file
specifiedIntervalsFile = IOUtils.createTempFile("intervals", ".tsv");
//get GC content (null if not provided)
final AnnotatedIntervalCollection subsetAnnotatedIntervals =
CopyNumberArgumentValidationUtils.validateAnnotatedIntervalsSubset(
inputAnnotatedIntervalsFile, specifiedIntervals, logger);
if (subsetAnnotatedIntervals != null) {
logger.info("GC-content annotations for intervals found; explicit GC-bias correction will be performed...");
subsetAnnotatedIntervals.write(specifiedIntervalsFile);
} else {
logger.info("No GC-content annotations for intervals found; explicit GC-bias correction will not be performed...");
specifiedIntervals.write(specifiedIntervalsFile);
}
}
if (runMode.equals(RunMode.COHORT)) {
logger.info("Running the tool in the COHORT mode...");
Utils.validateArg(inputReadCountFiles.size() > 1, "At least two samples must be provided in the " +
"COHORT mode");
if (inputModelDir != null) {
logger.info("(advanced feature) A denoising-model directory is provided in the COHORT mode; " +
"using the model for initialization and ignoring specified and/or annotated intervals.");
}
} else { // case run-mode
logger.info("Running the tool in the CASE mode...");
Utils.validateArg(inputModelDir != null, "An input denoising-model directory must be provided in the " +
"CASE mode.");
if (intervalArgumentCollection.intervalsSpecified()) {
throw new UserException.BadInput("Invalid combination of inputs: Running in CASE mode, " +
"but intervals were provided.");
}
if (inputAnnotatedIntervalsFile != null) {
throw new UserException.BadInput("Invalid combination of inputs: Running in CASE mode," +
"but annotated intervals were provided.");
}
}
Utils.nonNull(outputPrefix);
Utils.validateArg(new File(inputContigPloidyCallsDir).exists(),
String.format("Input contig-ploidy calls directory %s does not exist.", inputContigPloidyCallsDir));
Utils.validateArg(new File(outputDir).exists(),
String.format("Output directory %s does not exist.", outputDir));
germlineCallingArgumentCollection.validate();
germlineDenoisingModelArgumentCollection.validate();
germlineCNVHybridADVIArgumentCollection.validate();
}
private List<File> writeIntervalSubsetReadCountFiles() {
logger.info("Validating and aggregating data from input read-count files...");
final int numSamples = inputReadCountFiles.size();
final ListIterator<File> inputReadCountFilesIterator = inputReadCountFiles.listIterator();
final List<File> intervalSubsetReadCountFiles = new ArrayList<>(numSamples);
final Set<SimpleInterval> intervalSubset = new HashSet<>(specifiedIntervals.getRecords());
while (inputReadCountFilesIterator.hasNext()) {
final int sampleIndex = inputReadCountFilesIterator.nextIndex();
final File inputReadCountFile = inputReadCountFilesIterator.next();
logger.info(String.format("Aggregating read-count file %s (%d / %d)",
inputReadCountFile, sampleIndex + 1, numSamples));
final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile);
if (!CopyNumberArgumentValidationUtils.isSameDictionary(
readCounts.getMetadata().getSequenceDictionary(), specifiedIntervals.getMetadata().getSequenceDictionary())) {
logger.warn("Sequence dictionary for read-count file %s does not match that " +
"in other read-count files.", inputReadCountFile);
}
final List<SimpleCount> subsetReadCounts = readCounts.getRecords().stream()
.filter(c -> intervalSubset.contains(c.getInterval()))
.collect(Collectors.toList());
Utils.validateArg(subsetReadCounts.size() == intervalSubset.size(),
String.format("Intervals for read-count file %s do not contain all specified intervals.",
inputReadCountFile));
final File intervalSubsetReadCountFile = IOUtils.createTempFile("sample-" + sampleIndex, ".tsv");
new SimpleCountCollection(readCounts.getMetadata(), subsetReadCounts)
.write(intervalSubsetReadCountFile);
intervalSubsetReadCountFiles.add(intervalSubsetReadCountFile);
}
return intervalSubsetReadCountFiles;
}
private boolean executeGermlineCNVCallerPythonScript(final List<File> intervalSubsetReadCountFiles) {
final PythonScriptExecutor executor = new PythonScriptExecutor(true);
final String outputDirArg = Utils.nonEmpty(outputDir).endsWith(File.separator)
? outputDir
: outputDir + File.separator; //add trailing slash if necessary
//add required arguments
final List<String> arguments = new ArrayList<>(Arrays.asList(
"--ploidy_calls_path=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyCallsDir),
"--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX),
"--output_tracking_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX)));
//if a model path is given, add it to the argument (both COHORT and CASE modes)
if (inputModelDir != null) {
arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir));
}
final String script;
if (runMode == RunMode.COHORT) {
script = COHORT_DENOISING_CALLING_PYTHON_SCRIPT;
//these are the annotated intervals, if provided
arguments.add("--modeling_interval_list=" + CopyNumberUtils.getCanonicalPath(specifiedIntervalsFile));
arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX));
if (inputAnnotatedIntervalsFile != null) {
arguments.add("--enable_explicit_gc_bias_modeling=True");
} else {
arguments.add("--enable_explicit_gc_bias_modeling=False");
}
} else {
script = CASE_SAMPLE_CALLING_PYTHON_SCRIPT;
// in the case mode, explicit gc bias modeling is set by the model
}
arguments.add("--read_count_tsv_files");
arguments.addAll(intervalSubsetReadCountFiles.stream().map(CopyNumberUtils::getCanonicalPath).collect(Collectors.toList()));
arguments.addAll(germlineDenoisingModelArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineCallingArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineCNVHybridADVIArgumentCollection.generatePythonArguments());
return executor.executeScript(
new Resource(script, GermlineCNVCaller.class),
null,
arguments);
}
}