/
CollectGcBiasMetrics.java
219 lines (194 loc) · 12 KB
/
CollectGcBiasMetrics.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import picard.metrics.GcBiasMetrics;
import picard.util.RExecutor;
import java.io.File;
import java.text.NumberFormat;
import java.util.List;
import java.util.Set;
/**
* Tool to collect information about GC bias in the reads in a given BAM file. Computes
* the number of windows (of size specified by SCAN_WINDOW_SIZE) in the genome at each GC%
* and counts the number of read starts in each GC bin. What is output and plotted is
* the "normalized coverage" in each bin - i.e. the number of reads per window normalized
* to the average number of reads per window across the whole genome.
*
* @author Tim Fennell
* edited by Kylee Bergin
*/
@CommandLineProgramProperties(
summary = CollectGcBiasMetrics.USAGE_SUMMARY + CollectGcBiasMetrics.USAGE_DETAILS,
oneLineSummary = CollectGcBiasMetrics.USAGE_SUMMARY,
programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class CollectGcBiasMetrics extends SinglePassSamProgram {
static final String USAGE_SUMMARY = "Collect metrics regarding GC bias. ";
static final String USAGE_DETAILS = "This tool collects information about the relative proportions of guanine (G) and cytosine (C)" +
" nucleotides in a sample. Regions of high and low G + C content have been shown to interfere with mapping/aligning," +
" ultimately leading to fragmented genome assemblies and poor coverage in a phenomenon known as 'GC bias'. " +
"Detailed information on the effects of GC bias on the collection and analysis of sequencing data can be found at " +
"DOI: 10.1371/journal.pone.0062856/.<br /><br />" +
"" +
"<p>The GC bias statistics are always output in a detailed long-form version, but a summary can also be produced. Both the " +
"detailed metrics and the summary metrics are output as tables '.txt' files) and an accompanying chart that plots the " +
"data ('.pdf' file). </p> " +
"" +
"<h4>Detailed metrics</h4>" +
"The table of detailed metrics includes GC percentages for each bin (GC), the percentage of WINDOWS corresponding to each " +
"GC bin of the reference sequence, the numbers of reads that start within a particular %GC content bin (READ_STARTS), and " +
"the mean base quality of the reads that correspond to a specific GC content distribution window (MEAN_BASE_QUALITY). " +
"NORMALIZED_COVERAGE is a relative measure of sequence coverage by the reads at a particular GC content." +
"" +
"For each run, the corresponding reference sequence is divided into bins or windows based on the percentage of G + C" +
" content ranging from 0 - 100%. The percentages of G + C are determined from a defined length of sequence; the default " +
"value is set at 100 bases. The mean of the distribution will vary among organisms; human DNA has a mean GC content " +
"of 40%, suggesting a slight preponderance of AT-rich regions. <br /><br />" +
"" +
"<h4>Summary metrics</h4>" +
"The table of summary metrics captures run-specific bias information including WINDOW_SIZE, ALIGNED_READS, TOTAL_CLUSTERS, " +
"AT_DROPOUT, and GC_DROPOUT. While WINDOW_SIZE refers to the numbers of bases used for the distribution (see above), the " +
"ALIGNED_READS and TOTAL_CLUSTERS are the total number of aligned reads and the total number of reads (after filtering) " +
"produced in a run. In addition, the tool produces both AT_DROPOUT and GC_DROPOUT metrics, which indicate the percentage of " +
"misaligned reads that correlate with low (%-GC is < 50%) or high (%-GC is > 50%) GC content respectively. <br /><br />" +
"" +
"The percentage of 'coverage' or depth in a GC bin is calculated by dividing the number of reads of a particular GC content " +
"by the mean number of reads of all GC bins. A number of 1 represents mean coverage, a number less than 1 represents lower " +
"than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than 1 represents higher than " +
"mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average). " +
"" +
"This tool also tracks mean base-quality scores of the reads within each GC content bin, enabling the user to determine " +
"how base quality scores vary with GC content. <br /> <br />"+
"" +
"The chart output associated with this data table plots the NORMALIZED_COVERAGE, the distribution of WINDOWs corresponding " +
"to GC percentages, and base qualities corresponding to each %GC bin."+
"<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p>" +
"<h4>Usage Example:</h4>"+
"<pre>" +
"java -jar picard.jar CollectGcBiasMetrics \\<br />"+
" I=input.bam \\<br />"+
" O=gc_bias_metrics.txt \\<br />"+
" CHART=gc_bias_metrics.pdf \\<br />"+
" S=summary_metrics.txt \\<br />"+
" R=reference_sequence.fasta"+
"</pre>"+
"Please see <a href='https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics'>" +
"the GcBiasMetrics documentation</a> for further explanations of each metric." +
"<hr />";
/** The location of the R script to do the plotting. */
private static final String R_SCRIPT = "picard/analysis/gcBias.R";
// Usage and parameters
@Argument(shortName = "CHART", doc = "The PDF file to render the chart to.")
public File CHART_OUTPUT;
@Argument(shortName = "S", doc = "The text file to write summary metrics to.")
public File SUMMARY_OUTPUT;
@Argument(shortName = "WINDOW_SIZE", doc = "The size of the scanning windows on the reference genome that are used to bin reads.")
public int SCAN_WINDOW_SIZE = 100;
@Argument(shortName = "MGF", doc = "For summary metrics, exclude GC windows that include less than this fraction of the genome.")
public double MINIMUM_GENOME_FRACTION = 0.00001;
@Argument(shortName = "BS", doc = "Whether the SAM or BAM file consists of bisulfite sequenced reads.")
public boolean IS_BISULFITE_SEQUENCED = false;
@Argument(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics.")
public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
@Argument(shortName = "ALSO_IGNORE_DUPLICATES", doc = "Use to get additional results without duplicates. This option " +
"allows to gain two plots per level at the same time: one is the usual one and the other excludes duplicates.")
public boolean ALSO_IGNORE_DUPLICATES = false;
// Calculates GcBiasMetrics for all METRIC_ACCUMULATION_LEVELs provided
private GcBiasMetricsCollector multiCollector;
// Bins for the histograms to track the number of windows at each GC, and the number of read starts
// at bins of each GC %. Need 101 to get from 0-100.
private static final int BINS = 101;
@Override
protected String[] customCommandLineValidation() {
if (!checkRInstallation(CHART_OUTPUT != null)) {
return new String[]{"R is not installed on this machine. It is required for creating the chart."};
}
return super.customCommandLineValidation();
}
/////////////////////////////////////////////////////////////////////////////
// Setup calculates windowsByGc for the entire reference. Must be done at
// startup to avoid missing reference contigs in the case of small files
// that may not have reads aligning to every reference contig.
/////////////////////////////////////////////////////////////////////////////
@Override
protected void setup(final SAMFileHeader header, final File samFile) {
IOUtil.assertFileIsWritable(CHART_OUTPUT);
IOUtil.assertFileIsWritable(SUMMARY_OUTPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
//Calculate windowsByGc for the reference sequence
final int[] windowsByGc = GcBiasUtils.calculateRefWindowsByGc(BINS, REFERENCE_SEQUENCE, SCAN_WINDOW_SIZE);
//Delegate actual collection to GcBiasMetricCollector
multiCollector = new GcBiasMetricsCollector(METRIC_ACCUMULATION_LEVEL, windowsByGc, header.getReadGroups(), SCAN_WINDOW_SIZE, IS_BISULFITE_SEQUENCED, ALSO_IGNORE_DUPLICATES);
}
////////////////////////////////////////////////////////////////////////////
// MultiCollector acceptRead
////////////////////////////////////////////////////////////////////////////
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
multiCollector.acceptRecord(rec, ref);
}
/////////////////////////////////////////////////////////////////////////////
// Write out all levels of normalized coverage metrics to a file
/////////////////////////////////////////////////////////////////////////////
@Override
protected void finish() {
multiCollector.finish();
writeResultsToFiles();
}
private void writeResultsToFiles() {
final MetricsFile<GcBiasMetrics, Integer> file = getMetricsFile();
final MetricsFile<GcBiasDetailMetrics, ?> detailMetricsFile = getMetricsFile();
final MetricsFile<GcBiasSummaryMetrics, ?> summaryMetricsFile = getMetricsFile();
multiCollector.addAllLevelsToFile(file);
final List<GcBiasMetrics> gcBiasMetricsList = file.getMetrics();
for (final GcBiasMetrics gcbm : gcBiasMetricsList) {
final List<GcBiasDetailMetrics> gcDetailList = gcbm.DETAILS.getMetrics();
for (final GcBiasDetailMetrics d : gcDetailList) {
detailMetricsFile.addMetric(d);
}
summaryMetricsFile.addMetric(gcbm.SUMMARY);
}
detailMetricsFile.write(OUTPUT);
summaryMetricsFile.write(SUMMARY_OUTPUT);
final NumberFormat fmt = NumberFormat.getIntegerInstance();
fmt.setGroupingUsed(true);
RExecutor.executeFromClasspath(R_SCRIPT,
OUTPUT.getAbsolutePath(),
SUMMARY_OUTPUT.getAbsolutePath(),
CHART_OUTPUT.getAbsolutePath().replaceAll("%", "%%"),
String.valueOf(SCAN_WINDOW_SIZE));
}
}