diff --git a/src/java/picard/analysis/CollectWgsMetrics.java b/src/java/picard/analysis/CollectWgsMetrics.java index f5c142f22..0b76735a6 100644 --- a/src/java/picard/analysis/CollectWgsMetrics.java +++ b/src/java/picard/analysis/CollectWgsMetrics.java @@ -39,6 +39,7 @@ import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityUtil; import htsjdk.samtools.util.SamLocusIterator; +import htsjdk.samtools.util.SequenceUtil; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -98,7 +99,8 @@ @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true) public int MINIMUM_MAPPING_QUALITY = 20; - @Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true) + @Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage. N bases will be treated as having a base quality " + + "of negative infinity and will therefore be excluded from coverage regardless of the value of this parameter.", overridable = true) public int MINIMUM_BASE_QUALITY = 20; @Option(shortName = "CAP", doc = "Treat positions with coverage exceeding this value as if they had coverage at this value (but calculate the difference for PCT_EXC_CAPPED).", overridable = true) @@ -253,7 +255,7 @@ protected int doWork() { // Check that the reference is not N final byte base = ref.getBases()[info.getPosition() - 1]; - if (base == 'N') continue; + if (SequenceUtil.isNoCall(base)) continue; // add to the collector collector.addInfo(info, ref); @@ -297,9 +299,8 @@ protected WgsMetricsCollector getCollector(final int coverageCap) { protected class WgsMetricsCollector { - protected final long[] histogramArray; - private final long[] baseQHistogramArray; - private final long[] baseQHetSensHistogram; + protected final long[] depthHistogramArray; + private final long[] baseQHistogramArray; private long basesExcludedByBaseq = 0; private long basesExcludedByOverlap = 0; @@ -307,9 +308,8 @@ protected WgsMetricsCollector getCollector(final int coverageCap) { protected final int coverageCap; public WgsMetricsCollector(final int coverageCap) { - histogramArray = new long[coverageCap + 1]; + depthHistogramArray = new long[coverageCap + 1]; baseQHistogramArray = new long[Byte.MAX_VALUE]; - baseQHetSensHistogram = new long[Byte.MAX_VALUE]; this.coverageCap = coverageCap; } @@ -318,14 +318,10 @@ public void addInfo(final SamLocusIterator.LocusInfo info, final ReferenceSequen // Figure out the coverage while not counting overlapping reads twice, and excluding various things final HashSet readNames = new HashSet<>(info.getRecordAndPositions().size()); int pileupSize = 0; - int pileupSizeForBaseQHetSens = 0; for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) { - pileupSizeForBaseQHetSens++; - if(pileupSizeForBaseQHetSens <= coverageCap) { - baseQHetSensHistogram[recs.getRecord().getBaseQualities()[recs.getOffset()]]++; - } - if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) { ++basesExcludedByBaseq; continue; } + if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY || + SequenceUtil.isNoCall(recs.getReadBase())) { ++basesExcludedByBaseq; continue; } if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; } pileupSize++; @@ -334,9 +330,9 @@ public void addInfo(final SamLocusIterator.LocusInfo info, final ReferenceSequen } } - final int depth = Math.min(readNames.size(), coverageCap); - if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - coverageCap; - histogramArray[depth]++; + final int depth = Math.min(pileupSize, coverageCap); + if (depth < pileupSize) basesExcludedByCapping += pileupSize - coverageCap; + depthHistogramArray[depth]++; } public void addToMetricsFile(final MetricsFile file, @@ -352,16 +348,7 @@ public void addToMetricsFile(final MetricsFile file, } protected void addBaseQHistogram(final MetricsFile file) { - // Construct and write the outputs - final Histogram baseQHistogram = new Histogram<>("value", "baseq_count"); - - for (int i = 0; i < baseQHistogramArray.length; ++i) { - baseQHistogram.increment(i, baseQHistogramArray[i]); - } - - if (INCLUDE_BQ_HISTOGRAM) { - file.addHistogram(baseQHistogram); - } + file.addHistogram(getBaseQHistogram()); } protected void addMetricsToFile(final MetricsFile file, @@ -378,11 +365,19 @@ protected void addMetricsToFile(final MetricsFile file, } protected Histogram getDepthHistogram() { - final Histogram depthHistogram = new Histogram<>("coverage", "count"); - for (int i=0; i getBaseQHistogram() { + return getHistogram(baseQHistogramArray, "value", "baseq_count"); + } + + private Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { + final Histogram histogram = new Histogram<>(binLabel, valueLabel); + for (int i = 0; i < array.length; ++i) { + histogram.increment(i, array[i]); } - return depthHistogram; + return histogram; } protected WgsMetrics getMetrics(final Histogram depthHistogram, @@ -391,17 +386,6 @@ protected WgsMetrics getMetrics(final Histogram depthHistogram, final CountingPairedFilter pairFilter) { // the base q het histogram - final Histogram baseQHetHistogram = new Histogram<>("value", "baseq_count"); - final int BASEQ_MAX = 50; - final Integer[] x = new Integer[BASEQ_MAX]; - IntStream.range(0, BASEQ_MAX).forEach(i -> x[i] = i); - baseQHetHistogram.prefillBins(x); - - // Haplotype caller uses 17 as a baseQ cut off, so we are too. Everything below 17 is squashed into the '0' bin. - final int BASEQ_MIN_CUTOFF = 17; - for (int i=0; i depthHistogram, metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes; metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes; - - metrics.PCT_1X = MathUtil.sum(histogramArray, 1, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_5X = MathUtil.sum(histogramArray, 5, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_10X = MathUtil.sum(histogramArray, 10, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_15X = MathUtil.sum(histogramArray, 15, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_20X = MathUtil.sum(histogramArray, 20, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_25X = MathUtil.sum(histogramArray, 25, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_30X = MathUtil.sum(histogramArray, 30, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_40X = MathUtil.sum(histogramArray, 40, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_50X = MathUtil.sum(histogramArray, 50, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_60X = MathUtil.sum(histogramArray, 60, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_70X = MathUtil.sum(histogramArray, 70, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_80X = MathUtil.sum(histogramArray, 80, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_90X = MathUtil.sum(histogramArray, 90, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_100X = MathUtil.sum(histogramArray, 100, histogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_1X = MathUtil.sum(depthHistogramArray, 1, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_5X = MathUtil.sum(depthHistogramArray, 5, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_10X = MathUtil.sum(depthHistogramArray, 10, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_15X = MathUtil.sum(depthHistogramArray, 15, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_20X = MathUtil.sum(depthHistogramArray, 20, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_25X = MathUtil.sum(depthHistogramArray, 25, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_30X = MathUtil.sum(depthHistogramArray, 30, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_40X = MathUtil.sum(depthHistogramArray, 40, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_50X = MathUtil.sum(depthHistogramArray, 50, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_60X = MathUtil.sum(depthHistogramArray, 60, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_70X = MathUtil.sum(depthHistogramArray, 70, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_80X = MathUtil.sum(depthHistogramArray, 80, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_90X = MathUtil.sum(depthHistogramArray, 90, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_100X = MathUtil.sum(depthHistogramArray, 100, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; // Get Theoretical Het SNP Sensitivity - final double [] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); - final double [] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHetHistogram); + final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(getBaseQHistogram()); metrics.HET_SNP_SENSITIVITY = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, SAMPLE_SIZE, LOG_ODDS_THRESHOLD); - metrics.HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1-metrics.HET_SNP_SENSITIVITY)); + metrics.HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - metrics.HET_SNP_SENSITIVITY)); return metrics; } } - } diff --git a/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index 8f396fbdc..a829cc5f2 100644 --- a/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ b/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -158,8 +158,8 @@ public void addToMetricsFile(final MetricsFile file, private Histogram depthHistogramNonZero() { final Histogram depthHistogram = new Histogram<>("coverage", "count"); // do not include the zero-coverage bin - for (int i = 1; i> collectWgsMetricsOutput = new MetricsFile>(); + final MetricsFile> collectWgsMetricsOutput = new MetricsFile<>(); collectWgsMetricsOutput.read(new FileReader(collectWgsOutfile)); for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) { diff --git a/src/tests/java/picard/analysis/CollectWgsMetricsTest.java b/src/tests/java/picard/analysis/CollectWgsMetricsTest.java index 5a070cae4..87d5f9333 100644 --- a/src/tests/java/picard/analysis/CollectWgsMetricsTest.java +++ b/src/tests/java/picard/analysis/CollectWgsMetricsTest.java @@ -8,6 +8,7 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordSetBuilder; import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.Histogram; import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; import org.testng.Assert; import org.testng.annotations.BeforeTest; @@ -189,7 +190,7 @@ public void testLargeIntervals() throws IOException { }; Assert.assertEquals(runPicardCommandLine(args), 0); - final MetricsFile> output = new MetricsFile>(); + final MetricsFile> output = new MetricsFile<>(); output.read(new FileReader(outfile)); for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) { @@ -197,7 +198,93 @@ public void testLargeIntervals() throws IOException { Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403); Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149); Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075); + } + } + + @Test + public void testExclusions() throws IOException { + final File reference = new File("testdata/picard/sam/merger.fasta"); + final File tempSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); + tempSamFile.deleteOnExit(); + + final SAMFileHeader header = new SAMFileHeader(); + //Check that dictionary file is readable and then set header dictionary + try { + header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(reference)); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + } catch (final SAMException e) { + e.printStackTrace(); } + + //Set readGroupRecord + final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(READ_GROUP_ID); + readGroupRecord.setSample(SAMPLE); + readGroupRecord.setPlatform(PLATFORM); + readGroupRecord.setLibrary(LIBRARY); + readGroupRecord.setPlatformUnit(READ_GROUP_ID); + header.addReadGroup(readGroupRecord); + + //Add to setBuilder + final SAMRecordSetBuilder setBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, true, 100); + setBuilder.setReadGroup(readGroupRecord); + setBuilder.setUseNmFlag(true); + setBuilder.setHeader(header); + + setBuilder.setReadLength(10); + + int expectedSingltonCoverage = 0; + + expectedSingltonCoverage += 13; + setBuilder.addPair("overlappingReads", 0, 2, 5, false, false, "10M", "10M", true, false, 30); + + expectedSingltonCoverage += 2 * 5; // 5 bases for each mate are good (see AAA!!!AA!! below). + setBuilder.addPair("poorQualityReads", 1, 2, 20, false, false, "10M", "10M", true, false, -1); + + for(int i = 1; i < 5; i++) { + setBuilder.addPair("deepStack-" + i, 2, 2, 20, false, false, "10M", "10M", true, false, 30); + } + + // modify quality of reads + setBuilder.getRecords().stream() + .filter(samRecord -> samRecord.getReadName().equals("poorQualityReads")) + .forEach(record -> record.setBaseQualityString("AAA!!!AA!!")); + + setBuilder.getSamReader(); + + // Write SAM file + final SAMFileWriter writer = new SAMFileWriterFactory() + .setCreateIndex(true).makeBAMWriter(header, false, tempSamFile); + + for (final SAMRecord record : setBuilder) { + writer.addAlignment(record); + } + writer.close(); + + // create output files for tests + final File outfile = File.createTempFile("testWgsMetrics", ".txt"); + outfile.deleteOnExit(); + + final String[] args = new String[] { + "INPUT=" + tempSamFile.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), + "INCLUDE_BQ_HISTOGRAM=true", + "COVERAGE_CAP=3" + }; + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(outfile)); + final CollectWgsMetrics.WgsMetrics metrics = output.getMetrics().get(0); + + final Histogram depthHistogram = output.getAllHistograms().get(0); + final Histogram baseQHistogram = output.getAllHistograms().get(1); + + Assert.assertEquals((long) depthHistogram.getSumOfValues(), metrics.GENOME_TERRITORY); + Assert.assertEquals(baseQHistogram.getSumOfValues(), depthHistogram.getSum()); + Assert.assertEquals((long) depthHistogram.get(1).getValue(), expectedSingltonCoverage); + Assert.assertEquals((long) depthHistogram.get(3).getValue(), 2*10); + } } \ No newline at end of file