From e4b234df5d0cb0ffb35dae246442d238fe2d37ac Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 27 Jun 2016 22:29:07 -0400 Subject: [PATCH] =?UTF-8?q?-=20remove=20separate=20BQ=20cutoff=20for=20het?= =?UTF-8?q?=20sensitivity=20in=20WGS=20metrics.=20This=20=E2=80=A6=20(#577?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * - remove separate BQ cutoff for het sensitivity in WGS metrics. This causes incorrect depth and therefore sensitivity results whenever it's relevant. Also, the 17 cutoff in GATK only refers to the graph building, for the pair HMM there is no minimum quality. - added a test to verify that the base-based exclusions affect the coverage and base-quality distributions correctly * fixed broken test. problem was that by using a separate histogram for the het sensitivity, exclusions were not taken into account in the base-distribution, but they were in the depth. --- src/java/picard/analysis/CollectWgsMetrics.java | 102 +++++++++------- .../CollectWgsMetricsWithNonZeroCoverage.java | 4 +- .../CollectWgsMetricsFromSampledSitesTest.java | 13 +- .../picard/analysis/CollectWgsMetricsTest.java | 89 +++++++++++++++++- 4 files changed, 139 insertions(+), 69 deletions(-) diff --git a/src/java/picard/analysis/CollectWgsMetrics.java b/src/java/picard/analysis/CollectWgsMetrics.java index f5c142f22..0b76735a6 100644 --- a/src/java/picard/analysis/CollectWgsMetrics.java +++ b/src/java/picard/analysis/CollectWgsMetrics.java @@ -39,6 +39,7 @@ import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityUtil; import htsjdk.samtools.util.SamLocusIterator; +import htsjdk.samtools.util.SequenceUtil; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -98,7 +99,8 @@ @Option(shortName = "MQ", doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true) public int MINIMUM_MAPPING_QUALITY = 
20; - @Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true) + @Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage. N bases will be treated as having a base quality " + + "of negative infinity and will therefore be excluded from coverage regardless of the value of this parameter.", overridable = true) public int MINIMUM_BASE_QUALITY = 20; @Option(shortName = "CAP", doc = "Treat positions with coverage exceeding this value as if they had coverage at this value (but calculate the difference for PCT_EXC_CAPPED).", overridable = true) @@ -253,7 +255,7 @@ protected int doWork() { // Check that the reference is not N final byte base = ref.getBases()[info.getPosition() - 1]; - if (base == 'N') continue; + if (SequenceUtil.isNoCall(base)) continue; // add to the collector collector.addInfo(info, ref); @@ -297,9 +299,8 @@ protected WgsMetricsCollector getCollector(final int coverageCap) { protected class WgsMetricsCollector { - protected final long[] histogramArray; - private final long[] baseQHistogramArray; - private final long[] baseQHetSensHistogram; + protected final long[] depthHistogramArray; + private final long[] baseQHistogramArray; private long basesExcludedByBaseq = 0; private long basesExcludedByOverlap = 0; @@ -307,9 +308,8 @@ protected WgsMetricsCollector getCollector(final int coverageCap) { protected final int coverageCap; public WgsMetricsCollector(final int coverageCap) { - histogramArray = new long[coverageCap + 1]; + depthHistogramArray = new long[coverageCap + 1]; baseQHistogramArray = new long[Byte.MAX_VALUE]; - baseQHetSensHistogram = new long[Byte.MAX_VALUE]; this.coverageCap = coverageCap; } @@ -318,14 +318,10 @@ public void addInfo(final SamLocusIterator.LocusInfo info, final ReferenceSequen // Figure out the coverage while not counting overlapping reads twice, and excluding various things final HashSet readNames = new 
HashSet<>(info.getRecordAndPositions().size()); int pileupSize = 0; - int pileupSizeForBaseQHetSens = 0; for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) { - pileupSizeForBaseQHetSens++; - if(pileupSizeForBaseQHetSens <= coverageCap) { - baseQHetSensHistogram[recs.getRecord().getBaseQualities()[recs.getOffset()]]++; - } - if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) { ++basesExcludedByBaseq; continue; } + if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY || + SequenceUtil.isNoCall(recs.getReadBase())) { ++basesExcludedByBaseq; continue; } if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; } pileupSize++; @@ -334,9 +330,9 @@ public void addInfo(final SamLocusIterator.LocusInfo info, final ReferenceSequen } } - final int depth = Math.min(readNames.size(), coverageCap); - if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - coverageCap; - histogramArray[depth]++; + final int depth = Math.min(pileupSize, coverageCap); + if (depth < pileupSize) basesExcludedByCapping += pileupSize - coverageCap; + depthHistogramArray[depth]++; } public void addToMetricsFile(final MetricsFile file, @@ -352,16 +348,7 @@ public void addToMetricsFile(final MetricsFile file, } protected void addBaseQHistogram(final MetricsFile file) { - // Construct and write the outputs - final Histogram baseQHistogram = new Histogram<>("value", "baseq_count"); - - for (int i = 0; i < baseQHistogramArray.length; ++i) { - baseQHistogram.increment(i, baseQHistogramArray[i]); - } - - if (INCLUDE_BQ_HISTOGRAM) { - file.addHistogram(baseQHistogram); - } + file.addHistogram(getBaseQHistogram()); } protected void addMetricsToFile(final MetricsFile file, @@ -378,11 +365,19 @@ protected void addMetricsToFile(final MetricsFile file, } protected Histogram getDepthHistogram() { - final Histogram depthHistogram = new Histogram<>("coverage", "count"); - for (int i=0; i getBaseQHistogram() { + return 
getHistogram(baseQHistogramArray, "value", "baseq_count"); + } + + private Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { + final Histogram histogram = new Histogram<>(binLabel, valueLabel); + for (int i = 0; i < array.length; ++i) { + histogram.increment(i, array[i]); } - return depthHistogram; + return histogram; } protected WgsMetrics getMetrics(final Histogram depthHistogram, @@ -391,17 +386,6 @@ protected WgsMetrics getMetrics(final Histogram depthHistogram, final CountingPairedFilter pairFilter) { // the base q het histogram - final Histogram baseQHetHistogram = new Histogram<>("value", "baseq_count"); - final int BASEQ_MAX = 50; - final Integer[] x = new Integer[BASEQ_MAX]; - IntStream.range(0, BASEQ_MAX).forEach(i -> x[i] = i); - baseQHetHistogram.prefillBins(x); - - // Haplotype caller uses 17 as a baseQ cut off, so we are too. Everything below 17 is squashed into the '0' bin. - final int BASEQ_MIN_CUTOFF = 17; - for (int i=0; i depthHistogram, metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes; metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes; - - metrics.PCT_1X = MathUtil.sum(histogramArray, 1, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_5X = MathUtil.sum(histogramArray, 5, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_10X = MathUtil.sum(histogramArray, 10, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_15X = MathUtil.sum(histogramArray, 15, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_20X = MathUtil.sum(histogramArray, 20, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_25X = MathUtil.sum(histogramArray, 25, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_30X = MathUtil.sum(histogramArray, 30, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_40X = MathUtil.sum(histogramArray, 40, 
histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_50X = MathUtil.sum(histogramArray, 50, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_60X = MathUtil.sum(histogramArray, 60, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_70X = MathUtil.sum(histogramArray, 70, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_80X = MathUtil.sum(histogramArray, 80, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_90X = MathUtil.sum(histogramArray, 90, histogramArray.length) / (double) metrics.GENOME_TERRITORY; - metrics.PCT_100X = MathUtil.sum(histogramArray, 100, histogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_1X = MathUtil.sum(depthHistogramArray, 1, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_5X = MathUtil.sum(depthHistogramArray, 5, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_10X = MathUtil.sum(depthHistogramArray, 10, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_15X = MathUtil.sum(depthHistogramArray, 15, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_20X = MathUtil.sum(depthHistogramArray, 20, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_25X = MathUtil.sum(depthHistogramArray, 25, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_30X = MathUtil.sum(depthHistogramArray, 30, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_40X = MathUtil.sum(depthHistogramArray, 40, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_50X = MathUtil.sum(depthHistogramArray, 50, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_60X = MathUtil.sum(depthHistogramArray, 60, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_70X = 
MathUtil.sum(depthHistogramArray, 70, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_80X = MathUtil.sum(depthHistogramArray, 80, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_90X = MathUtil.sum(depthHistogramArray, 90, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; + metrics.PCT_100X = MathUtil.sum(depthHistogramArray, 100, depthHistogramArray.length) / (double) metrics.GENOME_TERRITORY; // Get Theoretical Het SNP Sensitivity - final double [] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); - final double [] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHetHistogram); + final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(getBaseQHistogram()); metrics.HET_SNP_SENSITIVITY = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, SAMPLE_SIZE, LOG_ODDS_THRESHOLD); - metrics.HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1-metrics.HET_SNP_SENSITIVITY)); + metrics.HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - metrics.HET_SNP_SENSITIVITY)); return metrics; } } - } diff --git a/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index 8f396fbdc..a829cc5f2 100644 --- a/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ b/src/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -158,8 +158,8 @@ public void addToMetricsFile(final MetricsFile file, private Histogram depthHistogramNonZero() { final Histogram depthHistogram = new Histogram<>("coverage", "count"); // do not include the zero-coverage bin - for (int i = 1; i> collectWgsMetricsOutput = new MetricsFile>(); + final MetricsFile> collectWgsMetricsOutput = new MetricsFile<>(); collectWgsMetricsOutput.read(new 
FileReader(collectWgsOutfile)); for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) { diff --git a/src/tests/java/picard/analysis/CollectWgsMetricsTest.java b/src/tests/java/picard/analysis/CollectWgsMetricsTest.java index 5a070cae4..87d5f9333 100644 --- a/src/tests/java/picard/analysis/CollectWgsMetricsTest.java +++ b/src/tests/java/picard/analysis/CollectWgsMetricsTest.java @@ -8,6 +8,7 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordSetBuilder; import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.Histogram; import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; import org.testng.Assert; import org.testng.annotations.BeforeTest; @@ -189,7 +190,7 @@ public void testLargeIntervals() throws IOException { }; Assert.assertEquals(runPicardCommandLine(args), 0); - final MetricsFile> output = new MetricsFile>(); + final MetricsFile> output = new MetricsFile<>(); output.read(new FileReader(outfile)); for (final CollectWgsMetrics.WgsMetrics metrics : output.getMetrics()) { @@ -197,7 +198,93 @@ public void testLargeIntervals() throws IOException { Assert.assertEquals(metrics.PCT_EXC_MAPQ, 0.271403); Assert.assertEquals(metrics.PCT_EXC_DUPE, 0.182149); Assert.assertEquals(metrics.PCT_EXC_UNPAIRED, 0.091075); + } + } + + @Test + public void testExclusions() throws IOException { + final File reference = new File("testdata/picard/sam/merger.fasta"); + final File tempSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); + tempSamFile.deleteOnExit(); + + final SAMFileHeader header = new SAMFileHeader(); + //Check that dictionary file is readable and then set header dictionary + try { + header.setSequenceDictionary(SAMSequenceDictionaryExtractor.extractDictionary(reference)); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + } catch (final SAMException e) { + e.printStackTrace(); } + + //Set readGroupRecord + final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(READ_GROUP_ID); + 
readGroupRecord.setSample(SAMPLE); + readGroupRecord.setPlatform(PLATFORM); + readGroupRecord.setLibrary(LIBRARY); + readGroupRecord.setPlatformUnit(READ_GROUP_ID); + header.addReadGroup(readGroupRecord); + + //Add to setBuilder + final SAMRecordSetBuilder setBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, true, 100); + setBuilder.setReadGroup(readGroupRecord); + setBuilder.setUseNmFlag(true); + setBuilder.setHeader(header); + + setBuilder.setReadLength(10); + + int expectedSingltonCoverage = 0; + + expectedSingltonCoverage += 13; + setBuilder.addPair("overlappingReads", 0, 2, 5, false, false, "10M", "10M", true, false, 30); + + expectedSingltonCoverage += 2 * 5; // 5 bases for each mate are good (see AAA!!!AA!! below). + setBuilder.addPair("poorQualityReads", 1, 2, 20, false, false, "10M", "10M", true, false, -1); + + for(int i = 1; i < 5; i++) { + setBuilder.addPair("deepStack-" + i, 2, 2, 20, false, false, "10M", "10M", true, false, 30); + } + + // modify quality of reads + setBuilder.getRecords().stream() + .filter(samRecord -> samRecord.getReadName().equals("poorQualityReads")) + .forEach(record -> record.setBaseQualityString("AAA!!!AA!!")); + + setBuilder.getSamReader(); + + // Write SAM file + final SAMFileWriter writer = new SAMFileWriterFactory() + .setCreateIndex(true).makeBAMWriter(header, false, tempSamFile); + + for (final SAMRecord record : setBuilder) { + writer.addAlignment(record); + } + writer.close(); + + // create output files for tests + final File outfile = File.createTempFile("testWgsMetrics", ".txt"); + outfile.deleteOnExit(); + + final String[] args = new String[] { + "INPUT=" + tempSamFile.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), + "INCLUDE_BQ_HISTOGRAM=true", + "COVERAGE_CAP=3" + }; + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(outfile)); + final 
CollectWgsMetrics.WgsMetrics metrics = output.getMetrics().get(0); + + final Histogram depthHistogram = output.getAllHistograms().get(0); + final Histogram baseQHistogram = output.getAllHistograms().get(1); + + Assert.assertEquals((long) depthHistogram.getSumOfValues(), metrics.GENOME_TERRITORY); + Assert.assertEquals(baseQHistogram.getSumOfValues(), depthHistogram.getSum()); + Assert.assertEquals((long) depthHistogram.get(1).getValue(), expectedSingltonCoverage); + Assert.assertEquals((long) depthHistogram.get(3).getValue(), 2*10); + } } \ No newline at end of file