Permalink
Browse files

Merge pull request #809 from broadinstitute/eb_aggreagate_variant_met…

…rics

Added a new tool to merge a collection of variant calling metrics
  • Loading branch information...
2 parents b4a02aa + bfeb7f5 commit bb529afcb5757a39a14e66d922acad1f081e4948 @eitanbanks eitanbanks committed on GitHub May 16, 2017
@@ -0,0 +1,134 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2017 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package picard.vcf;
+
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.*;
+import picard.PicardException;
+import picard.cmdline.CommandLineProgram;
+import picard.cmdline.CommandLineProgramProperties;
+import picard.cmdline.Option;
+import picard.cmdline.StandardOptionDefinitions;
+import picard.cmdline.programgroups.Metrics;
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Combines multiple Variant Calling Metrics files into a single file.
+ * @author Eric Banks
+ */
+@CommandLineProgramProperties(
+ usage = "Combines multiple Variant Calling Metrics files into a single file. This tool is used in cases where the metrics are calculated" +
+ " separately for different (genomic) shards of the same callset and we want to combine them into a single result over the entire callset." +
+ " The shards are expected to contain the same samples (although it will not fail if they do not) and to not have been run over overlapping genomic positions.",
+ usageShort = "Combines multiple Variant Calling Metrics files into a single file",
+ programGroup = Metrics.class
+)
+public class AccumulateVariantCallingMetrics extends CommandLineProgram {
+
+ @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Paths (except for the file extensions) of Variant Calling Metrics files to read and merge.", minElements=1)
+ public List<File> INPUT;
+
+ @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Path (except for the file extension) of output metrics files to write.")
+ public File OUTPUT;
+
+ @Override
+ protected int doWork() {
+
+ final String outputPrefix = OUTPUT.getAbsolutePath() + ".";
+ final File detailOutputFile = new File(outputPrefix + CollectVariantCallingMetrics.VariantCallingDetailMetrics.getFileExtension());
+ final File summaryOutputFile = new File(outputPrefix + CollectVariantCallingMetrics.VariantCallingSummaryMetrics.getFileExtension());
+ IOUtil.assertFileIsWritable(detailOutputFile);
+ IOUtil.assertFileIsWritable(summaryOutputFile);
+
+ // set up the collectors
+ final Map<String, Collection<CollectVariantCallingMetrics.VariantCallingDetailMetrics>> sampleDetailsMap = new HashMap<>();
+ final Collection<CollectVariantCallingMetrics.VariantCallingSummaryMetrics> summaries = new ArrayList<>();
+
+ for (final File file : INPUT) {
+ final String inputPrefix = file.getAbsolutePath() + ".";
+
+ try {
+ // read in the detailed metrics file
+ final File detail = new File(inputPrefix + CollectVariantCallingMetrics.VariantCallingDetailMetrics.getFileExtension());
+ IOUtil.assertFileIsReadable(detail);
+ MetricsFile<CollectVariantCallingMetrics.VariantCallingDetailMetrics, ?> detailedMetricsFile = getMetricsFile();
+ detailedMetricsFile.read(new FileReader(detail));
+
+ // for each sample in the detailed metrics...
+ long totalHetDepth = 0L;
+ for (final CollectVariantCallingMetrics.VariantCallingDetailMetrics detailedMetrics : detailedMetricsFile.getMetrics()) {
+ // re-calculate internal fields from derived fields
+ detailedMetrics.calculateFromDerivedFields();
+ totalHetDepth += detailedMetrics.TOTAL_HET_DEPTH;
+
+ // add it to the list of metrics for that sample so that we can merge them later
+ sampleDetailsMap.computeIfAbsent(detailedMetrics.SAMPLE_ALIAS, f -> new ArrayList<>()).add(detailedMetrics);
+ }
+
+ // next, read in the summary metrics
+ final File summary = new File(inputPrefix + CollectVariantCallingMetrics.VariantCallingSummaryMetrics.getFileExtension());
+ IOUtil.assertFileIsReadable(summary);
+ MetricsFile<CollectVariantCallingMetrics.VariantCallingSummaryMetrics, ?> summaryMetricsFile = getMetricsFile();
+ summaryMetricsFile.read(new FileReader(summary));
+ if (summaryMetricsFile.getMetrics().size() != 1) {
+ throw new PicardException(String.format("Expected 1 row in the summary metrics file but saw %d", summaryMetricsFile.getMetrics().size()));
+ }
+
+ // re-calculate internal fields from derived fields and add it to the list of summary metrics
+ final CollectVariantCallingMetrics.VariantCallingSummaryMetrics summaryMetrics = summaryMetricsFile.getMetrics().get(0);
+ summaryMetrics.calculateFromDerivedFields(totalHetDepth);
+ summaries.add(summaryMetrics);
+ } catch (IOException e) {
+ throw new PicardException(String.format("Cannot read from metrics files with prefix %s", inputPrefix));
+ }
+ }
+
+ // now merge all of the accumulated metrics
+ final Collection<CollectVariantCallingMetrics.VariantCallingDetailMetrics> collapsedDetails = new ArrayList<>();
+ sampleDetailsMap.values().forEach(sampleDetails -> {
+ final CollectVariantCallingMetrics.VariantCallingDetailMetrics collapsed = new CollectVariantCallingMetrics.VariantCallingDetailMetrics();
+ CollectVariantCallingMetrics.VariantCallingDetailMetrics.foldInto(collapsed, sampleDetails);
+ collapsed.calculateDerivedFields();
+ collapsedDetails.add(collapsed);
+ });
+ final CollectVariantCallingMetrics.VariantCallingSummaryMetrics collapsedSummary = new CollectVariantCallingMetrics.VariantCallingSummaryMetrics();
+ CollectVariantCallingMetrics.VariantCallingSummaryMetrics.foldInto(collapsedSummary, summaries);
+ collapsedSummary.calculateDerivedFields();
+
+ // prepare and write the finalized merged metrics
+ final MetricsFile<CollectVariantCallingMetrics.VariantCallingDetailMetrics, Integer> detail = getMetricsFile();
+ final MetricsFile<CollectVariantCallingMetrics.VariantCallingSummaryMetrics, Integer> summary = getMetricsFile();
+ summary.addMetric(collapsedSummary);
+ collapsedDetails.forEach(detail::addMetric);
+
+ detail.write(detailOutputFile);
+ summary.write(summaryOutputFile);
+
+ return 0;
+ }
+}
@@ -136,75 +136,75 @@ protected int doWork() {
/** A collection of metrics relating to snps and indels within a variant-calling file (VCF). */
public static class VariantCallingSummaryMetrics extends MergeableMetricBase {
- /** The number of high confidence SNPs calls (i.e. non-reference genotypes) that were examined */
+ /** The number of passing bi-allelic SNPs calls (i.e. non-reference genotypes) that were examined */
@MergeByAdding
public long TOTAL_SNPS;
- /** The number of high confidence SNPs found in dbSNP */
+ /** The number of passing bi-allelic SNPs found in dbSNP */
@MergeByAdding
public long NUM_IN_DB_SNP;
- /** The number of high confidence SNPS called that were not found in dbSNP */
+ /** The number of passing bi-allelic SNPS called that were not found in dbSNP */
@MergeByAdding
public long NOVEL_SNPS;
- /** The number of SNPs that are also filtered */
+ /** The number of SNPs that are filtered */
@MergeByAdding
public long FILTERED_SNPS;
- /** The fraction of high confidence SNPs in dbSNP */
+ /** The fraction of passing bi-allelic SNPs in dbSNP */
@NoMergingIsDerived
public float PCT_DBSNP;
- /** The Transition/Transversion ratio of the SNP calls made at dbSNP sites */
+ /** The Transition/Transversion ratio of the passing bi-allelic SNP calls made at dbSNP sites */
@NoMergingIsDerived
public double DBSNP_TITV;
- /** The Transition/Transversion ratio of the SNP calls made at non-dbSNP sites */
+ /** The Transition/Transversion ratio of the passing bi-allelic SNP calls made at non-dbSNP sites */
@NoMergingIsDerived
public double NOVEL_TITV;
- /** The number of high confidence Indel calls that were examined */
+ /** The number of passing indel calls that were examined */
@MergeByAdding
public long TOTAL_INDELS;
- /** The number of high confidence Indels called that were not found in dbSNP */
+ /** The number of passing indels called that were not found in dbSNP */
@MergeByAdding
public long NOVEL_INDELS;
- /** The number of indels that are also filtered */
+ /** The number of indels that are filtered */
@MergeByAdding
public long FILTERED_INDELS;
- /** The fraction of high confidence Indels in dbSNP */
+ /** The fraction of passing indels in dbSNP */
@NoMergingIsDerived
public float PCT_DBSNP_INDELS;
- /** The number of high confidence Indels found in dbSNP */
+ /** The number of passing indels found in dbSNP */
@MergeByAdding
public long NUM_IN_DB_SNP_INDELS;
- /** The Insertion/Deletion ratio of the Indel calls made at dbSNP sites */
+ /** The Insertion/Deletion ratio of the indel calls made at dbSNP sites */
@NoMergingIsDerived
public double DBSNP_INS_DEL_RATIO;
- /** The Insertion/Deletion ratio of the Indel calls made at non-dbSNP sites */
+ /** The Insertion/Deletion ratio of the indel calls made at non-dbSNP sites */
@NoMergingIsDerived
public double NOVEL_INS_DEL_RATIO;
- /** The number of high confidence multiallelic SNP calls that were examined */
+ /** The number of passing multi-allelic SNP calls that were examined */
@MergeByAdding
public double TOTAL_MULTIALLELIC_SNPS;
- /** The number of high confidence multiallelic SNPs found in dbSNP */
+ /** The number of passing multi-allelic SNPs found in dbSNP */
@MergeByAdding
public double NUM_IN_DB_SNP_MULTIALLELIC;
- /** The number of high confidence complex Indel calls that were examined */
+ /** The number of passing complex indel calls that were examined */
@MergeByAdding
public double TOTAL_COMPLEX_INDELS;
- /** The number of high confidence complex Indels found in dbSNP */
+ /** The number of passing complex indels found in dbSNP */
@MergeByAdding
public double NUM_IN_DB_SNP_COMPLEX_INDELS;
@@ -249,11 +249,35 @@ public void calculateDerivedFields() {
this.NOVEL_INS_DEL_RATIO = this.novelInsertions / (double) this.novelDeletions;
}
+ public void calculateFromDerivedFields(final long totalHetDepth) {
+ dbSnpTransversions = invertFromRatio(NUM_IN_DB_SNP, DBSNP_TITV);
+ dbSnpTransitions = NUM_IN_DB_SNP - dbSnpTransversions;
+ novelTransversions = invertFromRatio(NOVEL_SNPS, NOVEL_TITV);
+ novelTransitions = NOVEL_SNPS - novelTransversions;
+ dbSnpDeletions = invertFromRatio(NUM_IN_DB_SNP_INDELS, DBSNP_INS_DEL_RATIO);
+ dbSnpInsertions = NUM_IN_DB_SNP_INDELS - dbSnpDeletions;
+ novelDeletions = invertFromRatio(NOVEL_INDELS, NOVEL_INS_DEL_RATIO);
+ novelInsertions = NOVEL_INDELS - novelDeletions;
+ refAlleleObs = Double.isNaN(SNP_REFERENCE_BIAS) ? 0L : Math.round(totalHetDepth * SNP_REFERENCE_BIAS);
+ altAlleleObs = totalHetDepth - refAlleleObs;
+ }
+
public static <T extends VariantCallingSummaryMetrics> void foldInto(final T target, final Collection<T> metrics) {
metrics.forEach(target::merge);
}
}
+ /**
+ * Given the ratio (X/Y) and the sum (X+Y), returns Y.
+ *
+ * @param sum X+Y
+ * @param ratio X/Y
+ * @return Y as a long
+ */
+ private static long invertFromRatio(final long sum, final Double ratio) {
+ return ratio.isNaN() ? 0L : Math.round(sum / (ratio + 1.0));
+ }
+
/** A collection of metrics relating to snps and indels within a variant-calling file (VCF) for a given sample. */
public static class VariantCallingDetailMetrics extends CollectVariantCallingMetrics.VariantCallingSummaryMetrics {
/** The name of the sample being assayed */
@@ -279,6 +303,12 @@ public void calculateDerivedFields() {
public long TOTAL_GQ0_VARIANTS;
/**
+ * total number of reads (from AD field) for passing bi-allelic SNP hets for this sample
+ */
+ @NoMergingIsDerived
+ public long TOTAL_HET_DEPTH;
+
+ /**
* Hidden fields not propagated to the metrics file.
*/
@MergeByAdding
@@ -293,8 +323,15 @@ public void calculateDerivedFields() {
super.calculateDerivedFields();
// Divide by zero should be OK -- NaN should get propagated to metrics file.
HET_HOMVAR_RATIO = numHets / (double) numHomVar;
-
PCT_GQ0_VARIANTS = TOTAL_GQ0_VARIANTS / (double) (numHets + numHomVar);
+ TOTAL_HET_DEPTH = refAlleleObs + altAlleleObs;
+ }
+
+ public void calculateFromDerivedFields() {
+ numHomVar = invertFromRatio(TOTAL_SNPS, HET_HOMVAR_RATIO);
+ numHets = TOTAL_SNPS - numHomVar;
+
+ calculateFromDerivedFields(TOTAL_HET_DEPTH);
}
}
}
Oops, something went wrong.

0 comments on commit bb529af

Please sign in to comment.