From add34e1c68f12c1a43b30b0c7eed8ea279630c58 Mon Sep 17 00:00:00 2001 From: ldgauthier Date: Fri, 17 Jun 2022 11:59:31 -0400 Subject: [PATCH] Type change int -> long to prevent tranche novel variant count overflow (#7864) * Type change int -> long to prevent tranche novel variant count overflow in VQSR scattered mode --- .../tools/walkers/vqsr/Tranche.java | 37 ++++++++++--- .../walkers/vqsr/TruthSensitivityTranche.java | 8 +-- .../tools/walkers/vqsr/VQSLODTranche.java | 12 ++--- .../vqsr/GatherTranchesIntegrationTest.java | 53 +++++++++---------- .../VQSR/expected/singleOverflow.tranches | 3 ++ .../VQSR/expected/testSummedOverflow.tranches | 3 ++ .../test-single-giant-input-snps.tranches | 3 ++ .../VQSR/test-very-large-one-snps.tranches | 3 ++ .../VQSR/test-very-large-two-snps.tranches | 3 ++ 9 files changed, 79 insertions(+), 46 deletions(-) create mode 100644 src/test/resources/large/VQSR/expected/singleOverflow.tranches create mode 100644 src/test/resources/large/VQSR/expected/testSummedOverflow.tranches create mode 100644 src/test/resources/large/VQSR/test-single-giant-input-snps.tranches create mode 100644 src/test/resources/large/VQSR/test-very-large-one-snps.tranches create mode 100644 src/test/resources/large/VQSR/test-very-large-two-snps.tranches diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/Tranche.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/Tranche.java index 6a026c09046..7ef953f3757 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/Tranche.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/Tranche.java @@ -27,14 +27,16 @@ public class Tranche { final double minVQSLod; //minimum value of VQSLOD in this tranche final double knownTiTv; //titv value of known sites in this tranche final double novelTiTv; //titv value of novel sites in this tranche - final int numKnown; //number of known sites in this tranche - final int numNovel; //number of novel sites in this tranche + final long numKnown; //number of known sites in this tranche + final long numNovel; //number of novel sites in this tranche final VariantRecalibratorArgumentCollection.Mode model; final String name; //Name of the tranche - public Tranche(final String name, final double knownTiTv, final int numNovel, final double minVQSLod, final VariantRecalibratorArgumentCollection.Mode model, final double novelTiTv, final int accessibleTruthSites, final int numKnown, final int callsAtTruthSites) { + public Tranche(final String name, final double knownTiTv, final long numNovel, final double minVQSLod, + final VariantRecalibratorArgumentCollection.Mode model, final double novelTiTv, + final int accessibleTruthSites, final long numKnown, final int callsAtTruthSites) { if ( numKnown < 0 || numNovel < 0) { - throw new GATKException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel); + throw new GATKException("Invalid tranche " + name + " - no. variants is < 0 : known " + numKnown + " novel " + numNovel); } if ( name == null ) { @@ -104,7 +106,8 @@ public String getTrancheString(final T prev) { } protected static Tranche trancheOfVariants(final List data, final int minI, final double ts, final VariantRecalibratorArgumentCollection.Mode model ) { - int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0; + long numKnown = 0, numNovel = 0; + int knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0; final double minLod = data.get(minI).lod; for (final VariantDatum datum : data) { @@ -147,8 +150,8 @@ protected static Tranche emptyTranche(final List data, final int m final double knownTiTv = 0.0; final double novelTiTv = 0.0; - final int numKnown = 0; - final int numNovel = 0; + final long numKnown = 0; + final long numNovel = 0; return new Tranche("unnamed", knownTiTv, numNovel, minLod, model, novelTiTv, accessibleTruthSites, numKnown, nCallsAtTruth); } @@ -194,6 +197,26 @@ protected static int getOptionalInteger(final Map bindings, fina } } + protected static long getRequiredLong(final Map bindings, final String key) { + if ( bindings.containsKey(key) ) { + try{ + return Long.valueOf(bindings.get(key)); + } catch (NumberFormatException e){ + throw new UserException.MalformedFile("Malformed tranches file. Invalid value for key " + key); + } + } else { + throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key); + } + } + + protected static long getOptionalLong(final Map bindings, final String key, final int defaultValue) { + try{ + return Long.valueOf(bindings.getOrDefault(key, String.valueOf(defaultValue))); + } catch (NumberFormatException e){ + throw new UserException.MalformedFile("Malformed tranches file. Invalid value for key " + key); + } + } + protected double getTruthSensitivity() { return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/TruthSensitivityTranche.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/TruthSensitivityTranche.java index 0007dad3176..89146be187c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/TruthSensitivityTranche.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/TruthSensitivityTranche.java @@ -28,9 +28,9 @@ final class TruthSensitivityTranche extends Tranche { public TruthSensitivityTranche( final double targetTruthSensitivity, final double minVQSLod, - final int numKnown, + final long numKnown, final double knownTiTv, - final int numNovel, + final long numNovel, final double novelTiTv, final int accessibleTruthSites, final int callsAtTruthSites, @@ -41,9 +41,9 @@ public TruthSensitivityTranche( public TruthSensitivityTranche( final double targetTruthSensitivity, final double minVQSLod, - final int numKnown, + final long numKnown, final double knownTiTv, - final int numNovel, + final long numNovel, final double novelTiTv, final int accessibleTruthSites, final int callsAtTruthSites, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VQSLODTranche.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VQSLODTranche.java index ee0e47d5a1c..2b28af10741 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VQSLODTranche.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VQSLODTranche.java @@ -26,9 +26,9 @@ public Double getTrancheIndex() { public VQSLODTranche( final double minVQSLod, - final int numKnown, + final long numKnown, final double knownTiTv, - final int numNovel, + final long numNovel, final double novelTiTv, final int accessibleTruthSites, final int callsAtTruthSites, @@ -114,9 +114,9 @@ public static List readTranches(final GATKPath f) throws IOExcept } tranches.add(new VQSLODTranche( getRequiredDouble(bindings, "minVQSLod"), - getOptionalInteger(bindings, "numKnown", -1), + getOptionalLong(bindings, "numKnown", -1), getOptionalDouble(bindings, "knownTiTv", -1.0), - getRequiredInteger(bindings, "numNovel"), + getRequiredLong(bindings, "numNovel"), getRequiredDouble(bindings, "novelTiTv"), getOptionalInteger(bindings, "accessibleTruthSites", -1), getOptionalInteger(bindings, "callsAtTruthSites", -1), @@ -178,10 +178,10 @@ public static List mergeAndConvertTranches(final TreeMa public static VQSLODTranche mergeAndConvertTranches(final List scatteredTranches, VariantRecalibratorArgumentCollection.Mode mode) { double indexVQSLOD = scatteredTranches.get(0).minVQSLod; - int sumNumKnown = 0; + long sumNumKnown = 0; double sumKnownTransitions = 0; double sumKnownTransversions = 0; - int sumNumNovel = 0; + long sumNumNovel = 0; double sumNovelTransitions = 0; double sumNovelTransversions = 0; int sumAccessibleTruthSites = 0; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/GatherTranchesIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/GatherTranchesIntegrationTest.java index 131462b84c0..89c6a681dac 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/GatherTranchesIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/GatherTranchesIntegrationTest.java @@ -5,9 +5,13 @@ import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; /** * Created by gauthier on 7/18/17. @@ -16,45 +20,36 @@ public class GatherTranchesIntegrationTest extends CommandLineProgramTest { private static final String testDir = GATKBaseTest.publicTestDir + "/large/VQSR/"; - @Test - public void testCombine2Shards() throws Exception { - final File recal1 = new File(testDir + "snpTranches.scattered.txt"); //this is the output of VariantRecalibratorIntegrationTest.testVariantRecalibratorSNPscattered - final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is a copy of the above + @DataProvider(name = "testInputs") + public Object[][] getTestInputs () { + return new Object[][]{ + {Arrays.asList(new File(testDir + "snpTranches.scattered.txt"), new File(testDir + "snpTranches.scattered.txt")), + new File(testDir + "expected/snpTranches.gathered.txt"), "SNP"}, - final File recal_original = new File(testDir + "expected/snpTranches.gathered.txt"); + {Arrays.asList(new File(testDir + "indels.0.tranches"), new File(testDir + "indels.1.tranches")), + new File(testDir + "expected/indels.gathered.tranches"), "INDEL"}, - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("--input"); - args.addRaw(recal1.getAbsolutePath()); - args.addRaw("--input"); - args.addRaw(recal2.getAbsolutePath()); - args.add("mode", "SNP"); + {Arrays.asList(new File(testDir + "test-single-giant-input-snps.tranches")), + new File(testDir + "expected/singleOverflow.tranches"), "SNP"}, - final File outFile = GATKBaseTest.createTempFile("gatheredTranches", ".txt"); - args.addOutput(outFile); - final Object res = this.runCommandLine(args.getArgsArray()); - Assert.assertEquals(res, 0); - IntegrationTestSpec.assertEqualTextFiles(outFile, recal_original); + {Arrays.asList(new File(testDir + "test-very-large-one-snps.tranches"), new File(testDir + "test-very-large-two-snps.tranches")), + new File(testDir + "expected/testSummedOverflow.tranches"), "SNP"} + }; } - @Test - public void testCombine2IndelTranches() throws Exception { - final File tranches1 = new File(testDir + "indels.0.tranches"); - final File tranches2 = new File(testDir + "indels.1.tranches"); - - final File recal_original = new File(testDir + "expected/indels.gathered.tranches"); - + @Test (dataProvider = "testInputs") + public void testGatherTranches(List inputs, File expected, String mode) throws IOException { final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("--input"); - args.addRaw(tranches1.getAbsolutePath()); - args.addRaw("--input"); - args.addRaw(tranches2.getAbsolutePath()); - args.add("mode", "INDEL"); + for (File inFile : inputs) { + args.addRaw("--input"); + args.addRaw(inFile); + } + args.add("mode", mode); final File outFile = GATKBaseTest.createTempFile("gatheredTranches", ".txt"); args.addOutput(outFile); final Object res = this.runCommandLine(args.getArgsArray()); Assert.assertEquals(res, 0); - IntegrationTestSpec.assertEqualTextFiles(outFile, recal_original); + IntegrationTestSpec.assertEqualTextFiles(outFile, expected); } } \ No newline at end of file diff --git a/src/test/resources/large/VQSR/expected/singleOverflow.tranches b/src/test/resources/large/VQSR/expected/singleOverflow.tranches new file mode 100644 index 00000000000..a8ebe9bb314 --- /dev/null +++ b/src/test/resources/large/VQSR/expected/singleOverflow.tranches @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a349558a36836addbdb6821572673cee6e8805607ead360e14e4adbba96746f7 +size 477 diff --git a/src/test/resources/large/VQSR/expected/testSummedOverflow.tranches b/src/test/resources/large/VQSR/expected/testSummedOverflow.tranches new file mode 100644 index 00000000000..d5070d6799c --- /dev/null +++ b/src/test/resources/large/VQSR/expected/testSummedOverflow.tranches @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb0457daab12a43909081530e054d09c93339798caebfd4b009d2080db01bd2 +size 484 diff --git a/src/test/resources/large/VQSR/test-single-giant-input-snps.tranches b/src/test/resources/large/VQSR/test-single-giant-input-snps.tranches new file mode 100644 index 00000000000..1a60cd43613 --- /dev/null +++ b/src/test/resources/large/VQSR/test-single-giant-input-snps.tranches @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bc2f9661f3cf3eaf23b6cee763e9e3035b7e4ac38e7f34a92684c86e6fa3a5 +size 96288 diff --git a/src/test/resources/large/VQSR/test-very-large-one-snps.tranches b/src/test/resources/large/VQSR/test-very-large-one-snps.tranches new file mode 100644 index 00000000000..5c6953da4cb --- /dev/null +++ b/src/test/resources/large/VQSR/test-very-large-one-snps.tranches @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c41d141ad6b5e8a25b768b58f692857be7a42208ee85987259718c90d307b9 +size 96288 diff --git a/src/test/resources/large/VQSR/test-very-large-two-snps.tranches b/src/test/resources/large/VQSR/test-very-large-two-snps.tranches new file mode 100644 index 00000000000..f5efb1b8e07 --- /dev/null +++ b/src/test/resources/large/VQSR/test-very-large-two-snps.tranches @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d1e01e122957eddaaa5645d516310fdff74914a11e9efaa62cf783efead58c +size 96107