Permalink
Browse files

Merge pull request #431 from broadinstitute/jt_custom_vcf_idx

Add engine options to override the default VCF/BCF indexing strategy
  • Loading branch information...
2 parents 499fb26 + a685742 commit 81e66af56a97842f3c51864e3a8078a65b38120f @droazen droazen committed Dec 4, 2013
Showing with 352 additions and 18 deletions.
  1. +23 −0 public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
  2. +1 −1 public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
  3. +9 −2 public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
  4. +18 −6 public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
  5. +11 −1 public/java/src/org/broadinstitute/sting/tools/CatVariants.java
  6. +39 −0 public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java
  7. +30 −0 public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java
  8. +36 −0 public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java
  9. +43 −0 public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java
  10. +131 −0 public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java
  11. +3 −0 public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala
  12. BIN settings/repository/net.sf/{picard-1.96.1534.jar → picard-1.102.1595.jar}
  13. +3 −0 settings/repository/net.sf/picard-1.102.1595.xml
  14. +0 −3 settings/repository/net.sf/picard-1.96.1534.xml
  15. BIN settings/repository/net.sf/{sam-1.96.1534.jar → sam-1.102.1595.jar}
  16. +3 −0 settings/repository/net.sf/sam-1.102.1595.xml
  17. +0 −3 settings/repository/net.sf/sam-1.96.1534.xml
  18. BIN settings/repository/org.broad/{tribble-1.96.1534.jar → tribble-1.102.1595.jar}
  19. +1 −1 settings/repository/org.broad/{tribble-1.96.1534.xml → tribble-1.102.1595.xml}
  20. BIN settings/repository/org.broadinstitute/{variant-1.96.1534.jar → variant-1.102.1595.jar}
  21. +1 −1 settings/repository/org.broadinstitute/{variant-1.96.1534.xml → variant-1.102.1595.xml}
@@ -35,6 +35,8 @@
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
+import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import java.io.File;
import java.util.ArrayList;
@@ -454,5 +456,26 @@ public void setDownsamplingMethod(DownsamplingMethod method) {
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // VCF/BCF index parameters
+ //
+ // --------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Specify the Tribble indexing strategy to use for VCF/BCF indices.
+ *
+ * The "parameter" mentioned below is the value of the variant_index_parameter argument.
+ *
+ * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter
+ * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter
+ * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
+ * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
+ *
+ * Defaults to GATKVCFUtils.DEFAULT_INDEX_TYPE.
+ */
+
+ @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false)
+ public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
+
+ @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false)
+ public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; // bin width for LINEAR, features per bin for INTERVAL; ignored by the DYNAMIC_* strategies (see above)
}
@@ -62,7 +62,7 @@ private StorageFactory() {}
* @param <T> Type of the stream to create.
* @return Storage object with a facade of type T.
*/
- public static <T> Storage<T> createStorage( Stub<T> stub, File file ) {
+ public static <T> Storage<T> createStorage( Stub<T> stub, File file ) {
Storage storage;
if(stub instanceof OutputStreamStub) {
@@ -133,14 +133,21 @@ private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
EnumSet<Options> options = stub.getWriterOptions(indexOnTheFly);
- VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), options);
+ VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
// if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both
// TODO -- remove me when argument generateShadowBCF is removed
if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) {
final File bcfFile = BCF2Utils.shadowBCF(file);
if ( bcfFile != null ) {
- VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, stub.getMasterSequenceDictionary(), options);
+ FileOutputStream bcfStream;
+ try {
+ bcfStream = new FileOutputStream(bcfFile);
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e);
+ }
+
+ VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
writer = new TestWriter(writer, bcfWriter);
}
}
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
+import org.broad.tribble.index.IndexCreator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
@@ -71,6 +72,17 @@
private final PrintStream genotypeStream;
/**
+ * A hack: push the argument sources into the VCF header so that the VCF header
+ * can rebuild the command-line arguments.
+ */
+ private final Collection<Object> argumentSources;
+
+ /**
+ * Which IndexCreator to use
+ */
+ private final IndexCreator indexCreator;
+
+ /**
* The cached VCF header (initialized to null)
*/
private VCFHeader vcfHeader = null;
@@ -81,12 +93,6 @@
private boolean isCompressed = false;
/**
- * A hack: push the argument sources into the VCF header so that the VCF header
- * can rebuild the command-line arguments.
- */
- private final Collection<Object> argumentSources;
-
- /**
* Should the header be written out? A hidden argument.
*/
private boolean skipWritingCommandLineHeader = false;
@@ -118,6 +124,7 @@ public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile,
this.engine = engine;
this.genotypeFile = genotypeFile;
this.genotypeStream = null;
+ this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile);
this.argumentSources = argumentSources;
}
@@ -132,6 +139,7 @@ public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genoty
this.engine = engine;
this.genotypeFile = null;
this.genotypeStream = new PrintStream(genotypeStream);
+ this.indexCreator = null;
this.argumentSources = argumentSources;
}
@@ -175,6 +183,10 @@ public void setForceBCF(boolean forceBCF) {
this.forceBCF = forceBCF;
}
+ public IndexCreator getIndexCreator() { // returns the creator chosen at construction time; null when the stub was built around an OutputStream rather than a File
+ return indexCreator;
+ }
+
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
@@ -31,12 +31,15 @@
import org.apache.log4j.Level;
import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader;
+import org.broad.tribble.index.IndexCreator;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
+import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.bcf2.BCF2Codec;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.vcf.VCFCodec;
@@ -123,6 +126,12 @@
@Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false)
private Boolean assumeSorted = false;
+ @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false)
+ private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
+
+ @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false)
+ private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER;
+
/*
* print usage information
*/
@@ -204,7 +213,8 @@ protected int execute() throws Exception {
FileOutputStream outputStream = new FileOutputStream(outputFile);
EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
- final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), options);
+ final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile);
+ final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);
boolean firstFile = true;
int count =0;
@@ -0,0 +1,39 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.utils.variant;
+
+import org.broadinstitute.sting.commandline.EnumerationArgumentDefault;
+
+/**
+ * Choose the Tribble indexing strategy used when writing VCF/BCF indices.
+ *
+ * Each constant names the Tribble IndexCreator that should be built for it; the
+ * trailing comment on each constant gives the exact creator.
+ */
+public enum GATKVCFIndexType {
+ @EnumerationArgumentDefault // NOTE(review): presumably flags the constant below as the command-line default -- confirm against the annotation's definition
+ DYNAMIC_SEEK, // use DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME)
+ DYNAMIC_SIZE, // use DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SIZE)
+ LINEAR, // use LinearIndexCreator()
+ INTERVAL // use IntervalIndexCreator()
+}
@@ -28,6 +28,11 @@
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
+import org.broad.tribble.index.DynamicIndexCreator;
+import org.broad.tribble.index.IndexCreator;
+import org.broad.tribble.index.IndexFactory;
+import org.broad.tribble.index.interval.IntervalIndexCreator;
+import org.broad.tribble.index.linear.LinearIndexCreator;
import org.broad.tribble.readers.LineIterator;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.RodBinding;
@@ -43,6 +48,7 @@
import java.io.IOException;
import java.util.*;
+
/**
* A set of GATK-specific static utility methods for common operations on VCF files/records.
*/
@@ -55,6 +61,9 @@ private GATKVCFUtils() { }
public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine";
+ public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type.
+ public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter
+
/**
* Gets the appropriately formatted header for a VCF file describing this GATK run
*
@@ -176,6 +185,27 @@ public static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeA
}
/**
+ * Create and return an IndexCreator for the requested indexing strategy.
+ *
+ * DYNAMIC_SEEK and DYNAMIC_SIZE produce a DynamicIndexCreator balanced for seek time or
+ * for index size respectively; LINEAR produces a LinearIndexCreator and INTERVAL an
+ * IntervalIndexCreator. The creator is initialized with outFile and parameter before
+ * it is returned.
+ *
+ * @param type the indexing strategy to use; must not be null (switching on a null enum throws NullPointerException)
+ * @param parameter the strategy-specific value (bin width or features per bin) handed to IndexCreator.initialize()
+ * @param outFile the file handed to IndexCreator.initialize()
+ * @return the initialized IndexCreator
+ * @throws IllegalArgumentException if type is not one of the strategies handled by the switch
+ */
+ public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile) {
+ IndexCreator idxCreator;
+ switch (type) {
+ case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break;
+ case DYNAMIC_SIZE: idxCreator = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SIZE); break;
+ case LINEAR: idxCreator = new LinearIndexCreator(); break;
+ case INTERVAL: idxCreator = new IntervalIndexCreator(); break;
+ default: throw new IllegalArgumentException("Unknown IndexCreator type: " + type); // only reachable if a GATKVCFIndexType constant is added without a matching case
+ }
+
+ idxCreator.initialize(outFile, parameter); // every strategy receives the parameter, including the dynamic ones
+ return idxCreator;
+ }
+
+ /**
* Utility class to read all of the VC records from a file
*
* @param file
@@ -105,6 +105,42 @@ public void testMyData(final int start, final int size) {
}
/**
+ * DataProvider example using a class-based data structure.
+ *
+ * Each instance holds one (start, size) test case. The constructor passes
+ * MyDataProviderClass.class to the TestDataProvider constructor; instances are later
+ * fetched with TestDataProvider.getTests(MyDataProviderClass.class), so the superclass
+ * presumably records every instance under that class -- confirm in TestDataProvider.
+ */
+ private class MyDataProviderClass extends TestDataProvider {
+ private int start; // first value of the test case, read by testMyDataProviderClass
+ private int size; // second value of the test case, read by testMyDataProviderClass
+
+ private MyDataProviderClass(int start, int size) {
+ super(MyDataProviderClass.class);
+ this.start = start;
+ this.size = size;
+ }
+ }
+
+ @DataProvider(name = "MyClassBasedDataProvider")
+ public Object[][] makeMyDataProviderClass() {
+ // Example data: one test case for every combination of the start and size values below
+ // (3 x 4 = 12 cases). Adapt the loops to generate whatever input data your tests need.
+ for ( final int start : Arrays.asList(1, 10, 100) ) {
+ for ( final int size : Arrays.asList(1, 10, 100, 1000) ) {
+ new MyDataProviderClass(start, size); // instance is intentionally not kept here; it is retrieved through getTests() below
+ }
+ }
+
+ return TestDataProvider.getTests(MyDataProviderClass.class); // the test cases handed to TestNG
+ }
+
+ /**
+ * Example testng test using MyClassBasedDataProvider.
+ *
+ * Called once per test case returned by the provider.
+ *
+ * @param testSpec the test case holding the start and size values to check
+ */
+ @Test(dataProvider = "MyClassBasedDataProvider")
+ public void testMyDataProviderClass(MyDataProviderClass testSpec) {
+ // adapt this code to do whatever testing you want given the arguments start and size
+ Assert.assertTrue(testSpec.start >= 0);
+ Assert.assertTrue(testSpec.size >= 0);
+ }
+
+ /**
* A unit test that creates an artificial read for testing some code that uses reads
*/
@Test()
@@ -25,6 +25,10 @@
package org.broadinstitute.sting.utils.variant;
+import org.broad.tribble.index.DynamicIndexCreator;
+import org.broad.tribble.index.IndexCreator;
+import org.broad.tribble.index.interval.IntervalIndexCreator;
+import org.broad.tribble.index.linear.LinearIndexCreator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@@ -35,8 +39,10 @@
import org.broadinstitute.variant.vcf.VCFHeader;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import org.testng.Assert;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
@@ -83,4 +89,41 @@ public void testAddingVCFHeaderInfo() {
Assert.assertTrue(lines2.contains(line1));
Assert.assertTrue(lines2.contains(line2));
}
+
+ private class IndexCreatorTest extends TestDataProvider { // one test case for testGetIndexCreator: inputs plus expected results
+ private final GATKVCFIndexType type; // strategy passed to GATKVCFUtils.getIndexCreator
+ private final int parameter; // parameter passed to GATKVCFUtils.getIndexCreator
+ private final Class expectedClass; // expected concrete class of the returned IndexCreator
+ private final int expectedDefaultBinSize; // expected value of defaultBinSize() on the returned creator
+ private final int expectedBinSize; // expected value of getBinSize() on the returned creator
+
+ private IndexCreatorTest(GATKVCFIndexType type, int parameter, Class expectedClass, int expectedDefaultBinSize, int expectedBinSize) {
+ super(IndexCreatorTest.class); // cases are later fetched with getTests(IndexCreatorTest.class)
+
+ this.type = type;
+ this.parameter = parameter;
+ this.expectedClass = expectedClass;
+ this.expectedDefaultBinSize = expectedDefaultBinSize;
+ this.expectedBinSize = expectedBinSize;
+ }
+ }
+
+ @DataProvider(name = "indexCreator")
+ public Object[][] indexCreatorData() { // arguments per case: type, parameter, expected class, expected default bin size, expected bin size
+ new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0, DynamicIndexCreator.class, -1, -1); // dynamic strategies: both bin sizes expected to be -1 even though 0 is passed
+ new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0, DynamicIndexCreator.class, -1, -1);
+ new IndexCreatorTest(GATKVCFIndexType.LINEAR, 100, LinearIndexCreator.class, LinearIndexCreator.DEFAULT_BIN_WIDTH, 100); // supplied parameter expected back as the bin size
+ new IndexCreatorTest(GATKVCFIndexType.INTERVAL, 200, IntervalIndexCreator.class, IntervalIndexCreator.DEFAULT_FEATURE_COUNT, 200); // supplied parameter expected back as the bin size
+
+ return IndexCreatorTest.getTests(IndexCreatorTest.class);
+ }
+
+ @Test(dataProvider = "indexCreator")
+ public void testGetIndexCreator(IndexCreatorTest spec) { // checks the creator class and bin sizes produced for one (type, parameter) case
+ File dummy = new File(""); // placeholder path; getIndexCreator only forwards it to IndexCreator.initialize()
+ IndexCreator ic = GATKVCFUtils.getIndexCreator(spec.type, spec.parameter, dummy);
+ Assert.assertEquals(ic.getClass(), spec.expectedClass, "Wrong IndexCreator type");
+ Assert.assertEquals(ic.defaultBinSize(), spec.expectedDefaultBinSize, "Wrong default bin size");
+ Assert.assertEquals(ic.getBinSize(), spec.expectedBinSize, "Wrong bin size");
+ }
}
Oops, something went wrong.

0 comments on commit 81e66af

Please sign in to comment.