Browse files

Merge remote-tracking branch 'unstable/master'

  • Loading branch information...
2 parents d0b2be7 + ae5c6bd commit 742b44eb7c056cd5c18fdb59287e784953154bf6 @eitanbanks eitanbanks committed Dec 6, 2013
Showing with 2,190 additions and 541 deletions.
  1. +10 −2 build.xml
  2. +2 −2 ivy.xml
  3. +41 −9 public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
  4. +8 −0 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
  5. +3 −0 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
  6. +8 −0 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
  7. +3 −0 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
  8. +1 −1 public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
  9. +9 −2 public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
  10. +18 −6 public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
  11. +3 −5 public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java
  12. +1 −1 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
  13. +2 −0 public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
  14. +13 −11 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
  15. +28 −28 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
  16. +14 −9 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
  17. +78 −25 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
  18. +1 −1 public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java
  19. +31 −27 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
  20. +2 −0 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
  21. +32 −18 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
  22. +2 −1 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java
  23. +8 −11 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
  24. +11 −1 public/java/src/org/broadinstitute/sting/tools/CatVariants.java
  25. +42 −0 public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
  26. +56 −6 public/java/src/org/broadinstitute/sting/utils/MathUtils.java
  27. +1 −0 public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
  28. +14 −1 public/java/src/org/broadinstitute/sting/utils/Utils.java
  29. +15 −3 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java
  30. +9 −0 public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
  31. +1 −1 public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java
  32. +24 −11 public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
  33. +30 −1 public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java
  34. +53 −45 public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java
  35. +7 −5 public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java
  36. +103 −7 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
  37. +182 −0 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java
  38. +1 −1 public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java
  39. +44 −7 public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
  40. +2 −2 public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
  41. +1 −1 public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java
  42. +39 −0 public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java
  43. +30 −0 public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java
  44. +407 −124 public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java
  45. +1 −0 public/java/test/org/broadinstitute/sting/BaseTest.java
  46. +36 −0 public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java
  47. +1 −1 public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java
  48. +1 −1 public/java/test/org/broadinstitute/sting/WalkerTest.java
  49. +1 −1 public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
  50. +341 −3 public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
  51. +43 −0 public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java
  52. +143 −18 public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java
  53. +131 −0 public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java
  54. +5 −5 public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java
  55. +2 −2 public/perl/liftOverVCF.pl
  56. +18 −4 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala
  57. +11 −3 public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala
  58. +2 −2 public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala
  59. +2 −2 public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala
  60. +2 −2 public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala
  61. +3 −0 public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala
  62. +1 −0 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala
  63. +1 −0 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala
  64. +1 −0 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala
  65. +6 −0 public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala
  66. +1 −1 public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala
  67. +2 −2 public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala
  68. +2 −0 public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala
  69. +16 −1 public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala
  70. +1 −1 public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala
  71. +1 −22 public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala
  72. +2 −2 public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala
  73. +0 −75 public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala
  74. +2 −2 settings/helpTemplates/common.html
  75. +2 −2 settings/helpTemplates/generic.index.template.html
  76. +11 −6 settings/helpTemplates/generic.template.html
  77. BIN settings/repository/net.sf/{picard-1.96.1534.jar → picard-1.104.1628.jar}
  78. +3 −0 settings/repository/net.sf/picard-1.104.1628.xml
  79. +0 −3 settings/repository/net.sf/picard-1.96.1534.xml
  80. BIN settings/repository/net.sf/{sam-1.96.1534.jar → sam-1.104.1628.jar}
  81. +3 −0 settings/repository/net.sf/sam-1.104.1628.xml
  82. +0 −3 settings/repository/net.sf/sam-1.96.1534.xml
  83. BIN settings/repository/org.broad/{tribble-1.96.1534.jar → tribble-1.104.1628.jar}
  84. +1 −1 settings/repository/org.broad/{tribble-1.96.1534.xml → tribble-1.104.1628.xml}
  85. BIN settings/repository/org.broadinstitute/{variant-1.96.1534.jar → variant-1.104.1628.jar}
  86. +1 −1 settings/repository/org.broadinstitute/{variant-1.96.1534.xml → variant-1.104.1628.xml}
View
12 build.xml
@@ -526,6 +526,7 @@
<fileset dir="${lib.dir}">
<include name="scala-compiler-*.jar"/>
<include name="scala-library-*.jar"/>
+ <include name="scala-reflect-*.jar"/>
</fileset>
</path>
<taskdef resource="scala/tools/ant/antlib.xml">
@@ -537,7 +538,7 @@
<target name="scala.compile" depends="init,resolve,gatk.compile,queue-extensions.generate,init.scala.compile" if="include.scala">
<mkdir dir="${scala.classes}"/>
<echo>Building Scala...</echo>
- <scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.classes}" classpathref="scala.dependencies" deprecation="yes" unchecked="yes">
+ <scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.classes}" classpathref="scala.dependencies" deprecation="yes" unchecked="yes" addparams="-feature">
<src refid="scala.source.path" />
<src path="${queue-extensions.source.dir}" />
<include name="**/*.scala" />
@@ -1218,7 +1219,7 @@
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="include.scala">
<echo message="Scala: Compiling test cases!"/>
- <scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
+ <scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes" addparams="-feature">
<src refid="scala.test.source.path" />
<classpath>
<path refid="build.results"/>
@@ -1414,6 +1415,13 @@
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
</target>
+ <target name="knowledgebasetest" depends="test.compile,test.init" description="Run knowledge base tests">
+ <condition property="ktype" value="*KnowledgeBaseTest" else="${single}">
+ <not><isset property="single"/></not>
+ </condition>
+ <run-test testtype="${ktype}" outputdir="${report}/${ktype}" runfailed="false"/>
+ </target>
+
<target name="failed-unit" depends="test.compile,test.init">
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
</target>
View
4 ivy.xml
@@ -82,8 +82,8 @@
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
<!-- Scala dependancies -->
- <dependency org="org.scala-lang" name="scala-compiler" rev="2.9.2"/>
- <dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
+ <dependency org="org.scala-lang" name="scala-compiler" rev="2.10.2"/>
+ <dependency org="org.scala-lang" name="scala-library" rev="2.10.2"/>
<!-- testing and evaluation dependencies -->
<dependency org="org.testng" name="testng" rev="6.8"/>
View
50 public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -35,6 +35,8 @@
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
+import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import java.io.File;
import java.util.ArrayList;
@@ -119,21 +121,28 @@ public GATKArgumentCollection() {
// Downsampling Arguments
//
// --------------------------------------------------------------------------------------------------------------
- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
+ /**
+ * Reads will be selected randomly to be removed from the pile based on the method described here.
+ */
+ @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false)
public DownsampleType downsamplingType = null;
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
public Double downsampleFraction = null;
+ /**
+ * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the
+ * maximum depth of coverage at each locus. For read-based traversals (ReadWalkers), it controls the
+ * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use
+ * much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does
+ * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the
+ * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when
+ * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note
+ * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling
+ * algorithm will under some circumstances retain slightly more or less coverage than requested.
+ */
@Argument(fullName = "downsample_to_coverage", shortName = "dcov",
- doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," +
- "this controls the maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), " +
- "this controls the maximum number of reads sharing the same alignment start position. Note that this downsampling " +
- "option does NOT produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of " +
- "the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions " +
- "when removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. " +
- "Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling " +
- "algorithm will under some circumstances retain slightly more coverage than requested.",
+ doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)",
required = false)
public Integer downsampleCoverage = null;
@@ -447,5 +456,28 @@ public void setDownsamplingMethod(DownsamplingMethod method) {
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
+
+ // --------------------------------------------------------------------------------------------------------------
+ //
+ // VCF/BCF index parameters
+ //
+ // --------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Specify the Tribble indexing strategy to use for VCFs.
+ *
+ * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter
+ * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter
+ * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
+ * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored)
+ */
+
+ @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false)
+ @Advanced
+ public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
+
+ @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false)
+ @Advanced
+ public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER;
}
View
8 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -247,6 +247,14 @@ else if(locusIterator.hasNext())
private PeekableIterator<BAMScheduleEntry> bamScheduleIterator = null;
/**
+ * Clean up underlying BAMSchedule file handles.
+ */
+ public void close() {
+ if(bamScheduleIterator != null)
+ bamScheduleIterator.close();
+ }
+
+ /**
* Get the next overlapping tree of bins associated with the given BAM file.
* @param currentLocus The actual locus for which to check overlap.
* @return The next schedule entry overlapping with the given list of loci.
View
3 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
@@ -62,6 +62,9 @@ private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser pars
wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
this.parser = parser;
}
+ public void close() {
+ wrappedIterator.close();
+ }
public boolean hasNext() {
return wrappedIterator.hasNext();
View
8 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -352,6 +352,14 @@ public SAMDataSource(
resourcePool.releaseReaders(readers);
}
+ public void close() {
+ SAMReaders readers = resourcePool.getAvailableReaders();
+ for(SAMReaderID readerID: readerIDs) {
+ SAMFileReader reader = readers.getReader(readerID);
+ reader.close();
+ }
+ }
+
/**
* Returns Reads data structure containing information about the reads data sources placed in this pool as well as
* information about how they are downsampled, sorted, and filtered
View
3 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java
@@ -43,4 +43,7 @@ public void initialize(final SAMDataSource readsDataSource, final Iterator<FileP
this.filePointers = new PeekableIterator<FilePointer>(filePointers);
this.parser = parser;
}
+ public void close() {
+ this.filePointers.close();
+ }
}
View
2 public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
@@ -62,7 +62,7 @@ private StorageFactory() {}
* @param <T> Type of the stream to create.
* @return Storage object with a facade of type T.
*/
- public static <T> Storage<T> createStorage( Stub<T> stub, File file ) {
+ public static <T> Storage<T> createStorage( Stub<T> stub, File file ) {
Storage storage;
if(stub instanceof OutputStreamStub) {
View
11 public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
@@ -133,14 +133,21 @@ private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
EnumSet<Options> options = stub.getWriterOptions(indexOnTheFly);
- VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), options);
+ VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
// if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both
// TODO -- remove me when argument generateShadowBCF is removed
if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) {
final File bcfFile = BCF2Utils.shadowBCF(file);
if ( bcfFile != null ) {
- VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, stub.getMasterSequenceDictionary(), options);
+ FileOutputStream bcfStream;
+ try {
+ bcfStream = new FileOutputStream(bcfFile);
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e);
+ }
+
+ VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
writer = new TestWriter(writer, bcfWriter);
}
}
View
24 public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
+import org.broad.tribble.index.IndexCreator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
@@ -71,6 +72,17 @@
private final PrintStream genotypeStream;
/**
+ * A hack: push the argument sources into the VCF header so that the VCF header
+ * can rebuild the command-line arguments.
+ */
+ private final Collection<Object> argumentSources;
+
+ /**
+ * Which IndexCreator to use
+ */
+ private final IndexCreator indexCreator;
+
+ /**
* The cached VCF header (initialized to null)
*/
private VCFHeader vcfHeader = null;
@@ -81,12 +93,6 @@
private boolean isCompressed = false;
/**
- * A hack: push the argument sources into the VCF header so that the VCF header
- * can rebuild the command-line arguments.
- */
- private final Collection<Object> argumentSources;
-
- /**
* Should the header be written out? A hidden argument.
*/
private boolean skipWritingCommandLineHeader = false;
@@ -118,6 +124,7 @@ public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile,
this.engine = engine;
this.genotypeFile = genotypeFile;
this.genotypeStream = null;
+ this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile);
this.argumentSources = argumentSources;
}
@@ -132,6 +139,7 @@ public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genoty
this.engine = engine;
this.genotypeFile = null;
this.genotypeStream = new PrintStream(genotypeStream);
+ this.indexCreator = null;
this.argumentSources = argumentSources;
}
@@ -175,6 +183,10 @@ public void setForceBCF(boolean forceBCF) {
this.forceBCF = forceBCF;
}
+ public IndexCreator getIndexCreator() {
+ return indexCreator;
+ }
+
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
View
8 public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java
@@ -42,23 +42,21 @@ public void gather(List<File> inputs, File output) {
try {
o = new PrintStream(output);
} catch (FileNotFoundException e) {
- throw new UserException("File to be output by CoverageByRG Gather function was not found");
+ throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output));
}
GATKReport current = new GATKReport();
boolean isFirst = true;
for (File input : inputs) {
-
- // If the table is empty
if (isFirst) {
current = new GATKReport(input);
isFirst = false;
} else {
- GATKReport toAdd = new GATKReport(input);
- current.concat(toAdd);
+ current.concat(new GATKReport(input));
}
}
current.print(o);
+ o.close();
}
}
View
2 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
@@ -86,7 +86,7 @@
private int maxRegionSize = -1;
private int minRegionSize = -1;
- private final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
+ private final LinkedList<ActiveRegion> workQueue = new LinkedList<>();
private TAROrderedReadCache myReads = null;
View
2 public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
@@ -180,4 +180,6 @@ public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet interv
}
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL);
}
+
+
}
View
24 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -32,6 +32,7 @@
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
@@ -83,6 +84,7 @@
@Requires(value={})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@Reference(window=@Window(start=-50,stop=50))
+@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250)
@By(DataSource.REFERENCE)
public class VariantAnnotator extends RodWalker<Integer, Integer> implements AnnotatorCompatible, TreeReducible<Integer> {
@@ -132,21 +134,21 @@
* See the -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
- protected List<String> annotationsToUse = new ArrayList<String>();
+ protected List<String> annotationsToUse = new ArrayList<>();
/**
* Note that this argument has higher priority than the -A or -G arguments,
* so annotations will be excluded even if they are explicitly included with the other options.
*/
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
- protected List<String> annotationsToExclude = new ArrayList<String>();
+ protected List<String> annotationsToExclude = new ArrayList<>();
/**
* If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups.
* Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
- protected List<String> annotationGroupsToUse = new ArrayList<String>();
+ protected List<String> annotationGroupsToUse = new ArrayList<>();
/**
* This option enables you to add annotations from one VCF to another.
@@ -193,8 +195,8 @@ public void initialize() {
}
// get the list of all sample names from the variant VCF input rod, if applicable
- List<String> rodName = Arrays.asList(variantCollection.variants.getName());
- Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
+ final List<String> rodName = Arrays.asList(variantCollection.variants.getName());
+ final Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit());
@@ -204,23 +206,23 @@ public void initialize() {
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
- Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
+ final Set<VCFHeaderLine> hInfo = new HashSet<>();
hInfo.addAll(engine.getVCFAnnotationDescriptions());
- for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) {
+ for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) {
if ( isUniqueHeaderLine(line, hInfo) )
hInfo.add(line);
}
// for the expressions, pull the info header line from the header of the resource rod
- for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
+ for ( final VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
// special case the ID field
if ( expression.fieldName.equals("ID") ) {
hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
continue;
}
VCFInfoHeaderLine targetHeaderLine = null;
- for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
+ for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
if ( line instanceof VCFInfoHeaderLine ) {
- VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
+ final VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
if ( infoline.getID().equals(expression.fieldName) ) {
targetHeaderLine = infoline;
break;
@@ -285,7 +287,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
Map<String, AlignmentContext> stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
- annotatedVCs = new ArrayList<VariantContext>(VCs.size());
+ annotatedVCs = new ArrayList<>(VCs.size());
for ( VariantContext vc : VCs )
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
View
56 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -58,15 +58,15 @@
public RodBinding<VariantContext> binding;
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
- int indexOfDot = fullExpression.lastIndexOf(".");
+ final int indexOfDot = fullExpression.lastIndexOf(".");
if ( indexOfDot == -1 )
throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
fullName = fullExpression;
fieldName = fullExpression.substring(indexOfDot+1);
- String bindingName = fullExpression.substring(0, indexOfDot);
- for ( RodBinding<VariantContext> rod : bindings ) {
+ final String bindingName = fullExpression.substring(0, indexOfDot);
+ for ( final RodBinding<VariantContext> rod : bindings ) {
if ( rod.getName().equals(bindingName) ) {
binding = rod;
break;
@@ -96,7 +96,7 @@ public VariantAnnotatorEngine(List<String> annotationGroupsToUse, List<String> a
// select specific expressions to use
public void initializeExpressions(Set<String> expressionsToUse) {
// set up the expressions
- for ( String expression : expressionsToUse )
+ for ( final String expression : expressionsToUse )
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
}
@@ -113,15 +113,15 @@ private void excludeAnnotations(List<String> annotationsToExclude) {
if ( annotationsToExclude.size() == 0 )
return;
- List<InfoFieldAnnotation> tempRequestedInfoAnnotations = new ArrayList<InfoFieldAnnotation>(requestedInfoAnnotations.size());
- for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
+ final List<InfoFieldAnnotation> tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size());
+ for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) {
if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) )
tempRequestedInfoAnnotations.add(annotation);
}
requestedInfoAnnotations = tempRequestedInfoAnnotations;
- List<GenotypeAnnotation> tempRequestedGenotypeAnnotations = new ArrayList<GenotypeAnnotation>(requestedGenotypeAnnotations.size());
- for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
+ final List<GenotypeAnnotation> tempRequestedGenotypeAnnotations = new ArrayList<>(requestedGenotypeAnnotations.size());
+ for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) )
tempRequestedGenotypeAnnotations.add(annotation);
}
@@ -143,24 +143,24 @@ private void initializeDBs(final GenomeAnalysisEngine engine) {
variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser());
}
- public void invokeAnnotationInitializationMethods( Set<VCFHeaderLine> headerLines ) {
- for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
+ public void invokeAnnotationInitializationMethods( final Set<VCFHeaderLine> headerLines ) {
+ for ( final VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) {
annotation.initialize(walker, toolkit, headerLines);
}
- for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
+ for ( final VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.initialize(walker, toolkit, headerLines);
}
}
public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
- Set<VCFHeaderLine> descriptions = new HashSet<VCFHeaderLine>();
+ final Set<VCFHeaderLine> descriptions = new HashSet<>();
- for ( InfoFieldAnnotation annotation : requestedInfoAnnotations )
+ for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations )
descriptions.addAll(annotation.getDescriptions());
- for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations )
+ for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations )
descriptions.addAll(annotation.getDescriptions());
- for ( String db : variantOverlapAnnotator.getOverlapNames() ) {
+ for ( final String db : variantOverlapAnnotator.getOverlapNames() ) {
if ( VCFStandardHeaderLines.getInfoLine(db, false) != null )
descriptions.add(VCFStandardHeaderLines.getInfoLine(db));
else
@@ -170,10 +170,10 @@ public void invokeAnnotationInitializationMethods( Set<VCFHeaderLine> headerLine
return descriptions;
}
- public VariantContext annotateContext(final RefMetaDataTracker tracker,
- final ReferenceContext ref,
- final Map<String, AlignmentContext> stratifiedContexts,
- VariantContext vc) {
+ public VariantContext annotateContext(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ final Map<String, AlignmentContext> stratifiedContexts,
+ final VariantContext vc) {
return annotateContext(tracker, ref, stratifiedContexts, vc, null);
}
@@ -182,20 +182,20 @@ public VariantContext annotateContext(final RefMetaDataTracker tracker,
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String,PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
- Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
+ final Map<String, Object> infoAnnotations = new LinkedHashMap<>(vc.getAttributes());
// annotate expressions where available
annotateExpressions(tracker, ref.getLocus(), infoAnnotations);
// go through all the requested info annotationTypes
- for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
- Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap);
+ for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
+ final Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap);
if ( annotationsFromCurrentType != null )
infoAnnotations.putAll(annotationsFromCurrentType);
}
// generate a new annotated VC
- VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
+ final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
// annotate genotypes, creating another new VC in the process
final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make();
@@ -210,11 +210,11 @@ public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tr
final Map<String, Object> infoAnnotations = new LinkedHashMap<>(vc.getAttributes());
// go through all the requested info annotationTypes
- for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
+ for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
if ( !(annotationType instanceof ActiveRegionBasedAnnotation) )
continue;
- Map<String, Object> annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc);
+ final Map<String, Object> annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc);
if ( annotationsFromCurrentType != null ) {
infoAnnotations.putAll(annotationsFromCurrentType);
}
@@ -244,12 +244,12 @@ private VariantContext annotateDBs(final RefMetaDataTracker tracker, VariantCont
}
private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map<String, Object> infoAnnotations) {
- for ( VAExpression expression : requestedExpressions ) {
- Collection<VariantContext> VCs = tracker.getValues(expression.binding, loc);
+ for ( final VAExpression expression : requestedExpressions ) {
+ final Collection<VariantContext> VCs = tracker.getValues(expression.binding, loc);
if ( VCs.size() == 0 )
continue;
- VariantContext vc = VCs.iterator().next();
+ final VariantContext vc = VCs.iterator().next();
// special-case the ID field
if ( expression.fieldName.equals("ID") ) {
if ( vc.hasID() )
View
23 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
@@ -44,10 +44,11 @@
import java.io.PrintStream;
/**
- * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine)
+ * Compute the read error rate per position
*
- * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read
- * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS.
+ * <p>This tool computes the read error rate per position in sequence reads. It does this in the original 5'->3'
+ * orientation that the read had coming off the machine. It then emits a GATKReport containing readgroup, cycle,
+ * mismatches, counts, qual, and error rate for each read group in the input BAMs.</p>
*
* <h3>Input</h3>
* <p>
@@ -56,9 +57,9 @@
*
* <h3>Output</h3>
* <p>
- * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate.
+ * A GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate.
*
- * For example, running this tool on the NA12878 data sets:
+ * For example, running this tool on the NA12878 data sets yields the following table:
*
* <pre>
* ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
@@ -82,16 +83,20 @@
* </pre>
* </p>
*
- * <h3>Examples</h3>
+ * <h3>Example</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ErrorRatePerCycle
- * -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
- * -R bundle/current/b37/human_g1k_v37.fasta
- * -o example.gatkreport.txt
+ * -R human_g1k_v37.fasta
+ * -I my_sequence_reads.bam
+ * -o error_rates.gatkreport.txt
* </pre>
*
+ * <h3>Caveat</h3>
+ *
+ * <p>Note that when it is run on paired-end sequence data, this tool only uses the first read in a pair.</p>
+ *
* @author Kiran Garimella, Mark DePristo
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
View
103 ...ic/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
@@ -38,7 +38,10 @@
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
/**
* Outputs the read lengths of all the reads in a file.
@@ -77,51 +80,101 @@
@Output
public PrintStream out;
- private GATKReport report;
+ //A map from RG to its column number (its index in an int[] array)
+ private Map<SAMReadGroupRecord,Integer> readGroupsLocation;
+    //Each line in the table is a read length and each column is the number of reads of a specific RG with that length. Thus a table is a map from read lengths to arrays of values (one for each RG).
+ private Map<Integer,int[]> table;
+ private List<SAMReadGroupRecord> readGroups;
public void initialize() {
- final List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
-
- report = new GATKReport();
- report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroups.isEmpty() ? 1 : readGroups.size()));
- GATKReportTable table = report.getTable("ReadLengthDistribution");
-
- table.addColumn("readLength");
-
- if (readGroups.isEmpty())
- table.addColumn("SINGLE_SAMPLE");
- else
- for (SAMReadGroupRecord rg : readGroups)
- table.addColumn(rg.getSample());
- }
-
- public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
- return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag());
+ readGroups = getToolkit().getSAMFileHeader().getReadGroups();
+ readGroupsLocation = new HashMap<>();
+ table = new TreeMap<>();
+ int readGroupsNum = 0;
+
+ if (!readGroups.isEmpty()){
+ for (SAMReadGroupRecord rg : readGroups){
+ readGroupsLocation.put(rg,readGroupsNum);
+ readGroupsNum++;
+ }
+ }
}
@Override
- public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) {
- GATKReportTable table = report.getTable("ReadLengthDistribution");
+ public Integer map(final ReferenceContext referenceContext,final GATKSAMRecord samRecord,final RefMetaDataTracker RefMetaDataTracker) {
- int length = Math.abs(samRecord.getReadLength());
- String sample = samRecord.getReadGroup().getSample();
+ final int length = Math.abs(samRecord.getReadLength());
+ final SAMReadGroupRecord rg = samRecord.getReadGroup();
- table.increment(length, sample);
+ increment(table,length, rg);
return null;
}
+ final private void increment(final Map<Integer,int[]> table,final int length,final SAMReadGroupRecord rg){
+ if(readGroupsLocation.isEmpty()){
+ if(table.containsKey(length))
+ table.get(length)[0]++;
+ else{
+ final int[] newLength = {1};
+ table.put(length,newLength);
+ }
+ }
+ else{
+ final int rgLocation = readGroupsLocation.get(rg);
+ if(table.containsKey(length))
+ table.get(length)[rgLocation]++;
+ else{
+ table.put(length,new int[readGroupsLocation.size()]);
+ table.get(length)[rgLocation]++;
+ }
+ }
+ }
+
@Override
public Integer reduceInit() {
return null;
}
@Override
- public Integer reduce(Integer integer, Integer integer1) {
+ public Integer reduce(final Integer integer,final Integer integer1) {
return null;
}
- public void onTraversalDone(Integer sum) {
+ public void onTraversalDone(final Integer sum) {
+ final GATKReport report = createGATKReport();
report.print(out);
}
+
+ final private GATKReport createGATKReport(){
+ final GATKReport report = new GATKReport();
+ report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroupsLocation.isEmpty() ? 1 : readGroupsLocation.size()));
+ final GATKReportTable tableReport = report.getTable("ReadLengthDistribution");
+
+ tableReport.addColumn("readLength");
+
+ if (readGroupsLocation.isEmpty()){
+ tableReport.addColumn("SINGLE_SAMPLE");
+ int rowIndex = 0;
+ for (Integer length : table.keySet()){
+ tableReport.set(rowIndex,0,length);
+ tableReport.set(rowIndex,1,table.get(length)[0]);
+ rowIndex++;
+ }
+ }
+ else{
+ for (SAMReadGroupRecord rg : readGroups)
+ tableReport.addColumn(rg.getSample());
+ int rowIndex = 0;
+ for (Integer length : table.keySet()){
+ tableReport.set(rowIndex,0,length);
+ for (int i=0; i < readGroupsLocation.size(); i++)
+ tableReport.set(rowIndex,i+1,table.get(length)[i]);
+ rowIndex++;
+ }
+
+ }
+
+ return report;
+ }
}
View
2 public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java
@@ -230,7 +230,7 @@ public void initialize() {
filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS);
- VariantContextUtils.engine.setSilent(true);
+ VariantContextUtils.engine.get().setSilent(true);
initializeVcfWriter();
}
View
58 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
@@ -57,36 +57,34 @@
import java.util.regex.Pattern;
/**
- * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
+ * Read clipping based on quality, position or sequence matching
*
+ * <p>This tool provides simple, powerful read clipping capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.</p>
*
- * <p>
- * It allows the user to clip bases in reads with poor quality scores, that match particular
- * sequences, or that were generated by particular machine cycles.
+ * <p>There are three options for clipping (quality, position and sequence), which can be used alone or in combination. In addition, you can also specify a clipping representation, which determines exactly how ClipReads applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at least one of the three clipping options, and specifying a clipping representation is not sufficient. If you do not specify a clipping option, the program will run but it will not do anything to your reads.</p>
*
* <dl>
* <dt>Quality score based clipping</dt>
* <dd>
* Clip bases from the read in clipper from
- * <br>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</br>
- * to the end of the read. This is blatantly stolen from BWA.
+ * <pre>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</pre>
+ * to the end of the read. This is copied from BWA.
*
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
* clipping index in the read (from the end).
- * </dd>
+ * </dd><br />
* <dt>Cycle based clipping</dt>
* <dd>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
* For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
- * </dd>
+ * </dd><br />
* <dt>Sequence matching</dt>
* <dd>Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm,
* filtering only bases whose sequence exactly matches SEQ.</dd>
* </dl>
*
- * </p>
*
* <h3>Input</h3>
* <p>
@@ -99,7 +97,7 @@
* operation applied to each read.
* </p>
* <p>
- * <h3>Summary output</h3>
+ * <h4>Summary output (console)</h4>
* <pre>
* Number of examined reads 13
* Number of clipped reads 13
@@ -113,43 +111,50 @@
* </pre>
* </p>
*
- * <p>
- * <h3>Example clipping</h3>
- * Suppose we are given this read:
+ * <h3>Example</h3>
+ * <pre>
+ * java -jar GenomeAnalysisTK.jar \
+ * -T ClipReads \
+ * -R reference.fasta \
+ * -I original.bam \
+ * -o clipped.bam \
+ * -XF seqsToClip.fasta \
+ * -X CCCCC \
+ * -CT "1-5,11-15" \
+ * -QT 10
+ * </pre>
+ * <p>The command line shown above will apply all three options in combination. See the detailed examples below to see how the choice of clipping representation affects the output.</p>
+ *
+ * <h4>Detailed clipping examples</h4>
+ * <p>Suppose we are given this read:</p>
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
- * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
+ * <p>If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:</p>
*
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
- * Whereas with -CR WRITE_Q0S:
+ * <p>Whereas with -QT 10 -CR WRITE_Q0S:</p>
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
- * Or -CR SOFTCLIP_BASES:
+ * <p>Or -QT 10 -CR SOFTCLIP_BASES:</p>
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
- * </p>
*
- * <h3>Examples</h3>
- * <pre>
- * -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
- * -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
- * </pre>
* @author Mark DePristo
* @since 2010
@@ -158,10 +163,9 @@
@Requires({DataSource.READS})
public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipReads.ClippingData> {
/**
- * If provided, ClipReads will write summary statistics about the clipping operations applied
- * to the reads to this file.
+ * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file.
*/
- @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false, defaultToStdout = false)
+ @Output(fullName = "outputStatistics", shortName = "os", doc = "File to output statistics", required = false, defaultToStdout = false)
PrintStream out = null;
/**
@@ -305,7 +309,7 @@ private void addSeqToClip(String name, byte[] bases) {
*/
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
- if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES )
+ if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES || clippingRepresentation == ClippingRepresentation.REVERT_SOFTCLIPPED_BASES )
read = ReadClipper.revertSoftClippedBases(read);
ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip);
@@ -600,4 +604,4 @@ public void addData(ClippingData data) {
}
-}
+}
View
2 .../java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
@@ -197,6 +197,8 @@ else if (vc1.isSimpleDeletion())
break;
case MIXED:
break;
+ case UNAVAILABLE:
+ break;
default:
throw new ReviewedStingException("BUG: Unexpected genotype type: " + g);
}
View
50 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@@ -164,14 +164,17 @@
@Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false)
public boolean minimalVCF = false;
+ @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false)
+ public boolean EXCLUDE_NON_VARIANTS = false;
+
/**
* Set to 'null' if you don't want the set field emitted.
*/
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
public String SET_KEY = "set";
/**
- * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime..
+ * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.
*/
@Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false)
public boolean ASSUME_IDENTICAL_SAMPLES = false;
@@ -188,6 +191,9 @@
@Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
public boolean MERGE_INFO_WITH_MAX_AC = false;
+ @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false)
+ public boolean COMBINE_ANNOTATIONS = false;
+
private List<String> priority = null;
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
@@ -229,16 +235,14 @@ public void initialize() {
vcfWriter.writeHeader(vcfHeader);
}
-
-
private void validateAnnotateUnionArguments() {
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null )
throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes");
if ( PRIORITY_STRING != null){
- priority = new ArrayList<String>(Arrays.asList(PRIORITY_STRING.split(",")));
+ priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(",")));
if ( rodNames.size() != priority.size() )
throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority);
@@ -252,13 +256,16 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
- Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
+ final Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
// get all of the vcf rods at this locus
// Need to provide reference bases to simpleMerge starting at current locus
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
+ Collection<VariantContext> potentialRefVCs = tracker.getValues(variants);
+ potentialRefVCs.removeAll(vcs);
if ( sitesOnlyVCF ) {
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
+ potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs);
}
if ( ASSUME_IDENTICAL_SAMPLES ) {
@@ -270,24 +277,24 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
}
int numFilteredRecords = 0;
- for (VariantContext vc : vcs) {
+ for (final VariantContext vc : vcs) {
if (vc.filtersWereApplied() && vc.isFiltered())
numFilteredRecords++;
}
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
return 0;
- List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
+ final List<VariantContext> mergedVCs = new ArrayList<>();
if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
- Map<VariantContext.Type, List<VariantContext>> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs);
+ final Map<VariantContext.Type, List<VariantContext>> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs);
// TODO -- clean this up in a refactoring
// merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type)
if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) {
final List<VariantContext> refs = VCsByType.remove(VariantContext.Type.NO_VARIATION);
- for ( VariantContext.Type type : VariantContext.Type.values() ) {
+ for ( final VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) ) {
VCsByType.get(type).addAll(refs);
break;
@@ -296,33 +303,40 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
}
// iterate over the types so that it's deterministic
- for (VariantContext.Type type : VariantContext.Type.values()) {
- if (VCsByType.containsKey(type))
- mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type),
- priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
- SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
+ for (final VariantContext.Type type : VariantContext.Type.values()) {
+ // make sure that it is a variant or in case it is not, that we want to include the sites with no variants
+ if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) {
+ if (VCsByType.containsKey(type)) {
+ mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs,
+ priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
+ SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
+ }
+ }
}
}
else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
- mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs,
+ mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs,
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
- SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
+ SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());
}
- for ( VariantContext mergedVC : mergedVCs ) {
+ for ( final VariantContext mergedVC : mergedVCs ) {
// only operate at the start of events
if ( mergedVC == null )
continue;
final VariantContextBuilder builder = new VariantContextBuilder(mergedVC);
// re-compute chromosome counts
VariantContextUtils.calculateChromosomeCounts(builder, false);
+
if ( minimalVCF )
GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
- vcfWriter.add(builder.make());
+ final VariantContext vc = builder.make();
+ if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() )
+ vcfWriter.add(builder.make());
}
return vcs.isEmpty() ? 0 : 1;
View
3 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java
@@ -42,6 +42,7 @@
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.variant.variantcontext.writer.Options;
import org.broadinstitute.variant.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@@ -118,7 +119,7 @@ public void initialize() {
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
- writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), VariantContextWriterFactory.NO_OPTIONS);
+ writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
writer.writeHeader(vcfHeader);
}
View
19 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -299,7 +299,7 @@
@Argument(doc="indel size select",required=false,fullName="maxIndelSize")
private int maxIndelSize = Integer.MAX_VALUE;
- @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")
+ @Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
@@ -657,6 +657,7 @@ public void onTraversalDone(Integer result) {
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
*
* @param vc the VariantContext record to subset
+ * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
@@ -665,14 +666,10 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean exclu
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
- VariantContextBuilder builder = new VariantContextBuilder(sub);
+ final VariantContextBuilder builder = new VariantContextBuilder(sub);
- GenotypesContext newGC = sub.getGenotypes();
-
- // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
- final boolean lostAllelesInSelection = vc.getAlleles().size() != sub.getAlleles().size();
- if ( lostAllelesInSelection )
- newGC = GATKVariantContextUtils.stripPLsAndAD(sub.getGenotypes());
+ // if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
+ GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
if ( vc.getNSamples() != sub.getNSamples() ) {
@@ -682,11 +679,11 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean exclu
// Remove a fraction of the genotypes if needed
if ( fractionGenotypes > 0 ){
- ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
+ final ArrayList<Genotype> genotypes = new ArrayList<>();
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
- List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
+ final List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
}
else{
@@ -698,7 +695,7 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean exclu
builder.genotypes(newGC);
- addAnnotations(builder, sub, lostAllelesInSelection);
+ addAnnotations(builder, sub, vc.getAlleles().size() != sub.getAlleles().size());
return builder.make();
}
View
12 public/java/src/org/broadinstitute/sting/tools/CatVariants.java
@@ -31,12 +31,15 @@
import org.apache.log4j.Level;
import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader;
+import org.broad.tribble.index.IndexCreator;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
+import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.bcf2.BCF2Codec;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.vcf.VCFCodec;
@@ -123,6 +126,12 @@
 @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false)
private Boolean assumeSorted = false;
+ @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false)
+ private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE;
+
+ @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false)
+ private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER;
+
/*
* print usage information
*/
@@ -204,7 +213,8 @@ protected int execute() throws Exception {
FileOutputStream outputStream = new FileOutputStream(outputFile);
EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
- final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), options);
+ final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile);
+ final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options);
boolean firstFile = true;
int count =0;
View
42 public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@@ -138,6 +138,48 @@ static public boolean basesAreEqual(byte base1, byte base2) {
return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2);
}
+ /**
+     * Checks whether two bases are the same, ignoring ambiguous 'N' bases.
+ *
+ * @param base1 first base to compare.
+ * @param base2 second base to compare.
+ * @return true if {@code base1 == base2} or either is an 'N', false otherwise.
+ */
+ static public boolean basesAreEqualIgnoreAmbiguous(final byte base1, final byte base2) {
+ if (base1 == base2) return true;
+ else if (base1 == 'n' || base1 == 'N' || base2 == 'N' || base2 == 'n') return true;
+ else return false;
+ }
+
+ /**
+     * Compares two base array ranges, checking whether they contain the same bases.
+ *
+ * <p>
+     * By default two empty ranges are considered to have equal bases, i.e. {@code length == 0} results in {@code true}.
+ * </p>
+ *
+ * @param bases1 first base array to compare.
+ * @param offset1 position of the first base in bases1 to compare.
+ * @param bases2 second base array to compare.
+ * @param offset2 position of the first base in bases2 to compare.
+ * @param length number of bases to compare.
+ *
+ * @throws NullPointerException if {@code bases1} or {@code bases2} is {@code null}.
+ * @throws ArrayIndexOutOfBoundsException if:
+ * <ul>
+ * <li>{@code offset1} is not within the range [0,{@code bases1.length}) or</li>
+ * <li>{@code offset2} is not within the range [0,{@code bases2.length}) or</li>
+ * <li>{@code offset1 + length} is not within the range [0,{@code bases1.length}) or </li>
+ * <li>{@code offset2 + length} is not within the range [0,{@code bases2.length})</li>
+ * </ul>
+     * @return {@code true} if the two ranges contain equal bases (ignoring ambiguous 'N' bases), {@code false} otherwise.
+ */
+ static public boolean basesAreEqualIgnoreAmbiguous(final byte[] bases1, final int offset1, final byte[] bases2, final int offset2, final int length) {
+ for (int i = 0; i < length; i++)
+ if (!basesAreEqualIgnoreAmbiguous(bases1[offset1 + i],bases2[offset2 + i])) return false;
+ return true;
+ }
+
static public boolean extendedBasesAreEqual(byte base1, byte base2) {
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
}
View
62 public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -30,7 +30,6 @@
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import javax.annotation.Nullable;
import java.math.BigDecimal;
import java.util.*;
@@ -425,7 +424,6 @@ public static double log10BinomialProbability(final int n, final int k) {
* happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a
* utility function.
*/
- @Nullable
static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) {
if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) {
return null;
@@ -845,17 +843,29 @@ public static int arrayMin(final List<Integer> array) {
}
/**
- * Compute the median element of the array of integers
+ * Compute the median element of the list of integers
* @param array a list of integers
* @return the median element
*/
- public static int median(final List<Integer> array) {
+ public static <T extends Comparable<? super T>> T median(final List<T> array) {
+ /* TODO -- from Valentin
+ the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2).
+
+ But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia]
+
+ My suggestion for a solution is then:
+
+ unify median and medianDoubles to public static <T extends Number> T median(Collection<T>)
+ check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc.
+ relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2)
+ In addition, the current median implementation sorts the whole input list, which is O(n log n). However, finding the ith element (and thus computing the median) can be done in O(n)
+ */
if ( array == null ) throw new IllegalArgumentException("Array must be non-null");
final int size = array.size();
if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0");
else if ( size == 1 ) return array.get(0);
else {
- final ArrayList<Integer> sorted = new ArrayList<>(array);
+ final ArrayList<T> sorted = new ArrayList<>(array);
Collections.sort(sorted);
return sorted.get(size / 2);
}
@@ -966,6 +976,16 @@ public static int countOccurrences(byte element, byte[] array) {
return count;
}
+ public static int countOccurrences(final boolean element, final boolean[] array) {
+ int count = 0;
+ for (final boolean b : array) {
+ if (element == b)
+ count++;
+ }
+
+ return count;
+ }
+
/**
* Returns n random indices drawn with replacement from the range 0..(k-1)
@@ -1405,7 +1425,7 @@ public static double log10Factorial(final int x) {
* @return
*/
public static List<Integer> log10LinearRange(final int start, final int stop, final double eps) {
- final LinkedList<Integer> values = new LinkedList<Integer>();
+ final LinkedList<Integer> values = new LinkedList<>();
final double log10range = Math.log10(stop - start);
if ( start == 0 )
@@ -1460,4 +1480,34 @@ else if ( x == 0.0 )
return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list);
}
+ /**
+ * Return the likelihood of observing the counts of categories having sampled a population
+ * whose categorical frequencies are distributed according to a Dirichlet distribution
+ * @param dirichletParams - params of the prior dirichlet distribution
+ * @param dirichletSum - the sum of those parameters
+ * @param counts - the counts of observation in each category
+ * @param countSum - the sum of counts (number of trials)
+ * @return - associated likelihood
+ */
+ public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum,
+ final int[] counts, final int countSum) {
+ if ( dirichletParams.length != counts.length ) {
+ throw new IllegalStateException("The number of dirichlet parameters must match the number of categories");
+ }
+ // todo -- lots of lnGammas here. At some point we can safely switch to x * ( ln(x) - 1)
+ double likelihood = log10MultinomialCoefficient(countSum,counts);
+ likelihood += log10Gamma(dirichletSum);
+ likelihood -= log10Gamma(dirichletSum+countSum);
+ for ( int idx = 0; idx < counts.length; idx++ ) {
+ likelihood += log10Gamma(counts[idx] + dirichletParams[idx]);
+ likelihood -= log10Gamma(dirichletParams[idx]);
+ }
+
+ return likelihood;
+ }
+
+ public static double dirichletMultinomial(double[] params, int[] counts) {
+ return dirichletMultinomial(params,sum(params),counts,(int) sum(counts));
+ }
+
}
View
1 public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
@@ -63,6 +63,7 @@
private static double qualToErrorProbCache[] = new double[256];
private static double qualToProbLog10Cache[] = new double[256];
+
static {
for (int i = 0; i < 256; i++) {
qualToErrorProbCache[i] = qualToErrorProb((double) i);
View
15 public/java/src/org/broadinstitute/sting/utils/Utils.java
@@ -27,7 +27,6 @@
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
-import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMProgramRecord;
import org.apache.log4j.Logger;
@@ -835,4 +834,18 @@ public static int longestCommonSuffix(final byte[] seq1, final byte[] seq2, fina
// don't perform array copies if we need to copy everything anyways
return ( trimFromFront == 0 && trimFromBack == 0 ) ? seq : Arrays.copyOfRange(seq, trimFromFront, seq.length - trimFromBack);
}
+
+ /**
+ * Simple wrapper for sticking elements of an int[] array into a List<Integer>
+ * @param ar - the array whose elements should be listified
+ * @return - a List<Integer> where each element has the same value as the corresponding index in @ar
+ */
+ public static List<Integer> listFromPrimitives(final int[] ar) {
+ final ArrayList<Integer> lst = new ArrayList<>(ar.length);
+ for ( final int d : ar ) {
+ lst.add(d);
+ }
+
+ return lst;
+ }
}
View
18 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java
@@ -32,9 +32,7 @@
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.HasGenomeLocation;
-import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
@@ -109,6 +107,12 @@
*/
private GenomeLoc spanIncludingReads;
+
+ /**
+ * Indicates whether the active region has been finalized
+ */
+ private boolean hasBeenFinalized;
+
/**
* Create a new ActiveRegion containing no reads
*
@@ -205,7 +209,7 @@ public String toString() {
* @return a non-null array of bytes holding the reference bases in referenceReader
*/
@Ensures("result != null")
- private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
+ public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");
@@ -451,4 +455,12 @@ public ActiveRegion trim(final GenomeLoc newExtent) {
return new ActiveRegion( subActive, Collections.<ActivityProfileState>emptyList(), isActive, genomeLocParser, requiredExtension );
}
+
+ public void setFinalized(final boolean value) {
+ hasBeenFinalized = value;
+ }
+
+ public boolean isFinalized() {
+ return hasBeenFinalized;
+ }
}
View
9 public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@@ -33,6 +33,7 @@
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
@@ -455,6 +456,14 @@ public KeySignatureVerificationException ( File f ) {
}
}
+ public static class GVCFIndexException extends UserException {
+ public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) {
+ super(String.format("GVCF output requires a specific indexing strategy. Please re-run including the arguments " +
+ "-variant_index_type %s -variant_index_parameter %d.",
+ indexType, indexParameter));
+ }
+ }
+
/**
* A special exception that happens only in the case where
* the filesystem, by design or configuration, is completely unable
View
2 public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java
@@ -65,7 +65,7 @@
* @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele
* @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available)
*/
- public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
+ public MostLikelyAllele(final Allele mostLikely, final Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null");
if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) )
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely);
View
35 public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
@@ -110,7 +110,7 @@ public void performPerAlleleDownsampling(final double downsamplingFraction) {
* @return a map from each allele to a list of reads that 'support' the allele
*/
protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() {
- final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
+ final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>(alleles.size());
for ( final Allele allele : alleles )
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
@@ -152,7 +152,7 @@ public void add(PileupElement p, Allele a, Double likelihood) {
/**
* Does the current map contain the key associated with a particular SAM record in pileup?
* @param p Pileup element
- * @return
+ * @return true if the map contains the read of the given pileup element, else false
*/
public boolean containsPileupElement(final PileupElement p) {
return likelihoodReadMap.containsKey(p.getRead());
@@ -176,9 +176,9 @@ public void clear() {
return likelihoodReadMap.keySet();
}
- public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
- return likelihoodReadMap.values();
- }
+// public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
+// return likelihoodReadMap.values();
+// }
public int getNumberOfStoredElements() {
return likelihoodReadMap.size();
@@ -191,6 +191,21 @@ public int getNumberOfStoredElements() {
return likelihoodReadMap.get(p.getRead());
}
+
+ /**
+ * Get the log10 likelihood associated with an individual read/allele
+ *
+ * @param read the read whose likelihood we want
+ * @param allele the allele whose likelihood we want
+ * @return the log10 likelihood that this read matches this allele
+ */
+ public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){
+ if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read))
+ return 0.0;
+
+ return likelihoodReadMap.get(read).get(allele);
+ }
+
/**
* Get the most likely alleles estimated across all reads in this object
*
@@ -290,18 +305,16 @@ public static MostLikelyAllele getMostLikelyAllele( final Map<Allele,Double> all
/**
* Debug method to dump contents of object into string for display
*/
- @Override
public String toString() {
- StringBuilder sb = new StringBuilder();
+ final StringBuilder sb = new StringBuilder();
sb.append("Alelles in map:");
- for (Allele a:alleles) {
+ for (final Allele a:alleles) {
sb.append(a.getDisplayString()+",");
-
}
sb.append("\n");
- for (Map.Entry <GATKSAMRecord, Map<Allele, Double>> el : getLikelihoodReadMap().entrySet() ) {
- for (Map.Entry<Allele,Double> eli : el.getValue().entrySet()) {
+ for (final Map.Entry <GATKSAMRecord, Map<Allele, Double>> el : getLikelihoodReadMap().entrySet() ) {
+ for (final Map.Entry<Allele,Double> eli : el.getValue().entrySet()) {
sb.append("Read "+el.getKey().getReadName()+". Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n");
}
View
31 public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java
@@ -38,10 +38,13 @@
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
+import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
public class Haplotype extends Allele {
+
+
private GenomeLoc genomeLocation = null;
private EventMap eventMap = null;
private Cigar cigar;
@@ -214,7 +217,7 @@ public Cigar getConsolidatedPaddedCigar(final int padSize) {
public void setCigar( final Cigar cigar ) {
this.cigar = AlignmentUtils.consolidateCigar(cigar);
if ( this.cigar.getReadLength() != length() )
- throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength());
+ throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar);
}
@Requires({"refInsertLocation >= 0"})
@@ -311,4 +314,30 @@ public double getScore() {
public void setScore(double score) {
this.score = this.isReference() ? Double.MAX_VALUE : score;
}
+
+ /**
+ * Comparator used to sort haplotypes, alphanumerically.
+ *
+ * <p>
+ * If one haplotype is a prefix of the other, the longer one comes first (ties on equal length are 0).
+ * </p>
+ */
+ public static final Comparator<Haplotype> ALPHANUMERICAL_COMPARATOR = new Comparator<Haplotype>() {
+
+ @Override
+ public int compare(final Haplotype o1, final Haplotype o2) {
+ if (o1 == o2)
+ return 0;
+ final byte[] bases1 = o1.getBases();
+ final byte[] bases2 = o2.getBases();
+ final int iLimit = Math.min(bases1.length, bases2.length);
+ for (int i = 0; i < iLimit; i++) {
+ final int cmp = Byte.compare(bases1[i], bases2[i]);
+ if (cmp != 0) return cmp;
+ }
+ if (bases1.length == bases2.length) return 0;
+ return (bases1.length > bases2.length) ? -1 : 1; // it is slightly better to get the longest haplotypes first.
+ }
+ };
+
}
View
98 public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java
@@ -1,27 +1,27 @@
/*