Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge remote-tracking branch 'unstable/master'

  • Loading branch information...
commit 3ce0a32da7fa141050c433725b9e05bfb4ca4a32 2 parents d785397 + 7a7adb7
@eitanbanks eitanbanks authored
Showing with 8,084 additions and 1,349 deletions.
  1. +2 −0  .gitignore
  2. +70 −56 build.xml
  3. +7 −2 ivy.xml
  4. +43 −0 licensing/private_license.txt
  5. +43 −0 licensing/protected_license.txt
  6. +22 −0 licensing/public_license.txt
  7. +0 −34 protected/java/src/org/broadinstitute/sting/gatk/DummyProtectedClass.java
  8. +184 −0 protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
  9. +0 −296 protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
  10. +102 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
  11. +98 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java
  12. +97 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java
  13. +119 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java
  14. +162 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
  15. +327 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
  16. +110 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
  17. +111 −96 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
  18. +125 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java
  19. +157 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
  20. +143 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
  21. +131 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
  22. +102 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
  23. +134 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java
  24. +130 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
  25. +136 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java
  26. +188 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
  27. +198 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
  28. +100 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java
  29. +102 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java
  30. +109 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java
  31. +162 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
  32. +106 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java
  33. +0 −84 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java
  34. +107 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
  35. +135 −76 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
  36. +184 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
  37. +71 −24 ... → protected}/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
  38. +237 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
  39. +131 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
  40. +47 −4 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java
  41. +48 −11 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java
  42. +52 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java
  43. +48 −1 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
  44. +59 −13 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
  45. +46 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java
  46. +82 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java
  47. +67 −20 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java
  48. +47 −2 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
  49. +59 −47 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
  50. +46 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
  51. +47 −4 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
  52. +119 −62 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
  53. +141 −45 protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
  54. +300 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
  55. +83 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
  56. +50 −27 ...c → protected}/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
  57. +116 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
  58. +172 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java
  59. +104 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java
  60. +44 −22 ... → protected}/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java
  61. +174 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java
  62. +52 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
  63. +64 −58 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
  64. +48 −26 ... → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
  65. +55 −8 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
  66. +47 −26 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
  67. +49 −25 ...a/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
  68. +48 −2 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
  69. +45 −24 .../org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
  70. +48 −13 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
  71. +48 −1 ...rc/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
  72. +141 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
  73. +56 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java
  74. +49 −28 ...ted}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
  75. +46 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
  76. +46 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java
  77. +58 −29 ...ected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
  78. +54 −26 ...c → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
  79. +62 −56 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
  80. +83 −45 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
  81. +99 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java
  82. +221 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java
  83. +47 −1 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java
  84. +50 −4 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
  85. +45 −24 {public → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java
  86. +47 −1 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
  87. +48 −26 ...c → protected}/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
  88. +92 −0 protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java
Sorry, we could not display the entire diff because too many files (1,529) changed.
View
2  .gitignore
@@ -12,6 +12,7 @@
*.ipr
*.iws
*.iml
+*.pyc
.DS_Store
queueScatterGather
/foo*
@@ -23,3 +24,4 @@ dist/
dump/
lib/
out/
+/atlassian-ide-plugin.xml
View
126 build.xml
@@ -107,7 +107,12 @@
<!-- To run tests with debugging, use -Dtest.debug=true -Dtest.debug.port=XXXX on the command line -->
<property name="test.debug.port" value="5005" /> <!-- override on the command line if desired -->
+ <property name="test.default.maxmemory" value="4g"/>
+ <!-- clover parameters -->
+ <property name="clover.jar" location="private/resources/clover/lib/clover.jar"/>
+ <property name="clover.instrument.level" value="method"/>
+ <taskdef resource="cloverlib.xml" classpath="${clover.jar}"/>
<!-- ******************************************************************************** -->
<!-- Filesets and paths -->
@@ -596,6 +601,7 @@
<path id="doclet.classpath">
<path refid="external.dependencies" />
<pathelement location="${java.classes}" />
+ <pathelement location="${clover.jar}" />
</path>
<javadoc doclet="org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet"
@@ -644,12 +650,8 @@
<target name="sting-utils.jar" depends="gatk.compile, init.jar, R.public.tar, R.script.stage">
<jar jarfile="${dist.dir}/StingUtils.jar">
- <fileset dir="${java.classes}">
- <include name="**/utils/**/*.class"/>
- <exclude name="**/utils/codecs/vcf/**/*.class"/>
- <exclude name="**/utils/variantcontext/**/*.class"/>
- </fileset>
- <fileset dir="${java.classes}" includes="**/commandline/**/*.class"/>
+ <fileset dir="${java.classes}" includes="**/sting/utils/**/*.class"/>
+ <fileset dir="${java.classes}" includes="**/sting/commandline/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/pipeline/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/tools/**/*.class"/>
<fileset dir="${java.classes}" includes="**/sting/jna/**/*.class"/>
@@ -659,7 +661,7 @@
<include name="**/${R.package.path}/**/*.tar.gz"/>
</fileset>
<fileset dir="${R.script.staging.dir}">
- <include name="**/utils/**/*.R"/>
+ <include name="**/sting/utils/**/*.R"/>
</fileset>
<manifest>
<attribute name="Premain-Class" value="org.broadinstitute.sting.utils.instrumentation.Sizeof" />
@@ -667,18 +669,6 @@
</jar>
</target>
- <target name="vcf.jar" depends="gatk.compile,init.jar">
- <jar jarfile="${dist.dir}/vcf.jar">
- <fileset dir="${java.classes}">
- <include name="org/broadinstitute/sting/utils/codecs/vcf/**/*.class"/>
- <include name="org/broadinstitute/sting/utils/codecs/bcf2/**/*.class"/>
- <include name="org/broadinstitute/sting/utils/variantcontext/**/*.class"/>
- <include name="org/broadinstitute/sting/utils/exceptions/**"/>
- <include name="org/broadinstitute/sting/utils/help/DocumentedGATKFeature.class"/>
- </fileset>
- </jar>
- </target>
-
<target name="na12878kb.jar" depends="gatk.compile,init.jar">
<jar jarfile="${dist.dir}/na12878kb.jar">
<fileset dir="${java.classes}">
@@ -703,12 +693,12 @@
<fileset dir="${java.contracts.dir}" />
<fileset dir="${java.classes}">
<include name="${resource.file}" />
- <include name="**/gatk/**/*.class" />
- <include name="**/alignment/**/*.class"/>
+ <include name="**/sting/gatk/**/*.class" />
+ <include name="**/sting/alignment/**/*.class"/>
</fileset>
<fileset dir="${R.script.staging.dir}">
- <include name="**/gatk/**/*.R"/>
- <include name="**/alignment/**/*.R"/>
+ <include name="**/sting/gatk/**/*.R"/>
+ <include name="**/sting/alignment/**/*.R"/>
</fileset>
<fileset dir="${key.dir}">
<include name="**/*.key"/>
@@ -719,7 +709,7 @@
</jar>
<jar jarfile="${dist.dir}/Aligner.jar">
- <fileset dir="${java.classes}" includes="**/alignment/**/*.class" />
+ <fileset dir="${java.classes}" includes="**/sting/alignment/**/*.class" />
</jar>
<subant target="dist" genericantfile="build.xml">
@@ -755,7 +745,7 @@
</jar>
</target>
- <target name="sting.jar" depends="sting-utils.jar, vcf.jar, gatk.jar, queue.jar" />
+ <target name="sting.jar" depends="sting-utils.jar, gatk.jar, queue.jar" />
<target name="init.manifests" depends="sting.jar">
<pathconvert property="jar.classpath" pathsep=" ">
@@ -875,10 +865,6 @@
<property name="executable" value="GenomeAnalysisTK" />
</target>
- <target name="init.executable.gatklite" depends="init.build.publiconly, init.javaonly">
- <property name="executable" value="GenomeAnalysisTKLite" />
- </target>
-
<target name="init.executable.queueall" depends="init.build.all, init.javaandscala">
<property name="executable" value="Queue" />
</target>
@@ -887,10 +873,6 @@
<property name="executable" value="Queue" />
</target>
- <target name="init.executable.queuelite" depends="init.build.publiconly, init.javaandscala">
- <property name="executable" value="QueueLite" />
- </target>
-
<target name="require.executable">
<condition property="no.executable.defined">
<or>
@@ -941,15 +923,10 @@
<!-- Package specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
<target name="package.gatk.full" depends="init.executable.gatkfull,package" />
- <target name="package.gatk.lite" depends="init.executable.gatklite,package" />
-
<target name="package.queue.all" depends="init.executable.queueall,package" />
<target name="package.queue.full" depends="init.executable.queuefull,package" />
- <target name="package.queue.lite" depends="init.executable.queuelite,package" />
-
-
<!-- Release a build. Don't call this target directly. Call one of the specific release targets below -->
<target name="release" depends="require.executable" description="release a build, putting each file in a location specified by the package">
<ant antfile="${package.output.dir}/${executable}.xml" target="release" />
@@ -958,13 +935,8 @@
<!-- Release specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
<target name="release.gatk.full" depends="package.gatk.full,release" />
- <target name="release.gatk.lite" depends="package.gatk.lite,release" />
-
<target name="release.queue.full" depends="package.queue.full,release" />
- <target name="release.queue.lite" depends="package.queue.lite,release" />
-
-
<!-- Build a subset of picard with only those classes we need by completely abusing the packaging system -->
<!-- TODO: Reuse as much as possible of the 'stage' and 'package' targets -->
<target name="build-picard-private" depends="resolve">
@@ -993,7 +965,7 @@
<!-- Maven install a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
<target name="mvninstall" depends="package" description="maven install a package into .m2/repository">
- <property name="mvn.build.version" value="0.0.1" />
+ <property name="mvn.build.version" value="0.0.2" />
<!--
We should use the build version or better yet a git tag version, but tags are currently missing. Alternatively how do we then depend on the LATEST?
<property name="mvn.build.version" value="${build.version}" />
@@ -1008,14 +980,10 @@
<!-- Maven install specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
<target name="mvninstall.gatk.full" depends="package.gatk.full,mvninstall" />
- <target name="mvninstall.gatk.lite" depends="package.gatk.lite,mvninstall" />
-
<target name="mvninstall.queue.all" depends="package.queue.all,mvninstall" />
<target name="mvninstall.queue.full" depends="package.queue.full,mvninstall" />
- <target name="mvninstall.queue.lite" depends="package.queue.lite,mvninstall" />
-
<!-- ******************************************************************************** -->
<!-- Clean -->
<!-- ******************************************************************************** -->
@@ -1032,6 +1000,14 @@
<delete dir="${scaladoc.dir}" />
</target>
+ <target name="-check.clover">
+ <available property="clover.installed" classname="com.cenqua.clover.CloverInstr" />
+ </target>
+
+ <target name="clean.clover" depends="-check.clover" if="clover.installed">
+ <clover-clean/>
+ </target>
+
<target name="clean.gsalib">
<!-- Currently not cleaning out the lib during 'ant clean' -->
<exec executable="R" failonerror="false">
@@ -1039,7 +1015,7 @@
</exec>
</target>
- <target name="clean" description="clean up" depends="clean.javadoc,clean.scaladoc,clean.gatkdocs">
+ <target name="clean" description="clean up" depends="clean.javadoc,clean.scaladoc,clean.gatkdocs,clean.clover">
<delete dir="${build.dir}"/>
<delete dir="${lib.dir}"/>
<delete dir="${contract.dump.dir}"/>
@@ -1097,8 +1073,7 @@
<property name="report" value="${build.dir}/report"/>
<property name="iwww.report.dir" value="${user.home}/private_html/report"/>
<property name="test.output" value="${dist.dir}/test"/>
- <property name="testng.jar" value="${lib.dir}/testng-5.14.1.jar"/>
- <property name="test.maxmemory" value="4g"/> <!-- provide a ceiling on the memory that unit/integration tests can consume. -->
+ <property name="testng.jar" value="${lib.dir}/testng-6.8.jar"/>
<path id="java.test.source.path">
<dirset dir="${basedir}">
@@ -1129,6 +1104,7 @@
<path id="testng.default.classpath">
<path refid="build.results" />
+ <pathelement path="${clover.jar}"/>
<pathelement location="${java.contracts.dir}" />
<pathelement location="${java.test.classes}" />
<pathelement location="${scala.test.classes}" />
@@ -1136,6 +1112,45 @@
<!-- Test targets -->
+ <target name="clover.report">
+ <clover-report coverageCacheSize="nocache">
+ <current outfile="clover_html" title="GATK clover report" showUniqueCoverage="false" numThreads="4">
+ <format type="html" filter="catch,static,property"/>
+ <fileset dir="public">
+ <patternset id="clover.excludes">
+ <exclude name="**/*UnitTest.java"/>
+ <exclude name="**/*TestProvider*.java"/>
+ <exclude name="**/*PerformanceTest.java"/>
+ <exclude name="**/*Benchmark.java"/>
+ <exclude name="**/*LargeScaleTest.java"/>
+ <exclude name="**/*IntegrationTest.java"/>
+ <exclude name="**/jna/**/*.java"/>
+ <exclude name="**/queue/extensions/**/*.java"/>
+ <exclude name="**/sting/utils/help/*.java"/>
+ <exclude name="**/sting/tools/*.java"/>
+ <exclude name="**/datasources/reads/utilities/*.java"/>
+ <exclude name="**/sting/alignment/**/*.java"/>
+ <exclude name="**/examples/**/*.java"/>
+ </patternset>
+ </fileset>
+ <fileset dir="private">
+ <patternset refid="clover.excludes" />
+ </fileset>
+ <fileset dir="protected">
+ <patternset refid="clover.excludes" />
+ </fileset>
+ </current>
+ </clover-report>
+ </target>
+
+ <target name="with.clover">
+ <clover-setup fullyQualifyJavaLang="true" instrumentationLevel="${clover.instrument.level}">
+ </clover-setup>
+ <property name="compile.scala" value="false" /> <!-- currently doesn't work with scala -->
+ <property name="test.maxmemory" value="32g"/> <!-- clover requires lots of memory -->
+ <echo message="Clover instrument level: ${clover.instrument.level}" />
+ </target>
+
<target name="test.init.compile">
<mkdir dir="${java.test.classes}"/>
<mkdir dir="${scala.test.classes}"/>
@@ -1215,6 +1230,7 @@
<echo message="Test Classpath: ${test.classpath.display.string}" />
<echo message="" />
<echo message="Sting: Running @{testtype} test cases!"/>
+ <echo message="Test Memory : ${test.maxmemory}" />
<!-- no test is allowed to run for more than 10 hours -->
<taskdef resource="testngtasks" classpath="${testng.jar}"/>
@@ -1228,6 +1244,7 @@
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
<jvmarg value="-Xmx${test.maxmemory}" />
<jvmarg value="-ea" />
+ <jvmarg value="-Dclover.pertest.coverage=diff" />
<jvmarg value="-Djava.awt.headless=true" />
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
@@ -1270,6 +1287,7 @@
<target name="test.init">
<property name="testng.classpath" value="testng.default.classpath" />
+ <property name="test.maxmemory" value="${test.default.maxmemory}"/>
</target>
<target name="init.testgatkjar">
@@ -1315,13 +1333,8 @@
<!-- Order of the dependencies is significant in the *.release.tests targets -->
<target name="gatkfull.binary.release.tests" depends="init.usecontracts,package.gatk.full,init.testgatkjar,unittest,integrationtest" />
- <target name="gatklite.binary.release.tests" depends="init.usecontracts,package.gatk.lite,init.testgatkjar,unittest,integrationtest" />
-
<target name="queuefull.binary.release.tests" depends="init.usecontracts,package.queue.full,init.testqueuejar,pipelinetest" />
- <target name="queuelite.binary.release.tests" depends="init.usecontracts,package.queue.lite,init.testqueuejar,pipelinetest" />
-
-
<!-- Our four different test types: UnitTest, IntegrationTest, LargeScaleTest, PipelineTest -->
<target name="unittest" depends="test.compile,test.init" description="Run unit tests">
<condition property="ttype" value="*UnitTest" else="${single}">
@@ -1382,6 +1395,7 @@
<!-- Fast test target that cuts major corners for speed. Requires that a full build has been done first. Java-only, single test class only -->
<!-- Usage: ant fasttest -Dsingle=TestClass -->
<target name="fasttest" depends="init.javaonly,init">
+ <property name="test.maxmemory" value="${test.default.maxmemory}"/>
<condition property="not.clean">
<and>
<available file="${build.dir}" />
View
9 ivy.xml
@@ -35,6 +35,9 @@
<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
+ <!-- Variant -->
+ <dependency org="org.broadinstitute" name="variant" rev="latest.integration"/>
+
<dependency org="log4j" name="log4j" rev="1.2.15"/>
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
<dependency org="colt" name="colt" rev="1.2.0"/>
@@ -61,6 +64,7 @@
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
<dependency org="commons-io" name="commons-io" rev="2.1"/>
+ <dependency org="commons-collections" name="commons-collections" rev="3.2.1"/>
<dependency org="org.apache.commons" name="commons-math" rev="2.2"/>
<!-- Lucene core utilities -->
@@ -80,9 +84,10 @@
<dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
<!-- testing and evaluation dependencies -->
- <dependency org="org.testng" name="testng" rev="5.14.1"/>
+ <dependency org="org.testng" name="testng" rev="6.8"/>
<dependency org="org.uncommons" name="reportng" rev="1.1.2"/>
- <dependency org="com.google.code.caliper" name="caliper" rev="1.0-SNAPSHOT"/>
+ <dependency org="com.google.caliper" name="caliper" rev="0.5-rc1"/>
+ <dependency org="com.google.inject" name="guice" rev="3.0"/>
<!-- Contracts for Java and dependencies -->
<dependency org="com.google.code.cofoja" name="cofoja" rev="1.0-r139"/>
View
43 licensing/private_license.txt
@@ -0,0 +1,43 @@
+ By downloading the PROGRAM you agree to the following terms of use:
+
+ BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+
+ This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+
+ WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+ WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+ NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+
+ 1. DEFINITIONS
+ 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
+
+ 2. LICENSE
+ 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+ The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+ 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+ 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+
+ 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+ LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+ Copyright 2012 Broad Institute, Inc.
+ Notice of attribution: The GATK2 program was made available through the generosity of the Medical and Population Genetics program at the Broad Institute, Inc.
+ LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
+
+ 4. INDEMNIFICATION
+ LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys' fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+
+ 5. NO REPRESENTATIONS OR WARRANTIES
+ THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+ IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+
+ 6. ASSIGNMENT
+ This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+
+ 7. MISCELLANEOUS
+ 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+ 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+ 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+ 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+ 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+ 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+ 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
View
43 licensing/protected_license.txt
@@ -0,0 +1,43 @@
+ By downloading the PROGRAM you agree to the following terms of use:
+
+ BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+
+ This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+
+ WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+ WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+ NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+
+ 1. DEFINITIONS
+ 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
+
+ 2. LICENSE
+ 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+ The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+ 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+ 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+
+ 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+ LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+ Copyright 2012 Broad Institute, Inc.
+ Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+ LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
+
+ 4. INDEMNIFICATION
+ LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+
+ 5. NO REPRESENTATIONS OR WARRANTIES
+ THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+ IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+
+ 6. ASSIGNMENT
+ This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+
+ 7. MISCELLANEOUS
+ 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+ 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+ 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+ 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+ 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+ 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+ 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
View
22 licensing/public_license.txt
@@ -0,0 +1,22 @@
+Copyright (c) 2012 The Broad Institute
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+THE USE OR OTHER DEALINGS IN THE SOFTWARE.
View
34 protected/java/src/org/broadinstitute/sting/gatk/DummyProtectedClass.java
@@ -1,34 +0,0 @@
-package org.broadinstitute.sting.gatk;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
-
-public class DummyProtectedClass implements ProtectedPackageSource { // marker type: declares no fields or methods of its own
-
- // THIS CLASS IS USED JUST SO THAT WE CAN TEST WHETHER WE ARE USING THE LITE OR FULL VERSION OF THE GATK (presumably by checking whether this class can be found at runtime -- TODO confirm against the code that looks up ProtectedPackageSource implementations)
- // **** DO NOT REMOVE! ****
-}
View
184 protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -0,0 +1,184 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as stated above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.arguments;
+
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
+import org.broadinstitute.sting.utils.collections.DefaultHashMap;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * A collection of arguments that are common to the various callers.
+ * This is pulled out so that every caller isn't exposed to the arguments from every other caller.
+ *
+ * User: rpoplin
+ * Date: 8/20/12
+ */
+public class StandardCallerArgumentCollection {
+    /**
+     * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
+     * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
+     */
+    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
+    public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
+
+    @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
+    public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
+
+    @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
+    public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+
+    /**
+     * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
+     * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
+     * is the default).
+     */
+    @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
+    public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
+
+    /**
+     * This argument allows you to emit low quality calls as filtered records.
+     */
+    @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
+    public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
+
+    /**
+     * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provided in this rod binding
+     */
+    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
+    public RodBinding<VariantContext> alleles;
+
+    /**
+     * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
+     * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
+     * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
+     * that you not play around with this parameter.
+     *
+     * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
+     */
+    @Advanced
+    @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
+    public int MAX_ALTERNATE_ALLELES = 6;
+
+    /**
+     * If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
+     * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
+     * will try to remove (N * contamination fraction) bases for each alternate allele.
+     */
+    @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
+    public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
+    /** Value of CONTAMINATION_FRACTION when the argument is not supplied. */
+    public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
+
+    /**
+     * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
+     * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION
+     **/
+    @Advanced
+    @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"<SampleID><TAB><Contamination>\" (Contamination is double) per line; No header.", required = false)
+    public File CONTAMINATION_FRACTION_FILE = null;
+
+    /**
+     * Returns the per-sample contamination map, first refreshing its default value from the current CONTAMINATION_FRACTION.
+     *
+     * @return an unmodifiable view (not a snapshot) of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
+     */
+    public Map<String,Double> getSampleContamination(){
+        //make sure that the default value is set up right
+        sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
+        return Collections.unmodifiableMap(sampleContamination);
+    }
+
+    /**
+     * Replaces the per-sample contamination entries with the entries of the given map. The entries are copied into
+     * this object's own map, and the default value is reset to the current CONTAMINATION_FRACTION.
+     *
+     * @param sampleContamination the sample-to-contamination entries to install
+     */
+    public void setSampleContamination(DefaultHashMap<String, Double> sampleContamination) {
+        this.sampleContamination.clear();
+        this.sampleContamination.putAll(sampleContamination);
+        this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
+    }
+
+    //Needs to be here because it uses CONTAMINATION_FRACTION
+    private DefaultHashMap<String,Double> sampleContamination = new DefaultHashMap<String,Double>(CONTAMINATION_FRACTION);
+
+    /**
+     * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
+     */
+    @Hidden
+    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
+    public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
+
+    @Hidden
+    @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
+    public PrintStream contaminationLog = null;
+
+    @Hidden
+    @Argument(shortName = "logExactCalls", doc="x", required=false)
+    public File exactCallsLog = null;
+
+    /** Creates a collection in which every argument holds its declared default value. */
+    public StandardCallerArgumentCollection() { }
+
+    /**
+     * Copy constructor: takes every argument value from the given collection.
+     *
+     * Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
+     *
+     * @param SCAC the collection whose argument values are copied
+     */
+    public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
+        this.alleles = SCAC.alleles;
+        this.GenotypingMode = SCAC.GenotypingMode;
+        this.heterozygosity = SCAC.heterozygosity;
+        this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
+        this.OutputMode = SCAC.OutputMode;
+        this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
+        this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
+        this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
+        this.CONTAMINATION_FRACTION_FILE = SCAC.CONTAMINATION_FRACTION_FILE;
+        this.contaminationLog = SCAC.contaminationLog;
+        this.exactCallsLog = SCAC.exactCallsLog;
+        // Copy the entries rather than sharing the source's mutable map: with a shared map, calling
+        // setSampleContamination() or getSampleContamination() on one collection would clear or
+        // re-default the map seen through the other.
+        // NOTE(review): a caller that populated the source *after* copying and relied on the copy
+        // seeing those entries must now call setSampleContamination() on the copy as well.
+        this.sampleContamination = new DefaultHashMap<String,Double>(SCAC.CONTAMINATION_FRACTION);
+        this.sampleContamination.putAll(SCAC.sampleContamination);
+        this.AFmodel = SCAC.AFmodel;
+    }
+}
View
296 protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
@@ -1,296 +0,0 @@
-/*
- * Copyright (c) 2010.
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.downsampling;
-
-import net.sf.samtools.SAMReadGroupRecord;
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.pileup.*;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.variantcontext.Allele;
-
-import java.io.PrintStream;
-import java.util.*;
-
-public class AlleleBiasedDownsamplingUtils {
-
-    /**
-     * Computes an allele biased version of the given pileup
-     *
-     * Elements are grouped by the base they show (four groups, indexed by BaseUtils.simpleBaseToBaseIndex),
-     * target counts per group are chosen by runSmartDownsampling, and each over-represented group is thinned
-     * with downsampleElements. Reduced reads are always kept and are not counted in the per-base groups.
-     *
-     * NOTE(review): elements whose base has no simple base index (index -1) are placed in no group and are
-     * therefore absent from the result unless they are reduced reads -- confirm this is intended.
-     * NOTE(review): kept elements are ordered (and de-duplicated) by alignment start, then read name, so two
-     * elements sharing both values collapse into one -- confirm this cannot happen for a single pileup.
-     *
-     * @param pileup                    the original pileup
-     * @param downsamplingFraction      the fraction of total reads to remove per allele
-     * @param log                       logging output
-     * @return allele biased pileup; the input pileup itself when the fraction is <= 0, an empty pileup when it is >= 1
-     */
-    public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
-        // special case removal of all or no reads
-        if ( downsamplingFraction <= 0.0 )
-            return pileup;
-        if ( downsamplingFraction >= 1.0 )
-            return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
-
-        final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
-        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i] = new ArrayList<PileupElement>();
-
-        // keep all of the reduced reads
-        final ArrayList<PileupElement> reducedReadPileups = new ArrayList<PileupElement>();
-
-        // start by stratifying the reads by the alleles they represent at this position
-        for( final PileupElement pe : pileup ) {
-            // we do not want to remove a reduced read, so it must not be counted among the removable
-            // elements of its allele either: otherwise part of the removal budget is spent on reads
-            // that are unconditionally kept below
-            if ( pe.getRead().isReducedRead() ) {
-                reducedReadPileups.add(pe);
-                continue;
-            }
-
-            final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
-            if ( baseIndex != -1 )
-                alleleStratifiedElements[baseIndex].add(pe);
-        }
-
-        // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
-        int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
-        final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
-            @Override
-            public int compare(PileupElement element1, PileupElement element2) {
-                final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
-                return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
-            }
-        });
-        elementsToKeep.addAll(reducedReadPileups);
-
-        // make a listing of allele counts
-        final int[] alleleCounts = new int[4];
-        for ( int i = 0; i < 4; i++ )
-            alleleCounts[i] = alleleStratifiedElements[i].size();
-
-        // do smart down-sampling
-        final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
-
-        for ( int i = 0; i < 4; i++ ) {
-            final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
-            // if we don't need to remove any reads, keep them all
-            if ( alleleList.size() <= targetAlleleCounts[i] )
-                elementsToKeep.addAll(alleleList);
-            else
-                elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
-        }
-
-        // clean up pointers so memory can be garbage collected if needed
-        for ( int i = 0; i < 4; i++ )
-            alleleStratifiedElements[i].clear();
-
-        return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
-    }
-
- /**
-  * Scores how far a set of allele counts is from a clean genotype configuration; lower is better.
-  *
-  * The score is the smaller of two distances:
-  *   - from the het case, where the two largest counts are equal and nothing else is present
-  *   - from the hom case, where every count except the largest is zero
-  *
-  * @param alleleCounts the count observed for each allele; the array is not modified
-  * @return the score, or 0 when fewer than two counts are given
-  */
- private static int scoreAlleleCounts(final int[] alleleCounts) {
-     final int numAlleles = alleleCounts.length;
-     if ( numAlleles < 2 )
-         return 0;
-
-     // sort a copy (ascending) so the caller's array keeps its per-allele ordering
-     final int[] sortedCounts = Arrays.copyOf(alleleCounts, numAlleles);
-     Arrays.sort(sortedCounts);
-
-     final int best = sortedCounts[numAlleles - 1];
-     final int runnerUp = sortedCounts[numAlleles - 2];
-
-     // total of everything below the two most common alleles
-     int others = 0;
-     for ( int idx = numAlleles - 3; idx >= 0; idx-- )
-         others += sortedCounts[idx];
-
-     final int distanceFromHet = best - runnerUp + others;
-     final int distanceFromHom = Math.abs(runnerUp + others);
-     return Math.min(distanceFromHet, distanceFromHom);
- }
-
- /**
-  * Chooses target allele counts for down-sampling. Each candidate removes reads either from a single
-  * allele or from a pair of alleles; the candidate with the lowest score from scoreAlleleCounts wins.
-  *
-  * @param alleleCounts      the count observed for each allele; the array is not modified
-  * @param numReadsToRemove  the total number of reads to remove
-  * @return the counts of the best-scoring candidate, or the alleleCounts array itself when no candidate
-  *         scores strictly better than the original counts
-  */
- protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
-     // when two different alleles share the removal, each gives up half (integer division, so rounded down)
-     final int halfToRemove = numReadsToRemove / 2;
-
-     int[] bestCounts = alleleCounts;
-     int bestScore = scoreAlleleCounts(alleleCounts);
-
-     for ( int first = 0; first < alleleCounts.length; first++ ) {
-         for ( int second = first; second < alleleCounts.length; second++ ) {
-             final int[] candidate = alleleCounts.clone();
-
-             if ( first != second ) {
-                 candidate[first] = Math.max(0, candidate[first] - halfToRemove);
-                 candidate[second] = Math.max(0, candidate[second] - halfToRemove);
-             } else {
-                 // single allele: remove the full amount so nothing is lost to the division by 2
-                 candidate[first] = Math.max(0, candidate[first] - numReadsToRemove);
-             }
-
-             final int candidateScore = scoreAlleleCounts(candidate);
-
-             // strict comparison: on a tie the earlier candidate (or the original counts) is kept
-             if ( candidateScore < bestScore ) {
-                 bestScore = candidateScore;
-                 bestCounts = candidate;
-             }
-         }
-     }
-
-     return bestCounts;
- }
-
- /**
-  * Drops the requested number of pileup elements, at indices chosen by
-  * MathUtils.sampleIndicesWithoutReplacement, and returns the remaining ones in their original order.
-  * The read of every dropped element is handed to the logging helpers.
-  *
-  * @param elements             original list of pileup elements
-  * @param numElementsToRemove  the number of elements to remove
-  * @param log                  logging output
-  * @return the list of pileup elements TO KEEP; this is the elements list itself when nothing is removed
-  */
- private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
-     // nothing to drop: hand back the input untouched
-     if ( numElementsToRemove == 0 )
-         return elements;
-
-     final int numElements = elements.size();
-
-     // everything is dropped: log it all and keep nothing
-     if ( numElementsToRemove == numElements ) {
-         logAllElements(elements, log);
-         return new ArrayList<PileupElement>(0);
-     }
-
-     // flag the sampled indices for removal
-     final BitSet dropped = new BitSet(numElements);
-     for ( Integer index : MathUtils.sampleIndicesWithoutReplacement(numElements, numElementsToRemove) )
-         dropped.set(index);
-
-     // one pass in the original order: log what goes, collect what stays
-     final ArrayList<PileupElement> kept = new ArrayList<PileupElement>(numElements - numElementsToRemove);
-     for ( int position = 0; position < numElements; position++ ) {
-         final PileupElement element = elements.get(position);
-         if ( dropped.get(position) )
-             logRead(element.getRead(), log);
-         else
-             kept.add(element);
-     }
-
-     return kept;
- }
-
- /**
-  * Selects the reads to remove for an allele biased down-sampling.
-  *
-  * The number of reads to remove is the given fraction of all reads in the map (rounded down).
-  * The no-call bin counts toward that total but no reads are ever taken from it.
-  *
-  * @param alleleReadMap         original list of reads per allele
-  * @param downsamplingFraction  the fraction of the total number of reads to remove
-  * @param log                   logging output
-  * @return list of reads TO REMOVE from allele biased down-sampling
-  */
- public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
-     // the removal budget is based on every read in the map
-     int readCount = 0;
-     for ( final List<GATKSAMRecord> bin : alleleReadMap.values() )
-         readCount += bin.size();
-     final int numReadsToRemove = (int)(readCount * downsamplingFraction); // floor
-
-     // fix an ordering of the alleles, ignoring the no-call bin
-     final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
-     alleles.remove(Allele.NO_CALL);
-     final int numAlleles = alleles.size();
-
-     // count the reads supporting each allele, in that ordering
-     final int[] alleleCounts = new int[numAlleles];
-     int slot = 0;
-     for ( final Allele allele : alleles )
-         alleleCounts[slot++] = alleleReadMap.get(allele).size();
-
-     // do smart down-sampling
-     final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
-
-     final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
-     for ( int i = 0; i < numAlleles; i++ ) {
-         final List<GATKSAMRecord> bin = alleleReadMap.get(alleles.get(i));
-
-         // bins already at or below their target are left alone
-         if ( bin.size() <= targetAlleleCounts[i] )
-             continue;
-
-         readsToRemove.addAll(downsampleReads(bin, bin.size() - targetAlleleCounts[i], log));
-     }
-
-     return readsToRemove;
- }
-
- /**
-  * Picks the requested number of reads, at indices chosen by MathUtils.sampleIndicesWithoutReplacement,
-  * and returns them in their original order. Every picked read is handed to the logging helpers.
-  *
-  * @param reads                original list of reads
-  * @param numElementsToRemove  the number of reads to remove
-  * @param log                  logging output
-  * @return the list of reads TO REMOVE; this is the reads list itself when every read is removed
-  */
- private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
-     final ArrayList<GATKSAMRecord> removed = new ArrayList<GATKSAMRecord>(numElementsToRemove);
-
-     // nothing to remove
-     if ( numElementsToRemove == 0 )
-         return removed;
-
-     final int numReads = reads.size();
-
-     // everything is removed: log it all and hand back the input list
-     if ( numElementsToRemove == numReads ) {
-         logAllReads(reads, log);
-         return reads;
-     }
-
-     // flag the sampled indices for removal
-     final BitSet dropped = new BitSet(numReads);
-     for ( Integer index : MathUtils.sampleIndicesWithoutReplacement(numReads, numElementsToRemove) )
-         dropped.set(index);
-
-     // visit the flagged indices in ascending order, staying within the bounds of the list
-     for ( int position = dropped.nextSetBit(0); position >= 0 && position < numReads; position = dropped.nextSetBit(position + 1) ) {
-         final GATKSAMRecord read = reads.get(position);
-         removed.add(read);
-         logRead(read, log);
-     }
-
-     return removed;
- }
-
- /**
-  * Logs the read behind every given pileup element; does nothing when no log stream is supplied.
-  *
-  * @param elements the pileup elements whose reads should be logged
-  * @param log      logging output, may be null
-  */
- private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
-     if ( log == null )
-         return;
-
-     for ( final PileupElement element : elements )
-         logRead(element.getRead(), log);
- }
-
- /**
-  * Logs every given read; does nothing when no log stream is supplied.
-  *
-  * @param reads the reads to log
-  * @param log   logging output, may be null
-  */
- private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
-     if ( log == null )
-         return;
-
-     for ( final GATKSAMRecord current : reads )
-         logRead(current, log);
- }
-
- /**
-  * Writes one tab-separated line for the read: read name, sample, library and platform unit.
-  * Does nothing when no log stream is supplied.
-  *
-  * @param read the read to log
-  * @param log  logging output, may be null
-  */
- private static void logRead(final SAMRecord read, final PrintStream log) {
-     if ( log == null )
-         return;
-
-     // NOTE(review): the read group is dereferenced without a null check - presumably every read carries one; confirm
-     final SAMReadGroupRecord group = read.getReadGroup();
-     final String line = String.format("%s\t%s\t%s\t%s", read.getReadName(), group.getSample(), group.getLibrary(), group.getPlatformUnit());
-     log.println(line);
- }
-}
View
102 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
@@ -0,0 +1,102 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.Allele;
+
+import java.util.*;
+
+
+/**
+ * The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
+ * Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ */
+public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
+    public List<String> getKeyNames() {
+        return Arrays.asList("BaseQRankSum");
+    }
+
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"));
+    }
+
+    /**
+     * Appends one value per informative read to refQuals or altQuals.
+     *
+     * With per-read allele likelihoods, each read contributes -10.0 times the likelihood of its most likely
+     * allele; without them, each usable pileup base contributes its base quality.
+     *
+     * @param allAlleles           the alleles at this site; the first entry is compared against the reference base
+     * @param refLoc               not used by this implementation
+     * @param pileup               the pileup to read bases from when alleleLikelihoodMap is null
+     * @param alleleLikelihoodMap  per-read allele likelihoods, or null to use the pileup
+     * @param refQuals             receives the values for reads supporting the reference allele
+     * @param altQuals             receives the values for reads supporting one of the other alleles in allAlleles
+     */
+    protected void fillQualsFromPileup(final List<Allele> allAlleles, final int refLoc,
+                                       final ReadBackedPileup pileup,
+                                       final PerReadAlleleLikelihoodMap alleleLikelihoodMap,
+                                       final List<Double> refQuals, final List<Double> altQuals) {
+
+        if ( alleleLikelihoodMap != null ) {
+            for ( Map<Allele,Double> likelihoods : alleleLikelihoodMap.getLikelihoodMapValues() ) {
+                final Allele mostLikely = PerReadAlleleLikelihoodMap.getMostLikelyAllele(likelihoods);
+                if ( mostLikely.isNoCall() )
+                    continue; // read is non-informative
+
+                // pick the list this read belongs to; reads supporting an allele not at this site are skipped
+                final List<Double> destination;
+                if ( mostLikely.isReference() )
+                    destination = refQuals;
+                else if ( allAlleles.contains(mostLikely) )
+                    destination = altQuals;
+                else
+                    continue;
+
+                destination.add(-10.0*(double)likelihoods.get(mostLikely));
+            }
+            return;
+        }
+
+        // use fast SNP-based version if we don't have per-read allele likelihoods
+        for ( final PileupElement element : pileup ) {
+            if ( !isUsableBase(element) )
+                continue;
+
+            if ( allAlleles.get(0).equals(Allele.create(element.getBase(),true)) )
+                refQuals.add((double)element.getQual());
+            else if ( allAlleles.contains(Allele.create(element.getBase())) )
+                altQuals.add((double)element.getQual());
+        }
+    }
+}
View
98 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java
@@ -0,0 +1,98 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFHeaderLine;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.variantcontext.VariantContextUtils;
+
+import java.util.*;
+
+
+/**
+ * Allele count in genotypes, for each ALT allele, in the same order as listed;
+ * allele Frequency, for each ALT allele, in the same order as listed; total number
+ * of alleles in called genotypes.
+ */
+public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+
+    // starts out empty; initialize() replaces it with the founder ids from the walker's sample database
+    private Set<String> founderIds = new HashSet<String>();
+
+    /**
+     * Computes the chromosome count annotations for the given variant context.
+     *
+     * @return the annotations produced by VariantContextUtils.calculateChromosomeCounts,
+     *         or null when the variant context has no genotypes
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
+        if ( vc.hasGenotypes() )
+            return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true, founderIds);
+
+        return null;
+    }
+
+    /**
+     * Picks up the founder ids from the walker's sample database.
+     * The walker is cast to Walker, so it must be an instance of that class.
+     */
+    public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ){
+        //If families were given, get the founders ids
+        final Walker sampleSource = (Walker)walker;
+        founderIds = sampleSource.getSampleDB().getFounderIds();
+    }
+
+    public List<String> getKeyNames() {
+        return Arrays.asList(ChromosomeCountConstants.keyNames);
+    }
+
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(ChromosomeCountConstants.descriptions);
+    }
+}
View
97 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java
@@ -0,0 +1,97 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
+
+import java.util.*;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 6/28/12
+ */
+
+/**
+ * The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele)
+ * Note that the clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
+ */
+public class ClippingRankSumTest extends RankSumTest {
+
+    /** @return the single INFO key emitted by this annotation */
+    public List<String> getKeyNames() {
+        return Arrays.asList("ClippingRankSum");
+    }
+
+    /** @return the single INFO header line (one Float value) describing this annotation */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        final VCFInfoHeaderLine headerLine = new VCFInfoHeaderLine(
+                "ClippingRankSum",
+                1,
+                VCFHeaderLineType.Float,
+                "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases");
+        return Arrays.asList(headerLine);
+    }
+
+    /**
+     * Collects, per read, the number of hard-clipped bases into the ref or alt list according to
+     * the read's most likely allele.
+     *
+     * Nothing is added when a pileup is supplied or when no likelihood map is supplied.
+     *
+     * @param allAlleles    alleles a non-reference read must match to be counted as alt
+     * @param refLoc        not read by this implementation
+     * @param pileup        must be null for any values to be collected
+     * @param likelihoodMap per-read allele likelihoods; must be non-null for any values to be collected
+     * @param refQuals      receives one value per read whose most likely allele is the reference
+     * @param altQuals      receives one value per read whose most likely allele is in allAlleles
+     */
+    protected void fillQualsFromPileup(final List<Allele> allAlleles,
+                                       final int refLoc,
+                                       final ReadBackedPileup pileup,
+                                       final PerReadAlleleLikelihoodMap likelihoodMap, final List<Double> refQuals, final List<Double> altQuals) {
+        // todo - only support non-pileup case for now, e.g. active-region based version
+        if (pileup == null && likelihoodMap != null) {
+            for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> entry : likelihoodMap.getLikelihoodReadMap().entrySet()) {
+                final Allele mostLikely = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
+
+                // read is non-informative
+                if (mostLikely.isNoCall())
+                    continue;
+
+                // choose the destination list first; reads matching neither case are skipped
+                final List<Double> destination;
+                if (mostLikely.isReference())
+                    destination = refQuals;
+                else if (allAlleles.contains(mostLikely))
+                    destination = altQuals;
+                else
+                    continue;
+
+                destination.add((double)AlignmentUtils.getNumHardClippedBases(entry.getKey()));
+            }
+        }
+    }
+
+}
View
119 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java
@@ -0,0 +1,119 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Total (unfiltered) depth over all samples.
+ *
+ * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's
+ * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
+ * over all samples. Note, though, that the DP is affected by downsampling (-dcov), so the maximum value one can
+ * obtain for N samples with -dcov D is N * D.
+ */
+public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+
+    /**
+     * Computes the total depth and returns it under the depth INFO key.
+     *
+     * When {@code stratifiedContexts} is non-null it takes precedence: the depth is the sum of each
+     * sample's base-pileup depth of coverage. Otherwise, when {@code perReadAlleleLikelihoodMap} is
+     * non-null, each read in each map contributes 1, or its reduced count at the variant start if
+     * it is a reduced read.
+     *
+     * @param stratifiedContexts         per-sample alignment contexts, or null
+     * @param vc                         the variant being annotated; its start is only read for reduced reads
+     * @param perReadAlleleLikelihoodMap per-sample read likelihood maps, or null
+     * @return a single-entry map from the depth key to the depth rendered as a decimal string, or
+     *         null when both inputs are null or the input that is used is empty
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
+
+        int depth = 0;
+        if (stratifiedContexts != null) {
+            if ( stratifiedContexts.isEmpty() )
+                return null;
+
+            for ( final AlignmentContext context : stratifiedContexts.values() )
+                depth += context.getBasePileup().depthOfCoverage();
+        }
+        else if (perReadAlleleLikelihoodMap != null) {
+            if ( perReadAlleleLikelihoodMap.isEmpty() )
+                return null;
+
+            for ( final PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
+                for ( final GATKSAMRecord read : maps.getLikelihoodReadMap().keySet() ) {
+                    // a reduced read contributes its reduced count at the variant start; any other read counts once
+                    depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
+                }
+            }
+        }
+        else
+            return null;
+
+        final Map<String, Object> map = new HashMap<String, Object>();
+        // Integer.toString always yields ASCII digits, whereas String.format("%d", ...) localizes
+        // digits according to the JVM's default locale
+        map.put(getKeyNames().get(0), Integer.toString(depth));
+        return map;
+    }
+
+    /** @return a one-element list holding {@code VCFConstants.DEPTH_KEY} */
+    public List<String> getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
+
+    /** @return the standard INFO header line registered for the depth key */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
+    }
+}
View
162 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
@@ -0,0 +1,162 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * The depth of coverage of each VCF allele in this sample.
+ *
+ * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
+ * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
+ * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
+ * REF and ALT fields) are the unfiltered counts of all reads that carried with them the
+ * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
+ * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
+ * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
+ * the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
+ * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
+ * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
+ * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
+ * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
+ * many non-informative reads.
+ * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation,
+ * <b>one should not base assumptions about the underlying genotype based on it</b>;
+ * instead, the genotype likelihoods (PLs) are what determine the genotype calls.
+ */
+public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
+
+ public void annotate(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final AlignmentContext stratifiedContext,
+ final VariantContext vc,
+ final Genotype g,
+ final GenotypeBuilder gb,
+ final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
+ if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
+ return;
+
+ if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
+ annotateWithLikelihoods(alleleLikelihoodMap, vc, gb);
+ else if ( stratifiedContext != null && (vc.isSNP()))
+ annotateWithPileup(stratifiedContext, vc, gb);
+ }
+
+ private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) {
+
+ HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
+ for ( Allele allele : vc.getAlleles() )
+ alleleCounts.put(allele.getBases()[0], 0);
+
+ ReadBackedPileup pileup = stratifiedContext.getBasePileup();
+ for ( PileupElement p : pileup ) {
+ if ( alleleCounts.containsKey(p.getBase()) )
+ alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
+ }
+
+ // we need to add counts in the correct order
+ int[] counts = new int[alleleCounts.size()];
+ counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
+ for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
+ counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
+
+ gb.AD(counts);
+ }
+
+ private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) {
+ final HashMap<Allele, Integer> alleleCounts = new HashMap<Allele, Integer>();
+
+ for ( final Allele allele : vc.getAlleles() ) {
+ alleleCounts.put(allele, 0);
+ }
+ for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
+ final GATKSAMRecord read = el.getKey();
+ final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+ if (a.isNoCall())
+ continue; // read is non-informative
+ if (!vc.getAlleles().contains(a))
+ continue; // sanity check - shouldn't be needed
+ alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
+ }
+ final int[] counts = new int[alleleCounts.size()];
+ counts[0] = alleleCounts.get(vc.getReference());
+ for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
+ counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) );
+
+ gb.AD(counts);
+ }
+
+ public List<String> getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
+
+ public List<VCFFormatHeaderLine> getDescriptions() {
+ return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
+ }
+}
View
327 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
@@ -0,0 +1,327 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import cern.jet.math.Arithmetic;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+
+/**
+ * Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
+ * being seen on only the forward or only the reverse strand) in the reads. More bias is
+ * indicative of false positive calls. Note that the fisher strand test may not be
+ * calculated for certain complex indel cases or for multi-allelic sites.
+ */
+public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+    /** INFO key under which the annotation is written. */
+    private static final String FS = "FS";
+    /** Floor applied to the p-value before phred-scaling, so that a p-value of 0 cannot yield an infinite score. */
+    private static final double MIN_PVALUE = 1E-320;
+    /** Minimum base quality and mapping quality a pileup element needs to enter the filtered SNP table. */
+    private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
+
+    /**
+     * Computes the FS annotation for a variant site.
+     *
+     * For SNPs with pileup data, two strand tables are built (one unfiltered, one restricted to
+     * elements meeting {@code MIN_QUAL_FOR_FILTERED_TEST}) and the larger p-value is reported.
+     * Otherwise a single table is built from the per-read allele likelihoods, if available.
+     *
+     * @param tracker                              not used by this annotation
+     * @param walker                               not used by this annotation
+     * @param ref                                  not used by this annotation
+     * @param stratifiedContexts                   per-sample alignment contexts; may be null
+     * @param vc                                   the variant being annotated
+     * @param stratifiedPerReadAlleleLikelihoodMap per-sample read likelihoods; may be null
+     * @return a single-entry map from FS to the phred-scaled p-value, or null when the site is
+     *         not variant or no usable evidence source was supplied
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
+        if ( !vc.isVariant() )
+            return null;
+
+        if ( vc.isSNP() && stratifiedContexts != null ) {
+            // both tables compare the same pair of alleles, so look them up once
+            final Allele refAllele = vc.getReference();
+            final Allele altAllele = vc.getAltAlleleWithHighestAlleleCount();
+            final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, refAllele, altAllele, -1);
+            final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, refAllele, altAllele, MIN_QUAL_FOR_FILTERED_TEST);
+            return pValueForBestTable(tableFiltering, tableNoFiltering);
+        }
+        else if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
+            // either SNP with no alignment context, or indels: per-read likelihood map needed
+            final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
+            return pValueForBestTable(table, null);
+        }
+        else
+            // for non-snp variants, we need per-read likelihoods.
+            // for snps, we can get same result from simple pileup
+            return null;
+    }
+
+    /**
+     * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
+     *
+     * @param table1 a contingency table, may be null
+     * @param table2 a contingency table, may be null
+     * @return annotation result for FS given tables, or null when both tables are null
+     */
+    private Map<String, Object> pValueForBestTable(final int[][] table1, final int[][] table2) {
+        if ( table2 == null )
+            return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
+        else if ( table1 == null )
+            return annotationForOneTable(pValueForContingencyTable(table2));
+        else { // take the one with the best (i.e., least significant pvalue)
+            final double pvalue1 = pValueForContingencyTable(table1);
+            final double pvalue2 = pValueForContingencyTable(table2);
+            return annotationForOneTable(Math.max(pvalue1, pvalue2));
+        }
+    }
+
+    /**
+     * Returns an annotation result given a pValue
+     *
+     * @param pValue the p-value to report; values below {@code MIN_PVALUE} are raised to it
+     * @return an immutable single-entry map from FS -> phred-scaled pValue with three decimals
+     */
+    private Map<String, Object> annotationForOneTable(final double pValue) {
+        // Locale.US pins the decimal separator to '.'; with the default locale a comma-decimal
+        // environment would render e.g. "3,010", which is not a valid single Float in a VCF INFO field.
+        // The Math.max floor prevents INFINITYs.
+        final Object value = String.format(Locale.US, "%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE)));
+        return Collections.singletonMap(FS, value);
+    }
+
+    /**
+     * @return the single INFO key written by this annotation
+     */
+    public List<String> getKeyNames() {
+        return Arrays.asList(FS);
+    }
+
+    /**
+     * @return the INFO header line describing FS (one Float value)
+     */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(
+                new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
+    }
+
+    /**
+     * Two-sided Fisher's exact test on a 2x2 table: sums the probability of the observed table
+     * and of every other table with the same margins that is no more probable than it.
+     *
+     * @param originalTable the observed 2x2 table; it is copied, never modified
+     * @return the p-value, capped at 1.0
+     */
+    private double pValueForContingencyTable(final int[][] originalTable) {
+        int[][] table = copyContingencyTable(originalTable);
+
+        // probability of the observed table; it is the inclusion threshold for all other tables
+        final double pCutoff = computePValue(table);
+
+        double pValue = pCutoff;
+
+        // walk the tables reachable in one direction until a cell would go negative...
+        while ( rotateTable(table) ) {
+            final double pValuePiece = computePValue(table);
+            if ( pValuePiece <= pCutoff )
+                pValue += pValuePiece;
+        }
+
+        // ...then restart from the observed table and walk the other direction
+        table = copyContingencyTable(originalTable);
+        while ( unrotateTable(table) ) {
+            final double pValuePiece = computePValue(table);
+            if ( pValuePiece <= pCutoff )
+                pValue += pValuePiece;
+        }
+
+        // min is necessary as numerical precision can result in pValue being slightly greater than 1.0
+        return Math.min(pValue, 1.0);
+    }
+
+    /**
+     * @param t a table with at least 2x2 entries
+     * @return a fresh 2x2 array holding the top-left 2x2 cells of {@code t}
+     */
+    private static int[][] copyContingencyTable(final int[][] t) {
+        final int[][] c = new int[2][2];
+
+        for ( int i = 0; i < 2; i++ )
+            for ( int j = 0; j < 2; j++ )
+                c[i][j] = t[i][j];
+
+        return c;
+    }
+
+    /**
+     * Moves one count off the main diagonal onto the anti-diagonal, in place, keeping all
+     * row and column totals unchanged.
+     *
+     * @return true if the resulting table has no negative cell on the main diagonal
+     */
+    private static boolean rotateTable(final int[][] table) {
+        table[0][0] -= 1;
+        table[1][0] += 1;
+
+        table[0][1] += 1;
+        table[1][1] -= 1;
+
+        return (table[0][0] >= 0 && table[1][1] >= 0);
+    }
+
+    /**
+     * Inverse of {@link #rotateTable(int[][])}: moves one count off the anti-diagonal onto the
+     * main diagonal, in place, keeping all row and column totals unchanged.
+     *
+     * @return true if the resulting table has no negative cell on the anti-diagonal
+     */
+    private static boolean unrotateTable(final int[][] table) {
+        table[0][0] += 1;
+        table[1][0] -= 1;
+
+        table[0][1] -= 1;
+        table[1][1] += 1;
+
+        return (table[0][1] >= 0 && table[1][0] >= 0);
+    }
+
+    /**
+     * Probability of one specific 2x2 table given its margins:
+     * (product of the four marginal factorials) / (product of the four cell factorials times N!).
+     *
+     * @param table a 2x2 table with non-negative cells
+     * @return the probability of exactly this table
+     */
+    private static double computePValue(final int[][] table) {
+
+        final int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
+        final int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
+        final int N = colSums[0] + colSums[1];
+
+        // calculate in log space so we don't die with high numbers
+        // (the order of the terms is deliberate: floating-point addition is order-sensitive and
+        // the result feeds an exact <= comparison in pValueForContingencyTable)
+        final double logP = Arithmetic.logFactorial(colSums[0])
+                + Arithmetic.logFactorial(colSums[1])
+                + Arithmetic.logFactorial(rowSums[0])
+                + Arithmetic.logFactorial(rowSums[1])
+                - Arithmetic.logFactorial(table[0][0])
+                - Arithmetic.logFactorial(table[0][1])
+                - Arithmetic.logFactorial(table[1][0])
+                - Arithmetic.logFactorial(table[1][1])
+                - Arithmetic.logFactorial(N);
+        return Math.exp(logP);
+    }
+
+    /**
+     * @return the sum of all cells in the given row of the table
+     */
+    private static int sumRow(final int[][] table, final int row) {
+        int sum = 0;
+        for ( int c = 0; c < table[row].length; c++ ) {
+            sum += table[row][c];
+        }
+
+        return sum;
+    }
+
+    /**
+     * @return the sum of all cells in the given column of the table
+     */
+    private static int sumColumn(final int[][] table, final int column) {
+        int sum = 0;
+        for ( int r = 0; r < table.length; r++ ) {
+            sum += table[r][column];
+        }
+
+        return sum;
+    }
+
+    /**
+     * Allocate and fill a 2x2 strand contingency table from per-read allele likelihoods, assigning
+     * each read to its most likely allele. In the end, it'll look something like this:
+     *             fw      rc
+     *   allele1   #       #
+     *   allele2   #       #
+     * where allele1 is the reference and allele2 the alternate allele with the highest allele count.
+     *
+     * @return a 2x2 contingency table
+     */
+    private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
+        final Allele ref = vc.getReference();
+        final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
+        final int[][] table = new int[2][2];
+
+        for ( final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
+            for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> el : maps.getLikelihoodReadMap().entrySet() ) {
+                final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+                final GATKSAMRecord read = el.getKey();
+                final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
+                updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount);
+            }
+        }
+
+        return table;
+    }
+
+    /**
+     * Allocate and fill a 2x2 strand contingency table from the pileups. In the end, it'll look
+     * something like this:
+     *             fw      rc
+     *   allele1   #       #
+     *   allele2   #       #
+     *
+     * @param stratifiedContexts  per-sample alignment contexts
+     * @param ref                 allele counted in the first row
+     * @param alt                 allele counted in the second row
+     * @param minQScoreToConsider elements whose base quality or mapping quality is below this are skipped
+     * @return a 2x2 contingency table
+     */
+    private static int[][] getSNPContingencyTable(final Map<String, AlignmentContext> stratifiedContexts,
+                                                  final Allele ref,
+                                                  final Allele alt,
+                                                  final int minQScoreToConsider ) {
+        final int[][] table = new int[2][2];
+
+        for ( final AlignmentContext context : stratifiedContexts.values() ) {
+            for ( final PileupElement p : context.getBasePileup() ) {
+
+                if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
+                    continue;
+
+                if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
+                    continue;
+
+                updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount());
+            }
+        }
+
+        return table;
+    }
+
+    /**
+     * Adds {@code representativeCount} to the cell selected by the allele (row 0 = ref, row 1 = alt)
+     * and the read's strand (column 0 = forward, column 1 = reverse). Reads whose allele matches
+     * neither ref nor alt, and all reduced reads, leave the table unchanged.
+     */
+    private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) {
+        // ignore reduced reads because they are always on the forward strand!
+        // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test
+        if ( read.isReducedRead() )
+            return;
+
+        final boolean matchesRef = allele.equals(ref, true);
+        final boolean matchesAlt = allele.equals(alt, true);
+
+        if ( matchesRef || matchesAlt ) {
+
+            final boolean isFW = !read.getReadNegativeStrandFlag();
+
+            final int row = matchesRef ? 0 : 1;
+            final int column = isFW ? 0 : 1;
+
+            table[row][column] += representativeCount;
+        }
+    }
+}
View
110 protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
@@ -0,0 +1,110 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+
+/**
+ * The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site
+ */
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
+public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation {
+
+ /**
+  * Produces the GC-content annotation for the current site.
+  * <p>
+  * The value returned by {@code computeGCContent(ref)} is formatted with two decimal
+  * places and stored under the first (and only) key name of this annotation. Only
+  * {@code ref} is consulted; the remaining arguments are not used by this implementation.
+  *
+  * @param tracker                              unused
+  * @param walker                               unused
+  * @param ref                                  reference context whose bases are measured
+  * @param stratifiedContexts                   unused
+  * @param vc                                   unused
+  * @param stratifiedPerReadAlleleLikelihoodMap unused
+  * @return a single-entry map from the annotation key to the formatted GC content
+  */
+ public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                     final AnnotatorCompatible walker,
+                                     final ReferenceContext ref,
+                                     final Map<String, AlignmentContext> stratifiedContexts,
+                                     final VariantContext vc,
+                                     final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
+     final double content = computeGCContent(ref);
+     final Map<String, Object> map = new HashMap<String, Object>();
+     // Format with an explicit locale: the default-locale overload of String.format would
+     // emit a decimal comma (e.g. "0,45") on some JVM locales, producing an unparsable value.
+     map.put(getKeyNames().get(0), String.format(Locale.US, "%.2f", content));
+     return map;
+ }
+
+ public List<String> getKeyNames() { return Arrays.asList("GC"); }
+
+ public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content within 20 bp +/- the variant")); }
+
+ public boolean useZeroQualityReads() { return false; }
+
+ private static double computeGCContent(ReferenceContext ref) {
+ int gc = 0, at = 0;
+
+ for ( byte base : ref.getBases() ) {
+ int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
+ if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() )
+ gc++;
+ else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() )
+ at++;
+ else
+ ; // ignore