diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java
index 88ea9496a0..d45ddbef1a 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java
@@ -75,18 +75,16 @@ public class RecalibrationArgumentCollection implements Cloneable {
/**
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
- * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
- * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
- * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
+ * so it is critical that a database of known polymorphic sites (e.g. dbSNP) is given to the tool in order to mask out those sites.
*/
- @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
+ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false)
public List This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting each alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact. This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact. The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. Uninformative reads are not used in these calculations. Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. 
The AS_FisherStrand annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele, and does so separately for each alternate allele. The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls. See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test. This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele. Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
+ *
+ * The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. Uninformative reads are not used in these annotations. Uninformative reads are not used in this annotation. This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and each alternate allele. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported. This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and each alternate allele. To be clear, it does so separately for each alternate allele. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported. The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. 
A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele. The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. Uninformative reads are not used in these annotations. Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The AS_StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. It does so separately for each allele. The reported value is ln-scaled. Odds Ratios in the 2x2 contingency table below are $$ R = \frac{X[0][0] \cdot X[1][1]}{X[0][1] \cdot X[1][0]} $$ and its inverse: $$ \frac{1}{R} = \frac{X[0][1] \cdot X[1][0]}{X[0][0] \cdot X[1][1]} $$ The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where $$ refRatio = \frac{\min(X[0][0], X[0][1])}{\max(X[0][0], X[0][1])} $$ and $$ altRatio = \frac{\min(X[1][0], X[1][1])}{\max(X[1][0], X[1][1])} $$ ensures that the annotation value is large only. See the method document on statistical tests for a more detailed explanation of this statistical test.
+ * The name AS_StrandOddsRatio is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there. This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact. This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting any alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. 
Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact. The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. This annotation is a one-sided phred-scaled p-value using an exact test of the Hardy-Weinberg Equilibrium. The null hypothesis is that the number of heterozygotes follows the Hardy-Weinberg Equilibrium. The p-value is the probability of getting the same or more heterozygotes as was observed, given the null hypothesis. The implementation used is adapted from Wigginton JE, Cutler DJ, Abecasis GR. A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 2005;76(5):887-893. The p-value is calculated exactly by using the Levene-Haldane distribution. This implementation also uses a mid-p correction as described by Graffelman, J. & Moreno, V. (2013). The mid p-value in exact tests for Hardy-Weinberg equilibrium. Statistical Applications in Genetics and Molecular Biology, 12(4), pp. 433-448. Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The FisherStrand annotation is one of several methods that aims to evaluate whether there is strand bias in the data. 
It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.” Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The FisherStrand annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls. The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. The root mean square is equivalent to the mean of the mapping qualities plus the standard deviation of the mapping qualities. Uninformative reads are not used in this annotation. This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. 
That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported. This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported. The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele. The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test. The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. 
It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. The reported value is ln-scaled. Odds Ratios in the 2x2 contingency table below are See the method document on statistical tests for a more detailed explanation of this statistical test.
+ * The name SOR is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.Statistical notes
* Caveat
- * Caveats
+ *
+ *
+ *
+ * Related annotations
+ *
+ *
*
*/
public class AS_BaseQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java
index 7a84795943..4166c6bca4 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java
@@ -65,7 +65,27 @@
/**
- * Allele specific strand bias estimated using Fisher's Exact Test
+ * Allele-specific strand bias estimated using Fisher's Exact Test
+ *
+ * Statistical notes
+ * Caveats
+ *
+ *
+ * Related annotations
+ *
+ *
*
*/
public class AS_FisherStrand extends AS_StrandBiasTest implements AS_StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
index b3255fdf12..cc492d1934 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
@@ -68,9 +68,26 @@
/**
- * Allele specific Rank Sum Test for mapping qualities of REF versus each ALT reads
+ * Allele-specific Rank Sum Test for mapping qualities of REF versus ALT reads
*
- * Currently this annotation duplicate the MappingQualityRankSumTest annotation
+ * Statistical notes
+ * Caveats
+ *
+ *
+ * Related annotations
+ *
+ *
*
*/
public class AS_MappingQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java
index 1b44d66bc6..dc6582997a 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java
@@ -79,8 +79,13 @@
*
*
* Caveat
- * Related annotations
+ *
+ *
*/
public class AS_RMSMappingQuality extends AS_RMSAnnotation implements AS_StandardAnnotation, ActiveRegionBasedAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
index 5cd4e060b5..d125c2fe52 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
@@ -63,9 +63,11 @@
import java.util.List;
/**
- * Allele-specific Rank Sum Test for relative positioning of REF versus each ALT allele within reads
+ * Allele-specific Rank Sum Test for relative positioning of REF versus ALT allele within reads
*
- * Caveat
- *
+ *
*
+ * Related annotations
+ *
+ *
*
*/
public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java
index 40cfe39b2d..9e8124b860 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java
@@ -65,6 +65,46 @@
/**
* Allele-specific strand bias estimated by the Symmetric Odds Ratio test
*
+ * Statistical notes
+ *
+ *
+ *
+ *
+ * + strand - strand
+ * REF; X[0][0] X[0][1]
+ * ALT; X[1][0] X[1][1] Caveat
+ * Related annotations
+ *
+ *
+ *
*/
public class AS_StrandOddsRatio extends AS_StrandBiasTest implements AS_StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
index 9605c94eb4..c61403933f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java
@@ -65,13 +65,23 @@
/**
* Rank Sum Test of REF versus ALT base quality scores
*
- * Statistical notes
* Caveat
- * Caveats
+ *
+ *
+ *
+ * Related annotations
+ *
+ *
*
*/
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
index 076d5bc269..f50c4ae274 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java
@@ -77,9 +77,24 @@
/**
- * Phred-scaled p-value for exact test of excess heterozygosity.
- * Using implementation from
- * Wigginton JE, Cutler DJ, Abecasis GR. A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 2005;76(5):887-893.
+ * Phred-scaled p-value for exact test of excess heterozygosity
+ *
+ * Statistical notes
+ * Caveats
+ *
+ *
+ *
+ * Related annotations
+ *
+ *
+ *
*/
public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(ExcessHet.class);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
index 0f83e64446..1a6e4242ef 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
@@ -68,7 +68,10 @@
/**
* Strand bias estimated using Fisher's Exact Test
*
- * Statistical notes
@@ -81,6 +84,7 @@
*
* Related annotations
*
+ *
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
index 28c707e474..122767bfba 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java
@@ -85,6 +85,11 @@
* Related annotations
+ *
+ *
+ *
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation, ReducibleAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
index f939542588..256b90259f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java
@@ -70,11 +70,14 @@
* Statistical notes
* Caveat
- * Caveats
+ *
*
* Related annotations
*
+ *
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
index 7d0332e7d9..fda303cabc 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
@@ -80,8 +80,12 @@
* Statistical notes
* Caveat
+ * Related annotations
*
+ *
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
index 3b33d80c65..09a38952e2 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
@@ -65,7 +65,9 @@
/**
* Rank Sum Test for relative positioning of REF versus ALT alleles within reads
*
- * Caveat
- *
+ *
+ *
+ * Related annotations
+ *
+ *
*
*/
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
index 38822a0555..653848a0c1 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
@@ -66,7 +66,9 @@
/**
* Strand bias estimated by the Symmetric Odds Ratio test
*
- * Statistical notes
* Caveat
+ * Related annotations
*
+ *
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java
index 1419180a92..a5419139bf 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java
@@ -79,7 +79,7 @@
* Create plots to visualize base recalibration results
*
*
- * This tool generates plots for visualizing the quality of a recalibration run.
+ * This tool generates plots for visualizing the quality of a recalibration run (as performed by BaseRecalibrator).
*
- * This tool is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating - * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative - * of poor base quality. This tool generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and context). Since there is a large amount of data, one can then calculate an empirical - * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. - * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). - *
- *- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added regardless of whether - * or not they were specified. + * Variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence + * read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores + * produced by the machines are subject to various sources of systematic technical error, leading to over- or + * under-estimated base quality scores in the data. Base quality score recalibration (BQSR) is a process in which we + * apply machine learning to model these errors empirically and adjust the quality scores accordingly. This allows us + * to get more accurate base qualities, which in turn improves the accuracy of our variant calls. + * + * The base recalibration process involves two key steps: first the program builds a model of covariation based on the + * data and a set of known variants (which you can bootstrap if there is none available for your organism), then it + * adjusts the base quality scores in the data based on the model. + * + * There is an optional but highly recommended step that involves building a second model and generating before/after + * plots to visualize the effects of the recalibration process. This is useful for quality control purposes. + * + * This tool performs the first step described above: it builds the model of covariation and produces the recalibration + * table. It operates only at sites that are not in dbSNP; we assume that all reference mismatches we see are therefore + * errors and indicative of poor base quality. This tool generates tables based on various user-specified covariates + * (such as read group, reported quality score, cycle, and context). 
Assuming we are working with a large amount of data, + * we can then calculate an empirical probability of error given the particular covariates seen at this site, + * where p(error) = num mismatches / num observations. + * + * The output file is a table (of the several covariate values, number of observations, number of mismatches, empirical + * quality score). *
* - ** A BAM file containing data that needs to be recalibrated. *
@@ -131,6 +144,13 @@ * -knownSites latest_dbsnp.vcf \ * -o recal_data.table * + * + *
The basic operation of the HaplotypeCaller proceeds as follows:
* @@ -119,7 +119,7 @@ * evidence for variation. * *For each ActiveRegion, the program builds a De Bruijn-like graph to reassemble the ActiveRegion, and identifies
* what are the possible haplotypes present in the data. The program then realigns each haplotype against the reference
@@ -135,7 +135,7 @@
*
*
For each potentially variant site, the program applies Bayes’ rule, using the likelihoods of alleles given the + *
For each potentially variant site, the program applies Bayes' rule, using the likelihoods of alleles given the * read data to calculate the likelihoods of each genotype per sample given the read data observed for that * sample. The most likely genotype is then assigned to the sample.
* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java index d5276254f9..050d174ca6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java @@ -130,7 +130,7 @@ * -T IndelRealigner \ * -R reference.fasta \ * -I input.bam \ - * --known indels.vcf \ + * -known indels.vcf \ * -targetIntervals intervalListFromRTC.intervals \ * -o realignedBam.bam * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java index 9f80b66f4c..6938784ee6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java @@ -140,7 +140,7 @@ * * */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class PhaseByTransmission extends RodWalker- * A VCF file and a metadata file + * A VCF file and a metadata file. + *
+ * +*The metaData file can take two formats, the first of which is the first 6 columns of the standard pedigree file. This + * is what Plink describes as a .fam file. Note that the sex encoding convention is 1=male; 2=female; other=unknown. An example .fam file is as follows (note that there is no header):
+ *+ * CEUTrio NA12878 NA12891 NA12892 2 -9 + * CEUTrio NA12891 UNKN1 UNKN2 1 -9 + * CEUTrio NA12892 UNKN3 UNKN4 2 -9 + *+ *
where the entries are: FamilyID IndividualID DadID MomID Sex Phenotype.
+ *An alternate format is a two-column key-value file:
+ *+ * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 + * NA12891 fid=CEUTrio;sex=1;phenotype=-9 + * NA12892 fid=CEUTrio;sex=2;phenotype=-9 + *+ *
where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.
+ * + *+ * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the + * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering + * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will + * construct a formatted .fam file from the data. *
* *- * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam) + * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam). See the PLINK format specification for more details. *
* *The metaData file can take two formats, the first of which is the first 6 lines of the standard pedigree file. This - * is what Plink describes as a .fam file. An example .fam file is as follows (note that there is no header):
- *- * CEUTrio NA12878 NA12891 NA12892 2 -9 - * CEUTrio NA12891 UNKN1 UNKN2 2 -9 - * CEUTrio NA12892 UNKN3 UNKN4 1 -9 - *- *
where the entries are: FamilyID IndividualID DadID MomID Phenotype Sex.
- *An alternate format is a two-column key-value file:
- *- * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 - * NA12891 fid=CEUTrio;sex=2;phenotype=-9 - * NA12892 fid=CEUTrio;sex=1;phenotype=-9 - *- *
where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.
- *- * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the - * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering - * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will - * construct a formatted .fam file from the data. - *
- */ + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file") File metaDataFile; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java index aac28832f1..ceb1d0f4fc 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java @@ -100,11 +100,11 @@ public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { - super(String.format("Badly formed genome loc: %s: %s", message, loc)); + super(String.format("Badly formed genome location: %s: %s", message, loc)); } public MalformedGenomeLoc(String message) { - super(String.format("Badly formed genome loc: %s", message)); + super(String.format("Badly formed genome location: %s", message)); } } @@ -129,66 +129,66 @@ public BadArgumentValue(String arg, String message) { public static class UnknownTribbleType extends CommandLineException { public UnknownTribbleType(String type, String message) { - super(String.format("Unknown tribble type %s: %s", type, message)); + super(String.format("Unknown variant input file type %s: %s", type, message)); } } public static class BadTmpDir extends UserException { public BadTmpDir(String message) { - super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); + super(String.format("An error occurred while working with the tmp directory %s. 
You can specify -Djava.io.tmpdir=X on the command line (before the -jar argument) where X is a directory path, to use a more appropriate temporary directory. The exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); } } public static class TooManyOpenFiles extends UserException { public TooManyOpenFiles() { - super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. See the unix ulimit command to adjust this limit")); + super(String.format("An error occurred because there were too many files open concurrently; your system's open file handle limit is probably too small. See the unix ulimit command to adjust this limit or ask your system administrator for help.")); } } public static class LocalParallelizationProblem extends UserException { public LocalParallelizationProblem(final File file) { - super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + super(String.format("An error occurred because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or your system experienced a temporary instability. Your system administrator can help you resolve these problems.", file.getAbsolutePath())); } } public static class NotEnoughMemory extends UserException { public NotEnoughMemory() { - super(String.format("There was a failure because you did not provide enough memory to run this program. 
See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); + super(String.format("An error occurred because you did not provide enough memory to run this program. You can use the -Xmx argument (before the -jar argument) to adjust the maximum heap size provided to Java. Note that this is a JVM argument, not a GATK argument.")); } } public static class ErrorWritingBamFile extends UserException { public ErrorWritingBamFile(String message) { - super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. The exact error was %s", message)); + super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. Your system administrator can help you resolve these issues. If you know what temporary directory to use, you can specify it by adding -Djava.io.tmpdir=X to the command line (before the -jar argument), where X is the directory path. The exact error was %s", message)); } } public static class NoSpaceOnDevice extends UserException { public NoSpaceOnDevice() { - super("There is no space left on the device, so writing failed"); + super("Writing failed because there is no space left on the disk or hard drive. 
Please make some space or specify a different location for writing output files."); } } public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { - super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); + super(String.format("Could not read file because %s caused by %s", message, getMessage(e))); } public CouldNotReadInputFile(File file) { - super(String.format("Couldn't read file %s", file.getAbsolutePath())); + super(String.format("Could not read file %s", file.getAbsolutePath())); } public CouldNotReadInputFile(File file, String message) { - super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); + super(String.format("Could not read file %s because %s", file.getAbsolutePath(), message)); } public CouldNotReadInputFile(String file, String message) { - super(String.format("Couldn't read file %s because %s", file, message)); + super(String.format("Could not read file %s because %s", file, message)); } public CouldNotReadInputFile(File file, String message, Exception e) { - super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + super(String.format("Could not read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } public CouldNotReadInputFile(File file, Exception e) { @@ -203,19 +203,19 @@ public CouldNotReadInputFile(String message) { public static class CouldNotCreateOutputFile extends UserException { public CouldNotCreateOutputFile(File file, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + super(String.format("Could not write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } public CouldNotCreateOutputFile(File file, String message) { - super(String.format("Couldn't 
write file %s because %s", file.getAbsolutePath(), message)); + super(String.format("Could not write file %s because %s", file.getAbsolutePath(), message)); } public CouldNotCreateOutputFile(String filename, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); + super(String.format("Could not write file %s because %s with exception %s", filename, message, getMessage(e))); } public CouldNotCreateOutputFile(File file, Exception e) { - super(String.format("Couldn't write file %s because exception %s", file.getAbsolutePath(), getMessage(e))); + super(String.format("Could not write file %s because exception %s", file.getAbsolutePath(), getMessage(e))); } public CouldNotCreateOutputFile(String message, Exception e) { @@ -225,20 +225,20 @@ public CouldNotCreateOutputFile(String message, Exception e) { public static class MissortedBAM extends UserException { public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { - super(String.format("Missorted Input SAM/BAM/CRAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); + super(String.format("Missorted input SAM/BAM/CRAM files: %s must be sorted in %s order but order was: %s. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information.", file, order, header.getSortOrder())); } public MissortedBAM(SAMFileHeader.SortOrder order, String message) { - super(String.format("Missorted Input SAM/BAM/CRAM files: files are not sorted in %s order; %s", order, message)); + super(String.format("Missorted input SAM/BAM/CRAM files: files are not sorted in %s order. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. 
Error details: %s", order, message)); } public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { - super(String.format("Missorted Input SAM/BAM/CRAM file %s: file sorted in %s order but %s is required; %s", + super(String.format("Missorted input SAM/BAM/CRAM file %s: file sorted in %s order but %s is required. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. Error details: %s", read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); } public MissortedBAM(String message) { - super(String.format("Missorted Input SAM/BAM/CRAM files: %s", message)); + super(String.format("Missorted input SAM/BAM/CRAM files. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. Error details: %s", message)); } } @@ -252,7 +252,7 @@ public MalformedBAM(File file, String message) { } public MalformedBAM(String source, String message) { - super(String.format("SAM/BAM/CRAM file %s is malformed: %s", source, message)); + super(String.format("SAM/BAM/CRAM file %s is malformed. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. Error details: %s", source, message)); } } @@ -262,7 +262,7 @@ public MisencodedBAM(SAMRecord read, String message) { } public MisencodedBAM(String source, String message) { - super(String.format("SAM/BAM/CRAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); + super(String.format("SAM/BAM/CRAM file %s appears to be using the wrong encoding for quality scores: %s. 
Please see https://www.broadinstitute.org/gatk/guide?id=6470 for more details and options related to this error.", source, message)); } } @@ -294,25 +294,25 @@ public MalformedVCFHeader(String message) { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(final SAMRecord read) { - super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please see " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); } } public static class ReadHasUndefinedReadGroup extends MalformedBAM { public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { - super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please see " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); } } public static class VariantContextMissingRequiredField extends UserException { public VariantContextMissingRequiredField(String field, VariantContext vc) { - super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); + super(String.format("Variant at %s:%d is is missing the required field %s.", vc.getChr(), vc.getStart(), field)); } } public static class MissortedFile extends UserException { public MissortedFile(File file, String message, Exception e) { - super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); + super(String.format("Missorted input file: %s is must be sorted in coordinate order. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. Error details: %s and got error %s", file, message, getMessage(e))); } } @@ -366,18 +366,14 @@ public DeprecatedArgument(String param, String doc) { public static class IncompatibleSequenceDictionaries extends UserException { public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { - super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", + super(String.format("Input files %s and %s have incompatible contigs. Please see " + HelpConstants.forumPost("discussion/63/input-files-have-incompatible-contigs") + "for more information. 
Error details: %s.\n %s contigs = %s\n %s contigs = %s", name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); } } public static class LexicographicallySortedSequenceDictionary extends UserException { public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { - super(String.format("Lexicographically sorted human genome sequence detected in %s." - + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." - + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") - + "\n %s contigs = %s", + super(String.format("Lexicographically sorted human genome sequence detected in %s. Please see " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") + "for more information. Error details: %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } }