diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java index 88ea9496a0..d45ddbef1a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java @@ -75,18 +75,16 @@ public class RecalibrationArgumentCollection implements Cloneable { /** * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, - * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) - * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. - * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. + * so it is critical that a database of known polymorphic sites (e.g. dbSNP) is given to the tool in order to mask out those sites. */ - @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) + @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites", required = false) public List> knownSites = Collections.emptyList(); /** * After the header, data records occur one per line until the end of the file. 
The first several items on a line are the * values of the individual covariates and will change depending on which covariates were specified at runtime. The last * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. */ @Gather(BQSRGatherer.class) @Output(doc = "The output recalibration table file to create", required = true) @@ -107,7 +105,7 @@ public class RecalibrationArgumentCollection implements Cloneable { @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false) public String[] COVARIATES = null; - /* + /** * The Cycle and Context covariates are standard and are included by default unless this argument is provided. * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java index 10439d9184..dc21830245 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java @@ -64,15 +64,25 @@ /** - * Allele-specific rank Sum Test of REF versus each ALT base quality scores + * Allele-specific rank Sum Test of REF versus ALT base quality scores * - *

This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting each alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

+ *

This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele.

+ * + *

The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

* *

Statistical notes

*

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* - *

Caveat

- *

Uninformative reads are not used in these calculations.

+ *

Caveats

+ * + * + *

Related annotations

+ * * */ public class AS_BaseQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java index 7a84795943..4166c6bca4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java @@ -65,7 +65,27 @@ /** - * Allele specific strand bias estimated using Fisher's Exact Test + * Allele-specific strand bias estimated using Fisher's Exact Test + * + * *

Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other.

+ * + *

The AS_FisherStrand annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele, and does so separately for each alternate allele.

+ *

The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.

+ * + *

Statistical notes

+ *

See the method document on statistical tests for a more detailed explanation of this application of Fisher's Exact Test.

+ * + *

Caveats

+ * + *

Related annotations

+ * * */ public class AS_FisherStrand extends AS_StrandBiasTest implements AS_StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java index b3255fdf12..cc492d1934 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java @@ -68,9 +68,26 @@ /** - * Allele specific Rank Sum Test for mapping qualities of REF versus each ALT reads + * Allele specific Rank Sum Test for mapping qualities of REF versus ALT reads * - * Currently this annotation duplicate the MappingQualityRankSumTest annotation + *

This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting each alternate allele. To be clear, it does so separately for each alternate allele.

+ * + *

The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.

+ *

Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. + * + *

Statistical notes

+ *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

+ * + *

Caveats

+ * + * + *

Related annotations

+ * * */ public class AS_MappingQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java index 1b44d66bc6..dc6582997a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java @@ -79,8 +79,13 @@ * * *

Caveat

- *

Uninformative reads are not used in these annotations.

+ *

Uninformative reads are not used in this annotation.

* + *

Related annotations

+ * */ public class AS_RMSMappingQuality extends AS_RMSAnnotation implements AS_StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java index 5cd4e060b5..d125c2fe52 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java @@ -63,9 +63,11 @@ import java.util.List; /** - * Allele-specific Rank Sum Test for relative positioning of REF versus each ALT allele within reads + * Allele-specific Rank Sum Test for relative positioning of REF versus ALT allele within reads * - *

This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and each alternate allele. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.

+ *

This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and each alternate allele. To be clear, it does so separately for each alternate allele.

+ * + *

Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.

* *

The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele.

* @@ -75,8 +77,15 @@ *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* *

Caveat

- *

Uninformative reads are not used in these annotations.

+ * * + *

Related annotations

+ * * */ public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java index 40cfe39b2d..9e8124b860 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java @@ -65,6 +65,46 @@ /** * Allele-specific strand bias estimated by the Symmetric Odds Ratio test * + *

Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other.

+ * + *

The AS_StrandOddsRatio annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. It does so separately for each alternate allele. The reported value is ln-scaled.

+ * + *

Statistical notes

+ *

Odds Ratios in the 2x2 contingency table below are

+ * + * $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$ + * + *

and its inverse:

+ * + * + * + * + * + *
      | + strand | - strand
REF   | X[0][0]  | X[0][1]
ALT   | X[1][0]  | X[1][1]
+ * + *

The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of a large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where

+ * + * $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1])} $$ + * + *

and

+ * + * $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1])} $$ + * + *

ensures that the annotation value is large only.

+ * + *

See the method document on statistical tests for a more detailed explanation of this statistical test.

+ * + *

Caveat

+ *

+ * The name AS_StrandOddsRatio is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.

+ * + *

Related annotations

+ * + * */ public class AS_StrandOddsRatio extends AS_StrandBiasTest implements AS_StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index 9605c94eb4..c61403933f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -65,13 +65,23 @@ /** * Rank Sum Test of REF versus ALT base quality scores * - *

This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

+ *

This variant-level annotation compares the base qualities of the data supporting the reference allele with those supporting any alternate allele.

+ * + *

The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

* *

Statistical notes

*

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* - *

Caveat

- *

The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ *

Caveats

+ * + * + *

Related annotations

+ * * */ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java index 076d5bc269..f50c4ae274 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java @@ -77,9 +77,24 @@ /** - * Phred-scaled p-value for exact test of excess heterozygosity. - * Using implementation from - * Wigginton JE, Cutler DJ, Abecasis GR. A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 2005;76(5):887-893. + * Phred-scaled p-value for exact test of excess heterozygosity + * + *

This annotation is a one-sided Phred-scaled p-value from an exact test of the Hardy-Weinberg Equilibrium. The null hypothesis is that the number of heterozygotes follows the Hardy-Weinberg Equilibrium. The p-value is the probability of observing at least as many heterozygotes as were observed, given the null hypothesis. The implementation used is adapted from Wigginton JE, Cutler DJ, Abecasis GR. A Note on Exact Tests of Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 2005;76(5):887-893.

+ * + *

Statistical notes

+ *

The p-value is calculated exactly by using the Levene-Haldane distribution. This implementation also uses a mid-p correction as described by Graffelman, J. & Moreno, V. (2013). The mid p-value in exact tests for Hardy-Weinberg equilibrium. Statistical Applications in Genetics and Molecular Biology, 12(4), pp. 433-448.

+ * + *

Caveats

+ * + * + *

Related annotations

+ * + * */ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static Logger logger = Logger.getLogger(ExcessHet.class); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java index 0f83e64446..1a6e4242ef 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java @@ -68,7 +68,10 @@ /** * Strand bias estimated using Fisher's Exact Test * - *

Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The FisherStrand annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.”

+ *

Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other.

+ * + *

The FisherStrand annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It uses Fisher's Exact Test to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.

+ * *

The output is a Phred-scaled p-value. The higher the output value, the more likely there is to be bias. More bias is indicative of false positive calls.

* *

Statistical notes

@@ -81,6 +84,7 @@ * *

Related annotations

* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index 28c707e474..122767bfba 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -85,6 +85,11 @@ *
  • This annotation can take a valid pedigree file to specify founders.
  • * * + *

    Related annotations

    + * + * */ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation, ReducibleAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java index f939542588..256b90259f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java @@ -70,11 +70,14 @@ *

    Statistical notes

    *

    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for mapping qualities (MAPQ of reads supporting REF vs. MAPQ of reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

    * - *

    Caveat

    - *

    The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

    + *

    Caveats

    + * * *

    Related annotations

    * * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java index 7d0332e7d9..fda303cabc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java @@ -80,8 +80,12 @@ *

    Statistical notes

    *

The root mean square combines the mean and the standard deviation of the mapping qualities: its square is equal to the square of the mean plus the variance (the squared standard deviation) of the mapping qualities.

    * + *

    Caveat

    + *

    Uninformative reads are not used in this annotation.

    + * *

    Related annotations

    * * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index 3b33d80c65..09a38952e2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -65,7 +65,9 @@ /** * Rank Sum Test for relative positioning of REF versus ALT alleles within reads * - *

    This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.

    + *

    This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles.

    + * + *

    Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.

    * *

    The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele.

    * @@ -75,7 +77,15 @@ *

    The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

    * *

    Caveat

    - *

    The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

    + * + * + * *

    Related annotations

    + * * */ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index 38822a0555..653848a0c1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -66,7 +66,9 @@ /** * Strand bias estimated by the Symmetric Odds Ratio test * - *

    Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.

    + *

    Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other.

    + * + *

The StrandOddsRatio annotation is one of several methods that aim to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele. The reported value is ln-scaled.

    * *

    Statistical notes

    *

    Odds Ratios in the 2x2 contingency table below are

    @@ -93,8 +95,13 @@ * *

    See the method document on statistical tests for a more detailed explanation of this statistical test.

    * + *

    Caveat

    + *

    + * The name SOR is not entirely appropriate because the implementation was changed somewhere between the start of development and release of this annotation. Now SOR isn't really an odds ratio anymore. The goal was to separate certain cases of data without penalizing variants that occur at the ends of exons because they tend to only be covered by reads in one direction (depending on which end of the exon they're on), so if a variant has 10 ref reads in the + direction, 1 ref read in the - direction, 9 alt reads in the + direction and 2 alt reads in the - direction, it's actually not strand biased, but the FS score is pretty bad. The implementation that resulted derived in part from empirically testing some read count tables of various sizes with various ratios and deciding from there.

    + * *

    Related annotations

    * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java index 1419180a92..a5419139bf 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java @@ -79,7 +79,7 @@ * Create plots to visualize base recalibration results * *

    - * This tool generates plots for visualizing the quality of a recalibration run. + * This tool generates plots for visualizing the quality of a recalibration run (as performed by BaseRecalibrator). *

    * *

    Input

    diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index 5626ce7a1b..0d74a7c4d3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -86,22 +86,35 @@ import java.util.List; /** - * Generate base recalibration table to compensate for systematic errors + * Generate base recalibration table to compensate for systematic errors in basecalling confidences * *

    - * This tool is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating - * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative - * of poor base quality. This tool generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and context). Since there is a large amount of data, one can then calculate an empirical - * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. - * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). - *

    - *

    - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added regardless of whether - * or not they were specified. + * Variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence + * read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores + * produced by the machines are subject to various sources of systematic technical error, leading to over- or + * under-estimated base quality scores in the data. Base quality score recalibration (BQSR) is a process in which we + * apply machine learning to model these errors empirically and adjust the quality scores accordingly. This allows us + * to get more accurate base qualities, which in turn improves the accuracy of our variant calls. + * + * The base recalibration process involves two key steps: first the program builds a model of covariation based on the + * data and a set of known variants (which you can bootstrap if there is none available for your organism), then it + * adjusts the base quality scores in the data based on the model. + * + * There is an optional but highly recommended step that involves building a second model and generating before/after + * plots to visualize the effects of the recalibration process. This is useful for quality control purposes. + * + * This tool performs the first step described above: it builds the model of covariation and produces the recalibration + * table. It operates only at sites that are not in dbSNP; we assume that all reference mismatches we see are therefore + * errors and indicative of poor base quality. This tool generates tables based on various user-specified covariates + * (such as read group, reported quality score, cycle, and context). 
Assuming we are working with a large amount of data, + * we can then calculate an empirical probability of error given the particular covariates seen at this site, + * where p(error) = num mismatches / num observations. + * + * The output file is a table (of the several covariate values, number of observations, number of mismatches, empirical + * quality score). *

    * - *

    Input

    + *

    Inputs

    *

    * A BAM file containing data that needs to be recalibrated. *

    @@ -131,6 +144,13 @@ * -knownSites latest_dbsnp.vcf \ * -o recal_data.table * + * + *

    Notes

    + * + * */ @DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class}) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 25ff347eeb..fe7d7c3ae9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -108,7 +108,7 @@ import java.util.*; /** - * Call SNPs and indels simultaneously via local re-assembly of haplotypes in an active region + * Call SNPs and indels simultaneously via local assembly of haplotypes in an active region * *

    The basic operation of the HaplotypeCaller proceeds as follows:

    * @@ -119,7 +119,7 @@ * evidence for variation.

    * *
    - *

    2. Determine haplotypes by re-assembly of the active region

    + *

    2. Determine haplotypes by assembly of the active region

    * *

    For each ActiveRegion, the program builds a De Bruijn-like graph to reassemble the ActiveRegion, and identifies * the possible haplotypes present in the data. The program then realigns each haplotype against the reference @@ -135,7 +135,7 @@ *
    *

    4. Assign sample genotypes

    * - *

    For each potentially variant site, the program applies Bayes’ rule, using the likelihoods of alleles given the + *

    For each potentially variant site, the program applies Bayes' rule, using the likelihoods of alleles given the * read data to calculate the likelihoods of each genotype per sample given the read data observed for that * sample. The most likely genotype is then assigned to the sample.

    * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java index d5276254f9..050d174ca6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java @@ -130,7 +130,7 @@ * -T IndelRealigner \ * -R reference.fasta \ * -I input.bam \ - * --known indels.vcf \ + * -known indels.vcf \ * -targetIntervals intervalListFromRTC.intervals \ * -o realignedBam.bam * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java index 9f80b66f4c..6938784ee6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java @@ -140,7 +140,7 @@ * * */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java index d5250a480e..a5d4f38f51 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java @@ -138,7 +138,7 @@ // Filter out all reads with zero mapping quality @ReadFilters({MappingQualityZeroFilter.class}) -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class ReadBackedPhasing extends RodWalker { @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) protected boolean DEBUG = false; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index c641c8ea73..09a5dc9859 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -141,6 +141,7 @@ *

    Caveats

    * *
      + *
    • SNPs and indels must be recalibrated in separate runs (but it is not necessary to separate them into different files). Mixed records are treated as indels.
    • *
    • The values used in the example above are only meant to show how the command lines are composed. * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java index e2c8b226ed..4d11f3ef82 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java @@ -159,20 +159,31 @@ static Mode parseString(final String input) { @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="LOD score cutoff for selecting bad variants", required=false) public double BAD_LOD_CUTOFF = -5.0; + /** + * MQ is capped at a "max" value (60 for bwa-mem) when the alignment is considered perfect. Typically, a huge + * proportion of the reads in a dataset are perfectly mapped, which yields a distribution of MQ values with a + * blob below the max value and a huge peak at the max value. This does not conform to the expectations of the + * Gaussian mixture model of VQSR and has been observed to yield a ROC curve with a jump. + * + * This argument aims to mitigate this problem. Using MQCap = X has 2 effects: (1) MQs are transformed by a scaled + * logit on [0,X] (+ epsilon to avoid division by zero) to make the blob more Gaussian-like and (2) the transformed + * MQ=X are jittered to break the peak into a narrow Gaussian. 
+ * + * Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels. We recommend to either use + * --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller or use a MQCap=max+10 to take that + * into account. + * + * If this option is not used, or if MQCap is set to 0, MQ will not be transformed. + */ @Advanced - @Argument(fullName="MQCapForLogitJitterTransform", shortName = "MQCap", doc="MQ is capped at a \"max\" value (60 for bwa-mem) when the alignment is considered perfect." + - "Since often a huge proportion of reads are perfectly mapped, this yields a distribution with a blob < max and a huge peak at max" + - "This is not good for the mixture of Gaussian VQSR model and has been observed to yield a ROC curve with a jump." + - "Using MQCap = X has 2 effects: (1) MQs are transformed by a scaled logit on [0,X] (+ epsilon to avoid division by zero)" + - "to make the blob more Gaussian-like and (2) The transformed MQ=X are jittered to break the peak into a narrow Gaussian." + - "Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels." + - "We recommend to use --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller, but if not, use a MQCap=max+10 to take that into account." + - "If this option is not used, or if MQCap is set to 0, MQ will not be transformed.", required=false) + @Argument(fullName="MQCapForLogitJitterTransform", shortName = "MQCap", doc="Apply logit transform and jitter to MQ values", required=false) public int MQ_CAP = 0; + /** * The following 2 arguments are hidden because they are only for testing different jitter amounts with and without logit transform. * Once this will have been tested, and the correct jitter amount chosen (perhaps as a function of the logit range [0,max]) they can be removed. 
*/ + @Hidden @Advanced @Argument(fullName = "no_MQ_logit", shortName = "NoMQLogit", doc="MQ is by default transformed to log[(MQ_cap + epsilon - MQ)/(MQ + epsilon)] to make it more Gaussian-like. Use this flag to not do that.", required = false) diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index 33ec49f6c6..0601034dfb 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -336,7 +336,8 @@ public void setDownsamplingMethod(DownsamplingMethod method) { * Any value greater than zero will be used to recalculate the quantization using that many levels. * Negative values mean that we should quantize using the recalibration report's quantization level. */ - + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) + public int quantizationLevels = 0; /** * Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning. @@ -356,9 +357,6 @@ public void setDownsamplingMethod(DownsamplingMethod method) { @Argument(fullName="round_down_quantized", shortName = "RDQ", doc = "Round quals down to nearest quantized qual", required=false, exclusiveOf="quantize_quals") public boolean roundDown = false; - @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) - public int quantizationLevels = 0; - /** * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. 
*/ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 022ef2e37d..abd53f9f0b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -248,8 +248,8 @@ * -V input.vcf \ * -o output.vcf \ * -selectType INDEL - * -minIndelSize 2 - * -maxIndelSize 5 + * --minIndelSize 2 + * --maxIndelSize 5 * * *

      Exclude indels from a VCF:

      @@ -259,7 +259,7 @@ * -T SelectVariants \ * --variant input.vcf \ * -o output.vcf \ - * -selectTypeToExclude INDEL + * --selectTypeToExclude INDEL * * *

      Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):

      diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java index b215bb3a5b..10f463727b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java @@ -57,12 +57,35 @@ * *

      Inputs

      *

      - * A VCF file and a metadata file + * A VCF file and a metadata file. + *

      + * +*

      The metaData file can take two formats, the first of which is the first 6 columns of the standard pedigree file. This + * is what Plink describes as a .fam file. Note that the sex encoding convention is 1=male; 2=female; other=unknown. An example .fam file is as follows (note that there is no header):

      + *
      + * CEUTrio NA12878 NA12891 NA12892 2 -9
      + * CEUTrio NA12891 UNKN1 UNKN2 1 -9
      + * CEUTrio NA12892 UNKN3 UNKN4 2 -9
      + * 
      + *

      where the entries are: FamilyID IndividualID DadID MomID Sex Phenotype.

      + *

      An alternate format is a two-column key-value file:

      + *
      + * NA12878        fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
      + * NA12891        fid=CEUTrio;sex=1;phenotype=-9
      + * NA12892        fid=CEUTrio;sex=2;phenotype=-9
      + * 
      + *

      where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.

      + * + *

      + * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the + * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering + * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will + * construct a formatted .fam file from the data. *

      * *

      Outputs

      *

      - * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam) + * A binary pedigree in PLINK format, composed of three files (.bed/.bim/.fam). See the PLINK format specification for more details. *

      * *

      Example

      @@ -85,30 +108,7 @@ public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - - /** - *

      The metaData file can take two formats, the first of which is the first 6 lines of the standard pedigree file. This - * is what Plink describes as a .fam file. An example .fam file is as follows (note that there is no header):

      - *
      -     * CEUTrio NA12878 NA12891 NA12892 2 -9
      -     * CEUTrio NA12891 UNKN1 UNKN2 2 -9
      -     * CEUTrio NA12892 UNKN3 UNKN4 1 -9
      -     * 
      - *

      where the entries are: FamilyID IndividualID DadID MomID Phenotype Sex.

      - *

      An alternate format is a two-column key-value file:

      - *
      -     * NA12878        fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
      -     * NA12891        fid=CEUTrio;sex=2;phenotype=-9
      -     * NA12892        fid=CEUTrio;sex=1;phenotype=-9
      -     * 
      - *

      where unknown parents do not need to be specified. The columns are the individual ID and a list of key-value pairs.

      - *

      - * Regardless of which file is specified, the tool will output a .fam file alongside the pedigree file. If the - * command line has "-m [name].fam", the fam file will be subset and reordered to match the sample content and ordering - * of the VCF. However, if a metadata file of the alternate format is passed by "-m [name].txt", the tool will - * construct a formatted .fam file from the data. - *

      - */ + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file") File metaDataFile; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java index aac28832f1..ceb1d0f4fc 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java @@ -100,11 +100,11 @@ public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { - super(String.format("Badly formed genome loc: %s: %s", message, loc)); + super(String.format("Badly formed genome location: %s: %s", message, loc)); } public MalformedGenomeLoc(String message) { - super(String.format("Badly formed genome loc: %s", message)); + super(String.format("Badly formed genome location: %s", message)); } } @@ -129,66 +129,66 @@ public BadArgumentValue(String arg, String message) { public static class UnknownTribbleType extends CommandLineException { public UnknownTribbleType(String type, String message) { - super(String.format("Unknown tribble type %s: %s", type, message)); + super(String.format("Unknown variant input file type %s: %s", type, message)); } } public static class BadTmpDir extends UserException { public BadTmpDir(String message) { - super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); + super(String.format("An error occurred while working with the tmp directory %s. 
You can specify -Djava.io.tmpdir=X on the command line (before the -jar argument) where X is a directory path, to use a more appropriate temporary directory. The exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); } } public static class TooManyOpenFiles extends UserException { public TooManyOpenFiles() { - super(String.format("There was a failure because there are too many files open concurrently; your system's open file handle limit is too small. See the unix ulimit command to adjust this limit")); + super(String.format("An error occurred because there were too many files open concurrently; your system's open file handle limit is probably too small. See the unix ulimit command to adjust this limit or ask your system administrator for help.")); } } public static class LocalParallelizationProblem extends UserException { public LocalParallelizationProblem(final File file) { - super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + super(String.format("An error occurred because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or your system experienced a temporary instability. Your system administrator can help you resolve these problems.", file.getAbsolutePath())); } } public static class NotEnoughMemory extends UserException { public NotEnoughMemory() { - super(String.format("There was a failure because you did not provide enough memory to run this program. 
See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); + super(String.format("An error occurred because you did not provide enough memory to run this program. You can use the -Xmx argument (before the -jar argument) to adjust the maximum heap size provided to Java. Note that this is a JVM argument, not a GATK argument.")); } } public static class ErrorWritingBamFile extends UserException { public ErrorWritingBamFile(String message) { - super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. The exact error was %s", message)); + super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. Your system administrator can help you resolve these issues. If you know what temporary directory to use, you can specify it by adding -Djava.io.tmpdir=X to the command line (before the -jar argument), where X is the directory path. The exact error was %s", message)); } } public static class NoSpaceOnDevice extends UserException { public NoSpaceOnDevice() { - super("There is no space left on the device, so writing failed"); + super("Writing failed because there is no space left on the disk or hard drive. 
Please make some space or specify a different location for writing output files."); } } public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { - super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); + super(String.format("Could not read file because %s caused by %s", message, getMessage(e))); } public CouldNotReadInputFile(File file) { - super(String.format("Couldn't read file %s", file.getAbsolutePath())); + super(String.format("Could not read file %s", file.getAbsolutePath())); } public CouldNotReadInputFile(File file, String message) { - super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); + super(String.format("Could not read file %s because %s", file.getAbsolutePath(), message)); } public CouldNotReadInputFile(String file, String message) { - super(String.format("Couldn't read file %s because %s", file, message)); + super(String.format("Could not read file %s because %s", file, message)); } public CouldNotReadInputFile(File file, String message, Exception e) { - super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + super(String.format("Could not read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } public CouldNotReadInputFile(File file, Exception e) { @@ -203,19 +203,19 @@ public CouldNotReadInputFile(String message) { public static class CouldNotCreateOutputFile extends UserException { public CouldNotCreateOutputFile(File file, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); + super(String.format("Could not write file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } public CouldNotCreateOutputFile(File file, String message) { - super(String.format("Couldn't 
write file %s because %s", file.getAbsolutePath(), message)); + super(String.format("Could not write file %s because %s", file.getAbsolutePath(), message)); } public CouldNotCreateOutputFile(String filename, String message, Exception e) { - super(String.format("Couldn't write file %s because %s with exception %s", filename, message, getMessage(e))); + super(String.format("Could not write file %s because %s with exception %s", filename, message, getMessage(e))); } public CouldNotCreateOutputFile(File file, Exception e) { - super(String.format("Couldn't write file %s because exception %s", file.getAbsolutePath(), getMessage(e))); + super(String.format("Could not write file %s because exception %s", file.getAbsolutePath(), getMessage(e))); } public CouldNotCreateOutputFile(String message, Exception e) { @@ -225,20 +225,20 @@ public CouldNotCreateOutputFile(String message, Exception e) { public static class MissortedBAM extends UserException { public MissortedBAM(SAMFileHeader.SortOrder order, File file, SAMFileHeader header) { - super(String.format("Missorted Input SAM/BAM/CRAM files: %s is must be sorted in %s order but order was: %s", file, order, header.getSortOrder())); + super(String.format("Missorted input SAM/BAM/CRAM files: %s must be sorted in %s order but order was: %s. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information.", file, order, header.getSortOrder())); } public MissortedBAM(SAMFileHeader.SortOrder order, String message) { - super(String.format("Missorted Input SAM/BAM/CRAM files: files are not sorted in %s order; %s", order, message)); + super(String.format("Missorted input SAM/BAM/CRAM files: files are not sorted in %s order. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + "for more information. 
Error details: %s", order, message)); } public MissortedBAM(SAMFileHeader.SortOrder order, SAMRecord read, String message) { - super(String.format("Missorted Input SAM/BAM/CRAM file %s: file sorted in %s order but %s is required; %s", + super(String.format("Missorted input SAM/BAM/CRAM file %s: file sorted in %s order but %s is required. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + " for more information. Error details: %s", read.getFileSource().getReader(), read.getHeader().getSortOrder(), order, message)); } public MissortedBAM(String message) { - super(String.format("Missorted Input SAM/BAM/CRAM files: %s", message)); + super(String.format("Missorted input SAM/BAM/CRAM files. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + " for more information. Error details: %s", message)); } } @@ -252,7 +252,7 @@ public MalformedBAM(File file, String message) { } public MalformedBAM(String source, String message) { - super(String.format("SAM/BAM/CRAM file %s is malformed: %s", source, message)); + super(String.format("SAM/BAM/CRAM file %s is malformed. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + " for more information. Error details: %s", source, message)); } } @@ -262,7 +262,7 @@ public MisencodedBAM(SAMRecord read, String message) { } public MisencodedBAM(String source, String message) { - super(String.format("SAM/BAM/CRAM file %s appears to be using the wrong encoding for quality scores: %s; please see the GATK --help documentation for options related to this error", source, message)); + super(String.format("SAM/BAM/CRAM file %s appears to be using the wrong encoding for quality scores: %s. 
Please see https://www.broadinstitute.org/gatk/guide?id=6470 for more details and options related to this error.", source, message)); } } @@ -294,25 +294,25 @@ public MalformedVCFHeader(String message) { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(final SAMRecord read) { - super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please see " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); } } public static class ReadHasUndefinedReadGroup extends MalformedBAM { public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { - super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please see " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); } } public static class VariantContextMissingRequiredField extends UserException { public VariantContextMissingRequiredField(String field, VariantContext vc) { - super(String.format("Variant at %s:%d is is missing the required field %s", vc.getChr(), vc.getStart(), field)); + super(String.format("Variant at %s:%d is missing the required field %s.", vc.getChr(), vc.getStart(), field)); } } public static class MissortedFile extends UserException { public MissortedFile(File file, String message, Exception e) { - super(String.format("Missorted Input file: %s is must be sorted in coordinate order. %s and got error %s", file, message, getMessage(e))); + super(String.format("Missorted input file: %s must be sorted in coordinate order. Please see " + HelpConstants.forumPost("discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram") + " for more information. Error details: %s and got error %s", file, message, getMessage(e))); } } @@ -366,18 +366,14 @@ public DeprecatedArgument(String param, String doc) { public static class IncompatibleSequenceDictionaries extends UserException { public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { - super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s", + super(String.format("Input files %s and %s have incompatible contigs. Please see " + HelpConstants.forumPost("discussion/63/input-files-have-incompatible-contigs") + " for more information. 
Error details: %s.\n %s contigs = %s\n %s contigs = %s", name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2))); } } public static class LexicographicallySortedSequenceDictionary extends UserException { public LexicographicallySortedSequenceDictionary(String name, SAMSequenceDictionary dict) { - super(String.format("Lexicographically sorted human genome sequence detected in %s." - + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." - + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") - + "\n %s contigs = %s", + super(String.format("Lexicographically sorted human genome sequence detected in %s. Please see " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") + " for more information. Error details: %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } }