Merge pull request #692 from broadinstitute/gvda_docfixes_dsde-docs#1422

Add tools to be documented + minor tweaks to MD docs
broadinstitute · Dec 1, 2016 · 6e9f6cb · 6e9f6cb
2 parents f8a93c9 + 0d23d0b
commit 6e9f6cb
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 13 deletions.
diff --git a/build.gradle b/build.gradle
@@ -161,14 +161,16 @@ ext.commandClasses = ["picard.sam.AddCommentsToBam", "picard.sam.AddOrReplaceRea
                           "picard.analysis.CollectWgsMetricsWithNonZeroCoverage", "picard.analysis.CompareMetrics", "picard.sam.CompareSAMs",
                           "picard.analysis.artifacts.ConvertSequencingArtifactToOxoG", "picard.sam.CreateSequenceDictionary", "picard.sam.DownsampleSam",
                           "picard.illumina.ExtractIlluminaBarcodes", "picard.sam.markduplicates.EstimateLibraryComplexity", "picard.sam.FastqToSam", "picard.util.FifoBuffer",
+                          "picard.vcf.MendelianViolations.FindMendelianViolations",
                           "picard.sam.FilterSamReads", "picard.vcf.filter.FilterVcf", "picard.sam.FixMateInformation", "picard.sam.GatherBamFiles", "picard.vcf.GatherVcfs",
                           "picard.vcf.GenotypeConcordance", "picard.illumina.IlluminaBasecallsToFastq", "picard.illumina.IlluminaBasecallsToSam", "picard.illumina.CheckIlluminaDirectory",
                           "picard.sam.CheckTerminatorBlock", "picard.util.IntervalListTools", "picard.util.LiftOverIntervalList", "picard.vcf.LiftoverVcf", "picard.vcf.MakeSitesOnlyVcf",
                           "picard.sam.markduplicates.MarkDuplicates", "picard.sam.markduplicates.MarkDuplicatesWithMateCigar", "picard.analysis.MeanQualityByCycle",
                           "picard.sam.MergeBamAlignment", "picard.sam.MergeSamFiles", "picard.vcf.MergeVcfs", "picard.reference.NormalizeFasta", "picard.sam.PositionBasedDownsampleSam",
                           "picard.reference.ExtractSequences", "picard.analysis.QualityScoreDistribution", "picard.vcf.RenameSampleInVcf", "picard.sam.ReorderSam",
                           "picard.sam.ReplaceSamHeader", "picard.sam.RevertSam", "picard.sam.RevertOriginalBaseQualitiesAndAddMateCigar", "picard.sam.SamFormatConverter",
-                          "picard.sam.SamToFastq", "picard.util.ScatterIntervalsByNs", "picard.sam.SortSam", "picard.vcf.SortVcf", "picard.sam.SplitSamByLibrary",
+                          "picard.sam.SamToFastq", "picard.util.ScatterIntervalsByNs", "picard.sam.SetNmMdAndUqTags",
+                          "picard.sam.SortSam", "picard.vcf.SortVcf", "picard.sam.SplitSamByLibrary", "picard.sam.markduplicates.UmiAwareMarkDuplicatesWithMateCigar",
                           "picard.vcf.UpdateVcfSequenceDictionary", "picard.vcf.VcfFormatConverter", "picard.illumina.MarkIlluminaAdapters", "picard.vcf.SplitVcfs",
                           "picard.sam.ValidateSamFile", "picard.sam.ViewSam", "picard.vcf.VcfToIntervalList"]
 

diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java
@@ -89,8 +89,16 @@ public class MarkDuplicatesWithMateCigar extends AbstractMarkDuplicatesCommandLi
 
     private final Log log = Log.getInstance(MarkDuplicatesWithMateCigar.class);
 
-    @Option(doc = "The minimum distance to buffer records to account for clipping on the 5' end of the records." +
-            "Set this number to -1 to use twice the first read's read length (or 100, whichever is smaller).", optional = true)
+    @Option(doc = "The minimum distance to buffer records to account for clipping on the 5' end of the records. " +
+            "For a given alignment, this parameter controls the width of the window to search for duplicates of that alignment. " +
+            "Due to 5' read clipping, duplicates do not necessarily have the same 5' alignment coordinates, so the algorithm " +
+            "needs to search around the neighborhood. For single end sequencing data, the neighborhood is only determined by " +
+            "the amount of clipping (assuming no split reads), thus setting MINIMUM_DISTANCE to twice the sequencing read length " +
+            "should be sufficient. For paired end sequencing, the neighborhood is also determined by the fragment insert size, " +
+            "so you may want to set MINIMUM_DISTANCE to something like twice the 99.5th percentile of the fragment insert size " +
+            "distribution (see CollectInsertSizeMetrics). Or you can set this number to -1 to use either a) twice the first read's " +
+            "read length, or b) 100, whichever is smaller. Note that the larger the window, the greater the RAM requirements, so " +
+            "you could run into performance limitations if you use a value that is unnecessarily large.", optional = true)
     public int MINIMUM_DISTANCE = -1;
 
     @Option(doc = "Skip record pairs with no mate cigar and include them in the output.")

diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java
@@ -51,15 +51,16 @@
         programGroup = Alpha.class
 )
 public class UmiAwareMarkDuplicatesWithMateCigar extends SimpleMarkDuplicatesWithMateCigar {
-    static final String USAGE_SUMMARY = "Identifies duplicate reads using information from read positions and UMIs." +
-            "All records are then written to the output file with the duplicate records flagged.";
-    static final String USAGE_DETAILS = "<p>UmiAwareMarkDuplicatesWithMateCigar locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are " +
-            "defined as originating from a single fragment of DNA. </p>" +
-            "<p>This tool identifies a duplicate set by assuming that all members of a duplicate set must have the same start and end position," +
-            "and must also have a sufficiently similar UMIs.  Sufficiently similar is parameterized by MAX_EDIT_DISTANCE_TO_JOIN which indicates" +
-            "the edit distance between UMIs that shall be considered to be part of the same original molecule.</p>" +
-            "<p>This tool is not intended to be used on data without UMIs, see MarkDuplicates for marking duplicates that" +
-            "do not have UMIs.</p>";
+    static final String USAGE_SUMMARY = "Identifies duplicate reads using information from read positions and UMIs. ";
+    static final String USAGE_DETAILS = "<p>This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are " +
+            "defined as originating from a single fragment of DNA. It is based on the MarkDuplicatesWithMateCigar tool, with added logic " +
+            "to leverage Unique Molecular Identifier (UMI) information.</p>" +
+            "<p>In addition to assuming that all members of a duplicate set must have the same start and end position, it imposes that " +
+            "they must also have sufficiently similar UMIs. In this context, 'sufficiently similar' is parameterized by the command line " +
+            "argument MAX_EDIT_DISTANCE_TO_JOIN, which sets the edit distance between UMIs that will be considered to be part of the same " +
+            "original molecule. This logic allows for sequencing errors in UMIs.</p>" +
+            "<p>This tool is NOT intended to be used on data without UMIs; for marking duplicates in non-UMI data, see MarkDuplicates or " +
+            "MarkDuplicatesWithMateCigar. Mixed data (where some reads have UMIs and others do not) is not supported.</p>";
 
     @Option(shortName = "MAX_EDIT_DISTANCE_TO_JOIN", doc = "Largest edit distance that UMIs must have in order to be considered as coming from distinct source molecules.", optional = true)
     public int MAX_EDIT_DISTANCE_TO_JOIN = 1;
@@ -73,7 +74,7 @@ public class UmiAwareMarkDuplicatesWithMateCigar extends SimpleMarkDuplicatesWit
     // Since we inherit from SimpleMarkDuplicatesWithMateCigar, it is useful for us to also inherit the tests
     // which do not contain UMIs.  By default, we don't allow for missing UMIs, but for the inherited tests
     // we allow for missing UMIs.
-    @Option(doc = "Allow for missing UMIs if data doesn't have UMIs.  This option is intended to be used only for testing the code.  Use SimpleMarkDuplicatesWithMateCigar if data has missing UMIs.", optional = true)
+    @Option(doc = "FOR TESTING ONLY: allow for missing UMIs if data doesn't have UMIs. This option is intended to be used ONLY for testing the code. Use MarkDuplicatesWithMateCigar if data has no UMIs. Mixed data (where some reads have UMIs and others do not) is not supported.", optional = true)
     public boolean ALLOW_MISSING_UMIS = false;
 
     private final Log log = Log.getInstance(UmiAwareMarkDuplicatesWithMateCigar.class);