Permalink
Browse files

Merge pull request #62 from broadinstitute/rp_increase_max_kmer_in_as…

…sembly

The maximum kmer length is derived from the reads.
  • Loading branch information...
2 parents ed5aff3 + 89e2943 commit 51d618de97737b31b80030b6bd3c46c66b00d90a depristo committed Feb 26, 2013
@@ -75,9 +75,8 @@
public class DeBruijnAssembler extends LocalAssemblyEngine {
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
- private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 12;
+ private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 16;
- private static final int MAX_POSSIBLE_KMER = 75;
private static final int GRAPH_KMER_STEP = 6;
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
@@ -136,7 +135,9 @@ public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, fin
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear();
- final int maxKmer = Math.min(MAX_POSSIBLE_KMER, refHaplotype.getBases().length - KMER_OVERLAP);
+ final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
+ if( maxKmer < MIN_KMER ) { throw new IllegalStateException("Reads are too small for use in assembly."); }
+
// create the graph for each possible kmer
for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
@@ -283,17 +283,17 @@ private void testHCFlatContamination(final String bam1, final String bam2, final
@Test
public void testHCFlatContaminationCase1() {
- testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "0b9d6aabd5ab448f0a2d32f24ff64840");
+ testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "9dbd17769e091ce759efda050cd4f8b2");
}
@Test
public void testHCFlatContaminationCase2() {
- testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "a4ef4a6ce557a6b9666e234fad5c7c80");
+ testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "b8cee98c9c693fd336fc5e574dd744ed");
}
@Test
public void testHCFlatContaminationCase3() {
- testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "bacc98eb2baa5bb1777da24cf0f84913");
+ testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "e7309bd594b8e4b54b712f9877518b8e");
}
}
@@ -68,12 +68,12 @@ private void HCTest(String bam, String args, String md5) {
@Test
public void testHaplotypeCallerMultiSample() {
- HCTest(CEUTRIO_BAM, "", "ecf563b63ca3f640d9cfcc548e8ad776");
+ HCTest(CEUTRIO_BAM, "", "a9748a39604c4ec8bbdb2cb809a971f1");
}
@Test
public void testHaplotypeCallerSingleSample() {
- HCTest(NA12878_BAM, "", "874389182141f41879abea7cb350c9d4");
+ HCTest(NA12878_BAM, "", "c55ebed976767e1f93d2e8ada9d52bf8");
}
@Test(enabled = false)
@@ -84,7 +84,7 @@ public void testHaplotypeCallerSingleSampleWithDbsnp() {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
- "4aa3d0d0a859c0fc0533f29529cc3d95");
+ "70a53e566e6a7090e2f29ed608e9d84f");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@@ -102,7 +102,7 @@ public void testHaplotypeCallerMultiSampleGGAComplex() {
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
- "cfd717dd79ace99a266e8bb58d6cc7a6");
+ "10fdbfeb3b4ea1af7f242a8aca83cb9b");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@@ -113,7 +113,7 @@ private void HCTestComplexVariants(String bam, String args, String md5) {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
- HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "58b484324f0ea00aaac25fb7711ad657");
+ HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@@ -136,7 +136,7 @@ private void HCTestIndelQualityScores(String bam, String args, String md5) {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
- HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "0e8a3a31b8fe5f097d6975aee8b67cdc");
+ HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f1f867dbbe3747f16a0d9e5f11e6ed64");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@@ -146,14 +146,14 @@ public void testHaplotypeCallerSingleSampleIndelQualityScores() {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
- final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2acd853da3a0380650de6827b7c790ac"));
+ final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
- final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("061a95cab149723866ce7c797ba6bdd4"));
+ final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a17e95c1191e3aef7892586fe38ca050"));
executeTest("HCTestStructuralIndels: ", spec);
}
@@ -913,4 +913,19 @@ public static String getBasesReverseComplement(GATKSAMRecord read) {
return getBasesReverseComplement(read.getReadBases());
}
+ /**
+ * Calculate the maximum read length from the given list of reads.
+ * @param reads list of reads
+ * @return non-negative integer
+ */
+ @Ensures({"result >= 0"})
+ public static int getMaxReadLength( final List<GATKSAMRecord> reads ) {
+ if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); }
+
+ int maxReadLength = 0;
+ for( final GATKSAMRecord read : reads ) {
+ maxReadLength = Math.max(maxReadLength, read.getReadLength());
+ }
+ return maxReadLength;
+ }
}
@@ -32,9 +32,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Random;
+import java.util.*;
public class ReadUtilsUnitTest extends BaseTest {
@@ -165,4 +163,20 @@ public void testGetBasesReverseComplement() {
Assert.assertEquals(reconverted, original);
}
}
+
+ @Test (enabled = true)
+ public void testGetMaxReadLength() {
+ for( final int minLength : Arrays.asList( 5, 30, 50 ) ) {
+ for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) {
+ final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
+ for( int readLength = minLength; readLength <= maxLength; readLength++ ) {
+ reads.add( ReadUtils.createRandomRead( readLength ) );
+ }
+ Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match");
+ }
+ }
+
+ final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
+ Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero");
+ }
}

0 comments on commit 51d618d

Please sign in to comment.