diff --git a/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java index e3d6c52fd4f..163f3fc3da2 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/ReadWalker.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.samtools.SAMSequenceDictionary; import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; @@ -57,7 +58,10 @@ protected final void onStartup() { */ void setReadTraversalBounds() { if ( hasUserSuppliedIntervals() ) { - reads.setTraversalBounds(intervalArgumentCollection.getTraversalParameters(getHeaderForReads().getSequenceDictionary())); + final SAMSequenceDictionary dict = getHeaderForReads().getSequenceDictionary(); + final boolean traverseUnmapped = + intervalArgumentCollection.getTraversalParameters(dict).traverseUnmappedReads(); + reads.setTraversalBounds(new TraversalParameters(userIntervals, traverseUnmapped)); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java b/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java index 0ecdf127a0a..22576b54edd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java @@ -16,7 +16,7 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap; +import org.broadinstitute.hellbender.utils.collections.HopscotchMap; import 
org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Tail; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/LocalAssembler.java b/src/main/java/org/broadinstitute/hellbender/tools/LocalAssembler.java new file mode 100644 index 00000000000..9fa8287e82a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/LocalAssembler.java @@ -0,0 +1,2513 @@ +package org.broadinstitute.hellbender.tools; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.PairWalker; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.collections.HopscotchSet; +import org.broadinstitute.hellbender.utils.read.GATKRead; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.*; +import java.util.stream.Collectors; +import java.util.zip.GZIPOutputStream; + +@DocumentedFeature +@BetaFeature +@CommandLineProgramProperties( + summary = "Performs local assembly of small regions to discover structural variants.", + oneLineSummary = "Local assembler for SVs", + usageExample = "gatk LocalAssembler -L chr21:16187360-16187360 --ip 500 -R 38.fa.gz " + + "-I 
NA19240.cram -I NA19240.distantmate.bam " + + "--assembly-name chr21_16187360_16187360_INS --gfa-file test.gfa --fasta-file test.fa.gz", + programGroup = CoverageAnalysisProgramGroup.class +) +public class LocalAssembler extends PairWalker { + @Argument(fullName="assembly-name", doc="Name of assembly used as a prefix for traversal names.") + public String assemblyName; + + @Argument(fullName="gfa-file", doc="Path to assembly output in gfa format.", optional=true) + public GATKPath gfaFile; + + @Argument(fullName="fasta-file", doc="Path to scaffolds in fasta format.", optional=true) + public GATKPath fastaFile; + + public static final byte QMIN_DEFAULT = 25; + @Argument(fullName="q-min", doc="Minimum base quality when kmerizing reads.", optional=true) + private byte qMin = QMIN_DEFAULT; + + public static final int MIN_THIN_OBS_DEFAULT = 4; + @Argument(fullName="min-thin-observations", + doc="Minimum number of observations of some kmer within the contig required to " + + "retain the contig.", optional=true) + private int minThinObs = MIN_THIN_OBS_DEFAULT; + + public static final int MIN_GAPFILL_COUNT_DEFAULT = 3; + @Argument(fullName="min-gapfill-count", + doc="Minimum number of observations of a sequence that patches a gap.", optional=true) + private int minGapfillCount = MIN_GAPFILL_COUNT_DEFAULT; + + public static final int TOO_MANY_TRAVERSALS_DEFAULT = 100000; + @Argument(fullName="too-many-traversals", + doc="If the assembly graph produces this many traversals, just emit contigs instead.", + optional=true) + private int tooManyTraversals = TOO_MANY_TRAVERSALS_DEFAULT; + + public static final int TOO_MANY_SCAFFOLDS_DEFAULT = 50000; + @Argument(fullName="too-many-scaffolds", + doc="If the assembly graph produces this many scaffolds, just emit traversals instead.", + optional=true) + private int tooManyScaffolds = TOO_MANY_SCAFFOLDS_DEFAULT; + + public static final int MIN_SV_SIZE_DEFAULT = 50; + @Argument(fullName="min-sv-size", + doc="Smallest variation size to 
count as a structural variant.", optional=true) + public int minSVSize = MIN_SV_SIZE_DEFAULT; + + @Argument(fullName="no-scaffolding", doc="turn off scaffolding -- write traversals instead", optional=true) + private boolean noScaffolding = false; + + private final List reads = new ArrayList<>(); + + @Override public boolean requiresIntervals() { return true; } + + @Override public void apply( final GATKRead read, final GATKRead mate ) { + trimOverruns(read, mate); + reads.add(read); + reads.add(mate); + } + + @Override public void applyUnpaired( final GATKRead read ) { + reads.add(read); + } + + @Override public Object onTraversalSuccess() { + super.onTraversalSuccess(); // flush any incomplete pairs + + if ( gfaFile == null ) { + gfaFile = new GATKPath(assemblyName + ".gfa.gz"); + } + if ( fastaFile == null ) { + fastaFile = new GATKPath(assemblyName + ".fa.gz"); + } + + final int regionSize = getTraversalIntervals().stream().mapToInt(SimpleInterval::size).sum(); + final KmerSet kmerAdjacencySet = new KmerSet<>(10 * regionSize); + kmerizeReads(reads, qMin, kmerAdjacencySet); + + List contigs = createAssembly(kmerAdjacencySet, minThinObs); + if ( fillGaps(kmerAdjacencySet, minGapfillCount, reads) ) { + contigs = createAssembly(kmerAdjacencySet, minThinObs); + } + + markCycles(contigs); + + final List readPaths = pathReads(kmerAdjacencySet, reads); + final Map> contigTransitsMap = + collectTransitPairCounts(contigs, readPaths); + try { + final List allTraversals = new ArrayList<>( + traverseAllPaths(contigs, readPaths, tooManyTraversals, contigTransitsMap)); + contigs.sort(Comparator.comparingInt(ContigImpl::getId)); + writeGFA(gfaFile, contigs, allTraversals); + if ( noScaffolding ) { + writeTraversals(fastaFile, assemblyName, allTraversals); + return null; + } + try { + writeTraversals(fastaFile, assemblyName, + createScaffolds(allTraversals, tooManyScaffolds, minSVSize)); + } catch ( final AssemblyTooComplexException x ) { + logger.warn("Assembly too complex for 
scaffolding. Writing traversals to fasta-file"); + writeTraversals(fastaFile, assemblyName, allTraversals); + } + } catch ( final AssemblyTooComplexException x ) { + logger.warn("Assembly too complex to traverse. Writing contigs as traversals to fasta-file"); + final Collection contigTraversals = new ArrayList<>(contigs.size()); + for ( final Contig contig : contigs ) { + contigTraversals.add(new Traversal(Collections.singletonList(contig))); + } + writeTraversals(fastaFile, assemblyName, contigTraversals); + } + return null; + } + + private static List createAssembly( final KmerSet kmerAdjacencySet, + final int minThinObs ) { + final List contigs = buildContigs(kmerAdjacencySet); + connectContigs(contigs); + removeThinContigs(contigs, minThinObs, kmerAdjacencySet); + weldPipes(contigs); + return contigs; + } + + /** trim read pairs of base calls that have gone past the end of a short fragment */ + @VisibleForTesting + static void trimOverruns( final GATKRead read, final GATKRead mate ) { + // if both mapped and they're on different strands + if ( !read.isUnmapped() && !mate.isUnmapped() && + read.isReverseStrand() != mate.isReverseStrand() ) { + // and both start within 1 base on the ref + if ( Math.abs(read.getStart() - read.getMateStart()) <= 1 ) { + // and both end within 1 base + final int readRefLen = read.getCigar().getReferenceLength(); + final int mateRefLen = mate.getCigar().getReferenceLength(); + if ( Math.abs(readRefLen - mateRefLen) <= 1 ) { + if ( mate.isReverseStrand() ) { + trimClips(read, mate); + } else { + trimClips(mate, read); + } + } + } + } + } + + private static void trimClips( final GATKRead fwd, final GATKRead rev ) { + final List fwdElements = fwd.getCigarElements(); + final List revElements = rev.getCigarElements(); + final int lastFwdElementIdx = fwdElements.size() - 1; + final int lastRevElementIdx = revElements.size() - 1; + final CigarElement fwdLastElement = fwdElements.get(lastFwdElementIdx); + final CigarElement revLastElement = 
revElements.get(lastRevElementIdx); + final CigarElement fwdFirstElement = fwdElements.get(0); + final CigarElement revFirstElement = revElements.get(0); + if ( fwdFirstElement.getOperator() == CigarOperator.M && + fwdLastElement.getOperator() == CigarOperator.S && + revFirstElement.getOperator() == CigarOperator.S && + revLastElement.getOperator() == CigarOperator.M ) { + final byte[] fwdBases = fwd.getBasesNoCopy(); + final int lastElementLen = fwdLastElement.getLength(); + fwd.setBases(Arrays.copyOfRange(fwdBases, 0, fwdBases.length - lastElementLen)); + final byte[] fwdQuals = fwd.getBaseQualitiesNoCopy(); + if ( fwdQuals.length > 0 ) { + final int qualsLen = fwdQuals.length - lastElementLen; + fwd.setBaseQualities(Arrays.copyOfRange(fwdQuals, 0, qualsLen)); + } + final List newFwdElements = new ArrayList<>(fwdElements); + newFwdElements.set(lastFwdElementIdx, new CigarElement(lastElementLen, CigarOperator.H)); + fwd.setCigar(new Cigar(newFwdElements)); + + final byte[] revBases = rev.getBasesNoCopy(); + final int firstElementLen = revFirstElement.getLength(); + rev.setBases(Arrays.copyOfRange(revBases, firstElementLen, revBases.length)); + final byte[] revQuals = rev.getBaseQualitiesNoCopy(); + if ( revQuals.length > 0 ) { + rev.setBaseQualities(Arrays.copyOfRange(revQuals, firstElementLen, revQuals.length)); + } + final List newRevElements = new ArrayList<>(revElements); + newRevElements.set(0, new CigarElement(firstElementLen, CigarOperator.H)); + rev.setCigar(new Cigar(newRevElements)); + } + } + + @VisibleForTesting + static void kmerizeReads( final List reads, + final byte qMin, + final KmerSet kmerAdjacencySet ) { + for ( final GATKRead read : reads ) { + final byte[] calls = read.getBasesNoCopy(); + final byte[] quals = read.getBaseQualitiesNoCopy(); + KmerAdjacency.kmerize(calls, quals, qMin, kmerAdjacencySet); + } + } + + /** gather unbranched strings of kmers into contigs */ + @VisibleForTesting + static List buildContigs( final KmerSet 
kmerAdjacencySet ) { + // This method identifies each KmerAdjacency that is a contig start or end, and then builds + // a contig from that start or end. That's actually a lie: it builds contigs at the starts, + // and builds contigs from the reverse complement of the ends (because that's the start of a + // contig on the other strand) to economize on code. + // A KmerAdjacency is a contig start if: + // 1) it has more than 1 predecessor, or no predecessors + // 2) it has a single predecessor, but that predecessor has multiple successors + // 3) its predecessor is its own reverse complement (i.e., a palindromic hairpin) + // Similarly, a KmerAdjacency is a contig end if: + // 1) it has more than 1 successor, or no successors + // 2) it has a single successor, but that successor has multiple predecessors + // 3) its successor is its own reverse complement + final List contigs = new ArrayList<>(); + int nContigs = 0; + for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) { + if ( kmerAdjacency.getContig() == null ) { + ContigImpl contig = null; + final KmerAdjacency predecessor = kmerAdjacency.getSolePredecessor(); + // if we should start a contig with this kmer + if ( predecessor == null || // yes, no predecessors or more than 1 + predecessor.getSuccessorCount() > 1 || // yes, predecessor has multiple successors + predecessor == kmerAdjacency.rc() ) { // yes, predecessor folds back as a palindrome + contig = new ContigImpl(++nContigs, kmerAdjacency); + } else { + // if we should end a contig with this kmer (actually we'll start a contig with the RC) + final KmerAdjacency successor = kmerAdjacency.getSoleSuccessor(); + if ( successor == null || // yes, no successors or more than 1 + successor.getPredecessorCount() > 1 || // yes, successor has multiple predecessors + successor == kmerAdjacency.rc() ) { // yes, successor folds back as a palindrome + contig = new ContigImpl(++nContigs, kmerAdjacency.rc()); + } + } + if ( contig != null ) { + 
setKmerContig(contig); + contigs.add(contig); + } + } + + // if there are smooth circles like a plasmid, gather them together as a contig, too + for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) { + if ( kmerAdjacency.getContig() == null ) { + final ContigImpl contig = new ContigImpl(++nContigs, kmerAdjacency); + setKmerContig(contig); + contigs.add(contig); + } + } + + return contigs; + } + + /** connect contigs when the final kmer of one contig is adjacent to the initial kmer of another */ + @VisibleForTesting + static void connectContigs( final List contigs ) { + final int nContigs = contigs.size(); + final KmerSet contigEnds = new KmerSet<>(2*nContigs); + for ( int contigId = 0; contigId != nContigs; ++contigId ) { + final ContigImpl contig = contigs.get(contigId); + final KmerAdjacency fwdKmer = contig.getFirstKmer(); + final KmerAdjacency revKmer = contig.getLastKmer().rc(); + if ( fwdKmer == revKmer ) { + contigEnds.add(new ContigEndKmer(fwdKmer.getKVal(), contig, ContigOrientation.BOTH)); + } else { + contigEnds.add(new ContigEndKmer(fwdKmer.getKVal(), contig, ContigOrientation.FWD)); + contigEnds.add(new ContigEndKmer(revKmer.getKVal(), contig, ContigOrientation.REV)); + } + } + + for ( int contigId = 0; contigId != nContigs; ++contigId ) { + final Contig contig = contigs.get(contigId); + + final KmerAdjacency start = contig.getFirstKmer(); + final int predecessorCount = start.getPredecessorCount(); + if ( predecessorCount > 0 ) { + final List predecessors = contig.getPredecessors(); + final int mask = start.getPredecessorMask(); + for ( int call = 0; call != 4; ++call ) { + if ( (mask & (1 << call)) != 0 ) { + final long kVal = + KmerAdjacency.reverseComplement(start.getPredecessorVal(call)); + final ContigEndKmer contigEndKmer = contigEnds.find(new Kmer(kVal)); + if ( contigEndKmer == null ) { + throw new GATKException("missing contig end kmer"); + } + switch ( contigEndKmer.getContigOrientation() ) { + case FWD: + 
predecessors.add(contigEndKmer.getContig().rc()); + break; + case REV: + predecessors.add(contigEndKmer.getContig()); + break; + case BOTH: + predecessors.add(contigEndKmer.getContig()); + predecessors.add(contigEndKmer.getContig().rc()); + break; + } + } + } + } + + final KmerAdjacency end = contig.getLastKmer(); + final int successorCount = end.getSuccessorCount(); + if ( successorCount > 0 ) { + final List successors = contig.getSuccessors(); + final int mask = end.getSuccessorMask(); + for ( int call = 0; call != 4; ++call ) { + if ( (mask & (1 << call)) != 0 ) { + final long kVal = end.getSuccessorVal(call); + final ContigEndKmer contigEndKmer = contigEnds.find(new Kmer(kVal)); + if ( contigEndKmer == null ) { + throw new GATKException("missing contig end kmer"); + } + switch ( contigEndKmer.getContigOrientation() ) { + case FWD: + successors.add(contigEndKmer.getContig()); + break; + case REV: + successors.add(contigEndKmer.getContig().rc()); + break; + case BOTH: + successors.add(contigEndKmer.getContig()); + successors.add(contigEndKmer.getContig().rc()); + break; + } + } + } + } + } + } + + /** remove contigs that have little evidence */ + @VisibleForTesting + static void removeThinContigs( final List contigs, + final int minThinObs, + final KmerSet kmerAdjacencySet ) { + contigs.sort(Comparator.comparingInt(ContigImpl::getMaxObservations)); + boolean contigRemoved; + final WalkDataFactory walkDataFactory = new WalkDataFactory(); + do { + // figure out which contigs are cut points + // i.e., those contigs which, if removed, would result in a graph with more connected components + final int nContigs = contigs.size(); + final Map cutDataMap = new HashMap<>(nContigs * 3); + + for ( final ContigImpl contig : contigs ) { + if ( cutDataMap.containsKey(contig) ) { + continue; + } + + cutDataMap.put(contig, walkDataFactory.create()); + int children = 0; + for ( final Contig nextContig : contig.getSuccessors() ) { + if ( !cutDataMap.containsKey(nextContig) ) { + 
findCuts(nextContig, contig, walkDataFactory, cutDataMap); + children += 1; + } + } + for ( final Contig nextContig : contig.getPredecessors() ) { + if ( !cutDataMap.containsKey(nextContig) ) { + findCuts(nextContig, contig, walkDataFactory, cutDataMap); + children += 1; + } + } + if ( children >= 2 ) { + contig.setMarked(true); + } + } + + // remove poorly attested (low max observations) contigs, unless they are cut points + contigRemoved = false; + final Iterator itr = contigs.iterator(); + while ( itr.hasNext() ) { + final Contig contig = itr.next(); + // TODO: Think about replacing the heuristic "minThinObs" with something that + // takes the observation depth of adjacent contigs into account. + if ( contig.getMaxObservations() < minThinObs && !contig.isMarked() ) { + unlinkContig(contig, kmerAdjacencySet); + itr.remove(); + contigRemoved = true; + break; + } + } + } while ( contigRemoved ); + contigs.sort(Comparator.comparingInt(ContigImpl::getId)); + } + + private static WalkData findCuts( final Contig contig, + final Contig parent, + final WalkDataFactory walkDataFactory, + final Map cutDataMap ) { + final WalkData walkData = walkDataFactory.create(); + cutDataMap.put(contig, walkData); + for ( final Contig nextContig : contig.getSuccessors() ) { + if ( nextContig == parent ) { + continue; + } + WalkData nextWalkData = cutDataMap.get(nextContig); + if ( nextWalkData != null ) { + walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.visitNum); + } else { + nextWalkData = findCuts(nextContig, contig, walkDataFactory, cutDataMap); + walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.minVisitNum); + if ( nextWalkData.minVisitNum >= walkData.visitNum ) { + contig.setMarked(true); + } + } + } + for ( final Contig nextContig : contig.getPredecessors() ) { + if ( nextContig == parent ) { + continue; + } + WalkData nextWalkData = cutDataMap.get(nextContig); + if ( nextWalkData != null ) { + walkData.minVisitNum = 
Math.min(walkData.minVisitNum, nextWalkData.visitNum); + } else { + nextWalkData = findCuts(nextContig, contig, walkDataFactory, cutDataMap); + walkData.minVisitNum = Math.min(walkData.minVisitNum, nextWalkData.minVisitNum); + if ( nextWalkData.minVisitNum >= walkData.visitNum ) { + contig.setMarked(true); + } + } + } + return walkData; + } + + @VisibleForTesting + static void unlinkContig( final Contig contig, final KmerSet kmerAdjacencySet ) { + final KmerAdjacency firstKmer = contig.getFirstKmer(); + final int firstKmerFinalCall = firstKmer.getFinalCall(); + for ( final Contig predecessor : contig.getPredecessors() ) { + if ( predecessor != contig && predecessor != contig.rc() ) { + predecessor.getLastKmer().removeSuccessor(firstKmerFinalCall, kmerAdjacencySet); + if ( !predecessor.getSuccessors().remove(contig) ) { + throw new GATKException("failed to find predecessor link"); + } + } + } + + final KmerAdjacency lastKmer = contig.getLastKmer(); + final int lastKmerInitialCall = lastKmer.getInitialCall(); + for ( final Contig successor : contig.getSuccessors() ) { + if ( successor != contig && successor != contig.rc() ) { + successor.getFirstKmer().removePredecessor(lastKmerInitialCall, kmerAdjacencySet); + if ( !successor.getPredecessors().remove(contig) ) { + throw new GATKException("failed to find successor link"); + } + } + } + + KmerAdjacency nextKmer = firstKmer; + KmerAdjacency kmer; + do { + kmer = nextKmer; + nextKmer = kmer.getSoleSuccessor(); + kmerAdjacencySet.remove(kmer.canonical()); + } while ( kmer != lastKmer ); + } + + private static void clearKmerContig( final Contig contig ) { + int count = 0; + final KmerAdjacency firstKmer = contig.getFirstKmer(); + final KmerAdjacency lastKmer = contig.getLastKmer(); + for ( KmerAdjacency kmer = firstKmer; kmer != lastKmer; kmer = kmer.getSoleSuccessor() ) { + if ( kmer == null ) { + throw new GATKException("contig does not have a flat pipeline of kmers"); + } + if ( kmer.getContig() == null ) { + throw new 
GATKException("we've returned to a kmer we've already cleared"); + } + kmer.clearContig(); + count += 1; + } + lastKmer.clearContig(); + if ( count + Kmer.KSIZE != contig.size() ) { + throw new GATKException("kmer chain length does not equal contig size"); + } + } + + /** Sets a pointer back to its unique containing contig for each kmer comprised by the contig */ + private static void setKmerContig( final Contig contig ) { + int offset = 0; + final KmerAdjacency firstKmer = contig.getFirstKmer(); + final KmerAdjacency lastKmer = contig.getLastKmer(); + for ( KmerAdjacency kmer = firstKmer; kmer != lastKmer; kmer = kmer.getSoleSuccessor() ) { + if ( kmer == null ) { + throw new GATKException("contig does not have a flat pipeline of kmers"); + } + if ( kmer.getContig() != null ) { + throw new GATKException("we've returned to a kmer we've already updated"); + } + kmer.setContigOffset(contig, offset++); + } + lastKmer.setContigOffset(contig, offset); + if ( offset + Kmer.KSIZE != contig.size() ) { + throw new GATKException("kmer chain length does not equal contig size"); + } + } + + /** replace adjacent contigs without branches with a single, larger contig */ + @VisibleForTesting + static void weldPipes( final List contigs ) { + for ( int contigIdx = 0; contigIdx != contigs.size(); ++contigIdx ) { + final ContigImpl contig = contigs.get(contigIdx); + if ( contig.getSuccessors().size() == 1 ) { + final Contig successor = contig.getSuccessors().get(0); + if ( successor != contig && successor != contig.rc() && + successor.getPredecessors().size() == 1 ) { + contigs.set(contigIdx, join(contig.getId(), contig, successor)); + if ( !contigs.remove(successor.canonical()) ) { + throw new GATKException("successor linkage is messed up"); + } + contigIdx -= 1; // reconsider the new contig -- there might be more joining possible + continue; + } + } + if ( contig.getPredecessors().size() == 1 ) { + final Contig predecessor = contig.getPredecessors().get(0); + if ( predecessor != 
contig && predecessor != contig.rc() && + predecessor.getSuccessors().size() == 1 ) { + contigs.set(contigIdx, join(contig.getId(), predecessor, contig)); + if ( !contigs.remove(predecessor.canonical()) ) { + throw new GATKException("predecessor linkage is messed up"); + } + contigIdx -= 1; // reconsider + } + } + } + } + + private static ContigImpl join( final int id, + final Contig predecessor, + final Contig successor ) { + final ContigImpl joinedContig = new ContigImpl(id, predecessor, successor); + clearKmerContig(joinedContig); + setKmerContig(joinedContig); + return joinedContig; + } + + @VisibleForTesting + static void markCycles( final List contigs ) { + for ( final Contig contig : contigs ) { + contig.setIsCycleMember(false); + } + + final int nContigs = contigs.size(); + final Deque deque = new ArrayDeque<>(nContigs); + final Map cutDataMap = new HashMap<>(nContigs * 3); + final WalkDataFactory walkDataFactory = new WalkDataFactory(); + for ( final Contig contig : contigs ) { + if ( !cutDataMap.containsKey(contig) ) { + markCyclesRecursion(contig, deque, walkDataFactory, cutDataMap); + } + } + } + + private static WalkData markCyclesRecursion( final Contig contig, + final Deque deque, + final WalkDataFactory walkDataFactory, + final Map cutDataMap ) { + final WalkData walkData = walkDataFactory.create(); + cutDataMap.put(contig, walkData); + deque.addFirst(contig); + + for ( final Contig successor : contig.getSuccessors() ) { + final WalkData successorWalkData = cutDataMap.get(successor); + if ( successorWalkData == null ) { + final int recursionVisitNum = + markCyclesRecursion(successor, deque, walkDataFactory, cutDataMap).minVisitNum; + walkData.minVisitNum = Math.min(walkData.minVisitNum, recursionVisitNum); + } else { + walkData.minVisitNum = Math.min(walkData.minVisitNum, successorWalkData.visitNum); + } + } + + if ( walkData.visitNum == walkData.minVisitNum ) { + Contig tig = deque.removeFirst(); + if ( tig == contig ) { + 
cutDataMap.get(tig).visitNum = Integer.MAX_VALUE; + + // single-vertex component -- cyclic only if self-referential + if ( tig.getSuccessors().contains(tig) ) { + tig.setIsCycleMember(true); + } + } else { + while ( true ) { + // kill cross-links + cutDataMap.get(tig).visitNum = Integer.MAX_VALUE; + tig.setIsCycleMember(true); + if ( tig == contig ) break; + tig = deque.removeFirst(); + } + } + } + return walkData; + } + + @VisibleForTesting + static boolean fillGaps( final KmerSet kmerAdjacencySet, + final int minGapfillCount, + final List reads ) { + final Map gapFillCounts = new HashMap<>(); + final PathBuilder pathBuilder = new PathBuilder(kmerAdjacencySet); + for ( final GATKRead read : reads ) { + final Path path = new Path(read.getBasesNoCopy(), pathBuilder); + final List parts = path.getParts(); + final int lastIdx = parts.size() - 1; + for ( int idx = 1; idx < lastIdx; ++idx ) { + final PathPart pathPart = parts.get(idx); + if ( pathPart.isGap() ) { + final char prevCall = parts.get(idx - 1).getLastCall(); + final char nextCall = parts.get(idx + 1).getFirstCall(); + String gapFill = prevCall + pathPart.getSequence().toString() + nextCall; + final SequenceRC gapFillRC = new SequenceRC(gapFill); + if ( gapFillRC.compareTo(gapFill) < 0 ) { + gapFill = gapFillRC.toString(); + } + gapFillCounts.merge(gapFill, 1, Integer::sum); + } + } + } + + boolean newKmers = false; + for ( final Map.Entry entry : gapFillCounts.entrySet() ) { + final int nObservations = entry.getValue(); + if ( nObservations >= minGapfillCount ) { + KmerAdjacency.kmerize(entry.getKey(), nObservations, kmerAdjacencySet); + newKmers = true; + } + } + + if ( newKmers ) { + for ( final KmerAdjacency kmerAdjacency : kmerAdjacencySet ) { + kmerAdjacency.clearContig(); + } + } + return newKmers; + } + + @VisibleForTesting + static List pathReads( final KmerSet kmerAdjacencySet, + final List reads ) { + final List readPaths = new ArrayList<>(reads.size()); + final PathBuilder pathBuilder = new 
PathBuilder(kmerAdjacencySet); + for ( final GATKRead read : reads ) { + readPaths.add(new Path(read.getBasesNoCopy(), pathBuilder)); + } + return readPaths; + } + + @VisibleForTesting + static Map> collectTransitPairCounts( + final List contigs, + final List readPaths ) { + final Map> contigTransitsMap = + new HashMap<>(3 * contigs.size()); + for ( final Path path : readPaths ) { + final List parts = path.getParts(); + final int lastPart = parts.size() - 1; + for ( int partIdx = 1; partIdx < lastPart; ++partIdx ) { + final Contig prevContig = parts.get(partIdx - 1).getContig(); + if ( prevContig == null ) continue; + final Contig curContig = parts.get(partIdx).getContig(); + if ( curContig == null ) { + partIdx += 1; + continue; + } + final Contig nextContig = parts.get(partIdx + 1).getContig(); + if ( nextContig == null ) { + partIdx += 2; + continue; + } + final TransitPairCount tpc = new TransitPairCount(prevContig, nextContig); + final List tpcList = + contigTransitsMap.computeIfAbsent(curContig, tig -> new ArrayList<>(4)); + final int idx = tpcList.indexOf(tpc); + if ( idx != -1 ) { + tpcList.get(idx).observe(); + } else { + tpcList.add(tpc); + contigTransitsMap.computeIfAbsent(curContig.rc(), tig -> new ArrayList<>(4)) + .add(tpc.getRC()); + } + } + } + return contigTransitsMap; + } + + @VisibleForTesting + static Set traverseAllPaths( + final List contigs, + final List readPaths, + final int tooManyTraversals, + final Map> contigTransitsMap ) { + final TraversalSet traversalSet = new TraversalSet(tooManyTraversals); + final List contigsList = new ArrayList<>(); + // build traversals from untransited contigs + // untransited contigs are sources, sinks, or large contigs that can't be crossed by a read + for ( final Contig contig : contigs ) { + if ( !contigTransitsMap.containsKey(contig) ) { + if ( contig.getSuccessors().isEmpty() && contig.getPredecessors().isEmpty() ) { + traversalSet.add(new Traversal(Collections.singletonList(contig))); + } else { + for ( 
final Contig successor : contig.getSuccessors() ) { + traverse(successor, contig, contigsList, + readPaths, contigTransitsMap, traversalSet); + } + for ( final Contig predecessor : contig.getPredecessors() ) { + traverse(predecessor.rc(), contig.rc(), contigsList, + readPaths, contigTransitsMap, traversalSet); + } + } + } + } + + // build traversals across acyclic contigs with transits that haven't been used yet + for ( final Map.Entry> entry : + contigTransitsMap.entrySet() ) { + final Contig contig = entry.getKey(); + if ( contig.isCycleMember() ) { + continue; + } + for ( final TransitPairCount tpc : entry.getValue() ) { + if ( tpc.getCount() > 0 ) { + tpc.resetCount(); + final TraversalSet fwdTraversalSet = new TraversalSet(tooManyTraversals); + traverse(tpc.getNextContig(), contig, contigsList, + readPaths, contigTransitsMap, fwdTraversalSet); + final TraversalSet revTraversalSet = new TraversalSet(tooManyTraversals); + traverse(tpc.getPrevContig().rc(), contig.rc(), contigsList, + readPaths, contigTransitsMap, revTraversalSet); + for ( final Traversal revTraversal : revTraversalSet ) { + final Traversal revTraversalRC = revTraversal.rc(); + for ( final Traversal fwdTraversal : fwdTraversalSet ) { + traversalSet.add(Traversal.combine(revTraversalRC, fwdTraversal)); + } + } + } + } + } + + // build traversals from any remaining unexplored transits + for ( final Map.Entry> entry : + contigTransitsMap.entrySet() ) { + final Contig contig = entry.getKey(); + for ( final TransitPairCount tpc : entry.getValue() ) { + if ( tpc.getCount() > 0 ) { + tpc.resetCount(); + final int lastElement = contigsList.size(); + contigsList.add(tpc.getPrevContig()); + traverse(tpc.getNextContig(), contig, contigsList, + readPaths, contigTransitsMap, traversalSet); + contigsList.set(lastElement, tpc.getNextContig().rc()); + traverse(tpc.getPrevContig().rc(), contig.rc(), contigsList, + readPaths, contigTransitsMap, traversalSet); + contigsList.remove(lastElement); + } + } + } + + 
return traversalSet; + } + + private static void traverse( final Contig contig, + final Contig predecessor, + final List contigsList, + final List readPaths, + final Map> contigTransitsMap, + final TraversalSet traversalSet ) { + contigsList.add(predecessor); // extend the contigsList + if ( contig.isCycleMember() ) { + traverseCycle(contig, contigsList, readPaths, contigTransitsMap, traversalSet); + // about to return, remove element added 3 lines above + contigsList.remove(contigsList.size() - 1); + return; + } + final List transits = contigTransitsMap.get(contig); + boolean foundTransitInMap = false; + if ( transits != null ) { + for ( final TransitPairCount tpc : transits ) { + if ( tpc.getPrevContig() == predecessor ) { + final Contig successor = tpc.getNextContig(); + if ( predecessor == contig.rc() ) { // if palindromic + final int nContigs = contigsList.size(); + if ( nContigs > 1 ) { + if ( successor.rc() == contigsList.get(nContigs - 2) ) { + continue; + } + } + } + tpc.resetCount(); + traverse(successor, contig, contigsList, + readPaths, contigTransitsMap, traversalSet); + foundTransitInMap = true; + } + } + } + if ( !foundTransitInMap ) { + contigsList.add(contig); + traversalSet.add(new Traversal(contigsList)); + contigsList.remove(contigsList.size() - 1); + } + + // undo what we did on the 1st line of this method + contigsList.remove(contigsList.size() - 1); + } + + private static void traverseCycle( final Contig contig, + final List contigsList, + final List readPaths, + final Map> contigTransitsMap, + final TraversalSet traversalSet ) { + contigsList.add(contig); + final int nContigs = contigsList.size(); + + // We're here because the final element of the list is cyclic. + // The previous element, if it exists, is a good place from which to start figuring out how + // far the read paths lead us. + if ( !contig.isCycleMember() ) { + throw new GATKException("not called from a cyclic contig"); + } + final List contigsSubList = + nContigs <= 2 ? 
contigsList : contigsList.subList(nContigs - 2, nContigs); + final List> longestPaths = findLongestPaths(contigsSubList, readPaths); + // didn't get anywhere -- just complete the traversal + if ( longestPaths.isEmpty() ) { + traversalSet.add(new Traversal(contigsList)); + } else { + // for each unique extension into the cycle + for ( final List path : longestPaths ) { + // don't think this can happen, but still + if ( path.isEmpty() ) { + traversalSet.add(new Traversal(contigsList)); + continue; + } + final List extendedContigsList = + new ArrayList<>(contigsList.size() + path.size()); + extendedContigsList.addAll(contigsList); + // if we didn't get out of the cycle + if ( path.get(path.size() - 1).isCycleMember() ) { + extendedContigsList.addAll(path); + traversalSet.add(new Traversal(extendedContigsList)); + } else { + // we found a cycle-exiting path, so extend that normally + for ( final Contig curContig : path ) { + if ( curContig.isCycleMember() ) { + extendedContigsList.add(curContig); + } else { + final Contig prevContig = + extendedContigsList.remove(extendedContigsList.size() - 1); + traverse(curContig, prevContig, extendedContigsList, readPaths, + contigTransitsMap, traversalSet); + extendedContigsList.add(prevContig); + break; + } + } + } + clearTransitPairs(contigTransitsMap, extendedContigsList); + } + } + contigsList.remove(contigsList.size() - 1); + } + + private static void clearTransitPairs( + final Map> contigTransitsMap, + final List contigsList ) { + final int lastIdx = contigsList.size() - 1; + for ( int idx = 1; idx < lastIdx; ++idx ) { + final List pairCounts = contigTransitsMap.get(contigsList.get(idx)); + if ( pairCounts != null ) { + final Contig predecessor = contigsList.get(idx - 1); + final Contig successor = contigsList.get(idx + 1); + for ( final TransitPairCount tpc : pairCounts ) { + if ( tpc.getPrevContig() == predecessor && tpc.getNextContig() == successor ) { + tpc.resetCount(); + break; + } + } + } + } + } + + private static 
List> findLongestPaths( final List toMatch, + final List readPaths ) { + final List> longestPaths = new ArrayList<>(); + for ( final Path path : readPaths ) { + testPath(path, toMatch, longestPaths); + testPath(path.rc(), toMatch, longestPaths); + } + return longestPaths; + } + + private static void testPath( final Path path, + final List toMatch, + final List> longestPaths ) { + final List pathParts = path.getParts(); + final int nPathParts = pathParts.size(); + final List pathContigs = + pathParts.stream() + .map(PathPart::getContig) + .collect(Collectors.toCollection(() -> new ArrayList<>(nPathParts))); + final int matchIdx = Collections.indexOfSubList(pathContigs, toMatch); + if ( matchIdx != -1 ) { + final int suffixIdx = matchIdx + toMatch.size(); + if ( suffixIdx < nPathParts ) { + addSubPathToLongestPaths(extractSubPath(pathContigs, suffixIdx), longestPaths); + } + } + } + + private static List extractSubPath( final List pathContigs, + final int suffixIdx ) { + final int nPathContigs = pathContigs.size(); + Contig prev = pathContigs.get(suffixIdx - 1); + final List subPath = new ArrayList<>(nPathContigs - suffixIdx); + for ( int idx = suffixIdx; idx != nPathContigs; ++idx ) { + final Contig tig = pathContigs.get(idx); + if ( tig == null || !prev.getSuccessors().contains(tig) ) break; + subPath.add(tig); + prev = tig; + } + return subPath; + } + + private static void addSubPathToLongestPaths( final List subPath, + final List> longestPaths ) { + final int nResults = longestPaths.size(); + for ( int idx = 0; idx != nResults; ++idx ) { + final List test = longestPaths.get(idx); + if ( isPrefix(subPath, test) ) return; + if ( isPrefix(test, subPath) ) { + longestPaths.set(idx, subPath); + return; + } + } + longestPaths.add(subPath); + } + + private static boolean isPrefix( final List list1, final List list2 ) { + final int list1Size = list1.size(); + final int list2Size = list2.size(); + if ( list1Size > list2Size ) return false; + for ( int idx = 0; idx != 
list1Size; ++idx ) { + if ( list1.get(idx) != list2.get(idx) ) return false; + } + return true; + } + + @VisibleForTesting + static Collection createScaffolds( final List allTraversals, + final int tooManyScaffolds, + final int minSVSize ) { + removeTriviallyDifferentTraversals(allTraversals, minSVSize); + + final int nTraversals = allTraversals.size(); + final Map> traversalsByFirstContig = new HashMap<>(3 * nTraversals); + for ( int idx = 0; idx != nTraversals; ++idx ) { + final Traversal traversal = allTraversals.get(idx); + traversalsByFirstContig.compute(traversal.getFirstContig(), + ( k, v ) -> v == null ? new ArrayList<>(3) : v).add(idx); + final Traversal rcTraversal = traversal.rc(); + traversalsByFirstContig.compute(rcTraversal.getFirstContig(), + ( k, v ) -> v == null ? new ArrayList<>(3) : v).add(~idx); + } + + final List scaffolds = new ArrayList<>(nTraversals); + final boolean[] touched = new boolean[nTraversals]; + for ( int idx = 0; idx != nTraversals; ++idx ) { + if ( !touched[idx] ) { + expandTraversal(idx, touched, traversalsByFirstContig, allTraversals, + tooManyScaffolds, scaffolds); + } + } + return scaffolds; + } + + private static void expandTraversal( final int traversalIdx, + final boolean[] touched, + final Map> traversalsByFirstContig, + final List allTraversals, + final int tooManyScaffolds, + final List scaffolds ) { + final Traversal traversal = allTraversals.get(traversalIdx); + touched[traversalIdx] = true; + final List downExtensions = new ArrayList<>(); + final Set startingContigSet = new HashSet<>(); + walkTraversals(traversal, touched, startingContigSet, traversalsByFirstContig, + allTraversals, downExtensions); + final List upExtensions = new ArrayList<>(); + walkTraversals(traversal.rc(), touched, startingContigSet, traversalsByFirstContig, + allTraversals, upExtensions); + for ( final Traversal down : downExtensions ) { + for ( final Traversal up : upExtensions ) { + if ( scaffolds.size() >= tooManyScaffolds ) { + throw new 
AssemblyTooComplexException(); + } + scaffolds.add( + Traversal.combineOverlappers(up.rc(), down, traversal.getContigs().size())); + } + } + } + + private static void walkTraversals( final Traversal traversal, + final boolean[] touched, + final Set startingContigSet, + final Map> traversalsByFirstContig, + final List allTraversals, + final List extensions ) { + final Contig firstContig = traversal.getFirstContig(); + final List indexList; + if ( startingContigSet.contains(firstContig) || + traversal.isInextensible() || + (indexList = traversalsByFirstContig.get(traversal.getLastContig())) == null ) { + extensions.add(traversal); + return; + } + startingContigSet.add(firstContig); + for ( int idx : indexList ) { + final Traversal extension; + if ( idx >= 0 ) { + extension = allTraversals.get(idx); + touched[idx] = true; + } else { + final int rcIdx = ~idx; + extension = allTraversals.get(rcIdx).rc(); + touched[rcIdx] = true; + } + walkTraversals(Traversal.combine(traversal, extension), touched, startingContigSet, + traversalsByFirstContig, allTraversals, extensions ); + } + startingContigSet.remove(firstContig); + } + + private static void removeTriviallyDifferentTraversals( + final Collection allTraversals, + final int minSVSize ) { + if ( allTraversals.isEmpty() ) { + return; + } + final TreeSet sortedTraversals = new TreeSet<>(new TraversalEndpointComparator()); + for ( final Traversal traversal : allTraversals ) { + sortedTraversals.add(traversal); + sortedTraversals.add(traversal.rc()); + } + final Iterator traversalIterator = sortedTraversals.iterator(); + Traversal prevTraversal = traversalIterator.next(); + while ( traversalIterator.hasNext() ) { + final Traversal curTraversal = traversalIterator.next(); + if ( isTriviallyDifferent(prevTraversal, curTraversal, minSVSize) ) { + traversalIterator.remove(); + } else { + prevTraversal = curTraversal; + } + } + // remove duplicates where we have both strands surviving + final Iterator traversalIterator2 = 
sortedTraversals.iterator(); + while ( traversalIterator2.hasNext() ) { + final Traversal traversal = traversalIterator2.next(); + if ( sortedTraversals.contains(traversal.rc()) ) { + traversalIterator2.remove(); + } + } + allTraversals.clear(); + allTraversals.addAll(sortedTraversals); + } + + private static boolean isTriviallyDifferent( final Traversal traversal1, + final Traversal traversal2, + final int minSVSize ) { + final Contig firstContig1 = traversal1.getFirstContig(); + final Contig lastContig1 = traversal1.getLastContig(); + final Contig firstContig2 = traversal2.getFirstContig(); + final Contig lastContig2 = traversal2.getLastContig(); + if ( firstContig1 != firstContig2 || lastContig1 != lastContig2 ) { + return false; + } + final int interiorSize1 = + traversal1.getSequenceLength() - firstContig1.size() - lastContig1.size(); + final int interiorSize2 = + traversal2.getSequenceLength() - firstContig2.size() - lastContig2.size(); + + // if the path lengths are so different that one could harbor an SV, they're not trivially different + if ( Math.abs(interiorSize1 - interiorSize2) >= minSVSize ) { + return false; + } + + // if the paths are small enough that there can't be an SV's worth of differences, they're trivially different + final int maxInteriorSize = Math.max(interiorSize1, interiorSize2); + if ( maxInteriorSize < minSVSize ) { + return true; + } + + // dang, maybe there's enough material in common that there can't be an SV's worth of differences + // run a longest common subsequence algorithm to figure out the length of the common material + // DP matrix holds length of common material + final List contigs1 = traversal1.getContigs(); + final int rowLen = contigs1.size() - 1; + final int[][] rowPair = new int[2][]; + rowPair[0] = new int[rowLen]; + rowPair[1] = new int[rowLen]; + int pairIdx = 0; + final List contigs2 = traversal2.getContigs(); + final int nRows = contigs2.size() - 1; + for ( int idx2 = 1; idx2 != nRows; ++idx2 ) { + final int[] 
curRow = rowPair[pairIdx]; + final int[] prevRow = rowPair[pairIdx ^ 1]; + pairIdx ^= 1; + + final int id2 = contigs2.get(idx2).getId(); + for ( int idx1 = 1; idx1 != rowLen; ++idx1 ) { + final Contig tig1 = contigs1.get(idx1); + if ( tig1.getId() == id2 ) { + // if the previous cells also contain a match we've already removed the K-1 bases upstream + final boolean extendMatch = + contigs1.get(idx1 -1).getId() == contigs2.get(idx2 - 1).getId(); + curRow[idx1] = prevRow[idx1 - 1] + (extendMatch ? tig1.getNKmers() : tig1.size()); + } else { + curRow[idx1] = Math.max(curRow[idx1 - 1], prevRow[idx1]); + } + } + } + final int commonLen = rowPair[pairIdx ^ 1][rowLen - 1]; + return (maxInteriorSize - commonLen) < minSVSize; + } + + public static class TraversalEndpointComparator implements Comparator { + @Override + public int compare( final Traversal traversal1, final Traversal traversal2 ) { + final List contigs1 = traversal1.getContigs(); + final List contigs2 = traversal2.getContigs(); + int cmp = Integer.compare(contigs1.get(0).getId(), contigs2.get(0).getId()); + if ( cmp != 0 ) { + return cmp; + } + final int last1 = contigs1.size() - 1; + final int last2 = contigs2.size() - 1; + cmp = Integer.compare(contigs1.get(last1).getId(), contigs2.get(last2).getId()); + if ( cmp != 0 ) { + return cmp; + } + // among those starting and ending at the same place, sort least observed last + cmp = -Integer.compare(traversal1.getMinMaxObservations(), + traversal2.getMinMaxObservations()); + if ( cmp != 0 ) { + return cmp; + } + final int end = Math.min(last1, last2); + for ( int idx = 1; idx < end; ++idx ) { + cmp = Integer.compare(contigs1.get(idx).getId(), contigs2.get(idx).getId()); + if ( cmp != 0 ) { + return cmp; + } + } + return Integer.compare(last1, last2); + } + } + + private static void writeGFA( final GATKPath gfaFile, + final Collection contigs, + final Collection traversals ) { + for ( final ContigImpl contig : contigs ) { + contig.setMarked(false); + } + + try ( 
final BufferedWriter writer = createBufferedWriter(gfaFile) ) { + writer.write("H\tVN:Z:2.0"); + writer.newLine(); + for ( final Contig contig : contigs ) { + if ( !contig.isMarked() ) { + writeContig(contig, writer); + } + } + for ( final Traversal traversal : traversals ) { + writer.write(traversal.getContigs().stream() + .map(Contig::toRef) + .collect(Collectors.joining(" ", "O\t*\t", ""))); + writer.newLine(); + } + } catch ( final IOException ioe ) { + throw new UserException("Failed to write gfa-file " + gfaFile, ioe); + } + } + + private static void writeContig( final Contig contig, + final BufferedWriter writer ) throws IOException { + final Contig canonicalContig = contig.canonical(); + canonicalContig.setMarked(true); + final CharSequence seq = canonicalContig.getSequence(); + writer.write("S\t" + canonicalContig + "\t" + seq.length() + "\t" + seq + + "\tMO:i:" + canonicalContig.getMaxObservations() + + "\tFO:i:" + canonicalContig.getFirstKmer().getNObservations() + + "\tLO:i:" + canonicalContig.getLastKmer().getNObservations()); + writer.newLine(); + + for ( final Contig successor : contig.getSuccessors() ) { + if ( !successor.isMarked() ) { + writeContig(successor, writer); + } + writeEdge(contig, successor, writer); + } + for ( final Contig predecessor : contig.getPredecessors() ) { + if ( !predecessor.isMarked() ) { + writeContig(predecessor, writer); + } + } + } + + private static void writeEdge( final Contig contig, + final Contig successor, + final BufferedWriter writer ) throws IOException { + final int contigLength = contig.getSequence().length(); + writer.write("E\t*\t" + contig.toRef() + "\t" + successor.toRef() + "\t" + + (contigLength - Kmer.KSIZE + 1) + "\t" + contigLength + "$\t0\t" + + (Kmer.KSIZE - 1) + "\t" + (Kmer.KSIZE - 1) + "M"); + writer.newLine(); + } + + private static void writeTraversals( final GATKPath fastaFile, + final String assemblyName, + final Collection traversals ) { + try ( final BufferedWriter writer = 
createBufferedWriter(fastaFile) ) { + int traversalNo = 0; + for ( final Traversal traversal : traversals ) { + writer.write(">"); + if ( assemblyName != null ) { + writer.write(assemblyName); + writer.write("_"); + } + writer.write("t"); + writer.write(Integer.toString(++traversalNo)); + writer.write(" "); + writer.write(traversal.toString()); + writer.newLine(); + writer.write(traversal.getSequence()); + writer.newLine(); + } + } catch ( final IOException ioe ) { + throw new UserException("Failed to write fasta-file " + fastaFile, ioe); + } + } + + private static BufferedWriter createBufferedWriter( final GATKPath path ) throws IOException { + final OutputStream os = path.getOutputStream(); + final String pathString = path.getRawInputString(); + if ( !pathString.endsWith(".gz") && !pathString.endsWith(".GZ") ) { + return new BufferedWriter(new OutputStreamWriter(os)); + } + return new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(os))); + } + + /** fixed-size, immutable kmer. usual 2-bit encoding: ACGT->0123. low order bits are final call. 
*/ + public static class Kmer { + public static final int KSIZE = 31; // must be odd number less than 32 + public static final long KMASK = (1L << 2*KSIZE) - 1L; + private final long kVal; + + public Kmer( final long kVal ) { this.kVal = kVal; } + + public long getKVal() { return kVal; } + public boolean isCanonical() { return isCanonical(kVal); } + public int getInitialCall() { return (int)(kVal >> (KSIZE*2 - 2)) & 3; } + public int getFinalCall() { return (int)kVal & 3; } + + public long getPredecessorVal( final int call ) { + return (kVal >> 2) | ((long)call << (2 * (KSIZE - 1))); + } + public long getSuccessorVal( final int call ) { return ((kVal << 2) & KMASK) | call; } + + public static boolean isCanonical( final long val ) { + return (val & (1L << KSIZE)) == 0L; + } + + @Override public boolean equals( final Object obj ) { + return obj instanceof Kmer && kVal == ((Kmer)obj).kVal; + } + + @Override public int hashCode() { + return (int)(kVal ^ (kVal >>> 32)); + } + } + + /** Set of Kmers. Uses HopscotchSet, customized to find correct starting bin for Kmers and derivatives. */ + public static final class KmerSet extends HopscotchSet { + public KmerSet( final int capacity ) { super(capacity); } + + @Override + protected int hashToIndex( final Object kmer ) { + final long positiveKval = + ((HopscotchSet.SPREADER * ((Kmer)kmer).getKVal()) & Long.MAX_VALUE); + return (int)(positiveKval % capacity()); + } + } + + /** + * A Kmer that remembers its predecessors and successors, and the number of times it's been observed + * in the assembly's input set of reads. + * The masks are bit-wise (1=A, 2=C, 4=G, 8=T) to show which predecessors or successors have been observed. + * The Kmer's position on a Contig is also tracked (in later phases of the assembly process). 
+ */ + public static abstract class KmerAdjacency extends Kmer { + public KmerAdjacency( final long kVal ) { super(kVal); } + + public abstract KmerAdjacency getSolePredecessor(); // returns null if there's 0 or >1 predecessors + public abstract int getPredecessorMask(); + public abstract int getPredecessorCount(); + public abstract void removePredecessor( final int callToRemove, + final KmerSet kmerAdjacencySet ); + + public abstract KmerAdjacency getSoleSuccessor(); // returns null if there's 0 or > 1 successors + public abstract int getSuccessorMask(); + public abstract int getSuccessorCount(); + public abstract void removeSuccessor( final int callToRemove, + final KmerSet kmerAdjacencySet ); + + public abstract Contig getContig(); + public abstract int getContigOffset(); + // offset is 0-based measure on the contig sequence of the beginning of the kmer + public abstract void setContigOffset( final Contig contig, final int contigOffset ); + public abstract void clearContig(); + + public abstract int getNObservations(); + public abstract KmerAdjacency rc(); + public abstract KmerAdjacencyImpl canonical(); + + public void observe( final KmerAdjacency predecessor, final KmerAdjacency successor ) { + observe(predecessor, successor, 1); + } + + public abstract void observe( final KmerAdjacency predecessor, + final KmerAdjacency successor, + final int count ); + + @Override public String toString() { + final StringBuilder sb = new StringBuilder(KSIZE); + long currentVal = getKVal(); + for ( int idx = 0; idx != KSIZE; ++idx ) { + sb.append("ACGT".charAt((int)currentVal & 3)); + currentVal >>= 2; + } + sb.reverse(); // low order bits were loaded into sb first: fix that now by reversing the sb. + return sb.toString(); + } + + /** + * Transform a read's calls into KmerAdjacencies, and add them to a KmerSet. + * Skip kmers that include a call with a quality < qMin. + * Skip kmers with non-ACGT calls. 
+ */ + public static void kmerize( final byte[] calls, + final byte[] quals, + final byte qMin, + final KmerSet kmerAdjacencySet ) { + int currentCount = 0; // number of calls loaded into currentKVal + long currentKVal = 0; + KmerAdjacency prevAdjacency = null; + KmerAdjacency currentAdjacency = null; + for ( int idx = 0; idx < calls.length; ++idx ) { + if ( quals[idx] < qMin ) { // if we encounter a low-quality call + // take care of the most recent valid KmerAdjacency, if any + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, null); + } + // ready ourselves to accumulate calls afresh + currentCount = 0; + currentAdjacency = prevAdjacency = null; + continue; + } + currentKVal <<= 2; + switch ( calls[idx] ) { + case 'A': case 'a': break; + case 'C': case 'c': currentKVal += 1; break; + case 'G': case 'g': currentKVal += 2; break; + case 'T': case 't': currentKVal += 3; break; + default: + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, null); + } + currentCount = 0; + currentAdjacency = prevAdjacency = null; + continue; + } + if ( ++currentCount >= KSIZE ) { // if we've loaded enough calls to make a complete kmer + final KmerAdjacency nextAdjacency = findOrAdd(currentKVal, kmerAdjacencySet); + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, nextAdjacency); + } + prevAdjacency = currentAdjacency; + currentAdjacency = nextAdjacency; + } + } + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, null); + } + } + + /** + * Kmerize a String. This version is for gap fills. + * The number of observations applies to all kmers except the 1st and last. 
+ */ + public static void kmerize( final String sequence, + final int nObservations, + final KmerSet kmerAdjacencySet ) { + int currentCount = 0; + long currentKVal = 0; + int nObs = 0; + KmerAdjacency prevAdjacency = null; + KmerAdjacency currentAdjacency = null; + final int nCalls = sequence.length(); + for ( int idx = 0; idx != nCalls; ++idx ) { + currentKVal <<= 2; + switch ( sequence.charAt(idx) ) { + case 'A': case 'a': break; + case 'C': case 'c': currentKVal += 1; break; + case 'G': case 'g': currentKVal += 2; break; + case 'T': case 't': currentKVal += 3; break; + default: throw new GATKException("unexpected base call in string to kmerize."); + } + if ( ++currentCount >= KSIZE ) { + final KmerAdjacency nextAdjacency = findOrAdd(currentKVal, kmerAdjacencySet); + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, nextAdjacency, nObs); + nObs = nObservations; + } + prevAdjacency = currentAdjacency; + currentAdjacency = nextAdjacency; + } + } + if ( currentAdjacency != null ) { + currentAdjacency.observe(prevAdjacency, null, 0); + } + } + + // Lookup table for reverse-complementing each possible byte value. + // Each pair of bits represents a base, so you have to reverse bits pairwise and then invert all bits. + // This is most quickly and easily done with a lookup table. 
+ private static final long[] BYTEWISE_REVERSE_COMPLEMENT; + static { + BYTEWISE_REVERSE_COMPLEMENT = new long[256]; + for ( int bIn = 0; bIn != 256; ++bIn ) { + BYTEWISE_REVERSE_COMPLEMENT[bIn] = + ~(((bIn & 3) << 6) | (((bIn >> 2) & 3) << 4) | + (((bIn >> 4) & 3) << 2) | ((bIn >> 6) & 3)) & 0xffL; + } + } + + public static long reverseComplement( long val ) { + // process val one byte at a time + long result = BYTEWISE_REVERSE_COMPLEMENT[(int)val & 0xFF]; // handle the low-order byte + int nBytes = 8; + while ( --nBytes != 0 ) { // pre-decrementing: we'll go through the loop 7 times + // rotate down by a byte + val >>= 8; + // rotate up by a byte and OR in the reverse complement of the next byte + result = (result << 8) | BYTEWISE_REVERSE_COMPLEMENT[(int)val & 0xFF]; + } + return result >>> (Long.SIZE - 2*KSIZE); + } + + // Kmer lookup in KmerSet. + // KmerSets holding KmerAdjacencies have only canonical Kmers, so RC non-canonical kmers before lookup. + public static KmerAdjacency find( final long kVal, + final KmerSet kmerAdjacencySet ) { + if ( isCanonical(kVal) ) return kmerAdjacencySet.find(new Kmer(kVal & KMASK)); + final KmerAdjacency result = kmerAdjacencySet.find(new Kmer(reverseComplement(kVal))); + return result == null ? null : result.rc(); + } + + // Kmer lookup in KmerSet. + // KmerSets holding KmerAdjacencies have only canonical Kmers, so RC non-canonical kmers before lookup. + // Add missing Kmers. + public static KmerAdjacency findOrAdd( final long kVal, + final KmerSet kmerAdjacencySet ) { + if ( isCanonical(kVal) ) { + return kmerAdjacencySet.findOrAdd(new Kmer(kVal & KMASK), kmer -> + new KmerAdjacencyImpl(((Kmer)kmer).getKVal())); + } + return kmerAdjacencySet.findOrAdd(new Kmer(reverseComplement(kVal)), kmer -> + new KmerAdjacencyImpl(((Kmer)kmer).getKVal())).rc(); + } + } + + /** + * Class to implement KmerAdjacency for canonical Kmers. + * In particular, a KmerSet created on KmerAdjacency contains only canonical Kmers. 
+ */ + public static final class KmerAdjacencyImpl extends KmerAdjacency { + private final KmerAdjacencyRC rc; // the reverse-complement of this kmer + + // these fields won't change much after all reads have been kmerized, but they do change + // during that process. and maybe just a little during gap filling + private KmerAdjacency solePredecessor; // set to null if there are no predecessors, or multiple predecessors + private KmerAdjacency soleSuccessor; // set to null if there are no successors, or multiple successors + private int predecessorMask; // bit mask of observed kmers preceding this one + private int successorMask; // bit mask observed kmers following this one + private int nObservations; // the reads in which this kmer was observed + + // these fields refer back to the graph: they'll change as the graph structure is refined + private Contig contig; // the contig that contains this Kmer + private int contigOffset; // the offset within the contig where this kmer is found + + private static final int[] COUNT_FOR_MASK = + //side sum for binary values from 0 -> 15 + //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 + { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + + public KmerAdjacencyImpl( final long kVal ) { + super(kVal); + this.rc = new KmerAdjacencyRC(this); + } + + @Override public KmerAdjacency getSolePredecessor() { return solePredecessor; } // may return null + @Override public int getPredecessorMask() { return predecessorMask; } + @Override public int getPredecessorCount() { return COUNT_FOR_MASK[predecessorMask]; } + @Override + public void removePredecessor( final int callToRemove, + final KmerSet kmerAdjacencySet ) { + predecessorMask &= ~(1 << callToRemove); + solePredecessor = null; + if ( getPredecessorCount() == 1 ) { + for ( int call = 0; call != 4; ++call ) { + if ( ((1 << call) & predecessorMask) != 0 ) { + solePredecessor = find(getPredecessorVal(call), kmerAdjacencySet); + break; + } + } + } + } 
+ + @Override public KmerAdjacency getSoleSuccessor() { return soleSuccessor; } // may return null + @Override public int getSuccessorMask() { return successorMask; } + @Override public int getSuccessorCount() { return COUNT_FOR_MASK[successorMask]; } + @Override + public void removeSuccessor( final int callToRemove, + final KmerSet kmerAdjacencySet ) { + successorMask &= ~(1 << callToRemove); + soleSuccessor = null; + if ( getSuccessorCount() == 1 ) { + for ( int call = 0; call != 4; ++call ) { + if ( ((1 << call) & successorMask) != 0 ) { + soleSuccessor = find(getSuccessorVal(call), kmerAdjacencySet); + break; + } + } + } + } + + @Override public Contig getContig() { return contig; } + @Override public int getContigOffset() { return contigOffset; } + @Override public void setContigOffset( final Contig contig, final int contigOffset ) { + if ( this.contig != null ) { + throw new GATKException("Internal error: overwriting kmer contig and offset."); + } + this.contig = contig; + this.contigOffset = contigOffset; + } + @Override public void clearContig() { contig = null; contigOffset = 0; } + + @Override public int getNObservations() { return nObservations; } + @Override public KmerAdjacency rc() { return rc; } + @Override public KmerAdjacencyImpl canonical() { return this; } + + @Override public void observe( final KmerAdjacency predecessor, + final KmerAdjacency successor, + final int count ) { + if ( predecessor != null ) { + if ( predecessor.getSuccessorVal(getFinalCall()) != getKVal() ) { + throw new GATKException("illegal predecessor"); + } + final int initialCall = predecessor.getInitialCall(); + final int newPredecessorMask = 1 << initialCall; + if ( (newPredecessorMask & predecessorMask) == 0 ) { + if ( predecessorMask == 0 ) { + solePredecessor = predecessor; + predecessorMask = newPredecessorMask; + } else { + solePredecessor = null; + predecessorMask |= newPredecessorMask; + } + } + } + if ( successor != null ) { + if ( 
successor.getPredecessorVal(getInitialCall()) != getKVal() ) { + throw new GATKException("illegal successor"); + } + final int finalCall = successor.getFinalCall(); + final int newSuccessorMask = 1 << finalCall; + if ( (newSuccessorMask & successorMask) == 0 ) { + if ( successorMask == 0 ) { + soleSuccessor = successor; + successorMask = newSuccessorMask; + } else { + soleSuccessor = null; + successorMask |= newSuccessorMask; + } + } + } + nObservations += count; + } + } + + /** + * Class to implement KmerAdjacency for Kmers that are the reverse-complement of a canonical Kmer. + * In particular, a KmerSet created on KmerAdjacency contains only canonical Kmers. + * A KmerAdjacencyRC represents the RC of each Kmer in the KmerSet. + */ + public static final class KmerAdjacencyRC extends KmerAdjacency { + private final KmerAdjacencyImpl rc; + + // lookup table to bit-reverse nibbles + private static final int[] NIBREV = + // 0000, 0001, 0010, 0011, 0100, 0101, 0110, 0111, 1000, 1001, 1010, 1011, 1100, 1101, 1110, 1111 + {0b0000,0b1000,0b0100,0b1100,0b0010,0b1010,0b0110,0b1110,0b0001,0b1001,0b0101,0b1101,0b0011,0b1011,0b0111,0b1111}; + + public KmerAdjacencyRC( final KmerAdjacencyImpl rc ) { + super(reverseComplement(rc.getKVal())); + this.rc = rc; + } + + @Override public KmerAdjacency getSolePredecessor() { + final KmerAdjacency successor = rc.getSoleSuccessor(); + return successor == null ? null : successor.rc(); + } + @Override public int getPredecessorMask() { return NIBREV[rc.getSuccessorMask()]; } + @Override public int getPredecessorCount() { return rc.getSuccessorCount(); } + @Override + public void removePredecessor( final int callToRemove, + final KmerSet kmerAdjacencySet ) { + rc.removeSuccessor(3 - callToRemove, kmerAdjacencySet); + } + + @Override public KmerAdjacency getSoleSuccessor() { + final KmerAdjacency predecessor = rc.getSolePredecessor(); + return predecessor == null ? 
null : predecessor.rc(); + } + @Override public int getSuccessorMask() { return NIBREV[rc.getPredecessorMask()]; } + @Override public int getSuccessorCount() { return rc.getPredecessorCount(); } + @Override + public void removeSuccessor( final int callToRemove, + final KmerSet kmerAdjacencySet ) { + rc.removePredecessor(3 - callToRemove, kmerAdjacencySet); + } + + @Override public Contig getContig() { + final Contig contig = rc.getContig(); + return contig == null ? null : contig.rc(); + } + @Override public int getContigOffset() { + final Contig contig = rc.getContig(); + return contig == null ? 0 : contig.size() - rc.getContigOffset() - KSIZE; + } + @Override public void setContigOffset( final Contig contig, final int contigOffset ) { + rc.setContigOffset(contig.rc(), contig.size() - contigOffset - KSIZE); + } + @Override public void clearContig() { rc.clearContig(); } + + @Override public int getNObservations() { return rc.getNObservations(); } + @Override public KmerAdjacency rc() { return rc; } + @Override public KmerAdjacencyImpl canonical() { return rc; } + + @Override public void observe( final KmerAdjacency predecessor, + final KmerAdjacency successor, + final int count ) { + rc.observe(successor == null ? null : successor.rc(), + predecessor == null ? null : predecessor.rc(), + count); + } + } + + public enum ContigOrientation { + FWD, // k-mer appears at the 5' end of the contig + REV, // k-mer appears at the 5' end of the reverse-complemented contig + BOTH // k-mer occurs on 5' end of the contig and its RC (can happen when the contig is a palindrome) + } + + /** Initial or final Kmer in a Contig. 
*/ + public static final class ContigEndKmer extends Kmer { + private final Contig contig; + private final ContigOrientation contigOrientation; + + public ContigEndKmer( final long kVal, + final Contig contig, + final ContigOrientation contigEnd ) { + super(kVal); + this.contig = contig; + this.contigOrientation = contigEnd; + } + + public Contig getContig() { return contig; } + public ContigOrientation getContigOrientation() { return contigOrientation; } + } + + /** + * An unbranched sequence of Kmers. + * Each Kmer (except the last one) has a single successor, which allows enumerating the sequence + * of Kmers in the Contig. The sequence of base calls in the Contig is just the sequence of kmers + * with the K-1 overlapping calls removed from adjacent kmers. + */ + public interface Contig { + int getId(); + String toRef(); // a GFA-format reference + CharSequence getSequence(); + int getMaxObservations(); + KmerAdjacency getFirstKmer(); + KmerAdjacency getLastKmer(); + List getPredecessors(); + List getSuccessors(); + int size(); + default int getNKmers() { return size() - Kmer.KSIZE + 1; } + Contig rc(); + boolean isCycleMember(); + void setIsCycleMember( final boolean isCycleMember ); + boolean isMarked(); + void setMarked( final boolean marked ); + boolean isCanonical(); + ContigImpl canonical(); + + default boolean isPredecessor( final Contig contig ) { + return findContig(getPredecessors(), contig); + } + default boolean isSuccessor( final Contig contig ) { + return findContig(getSuccessors(), contig); + } + + static boolean findContig( final List contigs, final Contig contig ) { + for ( final Contig tig : contigs ) { + if ( contig == tig ) { + return true; + } + } + return false; + } + } + + /** Simple implementation of Contig interface. 
 */
public static final class ContigImpl implements Contig {
    private final int id;
    private final CharSequence sequence;        // full base sequence of the contig
    private final int maxObservations;          // max observation count over the contig's kmers
    private final KmerAdjacency firstKmer;
    private final KmerAdjacency lastKmer;
    private final List predecessors;            // NOTE(review): presumably List<Contig> — generics stripped by extraction
    private final List successors;
    private boolean cyclic;                     // set when the contig participates in a graph cycle
    private boolean marked;                     // scratch flag for graph walks
    private final Contig rc;                    // lazily-shared reverse-complement view (ContigRCImpl)

    /**
     * Builds a contig by walking forward from firstKmerAdjacency along sole successors,
     * stopping at a branch, a cycle back to the start, or a palindromic u-turn.
     */
    public ContigImpl( final int id, final KmerAdjacency firstKmerAdjacency ) {
        this.id = id;
        final StringBuilder sb = new StringBuilder(firstKmerAdjacency.toString());
        int maxObservations = firstKmerAdjacency.getNObservations();
        KmerAdjacency lastKmerAdjacency = firstKmerAdjacency;
        for ( KmerAdjacency kmerAdjacency = firstKmerAdjacency.getSoleSuccessor();
              kmerAdjacency != null;
              kmerAdjacency = kmerAdjacency.getSoleSuccessor() ) {
            // if we've gone around a circle, or if we're branching backwards, or if we hit a palindrome u-turn
            if ( firstKmerAdjacency == kmerAdjacency ||
                    kmerAdjacency.getPredecessorCount() != 1 ||
                    kmerAdjacency == lastKmerAdjacency.rc() ) {
                break;
            }
            // each additional kmer contributes exactly one new base (2-bit call decoded via "ACGT")
            sb.append("ACGT".charAt(kmerAdjacency.getFinalCall()));
            maxObservations = Math.max(maxObservations, kmerAdjacency.getNObservations());
            lastKmerAdjacency = kmerAdjacency;
        }
        this.sequence = sb.toString();
        this.maxObservations = maxObservations;
        this.firstKmer = firstKmerAdjacency;
        this.lastKmer = lastKmerAdjacency;
        this.predecessors = new ArrayList<>(firstKmer.getPredecessorCount());
        this.successors = new ArrayList<>(lastKmer.getSuccessorCount());
        this.rc = new ContigRCImpl(this);
    }

    // create a new contig by joining two contigs
    public ContigImpl( final int id, final Contig predecessor, final Contig successor ) {
        if ( predecessor == successor || predecessor == successor.rc() ) {
            throw new GATKException("can't self-join");
        }
        if ( !checkOverlap(predecessor.getSequence(), successor.getSequence()) ) {
            throw new GATKException("sequences can't be joined");
        }
        this.id = id;
        // joined sequence: predecessor's bases plus successor's bases minus the K-1 base overlap
        final StringBuilder sb = new StringBuilder(predecessor.getSequence());
        final CharSequence successorSequence = successor.getSequence();
        sb.append(successorSequence.subSequence(Kmer.KSIZE - 1, successorSequence.length()));
        this.sequence = sb.toString();
        this.maxObservations =
                Math.max(predecessor.getMaxObservations(), successor.getMaxObservations());
        this.firstKmer = predecessor.getFirstKmer();
        this.lastKmer = successor.getLastKmer();
        this.predecessors = new ArrayList<>(predecessor.getPredecessors().size());
        this.successors = new ArrayList<>(successor.getSuccessors().size());
        this.rc = new ContigRCImpl(this);

        // fix predecessor linkages to point to new contig
        for ( final Contig predPredecessor : predecessor.getPredecessors() ) {
            if ( predPredecessor == successor ) {
                // the join closed a loop: the old successor's role is now played by this contig
                predecessors.add(this);
            } else if ( predPredecessor == predecessor.rc() ) {
                predecessors.add(rc);
            } else {
                predecessors.add(predPredecessor);
                // rewrite the neighbor's successor list in place to reference the joined contig
                final List successors = predPredecessor.getSuccessors();
                successors.set(successors.indexOf(predecessor), this);
            }
        }

        // fix successor linkages to point to new contig
        for ( final Contig succSuccessor : successor.getSuccessors() ) {
            if ( succSuccessor == predecessor ) {
                successors.add(this);
            } else if ( succSuccessor == successor.rc() ) {
                successors.add(rc);
            } else {
                successors.add(succSuccessor);
                final List predecessors = succSuccessor.getPredecessors();
                predecessors.set(predecessors.indexOf(successor), this);
            }
        }
    }

    @Override
    public int getId() {
        return id;
    }

    @Override
    public String toRef() { return toString() + "+"; } // "+" = forward strand in GFA notation

    @Override
    public CharSequence getSequence() {
        return sequence;
    }

    @Override
    public int getMaxObservations() {
        return maxObservations;
    }

    @Override
    public KmerAdjacency getFirstKmer() {
        return firstKmer;
    }

    @Override
    public KmerAdjacency getLastKmer() {
        return lastKmer;
    }

    @Override
    public List getPredecessors() {
        return predecessors;
    }

    @Override
    public List getSuccessors() {
        return successors;
    }

    @Override
    public int size() {
        return sequence.length();
    }

    @Override
    public Contig rc() {
        return rc;
    }

    @Override
    public boolean isCycleMember() {
        return cyclic;
    }

    @Override
    public void setIsCycleMember( final boolean isCycleMember ) {
        this.cyclic = isCycleMember;
    }

    @Override
    public boolean isMarked() {
        return marked;
    }

    @Override
    public void setMarked( final boolean marked ) {
        this.marked = marked;
    }

    @Override
    public boolean isCanonical() {
        return true; // the ContigImpl side of the pair is always the canonical one
    }

    @Override
    public ContigImpl canonical() {
        return this;
    }

    @Override
    public String toString() {
        return "c" + id;
    }

    /**
     * True when seq1's final K-1 bases equal seq2's first K-1 bases.
     * NOTE(review): relies on both subSequence results being Strings (String.subSequence and
     * SequenceRC.subSequence both return Strings here) — CharSequence itself does not define
    * value-based equals.
     */
    private static boolean checkOverlap( final CharSequence seq1, final CharSequence seq2 ) {
        final int seq1Len = seq1.length();
        final CharSequence seq1SubSeq = seq1.subSequence(seq1Len - Kmer.KSIZE + 1, seq1Len);
        final CharSequence seq2SubSeq = seq2.subSequence(0, Kmer.KSIZE - 1);
        return seq1SubSeq.equals(seq2SubSeq);
    }
}

/**
 * Implementation of Contig for the reverse-complement of some other Contig.
 * Which one is the "real" Contig, and which is the "RC" is completely arbitrary, since there
 * is no notion of canonical for Contigs.
+ */ + public static final class ContigRCImpl implements Contig { + private final CharSequence sequence; + private final List predecessors; + private final List successors; + private final ContigImpl rc; + + public ContigRCImpl( final ContigImpl contig ) { + this.sequence = new SequenceRC(contig.getSequence()); + this.predecessors = new ContigListRC(contig.getSuccessors()); + this.successors = new ContigListRC(contig.getPredecessors()); + this.rc = contig; + } + + @Override public int getId() { return ~rc.getId(); } + @Override public String toRef() { return rc.toString() + "-"; } + @Override public CharSequence getSequence() { return sequence; } + @Override public int getMaxObservations() { return rc.getMaxObservations(); } + @Override public KmerAdjacency getFirstKmer() { return rc.getLastKmer().rc(); } + @Override public KmerAdjacency getLastKmer() { return rc.getFirstKmer().rc(); } + @Override public List getPredecessors() { return predecessors; } + @Override public List getSuccessors() { return successors; } + @Override public int size() { return sequence.length(); } + @Override public Contig rc() { return rc; } + @Override public boolean isCycleMember() { return rc.isCycleMember(); } + @Override public void setIsCycleMember( final boolean isCycleMember ) { rc.setIsCycleMember(isCycleMember); } + @Override public boolean isMarked() { return rc.isMarked(); } + @Override public void setMarked( final boolean marked ) { rc.setMarked(marked); } + @Override public boolean isCanonical() { return false; } + @Override public ContigImpl canonical() { return rc; } + @Override public String toString() { return rc.toString() + "RC"; } + } + + /** A CharSequence that is a view of the reverse-complement of another sequence. 
*/ + public static final class SequenceRC implements CharSequence, Comparable { + private final int lenLess1; + private final CharSequence sequence; + + public SequenceRC( final CharSequence sequence ) { + this.lenLess1 = sequence.length() - 1; + this.sequence = sequence; + } + + @Override public int length() { return sequence.length(); } + @Override public char charAt( final int index ) { + final char result; + switch ( Character.toUpperCase(sequence.charAt(lenLess1 - index)) ) { + case 'A': result = 'T'; break; + case 'C': result = 'G'; break; + case 'G': result = 'C'; break; + case 'T': result = 'A'; break; + default: result = 'N'; break; + } + return result; + } + @Override public CharSequence subSequence( final int start, final int end ) { + return new StringBuilder(end - start).append(this, start, end).toString(); + } + @Override public String toString() { return new StringBuilder(this).toString(); } + + @Override public int compareTo( final CharSequence charSequence ) { + final int len1 = length(); + final int len2 = charSequence.length(); + final int cmpLen = Math.min(len1, len2); + for ( int idx = 0; idx != cmpLen; ++idx ) { + final char char1 = charAt(idx); + final char char2 = Character.toUpperCase(charSequence.charAt(idx)); + if ( char1 > char2 ) return 1; + if ( char1 < char2 ) return -1; + } + return Integer.compare(len1, len2); + } + } + + /** A list of Contigs that presents a reverse-complemented view of a List of Contigs. 
*/ + public static final class ContigListRC extends AbstractList { + private final List contigList; + + public ContigListRC( final List contigList ) { + this.contigList = contigList; + } + + @Override public Contig get( final int index ) { + return contigList.get(reflectIndex(index)).rc(); + } + @Override public int size() { return contigList.size(); } + @Override public Contig set( final int index, final Contig contig ) { + return contigList.set(reflectIndex(index), contig.rc()).rc(); + } + @Override public void add( final int index, final Contig contig ) { + contigList.add(reflectIndex(index), contig.rc()); + } + @Override public Contig remove( final int index ) { + return contigList.remove(reflectIndex(index)).rc(); + } + + public List rc() { return contigList; } + + private int reflectIndex( final int index ) { return size() - 1 - index; } + } + + /** A single-Contig portion of a path across the assembly graph. */ + public interface PathPart { + Contig getContig(); // will be null for PathParts that depart from the graph (PathPartGap) + CharSequence getSequence(); // will be null for PathParts on the graph (PathPartContig) + void extend( final char call ); + int getStart(); + int getStop(); + boolean isGap(); + int getLength(); + PathPart rc(); + char getFirstCall(); + char getLastCall(); + default boolean startsAtBeginning() { return getStart() == 0; } + default boolean stopsAtEnd() { return getStop() + Kmer.KSIZE - 1 == getContig().size(); } + } + + /** A part of a path that isn't present in the graph. 
*/ + public static final class PathPartGap implements PathPart { + private final StringBuilder sequence = new StringBuilder(); + + public PathPartGap( final KmerAdjacency kmer ) { sequence.append(kmer.toString()); } + private PathPartGap( final CharSequence sequence ) { this.sequence.append(sequence); } + + @Override public Contig getContig() { return null; } + @Override public CharSequence getSequence() { return sequence.toString(); } + @Override public void extend( final char call ) { sequence.append(call); } + @Override public int getStart() { return 0; } + @Override public int getStop() { return sequence.length(); } + @Override public boolean isGap() { return true; } + @Override public int getLength() { return sequence.length() - Kmer.KSIZE + 1; } + @Override public PathPart rc() { return new PathPartGap(new SequenceRC(sequence)); } + @Override public char getFirstCall() { return sequence.charAt(Kmer.KSIZE - 1); } + @Override public char getLastCall() { + return sequence.charAt(sequence.length() - Kmer.KSIZE + 1); + } + } + + /** A part of a path that is present as a sub-sequence of some Contig. 
*/ + public static final class PathPartContig implements PathPart { + private final Contig contig; + private final int start; + private int stop; + + public PathPartContig( final Contig contig, final int start ) { + this(contig, start, start+1); + } + public PathPartContig( final Contig contig, final int start, final int stop ) { + this.contig = contig; + this.start = start; + this.stop = stop; + } + + @Override public Contig getContig() { return contig; } + @Override public String getSequence() { return null; } + @Override public void extend( final char call ) { stop += 1; } + @Override public int getStart() { return start; } + @Override public int getStop() { return stop; } + @Override public boolean isGap() { return false; } + @Override public int getLength() { return stop - start; } + @Override public PathPart rc() { + final int revBase = contig.size() - Kmer.KSIZE + 1; + return new PathPartContig(contig.rc(), revBase - stop, revBase - start); + } + @Override public char getFirstCall() { + return getContig().getSequence().charAt(start + Kmer.KSIZE - 1); + } + @Override public char getLastCall() { return getContig().getSequence().charAt(stop - 1); } + } + + /** A helper class for Path building. + * It transforms an array of base calls into a list of contigs (and gaps). 
 */
public static final class PathBuilder {
    private final KmerSet kmerAdjacencySet; // lookup table of known kmers (presumably KmerSet<KmerAdjacency>)
    private final List parts;               // accumulating path parts for the current read
    private long kVal = 0;                  // rolling 2-bit-per-base kmer value
    private int count = 0;                  // number of calls consumed so far
    private PathPart currentPathPart = null;

    public PathBuilder( final KmerSet kmerAdjacencySet ) {
        this.kmerAdjacencySet = kmerAdjacencySet;
        this.parts = new ArrayList<>();
    }

    /** Translates a read's base calls into a fresh list of PathParts; resets all rolling state. */
    public List processCalls( final byte[] calls ) {
        parts.clear();
        kVal = 0;
        count = 0;
        currentPathPart = null;

        for ( int idx = 0; idx != calls.length; ++idx ) {
            processCall( (char)calls[idx] );
        }
        return new ArrayList<>(parts); // defensive copy: parts is reused on the next read
    }

    private void processCall( final char call ) {
        // roll the next 2-bit code into the kmer value (A and any other char contribute 0)
        kVal <<= 2;
        switch ( call ) {
            case 'C': case 'c': kVal += 1; break;
            case 'G': case 'g': kVal += 2; break;
            case 'T': case 't': kVal += 3; break;
        }
        if ( ++count >= Kmer.KSIZE ) { // if we've seen enough calls to make a whole kmer
            processKval(call);
        }
    }

    private void processKval( final char call ) {
        final KmerAdjacency kmer = KmerAdjacencyImpl.find(kVal, kmerAdjacencySet);
        if ( kmer == null ) { // if the kmer isn't in the KmerSet
            processGap(call); // we'll make a "gap" PathPart
        } else {
            processKmer(call, kmer); // it's a part of some contig
        }
    }

    private void processGap( final char call ) {
        // if there's no current path part, or if the current part isn't a gap, create a PathPartGap
        if ( currentPathPart == null || !currentPathPart.isGap() ) {
            currentPathPart = new PathPartGap(new KmerAdjacencyImpl(kVal));
            parts.add(currentPathPart);
        } else {
            // the current path part is a PathPartGap, so just extend it
            currentPathPart.extend(call);
        }
    }

    private void processKmer( final char call, final KmerAdjacency kmer ) {
        final Contig contig = kmer.getContig();
        final int contigOffset = kmer.getContigOffset();
        if ( currentPathPart == null ) {
            // we've found a kmer, but don't have a current path part -- create one
            currentPathPart = new PathPartContig(contig, contigOffset);
            parts.add(currentPathPart);
        } else if ( contig == currentPathPart.getContig() ) {
            // our lookup is on the current path part's contig -- extend it
            extendCurrentPathPart(call, contig, contigOffset);
        } else {
            // we're moving from one contig to a new one, or from a gap onto a contig
            processNewContig(contig, contigOffset);
        }
    }

    private void extendCurrentPathPart( final char call,
                                        final Contig contig,
                                        final int contigOffset ) {
        final int curStop = currentPathPart.getStop();
        if ( contigOffset == curStop ) { // if this is just the next kmer on our current contig
            currentPathPart.extend(call);
        } else if ( contigOffset == 0 && contig.getNKmers() == curStop ) {
            // starting over on the same contig (i.e., a tight cycle)
            currentPathPart = new PathPartContig(contig, 0);
            parts.add(currentPathPart);
        } else {
            // kmer is non-contiguous on the current contig.
            // start a new path part after a zero-length gap.
            parts.add(zeroLengthGap(currentPathPart));
            currentPathPart = new PathPartContig(contig, contigOffset);
            parts.add(currentPathPart);
        }
    }

    private void processNewContig( final Contig contig, final int contigOffset ) {
        if ( currentPathPart.isGap() ) {
            // try to squash captured gaps caused by isolated, single-base sequencing errors
            if ( currentPathPart.getLength() == Kmer.KSIZE ) { // an error mid-read poisons exactly KSIZE kmers
                final int prevPartIdx = parts.size() - 2;
                if ( prevPartIdx >= 0 ) {
                    if ( gapIsSquashed(prevPartIdx, contig, contigOffset) ) {
                        return; // gap was backfilled; currentPathPart already updated
                    }
                }
            }
        } else if ( !currentPathPart.stopsAtEnd() || contigOffset != 0 ||
                !currentPathPart.getContig().isSuccessor(contig) ) {
            // not an end-to-end join across connected contigs -- record a zero-length gap
            parts.add(zeroLengthGap(currentPathPart));
        }

        // we're jumping to a new contig. start a new path part
        currentPathPart = new PathPartContig(contig, contigOffset);
        parts.add(currentPathPart);
    }

    /**
     * Attempts to replace a KSIZE-kmer gap (one bad base) with the contig parts it interrupted.
     * Returns true and repairs the parts list when the flanking parts line up exactly.
     */
    private boolean gapIsSquashed( final int prevPartIdx,
                                   final Contig contig,
                                   final int contigOffset ) {
        final PathPart prevPart = parts.get(prevPartIdx);
        final Contig prevPartContig = prevPart.getContig();
        final int prevPartStart = prevPart.getStart();
        final int prevPartStop = prevPart.getStop();
        final int prevPartMaxStop =
                prevPartContig.size() - Kmer.KSIZE + 1; // kmer count of the previous contig
        final int newStop = contigOffset + 1;

        if ( prevPartContig == contig ) { // if the gap is internal to a single contig
            if ( contigOffset - prevPartStop == Kmer.KSIZE ) {
                // smooth over a kmer-sized gap in a single contig by backfilling the gap
                currentPathPart = new PathPartContig(prevPartContig, prevPartStart, newStop);
                parts.set(prevPartIdx, currentPathPart);
                parts.remove(prevPartIdx + 1); // drop the gap part
                return true;
            }
        } else if ( prevPartMaxStop - prevPartStop + contigOffset == Kmer.KSIZE ) {
            // kmer-size gap crosses from one contig to another: backfill the gap
            parts.set(prevPartIdx,
                    new PathPartContig(prevPartContig, prevPartStart, prevPartMaxStop));
            currentPathPart = new PathPartContig(contig, 0, newStop);
            parts.set(prevPartIdx + 1, currentPathPart); // gap slot becomes the new contig part
            return true;
        }
        return false;
    }

    /** Builds a gap part seeded with the K-1 bases that follow currentPathPart's final kmer. */
    private static PathPart zeroLengthGap( final PathPart currentPathPart ) {
        final int currentStop = currentPathPart.getStop();
        final CharSequence currentSequence = currentPathPart.getContig().getSequence();
        final CharSequence almostAKmer =
                currentSequence.subSequence(currentStop, currentStop + Kmer.KSIZE - 1);
        return new PathPartGap(almostAKmer);
    }
}

/** A path through the assembly graph for something (probably a read).
*/ + public static final class Path { + private final List parts; + + public Path( final byte[] calls, final PathBuilder pathBuilder ) { + parts = pathBuilder.processCalls(calls); + } + + // RCing constructor + private Path( final Path that ) { + final List thoseParts = that.parts; + final int nParts = thoseParts.size(); + parts = new ArrayList<>(nParts); + for ( int idx = nParts - 1; idx >= 0; --idx ) { + parts.add(thoseParts.get(idx).rc()); + } + } + + public List getParts() { return parts; } + public Path rc() { return new Path(this); } + + @Override public String toString() { + if ( parts.size() == 0 ) return ""; + final StringBuilder sb = new StringBuilder(); + String prefix = ""; + final PathPart firstPart = parts.get(0); + final PathPart lastPart = parts.get(parts.size() - 1); + for ( final PathPart pp : parts ) { + sb.append(prefix); + prefix = ", "; + if ( pp.isGap() ) { + sb.append("NoKmer(").append(pp.getLength()).append(")"); + } else { + final Contig contig = pp.getContig(); + sb.append(contig); + final int maxStop = contig.size() - Kmer.KSIZE + 1; + if ( (pp != firstPart && pp.getStart() != 0) || + (pp != lastPart && pp.getStop() != maxStop) ) { + sb.append('(').append(pp.getStart()).append('-') + .append(pp.getStop()).append('/').append(maxStop).append(')'); + } + } + } + return sb.toString(); + } + } + + public static final class WalkDataFactory { + private int nextNum; + + public WalkData create() { return new WalkData(++nextNum); } + } + + /** Per-Contig storage for depth-first graph walking. */ + public static final class WalkData { + public int visitNum; + public int minVisitNum; + + private WalkData( final int visitNum ) { + this.visitNum = visitNum; + this.minVisitNum = visitNum; + } + } + + /** + * A count of the number of read Paths that cross through some Contig from some previous Contig + * to some subsequent Contig. This helps us with phasing, so that we limit ourselves to + * graph traversals truly represented by the data. 
+ */ + public static final class TransitPairCount { + private final Contig prevContig; + private final Contig nextContig; + private final TransitPairCount rc; + private int count; + + public TransitPairCount( final Contig prevContig, final Contig nextContig ) { + this.prevContig = prevContig; + this.nextContig = nextContig; + this.rc = new TransitPairCount(this); + this.count = 1; + } + + private TransitPairCount( final TransitPairCount rc ) { + this.prevContig = rc.nextContig.rc(); + this.nextContig = rc.prevContig.rc(); + this.rc = rc; + this.count = 1; + } + + public Contig getPrevContig() { return prevContig; } + public Contig getNextContig() { return nextContig; } + public TransitPairCount getRC() { return rc; } + public void observe() { count += 1; rc.count += 1; } + public void resetCount() { count = 0; rc.count = 0; } + public int getCount() { return count; } + + @Override public boolean equals( final Object obj ) { + if ( !(obj instanceof TransitPairCount) ) return false; + final TransitPairCount that = (TransitPairCount)obj; + return this.prevContig == that.prevContig && this.nextContig == that.nextContig; + } + + @Override public int hashCode() { + return 47 * (47 * prevContig.hashCode() + nextContig.hashCode()); + } + + @Override public String toString() { + return prevContig + "<-->" + nextContig + " " + count + "x"; + } + } + + /** + * A list of Contigs through the assembly graph. + * Differs from a Path in that there are no gaps, and all the Contigs are properly adjacent + * in the graph. + * In an earlier phase of assembly, all valid phased Traversals are produced. + * Later, the same Traversal class is used to hook up Traversals across which we cannot phase. + * (These are somewhat improperly called Scaffolds.) 
 */
public static final class Traversal {
    private final List contigs; // presumably List<Contig> — generics stripped by extraction
    private final int minMaxObservations; // min over contigs of each contig's maxObservations
    private int hashCode; // Traversal is immutable, so we can stash the hashCode

    public Traversal( final Collection contigs ) {
        if ( contigs == null || contigs.isEmpty() ) {
            throw new GATKException("null or empty list of contigs in traversal");
        }
        this.contigs = new ArrayList<>(contigs);
        int minMaxObservations = Integer.MAX_VALUE;
        for ( Contig contig : contigs ) {
            minMaxObservations = Math.min(minMaxObservations, contig.getMaxObservations());
        }
        this.minMaxObservations = minMaxObservations;
        this.hashCode = 0; // computed lazily in hashCode()
    }

    // RC constructor
    private Traversal( final Traversal traversal ) {
        final List thoseContigs = traversal.contigs;
        // unwrap an existing RC view rather than double-wrapping
        this.contigs = thoseContigs instanceof ContigListRC ?
                ((ContigListRC)thoseContigs).rc() : new ContigListRC(thoseContigs);
        this.minMaxObservations = traversal.minMaxObservations;
        this.hashCode = 0;
    }

    public List getContigs() { return Collections.unmodifiableList(contigs); }
    public Contig getFirstContig() { return contigs.get(0); }
    public Contig getLastContig() { return contigs.get(contigs.size() - 1); }
    public Traversal rc() { return new Traversal(this); }
    public boolean isRC() { return contigs instanceof ContigListRC; }
    public int getMinMaxObservations() { return minMaxObservations; }
    // a traversal ending in a cycle member cannot be extended further
    public boolean isInextensible() { return getLastContig().isCycleMember(); }

    @Override
    public String toString() {
        return contigs.stream()
                .map(Contig::toString)
                .collect(Collectors.joining("+"));
    }

    /** Total base length: one kmer-count per contig plus the final K-1 overlap bases. */
    public int getSequenceLength() {
        int len = 0;
        for ( final Contig contig : contigs ) {
            len += contig.getNKmers();
        }
        return len + Kmer.KSIZE - 1;
    }

    /** Concatenates contig sequences, dropping the K-1 base overlap between neighbors. */
    public String getSequence() {
        if ( contigs.size() == 0 ) return "";
        final StringBuilder sb =
                new StringBuilder(contigs.get(0).getSequence().subSequence(0, Kmer.KSIZE - 1));
        for ( final Contig contig : contigs ) {
            final CharSequence seq = contig.getSequence();
            sb.append(seq.subSequence(Kmer.KSIZE - 1, seq.length()));
        }
        return sb.toString();
    }

    @Override public int hashCode() {
        if ( hashCode == 0 ) {
            hashCode = contigs.hashCode();
        }
        return hashCode;
    }

    @Override public boolean equals( final Object obj ) {
        if ( this == obj ) return true;
        if ( !(obj instanceof Traversal) ) return false;
        final Traversal that = (Traversal)obj;
        // NOTE(review): compares the lazily-cached hashCode fields directly — if one side has
        // not yet had hashCode() called (field still 0), two equal traversals can compare
        // unequal. Safe inside HashSet (which hashes both sides first) but fragile elsewhere;
        // consider calling hashCode() instead of reading the field — confirm intended.
        if ( hashCode != that.hashCode || contigs.size() != that.contigs.size() ) return false;
        return contigs.equals(that.contigs);
    }

    public static Traversal combine( final Traversal trav1, final Traversal trav2 ) {
        return combineOverlappers(trav1, trav2, 1);
    }

    /** Joins two traversals whose last/first overlapLen contigs coincide; throws if they don't. */
    public static Traversal combineOverlappers( final Traversal trav1,
                                                final Traversal trav2,
                                                final int overlapLen ) {
        final int len1 = trav1.contigs.size();
        if ( !trav1.contigs.subList(len1 - overlapLen, len1)
                .equals(trav2.contigs.subList(0, overlapLen)) ) {
            throw new GATKException("combining non-overlapping traversals");
        }
        final int len2 = trav2.contigs.size();
        final List combinedList =
                new ArrayList<>(trav1.contigs.size() + len2 - overlapLen);
        combinedList.addAll(trav1.contigs);
        combinedList.addAll(trav2.contigs.subList(overlapLen, len2)); // skip the shared prefix
        return new Traversal(combinedList);
    }
}

/** Set of traversals.
 * Rejects adding RC's of existing traversals.
 * Explodes when it gets too big.
*/ + public static class TraversalSet extends HashSet { + private static final long serialVersionUID = 1L; + final int tooManyTraversals; + + public TraversalSet( final int tooManyTraversals ) { + this.tooManyTraversals = tooManyTraversals; + } + + @Override public boolean add( final Traversal traversal ) { + if ( contains(traversal.rc()) ) { + return false; + } + if ( size() >= tooManyTraversals ) { + throw new AssemblyTooComplexException(); + } + return super.add(traversal); + } + } + + /** Something to throw when we have too many Contigs or Traversals to proceed with assembly. */ + public final static class AssemblyTooComplexException extends RuntimeException { + static final long serialVersionUID = -1L; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/PrintDistantMates.java b/src/main/java/org/broadinstitute/hellbender/tools/PrintDistantMates.java new file mode 100644 index 00000000000..38015955c88 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/PrintDistantMates.java @@ -0,0 +1,115 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMTag; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.engine.ReadWalker; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; 
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.util.*;

@CommandLineProgramProperties(
        summary = "Unmaps reads that have distant mates. "+
                "This ensures that a PairWalker will always see both mates, "+
                "even when one of them is mapped far away, when given the output "+
                "of this tool along with the original inputs.",
        oneLineSummary = "Unmaps reads with distant mates.",
        programGroup = ReadDataManipulationProgramGroup.class
)
@DocumentedFeature
public class PrintDistantMates extends ReadWalker {
    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc="Write output to this file")
    public GATKPath output;
    // SAM attribute tag marking a read this tool has relocated next to its mate
    // NOTE(review): mutable public static — should presumably be `public static final` since
    // nothing here reassigns it; confirm no test mutates it before tightening.
    public static String DISTANT_MATE_TAG = "DM";

    private SAMFileGATKReadWriter outputWriter;

    /** Adds filters so only primary, non-duplicate, paired reads with distant mates are processed. */
    @Override public List getDefaultReadFilters() {
        final List readFilters = new ArrayList<>(super.getDefaultReadFilters());
        readFilters.add(ReadFilterLibrary.PAIRED);
        readFilters.add(ReadFilterLibrary.PRIMARY_LINE);
        readFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        readFilters.add(ReadFilterLibrary.MATE_DISTANT);
        return readFilters;
    }

    @Override
    public boolean requiresReads() { return true; }

    @Override
    public void onTraversalStart() {
        outputWriter = createSAMWriter(output, false);
    }

    /** Emits an altered copy of each passing read, repositioned at its mate's locus. */
    @Override
    public void apply( final GATKRead read,
                       final ReferenceContext referenceContext,
                       final FeatureContext featureContext ) {
        final GATKRead copy = doDistantMateAlterations(read);
        outputWriter.addRead(copy);
    }

    @Override
    public void closeTool() {
        if ( outputWriter != null ) {
            outputWriter.close();
        }
    }

    public static boolean isDistantMate( final GATKRead read ) {
        return read.hasAttribute(DISTANT_MATE_TAG);
    }

    /**
     * Returns a copy of the read stashed at its mate's position: the original alignment
     * (contig, start, strand, cigar, mapq, NM) is preserved in the OA tag so it can be
     * restored later by undoDistantMateAlterations, and the copy is unmapped with the
     * DISTANT_MATE_TAG set.
     */
    public static GATKRead doDistantMateAlterations( final GATKRead read ) {
        final GATKRead copy = read.copy();
        final Integer nmValue = read.getAttributeAsInteger(SAMTag.NM.name());
        // OA format: contig,start,strand,cigar,mapq,NM;  (NM may be empty)
        final String oaValue = read.getContig() + "," + read.getStart() + "," +
                (read.isReverseStrand() ? "-," : "+,") + read.getCigar() + "," +
                read.getMappingQuality() + "," + (nmValue == null ? "" : nmValue) + ";";
        copy.clearAttribute(SAMTag.NM.name());
        copy.setAttribute(SAMTag.OA.name(), oaValue);
        copy.setAttribute(DISTANT_MATE_TAG, "");
        copy.setPosition(read.getMateContig(), read.getMateStart());
        copy.setCigar(SAMRecord.NO_ALIGNMENT_CIGAR);
        copy.setMappingQuality(0);
        copy.setIsUnmapped();
        return copy;
    }

    /**
     * Inverse of doDistantMateAlterations: re-establishes the original alignment from the
     * OA tag and removes the bookkeeping tags. Reads without an OA tag are returned as-is.
     * @throws UserException if the OA tag cannot be parsed
     */
    public static GATKRead undoDistantMateAlterations( final GATKRead read ) {
        final String oaValue = read.getAttributeAsString(SAMTag.OA.name());
        if ( oaValue == null ) {
            return read;
        }
        final GATKRead copy = read.copy();
        copy.clearAttribute(DISTANT_MATE_TAG);
        copy.clearAttribute(SAMTag.OA.name());
        try {
            final String[] tokens = oaValue.split(",");
            copy.setPosition(tokens[0], Integer.parseInt(tokens[1]));
            copy.setIsReverseStrand("-".equals(tokens[2]));
            copy.setCigar(tokens[3]);
            copy.setMappingQuality(Integer.parseInt(tokens[4]));
            // tokens[5] is "NM;" or just ";" when NM was absent
            if ( tokens[5].length() > 1 ) {
                final String nmValue = tokens[5].substring(0, tokens[5].length() - 1);
                copy.setAttribute(SAMTag.NM.name(), Integer.parseInt(nmValue));
            }
        } catch ( final Exception exc ) {
            throw new UserException("can't recover alignment from OA tag: " + oaValue, exc);
        }
        return copy;
    }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FermiLiteAssemblyHandler.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FermiLiteAssemblyHandler.java
index 058a00b99a6..15c5bb4e2b6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FermiLiteAssemblyHandler.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FermiLiteAssemblyHandler.java
@@ -4,7 +4,7 @@
 import htsjdk.samtools.util.SequenceUtil;
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import
org.broadinstitute.hellbender.tools.spark.sv.utils.*; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMultiMapSpark; import org.broadinstitute.hellbender.utils.BaseUtils; import org.broadinstitute.hellbender.utils.bwa.BwaMemAligner; import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment; @@ -128,7 +128,7 @@ static FermiLiteAssembly removeShadowedContigs( final FermiLiteAssembly assembly // make a map of assembled kmers final int capacity = assembly.getContigs().stream().mapToInt(tig -> tig.getSequence().length - kmerSize + 1).sum(); - final HopscotchMultiMap kmerMap = new HopscotchMultiMap<>(capacity); + final HopscotchMultiMapSpark kmerMap = new HopscotchMultiMapSpark<>(capacity); assembly.getContigs().forEach(tig -> { int contigOffset = 0; final Iterator contigKmerItr = new SVKmerizer(tig.getSequence(), kmerSize, new SVKmerShort()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBadGenomicKmersSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBadGenomicKmersSpark.java index 6c4055b89f7..44baee23097 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBadGenomicKmersSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBadGenomicKmersSpark.java @@ -17,7 +17,7 @@ import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection; import org.broadinstitute.hellbender.tools.spark.sv.utils.*; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMapSpark; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; import scala.Tuple2; @@ -147,7 +147,7 @@ static List collectUbiquitousKmersInReference(final int 
kSize, final int hashSize = 2*REF_RECORDS_PER_PARTITION; return refRDD .mapPartitions(seqItr -> { - final HopscotchMap kmerCounts = new HopscotchMap<>(hashSize); + final HopscotchMapSpark kmerCounts = new HopscotchMapSpark<>(hashSize); while ( seqItr.hasNext() ) { final byte[] seq = seqItr.next(); SVDUSTFilteredKmerizer.canonicalStream(seq, kSize, maxDUSTScore, new SVKmerLong()) @@ -162,7 +162,7 @@ static List collectUbiquitousKmersInReference(final int kSize, .mapToPair(entry -> new Tuple2<>(entry.getKey(), entry.getValue())) .partitionBy(new HashPartitioner(nPartitions)) .mapPartitions(pairItr -> { - final HopscotchMap kmerCounts = new HopscotchMap<>(hashSize); + final HopscotchMapSpark kmerCounts = new HopscotchMapSpark<>(hashSize); while ( pairItr.hasNext() ) { final Tuple2 pair = pairItr.next(); final SVKmer kmer = pair._1(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSpark.java index a6a67a0e495..ff1dc17d3a6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSpark.java @@ -27,7 +27,7 @@ import org.broadinstitute.hellbender.tools.spark.sv.evidence.BreakpointEvidence.ReadEvidence; import org.broadinstitute.hellbender.tools.spark.sv.utils.*; import org.broadinstitute.hellbender.tools.spark.utils.FlatMapGluer; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.bwa.BwaMemIndexCache; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; @@ -166,7 +166,7 @@ public static AssembledEvidenceResults gatherEvidenceAndWriteContigSamFile( new 
ArrayList<>(), evidenceScanResults.evidenceTargetLinks); - final HopscotchUniqueMultiMap qNamesMultiMap = evidenceScanResults.qNamesForAssemblyMultiMap; + final HopscotchUniqueMultiMapSpark qNamesMultiMap = evidenceScanResults.qNamesForAssemblyMultiMap; // supplement the template names with other reads that share kmers final List alignedAssemblyOrExcuseList; @@ -327,7 +327,7 @@ private static EvidenceScanResults getMappedQNamesSet( final int nIntervalsAfterDepthCleaning = intervals.size(); log("Removed " + (nIntervalsAfterGapRemoval - nIntervalsAfterDepthCleaning) + " intervals that were entirely high-depth.", logger); - final HopscotchUniqueMultiMap qNamesMultiMap = + final HopscotchUniqueMultiMapSpark qNamesMultiMap = getQNames(params, ctx, broadcastMetadata, intervals, unfilteredReads, filter, broadcastHighCoverageSubIntervals); SparkUtils.destroyBroadcast(broadcastHighCoverageSubIntervals, "high-coverage subintervals"); @@ -372,12 +372,12 @@ static final class EvidenceScanResults { final ReadMetadata readMetadata; final List intervals; final List evidenceTargetLinks; - final HopscotchUniqueMultiMap qNamesForAssemblyMultiMap; + final HopscotchUniqueMultiMapSpark qNamesForAssemblyMultiMap; public EvidenceScanResults(final ReadMetadata readMetadata, final List intervals, final List evidenceTargetLinks, - final HopscotchUniqueMultiMap qNamesForAssemblyMultiMap) { + final HopscotchUniqueMultiMapSpark qNamesForAssemblyMultiMap) { this.readMetadata = readMetadata; this.intervals = intervals; this.evidenceTargetLinks = evidenceTargetLinks; @@ -493,17 +493,17 @@ private static List addAssemblyQNames( final FindBreakpointEvidenceSparkArgumentCollection params, final ReadMetadata readMetadata, final JavaSparkContext ctx, - final HopscotchUniqueMultiMap qNamesMultiMap, + final HopscotchUniqueMultiMapSpark qNamesMultiMap, final int nIntervals, final JavaRDD unfilteredReads, final SVReadFilter filter, final Logger logger) { - final Tuple2, HopscotchUniqueMultiMap> 
kmerIntervalsAndDispositions = + final Tuple2, HopscotchUniqueMultiMapSpark> kmerIntervalsAndDispositions = getKmerAndIntervalsSet(params, readMetadata, ctx, qNamesMultiMap, nIntervals, unfilteredReads, filter, logger); - final HopscotchUniqueMultiMap kmersAndIntervals = + final HopscotchUniqueMultiMapSpark kmersAndIntervals = removeUbiquitousKmers(params, readMetadata, ctx, kmerIntervalsAndDispositions._2(), unfilteredReads, filter, logger); qNamesMultiMap.addAll(getAssemblyQNames(params, ctx, kmersAndIntervals, unfilteredReads, filter)); @@ -524,11 +524,11 @@ private static List addAssemblyQNames( * _1 describes the intervals that have been killed for having too few kmers (as a map from intervalId onto an explanatory string), * and _2 describes the good kmers that we want to use in local assemblies (as a multimap from kmer onto intervalId). */ - private static Tuple2, HopscotchUniqueMultiMap> getKmerAndIntervalsSet( + private static Tuple2, HopscotchUniqueMultiMapSpark> getKmerAndIntervalsSet( final FindBreakpointEvidenceSparkArgumentCollection params, final ReadMetadata readMetadata, final JavaSparkContext ctx, - final HopscotchUniqueMultiMap qNamesMultiMap, + final HopscotchUniqueMultiMapSpark qNamesMultiMap, final int nIntervals, final JavaRDD unfilteredReads, final SVReadFilter filter, @@ -545,8 +545,8 @@ private static Tuple2, HopscotchUniqueMultiMap, List> kmerIntervalsAndDispositions = getKmerIntervals(params, readMetadata, ctx, qNamesMultiMap, nIntervals, kmerKillSet, unfilteredReads, filter, logger); - final HopscotchUniqueMultiMap kmerMultiMap = - new HopscotchUniqueMultiMap<>(kmerIntervalsAndDispositions._2()); + final HopscotchUniqueMultiMapSpark kmerMultiMap = + new HopscotchUniqueMultiMapSpark<>(kmerIntervalsAndDispositions._2()); log("Discovered " + kmerMultiMap.size() + " kmers.", logger); return new Tuple2<>(kmerIntervalsAndDispositions._1(), kmerMultiMap); @@ -566,7 +566,7 @@ private static Tuple2, HopscotchUniqueMultiMap handleAssemblies( final 
JavaSparkContext ctx, - final HopscotchUniqueMultiMap qNamesMultiMap, + final HopscotchUniqueMultiMapSpark qNamesMultiMap, final JavaRDD unfilteredReads, final SVReadFilter filter, final int nIntervals, @@ -579,7 +579,7 @@ private static Tuple2, HopscotchUniqueMultiMap> broadcastQNamesMultiMap = + final Broadcast> broadcastQNamesMultiMap = ctx.broadcast(qNamesMultiMap); final List intervalDispositions = unfilteredReads @@ -620,15 +620,15 @@ public static IntPair reduce( final IntPair pair1, final IntPair pair2 ) { * For a set of interesting kmers, count occurrences of each over all reads, and remove those * that appear too frequently from the set. */ - private static HopscotchUniqueMultiMap removeUbiquitousKmers( + private static HopscotchUniqueMultiMapSpark removeUbiquitousKmers( final FindBreakpointEvidenceSparkArgumentCollection params, final ReadMetadata readMetadata, final JavaSparkContext ctx, - final HopscotchUniqueMultiMap kmersAndIntervals, + final HopscotchUniqueMultiMapSpark kmersAndIntervals, final JavaRDD unfilteredReads, final SVReadFilter filter, final Logger logger ) { - final Broadcast> broadcastKmersAndIntervals = + final Broadcast> broadcastKmersAndIntervals = ctx.broadcast(kmersAndIntervals); final int kmersPerPartition = kmersAndIntervals.size(); @@ -680,10 +680,10 @@ private static HopscotchUniqueMultiMap removeU @VisibleForTesting static List getAssemblyQNames( final FindBreakpointEvidenceSparkArgumentCollection params, final JavaSparkContext ctx, - final HopscotchUniqueMultiMap kmerMultiMap, + final HopscotchUniqueMultiMapSpark kmerMultiMap, final JavaRDD unfilteredReads, final SVReadFilter filter ) { - final Broadcast> broadcastKmersAndIntervals = + final Broadcast> broadcastKmersAndIntervals = ctx.broadcast(kmerMultiMap); final int kSize = params.kSize; @@ -705,7 +705,7 @@ private static HopscotchUniqueMultiMap removeU final FindBreakpointEvidenceSparkArgumentCollection params, final ReadMetadata readMetadata, final JavaSparkContext ctx, - 
final HopscotchUniqueMultiMap qNamesMultiMap, + final HopscotchUniqueMultiMapSpark qNamesMultiMap, final int nIntervals, final Set kmerKillSet, final JavaRDD unfilteredReads, @@ -713,7 +713,7 @@ private static HopscotchUniqueMultiMap removeU final Logger logger ) { final Broadcast> broadcastKmerKillSet = ctx.broadcast(kmerKillSet); - final Broadcast> broadcastQNameAndIntervalsMultiMap = + final Broadcast> broadcastQNameAndIntervalsMultiMap = ctx.broadcast(qNamesMultiMap); // given a set of template names with interval IDs and a kill set of ubiquitous kmers, @@ -944,7 +944,7 @@ static Iterator findHighCoverageSubintervals(final Br } /** find template names for reads mapping to each interval */ - @VisibleForTesting static HopscotchUniqueMultiMap getQNames( + @VisibleForTesting static HopscotchUniqueMultiMapSpark getQNames( final FindBreakpointEvidenceSparkArgumentCollection params, final JavaSparkContext ctx, final Broadcast broadcastMetadata, @@ -964,8 +964,8 @@ static Iterator findHighCoverageSubintervals(final Br SparkUtils.destroyBroadcast(broadcastIntervals, "intervals"); - final HopscotchUniqueMultiMap qNamesMultiMap = - new HopscotchUniqueMultiMap<>(params.assemblyToMappedSizeRatioGuess*qNameAndIntervalList.size()); + final HopscotchUniqueMultiMapSpark qNamesMultiMap = + new HopscotchUniqueMultiMapSpark<>(params.assemblyToMappedSizeRatioGuess*qNameAndIntervalList.size()); qNamesMultiMap.addAll(qNameAndIntervalList); return qNamesMultiMap; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCleaner.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCleaner.java index c875787f0d3..19e978f25a3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCleaner.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCleaner.java @@ -2,8 +2,8 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer; import 
org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSetSpark; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import scala.Tuple2; import java.util.Iterator; @@ -13,14 +13,14 @@ */ public final class KmerCleaner implements Iterable { - private final HopscotchUniqueMultiMap kmerMultiMap; + private final HopscotchUniqueMultiMapSpark kmerMultiMap; public KmerCleaner( final Iterator> kmerCountItr, final int kmersPerPartitionGuess, final int minKmerCount, final int maxKmerCount, final int maxIntervalsPerKmer ) { - kmerMultiMap = new HopscotchUniqueMultiMap<>(kmersPerPartitionGuess); + kmerMultiMap = new HopscotchUniqueMultiMapSpark<>(kmersPerPartitionGuess); // remove kmers with extreme counts that won't help in building a local assembly while ( kmerCountItr.hasNext() ) { @@ -29,7 +29,7 @@ public KmerCleaner( final Iterator> kmerCountIt if ( count >= minKmerCount && count <= maxKmerCount ) kmerMultiMap.add(kmerCount._1); } - final HopscotchSet uniqueKmers = new HopscotchSet<>(kmerMultiMap.size()); + final HopscotchSetSpark uniqueKmers = new HopscotchSetSpark<>(kmerMultiMap.size()); kmerMultiMap.stream().map(KmerAndInterval::getKey).forEach(uniqueKmers::add); uniqueKmers.stream() .filter(kmer -> SVUtils.iteratorSize(kmerMultiMap.findEach(kmer)) > maxIntervalsPerKmer) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCounter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCounter.java index 705dfdbf946..6f636c8b834 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCounter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/KmerCounter.java @@ -4,8 +4,8 @@ import 
org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerLong; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerizer; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMapSpark; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.read.GATKRead; import java.util.Iterator; @@ -17,17 +17,17 @@ public final class KmerCounter { private final int kSize; private final int kmersPerPartitionGuess; - private final HopscotchUniqueMultiMap kmerMap; + private final HopscotchUniqueMultiMapSpark kmerMap; public KmerCounter( final int kSize, final int kmersPerPartitionGuess, - final HopscotchUniqueMultiMap kmerMap ) { + final HopscotchUniqueMultiMapSpark kmerMap ) { this.kSize = kSize; this.kmerMap = kmerMap; this.kmersPerPartitionGuess = kmersPerPartitionGuess; } public Iterator apply( final Iterator readItr ) { - final HopscotchMap counts = new HopscotchMap<>(kmersPerPartitionGuess); + final HopscotchMapSpark counts = new HopscotchMapSpark<>(kmersPerPartitionGuess); while ( readItr.hasNext() ) { final GATKRead read = readItr.next(); SVKmerizer.canonicalStream(read.getBases(), kSize, new SVKmerLong()) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameIntervalFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameIntervalFinder.java index 8f162078502..f7cf32de0fe 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameIntervalFinder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameIntervalFinder.java @@ -3,7 +3,7 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer; import 
org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerLong; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerizer; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.read.GATKRead; import java.util.ArrayList; @@ -18,9 +18,9 @@ */ public final class QNameIntervalFinder implements Function> { private final int kSize; - private final HopscotchUniqueMultiMap kmerMap; + private final HopscotchUniqueMultiMapSpark kmerMap; - public QNameIntervalFinder( final int kSize, final HopscotchUniqueMultiMap kmerMap ) { + public QNameIntervalFinder( final int kSize, final HopscotchUniqueMultiMapSpark kmerMap ) { this.kSize = kSize; this.kmerMap = kmerMap; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameKmerizer.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameKmerizer.java index 1ff1a102c03..d9a512e7a3d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameKmerizer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNameKmerizer.java @@ -3,7 +3,7 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.SVDUSTFilteredKmerizer; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerLong; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.read.GATKRead; import scala.Tuple2; @@ -18,14 +18,14 @@ * The template names of reads to kmerize, along with a set of kmers to ignore are passed in (by broadcast). 
*/ public final class QNameKmerizer implements Function>> { - private final HopscotchUniqueMultiMap qNameAndIntervalMultiMap; + private final HopscotchUniqueMultiMapSpark qNameAndIntervalMultiMap; private final Set kmersToIgnore; private final int kSize; private final int maxDUSTScore; private final SVReadFilter filter; private final ArrayList> tupleList = new ArrayList<>(); - public QNameKmerizer( final HopscotchUniqueMultiMap qNameAndIntervalMultiMap, + public QNameKmerizer( final HopscotchUniqueMultiMapSpark qNameAndIntervalMultiMap, final Set kmersToIgnore, final int kSize, final int maxDUSTScore, final SVReadFilter filter ) { this.qNameAndIntervalMultiMap = qNameAndIntervalMultiMap; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNamesForKmersFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNamesForKmersFinder.java index 2827e74b638..289244d0e9e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNamesForKmersFinder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/QNamesForKmersFinder.java @@ -3,7 +3,7 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerizer; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmer; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVKmerLong; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.read.GATKRead; import scala.Tuple2; @@ -20,10 +20,10 @@ public final class QNamesForKmersFinder implements Function>> { private final int kSize; private final SVReadFilter filter; - private final HopscotchUniqueMultiMap kmerMultiMap; + private final HopscotchUniqueMultiMapSpark kmerMultiMap; public QNamesForKmersFinder( final int kSize, - final HopscotchUniqueMultiMap kmerMultiMap, + final HopscotchUniqueMultiMapSpark 
kmerMultiMap, final SVReadFilter filter ) { this.kSize = kSize; this.kmerMultiMap = kmerMultiMap; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/ReadsForQNamesFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/ReadsForQNamesFinder.java index a742df658a9..dff16f24e17 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/ReadsForQNamesFinder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/ReadsForQNamesFinder.java @@ -1,7 +1,7 @@ package org.broadinstitute.hellbender.tools.spark.sv.evidence; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFastqUtils; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.utils.read.GATKRead; import scala.Tuple2; @@ -15,7 +15,7 @@ public final class ReadsForQNamesFinder implements Iterable>> { private final List>> fastQRecords; - public ReadsForQNamesFinder( final HopscotchUniqueMultiMap qNamesMultiMap, + public ReadsForQNamesFinder( final HopscotchUniqueMultiMapSpark qNamesMultiMap, final int nIntervals, final boolean includeMappingLocation, final Iterator unfilteredReadItr, final SVReadFilter filter ) { final int nReadsPerInterval = 2 * qNamesMultiMap.size() / nIntervals; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFileUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFileUtils.java index 60a6ff080e6..4d38c94b68c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFileUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFileUtils.java @@ -3,7 +3,7 @@ import htsjdk.samtools.*; import htsjdk.samtools.util.FileExtensions; import org.broadinstitute.hellbender.exceptions.GATKException; -import 
org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSetSpark; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; import org.broadinstitute.hellbender.utils.io.IOUtils; @@ -49,7 +49,7 @@ public static Set readKmersFile(final String kmersFilePath, final int kS try ( final BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(kmersFilePath))) ) { final long fileLength = BucketUtils.fileSize(kmersFilePath); - kmers = new HopscotchSet<>((int)(fileLength/(kSize+1))); + kmers = new HopscotchSetSpark<>((int)(fileLength/(kSize+1))); String line; while ( (line = rdr.readLine()) != null ) { if ( line.length() != kSize ) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVUtils.java index e7625487e3c..1fc4a9e9971 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVUtils.java @@ -4,7 +4,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSetSpark; import org.broadinstitute.hellbender.tools.spark.utils.LongIterator; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; @@ -138,7 +138,7 @@ public static Collection uniquify(final Collection coll1, final Utils.nonNull(coll1, "first collection of kmers is null"); Utils.nonNull(coll2, "second collection of kmers is null"); - final HopscotchSet kmers = new HopscotchSet<>(coll1.size() + coll2.size()); + final HopscotchSetSpark kmers = new 
HopscotchSetSpark<>(coll1.size() + coll2.size()); kmers.addAll(coll1); kmers.addAll(coll2); return kmers; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionSpark.java new file mode 100644 index 00000000000..9e95870ac01 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionSpark.java @@ -0,0 +1,65 @@ +package org.broadinstitute.hellbender.tools.spark.utils; + +import com.esotericsoftware.kryo.DefaultSerializer; +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; +import org.broadinstitute.hellbender.utils.collections.HopscotchCollection; + +import java.util.Collection; + +@DefaultSerializer(HopscotchCollectionSpark.Serializer.class) +public class HopscotchCollectionSpark extends HopscotchCollection { + /** make a small HopscotchCollectionSpark */ + public HopscotchCollectionSpark() {} + + /** make a HopscotchCollectionSpark for a specified capacity (or good guess) */ + public HopscotchCollectionSpark( final int capacity ) { super(capacity); } + + /** make a HopscotchCollectionSpark from a collection */ + public HopscotchCollectionSpark( final Collection collection ) { super(collection); } + + @SuppressWarnings("unchecked") + protected HopscotchCollectionSpark( final Kryo kryo, final Input input ) { + super((int)(input.readInt() * HopscotchCollection.LOAD_FACTOR)); + + final boolean oldReferences = kryo.getReferences(); + kryo.setReferences(false); + + int nElements = input.readInt(); + while ( nElements-- > 0 ) { + add((T)kryo.readClassAndObject(input)); + } + + kryo.setReferences(oldReferences); + } + + @SuppressWarnings("unchecked") + protected void serialize( final Kryo kryo, final Output output ) { + final boolean oldReferences = kryo.getReferences(); + kryo.setReferences(false); + + final int capacity = 
capacity(); + output.writeInt(capacity); + output.writeInt(size()); + + // write the chain heads, and then the squatters + chainHeads().forEach(t -> kryo.writeClassAndObject(output, t)); + squatters().forEach(t -> kryo.writeClassAndObject(output, t)); + + kryo.setReferences(oldReferences); + } + + public static final class Serializer extends com.esotericsoftware.kryo.Serializer> { + @Override + public void write( final Kryo kryo, final Output output, final HopscotchCollectionSpark hopscotchCollection ) { + hopscotchCollection.serialize(kryo, output); + } + + @Override + public HopscotchCollectionSpark read( final Kryo kryo, final Input input, + final Class> klass ) { + return new HopscotchCollectionSpark<>(kryo, input); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMap.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapSpark.java similarity index 59% rename from src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMap.java rename to src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapSpark.java index 1eb92588bbf..d724f1c4ba9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMap.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapSpark.java @@ -15,12 +15,13 @@ * A uniquely keyed map with O(1) operations. * Sadly, it's not a java.util.Map, but it does behave like a java.util.Map's entrySet. 
*/ -@DefaultSerializer(HopscotchMap.Serializer.class) -public final class HopscotchMap> extends HopscotchSet { - public HopscotchMap() {} - public HopscotchMap( final int capacity ) { super(capacity); } - public HopscotchMap( final Collection collection ) { super(collection); } - protected HopscotchMap( final Kryo kryo, final Input input ) { super(kryo, input); } +@DefaultSerializer(HopscotchMapSpark.Serializer.class) +public final class HopscotchMapSpark> extends HopscotchCollectionSpark { + public HopscotchMapSpark() {} + public HopscotchMapSpark( final int capacity ) { super(capacity); } + public HopscotchMapSpark( final Collection collection ) { super(collection); } + private HopscotchMapSpark( final Kryo kryo, final Input input ) { super(kryo, input); } + /** in a map, uniqueness is on the key value, so we compare keys to see if we already have an entry for that key */ @Override @@ -31,15 +32,15 @@ public HopscotchMap() {} protected Function toKey() { return T::getKey; } @SuppressWarnings("rawtypes") - public static final class Serializer extends com.esotericsoftware.kryo.Serializer { + public static final class Serializer extends com.esotericsoftware.kryo.Serializer { @Override - public void write( final Kryo kryo, final Output output, final HopscotchMap hopscotchMap ) { - hopscotchMap.serialize(kryo, output); + public void write( final Kryo kryo, final Output output, final HopscotchMapSpark hopscotchMapSpark ) { + hopscotchMapSpark.serialize(kryo, output); } @Override - public HopscotchMap read( final Kryo kryo, final Input input, final Class klass ) { - return new HopscotchMap(kryo, input); + public HopscotchMapSpark read( final Kryo kryo, final Input input, final Class klass ) { + return new HopscotchMapSpark(kryo, input); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMap.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMap.java deleted file mode 100644 index 
8b368f2b212..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMap.java +++ /dev/null @@ -1,38 +0,0 @@ -package org.broadinstitute.hellbender.tools.spark.utils; - -import com.esotericsoftware.kryo.DefaultSerializer; -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; - -import java.util.Collection; -import java.util.Map; -import java.util.function.Function; - -/** - * A map that can contain multiple values for a given key. - */ -@DefaultSerializer(HopscotchMultiMap.Serializer.class) -public class HopscotchMultiMap> extends HopscotchCollection { - public HopscotchMultiMap() {} - public HopscotchMultiMap( final int capacity ) { super(capacity); } - public HopscotchMultiMap( final Collection collection ) { super(collection); } - protected HopscotchMultiMap( final Kryo kryo, final Input input ) { super(kryo, input); } - - /** getKey returns the key part of a Map.Entry */ - @Override - protected Function toKey() { return T::getKey; } - - @SuppressWarnings("rawtypes") - public static final class Serializer extends com.esotericsoftware.kryo.Serializer { - @Override - public void write(final Kryo kryo, final Output output, final HopscotchMultiMap hopscotchMultiMap ) { - hopscotchMultiMap.serialize(kryo, output); - } - - @Override - public HopscotchMultiMap read(final Kryo kryo, final Input input, final Class klass ) { - return new HopscotchMultiMap(kryo, input); - } - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapSpark.java new file mode 100644 index 00000000000..95d9c3ac2e4 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapSpark.java @@ -0,0 +1,38 @@ +package org.broadinstitute.hellbender.tools.spark.utils; + +import com.esotericsoftware.kryo.DefaultSerializer; 
+import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; + +import java.util.Collection; +import java.util.Map; +import java.util.function.Function; + +/** + * A map that can contain multiple values for a given key. + */ +@DefaultSerializer(HopscotchMultiMapSpark.Serializer.class) +public class HopscotchMultiMapSpark> extends HopscotchCollectionSpark { + public HopscotchMultiMapSpark() {} + public HopscotchMultiMapSpark( final int capacity ) { super(capacity); } + public HopscotchMultiMapSpark( final Collection collection ) { super(collection); } + protected HopscotchMultiMapSpark( final Kryo kryo, final Input input ) { super(kryo, input); } + + /** getKey returns the key part of a Map.Entry */ + @Override + protected Function toKey() { return T::getKey; } + + @SuppressWarnings("rawtypes") + public static final class Serializer extends com.esotericsoftware.kryo.Serializer { + @Override + public void write(final Kryo kryo, final Output output, final HopscotchMultiMapSpark hopscotchMultiMapSpark ) { + hopscotchMultiMapSpark.serialize(kryo, output); + } + + @Override + public HopscotchMultiMapSpark read( final Kryo kryo, final Input input, final Class klass ) { + return new HopscotchMultiMapSpark(kryo, input); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSet.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetSpark.java similarity index 65% rename from src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSet.java rename to src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetSpark.java index f778bf7903f..4c0f06a9532 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetSpark.java @@ -14,12 +14,12 @@ * Implements Set by imposing a unique-element constraint on 
HopscotchCollection. * Also implements equals and hashCode to be consistent with the documented requirements of the java Set interface. */ -@DefaultSerializer(HopscotchSet.Serializer.class) -public class HopscotchSet extends HopscotchCollection implements Set { - public HopscotchSet() {} - public HopscotchSet( final int capacity ) { super(capacity); } - public HopscotchSet( final Collection collection ) { super(collection); } - protected HopscotchSet( final Kryo kryo, final Input input ) { super(kryo, input); } +@DefaultSerializer(HopscotchSetSpark.Serializer.class) +public class HopscotchSetSpark extends HopscotchCollectionSpark implements Set { + public HopscotchSetSpark() {} + public HopscotchSetSpark( final int capacity ) { super(capacity); } + public HopscotchSetSpark( final Collection collection ) { super(collection); } + protected HopscotchSetSpark( final Kryo kryo, final Input input ) { super(kryo, input); } @Override public final boolean equals( final Object obj ) { @@ -38,15 +38,15 @@ public final boolean equals( final Object obj ) { protected BiPredicate entryCollides() { return Object::equals; } @SuppressWarnings("rawtypes") - public static final class Serializer extends com.esotericsoftware.kryo.Serializer { + public static final class Serializer extends com.esotericsoftware.kryo.Serializer { @Override - public void write( final Kryo kryo, final Output output, final HopscotchSet hopscotchSet ) { - hopscotchSet.serialize(kryo, output); + public void write( final Kryo kryo, final Output output, final HopscotchSetSpark hopscotchSetSpark ) { + hopscotchSetSpark.serialize(kryo, output); } @Override - public HopscotchSet read( final Kryo kryo, final Input input, final Class klass ) { - return new HopscotchSet(kryo, input); + public HopscotchSetSpark read( final Kryo kryo, final Input input, final Class klass ) { + return new HopscotchSetSpark(kryo, input); } } } diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMap.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapSpark.java similarity index 51% rename from src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMap.java rename to src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapSpark.java index 30ca80f217e..eb8353ff61d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMap.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapSpark.java @@ -8,32 +8,31 @@ import java.util.Collection; import java.util.Map; import java.util.function.BiPredicate; -import java.util.function.Function; /** * A map that can contain multiple values for a given key, but distinct entries. */ -@DefaultSerializer(HopscotchUniqueMultiMap.Serializer.class) -public final class HopscotchUniqueMultiMap> extends HopscotchMultiMap { - public HopscotchUniqueMultiMap() {} - public HopscotchUniqueMultiMap( final int capacity ) { super(capacity); } - public HopscotchUniqueMultiMap( final Collection collection ) { super(collection); } - protected HopscotchUniqueMultiMap( final Kryo kryo, final Input input ) { super(kryo, input); } +@DefaultSerializer(HopscotchUniqueMultiMapSpark.Serializer.class) +public final class HopscotchUniqueMultiMapSpark> extends HopscotchMultiMapSpark { + public HopscotchUniqueMultiMapSpark() {} + public HopscotchUniqueMultiMapSpark( final int capacity ) { super(capacity); } + public HopscotchUniqueMultiMapSpark( final Collection collection ) { super(collection); } + private HopscotchUniqueMultiMapSpark( final Kryo kryo, final Input input ) { super(kryo, input); } /** in a unique multimap, uniqueness is on the entry, so we just compare entries to see if we already have one */ @Override protected BiPredicate entryCollides() { return Object::equals; } 
@SuppressWarnings("rawtypes") - public static final class Serializer extends com.esotericsoftware.kryo.Serializer { + public static final class Serializer extends com.esotericsoftware.kryo.Serializer { @Override - public void write( final Kryo kryo, final Output output, final HopscotchUniqueMultiMap hopscotchMultiMap ) { + public void write( final Kryo kryo, final Output output, final HopscotchUniqueMultiMapSpark hopscotchMultiMap ) { hopscotchMultiMap.serialize(kryo, output); } @Override - public HopscotchUniqueMultiMap read( final Kryo kryo, final Input input, final Class klass ) { - return new HopscotchUniqueMultiMap(kryo, input); + public HopscotchUniqueMultiMapSpark read( final Kryo kryo, final Input input, final Class klass ) { + return new HopscotchUniqueMultiMapSpark(kryo, input); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSet.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSet.java index 3318f053486..5893837fc40 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSet.java @@ -7,6 +7,7 @@ import com.google.common.annotations.VisibleForTesting; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.collections.SetSizeUtils; import java.io.Serializable; import java.util.*; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongBloomFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongBloomFilter.java index e05746889e7..b1cee5bca0c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongBloomFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongBloomFilter.java @@ -10,6 +10,7 @@ import 
org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.collections.SetSizeUtils; import java.util.Arrays; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSet.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSet.java index 5a5e973aabd..cc02c7d9201 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSet.java @@ -7,6 +7,7 @@ import com.google.common.annotations.VisibleForTesting; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.collections.SetSizeUtils; import java.io.Serializable; import java.util.NoSuchElementException; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/PairWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/PairWalker.java new file mode 100644 index 00000000000..bcad085284f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/PairWalker.java @@ -0,0 +1,228 @@ +package org.broadinstitute.hellbender.tools.walkers; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadWalker; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; +import 
org.broadinstitute.hellbender.tools.PrintDistantMates; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.collections.HopscotchSet; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +@CommandLineProgramProperties( + summary = "A walker that presents read pairs as pairs. Run PrintDistantMates, and use "+ + "its output as input to the PairWalker along with the original inputs to ensure "+ + "seeing all pairs as pairs. This is unnecessary if you only care about pairs "+ + "where both reads lie within your intervals.", + oneLineSummary = "A walker that presents read pairs as pairs.", + programGroup = ReadDataManipulationProgramGroup.class +) +@DocumentedFeature +public abstract class PairWalker extends ReadWalker { + + @Argument(fullName = "pair-padding", shortName = "pp", + doc = "Amount of extra padding (in bp) to add to pick up mates near your intervals.") + protected int intervalPadding = 1000; + + private RegionChecker regionChecker = null; + private final HopscotchSet pairBufferSet = new HopscotchSet<>(1000000); + private final HopscotchSet distantPairsProcessed = new HopscotchSet<>(1000000); + + @Override public List getDefaultReadFilters() { + final List readFilters = new ArrayList<>(super.getDefaultReadFilters()); + readFilters.add(ReadFilterLibrary.PRIMARY_LINE); + readFilters.add(ReadFilterLibrary.NOT_DUPLICATE); + return readFilters; + } + + @Override + public boolean requiresReads() { return true; } + + @Override + protected List transformTraversalIntervals( final List intervals, + final SAMSequenceDictionary dictionary ) { + regionChecker = new RegionChecker(intervals, dictionary); + + final List paddedIntervals = new ArrayList<>(intervals.size()); + SimpleInterval prevInterval = null; + for ( final 
SimpleInterval interval : intervals ) { + final SimpleInterval curInterval = interval.expandWithinContig(intervalPadding, dictionary); + if ( prevInterval == null ) { + prevInterval = curInterval; + } else if ( prevInterval.getContig().equals(curInterval.getContig()) && + prevInterval.getStart() <= curInterval.getEnd() + 1 && + prevInterval.getStart() <= curInterval.getEnd() + 1 ) { + prevInterval = prevInterval.mergeWithContiguous(curInterval); + } else { + paddedIntervals.add(prevInterval); + prevInterval = curInterval; + } + } + if ( prevInterval != null ) { + paddedIntervals.add(prevInterval); + } + return Collections.unmodifiableList(paddedIntervals); + } + + @Override + public void apply( final GATKRead read, + final ReferenceContext referenceContext, + final FeatureContext featureContext ) { + if ( !read.isPaired() || read.isSecondaryAlignment() || read.isSupplementaryAlignment() ) { + applyUnpaired(read); + return; + } + final boolean inInterval = regionChecker == null || regionChecker.isInInterval(read); + final PairBufferEntry pb = new PairBufferEntry(read, inInterval); + final PairBufferEntry pbMate = pairBufferSet.find(pb); + if ( pbMate == null ) { + pairBufferSet.add(pb); + } else { + if ( pb.isInInterval() || pbMate.isInInterval() ) { + if ( !pb.isDistantMate() && !pbMate.isDistantMate() ) { + apply(pbMate.getRead(), pb.getRead()); + } else { + final String readName = pb.getRead().getName(); + if ( distantPairsProcessed.contains(readName) ) { + distantPairsProcessed.remove(readName); + } else { + distantPairsProcessed.add(readName); + apply(pbMate.getRead(), pb.getRead()); + } + } + } + pairBufferSet.remove(pbMate); + } + } + + @Override + public Object onTraversalSuccess() { + int nUnpaired = 0; + for ( final PairBufferEntry pb : pairBufferSet ) { + if ( pb.isInInterval() ) { + applyUnpaired(pb.getRead()); + nUnpaired += 1; + } + } + if ( nUnpaired > 0 ) { + logger.info("There were " + nUnpaired + " incomplete pairs."); + } + return null; + } + + 
/** + * This method is called once for each pair of reads. + * The pairs will NOT be in strict coordinate order. + * The reads supplied will be the primary lines for each pair. + **/ + public abstract void apply( final GATKRead read, final GATKRead mate ); + + /** + * Unpaired reads, secondary and supplemental alignments, and other detritus comes through here. + * Also, if you haven't used PrintDistantMates, you'll get all of pairs for which we didn't + * find the mate at the end of the traversal via this method. + */ + public abstract void applyUnpaired( final GATKRead read ); + + // Maintains an iterator over a set of coordinate-sorted, disjoint intervals. + // It allows you to ask whether a read overlaps any of the intervals by calling isInInterval. + // With each call, it advances the iterator as appropriate, and checks for overlap. + // The sequence of reads passed to isInInterval must also be coordinate-sorted. + @VisibleForTesting + static final class RegionChecker { + private final List intervals; + private final SAMSequenceDictionary dictionary; + private Iterator intervalIterator; + private SimpleInterval currentInterval; + + public RegionChecker( final List intervals, final SAMSequenceDictionary dictionary ) { + this.intervals = intervals; + this.dictionary = dictionary; + reset(); + } + + public boolean isInInterval( final GATKRead read ) { + if ( currentInterval == null || read.isUnmapped() ) return false; + final String readContig = read.getContig(); + if ( !currentInterval.getContig().equals(readContig) ) { + final int readContigId = dictionary.getSequenceIndex(readContig); + int currentContigId; + while ( (currentContigId = + dictionary.getSequenceIndex(currentInterval.getContig())) < readContigId ) { + if ( !intervalIterator.hasNext() ) { + currentInterval = null; + return false; + } + currentInterval = intervalIterator.next(); + } + if ( currentContigId > readContigId ) { + return false; + } + } + + // we've advanced the iterator so that the 
current contig is the same as the read's contig + final int readStart = read.getStart(); + while ( currentInterval.getEnd() < readStart ) { + if ( !intervalIterator.hasNext() ) { + currentInterval = null; + return false; + } + currentInterval = intervalIterator.next(); + if ( !currentInterval.getContig().equals(readContig) ) { + return false; + } + } + + // we've advanced the iterator so that the current end is no smaller than the read start + return currentInterval.overlaps(read); + } + + public void reset() { + intervalIterator = intervals.iterator(); + currentInterval = intervalIterator.hasNext() ? intervalIterator.next() : null; + } + } + + // Tracks the first read of a pair we observe, while we're waiting for the mate to show up. + // Note that it's "keyed" on read name: Two PairBufferEntries are equal if they point to reads + // with the same name. + private static final class PairBufferEntry { + private final GATKRead read; + private final boolean inInterval; + private final boolean distantMate; + + public PairBufferEntry( final GATKRead read, final boolean inInterval ) { + this.distantMate = PrintDistantMates.isDistantMate(read); + this.read = this.distantMate ? 
PrintDistantMates.undoDistantMateAlterations(read) : read; + this.inInterval = inInterval; + } + + public GATKRead getRead() { return read; } + public boolean isInInterval() { return inInterval; } + public boolean isDistantMate() { return distantMate; } + + @Override + public boolean equals( final Object obj ) { + return this == obj || + (obj instanceof PairBufferEntry && + ((PairBufferEntry)obj).read.getName().equals(read.getName())); + } + + @Override + public int hashCode() { + return read.getName().hashCode(); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollection.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollection.java similarity index 88% rename from src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollection.java rename to src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollection.java index f7d9801d2de..188830e3f4d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollection.java @@ -1,13 +1,11 @@ -package org.broadinstitute.hellbender.tools.spark.utils; - -import com.esotericsoftware.kryo.DefaultSerializer; -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; +package org.broadinstitute.hellbender.utils.collections; import java.util.*; +import java.util.function.BiFunction; import java.util.function.BiPredicate; import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.Stream; /** * Multiset implementation that provides low memory overhead with a high load factor by using the hopscotch algorithm. @@ -26,7 +24,6 @@ * characteristics. The ones you see recommended everywhere that add in the final piece of state, don't have good * avalanche. 
We try to take care of this common defect with the SPREADER, but that's not a perfect solution. */ -@DefaultSerializer(HopscotchCollection.Serializer.class) public class HopscotchCollection extends AbstractCollection { // For power-of-2 table sizes add this line //private static final int maxCapacity = Integer.MAX_VALUE/2 + 1; @@ -45,9 +42,9 @@ public class HopscotchCollection extends AbstractCollection { // "end of chain" instead. we use Byte.MAX_VALUE (i.e., 0x7f) to pick off these bits. private byte[] status; - private static final double LOAD_FACTOR = .85; + protected static final double LOAD_FACTOR = .85; private static final int NO_ELEMENT_INDEX = -1; - private static final int SPREADER = 241; + protected static final int SPREADER = 241; /** make a small HopscotchCollection */ public HopscotchCollection() { this(12000); } @@ -68,63 +65,17 @@ public HopscotchCollection( final Collection collection ) { addAll(collection); } - @SuppressWarnings("unchecked") - protected HopscotchCollection( final Kryo kryo, final Input input ) { - final boolean oldReferences = kryo.getReferences(); - kryo.setReferences(false); - - capacity = input.readInt(); - size = 0; - buckets = (T[])new Object[capacity]; - status = new byte[capacity]; - int nElements = input.readInt(); - while ( nElements-- > 0 ) { - add((T)kryo.readClassAndObject(input)); - } - - kryo.setReferences(oldReferences); - } - - protected void serialize( final Kryo kryo, final Output output ) { - final boolean oldReferences = kryo.getReferences(); - kryo.setReferences(false); - - output.writeInt(capacity); - output.writeInt(size); - - // write the chain heads, and then the squatters - int count = 0; - for ( int idx = 0; idx != capacity; ++idx ) { - if ( isChainHead(idx) ) { - kryo.writeClassAndObject(output, buckets[idx]); - count += 1; - } - } - for ( int idx = 0; idx != capacity; ++idx ) { - final T val = buckets[idx]; - if ( val != null && !isChainHead(idx) ) { - kryo.writeClassAndObject(output, val); - count += 
1; - } - } - - kryo.setReferences(oldReferences); - - if ( count != size ) { - throw new IllegalStateException("Failed to serialize the expected number of objects: expected="+size+" actual="+count+"."); - } - } - @Override public final boolean add( final T entry ) { if ( entry == null ) throw new UnsupportedOperationException("This collection cannot contain null."); if ( size == capacity ) resize(); final BiPredicate collision = entryCollides(); + final int bucketIndex = hashToIndex(toKey().apply(entry)); try { - return insert(entry, collision); + return insert(entry, bucketIndex, collision); } catch ( final IllegalStateException ise ) { resize(); - return insert(entry, collision); + return insert(entry, bucketIndex, collision); } } @@ -158,6 +109,47 @@ public final T find( final Object key ) { return null; } + /** find an entry equivalent to the key, or use producer to add a new value */ + public final T findOrAdd( final Object key, final Function producer ) { + try { + return findOrAddInternal(key, producer); + } catch ( final IllegalStateException ise ) { + resize(); + return findOrAddInternal(key, producer); + } + } + + private final T findOrAddInternal( final Object key, Function producer ) { + final int bucketIndex = hashToIndex(key); + if ( !isChainHead(bucketIndex) ) { + final T entry = producer.apply(key); + if ( entry != null ) { + insert(entry, bucketIndex, entryCollides()); + } + return entry; + } + + T entry = buckets[bucketIndex]; + if ( equivalent(entry, key) ) return entry; + + int endOfChainIndex = bucketIndex; + while ( true ) { + entry = buckets[endOfChainIndex]; + if ( equivalent(entry, key) ) return entry; + final int offset = getOffset(endOfChainIndex); + if ( offset == 0 ) break; + endOfChainIndex = getIndex(endOfChainIndex, offset); + } + + entry = producer.apply(key); + if ( entry != null ) { + final int emptyBucketIndex = insertIntoChain(bucketIndex, endOfChainIndex); + buckets[emptyBucketIndex] = entry; + size += 1; + } + return entry; + 
} + /** get an iterator over each of the elements equivalent to the key */ public final Iterator findEach( final Object key ) { return new ElementIterator(key); @@ -209,6 +201,15 @@ public final boolean removeAll( final Collection collection ) { @Override public final int size() { return size; } + // next two methods are to allow efficient serialization + public Stream chainHeads() { + return IntStream.range(0, capacity).filter(this::isChainHead).mapToObj(idx -> buckets[idx]); + } + + public Stream squatters() { + return IntStream.range(0, capacity) + .filter(idx -> !isChainHead(idx) && buckets[idx] != null).mapToObj(idx -> buckets[idx]); + } /** in a general multiset there is no uniqueness criterion, so there is never a collision */ protected BiPredicate entryCollides() { return (t1, t2) -> false; } @@ -222,7 +223,7 @@ private boolean equivalent( final T entry, final Object key ) { return Objects.equals(toKey().apply(entry), key); } - private int hashToIndex( final Object entry ) { + protected int hashToIndex( final Object entry ) { // For power-of-2 table sizes substitute this line // return (SPREADER*hashVal)&(capacity-1); int result = entry == null ? 
0 : ((SPREADER * entry.hashCode()) % capacity); @@ -230,9 +231,7 @@ private int hashToIndex( final Object entry ) { return result; } - private boolean insert( final T entry, final BiPredicate collision ) { - final int bucketIndex = hashToIndex(toKey().apply(entry)); - + private boolean insert( final T entry, final int bucketIndex, final BiPredicate collision ) { // if there's a squatter where the new entry should go, move it elsewhere and put the entry there if ( buckets[bucketIndex] != null && !isChainHead(bucketIndex) ) evict(bucketIndex); @@ -405,7 +404,8 @@ private int hopscotch( final int fromIndex, final int emptyBucketIndex ) { throw new IllegalStateException("Hopscotching failed at load factor "+(1.*size/capacity)); } - private void move( int predecessorBucketIndex, final int bucketToMoveIndex, final int emptyBucketIndex ) { + private void move( final int predecessorBucketIndexArg, final int bucketToMoveIndex, final int emptyBucketIndex ) { + int predecessorBucketIndex = predecessorBucketIndexArg; int toEmptyDistance = getIndexDiff(bucketToMoveIndex, emptyBucketIndex); int nextOffset = getOffset(bucketToMoveIndex); if ( nextOffset == 0 || nextOffset > toEmptyDistance ) { @@ -443,11 +443,12 @@ private void resize() { buckets = (T[])new Object[capacity]; status = new byte[capacity]; + final Function keyFunc = toKey(); try { int idx = 0; do { final T entry = oldBuckets[idx]; - if ( entry != null ) insert(entry, (t1, t2) -> false ); + if ( entry != null ) insert(entry, hashToIndex(keyFunc.apply(entry)), (t1, t2) -> false ); } while ( (idx = (idx+127)%oldCapacity) != 0 ); } catch ( final IllegalStateException ise ) { @@ -518,7 +519,7 @@ private final class ElementIterator extends BaseIterator { ElementIterator( final Object key ) { this.key = key; - int bucketIndex = hashToIndex(key); + final int bucketIndex = hashToIndex(key); if ( !isChainHead(bucketIndex) ) return; currentIndex = bucketIndex; ensureEquivalence(); @@ -614,17 +615,4 @@ private void 
nextBucketHead() { bucketHeadIndex = NO_ELEMENT_INDEX; } } - - public static final class Serializer extends com.esotericsoftware.kryo.Serializer> { - @Override - public void write( final Kryo kryo, final Output output, final HopscotchCollection hopscotchCollection ) { - hopscotchCollection.serialize(kryo, output); - } - - @Override - public HopscotchCollection read( final Kryo kryo, final Input input, - final Class> klass ) { - return new HopscotchCollection<>(kryo, input); - } - } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMap.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMap.java new file mode 100644 index 00000000000..4b576be534b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMap.java @@ -0,0 +1,25 @@ +package org.broadinstitute.hellbender.utils.collections; + +import java.util.Collection; +import java.util.Map; +import java.util.Objects; +import java.util.function.BiPredicate; +import java.util.function.Function; + +/** + * A uniquely keyed map with O(1) operations. + * Sadly, it's not a java.util.Map, but it does behave like a java.util.Map's entrySet. 
+ */ +public final class HopscotchMap> extends HopscotchSet { + public HopscotchMap() {} + public HopscotchMap( final int capacity ) { super(capacity); } + public HopscotchMap( final Collection collection ) { super(collection); } + + /** in a map, uniqueness is on the key value, so we compare keys to see if we already have an entry for that key */ + @Override + protected BiPredicate entryCollides() { return (t1, t2) -> Objects.equals(t1.getKey(), t2.getKey()); } + + /** getKey returns the key part of a Map.Entry */ + @Override + protected Function toKey() { return T::getKey; } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMap.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMap.java new file mode 100644 index 00000000000..a2e60008d5f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMap.java @@ -0,0 +1,20 @@ +package org.broadinstitute.hellbender.utils.collections; + +import java.util.Collection; +import java.util.Map; +import java.util.function.Function; + +/** + * A map that can contain multiple values for a given key. 
+ */ +public class HopscotchMultiMap> extends HopscotchCollection { + public HopscotchMultiMap() {} + public HopscotchMultiMap( final int capacity ) { super(capacity); } + public HopscotchMultiMap( final Collection collection ) { super(collection); } + + /** + * getKey returns the key part of a Map.Entry + */ + @Override + protected Function toKey() { return T::getKey; } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchSet.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchSet.java new file mode 100644 index 00000000000..2678529e7bd --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchSet.java @@ -0,0 +1,33 @@ +package org.broadinstitute.hellbender.utils.collections; + +import java.util.Collection; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiPredicate; + +/** + * Implements Set by imposing a unique-element constraint on HopscotchCollection. + * Also implements equals and hashCode to be consistent with the documented requirements of the java Set interface. 
+ */ +public class HopscotchSet extends HopscotchCollection implements Set { + public HopscotchSet() {} + public HopscotchSet( final int capacity ) { super(capacity); } + public HopscotchSet( final Collection collection ) { super(collection); } + + @Override + public final boolean equals( final Object obj ) { + if ( this == obj ) return true; + if ( !(obj instanceof Set) ) return false; + @SuppressWarnings("rawtypes") final Set that = (Set)obj; + return this.size() == that.size() && this.containsAll(that); + } + + @Override + public final int hashCode() { return stream().mapToInt(Objects::hashCode).sum(); } + + /** + * in a set, uniqueness is on the entry, so we just compare entries to see if we already have an identical one + */ + @Override + protected BiPredicate entryCollides() { return Object::equals; } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMap.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMap.java new file mode 100644 index 00000000000..50913f50c98 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMap.java @@ -0,0 +1,18 @@ +package org.broadinstitute.hellbender.utils.collections; + +import java.util.Collection; +import java.util.Map; +import java.util.function.BiPredicate; + +/** + * A map that can contain multiple values for a given key, but distinct entries. 
+ */ +public final class HopscotchUniqueMultiMap> extends HopscotchMultiMap { + public HopscotchUniqueMultiMap() {} + public HopscotchUniqueMultiMap( final int capacity ) { super(capacity); } + public HopscotchUniqueMultiMap( final Collection collection ) { super(collection); } + + /** in a unique multimap, uniqueness is on the entry, so we just compare entries to see if we already have one */ + @Override + protected BiPredicate entryCollides() { return Object::equals; } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtils.java similarity index 97% rename from src/main/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtils.java rename to src/main/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtils.java index b91dade6cb1..f3aee7153e2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtils.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; /** * Set size utility diff --git a/src/test/java/org/broadinstitute/hellbender/tools/LocalAssemblerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/LocalAssemblerUnitTest.java new file mode 100644 index 00000000000..31201aafb8d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/LocalAssemblerUnitTest.java @@ -0,0 +1,571 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMLineParser; +import htsjdk.samtools.util.SequenceUtil; +import org.broadinstitute.hellbender.tools.LocalAssembler.*; +import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter; 
+import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; +import java.util.stream.Collectors; + +public class LocalAssemblerUnitTest { + private static final byte QMIN = LocalAssembler.QMIN_DEFAULT; + private static final int MIN_THIN_OBS = LocalAssembler.MIN_THIN_OBS_DEFAULT; + private static final int MIN_GAPFILL_COUNT = LocalAssembler.MIN_GAPFILL_COUNT_DEFAULT; + private static final int TOO_MANY_TRAVERSALS = LocalAssembler.TOO_MANY_TRAVERSALS_DEFAULT; + private static final int TOO_MANY_SCAFFOLDS = LocalAssembler.TOO_MANY_SCAFFOLDS_DEFAULT; + private static final int MIN_SV_SIZE = LocalAssembler.MIN_SV_SIZE_DEFAULT; + private static final int KMER_SET_CAPACITY = 200; + + private static final String[] SEQS_FOR_DOGBONE_GRAPH = new String[] { + "ACGCGCCGGCGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCATGGCGCAGGCGCAGAGA", + "CCGCGCCGGCGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCATGGCGCAGGCGCAGAGT" + }; + + // produces a graph that looks like this: + // ------------- --------------- + // ---------------------------------------------------- + // ------------- --------------- + // (i.e., 5 contigs, with a branch at each end of a longer contig) + public static List makeDogbone( final KmerSet kmers ) { + for ( final String seq : SEQS_FOR_DOGBONE_GRAPH ) { + KmerAdjacency.kmerize(seq, MIN_THIN_OBS, kmers); + } + final List contigs = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs); + return contigs; + } + + private static final String SEQ_FOR_LARIAT = + "ACGCGCCGGCGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCATGGCGCAGGCGCAGAGTCGCGCCGGCGCAGGCGCAGAGACACATGCTA"; + + // produces a graph that cycles back on itself + public static List makeLariat( final KmerSet kmers ) { + KmerAdjacency.kmerize(SEQ_FOR_LARIAT, MIN_THIN_OBS, kmers); + final List contigs = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs); + return contigs; + } + + + @Test + public void testKmers() { + Assert.assertTrue(Kmer.KSIZE < 32); 
+ Assert.assertTrue((Kmer.KSIZE & 1) != 0); + final Kmer kmer = new Kmer(0x0BADF00DDEADBEEFL); + for ( int call = 0; call != 4; ++call ) { + final Kmer pred = new Kmer(kmer.getPredecessorVal(call)); + Assert.assertEquals(pred.getInitialCall(), call); + Assert.assertEquals(new Kmer(pred.getSuccessorVal(kmer.getFinalCall())), kmer); + final Kmer succ = new Kmer(kmer.getSuccessorVal(call)); + Assert.assertEquals(succ.getFinalCall(), call); + Assert.assertEquals(new Kmer(succ.getPredecessorVal(kmer.getInitialCall())), kmer); + } + } + + @Test + public void testKmerAdjacencies() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + // canonicality call is here -----v + final String seq = "ACGTACGTACGTACACACGTACGTACGTACG"; + KmerAdjacency.kmerize(seq, 0, kmers); + Assert.assertEquals(kmers.size(), 1); + final KmerAdjacency kmer = kmers.iterator().next(); + Assert.assertTrue(kmer.isCanonical()); + Assert.assertEquals(kmer.getInitialCall(), 0); + Assert.assertEquals(kmer.getFinalCall(), 2); + Assert.assertEquals(kmer.getPredecessorCount(), 0); + Assert.assertEquals(kmer.getPredecessorMask(), 0); + Assert.assertNull(kmer.getSolePredecessor()); + Assert.assertEquals(kmer.getSuccessorCount(), 0); + Assert.assertEquals(kmer.getSuccessorMask(), 0); + Assert.assertNull(kmer.getSoleSuccessor()); + Assert.assertNull(kmer.getContig()); + Assert.assertEquals(kmer.getNObservations(), 0); + Assert.assertSame(kmer.rc().rc(), kmer); + Assert.assertSame(kmer.canonical(), kmer); + Assert.assertEquals(kmer.toString(), seq); + + final byte[] calls = ("A" + seq + "T").getBytes(); + final byte[] quals = new byte[calls.length]; + Arrays.fill(quals, QMIN); + KmerAdjacency.kmerize(calls, quals, QMIN, kmers); + Assert.assertEquals(kmer.getNObservations(), 1); + Assert.assertSame(KmerAdjacency.find(kmer.getKVal(), kmers), kmer); + Assert.assertSame(KmerAdjacency.find(kmer.rc().getKVal(), kmers), kmer.rc()); + Assert.assertEquals(kmer.getPredecessorCount(), 1); + 
Assert.assertEquals(kmer.getPredecessorMask(), 1); // predecessor is "A" + final KmerAdjacency prev = kmer.getSolePredecessor(); + Assert.assertNotNull(prev); + Assert.assertSame(KmerAdjacency.find(prev.getKVal(), kmers), prev); + Assert.assertEquals(prev.getNObservations(), 1); + Assert.assertSame(prev.getSoleSuccessor(), kmer); + Assert.assertSame(prev.rc().getSolePredecessor(), kmer.rc()); + Assert.assertSame(KmerAdjacency.find(prev.rc().getKVal(), kmers), prev.rc()); + Assert.assertEquals(kmer.getSuccessorCount(), 1); + Assert.assertEquals(kmer.getSuccessorMask(), 8); // successor is "T" + final KmerAdjacency next = kmer.getSoleSuccessor(); + Assert.assertNotNull(next); + Assert.assertSame(KmerAdjacency.find(next.getKVal(), kmers), next); + Assert.assertEquals(next.getNObservations(), 1); + Assert.assertSame(next.getSolePredecessor(), kmer); + Assert.assertSame(next.rc().getSoleSuccessor(), kmer.rc()); + Assert.assertSame(KmerAdjacency.find(next.rc().getKVal(), kmers), next.rc()); + + final byte[] calls2 = ("C" + seq + "C").getBytes(); + KmerAdjacency.kmerize(calls2, quals, QMIN, kmers); + Assert.assertEquals(kmer.getNObservations(), 2); + Assert.assertEquals(kmer.getPredecessorCount(), 2); + Assert.assertNull(kmer.getSolePredecessor()); + Assert.assertEquals(kmer.getSuccessorCount(), 2); + Assert.assertNull(kmer.getSoleSuccessor()); + } + + @Test + public void testSingleContigBuild() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + final String seq = "AACGTACGTACGTACACACGTACGTACGTACGT"; + final byte[] calls = seq.getBytes(); + final byte[] quals = new byte[calls.length]; + Arrays.fill(quals, QMIN); + KmerAdjacency.kmerize(calls, quals, QMIN, kmers); + final List contigs = LocalAssembler.buildContigs(kmers); + Assert.assertEquals(contigs.size(), 1); + + final Contig contigAsBuilt = contigs.get(0); + Assert.assertTrue(contigAsBuilt.isCanonical()); // by definition, the one that got built is canonical + 
Assert.assertFalse(contigAsBuilt.rc().isCanonical()); + Assert.assertSame(contigAsBuilt.canonical(), contigAsBuilt); + Assert.assertSame(contigAsBuilt.rc().canonical(), contigAsBuilt); + + // which strand the contig is built for depends on minute details of the implementation + // just grab the right one + final String contigSeq = contigAsBuilt.getSequence().toString(); + final Contig contig = seq.equals(contigSeq) ? contigAsBuilt : contigAsBuilt.rc(); + Assert.assertEquals(contig.getSequence().toString(), seq); + Assert.assertEquals(contig.getMaxObservations(), 1); + Assert.assertEquals(contig.getFirstKmer().toString(), seq.substring(0, seq.length()-2)); + Assert.assertEquals(contig.getLastKmer().toString(), seq.substring(2)); + Assert.assertEquals(contig.size(), seq.length()); + + Assert.assertSame(contig.rc().rc(), contig); + + Assert.assertEquals(SequenceUtil.reverseComplement(contig.getSequence().toString()), + contig.rc().getSequence().toString()); + } + + @Test + public void testContigConnections() { + final List contigs = makeDogbone(new KmerSet<>(KMER_SET_CAPACITY)); + Assert.assertEquals(contigs.size(), 5); + for ( final Contig contig : contigs ) { + final List predecessors = contig.getPredecessors(); + final int nPreds = predecessors.size(); + for ( int idx = 0; idx != nPreds; ++idx ) { + final Contig pred = predecessors.get(idx); + Assert.assertTrue(pred.getSuccessors().contains(contig)); + Assert.assertTrue(pred.rc().getPredecessors().contains(contig.rc())); + Assert.assertSame(contig.rc().getSuccessors().get(nPreds - idx - 1).rc(), pred); + } + final List successors = contig.getSuccessors(); + final int nSuccs = successors.size(); + for ( int idx = 0; idx != nSuccs; ++idx ) { + final Contig succ = successors.get(idx); + Assert.assertTrue(succ.getPredecessors().contains(contig)); + Assert.assertTrue(succ.rc().getSuccessors().contains(contig.rc())); + Assert.assertSame(contig.rc().getPredecessors().get(nSuccs - idx - 1).rc(), succ); + } + } + } + + 
@Test + public void testPaths() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + makeDogbone(kmers); + final PathBuilder pathBuilder = new PathBuilder(kmers); + final byte[] path1Calls = SEQS_FOR_DOGBONE_GRAPH[0].getBytes(); + final Path path1 = new Path(path1Calls, pathBuilder); + Assert.assertEquals(path1.getParts().size(), 3); + Assert.assertTrue(pathsEquivalent(path1, path1.rc().rc())); + + // exercise our rudimentary error correction by changing one call in the middle of a contig + path1Calls[path1Calls.length / 2] = (byte)(path1Calls[path1Calls.length / 2] == 'A' ? 'C' : 'A'); + Assert.assertTrue(pathsEquivalent(path1, new Path(path1Calls, pathBuilder))); + + // path of RC sequence ought to be equivalent to RC of path + SequenceUtil.reverseComplement(path1Calls); + Assert.assertTrue(pathsEquivalent(path1.rc(), new Path(path1Calls, pathBuilder))); + + final Path path2 = new Path(SEQS_FOR_DOGBONE_GRAPH[1].getBytes(), pathBuilder); + Assert.assertEquals(path2.getParts().size(), 3); + Assert.assertFalse(pathsEquivalent(path1, path2)); + } + + // this looks like it should just be an equals method on Path and their parts, + // but Paths are never tested for equality by the LocalAssembler, just by the unit tests. + // so it's pulled out and implemented here. + private static boolean pathsEquivalent( final Path path1, final Path path2 ) { + final List parts1 = path1.getParts(); + final List parts2 = path2.getParts(); + if ( parts1.size() != parts2.size() ) { + return false; + } + final int nParts = parts1.size(); + for ( int idx = 0; idx != nParts; ++idx ) { + if ( !pathPartsEquivalent(parts1.get(idx), parts2.get(idx)) ) { + return false; + } + } + return true; + } + + private static boolean pathPartsEquivalent( final PathPart part1, final PathPart part2 ) { + final String part1Seq = part1.getSequence() == null ? null : part1.getSequence().toString(); + final String part2Seq = part2.getSequence() == null ? 
null : part2.getSequence().toString(); + return part1.getContig() == part2.getContig() && + Objects.equals(part1Seq, part2Seq) && + part1.getStart() == part2.getStart() && + part1.getStop() == part2.getStop(); + } + + // test removal of poorly attested contigs + @Test + public void testGraphTrimming() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + final List contigs = makeDogbone(kmers); + Assert.assertEquals(contigs.size(), 5); + + // tear the ears off the dog bone -- the gap-fill kmerization left them at 0 max-observations + LocalAssembler.removeThinContigs(contigs, MIN_THIN_OBS, kmers); + Assert.assertEquals(contigs.size(), 1); + } + + // test suppression of removing cut-point contigs + @Test + public void testCutPoints() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + makeDogbone(kmers); + + // adjust the number of observations so that the central contig has no observations + // and the ears have MIN_THIN_OBS observations + for ( final KmerAdjacency kmer : kmers ) { + if ( kmer.getNObservations() > 0 ) { + kmer.observe(null, null, -kmer.getNObservations()); + } else { + kmer.observe(null, null, MIN_THIN_OBS); + } + kmer.clearContig(); + } + + // re-create the graph, and make sure central contig isn't removed despite having no + // observations (because it's a cut point) + List contigs2 = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs2); + LocalAssembler.removeThinContigs(contigs2, MIN_THIN_OBS, kmers); + Assert.assertEquals(contigs2.size(), 5); + } + + // test joining adjacent contigs that have no branching + @Test + public void testWeldPipes() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + final List contigs = makeDogbone(kmers); + for ( final Contig contig : contigs ) { + // find the central contig + if ( contig.getPredecessors().size() > 1 && contig.getSuccessors().size() > 1 ) { + final KmerAdjacency prevKmer = contig.getPredecessors().get(0).getLastKmer(); + prevKmer.observe(null, 
null, MIN_THIN_OBS); + final KmerAdjacency nextKmer = contig.getSuccessors().get(0).getFirstKmer(); + nextKmer.observe(null, null, MIN_THIN_OBS); + break; + } + } + for ( final KmerAdjacency kmer : kmers ) { + kmer.clearContig(); + } + final List contigs2 = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs2); + LocalAssembler.removeThinContigs(contigs2, MIN_THIN_OBS, kmers); + Assert.assertEquals(contigs2.size(), 3); + LocalAssembler.weldPipes(contigs2); + Assert.assertEquals(contigs2.size(), 1); + Assert.assertEquals(contigs2.get(0).getSequence().length(), SEQS_FOR_DOGBONE_GRAPH[0].length()); + } + + @Test + public void testCycleDetection() { + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + final List contigs = makeLariat(kmers); + Assert.assertEquals(contigs.size(), 2); + final Contig c1 = contigs.get(0); + final Contig c2 = contigs.get(1); + final Contig longer = c1.getSequence().length() > c2.getSequence().length() ? c1 : c2; + + // longer contig is a self-cycle in the lariat sequence + Assert.assertTrue(contigInList(longer, longer.getSuccessors())); + Assert.assertTrue(contigInList(longer, longer.getPredecessors())); + + LocalAssembler.markCycles(contigs); + Assert.assertTrue(longer.isCycleMember()); + final Contig shorter = longer == c1 ? 
c2 : c1; + Assert.assertFalse(shorter.isCycleMember()); + } + + private boolean contigInList( final Contig target, final List contigs ) { + for ( final Contig contig : contigs ) { + if ( contig == target ) return true; + } + return false; + } + + @Test + public void testGapFilling() { + final byte[] calls = SEQ_FOR_LARIAT.getBytes(); + final byte[] quals = new byte[calls.length]; + Arrays.fill(quals, QMIN); + quals[quals.length / 2] = 0; // one bad quality score in the middle + final GATKRead read = + ArtificialReadUtils.createArtificialRead(calls, quals, calls.length + "M"); + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + final List reads = new ArrayList<>(MIN_GAPFILL_COUNT); + for ( int iii = 0; iii != MIN_GAPFILL_COUNT; ++iii ) { + reads.add(read); + KmerAdjacency.kmerize(calls, quals, QMIN, kmers); + } + final List contigs = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs); + LocalAssembler.markCycles(contigs); + + // broken lariat + Assert.assertEquals(contigs.size(), 3); + Assert.assertFalse(contigs.get(0).isCycleMember() || contigs.get(1).isCycleMember()); + + // can we find and fill a gap? 
+ Assert.assertTrue(LocalAssembler.fillGaps(kmers, MIN_GAPFILL_COUNT, reads)); + + final List contigs2 = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs2); + LocalAssembler.markCycles(contigs2); + + // lariat healed by gap fill + Assert.assertEquals(contigs2.size(), 2); + Assert.assertTrue(contigs2.get(0).isCycleMember() || contigs2.get(1).isCycleMember()); + } + + @Test + public void testTraversalPhasing() { + final SAMFileHeader header = + ArtificialReadUtils.createArtificialSamHeader(1, 1, 100000000); + final byte[] quals = new byte[SEQS_FOR_DOGBONE_GRAPH[0].length()]; + Arrays.fill(quals, QMIN); + final GATKRead read1 = ArtificialReadUtils.createArtificialRead(header, + "read1", 0, 1, SEQS_FOR_DOGBONE_GRAPH[0].getBytes(), quals); + final GATKRead read2 = ArtificialReadUtils.createArtificialRead(header, + "read2", 0, 1, SEQS_FOR_DOGBONE_GRAPH[1].getBytes(), quals); + final List reads = new ArrayList<>(2); + reads.add(read1); + reads.add(read2); + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + LocalAssembler.kmerizeReads(reads, QMIN, kmers); + final List contigs = LocalAssembler.buildContigs(kmers); + LocalAssembler.connectContigs(contigs); + final List readPaths = LocalAssembler.pathReads(kmers, reads); + Assert.assertEquals(readPaths.size(), 2); + Assert.assertEquals(readPaths.get(0).getParts().size(), 3); + Assert.assertEquals(readPaths.get(1).getParts().size(), 3); + Assert.assertTrue(pathPartsEquivalent(readPaths.get(0).getParts().get(1), + readPaths.get(1).getParts().get(1))); + final Map> contigTransitsMap = + LocalAssembler.collectTransitPairCounts(contigs, readPaths); + Assert.assertEquals(contigTransitsMap.size(), 2); + + // we put transits for both strands into the map, so the two entries ought to be RC of each other + final Iterator>> mapItr = + contigTransitsMap.entrySet().iterator(); + final Map.Entry> entry1 = mapItr.next(); + final Map.Entry> entry2 = mapItr.next(); + Assert.assertSame(entry1.getKey().rc(), 
entry2.getKey()); + Assert.assertEquals(entry1.getValue().size(), 2); + Assert.assertEquals(entry2.getValue().size(), 2); + final List list1 = entry1.getValue(); + final List list2 = entry2.getValue(); + Assert.assertEquals(list1.get(0).getRC(), list2.get(0)); + Assert.assertEquals(list1.get(1).getRC(), list2.get(1)); + + final Set allTraversals = + LocalAssembler.traverseAllPaths(contigs, readPaths, TOO_MANY_TRAVERSALS, contigTransitsMap); + Assert.assertEquals(allTraversals.size(), 2); + final Iterator travItr = allTraversals.iterator(); + final String trav1Seq = travItr.next().getSequence(); + final String trav2Seq = travItr.next().getSequence(); + if ( trav1Seq.equals(SEQS_FOR_DOGBONE_GRAPH[0]) || + SequenceUtil.reverseComplement(trav1Seq).equals(SEQS_FOR_DOGBONE_GRAPH[0]) ) { + Assert.assertTrue(trav2Seq.equals(SEQS_FOR_DOGBONE_GRAPH[1]) || + SequenceUtil.reverseComplement(trav2Seq).equals(SEQS_FOR_DOGBONE_GRAPH[1])); + } else { + Assert.assertTrue(trav1Seq.equals(SEQS_FOR_DOGBONE_GRAPH[1]) || + SequenceUtil.reverseComplement(trav1Seq).equals(SEQS_FOR_DOGBONE_GRAPH[1])); + Assert.assertTrue(trav2Seq.equals(SEQS_FOR_DOGBONE_GRAPH[0]) || + SequenceUtil.reverseComplement(trav2Seq).equals(SEQS_FOR_DOGBONE_GRAPH[0])); + } + } + + @Test + public void testScaffolds() { + final String seq1 = SEQS_FOR_DOGBONE_GRAPH[0]; + final byte[] quals = new byte[seq1.length() - 1]; + Arrays.fill(quals, QMIN); + + // this time no read transits the central contig + final SAMFileHeader header = + ArtificialReadUtils.createArtificialSamHeader(1, 1, 100000000); + final GATKRead read1 = ArtificialReadUtils.createArtificialRead(header, + "read1", 0, 1, seq1.substring(1).getBytes(), quals); + final GATKRead read2 = ArtificialReadUtils.createArtificialRead(header, + "read2", 0, 1, seq1.substring(0, seq1.length() - 1).getBytes(), quals); + final String seq2 = SEQS_FOR_DOGBONE_GRAPH[1]; + final GATKRead read3 = ArtificialReadUtils.createArtificialRead(header, + "read3", 0, 1, 
seq2.substring(1).getBytes(), quals); + final GATKRead read4 = ArtificialReadUtils.createArtificialRead(header, + "read4", 0, 1, seq2.substring(0, seq2.length() - 1).getBytes(), quals); + final List reads = new ArrayList<>(4); + reads.add(read1); + reads.add(read2); + reads.add(read3); + reads.add(read4); + + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + LocalAssembler.kmerizeReads(reads, QMIN, kmers); + final List contigs = LocalAssembler.buildContigs(kmers); + Assert.assertEquals(contigs.size(), 5); // same dogbone as before + LocalAssembler.connectContigs(contigs); + final List readPaths = LocalAssembler.pathReads(kmers, reads); + final Map> contigTransitsMap = + LocalAssembler.collectTransitPairCounts(contigs, readPaths); + Assert.assertEquals(contigTransitsMap.size(), 0); + final List allTraversals = new ArrayList<>( + LocalAssembler.traverseAllPaths(contigs, readPaths, TOO_MANY_TRAVERSALS, contigTransitsMap)); + + // this should reconstruct the original dogbone sequences, but also another pair + // with the phasing of the initial and final SNP swapped (since we have no transits + // to establish phasing) + final Collection scaffolds = + LocalAssembler.createScaffolds(allTraversals, TOO_MANY_SCAFFOLDS, MIN_SV_SIZE); + Assert.assertEquals(scaffolds.size(), 4); + List scaffoldSeqs = + scaffolds.stream().map(Traversal::getSequence).collect(Collectors.toList()); + Assert.assertTrue(containsSeqOrRC(SEQS_FOR_DOGBONE_GRAPH[0], scaffoldSeqs)); + Assert.assertTrue(containsSeqOrRC(SEQS_FOR_DOGBONE_GRAPH[1], scaffoldSeqs)); + final String phaseRev1 = + SEQS_FOR_DOGBONE_GRAPH[1].charAt(0) + SEQS_FOR_DOGBONE_GRAPH[0].substring(1); + Assert.assertTrue(containsSeqOrRC(phaseRev1, scaffoldSeqs)); + final String phaseRev2 = + SEQS_FOR_DOGBONE_GRAPH[0].charAt(0) + SEQS_FOR_DOGBONE_GRAPH[1].substring(1); + Assert.assertTrue(containsSeqOrRC(phaseRev2, scaffoldSeqs)); + } + + public boolean containsSeqOrRC( final String seq, final List scaffoldSeqs ) { + return 
scaffoldSeqs.contains(seq) || + scaffoldSeqs.contains(SequenceUtil.reverseComplement(seq)); + } + + @Test + public void testCycles() { + final String[] seqs = { + "ACGCGCCGGCGCAGGCGCAGAGACACATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACTACCGCGTCCAGGGGTGGAGGCATGGCGCAGGCGCAGAGA", + "TCGCGCCGGCGCAGGCGCAGAGACACATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + }; + final SAMFileHeader header = + ArtificialReadUtils.createArtificialSamHeader(1, 1, 100000000); + final byte[] quals1 = new byte[seqs[0].length()]; + Arrays.fill(quals1, QMIN); + final GATKRead read1 = ArtificialReadUtils.createArtificialRead(header, + "read1", 0, 1, seqs[0].getBytes(), quals1); + final byte[] quals2 = new byte[seqs[1].length()]; + Arrays.fill(quals2, QMIN); + final GATKRead read2 = ArtificialReadUtils.createArtificialRead(header, + "read2", 0, 1, seqs[1].getBytes(), quals2); + final List reads = new ArrayList<>(2); + reads.add(read1); + reads.add(read2); + + final KmerSet kmers = new KmerSet<>(KMER_SET_CAPACITY); + LocalAssembler.kmerizeReads(reads, QMIN, kmers); + final List contigs = LocalAssembler.buildContigs(kmers); + Assert.assertEquals(contigs.size(), 5); // same dogbone as before + LocalAssembler.connectContigs(contigs); + LocalAssembler.markCycles(contigs); + final List readPaths = LocalAssembler.pathReads(kmers, reads); + final Map> contigTransitsMap = + LocalAssembler.collectTransitPairCounts(contigs, readPaths); + // the polyA contig, the one just upstream, and their RCs make 4 + Assert.assertEquals(contigTransitsMap.size(), 4); + final List traversals = new ArrayList<>( + LocalAssembler.traverseAllPaths(contigs, readPaths, TOO_MANY_TRAVERSALS, contigTransitsMap)); + Assert.assertEquals(traversals.size(), 2); + + // this should reconstruct the original dogbone sequences, but also another pair + // with the phasing of the initial and final SNP swapped (since we have no transits + // to establish phasing) + final Collection scaffolds = + LocalAssembler.createScaffolds(traversals, TOO_MANY_SCAFFOLDS, 
MIN_SV_SIZE); + Assert.assertEquals(scaffolds.size(), 2); + } + + @Test + public void testTrimOverruns() { + final SAMFileHeader header = + ArtificialReadUtils.createArtificialSamHeader(1, 1, 100000000); + final SAMLineParser samLineParser = new SAMLineParser(header); + + // two reads that start and end at the same place with a removable soft clip appropriately placed on each + final GATKRead read1 = new SAMRecordToGATKReadAdapter( + samLineParser.parseLine("read1\t163\t1\t5113820\t49\t111M40S\t=\t5113820\t111\t"+ + "ACAGAGACAGGAAGAAGTACGCGTGGGGGGCCCAGTCTGGATATGCTGAGTGGGCGGTGCCCCACTCCAAGTGTAGTGCACAGAGAAGGGCTGGAGTTACAGGCCTCCTTGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTGTATTGC\t"+ + "???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????")); + final GATKRead mate1 = new SAMRecordToGATKReadAdapter( + samLineParser.parseLine("mate1\t83\t1\t5113820\t49\t40S111M\t=\t5113820\t-111\t"+ + "GTTGGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTACAGAGACAGGAAGAAGTACGCGTGGGGGGCCCAGTCTGGATATGCTGAGTGGGCGGTGCCCCACTCCAAGTGTAGTGCACAGAGAAGGGCTGGAGTTACAGGCCTCCTTG\t"+ + "???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????")); + LocalAssembler.trimOverruns(read1, mate1); + + // expected results trim soft clip from each read + Assert.assertEquals(read1.getCigar().toString(), "111M40H"); // changed 111M40S to 111M40H + Assert.assertEquals(read1.getBasesNoCopy().length, 111); // hard-clipped the calls and quals + Assert.assertEquals(read1.getBaseQualitiesNoCopy().length, 111); + Assert.assertEquals(mate1.getCigar().toString(), "40H111M"); // changed 40S111M to 40H111M + Assert.assertEquals(mate1.getBasesNoCopy().length, 111); // hard-clipped the calls and quals + Assert.assertEquals(mate1.getBaseQualitiesNoCopy().length, 111); + } + + + @Test + public void testNoTrimOverruns() { + final SAMFileHeader header = + 
ArtificialReadUtils.createArtificialSamHeader(1, 1, 100000000); + final SAMLineParser samLineParser = new SAMLineParser(header); + // can't trim this read pair: cigar is 4S56M91S and the 4S suggests this might be chimeric + final GATKRead read1 = new SAMRecordToGATKReadAdapter( + samLineParser.parseLine("read1\t99\t1\t5114132\t0\t4S56M91S\t=\t5114132\t56\t"+ + "GAGCTGGGGGTTGAGTGTGGAGGAGCTGGGAGTGGTGGGGGAGCTGGGGGTTGAGTGTGGAGGAGCTGGGAGTGGTGGGGGGGCTGGGGGTTGAGTGTGGAGGTGCTGGGAGCGGTGGGGGGGCTGGGGGTTGAGTGTGGAGGTCGGGGGA\t"+ + "??????????????????????????????????????????????????????????????+5?????????????????+?5???????5???+??????++5&55???5$+??+5?5+555???5??55?+5555??+??########")); + final GATKRead mate1 = new SAMRecordToGATKReadAdapter( + samLineParser.parseLine("mate1\t147\t1\t5114132\t0\t95S56M\t=\t5114132\t-56\t"+ + "GTAGGGTGTGGGGGGTGGGGTGGGGGTGGGGGGGCGGGGGGGGTCGGGGGGGGGGTGGGGGTTGGGTGGGGGGGCGACGGGGTTGGGGGGGGGGCTGGGGGTTGAGGGTGGAGGAGCTGGGAGTGGGGGGGGAGCTGGGGGTTGAGTGTGG\t"+ + "###############################################################################?55++5'5?'555'5++???55++555'&+?'5??55++??555+?5'????????????????????????")); + LocalAssembler.trimOverruns(read1, mate1); + + // expected results are all unchanged from original + Assert.assertEquals(read1.getCigar().toString(), "4S56M91S"); + Assert.assertEquals(read1.getBasesNoCopy().length, 151); + Assert.assertEquals(read1.getBaseQualitiesNoCopy().length, 151); + Assert.assertEquals(mate1.getCigar().toString(), "95S56M"); + Assert.assertEquals(mate1.getBasesNoCopy().length, 151); + Assert.assertEquals(mate1.getBaseQualitiesNoCopy().length, 151); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/PrintDistantMatesUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/PrintDistantMatesUnitTest.java new file mode 100644 index 00000000000..d81e865f714 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/PrintDistantMatesUnitTest.java @@ -0,0 +1,27 @@ +package 
org.broadinstitute.hellbender.tools; + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.List; + +public class PrintDistantMatesUnitTest { + @Test + public void testPrintDistantMatesBasics() { + final SAMFileHeader hdr = + ArtificialReadUtils.createArtificialSamHeader(1, 1, 5000); + final List readPair = + ArtificialReadUtils.createPair(hdr, "r1", 100, 0, + 1501, 3501, true, false); + final GATKRead leftRead = readPair.get(0); + final GATKRead rightRead = readPair.get(1); + final GATKRead leftDistantMate = PrintDistantMates.doDistantMateAlterations(leftRead); + Assert.assertEquals(leftDistantMate.getAssignedContig(), rightRead.getContig()); + Assert.assertEquals(leftDistantMate.getAssignedStart(), rightRead.getStart()); + Assert.assertTrue(PrintDistantMates.isDistantMate(leftDistantMate)); + Assert.assertEquals(leftRead, PrintDistantMates.undoDistantMateAlterations(leftDistantMate)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSparkUnitTest.java index c89cdda3744..232cefd3b1e 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSparkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/evidence/FindBreakpointEvidenceSparkUnitTest.java @@ -9,8 +9,8 @@ import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.spark.sv.utils.*; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMap; +import 
org.broadinstitute.hellbender.tools.spark.utils.HopscotchSetSpark; +import org.broadinstitute.hellbender.tools.spark.utils.HopscotchUniqueMultiMapSpark; import org.broadinstitute.hellbender.tools.spark.utils.IntHistogram; import org.broadinstitute.hellbender.utils.IntHistogramTest; import org.broadinstitute.hellbender.utils.read.GATKRead; @@ -95,8 +95,8 @@ public void getKmerIntervalsTest() { killSet.add(SVKmerizer.toKmer(seq2,kmer)); Assert.assertEquals( killSet.size(), 2); - final HopscotchUniqueMultiMap qNameMultiMap = - new HopscotchUniqueMultiMap<>(expectedAssemblyQNames.size()); + final HopscotchUniqueMultiMapSpark qNameMultiMap = + new HopscotchUniqueMultiMapSpark<>(expectedAssemblyQNames.size()); // an empty qname map should produce a "too few kmers" disposition for the interval final List alignedAssemblyOrExcuseList = @@ -109,10 +109,10 @@ public void getKmerIntervalsTest() { expectedQNames.stream() .map(qName -> new QNameAndInterval(qName, 0)) .forEach(qNameMultiMap::add); - final HopscotchUniqueMultiMap actualKmerAndIntervalSet = - new HopscotchUniqueMultiMap<>( + final HopscotchUniqueMultiMapSpark actualKmerAndIntervalSet = + new HopscotchUniqueMultiMapSpark<>( FindBreakpointEvidenceSpark.getKmerIntervals(params, readMetadataExpected, ctx, qNameMultiMap, - 1, new HopscotchSet<>(0), + 1, new HopscotchSetSpark<>(0), reads, filter, logger)._2()); final Set actualKmers = new HashSet<>(SVUtils.hashMapCapacity(actualKmerAndIntervalSet.size())); for ( final KmerAndInterval kmerAndInterval : actualKmerAndIntervalSet ) { @@ -125,8 +125,8 @@ public void getKmerIntervalsTest() { @Test(groups = "sv") public void getAssemblyQNamesTest() { final Set expectedKmers = SVFileUtils.readKmersFile(kmersFile, params.kSize); - final HopscotchUniqueMultiMap kmerAndIntervalSet = - new HopscotchUniqueMultiMap<>(expectedKmers.size()); + final HopscotchUniqueMultiMapSpark kmerAndIntervalSet = + new HopscotchUniqueMultiMapSpark<>(expectedKmers.size()); expectedKmers.stream(). 
map(kmer -> new KmerAndInterval(kmer, 0)) .forEach(kmerAndIntervalSet::add); @@ -140,8 +140,8 @@ public void getAssemblyQNamesTest() { @Test(groups = "sv") public void generateFastqsTest() { - final HopscotchUniqueMultiMap qNameMultiMap = - new HopscotchUniqueMultiMap<>(expectedAssemblyQNames.size()); + final HopscotchUniqueMultiMapSpark qNameMultiMap = + new HopscotchUniqueMultiMapSpark<>(expectedAssemblyQNames.size()); expectedAssemblyQNames.stream() .map(qName -> new QNameAndInterval(qName, 0)) .forEach(qNameMultiMap::add); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSetTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSetTest.java index 7a4512aea64..e1f89869d4b 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSetTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LongHopscotchSetTest.java @@ -5,6 +5,7 @@ import com.esotericsoftware.kryo.io.Output; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.utils.collections.SetSizeUtils; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/PairWalkerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/PairWalkerUnitTest.java new file mode 100644 index 00000000000..5c99852a3d5 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/PairWalkerUnitTest.java @@ -0,0 +1,136 @@ +package org.broadinstitute.hellbender.tools.walkers; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.PrintDistantMates; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import 
org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +public class PairWalkerUnitTest { + final static SAMFileHeader hdr = + ArtificialReadUtils.createArtificialSamHeader(3, 1, 5000); + final static SAMSequenceDictionary dict = hdr.getSequenceDictionary(); + final static List intervalList = Arrays.asList( + new SimpleInterval(dict.getSequence(0).getSequenceName(), 1001, 2000), + new SimpleInterval(dict.getSequence(1).getSequenceName(), 1001, 2000), + new SimpleInterval(dict.getSequence(1).getSequenceName(), 3001, 4000), + new SimpleInterval(dict.getSequence(2).getSequenceName(), 1001, 2000) + ); + + private static GATKRead createRead( final int refIndex, final int alignStart ) { + return ArtificialReadUtils.createArtificialRead(hdr, "r1", refIndex, alignStart, 100); + } + + @Test + public void testRegionCheckerBasics() { + final PairWalker.RegionChecker regionChecker = new PairWalker.RegionChecker(intervalList, dict); + Assert.assertFalse(regionChecker.isInInterval(createRead(0, 501))); + Assert.assertTrue(regionChecker.isInInterval(createRead(0, 1501))); + Assert.assertFalse(regionChecker.isInInterval(createRead(0, 2501))); + Assert.assertFalse(regionChecker.isInInterval(createRead(1, 501))); + Assert.assertTrue(regionChecker.isInInterval(createRead(1, 1501))); + Assert.assertTrue(regionChecker.isInInterval(createRead(1, 1701))); + Assert.assertTrue(regionChecker.isInInterval(createRead(1, 1901))); + Assert.assertFalse(regionChecker.isInInterval(createRead(1, 2501))); + Assert.assertFalse(regionChecker.isInInterval(createRead(1, 2901))); + Assert.assertTrue(regionChecker.isInInterval(createRead(1, 3501))); + Assert.assertFalse(regionChecker.isInInterval(createRead(1, 4501))); + 
Assert.assertFalse(regionChecker.isInInterval(createRead(2, 501))); + Assert.assertTrue(regionChecker.isInInterval(createRead(2, 1501))); + Assert.assertFalse(regionChecker.isInInterval(createRead(2, 2501))); + } + + @Test + public void testRegionCheckerEdges() { + final PairWalker.RegionChecker regionChecker = new PairWalker.RegionChecker(intervalList, dict); + Assert.assertFalse(regionChecker.isInInterval(createRead(0, 901))); + Assert.assertTrue(regionChecker.isInInterval(createRead(0, 902))); + Assert.assertTrue(regionChecker.isInInterval(createRead(0, 2000))); + Assert.assertFalse(regionChecker.isInInterval(createRead(0, 2001))); + } + + @Test + public void testRegionCheckerIntervalSkipping() { + final PairWalker.RegionChecker regionChecker = new PairWalker.RegionChecker(intervalList, dict); + Assert.assertTrue(regionChecker.isInInterval(createRead(1, 1501))); + Assert.assertTrue(regionChecker.isInInterval(createRead(2, 1501))); + } + + private final static class PairWalkerTester extends PairWalker { + int nPairs = 0; + public void apply( final GATKRead read, final GATKRead mate ) { nPairs += 1; } + public void applyUnpaired( final GATKRead read ) { + throw new GATKException("not expecting unpaired reads during testing"); + } + } + + private static int countValidPairs( final List reads ) { + final PairWalkerTester tester = new PairWalkerTester(); + tester.transformTraversalIntervals(intervalList, dict); + for ( final GATKRead read : reads ) { + tester.apply(read, null, null); + } + return tester.nPairs; + } + + private static List createReadPair( final int refIndex, + final int leftStart, + final int rightStart ) { + return ArtificialReadUtils.createPair(hdr, "r1", 100, refIndex, + leftStart, rightStart, true, false); + } + + @Test + public void testPairWalkerBasics() { + Assert.assertEquals(countValidPairs(createReadPair(0, 501, 1501)), 1); + Assert.assertEquals(countValidPairs(createReadPair(0, 1501, 1701)), 1); + 
Assert.assertEquals(countValidPairs(createReadPair(0, 1701, 2501)), 1); + Assert.assertEquals(countValidPairs(createReadPair(0, 2501, 3501)), 0); + Assert.assertEquals(countValidPairs(createReadPair(0, 501, 3501)), 0); + } + + @Test + public void testUnmapped() { + final List pair = createReadPair(0, 1501, 3501); + pair.get(1).setIsUnplaced(); + Assert.assertEquals(countValidPairs(pair), 1); + } + + @Test + public void testPairWalkerInterleavedPair() { + final List reads = new ArrayList<>(4); + reads.addAll(createReadPair(0, 501, 1701)); + reads.addAll(createReadPair(0, 1501, 2501)); + reads.sort(Comparator.comparingInt(GATKRead::getStart)); + Assert.assertEquals(countValidPairs(reads), 2); + } + + private static List createDistantReadPair( final int refIndex, + final int leftStart, + final int rightStart ) { + final List distantPair = createReadPair(refIndex, leftStart, rightStart); + final GATKRead leftRead = distantPair.get(0); + final GATKRead rightRead = distantPair.get(1); + final GATKRead leftDistantMate = + PrintDistantMates.doDistantMateAlterations(leftRead); + final GATKRead rightDistantMate = + PrintDistantMates.doDistantMateAlterations(rightRead); + return Arrays.asList(leftRead, rightDistantMate, rightRead, leftDistantMate); + } + + @Test + public void testPairWalkerDistantPairs() { + Assert.assertEquals(countValidPairs(createDistantReadPair(1, 1501, 3501)), 1); + Assert.assertEquals(countValidPairs(createDistantReadPair(1, 501, 3501)), 1); + Assert.assertEquals(countValidPairs(createDistantReadPair(1, 1501, 4501)), 1); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollectionTest.java similarity index 93% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollectionTest.java index 
8a72457b81f..90570eda792 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchCollectionTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchCollectionTest.java @@ -1,14 +1,11 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.GATKBaseTest; import org.testng.Assert; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -51,7 +48,7 @@ void createFromCollectionTest() { @Test void addTest() { final HopscotchCollection hopscotchCollection = new HopscotchCollection<>(testVals.size()); - testVals.stream().forEach(hopscotchCollection::add); + hopscotchCollection.addAll(testVals); Assert.assertEquals(hopscotchCollection.size(), testVals.size()); Assert.assertTrue(hopscotchCollection.containsAll(testVals)); } @@ -145,7 +142,7 @@ void removeTest() { void removeEachTest() { final HopscotchCollection hopscotchCollection = new HopscotchCollection<>(testVals); Assert.assertFalse(hopscotchCollection.removeEach(notInTestVals)); - for ( final Integer value : new HashSet(testVals) ) { + for ( final Integer value : new HashSet<>(testVals) ) { Assert.assertTrue(hopscotchCollection.removeEach(value)); } Assert.assertTrue(hopscotchCollection.isEmpty()); @@ -154,9 +151,9 @@ void removeEachTest() { @Test void removeAllTest() { final HopscotchCollection hopscotchCollection = new HopscotchCollection<>(testVals); - Assert.assertFalse(hopscotchCollection.removeAll(Arrays.asList(notInTestVals))); - Assert.assertTrue(hopscotchCollection.removeAll(Arrays.asList(testVals.get(0)))); - Assert.assertTrue(hopscotchCollection.removeAll(new HashSet(testVals))); + 
Assert.assertFalse(hopscotchCollection.removeAll(Collections.singletonList(notInTestVals))); + Assert.assertTrue(hopscotchCollection.removeAll(Collections.singletonList(testVals.get(0)))); + Assert.assertTrue(hopscotchCollection.removeAll(new HashSet<>(testVals))); Assert.assertTrue(hopscotchCollection.isEmpty()); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMapTest.java similarity index 98% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMapTest.java index 794bbea937d..baffd646453 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMapTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMapTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.GATKBaseTest; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMapTest.java similarity index 97% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMapTest.java index 655e9b514bb..d32bf9730ab 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchMultiMapTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchMultiMapTest.java @@ -1,7 +1,7 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import 
org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMapTest.IntPair; +import org.broadinstitute.hellbender.utils.collections.HopscotchMapTest.IntPair; import org.broadinstitute.hellbender.GATKBaseTest; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTest.java similarity index 98% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTest.java index 06fa5f419e2..47122d1e1f4 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTimingTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTimingTest.java similarity index 98% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTimingTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTimingTest.java index ae2ae525896..7eb9097ebc1 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchSetTimingTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchSetTimingTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import 
org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMapTest.java similarity index 86% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMapTest.java index 868e39bfd07..3369f716aeb 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/HopscotchUniqueMultiMapTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/HopscotchUniqueMultiMapTest.java @@ -1,6 +1,6 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; -import org.broadinstitute.hellbender.tools.spark.utils.HopscotchMapTest.IntPair; +import org.broadinstitute.hellbender.utils.collections.HopscotchMapTest.IntPair; import org.broadinstitute.hellbender.GATKBaseTest; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSetTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/LargeLongHopscotchSetTest.java similarity index 97% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSetTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/LargeLongHopscotchSetTest.java index c36a982bcbc..fd106a0bf6f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/LargeLongHopscotchSetTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/LargeLongHopscotchSetTest.java @@ -1,10 +1,12 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import com.esotericsoftware.kryo.Kryo; import 
com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.tools.spark.utils.LargeLongHopscotchSet; +import org.broadinstitute.hellbender.tools.spark.utils.LongIterator; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtilsTest.java b/src/test/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtilsTest.java similarity index 93% rename from src/test/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtilsTest.java rename to src/test/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtilsTest.java index 3980a8e8596..a07417cf019 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/utils/SetSizeUtilsTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/collections/SetSizeUtilsTest.java @@ -1,6 +1,7 @@ -package org.broadinstitute.hellbender.tools.spark.utils; +package org.broadinstitute.hellbender.utils.collections; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.utils.collections.SetSizeUtils; import org.testng.Assert; import org.testng.annotations.Test;