Skip to content

Commit

Permalink
full code rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
akiezun committed Apr 19, 2015
1 parent 0d41980 commit 0ea45ee
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 77 deletions.
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
package org.broadinstitute.hellbender.engine.filters;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;

import java.util.Iterator;
import java.util.regex.Pattern;

/**
* Filter out reads with wonky cigar strings.
Expand All @@ -16,84 +14,50 @@
* - No reads that are fully hard or soft clipped
* - No reads that have consecutive indels in the cigar (II, DD, ID or DI)
*
* ps: apparently an empty cigar is okay...
* From SAM spec:
* - H can only be present as the first and/or last operation
* - S may only have H operations between them and the ends of the CIGAR string
*
* ps: apparently an empty cigar is valid.
*/
public final class GoodCigarFilter implements ReadFilter {

private static final String DIGITS = "\\d*"; //Any operator can be preceeded by a series of digits.
private static final String M =of(DIGITS + CigarOperator.M.name());
private static final String H =of(DIGITS + CigarOperator.H.name());
private static final String S =of(DIGITS + CigarOperator.S.name());
private static final String I =of(DIGITS + CigarOperator.I.name());
private static final String D =of(DIGITS + CigarOperator.D.name());
private static final String N =of(DIGITS + CigarOperator.N.name());
private static final String P =of(DIGITS + CigarOperator.P.name());
private static final String EQ=of(DIGITS + CigarOperator.EQ.name());
private static final String X =of(DIGITS + CigarOperator.X.name());

private static final String NO_H = of(M + "|" + S + "|" + I + "|" + D + "|" + N + "|" + P + "|" + EQ + "|" + X);
private static final String ANY = of(NO_H + "|" + H);
private static final String ANYSTAR = of(ANY + "*");
private static final String ANYPLUS = of(ANY + "+");
private static final String CLIP = of(H + "|" + S);
private static final String CLIPSTAR = of(CLIP + "*");
private static final String CLIPPLUS = of(CLIP + "+");

//We pre-create one regular expression pattern and reuse it.
//Clauses in the pattern correspond to the conditions we're testing for.
private final static Pattern pat = Pattern.compile(

This comment has been minimized.

Copy link
@vruano

vruano Apr 21, 2015

Contributor

I think that pat should be renamed to something more informative and using upper cases and underscores ... E.g. BAD_CIGARS?

of(ANYPLUS + H + ANYPLUS) + "|" + // hard clips in the middle
of(ANYSTAR + NO_H + S + NO_H + ANYSTAR) + "|" + //S in the middle without flanking H
of(ANYSTAR + of(I + I + "|" + D + D + "|" + I + D + "|" + D + I) + ANYSTAR) + "|" + //consecutive indels
of(CLIPSTAR + D + ANYSTAR) + "|" + //starts with deletion
of(ANYSTAR + D + CLIPSTAR) + "|" + //ends with deletion
of(CLIPPLUS)); //only clips

private static String of(String s){
return "(" + s + ")";
}

@Override
public boolean test(final SAMRecord rec) {
final Cigar c = rec.getCigar();

// if there is no Cigar then it can't be bad
if( c.isEmpty() ) {
return true;
}

Iterator<CigarElement> elementIterator = c.getCigarElements().iterator();

CigarOperator firstOp = CigarOperator.H;
while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) {
CigarOperator op = elementIterator.next().getOperator();

// No reads with Hard/Soft clips in the middle of the cigar
if (op == CigarOperator.H && firstOp != CigarOperator.H) {
return false;
}
firstOp = op;
}

// No reads starting with deletions (with or without preceding clips)
if (firstOp == CigarOperator.D) {
return false;
}

boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S);
boolean previousElementWasIndel = firstOp == CigarOperator.I;
CigarOperator lastOp = firstOp;
CigarOperator previousOp = firstOp;

while (elementIterator.hasNext()) {
CigarOperator op = elementIterator.next().getOperator();

if (op != CigarOperator.S && op != CigarOperator.H) {

// No reads with Hard/Soft clips in the middle of the cigar
if (previousOp == CigarOperator.S || previousOp == CigarOperator.H)
return false;

lastOp = op;

if (!hasMeaningfulElements && op.consumesReadBases()) {
//Note: this line is likely unreachable.
// if we're in this loop then the top loop must have been
// stopped by (firstOp != CigarOperator.H && firstOp != CigarOperator.S)
// which implies hasMeaningfulElements == true
// which means we'd not get to the current line
hasMeaningfulElements = true;
}

if (op == CigarOperator.I || op == CigarOperator.D) {

// No reads that have consecutive indels in the cigar (II, DD, ID or DI)
if (previousElementWasIndel) {
return false;
}
previousElementWasIndel = true;
}
else {
previousElementWasIndel = false;
}
}
// No reads with Hard/Soft clips in the middle of the cigar
else if (op == CigarOperator.S && previousOp == CigarOperator.H) {
return false;
}

previousOp = op;
}
return rec.getCigar().isEmpty() || !pat.matcher(rec.getCigarString()).matches();

// No reads ending in deletions (with or without follow-up clips)
// No reads that are fully hard or soft clipped
return lastOp != CigarOperator.D && hasMeaningfulElements;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ public Object[][] badCigars() {
{"1M1H1S1M"}, // S in the middle, after H
{"10M2H10M"}, // H in the middle
{"10M2S10M"}, // S in the middle
{"1H1S10M2S10M1S1H"}, // deceiving S in the middle
{"1S1H"}, // only clipping
{"1S1S"}, // only clipping
{"1H1S"}, // only clipping
{"1H1H"}, // only clipping
{"1H1S10M2S10M1S1H"}, // deceiving S in the middle: HSMSMSH
{"1H1S10M2H10M1S1H"} // deceiving H in the middle
};
}
Expand All @@ -63,6 +67,20 @@ public void testEmptyCigar(){
Assert.assertTrue(filter.test(read), read.getCigarString());
}

@DataProvider(name = "goodCigars")
public Object[][] goodCigars() {
return new Object[][]{
{"1I9H"},
{"1I1S8H"},
{"1S1I1S7H"}
};
}

@Test(dataProvider = "goodCigars")
public void testGoodCigars (String cigarString) {
SAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString);
Assert.assertTrue(filter.test(read), read.getCigarString());
}
@Test
public void testGoodCigarsUpToSize() {
//Note: not using data providers here because it's super slow to print (many minutes vs few seconds).
Expand Down

6 comments on commit 0ea45ee

@akiezun
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vruano now i completely rewrote this old gatk3 code and encoded the whole thing as a regexp. please review

@vruano
Copy link
Contributor

@vruano vruano commented on 0ea45ee Apr 20, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I used a regex in my previous review I didn't mean to encourage to use one for the actual solution although I have to admit that it occurred to me. This could be a very elegant solution although it might not work well in practice if it turns out to be inefficient respect to just work with the list of CigarOperations.... I would like some input about this from @lbergelson or @droazen. Is true that we should not fall in premature optimizations but is also true that we should not produce grossly inefficient code (I'm kind of quoting @droazen here). Hopefully the pattern pre-compilation step does a good job in preventing that but we have to make sure...

@vruano
Copy link
Contributor

@vruano vruano commented on 0ea45ee Apr 21, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before go ahead with merging this pull-request I would like to make sure that we are actually capturing what we are supposed to be capturing here.

For example,

  1. Do we need to filter out operators with 0 length (is this a valid Cigar at all?)
  2. Do we care if the cigar has consecutive operators of the same kind? what we should fail for two consecutive insertions (xIyI were x and y are numbers) but not for consecutive match (xMyM).
  3. Why we should not allow for a Insertion to be followed by a Deletion or viceversa. I think it is a genuine possibility otherwise you are kind of forcing an artifactual base map. Perhaps I am missing something here though.
    ...

@vruano
Copy link
Contributor

@vruano vruano commented on 0ea45ee Apr 21, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the intention is to port what there is in GATK3 as it is for now, the reg-expression provided is not doing it fully.

For example GATK3's filter would not complain for consecutive H operations at the beginning whereas the new code does.

Also it does accept non empty CIGAR with out any operator that consume read bases such as N or P. That would not happen with the GATK3 code.

I don't know if this can be captured appropriately with a regular expression.

@vruano
Copy link
Contributor

@vruano vruano commented on 0ea45ee Apr 21, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we stick to using regex ...

I don't see the need of the of() operation at all with an or(String ... comps) that would join using the sep "I" and surround each component with brackets makes the code more readable IMO. Also you would be able to get rid of the clutter from those repetitive string concats and pipe ("I") string literals.

You could have static method to add starts or crosses star(), plus().

@vruano
Copy link
Contributor

@vruano vruano commented on 0ea45ee Apr 21, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Finish with my review back to @akiezun

Please sign in to comment.