Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Merge remote-tracking branch 'unstable/master'

  • Loading branch information...
commit c973239c2a37751fe2514f39f368fd2f49080051 2 parents 616e696 + 29d994e
@vdauwera vdauwera authored
Showing with 463 additions and 205 deletions.
  1. +1 −1  pom.xml
  2. +1 −1  public/VectorPairHMM/pom.xml
  3. +1 −1  public/external-example/pom.xml
  4. +1 −1  public/gatk-engine/pom.xml
  5. +3 −2 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java
  6. +74 −0 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java
  7. +1 −1  public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java
  8. +77 −0 public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
  9. +1 −1  public/gatk-queue-extensions-generator/pom.xml
  10. +1 −1  public/gatk-queue-extensions-public/pom.xml
  11. +1 −1  public/gatk-queue/pom.xml
  12. +1 −1  public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala
  13. +3 −3 public/gatk-root/pom.xml
  14. +1 −1  public/gatk-tools-public/pom.xml
  15. +16 −3 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java
  16. +18 −5 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java
  17. +149 −26 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java
  18. +16 −0 public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java
  19. +1 −1  public/gatk-utils/pom.xml
  20. +6 −1 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java
  21. +0 −134 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java
  22. +26 −4 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java
  23. +32 −8 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java
  24. +7 −5 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
  25. BIN  public/gatk-utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so
  26. +22 −0 public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
  27. +1 −1  public/gsalib/pom.xml
  28. +1 −1  public/package-tests/pom.xml
  29. +1 −1  public/pom.xml
View
2  pom.xml
@@ -13,7 +13,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>public/gatk-root</relativePath>
</parent>
View
2  public/VectorPairHMM/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../../public/gatk-root</relativePath>
</parent>
View
2  public/external-example/pom.xml
@@ -9,7 +9,7 @@
<name>External Example</name>
<properties>
- <gatk.version>3.4</gatk.version>
+ <gatk.version>3.5-SNAPSHOT</gatk.version>
<!--
gatk.basedir property must point to your checkout of GATK/GATK until we can get all the
dependencies out of the committed gatk repo and into central.
View
2  public/gatk-engine/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
5 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java
@@ -69,8 +69,9 @@ private GATKVCFUtils() { }
public final static GATKVCFIndexType DEFAULT_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
public final static Integer DEFAULT_GVCF_INDEX_PARAMETER = 128000;
- // GVCF file extension
+ // GVCF file extensions
public final static String GVCF_EXT = "g.vcf";
+ public final static String GVCF_GZ_EXT = "g.vcf.gz";
// Message for using the deprecated --variant_index_type or --variant_index_parameter arguments.
public final static String DEPRECATED_INDEX_ARGS_MSG = "Naming your output file using the .g.vcf extension will automatically set the appropriate values " +
@@ -389,7 +390,7 @@ public static IndexCreator makeIndexCreator(final GATKVCFIndexType variantIndexT
indexType = variantIndexType;
indexParameter = variantIndexParameter;
logger.warn(DEPRECATED_INDEX_ARGS_MSG);
- } else if (outputFile.getName().endsWith("." + GVCF_EXT)) {
+ } else if (outputFile.getName().endsWith("." + GVCF_EXT) || outputFile.getName().endsWith("." + GVCF_GZ_EXT)) {
indexType = DEFAULT_GVCF_INDEX_TYPE;
indexParameter = DEFAULT_GVCF_INDEX_PARAMETER;
}
View
74 ...tk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java
@@ -0,0 +1,74 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.gatk.engine.filters;
+
+import htsjdk.samtools.*;
+import org.broadinstitute.gatk.utils.commandline.Argument;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.engine.ReadProperties;
+import org.broadinstitute.gatk.utils.ValidationExclusion;
+import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
+import org.broadinstitute.gatk.utils.exceptions.UserException;
+
+/**
+ * Filter out reads that are over-soft-clipped
+ *
+ * <p>
+ * This filter is intended to filter out reads that are potentially from foreign organisms.
+ * From experience with sequencing of human DNA we have found cases of contamination by bacterial
+ * organisms; the symptoms of such contamination are a class of reads with only a small number
+ * of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended
+ * to remove such reads.
+ * </p>
+ *
+ */
+public class OverclippedReadFilter extends ReadFilter {
+
+ @Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
+ int tooShort = 30;
+
+
+ public boolean filterOut(final SAMRecord read) {
+ boolean sawLeadingSoftclip = false;
+ boolean sawAlignedBase = false;
+ int alignedLength = 0;
+
+ for ( final CigarElement element : read.getCigar().getCigarElements() ) {
+ if ( element.getOperator() == CigarOperator.S ) {
+ if ( sawAlignedBase ) // if this is true then we must also have seen a leading soft-clip
+ return (alignedLength < tooShort);
+ sawLeadingSoftclip = true;
+ } else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above)
+ if ( !sawLeadingSoftclip )
+ return false;
+ sawAlignedBase = true;
+ alignedLength += element.getLength();
+ }
+ }
+
+ return false;
+ }
+}
View
2  ...tk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java
@@ -38,7 +38,7 @@
@DataProvider(name="cramData")
public Object[][] getCRAMData() {
return new Object[][] {
- {"PrintReads", "exampleBAM.bam", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"},
+ {"PrintReads", "exampleBAM.bam", "", "cram", "fc6e3919a8a34266c89ef66e97ceaba9"},
//{"PrintReads", "exampleCRAM.cram", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"}, https://github.com/samtools/htsjdk/issues/148
{"PrintReads", "exampleCRAM.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"},
{"PrintReads", "exampleCRAM-noindex.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"},
View
77 ...e/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
@@ -0,0 +1,77 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.gatk.engine.filters;
+
+
+import htsjdk.samtools.Cigar;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.TextCigarCodec;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+
+
+/**
+ * Tests for the OverclippedReadFilter
+ */
+public class OverclippedReadFilterUnitTest extends ReadFilterTest {
+
+ @Test(enabled = true, dataProvider= "OverclippedDataProvider")
+ public void testOverclippedFilter(final String cigarString, final boolean expectedResult) {
+
+ final OverclippedReadFilter filter = new OverclippedReadFilter();
+ final SAMRecord read = buildSAMRecord(cigarString);
+ Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
+ }
+
+ private SAMRecord buildSAMRecord(final String cigarString) {
+ final Cigar cigar = TextCigarCodec.decode(cigarString);
+ return this.createRead(cigar, 1, 0, 10);
+ }
+
+ @DataProvider(name= "OverclippedDataProvider")
+ public Iterator<Object[]> overclippedDataProvider() {
+ final List<Object[]> result = new LinkedList<Object[]>();
+
+ result.add(new Object[] { "1S10M1S", true });
+ result.add(new Object[] { "1S10X1S", true });
+ result.add(new Object[] { "1H1S10M1S1H", true });
+ result.add(new Object[] { "1S40M1S", false });
+ result.add(new Object[] { "1S40X1S", false });
+ result.add(new Object[] { "1H10M1S", false });
+ result.add(new Object[] { "1S10M1H", false });
+ result.add(new Object[] { "10M1S", false });
+ result.add(new Object[] { "1S10M", false });
+ result.add(new Object[] { "1S10M10D10M1S", true });
+ result.add(new Object[] { "1S1M40I1S", false });
+ result.add(new Object[] { "1S10I1S", true });
+ result.add(new Object[] { "1S40I1S", false });
+
+ return result.iterator();
+ }
+}
View
2  public/gatk-queue-extensions-generator/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
2  public/gatk-queue-extensions-public/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
2  public/gatk-queue/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
2  public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala
@@ -77,7 +77,7 @@ class QGraphSettings {
var jobReportFile: String = _
@Advanced
- @Argument(fullName="disableJobReport", shortName="disabpleJobReport", doc="If provided, we will not create a job report", required=false)
+ @Argument(fullName="disableJobReport", shortName="disableJobReport", doc="If provided, we will not create a job report", required=false)
var disableJobReport: Boolean = false
@ArgumentCollection
View
6 public/gatk-root/pom.xml
@@ -12,7 +12,7 @@
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<packaging>pom</packaging>
<name>GATK Root</name>
@@ -44,8 +44,8 @@
<test.listeners>org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.gatk.utils.TestNGTestTransformer,org.broadinstitute.gatk.utils.GATKTextReporter,org.uncommons.reportng.HTMLReporter</test.listeners>
<!-- Version numbers for picard and htsjdk -->
- <htsjdk.version>1.132</htsjdk.version>
- <picard.version>1.131</picard.version>
+ <htsjdk.version>1.134</htsjdk.version>
+ <picard.version>1.133</picard.version>
</properties>
<!-- Dependency configuration (versions, etc.) -->
View
2  public/gatk-tools-public/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
19 ...ls-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java
@@ -137,6 +137,9 @@ public static void addCounts(int[] updateMe, int[] leaveMeAlone ) {
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) {
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<SAMReadGroupRecord,int[]>();
+ Map<String, int[]> countsByRGName = new HashMap<String, int[]>();
+ Map<String, SAMReadGroupRecord> RGByName = new HashMap<String, SAMReadGroupRecord>();
+
List<PileupElement> countPileup = new LinkedList<PileupElement>();
FragmentCollection<PileupElement> fpile;
@@ -202,10 +205,20 @@ else if (e.getBase() != firstElem.getBase()) {
for (PileupElement e : countPileup) {
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
- if (!countsByRG.keySet().contains(readGroup))
- countsByRG.put(readGroup, new int[6]);
- updateCounts(countsByRG.get(readGroup), e);
+ String readGroupId = readGroup.getSample() + "_" + readGroup.getReadGroupId();
+ int[] counts = countsByRGName.get(readGroupId);
+ if (counts == null) {
+ counts = new int[6];
+ countsByRGName.put(readGroupId, counts);
+ RGByName.put(readGroupId, readGroup);
+ }
+
+ updateCounts(counts, e);
+ }
+
+ for (String readGroupId : RGByName.keySet()) {
+ countsByRG.put(RGByName.get(readGroupId), countsByRGName.get(readGroupId));
}
return countsByRG;
View
23 ...public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java
@@ -179,15 +179,21 @@
/**
* Invert the selection criteria for --filterExpression
*/
- @Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false)
+ @Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
protected boolean invertFilterExpression = false;
/**
* Invert the selection criteria for --genotypeFilterExpression
*/
- @Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
+ @Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
protected boolean invertGenotypeFilterExpression = false;
+ /**
+ * If this argument is provided, set filtered genotypes to no-call (./.).
+ */
+ @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
+ private boolean setFilteredGenotypesToNocall = false;
+
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
@@ -201,8 +207,10 @@
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
+ private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
+
/**
- * Prepend inverse phrase to description if --invert_filter_expression
+ * Prepend inverse phrase to description if --invertFilterExpression
*
* @param description the description
* @return the description with inverse prepended if --invert_filter_expression
@@ -379,7 +387,7 @@ private void filter() {
final VariantContextBuilder builder = new VariantContextBuilder(vc);
// make new Genotypes based on filters
- if ( !genotypeFilterExps.isEmpty() ) {
+ if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) {
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
// for each genotype, check filters then create a new object
@@ -388,12 +396,17 @@ private void filter() {
final List<String> filters = new ArrayList<String>();
if ( g.isFiltered() ) filters.add(g.getFilters());
+ // Add if expression filters the variant context
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
filters.add(exp.name);
}
- genotypes.add(new GenotypeBuilder(g).filters(filters).make());
+ // if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
+ if ( !filters.isEmpty() && setFilteredGenotypesToNocall )
+ genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make());
+ else
+ genotypes.add(new GenotypeBuilder(g).filters(filters).make());
} else {
genotypes.add(g);
}
View
175 ...blic/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java
@@ -146,7 +146,7 @@
* -o output.vcf \
* -se 'SAMPLE.+PARC' \
* -select "QD > 10.0"
- * --invert_selection
+ * -invertSelect
* </pre>
*
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):</h4>
@@ -207,7 +207,7 @@
* -sn mySample
* </pre>
*
- * <h4>Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more</h4>
+ * <h4>Generating a VCF of all the variants that are mendelian violations. The optional argument '-mvq' restricts the selection to sites that have a QUAL score of 50 or more</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@@ -218,14 +218,14 @@
* -o violations.vcf
* </pre>
*
- * <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
+ * <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument '-mvq' together with '-invMv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
* -ped family.ped \
- * -mv -mvq 50 -inv_mv \
+ * -mv -mvq 50 -invMv \
* -o violations.vcf
* </pre>
*
@@ -275,7 +275,7 @@
*
* <h4>Select IDs in fileKeep and exclude IDs in fileExclude:</h4>
* <pre>
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
@@ -284,9 +284,35 @@
* -excludeIDs fileExclude
* </pre>
*
+ * <h4>Select sites where there are between 2 and 5 samples and between 10 and 50 percent of the sample genotypes are filtered:</h4>
+ * <pre>
+ * java -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T SelectVariants \
+ * --variant input.vcf \
+ * --maxFilteredGenotypes 5
+ * --minFilteredGenotypes 2
+ * --maxFractionFilteredGenotypes 0.60
+ * --minFractionFilteredGenotypes 0.10
+ * </pre>
+ *
+ * <h4>Set filtered genotypes to no-call (./.):</h4>
+ * <pre>
+ * java -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T SelectVariants \
+ * --variant input.vcf \
+ * --setFilteredGenotypesToNocall
+ * </pre>
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
+ static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE;
+ static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0;
+ static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0;
+ static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0;
+
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
@@ -359,10 +385,10 @@
public ArrayList<String> selectExpressions = new ArrayList<>();
/**
- * Invert the selection criteria for --select.
+ * Invert the selection criteria for -select.
*/
- @Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false)
- protected boolean invertSelection = false;
+ @Argument(shortName="invertSelect", doc="Invert the selection criteria for -select", required=false)
+ protected boolean invertSelect = false;
/*
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
@@ -436,8 +462,8 @@
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
- @Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false)
- private Boolean invert_mendelianViolations = false;
+ @Argument(fullName="invertMendelianViolation", shortName="invMv", doc="Output non-mendelian violation sites only", required=false)
+ private Boolean invertMendelianViolations = false;
/**
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
@@ -514,6 +540,36 @@
@Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include")
private int minIndelSize = 0;
+ /**
+ * If this argument is provided, select sites where at most a maximum number of samples are filtered at the genotype level.
+ */
+ @Argument(fullName="maxFilteredGenotypes", required=false, doc="Maximum number of samples filtered at the genotype level")
+ private int maxFilteredGenotypes = MAX_FILTERED_GENOTYPES_DEFAULT_VALUE;
+
+ /**
+ * If this argument is provided, select sites where at least a minimum number of samples are filtered at the genotype level.
+ */
+ @Argument(fullName="minFilteredGenotypes", required=false, doc="Minimum number of samples filtered at the genotype level")
+ private int minFilteredGenotypes = MIN_FILTERED_GENOTYPES_DEFAULT_VALUE;
+
+ /**
+ * If this argument is provided, select sites where a fraction or less of the samples are filtered at the genotype level.
+ */
+ @Argument(fullName="maxFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level")
+ private double maxFractionFilteredGenotypes = MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
+
+ /**
+ * If this argument is provided, select sites where a fraction or more of the samples are filtered at the genotype level.
+ */
+ @Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level")
+ private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
+
+ /**
+ * If this argument is provided, set filtered genotypes to no-call (./.).
+ */
+ @Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
+ private boolean setFilteredGenotypesToNocall = false;
+
@Hidden
@Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
private boolean allowNonOverlappingCommandLineSamples = false;
@@ -547,6 +603,8 @@
private Set<String> IDsToRemove = null;
private Map<String, VCFHeader> vcfRods;
+ private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
+
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
*/
@@ -619,7 +677,7 @@ public void initialize() {
selectedTypes.add(t);
}
- // remove specifed types
+ // remove specified types
for (VariantContext.Type t : typesToExclude)
selectedTypes.remove(t);
@@ -713,16 +771,16 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
for (VariantContext vc : vcs) {
// an option for performance testing only
- if ( fullyDecode )
- vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );
+ if (fullyDecode)
+ vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing());
- if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) )
+ if (IDsToKeep != null && !IDsToKeep.contains(vc.getID()))
continue;
- if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) )
+ if (IDsToRemove != null && IDsToRemove.contains(vc.getID()))
continue;
- if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) )
+ if (mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invertMendelianViolations))
break;
if (discordanceOnly) {
@@ -745,20 +803,30 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
if (!selectedTypes.contains(vc.getType()))
continue;
- if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) )
+ if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize))
continue;
+ if ( needNumFilteredGenotypes()) {
+ int numFilteredSamples = numFilteredGenotypes(vc);
+ double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size();
+ if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes ||
+ fractionFilteredGenotypes > maxFractionFilteredGenotypes || fractionFilteredGenotypes < minFractionFilteredGenotypes)
+ continue;
+ }
+
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
+ VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall);
+
// Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered
- if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) {
+ if ( (!XLnonVariants || filteredGenotypeToNocall.isPolymorphicInSamples()) && (!XLfiltered || !filteredGenotypeToNocall.isFiltered()) ) {
// Write the subsetted variant if it matches all of the expressions
boolean failedJexlMatch = false;
try {
for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
- if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){
+ if ( Utils.invertLogic(!VariantContextUtils.match(filteredGenotypeToNocall, jexl), invertSelect) ){
failedJexlMatch = true;
break;
}
@@ -771,7 +839,7 @@ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentCo
if ( !failedJexlMatch &&
!justRead &&
( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
- vcfWriter.add(sub);
+ vcfWriter.add(filteredGenotypeToNocall);
}
}
}
@@ -801,13 +869,33 @@ protected static boolean containsIndelLargerOrSmallerThan(final VariantContext v
}
/**
+ * Find the number of filtered samples
+ *
+ * @param vc the variant rod VariantContext
+ * @return number of filtered samples
+ */
+ private int numFilteredGenotypes(final VariantContext vc){
+ if (vc == null)
+ return 0;
+
+ int numFiltered = 0;
+ // check if we find it in the variant rod
+ GenotypesContext genotypes = vc.getGenotypes(samples);
+ for (final Genotype g : genotypes)
+ if ( g.isFiltered() && !g.getFilters().isEmpty())
+ numFiltered++;
+
+ return numFiltered;
+ }
+
+ /**
* Checks if vc has a variant call for (at least one of) the samples.
*
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
* @param compVCs the comparison VariantContext (discordance)
* @return true VariantContexts are discordant, false otherwise
*/
- private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
+ private boolean isDiscordant (final VariantContext vc, final Collection<VariantContext> compVCs) {
if (vc == null)
return false;
@@ -845,7 +933,7 @@ private boolean isDiscordant (VariantContext vc, Collection<VariantContext> comp
* @param compVCs the comparison VariantContext
* @return true if VariantContexts are concordant, false otherwise
*/
- private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
+ private boolean isConcordant (final VariantContext vc, final Collection<VariantContext> compVCs) {
if (vc == null || compVCs == null || compVCs.isEmpty())
return false;
@@ -876,7 +964,7 @@ private boolean isConcordant (VariantContext vc, Collection<VariantContext> comp
return true;
}
- private boolean sampleHasVariant(Genotype g) {
+ private boolean sampleHasVariant(final Genotype g) {
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered)));
}
@@ -945,8 +1033,7 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
- final List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
- genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
+ genotypes.add(new GenotypeBuilder(genotype).alleles(diploidNoCallAlleles).noGQ().make());
}
else{
genotypes.add(genotype);
@@ -966,6 +1053,30 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
return trimmed;
}
+ /**
+ * If --setFilteredGtToNocall, set filtered genotypes to no-call
+ *
+ * @param vc the VariantContext record to set filtered genotypes to no-call
+ * @param filteredGenotypesToNocall set filtered genotypes to non-call?
+ * @return the VariantContext with no-call genotypes if the sample was filtered
+ */
+ private VariantContext setFilteredGenotypeToNocall(final VariantContext vc, final boolean filteredGenotypesToNocall) {
+
+ if ( !filteredGenotypesToNocall )
+ return vc;
+
+ final VariantContextBuilder builder = new VariantContextBuilder(vc);
+ final GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
+
+ for ( final Genotype g : vc.getGenotypes() ) {
+ if ( g.isCalled() && g.isFiltered() )
+ genotypes.add(new GenotypeBuilder(g).alleles(diploidNoCallAlleles).make());
+ else
+ genotypes.add(g);
+ }
+
+ return builder.genotypes(genotypes).make();
+ }
/*
* Add annotations to the new VC
*
@@ -1061,4 +1172,16 @@ else if ( List.class.isAssignableFrom(attribute.getClass()) )
}
return result;
}
-}
+
+ /**
+ * Need the number of filtered genotypes samples?
+ *
+ * @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise
+ */
+ private boolean needNumFilteredGenotypes(){
+ return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE ||
+ minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE ||
+ maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE ||
+ minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
+ }
+}
View
16 ...-tools-public/src/test/java/org/broadinstitute/gatk/tools/CatVariantsIntegrationTest.java
@@ -246,4 +246,20 @@ public void testCatVariantsGVCFIndexCreation() throws IOException{
ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine));
pc.execAndCheck(ps);
}
+
+ @Test()
+ public void testCatVariantsGVCFGzIndexCreation() throws IOException{
+
+ String cmdLine = String.format("java -cp \"%s\" %s -R %s -V %s -V %s -out %s",
+ StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar),
+ CatVariants.class.getCanonicalName(),
+ BaseTest.b37KGReference,
+ CatVariantsVcf1,
+ CatVariantsVcf2,
+ BaseTest.createTempFile("CatVariantsGVCFIndexCreationTest", "." + GATKVCFUtils.GVCF_GZ_EXT));
+
+ ProcessController pc = ProcessController.getThreadLocal();
+ ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine));
+ pc.execAndCheck(ps);
+ }
}
View
2  public/gatk-utils/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
7 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java
@@ -165,7 +165,12 @@ public static void validateDictionaries( final Logger logger,
}
case OUT_OF_ORDER: {
- UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2);
+ UserException ex = new UserException.IncompatibleSequenceDictionaries(
+ "The contig order in " + name1 + " and " + name2 + "is not "
+ + "the same; to fix this please see: "
+ + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
+ + " which describes reordering contigs in BAM and VCF files.",
+ name1, dict1, name2, dict2);
if ( allowNonFatalIncompabilities(validationExclusion) )
logger.warn(ex.getMessage());
else
View
134 ...atk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java
@@ -28,7 +28,6 @@
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.tribble.Feature;
import htsjdk.tribble.annotation.Strand;
-import htsjdk.tribble.dbsnp.OldDbSNPFeature;
import htsjdk.tribble.gelitext.GeliTextFeature;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.GenomeLoc;
@@ -111,139 +110,6 @@ public VariantContext convert(String name, Object input, ReferenceContext ref) {
}
}
- // --------------------------------------------------------------------------------------------------------------
- //
- // dbSNP to VariantContext
- //
- // --------------------------------------------------------------------------------------------------------------
-
- private static class DBSnpAdaptor implements VCAdaptor {
- private static boolean isSNP(OldDbSNPFeature feature) {
- return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
- }
-
- private static boolean isMNP(OldDbSNPFeature feature) {
- return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
- }
-
- private static boolean isInsertion(OldDbSNPFeature feature) {
- return feature.getVariantType().contains("insertion");
- }
-
- private static boolean isDeletion(OldDbSNPFeature feature) {
- return feature.getVariantType().contains("deletion");
- }
-
- private static boolean isIndel(OldDbSNPFeature feature) {
- return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature);
- }
-
- public static boolean isComplexIndel(OldDbSNPFeature feature) {
- return feature.getVariantType().contains("in-del");
- }
-
- /**
- * gets the alternate alleles. This method should return all the alleles present at the location,
- * NOT including the reference base. This is returned as a string list with no guarantee ordering
- * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
- * frequency).
- *
- * @return an alternate allele list
- */
- public static List<String> getAlternateAlleleList(OldDbSNPFeature feature) {
- List<String> ret = new ArrayList<String>();
- for (String allele : getAlleleList(feature))
- if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
- return ret;
- }
-
- /**
- * gets the alleles. This method should return all the alleles present at the location,
- * including the reference base. The first allele should always be the reference allele, followed
- * by an unordered list of alternate alleles.
- *
- * @return an alternate allele list
- */
- public static List<String> getAlleleList(OldDbSNPFeature feature) {
- List<String> alleleList = new ArrayList<String>();
- // add ref first
- if ( feature.getStrand() == Strand.POSITIVE )
- alleleList = Arrays.asList(feature.getObserved());
- else
- for (String str : feature.getObserved())
- alleleList.add(SequenceUtil.reverseComplement(str));
- if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase())
- && !alleleList.get(0).equals(feature.getNCBIRefBase()) )
- Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0);
-
- return alleleList;
- }
-
- /**
- * Converts non-VCF formatted dbSNP records to VariantContext.
- * @return OldDbSNPFeature.
- */
- @Override
- public Class<? extends Feature> getAdaptableFeatureType() { return OldDbSNPFeature.class; }
-
- @Override
- public VariantContext convert(String name, Object input, ReferenceContext ref) {
- OldDbSNPFeature dbsnp = (OldDbSNPFeature)input;
-
- int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
- if ( index < 0 )
- return null; // we weren't given enough reference context to create the VariantContext
-
- final byte refBaseForIndel = ref.getBases()[index];
- final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-");
-
- boolean addPaddingBase;
- if ( isSNP(dbsnp) || isMNP(dbsnp) )
- addPaddingBase = false;
- else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") )
- addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp)));
- else
- return null; // can't handle anything else
-
- Allele refAllele;
- if ( refBaseIsDash )
- refAllele = Allele.create(refBaseForIndel, true);
- else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) )
- return null;
- else
- refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true);
-
- final List<Allele> alleles = new ArrayList<Allele>();
- alleles.add(refAllele);
-
- // add all of the alt alleles
- for ( String alt : getAlternateAlleleList(dbsnp) ) {
- if ( Allele.wouldBeNullAllele(alt.getBytes()))
- alt = "";
- else if ( ! Allele.acceptableAlleleBases(alt) )
- return null;
-
- alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false));
- }
-
- final VariantContextBuilder builder = new VariantContextBuilder();
- builder.source(name).id(dbsnp.getRsID());
- builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 1 : 0));
- builder.alleles(alleles);
- return builder.make();
- }
-
- private static List<String> stripNullDashes(final List<String> alleles) {
- final List<String> newAlleles = new ArrayList<String>(alleles.size());
- for ( final String allele : alleles ) {
- if ( allele.equals("-") )
- newAlleles.add("");
- else
- newAlleles.add(allele);
- }
- return newAlleles;
- }
- }
// --------------------------------------------------------------------------------------------------------------
//
View
30 public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java
@@ -51,10 +51,14 @@
public static final String SPANNING_DELETIONS_KEY = "Dels";
public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants
public static final String DOWNSAMPLED_KEY = "DS";
+ public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2
+ public static final String EVENT_DISTANCE_MAX_KEY = "MAX_ED"; //M2
+ public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2
public static final String FISHER_STRAND_KEY = "FS";
public static final String GC_CONTENT_KEY = "GC";
public static final String GQ_MEAN_KEY = "GQ_MEAN";
public static final String GQ_STDEV_KEY = "GQ_STDDEV";
+ public static final String HAPLOTYPE_COUNT_KEY = "HCNT"; //M2
public static final String HAPLOTYPE_SCORE_KEY = "HaplotypeScore";
public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo";
public static final String HOMOPOLYMER_RUN_KEY = "HRun";
@@ -80,8 +84,10 @@
public static final String ORIGINAL_CONTIG_KEY = "OriginalChr"; //LiftoverVariants
public static final String ORIGINAL_START_KEY = "OriginalStart"; //LiftoverVariants
public static final String N_BASE_COUNT_KEY = "PercentNBase";
+ public static final String NORMAL_LOD_KEY = "NLOD"; //M2
public static final String RBP_INCONSISTENT_KEY = "PhasingInconsistent"; //ReadBackedPhasing
public static final String GENOTYPE_PRIOR_KEY = "PG";
+ public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE";
public static final String QUAL_BY_DEPTH_KEY = "QD";
public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF
@@ -93,11 +99,18 @@
public static final String STRAND_ODDS_RATIO_KEY = "SOR";
public static final String STR_PRESENT_KEY = "STR";
public static final String TRANSMISSION_DISEQUILIBRIUM_KEY = "TDT";
+ public static final String TUMOR_LOD_KEY = "TLOD"; //M2
public static final String VARIANT_TYPE_KEY = "VariantType";
public static final String VQS_LOD_KEY = "VQSLOD";
+ public static final String OXOG_ALT_F1R2_KEY = "ALT_F1R2";
+ public static final String OXOG_ALT_F2R1_KEY = "ALT_F2R1";
+ public static final String OXOG_REF_F1R2_KEY = "REF_F1R2";
+ public static final String OXOG_REF_F2R1_KEY = "REF_F2R1";
+ public static final String OXOG_FRACTION_KEY = "FOXOG";
//FORMAT keys
public static final String ALLELE_BALANCE_KEY = "AB";
+ public static final String ALLELE_FRACTION_KEY = "AF"; //M2
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
public static final String RBP_HAPLOTYPE_KEY = "HP"; //ReadBackedPhasing
public static final String AVG_INTERVAL_DP_BY_SAMPLE_KEY = "IDP"; //DiagnoseTargets
@@ -110,6 +123,7 @@
public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT";
public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID";
public static final String PHRED_SCALED_POSTERIORS_KEY = "PP"; //FamilyLikelihoodsUtils / PosteriorLikelihoodsUtils
+ public static final String QUALITY_SCORE_SUM_KEY = "QSS"; //M2
public static final String REFERENCE_GENOTYPE_QUALITY = "RGQ";
public static final String STRAND_COUNT_BY_SAMPLE_KEY = "SAC";
public static final String STRAND_BIAS_BY_SAMPLE_KEY = "SB";
@@ -120,13 +134,21 @@
/* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic,
their names (or descriptions) depend on some threshold. Those filters are not included here
*/
- public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
- public static final String LOW_QUAL_FILTER_NAME = "LowQual";
+ public static final String ALT_ALLELE_IN_NORMAL_FILTER_NAME = "alt_allele_in_normal"; //M2
+ public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
+ public static final String CLUSTERED_EVENTS_FILTER_NAME = "clustered_events"; //M2
+ public static final String GERMLINE_RISK_FILTER_NAME = "germline_risk"; //M2
+ public static final String HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME = "homologous_mapping_event"; //M2
+ public static final String LOW_QUAL_FILTER_NAME = "LowQual";
+ public static final String MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME = "multi_event_alt_allele_in_normal"; //M2
+ public static final String PON_FILTER_NAME = "panel_of_normals"; //M2
+ public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2
+ public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
- public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL";
- public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site
+ public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED = "*:DEL";
+ public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED+">", false); // represents any possible spanning deletion allele at this site
}
View
40 ...ic/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java
@@ -25,16 +25,11 @@
package org.broadinstitute.gatk.utils.variant;
-import htsjdk.variant.vcf.VCFFilterHeaderLine;
-import htsjdk.variant.vcf.VCFFormatHeaderLine;
-import htsjdk.variant.vcf.VCFHeaderLineCount;
-import htsjdk.variant.vcf.VCFHeaderLineType;
-import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import htsjdk.variant.vcf.*;
import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.*;
/**
* This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants.
@@ -66,6 +61,16 @@ private static void addFilterLine(final VCFFilterHeaderLine line) {
addFilterLine(new VCFFilterHeaderLine(LOW_QUAL_FILTER_NAME, "Low quality"));
addFilterLine(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_NAME, "This site was set to monomorphic by Beagle"));
+ // M2-related filters
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Evidence seen in the normal sample"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME, "Clustered events observed in the tumor"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME, "Evidence indicates this site is germline, not somatic"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME, "More than three events were observed in the tumor"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Multiple events observed in tumor and normal"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.PON_FILTER_NAME, "Seen in at least 2 samples in the panel of normals"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold"));
+ addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region"));
+
addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype"));
addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
addFormatLine(new VCFFormatHeaderLine(MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"));
@@ -89,6 +94,15 @@ private static void addFilterLine(final VCFFilterHeaderLine line) {
addFormatLine(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)"));
addFormatLine(new VCFFormatHeaderLine(ORIGINAL_GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Original Genotype input to Beagle"));
+ // M2-related info lines
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.ALLELE_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Allele fraction of the event in the tumor"));
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the alternate allele"));
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the alternate allele"));
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the reference allele"));
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the reference allele"));
+ addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Fraction of alt reads indicating OxoG error"));
+
+
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
@@ -132,7 +146,7 @@ private static void addFilterLine(final VCFFilterHeaderLine line) {
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_DP_KEY, 1, VCFHeaderLineType.Integer, "Original DP"));
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record"));
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record"));
- addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
+ addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model"));
addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
@@ -147,5 +161,15 @@ private static void addFilterLine(final VCFFilterHeaderLine line) {
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AC_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site"));
+
+ // M2-related info lines
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, 1, VCFHeaderLineType.String, "Number of haplotypes that support this variant"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals"));
+ addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score"));
+
}
}
View
12 ...tk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
@@ -393,7 +393,7 @@ public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBa
if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
return null;
- if (lengths.size() == 0) {
+ if (lengths.isEmpty()) {
lengths.add(repetitionCount[0]); // add ref allele length only once
}
lengths.add(repetitionCount[1]); // add this alt allele's length
@@ -947,7 +947,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC ) {
- if ( unsortedVCs == null || unsortedVCs.size() == 0 )
+ if ( unsortedVCs == null || unsortedVCs.isEmpty() )
return null;
if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size())
@@ -965,7 +965,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
VCs.add(vc);
}
- if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
+ if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled
return null;
// establish the baseline info from the first VC
@@ -1289,7 +1289,7 @@ static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, f
* @param currentAlleles the list of alleles already created
* @return a non-null mapping of original alleles to new (extended) ones
*/
- private static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
+ protected static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
final VariantContext oneVC,
final Collection<Allele> currentAlleles) {
final Allele myRef = oneVC.getReference();
@@ -1305,6 +1305,8 @@ static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, f
if ( extended.equals(b) )
extended = b;
map.put(a, extended);
+ } else if ( a.isSymbolic() ) {
+ map.put(a, a);
}
}
@@ -1696,7 +1698,7 @@ public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
// otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list
vcList.remove(k);
// avoid having empty lists
- if (vcList.size() == 0)
+ if (vcList.isEmpty())
mappedVCs.remove(type);
if ( !mappedVCs.containsKey(vcType) )
mappedVCs.put(vcType, new ArrayList<VariantContext>());
View
BIN  ...utils/src/main/resources/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so
Binary file not shown
View
22 .../src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
@@ -46,6 +46,7 @@
Allele ATref;
Allele Anoref;
Allele GT;
+ Allele Symbolic;
private GenomeLocParser genomeLocParser;
@@ -63,6 +64,7 @@ public void setup() throws IOException {
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
GT = Allele.create("GT",false);
+ Symbolic = Allele.create("<Symbolic>", false);
genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference)));
}
@@ -1607,5 +1609,25 @@ public void testTotalPloidy(final int[] ploidies, final int defaultPloidy, final
BaseUtils.fillWithRandomBases(bases, 1, bases.length);
return bases;
}
+
+ @Test
+ public void testCreateAlleleMapping(){
+ final List<Allele> alleles = Arrays.asList(Aref,Symbolic,T);
+ final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make();
+ Map<Allele, Allele> map = GATKVariantContextUtils.createAlleleMapping(ATref, vc, alleles);
+
+ final List<Allele> expectedAlleles = Arrays.asList(Allele.create("<Symbolic>", false), Allele.create("TT", false));
+ for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ){
+ Assert.assertEquals(map.get(vc.getAlternateAlleles().get(i)), expectedAlleles.get(i));
+ }
+ }
+
+ @Test(expectedExceptions = IllegalStateException.class)
+ public void testCreateAlleleMappingException(){
+ final List<Allele> alleles = Arrays.asList(Aref, Symbolic, T);
+ final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make();
+ // Throws an exception if the ref allele length <= ref allele length to extend
+ Map<Allele, Allele> map = GATKVariantContextUtils.createAlleleMapping(Aref, vc, alleles);
+ }
}
View
2  public/gsalib/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>
View
2  public/package-tests/pom.xml
@@ -9,7 +9,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>../gatk-root</relativePath>
</parent>
View
2  public/pom.xml
@@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
- <version>3.4</version>
+ <version>3.5-SNAPSHOT</version>
<relativePath>gatk-root</relativePath>
</parent>
Please sign in to comment.
Something went wrong with that request. Please try again.