Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| /* | |
| * The MIT License | |
| * | |
| * Copyright (c) 2016 The Broad Institute | |
| * | |
| * Permission is hereby granted, free of charge, to any person obtaining a copy | |
| * of this software and associated documentation files (the "Software"), to deal | |
| * in the Software without restriction, including without limitation the rights | |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| * copies of the Software, and to permit persons to whom the Software is | |
| * furnished to do so, subject to the following conditions: | |
| * | |
| * The above copyright notice and this permission notice shall be included in | |
| * all copies or substantial portions of the Software. | |
| * | |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| * THE SOFTWARE. | |
| */ | |
| package picard.fingerprint; | |
| import htsjdk.samtools.BamFileIoUtils; | |
| import htsjdk.samtools.SAMReadGroupRecord; | |
| import htsjdk.samtools.SamReader; | |
| import htsjdk.samtools.SamReaderFactory; | |
| import htsjdk.samtools.metrics.MetricsFile; | |
| import htsjdk.samtools.util.CloserUtil; | |
| import htsjdk.samtools.util.IOUtil; | |
| import htsjdk.samtools.util.Log; | |
| import htsjdk.samtools.util.SequenceUtil; | |
| import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; | |
| import htsjdk.variant.vcf.VCFFileReader; | |
| import htsjdk.variant.vcf.VCFHeader; | |
| import picard.PicardException; | |
| import picard.analysis.FingerprintingDetailMetrics; | |
| import picard.analysis.FingerprintingSummaryMetrics; | |
| import picard.cmdline.CommandLineProgram; | |
| import picard.cmdline.CommandLineProgramProperties; | |
| import picard.cmdline.Option; | |
| import picard.cmdline.StandardOptionDefinitions; | |
| import picard.cmdline.programgroups.Fingerprinting; | |
| import java.io.File; | |
| import java.util.Collections; | |
| import java.util.List; | |
| /** | |
| * Attempts to check the sample identity of the sequence/genotype data in the provided file (SAM/BAM or VCF) | |
| * against a set of known genotypes in the supplied genotype file (in either GELI or VCF format). | |
| * | |
| * @author Tim Fennell | |
| */ | |
| @CommandLineProgramProperties( | |
| usage = CheckFingerprint.USAGE_DETAILS, | |
| usageShort = "Computes a fingerprint from the supplied input (SAM/BAM or VCF) file and compares it to the provided genotypes", | |
| programGroup = Fingerprinting.class | |
| ) | |
| public class CheckFingerprint extends CommandLineProgram { | |
| static final String USAGE_DETAILS = "Computes a fingerprint from the supplied input file (SAM/BAM or VCF) file and " + | |
| "compares it to the expected fingerprint genotypes provided. The key output is a LOD score " + | |
| "which represents the relative likelihood of the sequence data originating from the same " + | |
| "sample as the genotypes vs. from a random sample. Two outputs are produced: (1) a summary " + | |
| "metrics file that gives metrics at the single sample level (if the input was a VCF) or at the read " + | |
| "level (lane or index within a lane) (if the input was a SAM/BAM) " + | |
| "versus a set of known genotypes for the expected sample, and (2) a detail metrics file that " + | |
| "contains an individual SNP/Haplotype comparison within a fingerprint comparison. The two " + | |
| "files may be specified individually using the SUMMARY_OUTPUT and DETAIL_OUTPUT options. " + | |
| "Alternatively the OUTPUT option may be used instead to give the base of the two output " + | |
| "files, with the summary metrics having a file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' " + | |
| "and the detail metrics having a file extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'."; | |
| @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file SAM/BAM or VCF. If a VCF is used, " + | |
| "it must have at least one sample. If there are more than one samples in the VCF, the parameter OBSERVED_SAMPLE_ALIAS must " + | |
| "be provided in order to indicate which sample's data to use. If there are no samples in the VCF, an exception will be thrown.") | |
| public File INPUT; | |
| @Option(optional = true, doc = "If the input is a VCF, this parameters used to select which sample's data in the VCF to use.") | |
| public String OBSERVED_SAMPLE_ALIAS; | |
| @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The base prefix of output files to write. The summary metrics " + | |
| "will have the file extension '" + CheckFingerprint.FINGERPRINT_SUMMARY_FILE_SUFFIX + "' and the detail metrics will have " + | |
| "the extension '" + CheckFingerprint.FINGERPRINT_DETAIL_FILE_SUFFIX + "'.", mutex = {"SUMMARY_OUTPUT", "DETAIL_OUTPUT"}) | |
| public String OUTPUT; | |
| @Option(shortName = "S", doc = "The text file to which to write summary metrics.", mutex = {"OUTPUT"}) | |
| public File SUMMARY_OUTPUT; | |
| @Option(shortName = "D", doc = "The text file to which to write detail metrics.", mutex = {"OUTPUT"}) | |
| public File DETAIL_OUTPUT; | |
| @Option(shortName="G", doc = "File of genotypes (VCF or GELI) to be used in comparison. May contain " + | |
| "any number of genotypes; CheckFingerprint will use only those that are usable for fingerprinting.") | |
| public File GENOTYPES; | |
| @Option(shortName = "SAMPLE_ALIAS", optional=true, doc = "This parameter can be used to specify which sample's genotypes to use from the " + | |
| "expected VCF file (the GENOTYPES file). If it is not supplied, the sample name from the input " + | |
| "(VCF or BAM read group header) will be used.") | |
| public String EXPECTED_SAMPLE_ALIAS; | |
| @Option(shortName="H", doc = "The file lists a set of SNPs, optionally arranged in high-LD blocks, to be used for fingerprinting. See " + | |
| "https://software.broadinstitute.org/gatk/documentation/article?id=9526 for details.") | |
| public File HAPLOTYPE_MAP; | |
| @Option(shortName="LOD", doc = "When counting haplotypes checked and matching, count only haplotypes " + | |
| "where the most likely haplotype achieves at least this LOD.") | |
| public double GENOTYPE_LOD_THRESHOLD = 5; | |
| @Option(optional=true, shortName="IGNORE_RG", doc = "If the input is a SAM/BAM, and this parameter is true, treat the " + | |
| "entire input BAM as one single read group in the calculation, " + | |
| "ignoring RG annotations, and producing a single fingerprint metric for the entire BAM.") | |
| public boolean IGNORE_READ_GROUPS = false; | |
| private final Log log = Log.getInstance(CheckFingerprint.class); | |
| public static final String FINGERPRINT_SUMMARY_FILE_SUFFIX = "fingerprinting_summary_metrics"; | |
| public static final String FINGERPRINT_DETAIL_FILE_SUFFIX = "fingerprinting_detail_metrics"; | |
| // Stock main method | |
| public static void main(final String[] args) { | |
| new CheckFingerprint().instanceMainWithExit(args); | |
| } | |
| @Override | |
| protected int doWork() { | |
| final File outputDetailMetricsFile, outputSummaryMetricsFile; | |
| if (OUTPUT == null) { | |
| outputDetailMetricsFile = DETAIL_OUTPUT; | |
| outputSummaryMetricsFile = SUMMARY_OUTPUT; | |
| } | |
| else { | |
| if (!OUTPUT.endsWith(".")) OUTPUT = OUTPUT + "."; | |
| outputDetailMetricsFile = new File(OUTPUT + FINGERPRINT_DETAIL_FILE_SUFFIX); | |
| outputSummaryMetricsFile = new File(OUTPUT + FINGERPRINT_SUMMARY_FILE_SUFFIX); | |
| } | |
| IOUtil.assertFileIsReadable(INPUT); | |
| IOUtil.assertFileIsReadable(HAPLOTYPE_MAP); | |
| IOUtil.assertFileIsReadable(GENOTYPES); | |
| IOUtil.assertFileIsWritable(outputDetailMetricsFile); | |
| IOUtil.assertFileIsWritable(outputSummaryMetricsFile); | |
| final FingerprintChecker checker = new FingerprintChecker(HAPLOTYPE_MAP); | |
| List<FingerprintResults> results; | |
| String observedSampleAlias = null; | |
| final boolean isBamOrSamFile = isBamOrSamFile(INPUT); | |
| if (isBamOrSamFile) { | |
| SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), SAMSequenceDictionaryExtractor.extractDictionary(GENOTYPES), true); | |
| SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT), checker.getHeader().getSequenceDictionary(), true); | |
| // Verify that there's only one sample in the SAM/BAM. | |
| final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); | |
| for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) { | |
| if (observedSampleAlias == null) { | |
| observedSampleAlias = rec.getSample(); | |
| } | |
| else if (!observedSampleAlias.equals(rec.getSample())) { | |
| throw new PicardException("INPUT SAM/BAM file must not contain data from multiple samples."); | |
| } | |
| } | |
| CloserUtil.close(in); | |
| // If expected sample alias isn't supplied, assume it's the one from the INPUT file's RGs | |
| if (EXPECTED_SAMPLE_ALIAS == null) { | |
| EXPECTED_SAMPLE_ALIAS = observedSampleAlias; | |
| } | |
| results = checker.checkFingerprints( | |
| Collections.singletonList(INPUT), | |
| Collections.singletonList(GENOTYPES), | |
| EXPECTED_SAMPLE_ALIAS, | |
| IGNORE_READ_GROUPS); | |
| } else { // Input is a VCF | |
| // Note that FingerprintChecker.loadFingerprints() verifies that the VCF's Sequence Dictionaries agree with that of the Haplotye Map File | |
| // Verify that there is only one sample in the VCF | |
| final VCFFileReader fileReader = new VCFFileReader(INPUT, false); | |
| final VCFHeader fileHeader = fileReader.getFileHeader(); | |
| if (fileHeader.getNGenotypeSamples() < 1) { | |
| throw new PicardException("INPUT VCF file must contain at least one sample."); | |
| } | |
| if ((fileHeader.getNGenotypeSamples() > 1) && (OBSERVED_SAMPLE_ALIAS == null)) { | |
| throw new PicardException("INPUT VCF file contains multiple samples and yet the OBSERVED_SAMPLE_ALIAS parameter is not set."); | |
| } | |
| // set observedSampleAlias to the parameter, if set. Otherwise, if here, this must be a single sample VCF, get it's sample | |
| observedSampleAlias = (OBSERVED_SAMPLE_ALIAS != null) ? OBSERVED_SAMPLE_ALIAS : fileHeader.getGenotypeSamples().get(0); | |
| // Now verify that observedSampleAlias is, in fact, in the VCF | |
| if (!fileHeader.getGenotypeSamples().contains(observedSampleAlias)) { | |
| throw new PicardException("INPUT VCF file does not contain OBSERVED_SAMPLE_ALIAS: " + observedSampleAlias); | |
| } | |
| if (OBSERVED_SAMPLE_ALIAS == null) { | |
| observedSampleAlias = fileHeader.getGenotypeSamples().get(0); | |
| } | |
| fileReader.close(); | |
| // If expected sample alias isn't supplied, assume it's the one from the INPUT file | |
| if (EXPECTED_SAMPLE_ALIAS == null) { | |
| EXPECTED_SAMPLE_ALIAS = observedSampleAlias; | |
| } | |
| results = checker.checkFingerprints( | |
| Collections.singletonList(INPUT), | |
| Collections.singletonList(GENOTYPES), | |
| observedSampleAlias, | |
| EXPECTED_SAMPLE_ALIAS); | |
| } | |
| final MetricsFile<FingerprintingSummaryMetrics,?> summaryFile = getMetricsFile(); | |
| final MetricsFile<FingerprintingDetailMetrics,?> detailsFile = getMetricsFile(); | |
| for (final FingerprintResults fpr : results) { | |
| final MatchResults mr = fpr.getMatchResults().first(); | |
| final FingerprintingSummaryMetrics metrics = new FingerprintingSummaryMetrics(); | |
| metrics.READ_GROUP = fpr.getReadGroup(); | |
| metrics.SAMPLE = EXPECTED_SAMPLE_ALIAS; | |
| metrics.LL_EXPECTED_SAMPLE = mr.getSampleLikelihood(); | |
| metrics.LL_RANDOM_SAMPLE = mr.getPopulationLikelihood(); | |
| metrics.LOD_EXPECTED_SAMPLE = mr.getLOD(); | |
| for (final LocusResult lr : mr.getLocusResults()) { | |
| final DiploidGenotype expectedGenotype = lr.getExpectedGenotype(); | |
| final DiploidGenotype observedGenotype = lr.getMostLikelyGenotype(); | |
| // Update the summary metrics | |
| metrics.HAPLOTYPES_WITH_GENOTYPES++; | |
| if (lr.getLodGenotype() >= GENOTYPE_LOD_THRESHOLD) { | |
| metrics.HAPLOTYPES_CONFIDENTLY_CHECKED++; | |
| if (lr.getExpectedGenotype() == lr.getMostLikelyGenotype()) { | |
| metrics.HAPLOTYPES_CONFIDENTLY_MATCHING++; | |
| } | |
| if (expectedGenotype.isHeterozygous() && observedGenotype.isHomomozygous()) { | |
| metrics.HET_AS_HOM++; | |
| } | |
| if (expectedGenotype.isHomomozygous() && observedGenotype.isHeterozygous()) { | |
| metrics.HOM_AS_HET++; | |
| } | |
| if (expectedGenotype.isHomomozygous() && observedGenotype.isHomomozygous() | |
| && expectedGenotype.compareTo(observedGenotype) != 0) { | |
| metrics.HOM_AS_OTHER_HOM++; | |
| } | |
| } | |
| // Build the detail metrics | |
| final FingerprintingDetailMetrics details = new FingerprintingDetailMetrics(); | |
| details.READ_GROUP = fpr.getReadGroup(); | |
| details.SAMPLE = EXPECTED_SAMPLE_ALIAS; | |
| details.SNP = lr.getSnp().getName(); | |
| details.SNP_ALLELES = lr.getSnp().getAlleleString(); | |
| details.CHROM = lr.getSnp().getChrom(); | |
| details.POSITION = lr.getSnp().getPos(); | |
| details.EXPECTED_GENOTYPE = expectedGenotype.toString(); | |
| details.OBSERVED_GENOTYPE = observedGenotype.toString(); | |
| details.LOD = lr.getLodGenotype(); | |
| details.OBS_A = lr.getAllele1Count(); | |
| details.OBS_B = lr.getAllele2Count(); | |
| detailsFile.addMetric(details); | |
| } | |
| summaryFile.addMetric(metrics); | |
| log.info("Read Group: " + metrics.READ_GROUP + " / " + observedSampleAlias + " vs. " + metrics.SAMPLE + ": LOD = " + metrics.LOD_EXPECTED_SAMPLE); | |
| } | |
| summaryFile.write(outputSummaryMetricsFile); | |
| detailsFile.write(outputDetailMetricsFile); | |
| return 0; | |
| } | |
| protected String[] customCommandLineValidation() { | |
| IOUtil.assertFileIsReadable(INPUT); | |
| boolean isBamOrSamFile = isBamOrSamFile(INPUT); | |
| if (!isBamOrSamFile && IGNORE_READ_GROUPS) { | |
| return new String[]{"The parameter IGNORE_READ_GROUPS can only be used with BAM/SAM inputs."}; | |
| } | |
| if (isBamOrSamFile && OBSERVED_SAMPLE_ALIAS != null) { | |
| return new String[]{"The parameter OBSERVED_SAMPLE_ALIAS can only be used with a VCF input."}; | |
| } | |
| return super.customCommandLineValidation(); | |
| } | |
| static boolean isBamOrSamFile(final File f) { | |
| return (BamFileIoUtils.isBamFile(f) || f.getName().endsWith(IOUtil.SAM_FILE_EXTENSION)); | |
| } | |
| } |