Add basic VCF tools.
mattsooknah committed Feb 27, 2015
1 parent 66aa5e0 commit cf2954f
Showing 9 changed files with 1,191 additions and 0 deletions.
org/broadinstitute/hellbender/tools/picard/vcf/GatherVcfs.java
@@ -0,0 +1,270 @@
package org.broadinstitute.hellbender.tools.picard.vcf;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.hellbender.cmdline.Argument;
import org.broadinstitute.hellbender.cmdline.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.PicardCommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.*;

/**
 * Simple little class that combines multiple VCFs that have exactly the same set of samples
 * and totally discrete sets of loci.
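 *
 * <p>A hypothetical example invocation (the jar name and Picard-style I=/O= argument syntax are
 * assumptions for illustration, not taken from this commit):
 * <pre>
 *     java -jar hellbender.jar GatherVcfs I=scatter.shard1.vcf.gz I=scatter.shard2.vcf.gz O=gathered.vcf.gz
 * </pre>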
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
        usage = "Gathers multiple VCF files from a scatter operation into a single VCF file. Input files " +
                "must be supplied in genomic order and must not have events at overlapping positions.",
        usageShort = "Gathers multiple VCF files from a scatter operation into a single VCF file",
        programGroup = VariantProgramGroup.class
)
public class GatherVcfs extends PicardCommandLineProgram {

    @Argument(shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "Input VCF file(s).")
    public List<File> INPUT;

    @Argument(shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file.")
    public File OUTPUT;

    private static final Log log = Log.getInstance(GatherVcfs.class);

    public GatherVcfs() {
        CREATE_INDEX = true;
    }

    @Override
    protected Object doWork() {
        log.info("Checking inputs.");
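        // Inputs without a recognized VCF extension are assumed by htsjdk's IOUtil.unrollFiles to be list files naming VCF paths, and are expanded in place.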
        INPUT = IOUtil.unrollFiles(INPUT, IOUtil.VCF_EXTENSIONS);
        for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
        IOUtil.assertFileIsWritable(OUTPUT);

        final SAMSequenceDictionary sequenceDictionary = VCFFileReader.getSequenceDictionary(INPUT.get(0));

        if (CREATE_INDEX && sequenceDictionary == null) {
            throw new UserException("In order to index the resulting VCF, the input VCFs must contain ##contig lines.");
        }

        log.info("Checking file headers and first records to ensure compatibility.");
        assertSameSamplesAndValidOrdering(INPUT);

        if (areAllBlockCompressed(INPUT) && areAllBlockCompressed(CollectionUtil.makeList(OUTPUT))) {
            log.info("Gathering by copying gzip blocks. Will not be able to validate that positions do not overlap between files.");
            if (CREATE_INDEX) log.warn("Index creation not currently supported when gathering block compressed VCFs.");
            gatherWithBlockCopying(INPUT, OUTPUT);
        }
        else {
            log.info("Gathering by conventional means.");
            gatherConventionally(sequenceDictionary, CREATE_INDEX, INPUT, OUTPUT);
        }

        return null;
    }

    /** Checks, via filename inspection, that all files appear to be block compressed. */
    private boolean areAllBlockCompressed(final List<File> input) {
        for (final File f : input) {
            if (VariantContextWriterFactory.isBCFOutput(f) || !VariantContextWriterFactory.isCompressedVcf(f)) return false;
        }

        return true;
    }

    /** Validates that all headers contain the same set of genotyped samples and that files are in order by position of first record. */
    private static void assertSameSamplesAndValidOrdering(final List<File> inputFiles) {
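        // The 'false' argument to VCFFileReader opens each VCF without requiring an accompanying index file.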
        final VCFHeader header = new VCFFileReader(inputFiles.get(0), false).getFileHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        final VariantContextComparator comparator = new VariantContextComparator(header.getSequenceDictionary());
        final List<String> samples = header.getGenotypeSamples();

        File lastFile = null;
        VariantContext lastContext = null;

        for (final File f : inputFiles) {
            final VCFFileReader in = new VCFFileReader(f, false);
            dict.assertSameDictionary(in.getFileHeader().getSequenceDictionary());
            final List<String> theseSamples = in.getFileHeader().getGenotypeSamples();

            if (!samples.equals(theseSamples)) {
                final SortedSet<String> s1 = new TreeSet<String>(samples);
                final SortedSet<String> s2 = new TreeSet<String>(theseSamples);
                s1.removeAll(theseSamples);
                s2.removeAll(samples);

                throw new IllegalArgumentException("VCFs do not have identical sample lists." +
                        " Samples unique to first file: " + s1 + ". Samples unique to " + f.getAbsolutePath() + ": " + s2 + ".");
            }

            final CloseableIterator<VariantContext> variantIterator = in.iterator();
            if (variantIterator.hasNext()) {
                final VariantContext currentContext = variantIterator.next();
                if (lastContext != null) {
                    if (comparator.compare(lastContext, currentContext) >= 0) {
                        throw new IllegalArgumentException("First record in file " + f.getAbsolutePath() + " is not after first record in " +
                                "previous file " + lastFile.getAbsolutePath());
                    }
                }

                lastContext = currentContext;
                lastFile = f;
            }

            CloserUtil.close(in);
        }
    }

    /** Code for gathering multiple VCFs that works regardless of input format and output format, but can be slow. */
    private static void gatherConventionally(final SAMSequenceDictionary sequenceDictionary,
                                             final boolean createIndex,
                                             final List<File> inputFiles,
                                             final File outputFile) {
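        // Start from the writer factory's default options and toggle on-the-fly indexing to match createIndex.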
        final EnumSet<Options> options = EnumSet.copyOf(VariantContextWriterFactory.DEFAULT_OPTIONS);
        if (createIndex) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY);
        final VariantContextWriter out = VariantContextWriterFactory.create(outputFile, sequenceDictionary, options);

        final ProgressLogger progress = new ProgressLogger(log, 10000);
        VariantContext lastContext = null;
        File lastFile = null;
        VCFHeader firstHeader = null;
        VariantContextComparator comparator = null;

        for (final File f : inputFiles) {
            log.debug("Gathering from file: ", f.getAbsolutePath());
            final VCFFileReader variantReader = new VCFFileReader(f, false);
            final PeekableIterator<VariantContext> variantIterator = new PeekableIterator<VariantContext>(variantReader.iterator());
            final VCFHeader header = variantReader.getFileHeader();

            if (firstHeader == null) {
                firstHeader = header;
                out.writeHeader(firstHeader);
                comparator = new VariantContextComparator(firstHeader.getContigLines());
            }

            if (lastContext != null && variantIterator.hasNext()) {
                final VariantContext vc = variantIterator.peek();
                if (comparator.compare(vc, lastContext) <= 0) {
                    throw new IllegalStateException("First variant in file " + f.getAbsolutePath() + " is at " + vc.getSource() +
                            " but last variant in earlier file " + lastFile.getAbsolutePath() + " is at " + lastContext.getSource());
                }
            }

            while (variantIterator.hasNext()) {
                lastContext = variantIterator.next();
                out.add(lastContext);
                progress.record(lastContext.getChr(), lastContext.getStart());
            }

            lastFile = f;

            CloserUtil.close(variantIterator);
            CloserUtil.close(variantReader);
        }

        out.close();
    }

    /**
     * Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing
     * most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
     * (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
     * blocks (excluding a terminator block if present) are copied directly from input to output.
     */
    private static void gatherWithBlockCopying(final List<File> vcfs, final File output) {
        try {
            final FileOutputStream out = new FileOutputStream(output);
            boolean isFirstFile = true;

            for (final File f : vcfs) {
                log.info("Gathering " + f.getAbsolutePath());
                final FileInputStream in = new FileInputStream(f);

                // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it
                final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
                if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
                    throw new UserException.MalformedFile(f.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
                }

                if (!isFirstFile) {
                    final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
                    boolean lastByteNewline = true;

                    while (in.available() > 0) {
                        // Read a block - blockIn.available() is guaranteed to return the bytes remaining in the block that has been
                        // read, and since we haven't consumed any yet, that is the block size.
                        final int blockLength = blockIn.available();
                        final byte[] blockContents = new byte[blockLength];
                        final int read = blockIn.read(blockContents);
                        if (blockLength == 0 || read != blockLength) {
                            throw new IllegalStateException("Could not read available bytes from BlockCompressedInputStream.");
                        }

                        // Scan forward within the block to see if we can find the end of the header within this block
                        int firstNonHeaderByteIndex = -1;
                        for (int i = 0; i < read; ++i) {
                            final byte b = blockContents[i];
                            final boolean thisByteNewline = (b == '\n' || b == '\r');

                            if (lastByteNewline && !thisByteNewline && b != '#') {
                                // Aha! Found first byte of non-header data in file!
                                firstNonHeaderByteIndex = i;
                                break;
                            }

                            lastByteNewline = thisByteNewline;
                        }

                        // If we found the end of the header then write the remainder of this block out as a
                        // new gzip block and then break out of the while loop
                        if (firstNonHeaderByteIndex >= 0) {
                            final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, null);
                            blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
                            blockOut.flush();
                            // Don't close blockOut because closing underlying stream would break everything
                            break;
                        }
                    }
                }

                // Copy remainder of input stream into output stream
                final long currentPos = in.getChannel().position();
                final long length = f.length();
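                // Skip the input's empty terminator block (if present) so that a single terminator is written exactly once at the end of the gathered file.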
                final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) ?
                        BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
                final long bytesToWrite = length - skipLast - currentPos;

                IOUtil.transferByStream(in, out, bytesToWrite);
                in.close();
                isFirstFile = false;
            }

            // And lastly add the Terminator block and close up
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
            out.close();
        }
        catch (final IOException ioe) {
            throw new RuntimeIOException(ioe);
        }
    }
}
org/broadinstitute/hellbender/tools/picard/vcf/MakeSitesOnlyVcf.java
@@ -0,0 +1,105 @@
package org.broadinstitute.hellbender.tools.picard.vcf;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.hellbender.cmdline.Argument;
import org.broadinstitute.hellbender.cmdline.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.PicardCommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;

import java.io.File;
import java.util.*;

/**
 * Writes out a VCF that contains all the site-level information for all records in the input VCF and no per-sample information.
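 *
 * <p>A hypothetical example invocation (the jar name and Picard-style I=/O= argument syntax are
 * assumptions for illustration, not taken from this commit):
 * <pre>
 *     java -jar hellbender.jar MakeSitesOnlyVcf I=input.vcf.gz O=sites_only.vcf.gz
 * </pre>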
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
        usage = "Reads a VCF/VCF.gz/BCF and removes all genotype information from it while retaining " +
                "all site level information, including annotations based on genotypes (e.g. AN, AF). Output can be " +
                "any supported variant format, including .vcf, .vcf.gz or .bcf.",
        usageShort = "Creates a VCF bereft of genotype information from an input VCF or BCF",
        programGroup = VariantProgramGroup.class
)
public class MakeSitesOnlyVcf extends PicardCommandLineProgram {

    @Argument(shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "Input VCF or BCF")
    public File INPUT;

    @Argument(shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF or BCF to emit without per-sample info.")
    public File OUTPUT;

    @Argument(shortName = "S", doc = "Optionally one or more samples to retain when building the 'sites-only' VCF.", optional = true)
    public Set<String> SAMPLE = new TreeSet<String>();

    public MakeSitesOnlyVcf() {
        CREATE_INDEX = true;
    }

    @Override
    protected Object doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);

        final VCFFileReader reader = new VCFFileReader(INPUT, false);
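        // Rebuild the header from its metadata lines alone, which drops the sample columns from the header object.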
        final VCFHeader inputVcfHeader = new VCFHeader(reader.getFileHeader().getMetaDataInInputOrder());
        final SAMSequenceDictionary sequenceDictionary = inputVcfHeader.getSequenceDictionary();

        if (CREATE_INDEX && sequenceDictionary == null) {
            throw new UserException("A sequence dictionary must be available (either through the input file or by setting it explicitly) when creating indexed output.");
        }

        final ProgressLogger progress = new ProgressLogger(Log.getInstance(MakeSitesOnlyVcf.class), 10000);

        // Set up the site-only file writer
        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
                .setOutputFile(OUTPUT)
                .setReferenceDictionary(sequenceDictionary);
        if (CREATE_INDEX)
            builder.setOption(Options.INDEX_ON_THE_FLY);
        else
            builder.unsetOption(Options.INDEX_ON_THE_FLY);
        final VariantContextWriter writer = builder.build();

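        // Write a header containing only the retained samples; with an empty SAMPLE set this is a true sites-only header.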
        final VCFHeader header = new VCFHeader(inputVcfHeader.getMetaDataInInputOrder(), SAMPLE);
        writer.writeHeader(header);

        // Go through the input, strip the records and write them to the output
        final CloseableIterator<VariantContext> iterator = reader.iterator();
        while (iterator.hasNext()) {
            final VariantContext full = iterator.next();
            final VariantContext site = subsetToSamplesWithOriginalAnnotations(full, SAMPLE);
            writer.add(site);
            progress.record(site.getChr(), site.getStart());
        }

        CloserUtil.close(iterator);
        CloserUtil.close(reader);
        writer.close();

        return null;
    }

    /** Makes a new VariantContext with only the desired samples. */
    private static VariantContext subsetToSamplesWithOriginalAnnotations(final VariantContext ctx, final Set<String> samples) {
        final VariantContextBuilder builder = new VariantContextBuilder(ctx);
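        // Genotype-based INFO annotations (e.g. AN, AF) are carried over unchanged rather than recalculated for the subset.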
        final GenotypesContext newGenotypes = ctx.getGenotypes().subsetToSamples(samples);
        builder.alleles(ctx.getAlleles());
        return builder.genotypes(newGenotypes).make();
    }
}