Permalink
Browse files

Delete unused FastaSequenceIndexBuilder class and accompanying test

This class, being unused, was no longer getting packaged into the
GATK release jar by bcel, and so attempting to run its unit test
on the release jar was producing an error.
  • Loading branch information...
1 parent 3390fc7 commit f57256b6c2aed18f2599fc9b467a86184ca54599 @droazen droazen committed May 1, 2013
@@ -1,305 +0,0 @@
-/*
-* Copyright (c) 2012 The Broad Institute
-*
-* Permission is hereby granted, free of charge, to any person
-* obtaining a copy of this software and associated documentation
-* files (the "Software"), to deal in the Software without
-* restriction, including without limitation the rights to use,
-* copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following
-* conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-package net.sf.picard.reference;
-
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*;
-
-import java.io.*;
-
-import org.broadinstitute.sting.utils.exceptions.UserException;
-
-/**
- * Builds FastaSequenceIndex from fasta file.
- * Produces fai file with same output as samtools faidx
- */
-public class FastaSequenceIndexBuilder {
- final public File fastaFile;
- final boolean printProgress;
-
- // keep track of location in file
- long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file;
-
- // vars that store information about the contig that is currently being read
- String contig;
- long location, size, bytesPerLine, basesPerLine, basesThisLine;
- int thisSequenceIndex = 0;
-
- // vars that keep loop state
- byte lastByte = 0, currentByte = 0, nextByte = 0;
- public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT }
- Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum?
-
- public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) {
- this.fastaFile = fastaFile;
- fileLength = fastaFile.length();
- this.printProgress = printProgress;
- }
-
- /**
- * Creates fasta sequence index from fasta file
- * @return FastaSequenceIndex that is read from file
- */
- public FastaSequenceIndex createIndex() { // should this be static?
- bytesRead = -1;
- endOfLastLine = -1;
- contig = "";
- location = 0;
- size = 0;
- bytesPerLine = 0;
- basesPerLine = 0;
- basesThisLine = 0;
- lastTimestamp = System.currentTimeMillis();
- FastaSequenceIndex sequenceIndex = new FastaSequenceIndex();
-
- // initialize input stream
- DataInputStream in;
- try {
- in = new DataInputStream(new BufferedInputStream(new FileInputStream(fastaFile)));
- }
- catch (Exception e) {
- throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e);
- }
-
- /*
- * iterate through each character in file one at a time, but must account for variance in line terminators
- * strategy is to check if current character is a line terminator (\n, \r), then check next character
- * only treat as end of line if next character is NOT a line terminator
- */
- try {
- // intialize iterators
- nextByte = in.readByte();
- currentByte = '\n';
- while(currentByte != -1) {
-
- bytesRead ++; // update position in file
- lastByte = currentByte;
- currentByte = nextByte;
- try {
- nextByte = in.readByte();
- }
- catch (EOFException e) {
- nextByte = -1;
- }
-
- switch(status) {
-
- // if not currently reading anything
- // only thing that can trigger action is '>' (start of contig) or ';' (comment)
- case NONE:
- if (currentByte == '>')
- status = CONTIG;
- else if (currentByte == ';')
- status = COMMENT;
- break;
-
- // if reading a comment, just ignore everything until end of line
- case COMMENT:
- if (isEol(currentByte)) {
- if (!isEol(nextByte))
- status = Status.NONE;
- }
- break;
-
- // if reading a contig, add char to contig string
- // contig string can be anything, including spaces
- case CONTIG:
- if (isEol(currentByte)) {
- if (!isEol(nextByte)) {
- status = Status.FIRST_SEQ_LINE;
- location = bytesRead + 1;
- }
- }
- else
- contig += (char) currentByte;
- break;
-
- // record bases and bytes of first sequence line, to validate against later lines
- case FIRST_SEQ_LINE:
-
- if (isEol(currentByte)) {
-
- // record bases per line if last character was a base
- if (!isEol(lastByte)) {
- basesPerLine = bytesRead - location;
- basesThisLine = basesPerLine;
- size += basesPerLine;
- }
-
- // next character is start of next line, now know bytes per line
- if (!isEol(nextByte)) { // figure out what to do if there is only one data line
- bytesPerLine = bytesRead - location + 1;
- status = Status.SEQ_LINE;
- endOfLastLine = bytesRead;
-
- // if next char is ';' or '>', then there is only one contig =>
- if (nextByte == ';' || nextByte == '>')
- finishReadingContig(sequenceIndex);
- }
- }
-
- // validate base character
- else {
- if (!isValidBase(currentByte))
- throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig));
- }
- break;
-
-
- case SEQ_LINE:
-
- if (isEol(currentByte)) {
-
- // record bases per line if last character was a base
- if (!isEol(lastByte)) {
- basesThisLine = bytesRead - endOfLastLine - 1;
- size += basesThisLine;
- }
-
- // reached end of line - check if end of contig
- if (!isEol(nextByte)) {
-
- // if comment or new contig, definitely end of sequence
- if (nextByte == ';' || nextByte == '>')
- finishReadingContig(sequenceIndex);
-
- // if this line has different # of bases OR same # of bases and different # of bytes:
- // error if next char is a valid base, end of contig otherwise
- else if (basesThisLine != basesPerLine || bytesPerLine != bytesRead - endOfLastLine) {
- if (isValidBase(nextByte) && nextByte != -1) {
- throw new UserException.MalformedFile(fastaFile, String.format("An invalid line was found in the contig: %s", contig));
- }
- else
- finishReadingContig(sequenceIndex);
- }
- endOfLastLine = bytesRead;
- }
- }
-
- // validate base character
- else {
- if (!isValidBase(currentByte))
- throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig));
- }
- break;
- }
- }
- in.close();
- return sequenceIndex;
- }
- catch (IOException e) {
- throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e);
- }
- catch (Exception e) {
- throw new ReviewedStingException(e.getMessage(), e);
- }
- }
-
- /**
- * Checks if character is an end of line character, \n or \r
- * @param currentByte Character to check, as a byte
- * @return True if current character is \n or \r' false otherwise
- */
- private boolean isEol(byte currentByte) {
- return (currentByte == '\n' || currentByte == '\r');
- }
-
-
- /**
- * checks that character is valid base
- * only checks that the base isn't whitespace, like picard does
- * could add check that character is A/C/G/T/U if wanted
- * @param currentByte Character to check, as a byte
- * @return True if character is not whitespace, false otherwise
- */
- private boolean isValidBase(byte currentByte) {
- return (!Character.isWhitespace(currentByte) && currentByte != ';' && currentByte != '>');
- }
-
- /*
- * When reader reaches the end of a contig
- * Reset iterators and add contig to sequence index
- */
- private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
- sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
- status = Status.NONE;
- contig = "";
- size = 0;
-
- if (System.currentTimeMillis() - lastTimestamp > 10000) {
- int percentProgress = (int) (100*bytesRead/fileLength);
- if (printProgress)
- System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress));
- lastTimestamp = System.currentTimeMillis();
- }
- }
-
- /*
- * Trims the contig name to the expected value by removing any characters after the first whitespace
- */
- private static String trimContigName(final String contigName) {
- int whitespaceIndex = contigName.indexOf(' ');
- return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex);
- }
-
- /**
- * Stores FastaSequenceIndex as a .fasta.fai file on local machine
- * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
- * @param sequenceIndex sequenceIndex to be saved
- * @param faiFile file where we should store index
- */
- public static void saveAsFaiFile(FastaSequenceIndex sequenceIndex, File faiFile) {
-
- BufferedWriter out;
- try {
- out = new BufferedWriter(new FileWriter(faiFile));
- }
- catch (Exception e) {
- throw new UserException.CouldNotCreateOutputFile(faiFile, e);
- }
-
- try {
- for(FastaSequenceIndexEntry entry: sequenceIndex) {
- out.write(toIndexFileLine(entry));
- out.newLine();
- }
- out.close();
- }
- catch (Exception e) {
- throw new UserException.CouldNotCreateOutputFile(faiFile, e);
- }
- }
-
- /**
- * Print string in format of fai file line
- * @return Contig as one line in a fai file
- */
- private static String toIndexFileLine(FastaSequenceIndexEntry entry) {
- return String.format("%s\t%d\t%d\t%d\t%d", entry.getContig(), entry.getSize(), entry.getLocation(), entry.getBasesPerLine(), entry.getBytesPerLine());
- }
-
-}
Oops, something went wrong.

0 comments on commit f57256b

Please sign in to comment.