From bceab293eb75df4dbbb8157f16c08891de1ae827 Mon Sep 17 00:00:00 2001 From: Andreas Prlic Date: Fri, 29 Jul 2016 10:37:32 -0700 Subject: [PATCH 1/5] Adding test for genome mapping --- .../nbio/genome/TestGenomeMapping.java | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java diff --git a/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java b/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java new file mode 100644 index 0000000000..b4615b9db2 --- /dev/null +++ b/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java @@ -0,0 +1,151 @@ +package org.biojava.nbio.genome; + +import com.google.common.collect.Lists; +import com.google.common.collect.Range; +import junit.framework.TestCase; +import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition; +import org.biojava.nbio.genome.parsers.genename.GeneChromosomePositionParser; +import org.biojava.nbio.genome.util.ChromosomeMappingTools; +import org.junit.Test; + +import java.io.InputStream; +import java.net.URL; +import java.util.List; +import java.util.zip.GZIPInputStream; + +/** + * Created by andreas on 7/19/16. + */ +public class TestGenomeMapping extends TestCase{ + + private static final String geneChromosomeFile = "http://cdn.rcsb.org/gene/hg38/geneChromosome38.tsf.gz"; + + private List gcps = null; + + @Override + protected void setUp() throws Exception { + super.setUp(); + InputStream input = new GZIPInputStream(new URL(geneChromosomeFile).openStream()); + gcps = GeneChromosomePositionParser.getChromosomeMappings(input); + + + } + + + @Test + public void testAK1() { + String geneName = "AK1"; + + assertNotNull(gcps); + assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); + + int uniProtLength = 194; + + try { + + for (GeneChromosomePosition pos : gcps) { + + //System.out.println(pos.getGeneName()); + if (!pos.getGeneName().equals(geneName)) + continue; + + /// there are three alternative transcripts for AK1. + // we are just testing one here: + + if ( ! pos.getGenebankId().equals("NM_000476")) + continue; + + assertTrue(pos.getGeneName().equals(geneName)); + assertTrue(pos.getOrientation().equals('-')); + assertTrue(pos.getChromosome().equals("chr9")); + + List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); + + validateExon(0,0,7, cdsranges ); + validateExon(1,7,43, cdsranges ); + validateExon(2,43,207, cdsranges ); + validateExon(3,207,324, cdsranges ); + validateExon(4,324,516, cdsranges ); + validateExon(5,516,585, cdsranges ); + + + int cdslength = ChromosomeMappingTools.getCDSLength(pos); + + assertTrue("CDS length should be 582, but is " + cdslength, cdslength == (uniProtLength *3)); + + List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); + + // we are reverse strand. reverse the order + chromranges = Lists.reverse(chromranges); + + assertTrue(chromranges.size() == 6); + + // compare with https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS6881 + validateExon(0,127868008,127868076, chromranges ); + validateExon(1,127868320,127868512, chromranges ); + validateExon(2,127871822,127871939, chromranges ); + validateExon(3,127872689,127872853, chromranges ); + validateExon(4,127873025,127873061, chromranges ); + validateExon(5,127874610,127874617, chromranges ); + + } + } catch (Exception e) { + fail(e.getMessage()); + } + } + + @Test + public void testHBA(){ + + String geneName = "HBA1"; + assertNotNull(gcps); + + assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); + + try { + + for ( GeneChromosomePosition pos : gcps){ + + //System.out.println(pos.getGeneName()); + if ( ! pos.getGeneName().equals(geneName)) + continue; + + assertTrue(pos.getGeneName().equals("HBA1")); + assertTrue(pos.getGenebankId().equals("NM_000558")); + assertTrue(pos.getChromosome().equals("chr16")); + assertTrue(pos.getTranscriptionStart().equals(176650)); + assertTrue(pos.getTranscriptionEnd().equals(177522)); + assertTrue(pos.getOrientation().equals('+')); + + List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); + + assertTrue(cdsranges.size() == 3); + + validateExon(0,0,95,cdsranges); + validateExon(1,95,300,cdsranges); + validateExon(2,300,429,cdsranges); + + + List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); + + validateExon(0,176716,176811, chromranges ); + validateExon(1,176928,177133, chromranges ); + validateExon(2,177282,177411, chromranges ); + + + } + } catch (Exception e){ + fail(e.getMessage()); + } + + + } + + private void validateExon(int exonNr, int start, int stop, List> cdsranges) { + + Range exon = cdsranges.get(exonNr); + assertTrue("Exon " + exonNr + " boundary "+ exon.lowerEndpoint() + " does not match " +start , exon.lowerEndpoint().equals(start)); + assertTrue("Exon " + exonNr + " boundary " + exon.upperEndpoint() + " does not match " + stop, exon.upperEndpoint().equals(stop)); + + } +} From f2f4143c8adc9057f893c086337d5444e633c3f2 Mon Sep 17 00:00:00 2001 From: Andreas Prlic Date: Fri, 29 Jul 2016 10:38:44 -0700 Subject: [PATCH 2/5] new utility class for chromosome mappings. --- .../biojava/nbio/genome/util/ChromosomeMappingTools.java | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java diff --git a/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java new file mode 100644 index 0000000000..41eea76808 --- /dev/null +++ b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java @@ -0,0 +1,7 @@ +package org.biojava.nbio.genome.util; + +/** + * Created by andreas on 7/29/16. + */ +public class ChromosomeMappingTools { +} From 04b8f4a4ac8ef4cf5f2fd0d29f2764d27d66b1b3 Mon Sep 17 00:00:00 2001 From: Andreas Prlic Date: Fri, 29 Jul 2016 10:39:24 -0700 Subject: [PATCH 3/5] new utility class for chromosome mappings. --- .../genome/util/ChromosomeMappingTools.java | 1230 ++++++++++++++++- 1 file changed, 1229 insertions(+), 1 deletion(-) diff --git a/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java index 41eea76808..07ace5278b 100644 --- a/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java +++ b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java @@ -1,7 +1,1235 @@ package org.biojava.nbio.genome.util; +import com.google.common.collect.Range; + +import org.biojava.nbio.genome.parsers.genename.ChromPos; +import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; + /** - * Created by andreas on 7/29/16. + * A class that take care of the painful mapping */ public class ChromosomeMappingTools { + + private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class); + + + private static final String newline = System.getProperty("line.separator"); + + + public static final String CHROMOSOME = "CHROMOSOME"; + public static final String CDS = "CDS"; + + + /** Pretty print the details of a GeneChromosomePosition to a String + * + * @param chromosomePosition + * @return + */ + public static String formatExonStructure(GeneChromosomePosition chromosomePosition ){ + if ( chromosomePosition.getOrientation() == '+') + return formatExonStructureForward(chromosomePosition); + + return formatExonStructureReverse(chromosomePosition); + + } + + + private static String formatExonStructureForward(GeneChromosomePosition chromPos) { + + StringWriter s = new StringWriter(); + + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + boolean inCoding = false; + int codingLength = 0; + + for (int i = 0; i < exonStarts.size(); i++) { + + int start = exonStarts.get(i); + int end = exonEnds.get(i); + + if (start <= cdsStart +1 && end >= cdsStart+1) { + + inCoding = true; + codingLength += (end - cdsStart); + s.append(" UTR : ").append(format(start)).append(" - ").append(format(cdsStart)); + s.append(newline); + s.append(" -> Exon : ").append(format(cdsStart + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - cdsStart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); + s.append(newline); + + } else if (start <= cdsEnd && end >= cdsEnd) { + //logger.debug(" <-- CDS end at: " + cdsEnd ); + inCoding = false; + codingLength += (cdsEnd - start); + + s.append(" <- Exon : ").append(format(start + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); + s.append(newline); + s.append(" UTR : " + (cdsEnd +1) + " - " + format(end)); + s.append(newline); + + + } else if (inCoding) { + // full exon is coding + codingLength += (end - start); + + s.append(" Exon : ").append(format(start + 1)).append(" - ").append(format(end)).append(" | ").append(Integer.toString(end - start)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); + s.append(newline); + } + + } + s.append("Coding Length: "); + s.append((codingLength-3)+""); + s.append(newline); + return s.toString(); + } + + + + private static String showGenePosLink(GeneChromosomePosition chromPos, Integer pos ) { + + String spos = format(pos); + + StringBuffer buf = new StringBuffer(); + buf.append(""); + buf.append(spos); + buf.append(""); + + return buf.toString(); + } + + + private static String formatExonStructureReverse(GeneChromosomePosition chromPos) { + StringWriter s = new StringWriter(); + + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + // logger.debug("CDS START:" +format(cdsStart) + " - " + format(cdsEnd)); + + boolean inCoding = false; + int codingLength = 0; + + if (cdsEnd < cdsStart) { + int tmp = cdsEnd; + cdsEnd = cdsStart; + cdsStart = tmp; + } + + // map reverse + for (int i = exonStarts.size() - 1; i >= 0; i--) { + + int end = exonStarts.get(i); + int start = exonEnds.get(i); + + if (end < start) { + int tmp = end; + end = start; + start = tmp; + } + + if (start <= cdsEnd && end >= cdsEnd) { + inCoding = true; + + int tmpstart = start; + if (start < cdsStart) { + tmpstart = cdsStart; + } + codingLength += (cdsEnd - tmpstart); + + s.append(" UTR :" + format(cdsEnd + 1) + " | " + format(end)); + s.append(newline); + if (tmpstart == start) + s.append(" -> "); + else + s.append(" <-> "); + s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(Integer.toString(cdsEnd - tmpstart)).append(" | ").append(Integer.toString(codingLength)).append(" | ").append(Integer.toString(codingLength % 3)); + s.append(newline); + // single exon with UTR on both ends + if (tmpstart != start) + s.append(" UTR :" + format(cdsStart ) + " - " + format(start + 1)); + s.append(newline); + + } else if (start <= cdsStart && end >= cdsStart) { + inCoding = false; + codingLength += (end - cdsStart); + + s.append(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); + s.append(newline); + s.append(" UTR : " + format(start+1) + " - " + format(cdsStart )); + s.append(newline); + + + } else if (inCoding) { + // full exon is coding + codingLength += (end - start); + + s.append(" Exon : " + format(start+1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); + s.append(newline); + } else { + // e.g. see UBQLN3 + s.append(" no translation! UTR: ").append(format(start)).append(" - ").append(format(end)); + s.append(newline); + } + } + + s.append("CDS length: ").append(Integer.toString(codingLength - 3)); + s.append(newline); + + return s.toString(); + } + + /** + * Get the length of the CDS in nucleotides. + * + * @param chromPos + * @return length of the CDS in nucleotides. + */ + public static int getCDSLength(GeneChromosomePosition chromPos) { + + logger.debug(chromPos.toString()); + + logger.debug("chromosomal information: "); + + logger.debug("Gene:" + chromPos.getGeneName()); + logger.debug(" Transcription (including UTRs): " + chromPos.getTranscriptionStart() + " - " + chromPos.getTranscriptionEnd() + " (length:" + (chromPos.getTranscriptionEnd() - chromPos.getTranscriptionStart()) + ")"); + logger.debug(" Orientation: " + chromPos.getOrientation()); + logger.debug(" CDS: " + (chromPos.getCdsStart()) + " - " + chromPos.getCdsEnd() + " (length: " + (chromPos.getCdsEnd() - chromPos.getCdsStart()) + ")"); + + + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + logger.debug("Exons:" + exonStarts.size()); + + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + + int codingLength; + if (chromPos.getOrientation().equals('+')) + codingLength = ChromosomeMappingTools.getCDSLengthForward(exonStarts, exonEnds, cdsStart, cdsEnd); + else + codingLength = ChromosomeMappingTools.getCDSLengthReverse(exonStarts, exonEnds, cdsStart, cdsEnd); + return codingLength; + } + + + /** + * maps the position of a CDS nucleotide back to the genome + * + * @param cdsNucleotidePosition + * @return a ChromPos object + */ + public static ChromPos getChromosomePosForCDScoordinate(int cdsNucleotidePosition, GeneChromosomePosition chromPos) { + + logger.debug(" ? Checking chromosome position for CDS position " + cdsNucleotidePosition); + + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + logger.debug(" Exons:" + exonStarts.size()); + + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + + ChromPos chromosomePos = null; + + if (chromPos.getOrientation().equals('+')) + + chromosomePos = ChromosomeMappingTools.getChromPosForward(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); + else + chromosomePos = ChromosomeMappingTools.getChromPosReverse(cdsNucleotidePosition, exonStarts, exonEnds, cdsStart, cdsEnd); + + logger.debug("=> CDS pos " + cdsNucleotidePosition + " for " + chromPos.getGeneName() + " is on chromosome at " + chromosomePos); + return chromosomePos; + + } + + /** returns a nicely formatted representation of the position + * + * @param chromosomePosition + * @return + */ + private static String format(int chromosomePosition){ + + + return String.format("%,d", chromosomePosition); + } + + /** + * Get the CDS position mapped on the chromosome position + * + * @param exonStarts + * @param exonEnds + * @param cdsStart + * @param cdsEnd + * @return + */ + public static ChromPos getChromPosReverse(int cdsPos, List exonStarts, + List exonEnds, int cdsStart, int cdsEnd) { + + boolean inCoding = false; + int codingLength = 0; + + if (cdsEnd < cdsStart) { + int tmp = cdsEnd; + cdsEnd = cdsStart; + cdsStart = tmp; + } + + int lengthExons = 0; + + // map reverse + for (int i = exonStarts.size() - 1; i >= 0; i--) { + + logger.debug("Exon #" + (i+1) + "/" + exonStarts.size()); + int end = exonStarts.get(i); + int start = exonEnds.get(i); + + if (end < start) { + int tmp = end; + end = start; + start = tmp; + } + lengthExons += end - start; + + logger.debug(" is " + cdsPos + " part of Reverse exon? " + format(start+1) + " - " + format(end) + " | " + (end - start+1)); + logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength); + + if (start+1 <= cdsEnd && end >= cdsEnd) { + + + // FIRST EXON + inCoding = true; + + + int tmpstart = start; + if (start < cdsStart) { + tmpstart = cdsStart; + } + + // here one of the few places where we don't say start+1 + int check = codingLength + cdsEnd - tmpstart ; + + logger.debug("First Exon | " + (check) + " | " + format(start+1) + " " + format(end) + " | " + (cdsEnd - tmpstart) + " | " + cdsPos ); + + + if ( ( check > cdsPos) ) + { + + int tmp = cdsPos - codingLength ; + + + logger.debug(" -> found position in UTR exon: " + format(cdsPos) + " " + format(tmpstart+1) + " tmp:" + format(tmp) + " cs:" + format(cdsStart+1) + " ce:" + format(cdsEnd) + " cl:" + codingLength); + + return new ChromPos((cdsEnd - tmp), -1) ; + } + + + // don't add 1 here + codingLength += (cdsEnd - tmpstart ); + + boolean debug = logger.isDebugEnabled(); + + + if ( debug ) { + + StringBuffer b = new StringBuffer(); + + b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline); + if (tmpstart == start) + b.append(" -> "); + else + b.append(" <-> "); + b.append("Exon :" + format(tmpstart + 1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart + 1) + " - " + codingLength + " | " + (codingLength % 3) + newline); + + // single exon with UTR on both ends + if (tmpstart != start) + b.append(" UTR :" + format(cdsStart) + " - " + format(start + 1) + newline); + + logger.debug(b.toString()); + } + } else if (start <= cdsStart && end >= cdsStart) { + + // LAST EXON + inCoding = false; + + if (codingLength + end - cdsStart >= cdsPos) { + + // how many remaining coding nucleotides? + int tmp = codingLength + end - cdsStart - cdsPos ; + + logger.debug("cdl: " +codingLength + " tmp:" + tmp + " cdsStart: " + format(cdsStart)); + + logger.debug(" -> XXX found position noncoding exon: cdsPos:" + cdsPos + " s:" + format(start + 1) + " tmp:" + format(tmp) + " cdsStart:" + (cdsStart + 1) + " codingLength:" + codingLength + " cdsEnd:" + format(cdsEnd)); + + return new ChromPos((cdsStart + tmp),-1); + } + + codingLength += (end - cdsStart); + + logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); + + } else if (inCoding) { + + + if (codingLength + end - start -1 >= cdsPos) { + + int tmp = cdsPos - codingLength ; + + if ( tmp > (end - start ) ) { + + tmp = (end - start ); + + logger.debug("changing tmp to " + tmp); + + } + + logger.debug(" " + cdsPos + " " + codingLength + " | " + (cdsPos - codingLength) + " | " + (end -start) + " | " + tmp); + logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" -> RRR found position coding exon: " + cdsPos + " " + format(start+1) + " " + format(end) + " " + tmp + " " + format(cdsStart+1) + " " + codingLength); + + return new ChromPos((end - tmp),cdsPos %3); + } + + // full exon is coding + codingLength += (end - start) ; + + logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start+1) + " | " + codingLength + " | " + (codingLength % 3)); + } else { + // e.g. see UBQLN3 + + logger.debug(" no translation!"); + } + + + logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + cdsPos); + + + } + + logger.debug("length exons: " + lengthExons); + // could not map, or map over the full length?? + return new ChromPos(-1,-1); + + + } + + /** + * Get the CDS position mapped onto the chromosome position + * + * @param exonStarts + * @param exonEnds + * @param cdsStart + * @param cdsEnd + * @return + */ + public static ChromPos getChromPosForward(int cdsPos, List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { + boolean inCoding = false; + int codingLength = 0; + + int lengthExons = 0; + // map forward + for (int i = 0; i < exonStarts.size(); i++) { + + + // start can include UTR + int start = exonStarts.get(i); + int end = exonEnds.get(i); + + lengthExons += end - start; + + + if (start <= cdsStart +1 && end >= cdsStart+1) { + // first exon with UTR + if (codingLength + (end - cdsStart-1) >= cdsPos) { + // we are reaching our target position + int tmp = cdsPos - codingLength; + + + logger.debug(cdsStart + " | " + codingLength + " | " + tmp); + logger.debug(" -> found position in UTR exon: #"+(i+1)+ " cdsPos:" + cdsPos + + " return:"+(cdsStart +1 + tmp) +" start:" + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); + + // we start 1 after cdsStart... + return new ChromPos((cdsStart +1 + tmp),-1); + } + inCoding = true; + codingLength += (end - cdsStart); + + logger.debug(" UTR : " + format(start+1) + " - " + (cdsStart )); + logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); + + } else if (start+1 <= cdsEnd && end >= cdsEnd) { + // LAST EXON with UTR + //logger.debug(" <-- CDS end at: " + cdsEnd ); + inCoding = false; + if (codingLength + (cdsEnd - start-1) >= cdsPos) { + int tmp = cdsPos - codingLength; + + + logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); + logger.debug( codingLength + " | " + tmp + " | " + format(start+1)); + logger.debug(" -> chromPosForward found position in non coding exon: " + cdsPos + " " + format(start+1) + " " + format(tmp) + " " + format(cdsStart) + " " + codingLength); + + return new ChromPos((start +1 + tmp),cdsPos%3); + } + codingLength += (cdsEnd - start-1); + + logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); + + + } else if (inCoding) { + // A standard coding Exon + // tests for the maximum length of this coding exon + if (codingLength + (end - start -1) >= cdsPos) { + + // we are within the range of this exon + int tmp = cdsPos - codingLength ; + + + logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + tmp + " | " + codingLength); + logger.debug(" -> found chr position in coding exon #" + (i+1) + ": cdsPos:" + format(cdsPos) + " s:" + format(start) + "-" + format(end) + " tmp:" + format(tmp) + " cdsStart:" + format(cdsStart) + " codingLength:" + codingLength); + + return new ChromPos((start +1 + tmp),cdsPos%3); + } + // full exon is coding + codingLength += (end - start ); + + logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); + } + // if ( inCoding ) + // logger.debug("exon phase at end:" + (codingLength % 3)); + // + // logger.debug(" coding length: " + codingLength); + + + } + + //logger.debug("length exons: " + lengthExons); + //return codingLength - 3; + + // could not map! + + return new ChromPos(-1,-1); + } + + + /** + * Get the length of the coding sequence + * + * @param exonStarts + * @param exonEnds + * @param cdsStart + * @param cdsEnd + * @return + */ + public static int getCDSLengthReverse(List exonStarts, + List exonEnds, int cdsStart, int cdsEnd) { + + boolean inCoding = false; + int codingLength = 0; + + if (cdsEnd < cdsStart) { + int tmp = cdsEnd; + cdsEnd = cdsStart; + cdsStart = tmp; + } + + int lengthExons = 0; + + // map reverse + for (int i = exonStarts.size() - 1; i >= 0; i--) { + + int end = exonStarts.get(i); + int start = exonEnds.get(i); + + if (end < start) { + int tmp = end; + end = start; + start = tmp; + } + lengthExons += end - start; + + if (start <= cdsEnd && end >= cdsEnd) { + inCoding = true; + + + int tmpstart = start; + if (start < cdsStart) { + tmpstart = cdsStart; + } + codingLength += (cdsEnd - tmpstart); + + boolean debug = logger.isDebugEnabled(); + + if ( debug) { + + StringBuffer b = new StringBuffer(); + + b.append(" UTR :" + (cdsEnd + 1) + " - " + (end) + newline); + if (tmpstart == start) + b.append(" -> "); + else + b.append(" <-> "); + b.append("Exon :" + tmpstart + " - " + cdsEnd + " | " + (cdsEnd - tmpstart) + " | " + codingLength + " | " + (codingLength % 3) + newline); + // single exon with UTR on both ends + if (tmpstart != start) + b.append(" UTR :" + (cdsStart - 1) + " - " + start + newline); + logger.debug(b.toString()); + + } + } else if (start <= cdsStart && end >= cdsStart) { + inCoding = false; + codingLength += (end - cdsStart); + + + logger.debug(" <- Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart) + " | " + (codingLength-3) + " | " + (codingLength % 3)); + logger.debug(" UTR : " + start + " - " + (cdsStart )); + + + + } else if (inCoding) { + // full exon is coding + codingLength += (end - start); + + logger.debug(" Exon : " + start + " - " + end + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); + } else { + // e.g. see UBQLN3 + + logger.debug(" no translation!"); + } + + } + + logger.debug("length exons: " + lengthExons + " codin length: " + (codingLength - 3)); + return codingLength - 3; + } + + /** + * Get the length of the coding sequence + * + * @param exonStarts + * @param exonEnds + * @param cdsStart + * @param cdsEnd + * @return + */ + public static int getCDSLengthForward(List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { + boolean inCoding = false; + int codingLength = 0; + + int lengthExons = 0; + // map forward + for (int i = 0; i < exonStarts.size(); i++) { + + int start = exonStarts.get(i); + int end = exonEnds.get(i); + lengthExons += end - start; + + + logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start)); + + if (start+1 <= cdsStart +1 && end >= cdsStart+1) { + + inCoding = true; + codingLength += (end - cdsStart); + + logger.debug(" UTR : " + start + " - " + (cdsStart )); + logger.debug(" -> Exon : " + (cdsStart+1) + " - " + end + " | " + (end - cdsStart+1) + " | " + codingLength + " | " + (codingLength % 3)); + + } else if (start+1 <= cdsEnd && end >= cdsEnd) { + + inCoding = false; + codingLength += (cdsEnd - start); + + logger.debug(" <- Exon : " + (start +1)+ " - " + cdsEnd + " | " + (cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + cdsEnd + 1 + " - " + end); + + + } else if (inCoding) { + // full exon is coding + codingLength += (end - start); + + logger.debug(" Exon :" + (start+1) + " - " + end + " | " + (end - start+1) + " | " + codingLength + " | " + (codingLength % 3)); + } + + + } + + logger.debug("length exons: " + Integer.toString(lengthExons)); + logger.debug("CDS length:" + Integer.toString((codingLength-3))); + + return codingLength-3 ; + } + + + + /** Extracts the exon boundaries in CDS coordinates. (needs to be divided by 3 to get AA positions) + * + * @param chromPos + * @return + */ + public static List> getCDSExonRanges(GeneChromosomePosition chromPos){ + if ( chromPos.getOrientation() == '+') + + return getCDSExonRangesForward(chromPos,CDS); + + return getCDSExonRangesReverse(chromPos,CDS); + } + + + /** Extracts the boundaries of the coding regions in chromosomal coordinates + * + * @param chromPos + * @return + */ + public static List> getChromosomalRangesForCDS(GeneChromosomePosition chromPos){ + if ( chromPos.getOrientation() == '+') + return getCDSExonRangesForward(chromPos,CHROMOSOME); + + return getCDSExonRangesReverse(chromPos,CHROMOSOME); + } + + private static List> getCDSExonRangesReverse(GeneChromosomePosition chromPos, + String responseType) { + + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + List> data = new ArrayList<>(); + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + boolean inCoding = false; + int codingLength = 0; + + if (cdsEnd < cdsStart) { + int tmp = cdsEnd; + cdsEnd = cdsStart; + cdsStart = tmp; + } + + java.lang.StringBuffer s =null; + + boolean debug = logger.isDebugEnabled(); + + + if ( debug) + s = new StringBuffer(); + + int lengthExons = 0; + + // map reverse + for (int i = exonStarts.size() - 1; i >= 0; i--) { + + int end = exonStarts.get(i); + int start = exonEnds.get(i); + + if (end < start) { + int tmp = end; + end = start; + start = tmp; + } + lengthExons += end - start; + //s.append("Reverse exon: " + end + " - " + start + " | " + (end - start)); + //s.append(newline); + + if (start <= cdsEnd && end >= cdsEnd) { + inCoding = true; + + + int tmpstart = start; + if (start < cdsStart) { + tmpstart = cdsStart; + } + codingLength += (cdsEnd - tmpstart); + if ( debug ) { + s.append(" UTR :").append(format(cdsEnd + 1)).append(" | ").append(format(end)); + s.append(newline); + if (tmpstart == start) + s.append(" -> "); + else + s.append(" <-> "); + s.append("Exon :").append(format(tmpstart + 1)).append(" - ").append(format(cdsEnd)).append(" | ").append(cdsEnd - tmpstart).append(" | ").append(codingLength).append(" | ").append(codingLength % 3); + s.append(newline); + // single exon with UTR on both ends + if (tmpstart != start) + s.append(" UTR :").append(format(cdsStart)).append(" - ").append(format(start + 1)); + s.append(newline); + } + + + Range r ; + if ( responseType.equals(CDS)) + r = Range.closed(0,codingLength); + else + r = Range.closed(tmpstart,cdsEnd); + + data.add(r); + + } else if (start <= cdsStart && end >= cdsStart) { + inCoding = false; + + Range r; + if ( responseType.equals(CDS)) + r = Range.closed(codingLength,codingLength+(end-cdsStart)); + else + r = Range.closed(cdsStart+1,end); + + data.add(r); + + codingLength += (end - cdsStart); + if (debug) { + s.append(" <- Exon : " + format(cdsStart + 1) + " - " + format(end) + " | " + (end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); + s.append(newline); + s.append(" UTR : ").append(format(start + 1)).append(" - ").append(format(cdsStart)); + s.append(newline); + } + + + } else if (inCoding) { + // full exon is coding + + Range r; + if ( responseType.equals(CDS)) + r = Range.closed(codingLength,codingLength+(end-start)); + else + r = Range.closed(start,end); + data.add(r); + + codingLength += (end - start); + if (debug) { + s.append(" Exon : " + format(start + 1) + " - " + format(end) + " | " + (end - start) + " | " + codingLength + " | " + (codingLength % 3)); + s.append(newline); + } + } else { + // e.g. see UBQLN3 + if ( debug ) { + s.append(" no translation! UTR: " + format(start) + " - " + format(end)); + s.append(newline); + } + } + } + if ( debug ) { + s.append("CDS length: ").append(Integer.toString(codingLength - 3)); + s.append(newline); + logger.debug(s.toString()); + } + + return data; + } + + + + private static List> getCDSExonRangesForward(GeneChromosomePosition chromPos, + String responseType) { + + List> data = new ArrayList<>(); + List exonStarts = chromPos.getExonStarts(); + List exonEnds = chromPos.getExonEnds(); + + + int cdsStart = chromPos.getCdsStart(); + int cdsEnd = chromPos.getCdsEnd(); + + boolean inCoding = false; + int codingLength = 0; + + for (int i = 0; i < exonStarts.size(); i++) { + + int start = exonStarts.get(i); + int end = exonEnds.get(i); + + if (start <= cdsStart && end >= cdsStart) { + + inCoding = true; + codingLength += (end - cdsStart); +// + + + Range r; + if ( responseType.equals(CDS)) + r = Range.closed(0,codingLength); + else + r = Range.closed(cdsStart,end); + data.add(r); + + } else if (start <= cdsEnd && end >= cdsEnd) { + //logger.debug(" <-- CDS end at: " + cdsEnd ); + inCoding = false; + + + Range r; + if ( responseType.equals(CDS)) + r = Range.closed(codingLength,codingLength+(cdsEnd-start)); + else + r = Range.closed(start,cdsEnd); + data.add(r); + codingLength += (cdsEnd - start); + + } else if (inCoding) { + // full exon is coding + + Range r; + if ( responseType.equals(CDS)) + r = Range.closed(codingLength,codingLength+(end-start)); + else + r = Range.closed(start,end); + data.add(r); + codingLength += (end - start); + + } + + } + + return data; + } +// + + /** + * I have a genomic coordinate, where is it in the Gene? + * + * @param coordinate + * @param chromosomePosition + * @return + */ + public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) { + + if ( chromosomePosition.getOrientation() == '+') + return getCDSPosForward(coordinate, + chromosomePosition.getExonStarts(), + chromosomePosition.getExonEnds(), + chromosomePosition.getCdsStart(), + chromosomePosition.getCdsEnd()); + + return getCDSPosReverse(coordinate, + chromosomePosition.getExonStarts(), + chromosomePosition.getExonEnds(), + chromosomePosition.getCdsStart(), + chromosomePosition.getCdsEnd()); + + } + + + public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { + boolean inCoding = false; + int codingLength = 0; + + if (cdsEnd < cdsStart) { + int tmp = cdsEnd; + cdsEnd = cdsStart; + cdsStart = tmp; + } + + logger.debug("looking for CDS position for " +format(chromPos)); + + + if ( chromPos < cdsStart+1 ) { + // this is not in a coding region! + + logger.debug(chromPos + " < " + cdsStart+1 ); + return -1; + } + + if ( chromPos > cdsEnd+1 ) { + // this is not in a coding region! + + logger.debug(chromPos + " > " + cdsEnd+1 ); + return -1; + } + + int lengthExons = 0; + + // map reverse + for (int i = exonStarts.size() - 1; i >= 0; i--) { + + logger.debug("Reverse Exon #" + (i+1) + "/" + exonStarts.size()); + int end = exonStarts.get(i); + int start = exonEnds.get(i); + + if (end < start) { + int tmp = end; + end = start; + start = tmp; + } + lengthExons += end - start; + + + logger.debug(" is " + format(chromPos) + " part of Reverse exon? s:" + format(start+1) + " - e:" + format(end) + " | " + (end - start+1)); + logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength); + + + + if (start+1 <= cdsEnd && end >= cdsEnd ) { + + // first exon with UTR + + inCoding = true; + + int tmpstart = start; + if (start < cdsStart) { + tmpstart = cdsStart; + } + + + logger.debug(" --- codingLength " + codingLength + + " s:" + + format(tmpstart+1) + + " e:" + + format(cdsEnd) + + " p:" + + format(chromPos) + " tmp: " + (chromPos - cdsStart)); + + logger.debug("check: " + (codingLength + cdsEnd - tmpstart+1) + " ==?? " + format(chromPos)); + + int tmp = cdsEnd - chromPos ; + // if (codingLength + cdsEnd - tmpstart >= chromPos) { + //if (end >= chromPos && start + (end-start) >= chromPos) { + // if (codingLength + cdsEnd - tmpstart >= chromPos) { + if ( chromPos >= start +1 && chromPos <= end){ + + + logger.debug(" -> found position in UTR exon: P: " + format(chromPos) + " s:" + format(tmpstart+1) + " l:" + format(tmp) + " cdsS:" + format(cdsStart+1) + " cdsE:" + format(cdsEnd) + " codingL:" + codingLength); + return codingLength + tmp; + } + + + logger.debug(" codinglength " + codingLength + " + " + (cdsEnd - tmpstart ) ); + + // do not add 1 here + codingLength += (cdsEnd - tmpstart ); + + boolean debug = logger.isDebugEnabled(); + + if (debug) { + StringBuffer b = new StringBuffer(); + b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline); + if (tmpstart == start) + b.append(" -> "); + else + b.append(" <-> "); + b.append("Reverse Exon :" + format(tmpstart+1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart) + " - " + codingLength + " | " + (codingLength % 3) + newline); + + logger.debug(b.toString()); + + // single exon with UTR on both ends + if (tmpstart != start) + logger.debug(" UTR :" + format(cdsStart - 1) + " - " + format(start)); + } + } else if (start <= cdsStart && end >= cdsStart) { + + // terminal exon with UTR + inCoding = false; + + + logger.debug(format(start + codingLength + end - cdsStart) + " ?? " + format(chromPos)); + // (start + codingLength + end - cdsStart >= chromPos && + if (( start+1 <= chromPos) && ( end >= chromPos)) { + + //int tmp = end - cdsStart ; +// int tmp = chromPos - cdsStart ; +// int l = end - cdsStart; + int tmp = end-chromPos ; + if ( tmp > end -cdsStart) { + tmp = end-cdsStart ; + + logger.debug("Adjust tmp to " + tmp); + } + + + + logger.debug( codingLength + " | " + (end -chromPos) + " | " + (end - cdsStart) ); + logger.debug(" <- Exon : " + format(cdsStart) + " - " + format(end) + " | " + format(end - cdsStart +1) + " | "); + logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); + logger.debug(" <- YYY found position noncoding exon: #" + (i+1) + " " + format(chromPos) + " s:" + format(start) + " tmp: " + format(tmp) + " cdsStart" + format(cdsStart) + " cdl:" + codingLength + " " + format(cdsEnd)); + + return codingLength + tmp; + } + + + logger.debug(" codinglength " + codingLength + " + " + (end - cdsStart) ); + codingLength += (end - cdsStart+1); + + logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); + + } else if (inCoding) { + // standard coding exon + // if (codingLength + end - start >= chromPos) { + if ( chromPos >= start+1 && chromPos <= end) { + + int tmp = end -chromPos ; + if ( tmp > (end-start+1)) { + + tmp = (end - start+1); + + logger.debug("Adjusting tmp to " + tmp); + } + + + logger.debug(" -> found position in reverse coding exon: #" + (i+1) + " chromPos:" + format(chromPos) + " start:" + format(start+1) + " end:" + format(end) + " tmp:" + tmp + " cdsStart:" + cdsStart + " codingLength:" + codingLength); + + return codingLength+tmp; + } + + // full exon is coding + + logger.debug(" codinglength " + codingLength + " + " + (end - start) ); + // don't add 1 + codingLength += (end - start); + + logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); + } else { + // e.g. see UBQLN3 + + logger.debug(" no translation! cdl:" + codingLength); + + } + + //if ( inCoding ) + // logger.debug(" exon phase at end:" + ((codingLength) % 3)); + + logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + chromPos); + + + } + + logger.debug("length exons: " + lengthExons); + // could not map, or map over the full length?? + + + return -1; + + } + + /** + * Get the chromosome position mapped onto the mrna CDS transcript position (needs to be divided by 3 to get protein coordinate) + * + * @param exonStarts + * @param exonEnds + * @param cdsStart + * @param cdsEnd + * @return + */ + public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { + boolean inCoding = false; + int codingLength = 0; + + + logger.debug("looking for CDS position for " +chromPos); + + int lengthExons = 0; + // map forward + for (int i = 0; i < exonStarts.size(); i++) { + + + // start can include UTR + int start = exonStarts.get(i); + int end = exonEnds.get(i); + + lengthExons += end - start; + + + logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start) + " ? overlaps with " + format(chromPos)); + + if (start +1 <= cdsStart +1 && end >= cdsStart+1) { + + if (end >= chromPos) { + // we are reaching our target position + // -1 is important here... + int tmp = chromPos - cdsStart -1; + + + logger.debug("cdl:" + codingLength + " | " + tmp); + logger.debug(" -> found position in UTR exon: " + chromPos + " " + format(start) + " " + format(tmp) + " " + cdsStart + " " + codingLength); + + return codingLength + tmp; + } + inCoding = true; + codingLength += (end - cdsStart); + + logger.debug(" UTR : " + format(start) + " - " + (cdsStart )); + logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); + + } else if (start <= cdsEnd && end >= cdsEnd) { + //logger.debug(" <-- CDS end at: " + cdsEnd ); + inCoding = false; + if (cdsEnd >= chromPos && (start +1 <= chromPos)) { + int tmp = chromPos - start -1 ; + + + logger.debug(" -> cdsForward found position in non coding exon#"+i+": " + chromPos + " " + format(start+1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); + return codingLength + tmp ; + } + codingLength += (cdsEnd - start); + + logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3)); + logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); + + + } else if (inCoding) { + + if (end >= chromPos && (start +1 <=chromPos)) { + + int tmp = chromPos-start-1 ; + + + logger.debug(codingLength + " | " + tmp); + logger.debug(" -> found position in coding exon #" + (i + 1) + ": " + format(chromPos) + " " + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); + + + return codingLength + tmp ; + } + // full exon is coding + codingLength += (end - start); + + logger.debug(" Exon :" + format(start) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); + } + // if ( inCoding ) + // logger.debug("exon phase at end:" + (codingLength % 3)); + // + // logger.debug(" coding length: " + codingLength); + + + } + + //logger.debug("length exons: " + lengthExons); + //return codingLength - 3; + + // could not map! + + return -1; + } + + } From f63309dbba32add3f05425f684239b87df57e940 Mon Sep 17 00:00:00 2001 From: Andreas Prlic Date: Fri, 29 Jul 2016 10:40:09 -0700 Subject: [PATCH 4/5] adding new dependency on guava --- biojava-genome/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/biojava-genome/pom.xml b/biojava-genome/pom.xml index 0e28a54a0c..bed7db065a 100644 --- a/biojava-genome/pom.xml +++ b/biojava-genome/pom.xml @@ -71,6 +71,12 @@ + + com.google.guava + guava + compile + 19.0 + junit junit From 795be8f04e461e7f4f9f67399567b27dfa7b153b Mon Sep 17 00:00:00 2001 From: Andreas Prlic Date: Fri, 29 Jul 2016 10:41:48 -0700 Subject: [PATCH 5/5] new utility class for chromosome mappings. --- .../genome/parsers/genename/ChromPos.java | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 biojava-genome/src/main/java/org/biojava/nbio/genome/parsers/genename/ChromPos.java diff --git a/biojava-genome/src/main/java/org/biojava/nbio/genome/parsers/genename/ChromPos.java b/biojava-genome/src/main/java/org/biojava/nbio/genome/parsers/genename/ChromPos.java new file mode 100644 index 0000000000..3d343d2f1a --- /dev/null +++ b/biojava-genome/src/main/java/org/biojava/nbio/genome/parsers/genename/ChromPos.java @@ -0,0 +1,31 @@ +package org.biojava.nbio.genome.parsers.genename; + +/** + * Created by ap3 on 27/10/2014. + */ +public class ChromPos { + + int pos; + int phase; + + public int getPhase() { + return phase; + } + + public void setPhase(int phase) { + this.phase = phase; + } + + public int getPos() { + return pos; + } + + public void setPos(int pos) { + this.pos = pos; + } + + public ChromPos(int pos, int phase){ + this.pos = pos; + this.phase = phase; + } +} \ No newline at end of file