From 86b406516a63db05c95063de82e3c37950b9d5e8 Mon Sep 17 00:00:00 2001 From: kaidu Date: Thu, 28 Jul 2016 16:28:28 +0200 Subject: [PATCH] implement round robin algorithm for decomposing molecular formulas taken from SIRIUS --- doc/refs/cheminf.bibx | 23 +- .../org/openscience/cdk/decomp/Alphabet.java | 53 ++ .../cdk/decomp/ChemicalAlphabet.java | 105 ++++ .../cdk/decomp/DecompIterator.java | 63 ++ .../cdk/decomp/DecomposerFactory.java | 52 ++ .../org/openscience/cdk/decomp/Interval.java | 44 ++ .../cdk/decomp/MassDecomposer.java | 542 +++++++++++++++++ .../cdk/decomp/RangeMassDecomposer.java | 551 ++++++++++++++++++ .../org/openscience/cdk/decomp/Weight.java | 85 +++ .../FullEnumerationFormulaGenerator.java | 385 ++++++++++++ .../cdk/formula/IFormulaGenerator.java | 11 + .../formula/MolecularFormulaGenerator.java | 382 +++--------- .../formula/RoundRobinFormulaGenerator.java | 130 +++++ .../MolecularFormulaGeneratorTest.java | 6 +- 14 files changed, 2144 insertions(+), 288 deletions(-) create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/Alphabet.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/ChemicalAlphabet.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/DecompIterator.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/DecomposerFactory.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/Interval.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/MassDecomposer.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/RangeMassDecomposer.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/decomp/Weight.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/formula/FullEnumerationFormulaGenerator.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/formula/IFormulaGenerator.java create mode 100644 tool/formula/src/main/java/org/openscience/cdk/formula/RoundRobinFormulaGenerator.java diff --git a/doc/refs/cheminf.bibx b/doc/refs/cheminf.bibx index cbd67268558..3dc5681aae7 100644 --- a/doc/refs/cheminf.bibx +++ b/doc/refs/cheminf.bibx @@ -1452,5 +1452,26 @@ Method 4396-4403 - + + + + DECOMP--from interpreting Mass Spectrometry peaks to solving the Money Changing Problem. + Böcker, Sebastian and Lipták, Zsuzsanna and Martin, Marcel and Pervukhin, Anton and Sudek, Henner + 2008 + Bioinformatics + 24 + 4 + 591--593 + http://bioinformatics.oxfordjournals.org/cgi/reprint/24/4/591?ijkey=1lM50Bkzz4SCLsa + + + + + Dührkop, Kai and Ludwig, Marcus and Meusel, Marvin and Böcker, Sebastian + Faster mass decomposition + 2013 + Proc. of Workshop on Algorithms in Bioinformatics (WABI 2013) + http://arxiv.org/abs/1307.7805 + + diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/Alphabet.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/Alphabet.java new file mode 100644 index 00000000000..2e4b890e8b0 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/Alphabet.java @@ -0,0 +1,53 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +/** + * The alphabet for which a given weight is decomposed. An alphabet is a vector c_1..c_k of k characters of Type T. + * It maps each character to a weight. It supports access by an index as well as by the character itself. + * + * @param type of a single character in the alphabet + */ +interface Alphabet { + + /** + * @return size of the alphabet. Indizes of characters are 0..{@literal <} size + */ + public int size(); + + /** + * @param i index of the character + * @return weight of character c_i + */ + public double weightOf(int i); + + /** + * @param i index of the character + * @return character c_i + */ + public T get(int i); + + /** + * Maps the character to its index. This operation should be fast, because internally a modified ordered + * alphabet is used which have to be mapped back to the original alphabet + * @param character + * @return the index of the character + */ + public int indexOf(T character); + +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/ChemicalAlphabet.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/ChemicalAlphabet.java new file mode 100644 index 00000000000..86f7ccd1c67 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/ChemicalAlphabet.java @@ -0,0 +1,105 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +import org.openscience.cdk.formula.MolecularFormulaRange; +import org.openscience.cdk.interfaces.IChemObjectBuilder; +import org.openscience.cdk.interfaces.IIsotope; +import org.openscience.cdk.interfaces.IMolecularFormula; + +import java.util.Arrays; + +/** + * Implements the {@link Alphabet} for chemical elements. + */ +public class ChemicalAlphabet implements Alphabet { + + /** + * Is used to convert compomeres to IMolecularFormula instances + */ + protected final IChemObjectBuilder objectBuilder; + + /** + * The characters (chemical elements) of the alphabet + */ + protected final IIsotope[] characters; + + /** + * Construct a new chemical alphabet from the given search space using the given object builder. + * + * @param molecularFormulaRange Search spacel, defining the allowed elements + */ + public ChemicalAlphabet(IChemObjectBuilder builder, MolecularFormulaRange molecularFormulaRange) { + this.objectBuilder = builder; + IIsotope[] chars = new IIsotope[molecularFormulaRange.getIsotopeCount()]; + int k=0; + for (IIsotope i : molecularFormulaRange.isotopes()) { + if (molecularFormulaRange.getIsotopeCountMax(i) > 0) chars[k++] = i; + } + if (k < chars.length) chars = Arrays.copyOf(chars, k); + this.characters = chars; + } + + /** + * Translates a compomere (multiset of characters) into a IMolecularFormula + */ + public IMolecularFormula buildFormulaFromCompomere(int[] compomere, int[] orderedIndizes) { + IMolecularFormula formula = objectBuilder.newInstance(IMolecularFormula.class); + for (int k=0; k < orderedIndizes.length; ++k) { + if (compomere[k] > 0) formula.addIsotope(characters[orderedIndizes[k]], compomere[k]); + } + return formula; + } + + /** + * Checks if two chemical alphabets are compatible. In theory, an alphabet would be compatible if it is a subset + * of another alphabet. However, we directly check for equality to keep this operation symetric. + * + * A decomposer can decompose every mass with alphabet as long as the alphabet is compatible to the decomposers + * own alphabet. + */ + public boolean isCompatible(ChemicalAlphabet other) { + return Arrays.equals(characters, other.characters); + } + + @Override + public int size() { + return characters.length; + } + + @Override + public double weightOf(int i) { + return characters[i].getExactMass(); + } + + @Override + public IIsotope get(int i) { + return characters[i]; + } + + /** + * maps each character to its index. This operation is quite slow, but have to be done only once when starting the + * decomposer. Therefore, we don't need a hash table here. + */ + @Override + public int indexOf(IIsotope character) { + for (int k=0; k < characters.length; ++k) + if (characters[k]==character) return k; + return -1; + } +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/DecompIterator.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/DecompIterator.java new file mode 100644 index 00000000000..60a92d87e34 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/DecompIterator.java @@ -0,0 +1,63 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +/** + * This class allows to iterate over decompositions instead of keeping all decompositions in memory. It's slightly + * slower than generating all decompositions within a single loop, but it offers more flexibility (i.e. find a single + * decomposition that satisfies some rule). + * @param type of the characters of the alphabet + */ +public interface DecompIterator { + + /** + * moves the iterator one step. + * @return true, if a new decomposition is found. false, if the iterator reached the end. + */ + public boolean next(); + + /** + * Give access to the current compomere. Please note that this array is only valid during the current iteration step + * and might be changed afterwards. Furthermore, it is absolutely forbidden to write anything into this array. + * However, you are free to clone the array and do anything with its copy. + * + * @return the compomere (a tuple (a_1,...,a_n) with a_i is the amount of the i-th character in the ordered alphabet + */ + public int[] getCurrentCompomere(); + + /** + * @return the underlying (possibly unordered) alphabet + */ + public Alphabet getAlphabet(); + + /** + * The order of characters in the compomere might be different to the order of characters in the alphabet (i.e. + * the characters in the compomere are always ordered by mass). This array maps the i-th character in the compomere + * to it's appropiate index in the alphabet + * @return mapping of positions in compomere to character indizes in alphabet + */ + public int[] getAlphabetOrder(); + + /** + * Returns the character on the given position in the compomere + * @param index index in compomere + * @return corresponding character in alphabet + */ + public T getCharacterAt(int index); + +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/DecomposerFactory.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/DecomposerFactory.java new file mode 100644 index 00000000000..05247cc8a6d --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/DecomposerFactory.java @@ -0,0 +1,52 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +import org.openscience.cdk.interfaces.IIsotope; + +import java.util.ArrayList; +import java.util.List; + +public final class DecomposerFactory { + + protected List> decomposerCache; + protected static final int maximalNumberOfCachedDecomposers = 10; + + protected final static DecomposerFactory instance = new DecomposerFactory(); + + public static DecomposerFactory getInstance() { + return instance; + } + + public DecomposerFactory() { + this.decomposerCache = new ArrayList<>(maximalNumberOfCachedDecomposers); + } + + public RangeMassDecomposer getDecomposerFor(ChemicalAlphabet alphabet) { + for (RangeMassDecomposer decomposer : decomposerCache) { + if (((ChemicalAlphabet)decomposer.getAlphabet()).isCompatible(alphabet)) { + return decomposer; + } + } + if (decomposerCache.size()>= maximalNumberOfCachedDecomposers) decomposerCache.remove(0); + final RangeMassDecomposer decomposer = new RangeMassDecomposer<>(alphabet); + decomposerCache.add(decomposer); + return decomposer; + } + +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/Interval.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/Interval.java new file mode 100644 index 00000000000..1a907e4976e --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/Interval.java @@ -0,0 +1,44 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +/** + * A simple POJO which defines a range from min to max (including max). + */ +public class Interval { + + private final int min; + private final int max; + + public Interval(int min, int max) { + this.min = min; + this.max = max; + } + + public int getMin() { + return min; + } + + public int getMax() { + return max; + } + + public String toString() { + return "(" + min + " .. " + max + ")"; + } +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/MassDecomposer.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/MassDecomposer.java new file mode 100644 index 00000000000..7f5d15480b4 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/MassDecomposer.java @@ -0,0 +1,542 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +import java.util.*; + +/** + * Decomposes a given mass over an alphabet, returning all decompositions which mass equals the given mass + * considering a given deviation. + * {@cdk.cite Boecker2008} + * @param + * type of the alphabet's characters + * @author Anton Pervukhin, Kai Dührkop, Marcus Ludwig + */ +class MassDecomposer { + + protected int[][] ERT; + protected double precision; + protected final List> weights; + protected double minError, maxError; + protected final Alphabet alphabet; + protected final int[] orderedCharacterIds; + + /** + * @param alphabet the alphabet the mass is decomposed over + */ + public MassDecomposer(Alphabet alphabet) { + this.precision = findOptimalPrecision(); + final int n = alphabet.size(); + this.weights = new ArrayList>(n); + for (int i=0; i < n; ++i) { + weights.add(new Weight(alphabet.get(i), alphabet.weightOf(i))); + } + Collections.sort(weights); + this.alphabet = alphabet; + this.orderedCharacterIds = new int[alphabet.size()]; + for (int i=0; i < alphabet.size(); ++i) { + orderedCharacterIds[i] = alphabet.indexOf(weights.get(i).getOwner()); + } + } + + public DecompIterator decomposeIterator(double from, double to) { + return decomposeIterator(from, to, null); + } + + public DecompIterator decomposeIterator(double from, double to, Map boundaries) { + init(); + if (to < 0d || from < 0d) throw new IllegalArgumentException("Expect positive mass for decomposition: [" + from + ", " + to + "]"); + if (to < from) throw new IllegalArgumentException("Negative range given: [" + from + ", " + to + "]"); + final int[] minValues = new int[weights.size()]; + final int[] boundsarray = new int[weights.size()]; + boolean minAllZero = true; + double cfrom = from, cto = to; + Arrays.fill(boundsarray, Integer.MAX_VALUE); + if (boundaries!=null && !boundaries.isEmpty()) { + for (int i = 0; i < boundsarray.length; i++) { + T el = weights.get(i).getOwner(); + Interval range = boundaries.get(el); + if (range != null) { + boundsarray[i] = range.getMax()-range.getMin(); + minValues[i] = range.getMin(); + if (minValues[i] > 0) { + minAllZero = false; + final double reduceWeightBy = weights.get(i).getMass() * range.getMin(); + cfrom -= reduceWeightBy; + cto -= reduceWeightBy; + } + } + } + } + + final Interval interval = integerBound(cfrom, cto); + + return new DecompIteratorImpl(ERT, interval.getMin(), interval.getMax(), from, to, minValues, boundsarray, alphabet, weights, orderedCharacterIds.clone()); + } + + protected double findOptimalPrecision() { + return 1d/5963.337687d; // TODO: check alphabet and mass deviation, define optimal blowup for given alphabet + } + + public Alphabet getAlphabet() { + return alphabet; + } + + public boolean maybeDecomposable(double startMass, double endMass) { + init(); + final Interval range = integerBound(startMass, endMass); + final int a = weights.get(0).getIntegerMass(); + for (int i = range.getMin(); i <= range.getMax(); ++i) { + final int r = i % a; + if (i >= ERT[r][weights.size()-1]) return true; + } + return false; + } + + /** + */ + public List decompose(double from, double to) { + return decompose(from, to, null); + } + + /** + * computes all decompositions for the given mass. The runtime depends only on the number of characters and the + * number of decompositions. Therefore this method is very fast as int as the number of decompositions is low. + * Unfortunately, the number of decompositions increases nearly exponential in the number of characters and in the + * input mass. + * + * This function can be called in multiple threads in parallel, because it does not modify the decomposer + + */ + public List decompose(final double from, final double to, Map boundaries){ + init(); + if (to < 0d || from < 0d) throw new IllegalArgumentException("Expect positive mass for decomposition: [" + from + ", " + to + "]"); + if (to < from) throw new IllegalArgumentException("Negative range given: [" + from + ", " + to + "]"); + if (to == 0d) return Collections.emptyList(); + final int[] minValues = new int[weights.size()]; + final int[] boundsarray = new int[weights.size()]; + boolean minAllZero = true; + double cfrom = from, cto = to; + Arrays.fill(boundsarray, Integer.MAX_VALUE); + if (boundaries!=null && !boundaries.isEmpty()) { + for (int i = 0; i < boundsarray.length; i++) { + T el = weights.get(i).getOwner(); + Interval range = boundaries.get(el); + if (range != null) { + boundsarray[i] = range.getMax()-range.getMin(); + minValues[i] = range.getMin(); + if (minValues[i] > 0) { + minAllZero = false; + final double reduceWeightBy = weights.get(i).getMass() * range.getMin(); + cfrom -= reduceWeightBy; + cto -= reduceWeightBy; + } + } + } + } + final ArrayList results = new ArrayList(); + ArrayList rawDecompositions = null; + final Interval interval = integerBound(cfrom, cto); + if (!minAllZero && interval.getMax() == 0) results.add(minValues); + for (int m = interval.getMin(); m <= interval.getMax(); ++m) { + rawDecompositions = integerDecompose(m, boundsarray); + for (int i=0; i < rawDecompositions.size(); ++i) { + final int[] decomp = rawDecompositions.get(i); + if (!minAllZero) { + for (int j=0; j < minValues.length; ++j) { + decomp[j] += minValues[j]; + } + } + final double realMass = calcMass(decomp); + if (realMass < from || realMass > to) continue; + else results.add(decomp); + } + } + return results; + } + + + /** + * Iterator implementation of the loop + * We do not use static classes. This gives us the possibility to make some of the variables behave thread safe + * and resistant against changes from the user. + * @param + */ + protected static class DecompIteratorImpl implements DecompIterator { + // final initialization values + protected final int[][] ERT; + protected final int minIntegerMass, maxIntegerMass; + protected final double minDoubleMass, maxDoubleMass; + protected final int[] buffer; + protected final int[] minValues; + protected final int[] maxValues; + protected final Alphabet alphabet; + protected final List> weights; + protected final int[] orderedCharacterIds; + + // loop variables + protected final int k; // current character + protected final int a; // mass of first character of the alphabet + protected int i; // current column in the ERT + protected int m; // current mass + protected int currentIntegerMass; // current mass to decompose + protected boolean rewind; + + + + protected DecompIteratorImpl(int[][] ERT, int minIntegerMass, int maxIntegerMass, double minDoubleMass, double maxDoubleMass, int[] minValues, int[] maxValues, Alphabet alphabet, List> weights, int[] orderedCharacterIds) { + this.ERT = ERT; + this.minIntegerMass = minIntegerMass; + this.maxIntegerMass = maxIntegerMass; + this.minDoubleMass = minDoubleMass; + this.maxDoubleMass = maxDoubleMass; + this.buffer = new int[weights.size()]; + if (minValues!=null) { + boolean allZero = true; + for (int k : minValues) if (k > 0) allZero=false; + if (!allZero) this.minValues = minValues; + else this.minValues = null; + } else this.minValues = null; + this.maxValues = maxValues; + this.alphabet = alphabet; + this.weights = weights; + this.orderedCharacterIds = orderedCharacterIds; + + this.a = weights.get(0).getIntegerMass(); + this.k = weights.size()-1; + this.i = k; + this.currentIntegerMass = this.minIntegerMass; + this.m = this.minIntegerMass; + this.rewind = false; + } + + @Override + public boolean next() { + while (currentIntegerMass <= maxIntegerMass) { + if (decomposeSingleIntegerMass()) { + if (checkCompomere()) return true; + } + nextIntegerInRange(); + } + return false; + } + + private void nextIntegerInRange() { + ++currentIntegerMass; + this.i = k; + this.m = this.currentIntegerMass; + this.rewind = false; + } + + private boolean decomposable(int i, int m, int a) { + return (m>=0) && ERT[(m % a)][i] <= m; + } + + private boolean decomposeSingleIntegerMass() { + if (rewind) { + afterFindingADecomposition(); + rewind = false; + } + while (i <= k) { + if (!decomposable(i,m,a)) { // jump back the search tree as int as there are no branches you can jump into + while (i <= k && !decomposable(i, m, a)) { + m = m+buffer[i]*weights.get(i).getIntegerMass(); + buffer[i] = 0; + ++i; + } + // now decomposable(i,m,a) = true + while (i<=k && buffer[i]>=maxValues[i]) { // Jump a step back if you reached the boundary + m += buffer[i]*weights.get(i).getIntegerMass(); + buffer[i] = 0; + ++i; + } + if (i <= k) { // insert a character + m -= weights.get(i).getIntegerMass(); + ++buffer[i]; + } + } else { + while (i > 0 && decomposable(i-1, m, a)) { // go as deep as possible into the "search tree" + --i; // initially we do not add any elements + } + // now decomposable[i,m,a]=true + if (i==0) { // you are finished: Add the decomposition + buffer[0] = (m/a); + rewind = true; + return true; + } + while (i<=k && buffer[i]>=maxValues[i]) { // Jump a step back if you reached the boundary + m += buffer[i]*weights.get(i).getIntegerMass(); + buffer[i] = 0; + ++i; + } + if (i <= k) { // insert a character + m -= weights.get(i).getIntegerMass(); + ++buffer[i]; + } + } + } + return false; + } + + private boolean checkCompomere() { + if (minValues!=null) { + for (int j=0; j < minValues.length; ++j) { + buffer[j] += minValues[j]; + } + } + // calculate mass of decomposition + double exactMass = 0; + for (int j=0; j < buffer.length; ++j) { + exactMass += buffer[j]*weights.get(j).getMass(); + } + if (exactMass >= minDoubleMass && exactMass <= maxDoubleMass) return true; + else return false; + } + + private void afterFindingADecomposition() { + if (minValues!=null) { + for (int j=0; j < minValues.length; ++j) { + buffer[j] -= minValues[j]; + } + } + ++i; // and go one step back in the search tree + while (i<=k && buffer[i]>=maxValues[i]) { // Jump a step back if you reached the boundary + m += buffer[i]*weights.get(i).getIntegerMass(); + buffer[i] = 0; + ++i; + } + if (i <= k) { // insert a character + m -= weights.get(i).getIntegerMass(); + ++buffer[i]; + } + } + + @Override + public int[] getCurrentCompomere() { + return buffer; + } + + @Override + public Alphabet getAlphabet() { + return alphabet; + } + + @Override + public int[] getAlphabetOrder() { + return orderedCharacterIds; + } + + @Override + public T getCharacterAt(int index) { + return alphabet.get(orderedCharacterIds[index]); + } + } + + protected ArrayList integerDecompose(int mass, int[] bounds){ + // Find compomers + ArrayList result = new ArrayList(); + int k = weights.size()-1; // index of last character + int a = weights.get(0).getIntegerMass(); // mass of first character + int[] c = new int[k+1]; // compomere + int i=k; // current column in ERT + int m = mass; // current mass + while (i <= k) { + if (!decomposable(i, m, a)) { // jump back the search tree as int as there are no branches you can jump into + while (i <= k && !decomposable(i, m, a)) { + m = m+c[i]*weights.get(i).getIntegerMass(); + c[i] = 0; + ++i; + } + // now decomposable(i,m,a) = true + while (i<=k && c[i]>=bounds[i]) { // Jump a step back if you reached the boundary + m += c[i]*weights.get(i).getIntegerMass(); + c[i] = 0; + ++i; + } + if (i <= k) { // insert a character + m -= weights.get(i).getIntegerMass(); + ++c[i]; + } + } else { + while (i > 0 && decomposable(i-1, m, a)) { // go as deep as possible into the "search tree" + --i; // initially we do not add any elements + } + // now decomposable[i,m,a]=true + if (i==0) { // you are finished: Add the decomposition + c[0] = (int)(m/a); + result.add(c.clone()); + ++i; // and go one step back in the search tree + } + while (i<=k && c[i]>=bounds[i]) { // Jump a step back if you reached the boundary + m += c[i]*weights.get(i).getIntegerMass(); + c[i] = 0; + ++i; + } + if (i <= k) { // insert a character + m -= weights.get(i).getIntegerMass(); + ++c[i]; + } + } + } + return result; + } + + private boolean decomposable(int i, int m, int a1) { + if (m<0)return false; + return ERT[(m % a1)][i] <= m; + } + + /** + * Initializes the decomposer. Computes the extended residue table. This have to be done only one time for + * a given alphabet, independently from the masses you want to decompose. This method is called automatically + * if you compute the decompositions, so call it only if you want to control the time of the initialisation. + */ + public void init() { + if (ERT != null) return; + synchronized (this) { + if (ERT != null) return; + discretizeMasses(); + divideByGCD(); + computeLCMs(); + calcERT(); + computeErrors(); + } + } + + protected double calcMass(int[] input){ + double result = 0d; + for (int i = 0; i < input.length; ++i){ + result += input[i]*weights.get(i).getMass(); + } + return result; + } + + protected void calcERT(){ + int firstLongVal = weights.get(0).getIntegerMass(); + ERT = new int[firstLongVal][weights.size()]; + int d, r, n, argmin; + + //Init + ERT[0][0] = 0; + for (int i = 1; i < ERT.length; ++i){ + ERT[i][0] = Integer.MAX_VALUE; // should be infinity + } + + //Filling the Table, j loops over columns + for (int j = 1; j < ERT[0].length; ++j){ + ERT[0][j] = 0; // Init again + d = gcd(firstLongVal, weights.get(j).getIntegerMass()); + for (int p = 0; p < d; p++){ // Need to start d Round Robin loops + if (p == 0) { + n = 0; // 0 is the min in the complete RT or the first p-loop + } else { + n = Integer.MAX_VALUE; // should be infinity + argmin = p; + for (int i = p; i weight = weights.get(i); + weight.setIntegerMass((int)(weight.getMass() / precision)); + } + } + + protected void divideByGCD() { + if (weights.size() > 0) { + int d = gcd(weights.get(0).getIntegerMass(), weights.get(1).getIntegerMass()); + for (int i=2; i < weights.size(); ++i) { + d = gcd(d, weights.get(i).getIntegerMass()); + if (d == 1) return; + } + precision *= d; + for (Weight weight : weights) { + weight.setIntegerMass(weight.getIntegerMass() / d); + } + } + } + + protected void computeLCMs() { + final Weight first = weights.get(0); + first.setL(1); + first.setLcm(first.getIntegerMass()); + + for(int i=1; i weight = weights.get(i); + int temp = first.getIntegerMass() / gcd(first.getIntegerMass(), weight.getIntegerMass()); + weight.setL(temp); + weight.setLcm(temp * weight.getIntegerMass()); + } + } + + protected static int gcd(int u, int v) { + int r = 0; + + while (v != 0) { + r = u % v; + u = v; + v = r; + } + return u; + } + + protected void computeErrors() { + this.minError = 0d; + this.maxError = 0d; + for (Weight weight : weights) { + final double error = (precision * weight.getIntegerMass() - weight.getMass()) / weight.getMass(); + minError = Math.min(minError, error); + maxError = Math.max(maxError, error); + } + } + + protected Interval integerBound(double from, double to) { + final double fromD = Math.ceil((1 + minError) * from / precision); + final double toD = Math.floor((1 + maxError) * to / precision); + if (fromD > Integer.MAX_VALUE || toD > Integer.MAX_VALUE) { + throw new ArithmeticException("Given mass is too large to decompose. Please use a smaller precision value, i.e. mass/precision have to be within 32 bit integer space"); + } + return new Interval( + Math.max(0, (int)fromD), + Math.max(0, (int)toD) + ); + } + +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/decomp/RangeMassDecomposer.java b/tool/formula/src/main/java/org/openscience/cdk/decomp/RangeMassDecomposer.java new file mode 100644 index 00000000000..68f25fd4d6c --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/decomp/RangeMassDecomposer.java @@ -0,0 +1,551 @@ +/* + * This file is part of the SIRIUS library for analyzing MS and MS/MS data + * + * Copyright (C) 2013-2015 Kai Dührkop + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with SIRIUS. If not, see . + */ +package org.openscience.cdk.decomp; + +import java.util.*; + +/** + * Decomposes a given mass over an alphabet, returning all decompositions which masses equals the given mass + * considering a given deviation. + * In contrast to {@link MassDecomposer}, MassDecomposerFast calculates the decompositions with the help of an ERT containing deviation information, not requiring to iterate over all different integer mass values {@cdk.cite Duehrkop2013}. + * @param + * type of characters of the alphabetr + * @author Marcus Ludwig, Kai Dührkop + * + */ +public class RangeMassDecomposer extends MassDecomposer{ + + /** + * Avoid locks by making ERTs volatile. This leads to the situation that several threads might accidentally compute + * the same ERT tables. However, as soon as an ERT table is written it is synchronized around all threads. After + * writing an ERT table it is never changed, so additional locking is not necessary. + */ + protected volatile int[][][] ERTs; + + /** + * @param alphabet the alphabet the mass is decomposed over + */ + public RangeMassDecomposer(Alphabet alphabet) { + super(alphabet); + this.ERTs = new int[0][][]; + } + + /** + * Check if a mass is decomposable. This is done in constant time (especially: it is very very very fast!). + * But it doesn't check if there is a valid decomposition. Therefore, even if the method returns true, + * all decompositions may be invalid for the given validator or given bounds. + * #decompose(mass) uses this function before starting the decomposition, therefore this method should only + * be used if you don't want to start the decomposition algorithm. + * @return + */ + @Override + public boolean maybeDecomposable(double from, double to) { + init(); + final int[][][] ERTs = this.ERTs; + //normal version seems to be faster, because it returns after first hit + final Interval range = integerBound(from, to); + final int a = weights.get(0).getIntegerMass(); + for (int i = range.getMin(); i <= range.getMax(); ++i) { + final int r = i % a; + if (i >= ERTs[0][r][weights.size()-1]) return true; + } + return false; + } + + @Override + public DecompIterator decomposeIterator(double from, double to, Map boundaries) { + init(); + if (to < 0d || from < 0d) throw new IllegalArgumentException("Expect positive mass for decomposition: [" + from + ", " + to + "]"); + if (to < from) throw new IllegalArgumentException("Negative range given: [" + from + ", " + to + "]"); + final int[] minValues = new int[weights.size()]; + final int[] boundsarray = new int[weights.size()]; + boolean minAllZero = true; + double cfrom = from, cto = to; + Arrays.fill(boundsarray, Integer.MAX_VALUE); + if (boundaries!=null && !boundaries.isEmpty()) { + for (int i = 0; i < boundsarray.length; i++) { + T el = weights.get(i).getOwner(); + Interval range = boundaries.get(el); + if (range != null) { + boundsarray[i] = range.getMax()-range.getMin(); + minValues[i] = range.getMin(); + if (minValues[i] > 0) { + minAllZero = false; + final double reduceWeightBy = weights.get(i).getMass() * range.getMin(); + cfrom -= reduceWeightBy; + cto -= reduceWeightBy; + } + } + } + } + final Interval interval = integerBound(cfrom, cto); + final int deviation = interval.getMax()-interval.getMin(); + final int[][][] _ERTs_ = this.ERTs; + //calculate the required ERTs + if ((1<<(ERTs.length-1)) <=deviation){ + calcERT(deviation); + } + final int[][][] ERTs = this.ERTs; + + //take ERT with required deviation + int[][] currentERT; + if (deviation==0) currentERT = ERTs[0]; + else currentERT = ERTs[32-Integer.numberOfLeadingZeros(deviation)]; + + return new DecompIteratorImpl<>(currentERT, interval.getMin(), interval.getMax(), from, to, minValues, boundsarray, alphabet, weights, orderedCharacterIds.clone()); + } + + /** + * computes all decompositions for the given mass. The runtime depends only on the number of characters and the + * number of decompositions. Therefore this method is very fast as long as the number of decompositions is low. + * Unfortunately, the number of decompositions increases nearly exponential in the number of characters and in the + * input mass. + * + * This function can be called in multiple threads in parallel, because it does not modify the decomposer + */ + @Override + public List decompose(double from, double to, Map boundaries){ + init(); + if (to < 0d || from < 0d) throw new IllegalArgumentException("Expect positive mass for decomposition: [" + from + ", " + to + "]"); + if (to < from) throw new IllegalArgumentException("Negative range given: [" + from + ", " + to + "]"); + if (to == 0d) return Collections.emptyList(); + final int[] minValues = new int[weights.size()]; + final int[] boundsarray = new int[weights.size()]; + boolean minAllZero = true; + double cfrom = from, cto = to; + Arrays.fill(boundsarray, Integer.MAX_VALUE); + if (boundaries!=null && !boundaries.isEmpty()) { + for (int i = 0; i < boundsarray.length; i++) { + T el = weights.get(i).getOwner(); + Interval range = boundaries.get(el); + if (range != null) { + boundsarray[i] = range.getMax()-range.getMin(); + minValues[i] = range.getMin(); + if (minValues[i] > 0) { + minAllZero = false; + final double reduceWeightBy = weights.get(i).getMass() * range.getMin(); + cfrom -= reduceWeightBy; + cto -= reduceWeightBy; + } + } + } + } + final ArrayList results = new ArrayList(); + ArrayList rawDecompositions = null; + final Interval interval = integerBound(cfrom, cto); + if (!minAllZero && interval.getMax() == 0) results.add(minValues); + + if (interval.getMax()(); + else rawDecompositions = integerDecompose(interval.getMax(), interval.getMax() - interval.getMin(), boundsarray); + for (int i=0; i < rawDecompositions.size(); ++i) { + final int[] decomp = rawDecompositions.get(i); + if (!minAllZero) { + for (int j=0; j < minValues.length; ++j) { + decomp[j] += minValues[j]; + } + } + final double realMass = calcMass(decomp); + if (realMass >= from && realMass <= to) results.add(decomp); + } + return results; + } + + /** + * Iterator implementation of the loop + * We do not use static classes. This gives us the possibility to make some of the variables behave thread safe + * and resistant against changes from the user. + * @param + */ + protected static class DecompIteratorImpl implements DecompIterator { + // final initialization values + protected final int[][] ERT; + protected final int minIntegerMass, maxIntegerMass; + protected final double minDoubleMass, maxDoubleMass; + protected final int[] buffer; + protected final int[] minValues; + protected final int[] maxValues; + protected final Alphabet alphabet; + protected final List> weights; + protected final int[] orderedCharacterIds; + + // loop variables + + protected final int[] j, m, lbound, r; + protected final int k, a, deviation, ERTdev; + protected boolean flagWhile, rewind; + protected int i; + + + + protected DecompIteratorImpl(int[][] ERT, int minIntegerMass, int maxIntegerMass, double minDoubleMass, double maxDoubleMass, int[] minValues, int[] maxValues, Alphabet alphabet, List> weights, int[] orderedCharacterIds) { + this.ERT = ERT; + this.minIntegerMass = minIntegerMass; + this.maxIntegerMass = maxIntegerMass; + this.minDoubleMass = minDoubleMass; + this.maxDoubleMass = maxDoubleMass; + this.buffer = new int[weights.size()]; + if (minValues!=null) { + boolean allZero = true; + for (int k : minValues) if (k > 0) allZero=false; + if (!allZero) this.minValues = minValues; + else this.minValues = null; + } else this.minValues = null; + this.maxValues = maxValues; + this.alphabet = alphabet; + this.weights = weights; + this.orderedCharacterIds = orderedCharacterIds; + + k = weights.size(); + j = new int[k]; + m = new int[k]; + lbound = new int[k]; + r = new int[k]; + flagWhile = false; // flag wether we are in the while-loop or not + a = weights.get(0).getIntegerMass(); + // Init + for (int i=1; i=0) && ERT[(m % a)][i] <= m; + } + + private boolean decomposeRangeIntegerMass() { + if (rewind) { + afterFindingADecomposition(); + rewind=false; + } + while (i != k){ + if (i == 0){ + final int v = (m[i]/a); + if (v <= maxValues[0]) { + buffer[0] = v; + rewind = true; + return true; + } + ++i; // "return" from recursion + flagWhile = true; // in this recursion-depth we are in the while-loop, cause the next recursion (the one we just exited) was called + m[i-1] -= weights.get(i).getLcm(); // execute the rest of the while + buffer[i] += weights.get(i).getL(); + } else { + if (flagWhile){ + if (m[i-1] >= lbound[i] && buffer[i] <= maxValues[i]){ //currently in while loop + --i; // "do" recursive call + } else { + flagWhile = false; // + } + } else { //we are in the for-loop + if (j[i] < weights.get(i).getL() && m[i]-j[i]*weights.get(i).getIntegerMass()>=0){ + buffer[i] = j[i]; + m[i-1] = m[i]-j[i]*weights.get(i).getIntegerMass(); + r[i] = m[i-1]%a; + //changed from normal algorithm: you have to look up the minimum at 2 position + int pos = r[i]-deviation+ERTdev; + if (pos<0) pos += ERT.length; + lbound[i] = Math.min(ERT[r[i]][i-1], ERT[pos][i-1]); + flagWhile = true; // call the while loop + ++j[i]; + } else { //exit for loop + // reset "function variables" + lbound[i] = Integer.MAX_VALUE; + j[i] = 0; + buffer[i] = 0; + ++i; // "return" from recursion + if (i != k) { // only if we are not done + flagWhile = true; // in this recursion-depth we are in the while-loop, cause the next recursion was called + m[i-1] -= weights.get(i).getLcm(); // execute the rest of the while + buffer[i] += weights.get(i).getL(); + } + } + } + } // end if i == 0 + } // end while + + return false; + } + + private boolean checkCompomere() { + if (minValues!=null) { + for (int j=0; j < minValues.length; ++j) { + buffer[j] += minValues[j]; + } + } + // calculate mass of decomposition + double exactMass = 0; + for (int j=0; j < buffer.length; ++j) { + exactMass += buffer[j]*weights.get(j).getMass(); + } + if (exactMass >= minDoubleMass && exactMass <= maxDoubleMass) return true; + else return false; + } + + private void afterFindingADecomposition() { + + if (minValues!=null) { + for (int j=0; j < minValues.length; ++j) { + buffer[j] -= minValues[j]; + } + } + + ++i; // "return" from recursion + flagWhile = true; // in this recursion-depth we are in the while-loop, cause the next recursion (the one we just exited) was called + m[i-1] -= weights.get(i).getLcm(); // execute the rest of the while + buffer[i] += weights.get(i).getL(); + } + + @Override + public int[] getCurrentCompomere() { + return buffer; + } + + @Override + public Alphabet getAlphabet() { + return alphabet; + } + + @Override + public int[] getAlphabetOrder() { + return orderedCharacterIds; + } + + @Override + public T getCharacterAt(int index) { + return alphabet.get(orderedCharacterIds[index]); + } + } + + /** + * decomposes an interval of masses with mass as UPPER mass and all other masses below within deviation + * Example: mass = 18, deviation 3 {@literal ->} decompose 18,17,16,15 + * @param mass + * @param deviation + * @param bounds + * @return + */ + protected ArrayList integerDecompose(int mass, int deviation, int[] bounds){ + assert (deviation result = new ArrayList(); + + int k = weights.size(); + int[] c = new int[k], deepCopy; + int[] j = new int[k], m = new int[k], lbound = new int[k], r = new int[k]; + boolean flagWhile = false; // flag wether we are in the while-loop or not + final int a = weights.get(0).getIntegerMass(); + // Init + for (int i=1; i= lbound[i] && c[i] <= bounds[i]){ //currently in while loop + --i; // "do" recursive call + } else { + flagWhile = false; // + } + } else { //we are in the for-loop + if (j[i] < weights.get(i).getL() && m[i]-j[i]*weights.get(i).getIntegerMass()>=0){ + c[i] = j[i]; + m[i-1] = m[i]-j[i]*weights.get(i).getIntegerMass(); + r[i] = m[i-1]%a; + //changed from normal algorithm: you have to look up the minimum at 2 position + int pos = r[i]-deviation+ERTdev; + if (pos<0) pos += currentERT.length; + lbound[i] = Math.min(currentERT[r[i]][i-1], currentERT[pos][i-1]); + flagWhile = true; // call the while loop + ++j[i]; + } else { //exit for loop + // reset "function variables" + lbound[i] = Integer.MAX_VALUE; + j[i] = 0; + c[i] = 0; + ++i; // "return" from recursion + if (i != k) { // only if we are not done + flagWhile = true; // in this recursion-depth we are in the while-loop, cause the next recursion was called + m[i-1] -= weights.get(i).getLcm(); // execute the rest of the while + c[i] += weights.get(i).getL(); + } + } + } + } // end if i == 0 + } // end while + + return result; + } // end function + + /** + * calculates ERTs to look up whether a mass or lower masses within a certain deviation are decomposable. + * only ERTs for deviation 2^x are calculated + * @param deviation + */ + protected void calcERT(int deviation){ + final int[][][] ERTs = this.ERTs; + final int currentLength = ERTs.length; + + // we have to extend the ERT table + + int[][] lastERT = ERTs[ERTs.length-1]; + int[][] nextERT = new int[lastERT.length][weights.size()]; + if (currentLength==1){ + //first line compares biggest residue and 0 + for (int j = 0; j < weights.size(); j++) { + nextERT[0][j] = Math.min(lastERT[nextERT.length-1][j], lastERT[0][j]); + } + for (int i = 1; i < nextERT.length; i++) { + for (int j = 0; j < weights.size(); j++) { + nextERT[i][j] = Math.min(lastERT[i][j], lastERT[i-1][j]); + } + } + } else { + int step = (1<<(currentLength-2)); + for (int i = step; i < nextERT.length; i++) { + for (int j = 0; j < weights.size(); j++) { + nextERT[i][j] = Math.min(lastERT[i][j], lastERT[i-step][j]); + } + } + //first lines compared with last lines (greatest residues) because of modulo's cyclic characteristic + for (int i = 0; i < step; i++) { + for (int j = 0; j < weights.size(); j++) { + nextERT[i][j] = Math.min(lastERT[i][j], lastERT[i+nextERT.length-step][j]); + } + } + } + + // now store newly calculated ERT + synchronized(this) { + final int[][][] tables = this.ERTs; + if (tables.length == currentLength) { + this.ERTs = Arrays.copyOf(this.ERTs, this.ERTs.length+1); + this.ERTs[this.ERTs.length-1] = nextERT; + } else { + // another background thread did already compute the ERT. So we don't have to do this again + } + } + // recursively calculate ERTs for higher deviations + // current ERT is already sufficient + if ((1<<(currentLength-1)) <= deviation) calcERT(deviation); + } + + @Override + protected void calcERT(){ + int firstLongVal = weights.get(0).getIntegerMass(); + int[][] ERT = new int[firstLongVal][weights.size()]; + int d, r, n, argmin; + + //Init + ERT[0][0] = 0; + for (int i = 1; i < ERT.length; ++i){ + ERT[i][0] = Integer.MAX_VALUE; // should be infinity + } + + //Filling the Table, j loops over columns + for (int j = 1; j < ERT[0].length; ++j){ + ERT[0][j] = 0; // Init again + d = gcd(firstLongVal, weights.get(j).getIntegerMass()); + for (int p = 0; p < d; p++){ // Need to start d Round Robin loops + if (p == 0) { + n = 0; // 0 is the min in the complete RT or the first p-loop + } else { + n = Integer.MAX_VALUE; // should be infinity + argmin = p; + for (int i = p; i. + */ +package org.openscience.cdk.decomp; + +/** + * A POJO storing the weight information about a character in the alphabet + * @param + */ +class Weight implements Comparable> { + + /** + * corresponding character in the alphabet + */ + private final T owner; + + /** + * the exact mass of the character + */ + private final double mass; + + /** + * the transformation of the mass in the integer space + */ + private int integerMass; + + private int l; + private int lcm; + + public Weight(T owner, double mass) { + this.owner = owner; + this.mass = mass; + } + + public T getOwner() { + return owner; + } + + public double getMass() { + return mass; + } + + public int getIntegerMass() { + return integerMass; + } + + public void setIntegerMass(int integerMass) { + this.integerMass = integerMass; + } + + public int getL() { + return l; + } + + public void setL(int l) { + this.l = l; + } + + public int getLcm() { + return lcm; + } + + public void setLcm(int lcm) { + this.lcm = lcm; + } + + @Override + public int compareTo(Weight tWeight) { + return (int)Math.signum(mass - tWeight.mass); + } +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/formula/FullEnumerationFormulaGenerator.java b/tool/formula/src/main/java/org/openscience/cdk/formula/FullEnumerationFormulaGenerator.java new file mode 100644 index 00000000000..23a5f400f92 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/formula/FullEnumerationFormulaGenerator.java @@ -0,0 +1,385 @@ +/* Copyright (C) 2014 Tomas Pluskal + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * All we ask is that proper credit is given for our work, which includes + * - but is not limited to - adding the above copyright notice to the beginning + * of your source code files, and to any copyright notice that you may distribute + * with programs based on this work. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + */ +package org.openscience.cdk.formula; + +import org.openscience.cdk.interfaces.IChemObjectBuilder; +import org.openscience.cdk.interfaces.IIsotope; +import org.openscience.cdk.interfaces.IMolecularFormula; +import org.openscience.cdk.interfaces.IMolecularFormulaSet; +import org.openscience.cdk.tools.ILoggingTool; +import org.openscience.cdk.tools.LoggingToolFactory; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.TreeSet; + +/** + * This class generates molecular formulas within given mass range and elemental + * composition. There is no guaranteed order in which the formulas are + * generated. + * + * Usage: + * + *
+ * IsotopeFactory ifac = Isotopes.getInstance();
+ * IIsotope c = ifac.getMajorIsotope("C");
+ * IIsotope h = ifac.getMajorIsotope("H");
+ * IIsotope n = ifac.getMajorIsotope("N");
+ * IIsotope o = ifac.getMajorIsotope("O");
+ * IIsotope p = ifac.getMajorIsotope("P");
+ * IIsotope s = ifac.getMajorIsotope("S");
+ * 
+ * MolecularFormulaRange mfRange = new MolecularFormulaRange();
+ * mfRange.addIsotope(c, 0, 50);
+ * mfRange.addIsotope(h, 0, 100);
+ * mfRange.addIsotope(o, 0, 50);
+ * mfRange.addIsotope(n, 0, 50);
+ * mfRange.addIsotope(p, 0, 10);
+ * mfRange.addIsotope(s, 0, 10);
+ * 
+ * MolecularFormulaGenerator mfg = new MolecularFormulaGenerator(builder, minMass,
+ *         maxMass, mfRange);
+ * double minMass = 133.003;
+ * double maxMass = 133.005;
+ * IMolecularFormulaSet mfSet = mfg.getAllFormulas();
+ * 
+ * + * The code was originally developed for a MZmine 2 framework module, published + * in Pluskal et al. {@cdk.cite Pluskal2012}. + * + * @cdk.module formula + * @author Tomas Pluskal + * @cdk.created 2014-12-28 + * @cdk.githash + */ +class FullEnumerationFormulaGenerator implements IFormulaGenerator { + + private static final ILoggingTool logger = LoggingToolFactory + .createLoggingTool(FullEnumerationFormulaGenerator.class); + + private final IChemObjectBuilder builder; + + /** + * Mass range to search by this instance of MolecularFormulaGenerator + */ + private final double minMass, maxMass; + + /** + * Internal arrays of isotopes (elements) used for the formula generation, + * their minimal and maximal counts, and the current counts, which + * correspond to the latest formula candidate. + * + * For example, let's assume we set isotopes=[C,H,N], minCounts=[0,0,0], and + * maxCounts=[3,3,3]. Then, currentCounts will be iterated in the following + * order: [0,0,0], [1,0,0], [2,0,0], [3,0,0], [0,1,0], [1,1,0], [2,1,0], + * [3,1,0], [0,2,0], [1,2,0], [2,2,0], etc. + * + * The lastIncreasedPosition index indicates the last position in + * currentCounts that was increased by calling increaseCounter(position) + */ + private final IIsotope isotopes[]; + private final int minCounts[], maxCounts[], currentCounts[]; + private int lastIncreasedPosition = 0; + + /** + * A flag indicating that the formula generation is running. If the search + * is finished (the whole search space has been examined) or the search is + * canceled by calling the cancel() method, this flag is set to false. + * + * @see #cancel() + */ + private volatile boolean searchRunning = true; + + /** + * Initiate the MolecularFormulaGenerator. + * + * @param minMass + * Lower boundary of the target mass range + * @param maxMass + * Upper boundary of the target mass range + * @param mfRange + * A range of elemental compositions defining the search space + * @throws IllegalArgumentException + * In case some of the isotopes in mfRange has undefined exact + * mass or in case illegal parameters are provided (e.g., + * negative mass values or empty MolecularFormulaRange) + * @see MolecularFormulaRange + */ + public FullEnumerationFormulaGenerator(final IChemObjectBuilder builder, + final double minMass, final double maxMass, + final MolecularFormulaRange mfRange) { + + logger.info("Initiate MolecularFormulaGenerator, mass range ", minMass, + "-", maxMass); + + // Check parameter values + if ((minMass < 0.0) || (maxMass < 0.0)) { + throw (new IllegalArgumentException( + "The minimum and maximum mass values must be >=0")); + } + + if ((minMass > maxMass)) { + throw (new IllegalArgumentException( + "Minimum mass must be <= maximum mass")); + } + + if ((mfRange == null) || (mfRange.getIsotopeCount() == 0)) { + throw (new IllegalArgumentException( + "The MolecularFormulaRange parameter must be non-null and must contain at least one isotope")); + } + + // Save the parameters + this.builder = builder; + this.minMass = minMass; + this.maxMass = maxMass; + + // Sort the elements by mass in ascending order. That speeds up + // the search. + final TreeSet isotopesSet = new TreeSet( + new IIsotopeSorterByMass()); + for (IIsotope isotope : mfRange.isotopes()) { + // Check if exact mass of each isotope is set + if (isotope.getExactMass() == null) + throw new IllegalArgumentException( + "The exact mass value of isotope " + isotope + + " is not set"); + isotopesSet.add(isotope); + } + isotopes = isotopesSet.toArray(new IIsotope[isotopesSet.size()]); + + // Save the element counts from the provided MolecularFormulaRange + minCounts = new int[isotopes.length]; + maxCounts = new int[isotopes.length]; + for (int i = 0; i < isotopes.length; i++) { + minCounts[i] = mfRange.getIsotopeCountMin(isotopes[i]); + maxCounts[i] = mfRange.getIsotopeCountMax(isotopes[i]); + + // Update the maximum count according to the mass limit + int maxCountAccordingToMass = (int) Math.floor(maxMass + / isotopes[i].getExactMass()); + if (maxCounts[i] > maxCountAccordingToMass) + maxCounts[i] = maxCountAccordingToMass; + + } + + // Set the current counters to minimal values, initially + currentCounts = Arrays.copyOf(minCounts, minCounts.length); + + } + + /** + * Returns next generated formula or null in case no new formula was found + * (search is finished). There is no guaranteed order in which the formulas + * are generated. + */ + public synchronized IMolecularFormula getNextFormula() { + + // Main cycle iterating through element counters + mainCycle: while (searchRunning) { + + double currentMass = calculateCurrentMass(); + + // Heuristics: if we are over the mass, it is meaningless to add + // more atoms, so let's jump directly to the maximum count at the + // position we increased last time. + if (currentMass > maxMass) { + // Keep a lock on the currentCounts, because + // getFinishedPercentage() might be called from another + // thread + synchronized (currentCounts) { + currentCounts[lastIncreasedPosition] = maxCounts[lastIncreasedPosition]; + increaseCounter(lastIncreasedPosition); + } + continue mainCycle; + } + + // If the current formula mass fits in the target mass range, let's + // return it + if ((currentMass >= minMass) && (currentMass <= maxMass)) { + IMolecularFormula cdkFormula = generateFormulaObject(); + increaseCounter(0); + return cdkFormula; + } + + // Increase the counter + increaseCounter(0); + } + + // All combinations tested, return null + return null; + + } + + /** + * Generates a {@link IMolecularFormulaSet} by repeatedly calling {@link + * FullEnumerationFormulaGenerator#getNextFormula()} until all possible formulas are generated. There is no + * guaranteed order to the formulas in the resulting + * {@link IMolecularFormulaSet}. + * + * Note: if some formulas were already generated by calling {@link + * FullEnumerationFormulaGenerator#getNextFormula()} on this MolecularFormulaGenerator instance, those + * formulas will not be included in the returned + * {@link IMolecularFormulaSet}. + * + * @see #getNextFormula() + */ + public synchronized IMolecularFormulaSet getAllFormulas() { + + final IMolecularFormulaSet result = builder + .newInstance(IMolecularFormulaSet.class); + IMolecularFormula nextFormula; + while ((nextFormula = getNextFormula()) != null) { + result.addMolecularFormula(nextFormula); + } + return result; + } + + /** + * Increases the internal counter array currentCounts[] at given position. + * If the position has reached its maximum value, its value is reset to the + * minimum value and the next position is increased. + * + * @param position + * Index to the currentCounts[] array that should be increased + */ + private void increaseCounter(int position) { + + // This should never happen, but let's check, just in case + if (position >= currentCounts.length) { + throw new IllegalArgumentException( + "Cannot increase the currentCounts counter at position " + + position); + } + + lastIncreasedPosition = position; + + // Keep a lock on the currentCounts, because + // getFinishedPercentage() might be called from another thread + synchronized (currentCounts) { + + if (currentCounts[position] < maxCounts[position]) { + currentCounts[position]++; + } else { + // Reset the value at given position, and increase the next one. + if (position < isotopes.length - 1) { + currentCounts[position] = minCounts[position]; + increaseCounter(position + 1); + } else { + // If we are already at the last position, that means we + // have covered the whole search space and we can stop the + // search. + searchRunning = false; + + // Copy the maxCounts[] array to currentCounts[]. This + // ensures that getFinishedPercentage() will return 1.0 + System.arraycopy(maxCounts, 0, currentCounts, 0, + maxCounts.length); + } + } + } + + } + + /** + * Calculates the exact mass of the currently evaluated formula. Basically, + * it multiplies the currentCounts[] array by the masses of the isotopes in + * the isotopes[] array. + */ + private double calculateCurrentMass() { + double mass = 0; + for (int i = 0; i < isotopes.length; i++) { + mass += currentCounts[i] * isotopes[i].getExactMass(); + } + + return mass; + } + + /** + * Generates a MolecularFormula object that contains the isotopes in the + * isotopes[] array with counts in the currentCounts[] array. In other + * words, generates a proper CDK representation of the currently examined + * formula. + */ + private IMolecularFormula generateFormulaObject() { + IMolecularFormula formulaObject = builder + .newInstance(IMolecularFormula.class); + for (int i = 0; i < isotopes.length; i++) { + if (currentCounts[i] == 0) + continue; + formulaObject.addIsotope(isotopes[i], currentCounts[i]); + } + return formulaObject; + + } + + /** + * Returns a value between 0.0 and 1.0 indicating what portion of the search + * space has been examined so far by this MolecularFormulaGenerator. Before + * the first call to {@link FullEnumerationFormulaGenerator#getNextFormula()}, this method returns 0. After + * all possible formulas are generated, this method returns 1.0 (the exact + * returned value might be slightly off due to rounding errors). This method + * can be called from any thread. + */ + public double getFinishedPercentage() { + double result = 0.0; + double remainingPerc = 1.0; + + // Keep a lock on currentCounts, otherwise it might change during the + // calculation + synchronized (currentCounts) { + for (int i = currentCounts.length - 1; i >= 0; i--) { + double max = maxCounts[i]; + if (i > 0) + max += 1.0; + result += remainingPerc * ((double) currentCounts[i] / max); + remainingPerc /= max; + } + } + return result; + } + + /** + * Cancel the current search. This method can be called from any thread. If + * another thread is executing the {@link FullEnumerationFormulaGenerator#getNextFormula()} method, that + * method call will return immediately with null return value. If another + * thread is executing the {@link FullEnumerationFormulaGenerator#getAllFormulas()} method, that method call + * will return immediately, returning all formulas generated until this + * moment. The search cannot be restarted once canceled - any subsequent + * calls to {@link FullEnumerationFormulaGenerator#getNextFormula()} will return null. + */ + public void cancel() { + logger.info("Canceling MolecularFormulaGenerator"); + this.searchRunning = false; + } + + /** + * A simple {@link Comparator} implementation for sorting IIsotopes by their + * mass + */ + private class IIsotopeSorterByMass implements Comparator { + @Override + public int compare(IIsotope i1, IIsotope i2) { + return Double.compare(i1.getExactMass(), i2.getExactMass()); + } + } +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/formula/IFormulaGenerator.java b/tool/formula/src/main/java/org/openscience/cdk/formula/IFormulaGenerator.java new file mode 100644 index 00000000000..bf84dde3463 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/formula/IFormulaGenerator.java @@ -0,0 +1,11 @@ +package org.openscience.cdk.formula; + +import org.openscience.cdk.interfaces.IMolecularFormula; +import org.openscience.cdk.interfaces.IMolecularFormulaSet; + +interface IFormulaGenerator { + public IMolecularFormula getNextFormula(); + public IMolecularFormulaSet getAllFormulas(); + public double getFinishedPercentage(); + public void cancel(); +} diff --git a/tool/formula/src/main/java/org/openscience/cdk/formula/MolecularFormulaGenerator.java b/tool/formula/src/main/java/org/openscience/cdk/formula/MolecularFormulaGenerator.java index 376d25840ff..ccf56d4b590 100644 --- a/tool/formula/src/main/java/org/openscience/cdk/formula/MolecularFormulaGenerator.java +++ b/tool/formula/src/main/java/org/openscience/cdk/formula/MolecularFormulaGenerator.java @@ -1,45 +1,16 @@ -/* Copyright (C) 2014 Tomas Pluskal - * - * Contact: cdk-devel@lists.sourceforge.net - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * All we ask is that proper credit is given for our work, which includes - * - but is not limited to - adding the above copyright notice to the beginning - * of your source code files, and to any copyright notice that you may distribute - * with programs based on this work. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - */ package org.openscience.cdk.formula; -import java.util.Arrays; -import java.util.Comparator; -import java.util.TreeSet; - import org.openscience.cdk.interfaces.IChemObjectBuilder; import org.openscience.cdk.interfaces.IIsotope; import org.openscience.cdk.interfaces.IMolecularFormula; import org.openscience.cdk.interfaces.IMolecularFormulaSet; -import org.openscience.cdk.tools.ILoggingTool; -import org.openscience.cdk.tools.LoggingToolFactory; /** * This class generates molecular formulas within given mass range and elemental - * composition. There is no guaranteed order in which the formulas are - * generated. - * + * composition. + * * Usage: - * + * *
  * IsotopeFactory ifac = Isotopes.getInstance();
  * IIsotope c = ifac.getMajorIsotope("C");
@@ -48,7 +19,7 @@
  * IIsotope o = ifac.getMajorIsotope("O");
  * IIsotope p = ifac.getMajorIsotope("P");
  * IIsotope s = ifac.getMajorIsotope("S");
- * 
+ *
  * MolecularFormulaRange mfRange = new MolecularFormulaRange();
  * mfRange.addIsotope(c, 0, 50);
  * mfRange.addIsotope(h, 0, 100);
@@ -56,63 +27,36 @@
  * mfRange.addIsotope(n, 0, 50);
  * mfRange.addIsotope(p, 0, 10);
  * mfRange.addIsotope(s, 0, 10);
- * 
+ *
  * MolecularFormulaGenerator mfg = new MolecularFormulaGenerator(builder, minMass,
  *         maxMass, mfRange);
  * double minMass = 133.003;
  * double maxMass = 133.005;
  * IMolecularFormulaSet mfSet = mfg.getAllFormulas();
  * 
- * - * The code was originally developed for a MZmine 2 framework module, published - * in Pluskal et al. {@cdk.cite Pluskal2012}. - * + * + * This class offers two implementations: The Round Robin algorithm {@cdk.cite Boecker2008} on mass ranges + * {@cdk.cite Duehrkop2013} is used on most inputs. For special cases (e.g. single elements, extremely large mass ranges) + * a full enumeration algorithm {@cdk.cite Pluskal2012} is used. + * + * The Round Robin algorithm was originally developed for the SIRIUS 3 software. The full enumeration algorithm was + * originally developed for a MZmine 2 framework module, published in Pluskal et al. {@cdk.cite Pluskal2012}. + * * @cdk.module formula - * @author Tomas Pluskal + * @author Tomas Pluskal, Kai Dührkop, Marcus Ludwig * @cdk.created 2014-12-28 * @cdk.githash */ -public class MolecularFormulaGenerator { - - private static final ILoggingTool logger = LoggingToolFactory - .createLoggingTool(MolecularFormulaGenerator.class); - - private final IChemObjectBuilder builder; +public class MolecularFormulaGenerator implements IFormulaGenerator { /** - * Mass range to search by this instance of MolecularFormulaGenerator + * The chosen implementation */ - private final double minMass, maxMass; - - /** - * Internal arrays of isotopes (elements) used for the formula generation, - * their minimal and maximal counts, and the current counts, which - * correspond to the latest formula candidate. - * - * For example, let's assume we set isotopes=[C,H,N], minCounts=[0,0,0], and - * maxCounts=[3,3,3]. Then, currentCounts will be iterated in the following - * order: [0,0,0], [1,0,0], [2,0,0], [3,0,0], [0,1,0], [1,1,0], [2,1,0], - * [3,1,0], [0,2,0], [1,2,0], [2,2,0], etc. - * - * The lastIncreasedPosition index indicates the last position in - * currentCounts that was increased by calling increaseCounter(position) - */ - private final IIsotope isotopes[]; - private final int minCounts[], maxCounts[], currentCounts[]; - private int lastIncreasedPosition = 0; - - /** - * A flag indicating that the formula generation is running. If the search - * is finished (the whole search space has been examined) or the search is - * canceled by calling the cancel() method, this flag is set to false. - * - * @see #cancel() - */ - private boolean searchRunning = true; + protected final IFormulaGenerator formulaGenerator; /** * Initiate the MolecularFormulaGenerator. - * + * * @param minMass * Lower boundary of the target mass range * @param maxMass @@ -126,65 +70,40 @@ public class MolecularFormulaGenerator { * @see MolecularFormulaRange */ public MolecularFormulaGenerator(final IChemObjectBuilder builder, - final double minMass, final double maxMass, - final MolecularFormulaRange mfRange) { - - logger.info("Initiate MolecularFormulaGenerator, mass range ", minMass, - "-", maxMass); - - // Check parameter values - if ((minMass < 0.0) || (maxMass < 0.0)) { - throw (new IllegalArgumentException( - "The minimum and maximum mass values must be >=0")); - } - - if ((minMass > maxMass)) { - throw (new IllegalArgumentException( - "Minimum mass must be <= maximum mass")); - } - - if ((mfRange == null) || (mfRange.getIsotopeCount() == 0)) { - throw (new IllegalArgumentException( - "The MolecularFormulaRange parameter must be non-null and must contain at least one isotope")); - } - - // Save the parameters - this.builder = builder; - this.minMass = minMass; - this.maxMass = maxMass; - - // Sort the elements by mass in ascending order. That speeds up - // the search. - final TreeSet isotopesSet = new TreeSet( - new IIsotopeSorterByMass()); - for (IIsotope isotope : mfRange.isotopes()) { - // Check if exact mass of each isotope is set - if (isotope.getExactMass() == null) - throw new IllegalArgumentException( - "The exact mass value of isotope " + isotope - + " is not set"); - isotopesSet.add(isotope); - } - isotopes = isotopesSet.toArray(new IIsotope[isotopesSet.size()]); - - // Save the element counts from the provided MolecularFormulaRange - minCounts = new int[isotopes.length]; - maxCounts = new int[isotopes.length]; - for (int i = 0; i < isotopes.length; i++) { - minCounts[i] = mfRange.getIsotopeCountMin(isotopes[i]); - maxCounts[i] = mfRange.getIsotopeCountMax(isotopes[i]); - - // Update the maximum count according to the mass limit - int maxCountAccordingToMass = (int) Math.floor(maxMass - / isotopes[i].getExactMass()); - if (maxCounts[i] > maxCountAccordingToMass) - maxCounts[i] = maxCountAccordingToMass; - - } - - // Set the current counters to minimal values, initially - currentCounts = Arrays.copyOf(minCounts, minCounts.length); + final double minMass, final double maxMass, + final MolecularFormulaRange mfRange) { + checkInputParameters(builder,minMass,maxMass,mfRange); + this.formulaGenerator = isIllPosed(minMass, maxMass, mfRange) ? new FullEnumerationFormulaGenerator(builder, minMass, maxMass, mfRange) : new RoundRobinFormulaGenerator(builder, minMass, maxMass, mfRange); + } + /** + * Decides wheter to use the round robin algorithm or full enumeration algorithm. + * The round robin implementation here is optimized for chemical elements in organic compounds. It gets slow + * if + * - the mass of the smallest element is very large (i.e. hydrogen is not allowed) + * - the maximal mass to decompose is too large (round robin always decomposes integers. Therefore, the mass have + * to be small enough to be represented as 32 bit integer) + * - the number of elements in the set is extremely small (in this case, however, the problem becomes trivial anyways) + * + * In theory we could handle these problems by optimizing the way DECOMP discretizes the masses. However, it's easier + * to just fall back to the full enumeration method if a problem occurs (especially, because most of the problems + * lead to trivial cases that are fast to compute). + * + * @return true if the problem is ill-posed (i.e. should be calculated by full enumeration method) + */ + private static boolean isIllPosed(double minMass, double maxMass, MolecularFormulaRange mfRange) { + // when the number of integers to decompose is incredible large + // we have to adjust the internal settings (e.g. precision!) + // instead we simply fallback to the full enumeration method + if (maxMass-minMass > 1) return true; + if (maxMass > 400000) return true; + // if the number of elements to decompose is very small + // we fall back to the full enumeration methods as the + // minimal decomposable mass of a certain residue class might + // exceed the 32 bit integer space + if (mfRange.getIsotopeCount() <= 2) return true; + + return false; } /** @@ -192,194 +111,85 @@ public MolecularFormulaGenerator(final IChemObjectBuilder builder, * (search is finished). There is no guaranteed order in which the formulas * are generated. */ - public synchronized IMolecularFormula getNextFormula() { - - // Main cycle iterating through element counters - mainCycle: while (searchRunning) { - - double currentMass = calculateCurrentMass(); - - // Heuristics: if we are over the mass, it is meaningless to add - // more atoms, so let's jump directly to the maximum count at the - // position we increased last time. - if (currentMass > maxMass) { - // Keep a lock on the currentCounts, because - // getFinishedPercentage() might be called from another - // thread - synchronized (currentCounts) { - currentCounts[lastIncreasedPosition] = maxCounts[lastIncreasedPosition]; - increaseCounter(lastIncreasedPosition); - } - continue mainCycle; - } - - // If the current formula mass fits in the target mass range, let's - // return it - if ((currentMass >= minMass) && (currentMass <= maxMass)) { - IMolecularFormula cdkFormula = generateFormulaObject(); - increaseCounter(0); - return cdkFormula; - } - - // Increase the counter - increaseCounter(0); - } - - // All combinations tested, return null - return null; - + @Override + public IMolecularFormula getNextFormula() { + return formulaGenerator.getNextFormula(); } /** * Generates a {@link IMolecularFormulaSet} by repeatedly calling {@link - * getNextFormula()} until all possible formulas are generated. There is no + * MolecularFormulaGenerator#getNextFormula()} until all possible formulas are generated. There is no * guaranteed order to the formulas in the resulting * {@link IMolecularFormulaSet}. - * + * * Note: if some formulas were already generated by calling {@link - * getNextFormula()} on this MolecularFormulaGenerator instance, those + * MolecularFormulaGenerator#getNextFormula()} on this MolecularFormulaGenerator instance, those * formulas will not be included in the returned * {@link IMolecularFormulaSet}. - * + * * @see #getNextFormula() */ - public synchronized IMolecularFormulaSet getAllFormulas() { - - final IMolecularFormulaSet result = builder - .newInstance(IMolecularFormulaSet.class); - IMolecularFormula nextFormula; - while ((nextFormula = getNextFormula()) != null) { - result.addMolecularFormula(nextFormula); - } - return result; - } - - /** - * Increases the internal counter array currentCounts[] at given position. - * If the position has reached its maximum value, its value is reset to the - * minimum value and the next position is increased. - * - * @param position - * Index to the currentCounts[] array that should be increased - */ - private void increaseCounter(int position) { - - // This should never happen, but let's check, just in case - if (position >= currentCounts.length) { - throw new IllegalArgumentException( - "Cannot increase the currentCounts counter at position " - + position); - } - - lastIncreasedPosition = position; - - // Keep a lock on the currentCounts, because - // getFinishedPercentage() might be called from another thread - synchronized (currentCounts) { - - if (currentCounts[position] < maxCounts[position]) { - currentCounts[position]++; - } else { - // Reset the value at given position, and increase the next one. - if (position < isotopes.length - 1) { - currentCounts[position] = minCounts[position]; - increaseCounter(position + 1); - } else { - // If we are already at the last position, that means we - // have covered the whole search space and we can stop the - // search. - searchRunning = false; - - // Copy the maxCounts[] array to currentCounts[]. This - // ensures that getFinishedPercentage() will return 1.0 - System.arraycopy(maxCounts, 0, currentCounts, 0, - maxCounts.length); - } - } - } - - } - - /** - * Calculates the exact mass of the currently evaluated formula. Basically, - * it multiplies the currentCounts[] array by the masses of the isotopes in - * the isotopes[] array. - */ - private double calculateCurrentMass() { - double mass = 0; - for (int i = 0; i < isotopes.length; i++) { - mass += currentCounts[i] * isotopes[i].getExactMass(); - } - - return mass; - } - - /** - * Generates a MolecularFormula object that contains the isotopes in the - * isotopes[] array with counts in the currentCounts[] array. In other - * words, generates a proper CDK representation of the currently examined - * formula. - */ - private IMolecularFormula generateFormulaObject() { - IMolecularFormula formulaObject = builder - .newInstance(IMolecularFormula.class); - for (int i = 0; i < isotopes.length; i++) { - if (currentCounts[i] == 0) - continue; - formulaObject.addIsotope(isotopes[i], currentCounts[i]); - } - return formulaObject; - + @Override + public IMolecularFormulaSet getAllFormulas() { + return formulaGenerator.getAllFormulas(); } /** * Returns a value between 0.0 and 1.0 indicating what portion of the search * space has been examined so far by this MolecularFormulaGenerator. Before - * the first call to {@link getNextFormula()}, this method returns 0. After + * the first call to {@link MolecularFormulaGenerator#getNextFormula()}, this method returns 0. After * all possible formulas are generated, this method returns 1.0 (the exact * returned value might be slightly off due to rounding errors). This method * can be called from any thread. */ + @Override public double getFinishedPercentage() { - double result = 0.0; - double remainingPerc = 1.0; - - // Keep a lock on currentCounts, otherwise it might change during the - // calculation - synchronized (currentCounts) { - for (int i = currentCounts.length - 1; i >= 0; i--) { - double max = maxCounts[i]; - if (i > 0) - max += 1.0; - result += remainingPerc * ((double) currentCounts[i] / max); - remainingPerc /= max; - } - } - return result; + return formulaGenerator.getFinishedPercentage(); } /** * Cancel the current search. This method can be called from any thread. If - * another thread is executing the {@link getNextFormula()} method, that + * another thread is executing the {@link MolecularFormulaGenerator#getNextFormula()} method, that * method call will return immediately with null return value. If another - * thread is executing the {@link getAllFormulas()} method, that method call + * thread is executing the {@link MolecularFormulaGenerator#getAllFormulas()} method, that method call * will return immediately, returning all formulas generated until this * moment. The search cannot be restarted once canceled - any subsequent - * calls to {@link getNextFormula()} will return null. + * calls to {@link MolecularFormulaGenerator#getNextFormula()} will return null. */ + @Override public void cancel() { - logger.info("Canceling MolecularFormulaGenerator"); - this.searchRunning = false; + formulaGenerator.cancel(); } /** - * A simple {@link Comparator} implementation for sorting IIsotopes by their - * mass + * Checks if input parameters are valid and throws an IllegalArgumentException otherwise. */ - private class IIsotopeSorterByMass implements Comparator { - @Override - public int compare(IIsotope i1, IIsotope i2) { - return Double.compare(i1.getExactMass(), i2.getExactMass()); + protected void checkInputParameters(final IChemObjectBuilder builder, + final double minMass, final double maxMass, + final MolecularFormulaRange mfRange) { + if ((minMass < 0.0) || (maxMass < 0.0)) { + throw (new IllegalArgumentException( + "The minimum and maximum mass values must be >=0")); + } + + if ((minMass > maxMass)) { + throw (new IllegalArgumentException( + "Minimum mass must be <= maximum mass")); + } + + if ((mfRange == null) || (mfRange.getIsotopeCount() == 0)) { + throw (new IllegalArgumentException( + "The MolecularFormulaRange parameter must be non-null and must contain at least one isotope")); + } + + // Sort the elements by mass in ascending order. That speeds up + // the search. + for (IIsotope isotope : mfRange.isotopes()) { + // Check if exact mass of each isotope is set + if (isotope.getExactMass() == null) + throw new IllegalArgumentException( + "The exact mass value of isotope " + isotope + + " is not set"); } } } diff --git a/tool/formula/src/main/java/org/openscience/cdk/formula/RoundRobinFormulaGenerator.java b/tool/formula/src/main/java/org/openscience/cdk/formula/RoundRobinFormulaGenerator.java new file mode 100644 index 00000000000..1e93162e6a3 --- /dev/null +++ b/tool/formula/src/main/java/org/openscience/cdk/formula/RoundRobinFormulaGenerator.java @@ -0,0 +1,130 @@ +package org.openscience.cdk.formula; + +import org.openscience.cdk.decomp.ChemicalAlphabet; +import org.openscience.cdk.decomp.DecompIterator; +import org.openscience.cdk.decomp.DecomposerFactory; +import org.openscience.cdk.decomp.Interval; +import org.openscience.cdk.interfaces.IChemObjectBuilder; +import org.openscience.cdk.interfaces.IIsotope; +import org.openscience.cdk.interfaces.IMolecularFormula; +import org.openscience.cdk.interfaces.IMolecularFormulaSet; + +import java.util.HashMap; + +/** + * This class generates molecular formulas within given mass range and elemental + * composition. It should not be used directly but via the {@link MolecularFormulaGenerator} as it cannot deal with + * all kind of inputs. + * + * This class is using the Round Robin algorithm {@cdk.cite Boecker2008} on mass ranges + * {@cdk.cite Duehrkop2013}. It uses dynamic programming to compute an extended residue table which allows a constant + * time lookup if some integer mass is decomposable over a certain alphabet. For each alphabet this table has to be + * computed only once and is then cached in memory. Using this table the algorithm can decide directly which masses + * are decomposable and, therefore, is only enumerating formulas which masses are in the given integer mass range. The + * mass range decomposer is using a logarithmic number of tables, one for each alphabet and an allowed mass deviation. + * It, therefore, allows to decompose a whole range of integer numbers instead of a single one. + * + * As masses are real values, the decomposer has to translate values from real space to integer space and vice versa. + * This translation is done via multiplying with a blow-up factor (which is by default 5963.337687) and rounding the + * results. The blow-up factor is optimized for organic molecules. For other alphabets (e.g. amino acids or molecules + * without hydrogens) another blow-up factor have to be chosen. Therefore, it is recommended to use this decomposer + * only for organic molecular formulas. + */ +class RoundRobinFormulaGenerator implements IFormulaGenerator { + + /** + * generates the IMolecularFormula and IMolecularFormulaSet instances + */ + protected final IChemObjectBuilder builder; + + /** + * the decomposer algorithm with the cached extended residue table + */ + protected final DecompIterator decomposer; + + /** + * The used chemical alphabet (a mapping of IIsotopes to their masses) + */ + protected final ChemicalAlphabet alphabet; + + /** + * a flag indicating if the algorithm is done or should be canceled. + * This flag have to be volatile to allow other threads to cancel the enumeration. + */ + protected volatile boolean done; + + /** + * Initiate the MolecularFormulaGenerator. + * + * @param minMass + * Lower boundary of the target mass range + * @param maxMass + * Upper boundary of the target mass range + * @param mfRange + * A range of elemental compositions defining the search space + * @throws IllegalArgumentException + * In case some of the isotopes in mfRange has undefined exact + * mass or in case illegal parameters are provided (e.g., + * negative mass values or empty MolecularFormulaRange) + * @see MolecularFormulaRange + */ + public RoundRobinFormulaGenerator(final IChemObjectBuilder builder, + final double minMass, final double maxMass, + final MolecularFormulaRange mfRange) { + this.builder = builder; + this.alphabet = new ChemicalAlphabet(builder, mfRange); + final HashMap boundaries = new HashMap<>(alphabet.size()); + for (IIsotope i : mfRange.isotopes()) { + boundaries.put(i, new Interval(mfRange.getIsotopeCountMin(i), mfRange.getIsotopeCountMax(i))); + } + this.decomposer = DecomposerFactory.getInstance().getDecomposerFor(alphabet).decomposeIterator(minMass, maxMass,boundaries); + this.done = false; + } + + /** + * @see MolecularFormulaGenerator#getNextFormula() + */ + @Override + public synchronized IMolecularFormula getNextFormula() { + if (!done && decomposer.next()) { + return alphabet.buildFormulaFromCompomere(decomposer.getCurrentCompomere(), decomposer.getAlphabetOrder()); + } else { + done = true; + return null; + } + } + + /** + * @see MolecularFormulaGenerator#getAllFormulas() + */ + @Override + public synchronized IMolecularFormulaSet getAllFormulas() { + final IMolecularFormulaSet set = builder.newInstance(IMolecularFormulaSet.class); + if (done) return set; + for (IMolecularFormula formula = getNextFormula(); formula != null; formula = getNextFormula()) { + set.addMolecularFormula(formula); + if (done) return set; + } + done = true; + return set; + } + + /** + * This method does not work for Round Robin as the algorithm only enumerates formulas which really have the + * searched mass range (except for false positives due to rounding errors). As the exact number of molecular formulas + * with a given mass is unknown (calculating it is as expensive as enumerating them) there is no way to give a + * progress number. Therefore, the method returns just 0 if it's enumerating and 1 if it's done. + */ + @Override + public double getFinishedPercentage() { + return done ? 1 : 0; + } + + /** + * Cancel the computation + */ + @Override + public void cancel() { + done=true; + } +} diff --git a/tool/formula/src/test/java/org/openscience/cdk/formula/MolecularFormulaGeneratorTest.java b/tool/formula/src/test/java/org/openscience/cdk/formula/MolecularFormulaGeneratorTest.java index 48571489048..0552dae3f38 100644 --- a/tool/formula/src/test/java/org/openscience/cdk/formula/MolecularFormulaGeneratorTest.java +++ b/tool/formula/src/test/java/org/openscience/cdk/formula/MolecularFormulaGeneratorTest.java @@ -31,7 +31,7 @@ import org.openscience.cdk.tools.manipulator.MolecularFormulaManipulator; /** - * Checks the functionality of the MolecularFormulaGenerator. + * Checks the functionality of the FastFormulaGenerator. * * @cdk.module test-formula */ @@ -40,6 +40,7 @@ public class MolecularFormulaGeneratorTest extends CDKTestCase { private final IChemObjectBuilder builder = SilentChemObjectBuilder .getInstance(); + /** * Test the getNextFormula() method */ @@ -476,6 +477,9 @@ public void testHighMass() throws Exception { Assert.assertEquals(1, mfSet.size()); Assert.assertEquals("C374H623N103O116S", MolecularFormulaManipulator .getString(mfSet.getMolecularFormula(0))); + + + ////////////////// } /**