Skip to content

Commit

Permalink
implement round robin algorithm for decomposing molecular formulas ta…
Browse files Browse the repository at this point in the history
…ken from SIRIUS
  • Loading branch information
kaibioinfo committed Jul 28, 2016
1 parent 397a0c5 commit 86b4065
Show file tree
Hide file tree
Showing 14 changed files with 2,144 additions and 288 deletions.
23 changes: 22 additions & 1 deletion doc/refs/cheminf.bibx
Original file line number Diff line number Diff line change
Expand Up @@ -1452,5 +1452,26 @@ Method </bibtex:title>
<bibtex:pages>4396-4403</bibtex:pages>
</bibtex:article>
</bibtex:entry>


<bibtex:entry id="Boecker2008">
<bibtex:article>
<bibtex:title>DECOMP--from interpreting Mass Spectrometry peaks to solving the Money Changing Problem.</bibtex:title>
<bibtex:author>Böcker, Sebastian and Lipták, Zsuzsanna and Martin, Marcel and Pervukhin, Anton and Sudek, Henner</bibtex:author>
<bibtex:year>2008</bibtex:year>
<bibtex:journal>Bioinformatics</bibtex:journal>
<bibtex:volume>24</bibtex:volume>
<bibtex:number>4</bibtex:number>
<bibtex:pages>591--593</bibtex:pages>
<bibtex:url>http://bioinformatics.oxfordjournals.org/cgi/reprint/24/4/591?ijkey=1lM50Bkzz4SCLsa</bibtex:url>
</bibtex:article>
</bibtex:entry>
<bibtex:entry id="Duehrkop2013">
<bibtex:inproceedings>
<bibtex:author>Dührkop, Kai and Ludwig, Marcus and Meusel, Marvin and Böcker, Sebastian</bibtex:author>
<bibtex:title>Faster mass decomposition</bibtex:title>
<bibtex:year>2013</bibtex:year>
<bibtex:booktitle>Proc. of Workshop on Algorithms in Bioinformatics (WABI 2013)</bibtex:booktitle>
<bibtex:url>http://arxiv.org/abs/1307.7805</bibtex:url>
</bibtex:inproceedings>
</bibtex:entry>
</bibtex:file>
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* This file is part of the SIRIUS library for analyzing MS and MS/MS data
*
* Copyright (C) 2013-2015 Kai Dührkop
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with SIRIUS. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openscience.cdk.decomp;

/**
 * An alphabet over which a given weight is decomposed. The alphabet is a vector c_1..c_k of k characters of
 * type T, each of which is mapped to a weight. Characters are accessible by index, and the index of a
 * character can be looked up from the character itself.
 *
 * @param <T> type of a single character in the alphabet
 */
interface Alphabet<T> {

    /**
     * @return the number of characters in the alphabet; valid character indices are 0..{@literal <} size
     */
    int size();

    /**
     * @param i index of the character
     * @return the weight of the character c_i
     */
    double weightOf(int i);

    /**
     * @param i index of the character
     * @return the character c_i
     */
    T get(int i);

    /**
     * Looks up the index of a character. Implementations should keep this lookup fast: internally a modified,
     * ordered alphabet is used, and it has to be mapped back to the original alphabet.
     *
     * @param character the character to look up
     * @return the index of the character
     */
    int indexOf(T character);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* This file is part of the SIRIUS library for analyzing MS and MS/MS data
*
* Copyright (C) 2013-2015 Kai Dührkop
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with SIRIUS. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openscience.cdk.decomp;

import org.openscience.cdk.formula.MolecularFormulaRange;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IIsotope;
import org.openscience.cdk.interfaces.IMolecularFormula;

import java.util.Arrays;

/**
 * An {@link Alphabet} whose characters are chemical elements, represented as {@link IIsotope} instances.
 */
public class ChemicalAlphabet implements Alphabet<IIsotope> {

    /**
     * Builder used to create the IMolecularFormula instances that compomeres are converted into.
     */
    protected final IChemObjectBuilder objectBuilder;

    /**
     * The characters (chemical elements) of this alphabet.
     */
    protected final IIsotope[] characters;

    /**
     * Creates a chemical alphabet from a search space. Only isotopes whose maximal count in the search space
     * is greater than zero become characters of the alphabet; their order follows the iteration order of the
     * search space.
     *
     * @param builder               object builder used later to create molecular formulas
     * @param molecularFormulaRange search space defining the allowed elements
     */
    public ChemicalAlphabet(IChemObjectBuilder builder, MolecularFormulaRange molecularFormulaRange) {
        this.objectBuilder = builder;
        final IIsotope[] buffer = new IIsotope[molecularFormulaRange.getIsotopeCount()];
        int used = 0;
        for (IIsotope isotope : molecularFormulaRange.isotopes()) {
            final boolean allowed = molecularFormulaRange.getIsotopeCountMax(isotope) > 0;
            if (allowed) {
                buffer[used] = isotope;
                used++;
            }
        }
        // Shrink the buffer only if some isotopes were filtered out.
        this.characters = (used < buffer.length) ? Arrays.copyOf(buffer, used) : buffer;
    }

    /**
     * Converts a compomere (a multiset of characters) into an IMolecularFormula.
     *
     * @param compomere      amount of each character, indexed by position in the compomere
     * @param orderedIndizes for each compomere position, the index of the corresponding character in this alphabet
     * @return a new formula containing every character whose amount is greater than zero
     */
    public IMolecularFormula buildFormulaFromCompomere(int[] compomere, int[] orderedIndizes) {
        final IMolecularFormula result = objectBuilder.newInstance(IMolecularFormula.class);
        for (int position = 0; position < orderedIndizes.length; position++) {
            final int amount = compomere[position];
            if (amount <= 0) {
                continue;
            }
            result.addIsotope(characters[orderedIndizes[position]], amount);
        }
        return result;
    }

    /**
     * Tests whether this alphabet is compatible with another one. In theory an alphabet would already be
     * compatible if it were a subset of the other alphabet; equality of the character arrays is checked
     * instead so that the relation stays symmetric.
     *
     * A decomposer can decompose every mass with an alphabet as long as that alphabet is compatible with the
     * decomposer's own alphabet.
     *
     * @param other the alphabet to compare with
     * @return true if both alphabets hold equal characters in the same order
     */
    public boolean isCompatible(ChemicalAlphabet other) {
        final IIsotope[] theirs = other.characters;
        return Arrays.equals(this.characters, theirs);
    }

    @Override
    public int size() {
        return characters.length;
    }

    @Override
    public double weightOf(int i) {
        final IIsotope isotope = characters[i];
        return isotope.getExactMass();
    }

    @Override
    public IIsotope get(int i) {
        return characters[i];
    }

    /**
     * Finds the index of the given character by a linear scan that compares object identity. The scan is
     * slow, but it only has to run once when the decomposer starts, so no hash table is needed.
     *
     * @param character the isotope instance to look up
     * @return the index of the character, or -1 if this instance is not part of the alphabet
     */
    @Override
    public int indexOf(IIsotope character) {
        int position = 0;
        for (IIsotope candidate : characters) {
            if (candidate == character) {
                return position;
            }
            position++;
        }
        return -1;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* This file is part of the SIRIUS library for analyzing MS and MS/MS data
*
* Copyright (C) 2013-2015 Kai Dührkop
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with SIRIUS. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openscience.cdk.decomp;

/**
 * Iterates over decompositions one at a time instead of keeping all of them in memory. This is slightly
 * slower than generating all decompositions within a single loop, but it is more flexible (e.g. for finding
 * a single decomposition that satisfies some rule).
 *
 * @param <T> type of the characters of the alphabet
 */
public interface DecompIterator<T> {

    /**
     * Advances the iterator by one step.
     *
     * @return true if a new decomposition was found, false if the iterator has reached the end
     */
    boolean next();

    /**
     * Gives access to the current compomere. The returned array is only valid during the current iteration
     * step and might be changed afterwards. Writing into the array is strictly forbidden; clone it first if
     * you need to keep or modify its content.
     *
     * @return the compomere, a tuple (a_1,...,a_n) where a_i is the amount of the i-th character in the
     *         ordered alphabet
     */
    int[] getCurrentCompomere();

    /**
     * @return the underlying (possibly unordered) alphabet
     */
    Alphabet<T> getAlphabet();

    /**
     * The order of characters in the compomere might differ from the order of characters in the alphabet
     * (i.e. the characters in the compomere are always ordered by mass). The returned array maps the i-th
     * character in the compomere to its appropriate index in the alphabet.
     *
     * @return mapping of positions in the compomere to character indices in the alphabet
     */
    int[] getAlphabetOrder();

    /**
     * Returns the character at the given position in the compomere.
     *
     * @param index index in the compomere
     * @return the corresponding character in the alphabet
     */
    T getCharacterAt(int index);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* This file is part of the SIRIUS library for analyzing MS and MS/MS data
*
* Copyright (C) 2013-2015 Kai Dührkop
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with SIRIUS. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openscience.cdk.decomp;

import org.openscience.cdk.interfaces.IIsotope;

import java.util.ArrayList;
import java.util.List;

/**
 * Creates {@link RangeMassDecomposer} instances for chemical alphabets and caches the most recently created
 * ones, so that repeated requests for a compatible alphabet reuse an existing decomposer.
 *
 * The cache holds at most {@link #maximalNumberOfCachedDecomposers} decomposers; when it is full, the
 * decomposer that was inserted first is dropped. A shared instance is available via {@link #getInstance()}.
 * Because that instance may be used from several threads, {@link #getDecomposerFor(ChemicalAlphabet)} is
 * synchronized on the factory.
 */
public final class DecomposerFactory {

    /**
     * Cached decomposers in insertion order (oldest first). Only accesses made through
     * {@link #getDecomposerFor(ChemicalAlphabet)} are guarded by the factory's monitor.
     */
    protected List<RangeMassDecomposer<IIsotope>> decomposerCache;

    /**
     * Upper bound on the number of decomposers kept in the cache.
     */
    protected static final int maximalNumberOfCachedDecomposers = 10;

    /**
     * The shared factory instance returned by {@link #getInstance()}.
     */
    protected final static DecomposerFactory instance = new DecomposerFactory();

    /**
     * @return the shared factory instance
     */
    public static DecomposerFactory getInstance() {
        return instance;
    }

    /**
     * Creates a factory with an empty cache.
     */
    public DecomposerFactory() {
        this.decomposerCache = new ArrayList<>(maximalNumberOfCachedDecomposers);
    }

    /**
     * Returns a decomposer for the given alphabet. A cached decomposer is returned if its alphabet is
     * compatible with the given one; otherwise a new decomposer is created, cached and returned. If the
     * cache is already full, the oldest cached decomposer is evicted first.
     *
     * The method is synchronized because the cache is a plain {@link ArrayList} that is iterated and modified
     * here; without the lock, concurrent callers of the shared instance could corrupt the list or trigger a
     * ConcurrentModificationException.
     *
     * @param alphabet the alphabet the decomposer has to support
     * @return a cached or newly created decomposer for the alphabet
     */
    public synchronized RangeMassDecomposer<IIsotope> getDecomposerFor(ChemicalAlphabet alphabet) {
        for (RangeMassDecomposer<IIsotope> decomposer : decomposerCache) {
            if (((ChemicalAlphabet) decomposer.getAlphabet()).isCompatible(alphabet)) {
                return decomposer;
            }
        }
        // Cache miss: make room by dropping the oldest entry, then create and remember a new decomposer.
        if (decomposerCache.size() >= maximalNumberOfCachedDecomposers) {
            decomposerCache.remove(0);
        }
        final RangeMassDecomposer<IIsotope> decomposer = new RangeMassDecomposer<>(alphabet);
        decomposerCache.add(decomposer);
        return decomposer;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* This file is part of the SIRIUS library for analyzing MS and MS/MS data
*
* Copyright (C) 2013-2015 Kai Dührkop
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with SIRIUS. If not, see <http://www.gnu.org/licenses/>.
*/
package org.openscience.cdk.decomp;

/**
* A simple POJO which defines a range from min to max (including max).
*/
public class Interval {

private final int min;
private final int max;

public Interval(int min, int max) {
this.min = min;
this.max = max;
}

public int getMin() {
return min;
}

public int getMax() {
return max;
}

public String toString() {
return "(" + min + " .. " + max + ")";
}
}

0 comments on commit 86b4065

Please sign in to comment.