From a43a478f36d9ae3e5b1d77a44dda9fe9ec91be95 Mon Sep 17 00:00:00 2001 From: John May Date: Tue, 23 Aug 2016 14:13:40 +0100 Subject: [PATCH] Extract and generalise SMARTS substructure functionality to a separate class. Copies exact functionality ATM. --- .../fingerprint/CircularFingerprinter.java | 228 ----------- .../CircularFingerprintSmartsTest.java | 5 +- .../cdk/smarts/SubstructureSmarts.java | 383 ++++++++++++++++++ .../cdk/smarts/SubstructureSmartsTest.java | 85 ++++ 4 files changed, 472 insertions(+), 229 deletions(-) create mode 100644 tool/smarts/src/main/java/org/openscience/cdk/smarts/SubstructureSmarts.java create mode 100644 tool/smarts/src/test/java/org/openscience/cdk/smarts/SubstructureSmartsTest.java diff --git a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java index 0695935f4a3..d825dbf5a2a 100644 --- a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java +++ b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java @@ -508,235 +508,7 @@ private void considerNewFP(FP newFP) { if (fp.iteration < newFP.iteration || fp.hashCode < newFP.hashCode) return; fplist.set(hit, newFP); } - - // ------------ Generation of fingerprint smarts ------------ - - //Helper variables for getFPSmarts() function - private HashMap nodes = new HashMap(); - private HashMap atomIndexes = new HashMap(); - private List traversedAtoms = new ArrayList(); - private List ringClosures = new ArrayList(); - private FP curFP = null; - private IAtomContainer curFPMolecule = null; - private int curIndex; - - private static class AtomNode - { - private int parent; - private int atom; - } - - /** - * Determines the structural fragment corresponding to particular FP object - * and returns it as SMARTS notation. - * This function must be called immediately after calculate() function since it uses the - * internal state of CircularFingerprint object. - * - * @param fp - the fingerprint - * @param molecule - the molecule for which the fingerprints were calculated - * @return the fragment as smarts/smiles - */ - public String getFPSmarts(FP fp, IAtomContainer molecule) - { - if (fp.atoms == null) - return null; - - int n = fp.atoms.length; - if (n == 0) - return null; - - curFP = fp; - curFPMolecule = molecule; - - nodes.clear(); - traversedAtoms.clear(); - atomIndexes.clear(); - ringClosures.clear(); - curIndex = 1; - - //Set initial node - AtomNode node = new AtomNode(); - node.parent = -1; - node.atom = fp.atoms[0]; - traversedAtoms.add(node.atom); - nodes.put(node.atom, node); - - return nodeToString(fp.atoms[0]); //traverse recursively all layers of the atom - } - - - /** - * Recursive generation of a smarts string for particular AtomNode - * @param atom index of atom - * @return SMARTS expression - */ - private String nodeToString(int atom) - { - StringBuilder sb = new StringBuilder(); - AtomNode curNode = nodes.get(atom); - List branches = new ArrayList(); - - for (int i = 0; i < atomAdj[atom].length; i++) - { - int neighborAt = atomAdj[atom][i]; - - if (neighborAt == curNode.parent) - continue; //This is the parent atom (it is already traversed) - - int neighborBo = bondAdj[atom][i]; - - AtomNode neighborNode = nodes.get(neighborAt); - if (neighborNode == null) // This node has not been registered yet - { - //Check for external atom (e.g. it is a neighbor atom which is not in the fp.atoms[] array) - if (Ints.indexOf(curFP.atoms, neighborAt) == -1) - { - String bond_str = ""; - if (bondArom[neighborBo]) - { - //aromatic bond is represented as "" - branches.add(":a"); - } - else - { - bond_str = bondToString1(bondOrder[neighborBo]); - branches.add(bond_str + "*"); - } - continue; - } - - // Registering a new Node and a new branch - AtomNode newNode = new AtomNode(); - newNode.atom = neighborAt; - newNode.parent = atom; - traversedAtoms.add(newNode.atom); - nodes.put(newNode.atom, newNode); - - String bond_str = ""; - if (!bondArom[neighborBo]) //aromatic bond is represented as "" - bond_str = bondToString1(bondOrder[neighborBo]); - - //recursion - branches.add(bond_str + nodeToString(neighborAt)); - } - else - { // Handle ring closure: adding indexes to both atoms - - if (!ringClosures.contains(neighborBo)) { - ringClosures.add(neighborBo); - String ind = ((curIndex > 9) ? "%" : "") + curIndex; - String bond_str = ""; - if (!bondArom[neighborBo]) //aromatic bond is represented as "" - bond_str = bondToString1(bondOrder[neighborBo]); - addIndexToAtom(bond_str + ind, atom); - addIndexToAtom(ind, neighborAt); - curIndex++; - } - } - } - - // Add atom from the current node - sb.append(getAtomSmarts(curFPMolecule, atom)); - - // Add indexes - if (atomIndexes.containsKey(atom)) - sb.append(atomIndexes.get(atom)); - - // Add branches - if (branches.size() == 0) - return (sb.toString()); - - for (int i = 0; i < branches.size() - 1; i++) - sb.append("(").append(branches.get(i)).append(")"); - sb.append(branches.get(branches.size() - 1)); - - return sb.toString(); - } - - private void addIndexToAtom(String ind, final int atom) - { - final String curr = atomIndexes.get(atom); - if (curr != null) - ind = curr + ind; - atomIndexes.put(atom, ind); - } - - - private String bondToString1(int boOrder) - { - switch (boOrder) - { - //single bond ('-') is coded as "" by default - case 2: - return "="; - case 3: - return "#"; - default: - return ""; - } - } - - private String getAtomSmarts(IAtomContainer mol, int atNum) - { - IAtom at = mol.getAtom(atNum); - Integer chrg = at.getFormalCharge(); - String atStr = at.getSymbol(); - - boolean complex = false; - - String chStr = ""; - if (chrg != null) - if (chrg != 0) // +0 ? - { - chStr = getChargeSmartsStr(chrg); - complex = true; - } - - switch (at.getAtomicNumber()) { - case 5: // B - case 6: // C - case 7: // N - case 8: // O - case 15: // P - case 16: // S - case 9: // F - case 17: // Cl - case 35: // Br - case 53: // I - break; - default: - complex = true; - break; - } - - //Handle aromaticity - if (atomArom[atNum]) - atStr = atStr.toLowerCase(); - - if (complex) - atStr = "[" + atStr + chStr + "]"; - - return atStr; - } - - private String getChargeSmartsStr(int chrg) - { - if (chrg == -1) - return "-"; - if (chrg == +1) - return "+"; - - if (chrg > 0) - return "+" + chrg; - else - if (chrg < 0) - return "" + chrg; - else - return ""; // case chrg == 0 - } - - // ------------ molecule analysis: cached cheminformatics ------------ // summarize preliminary information about the molecular structure, to make sure the rest all goes quickly diff --git a/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java index ce88e2c5332..045be2252cf 100644 --- a/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java +++ b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java @@ -5,6 +5,7 @@ import org.openscience.cdk.fingerprint.CircularFingerprinter.FP; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smarts.SubstructureSmarts; import org.openscience.cdk.smiles.SmilesParser; import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; @@ -110,12 +111,14 @@ private void checkFPSmartsForMolecule(String moleculeSmiles, CircularFingerprinter circ = new CircularFingerprinter(); circ.calculate(mol); + SubstructureSmarts subsmarts = new SubstructureSmarts(mol); + subsmarts.setIncludePeripheralBonds(true); int numFP = circ.getFPCount(); Set actual = new HashSet<>(); for (int i = 0; i < numFP; i++) { FP fp = circ.getFP(i); - actual.add(circ.getFPSmarts(fp, mol)); + actual.add(subsmarts.generate(fp.atoms)); } assertThat(actual, everyItem(isIn(expected))); diff --git a/tool/smarts/src/main/java/org/openscience/cdk/smarts/SubstructureSmarts.java b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SubstructureSmarts.java new file mode 100644 index 00000000000..8e3ca82c853 --- /dev/null +++ b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SubstructureSmarts.java @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2016 John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. All we ask is that proper credit is given + * for our work, which includes - but is not limited to - adding the above + * copyright notice to the beginning of your source code files, and to any + * copyright notice that you may distribute with programs based on this work. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ + +package org.openscience.cdk.smarts; + +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +/** + * Utility class to create SMARTS that match part (substructure) of a molecule. + * SMARTS are generated by providing the atom indexes. An example use cases is + * encoding features from a fingerprint. + * + *
{@code
+ * IChemObjectBuilder bldr      = SilentChemObjectBuilder.getInstance();
+ * SmilesParser       smipar    = new SmilesParser(bldr);
+ *
+ * IAtomContainer     mol       = smipar.parseSmiles("[nH]1ccc2c1cccc2");
+ * SubstructureSmarts subsmarts = new SubstructureSmarts(mol);
+ *
+ * // smarts=nccc
+ * String             smarts    = mol.generate(new int[]{0,1,3,4});
+ *
+ * subsmarts.setIncludePeripheralBonds(true);
+ * // smarts=n(ccc(a)a)a
+ * String             smarts    = mol.generate(new int[]{0,1,3,4});
+ * }
+ * + * @author Nikolay Kochev + * @author Nina Jeliazkova + * @author John May + */ +public final class SubstructureSmarts { + + // molecule being selected over + private final IAtomContainer mol; + + // fast-access mol graph data structures + private final int[][] atomAdj, bondAdj; + private final int[] deg; + + // SMARTS atom and bond expressions + private final String[] aexpr; + private final String[] bexpr; + + // SMARTS traversal/generation + private final int[] avisit; + private final int[] rbnds; + private final int[] rnums; + private int numVisit; + + // option whether to include bonds in which only one atom is in the substructure + private boolean peripherals = false; + + /** + * Create a new instance over the provided molecule. + * + * @param mol molecule + */ + public SubstructureSmarts(IAtomContainer mol) { + this.mol = mol; + + final int numAtoms = mol.getAtomCount(); + final int numBonds = mol.getBondCount(); + + // build fast access + this.deg = new int[numAtoms]; + this.atomAdj = new int[numAtoms][4]; + this.bondAdj = new int[numAtoms][4]; + this.aexpr = new String[numAtoms]; + this.bexpr = new String[numBonds]; + this.avisit = new int[numAtoms]; + this.rbnds = new int[numBonds]; + this.rnums = new int[100]; // max 99 in SMILES/SMARTS + + // index adjacency information and bond expressions for quick + // reference and traversal + for (int bondIdx = 0; bondIdx < numBonds; bondIdx++) { + IBond bond = mol.getBond(bondIdx); + IAtom beg = bond.getAtom(0); + IAtom end = bond.getAtom(1); + int begIdx = mol.getAtomNumber(beg); + int endIdx = mol.getAtomNumber(end); + this.bexpr[bondIdx] = encodeBondExpr(bondIdx, begIdx, endIdx); + + // make sufficient space + if (deg[begIdx] == atomAdj[begIdx].length) { + atomAdj[begIdx] = Arrays.copyOf(atomAdj[begIdx], deg[begIdx] + 2); + bondAdj[begIdx] = Arrays.copyOf(bondAdj[begIdx], deg[begIdx] + 2); + } + if (deg[endIdx] == atomAdj[endIdx].length) { + atomAdj[endIdx] = Arrays.copyOf(atomAdj[endIdx], deg[endIdx] + 2); + bondAdj[endIdx] = Arrays.copyOf(bondAdj[endIdx], deg[endIdx] + 2); + } + + atomAdj[begIdx][deg[begIdx]] = endIdx; + bondAdj[begIdx][deg[begIdx]] = bondIdx; + atomAdj[endIdx][deg[endIdx]] = begIdx; + bondAdj[endIdx][deg[endIdx]] = bondIdx; + + deg[begIdx]++; + deg[endIdx]++; + } + + // pre-generate atom expressions + for (int atomIdx = 0; atomIdx < numAtoms; atomIdx++) + this.aexpr[atomIdx] = encodeAtomExpr(atomIdx); + } + + /** + * Option sets whether peripheral bonds should be written. Peripheral + * bonds are bonds where only one atom is in the requested substructure. + * The atoms not in the substructure are written as '*' or 'a'. + * + * @param val value true/false + */ + public void setIncludePeripheralBonds(boolean val) { + this.peripherals = val; + } + + /** + * Generate a SMARTS for the substructure formed of the provided + * atoms. + * + * @param atomIdxs atom indexes + * @return SMARTS, null if an empty array is passed + */ + public String generate(int[] atomIdxs) { + + if (atomIdxs == null) + throw new NullPointerException("No atom indexes provided"); + if (atomIdxs.length == 0) + return null; // makes sense? + + // special case + if (atomIdxs.length == 1 && !peripherals) + return aexpr[atomIdxs[0]]; + + // initialize traversal information + Arrays.fill(rbnds, 0); + Arrays.fill(avisit, 0); + for (int atmIdx : atomIdxs) + avisit[atmIdx] = -1; + + // first visit marks ring information + numVisit = 1; + for (int atomIdx : atomIdxs) { + if (avisit[atomIdx] < 0) + markRings(atomIdx, -1); + } + + // reset visit flags and generate + numVisit = 1; + for (int atmIdx : atomIdxs) + avisit[atmIdx] = -1; + + // second pass builds the expression + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < atomIdxs.length; i++) { + if (avisit[atomIdxs[i]] < 0) { + if (i > 0) sb.append('.'); + encodeExpr(atomIdxs[i], -1, sb); + } + } + + return sb.toString(); + } + + /** + * Recursively marks ring closures (back edges) in the {@link #rbnds} + * array in a depth first order. + * + * @param idx atom index + * @param bprev previous bond + */ + private void markRings(int idx, int bprev) { + avisit[idx] = numVisit++; + final int d = deg[idx]; + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + if (avisit[nbr] == 0 || bidx == bprev) + continue; // ignored + else if (avisit[nbr] < 0) + markRings(nbr, bidx); + else if (avisit[nbr] < avisit[idx]) + rbnds[bidx] = -1; // ring closure + } + } + + /** + * Recursively encodes a SMARTS expression into the provides + * string builder. + * + * @param idx atom index + * @param bprev previous bond + * @param sb destition to write SMARTS to + */ + private void encodeExpr(int idx, int bprev, StringBuilder sb) { + avisit[idx] = numVisit++; + sb.append(aexpr[idx]); + final int d = deg[idx]; + + int remain = d; + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + + // ring open/close + if (rbnds[bidx] < 0) { + // open + final int rnum = chooseRingNumber(); + if (rnum > 9) sb.append('%'); + sb.append(rnum); + rbnds[bidx] = rnum; + } else if (rbnds[bidx] > 0) { + // close + final int rnum = rbnds[bidx]; + releaseRingNumber(rnum); + if (rnum > 9) sb.append('%'); + sb.append(rnum); + } + + if (!peripherals && avisit[nbr] == 0 || + bidx == bprev || + rbnds[bidx] != 0) + remain--; + } + + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + if (!peripherals && avisit[nbr] == 0 || + bidx == bprev || + rbnds[bidx] != 0) + continue; // ignored + remain--; + if (avisit[nbr] == 0) { + // peripheral bond + if (remain > 0) sb.append('('); + sb.append(mol.getBond(bidx).isAromatic() ? ':' : bexpr[bidx]); + sb.append(mol.getAtom(nbr).isAromatic() ? 'a' : '*'); + if (remain > 0) sb.append(')'); + } else { + if (remain > 0) sb.append('('); + sb.append(bexpr[bidx]); + encodeExpr(nbr, bidx, sb); + if (remain > 0) sb.append(')'); + } + } + } + + /** + * Select the lowest ring number for use in SMARTS. + * @return ring number + * @throws IllegalStateException all ring numbers are used + */ + private int chooseRingNumber() { + for (int i = 1; i < rnums.length; i++) { + if (rnums[i] == 0) { + rnums[i] = 1; + return i; + } + } + throw new IllegalStateException("No more ring numbers available!"); + } + + /** + * Releases a ring number allowing it to be reused. + * @param rnum ring number + */ + private void releaseRingNumber(int rnum) { + rnums[rnum] = 0; + } + + /** + * Encodes the atom at index (atmIdx) to a SMARTS + * expression that matches itself. + * + * @param atmIdx atom index + * @return SMARTS atom expression + */ + private String encodeAtomExpr(int atmIdx) { + final IAtom atom = mol.getAtom(atmIdx); + + boolean complex = false; + + StringBuilder sb = new StringBuilder(); + + switch (atom.getAtomicNumber()) { + case 0: // * + sb.append('*'); + break; + case 5: // B + case 6: // C + case 7: // N + case 8: // O + case 15: // P + case 16: // S + case 9: // F + case 17: // Cl + case 35: // Br + case 53: // I + sb.append(atom.isAromatic() ? atom.getSymbol().toLowerCase(Locale.ROOT) + : atom.getSymbol()); + break; + default: + complex = true; + sb.append(atom.isAromatic() ? atom.getSymbol().toLowerCase(Locale.ROOT) + : atom.getSymbol()); + break; + } + + Integer chg = atom.getFormalCharge(); + if (chg != null && chg != 0) { + if (chg >= 0) sb.append('+'); + else sb.append('-'); + if (chg < -1 || chg > +1) + sb.append(Math.abs(chg)); + complex = true; + } + + return complex ? '[' + sb.toString() + ']' : sb.toString(); + } + + /** + * Encodes the bond at index (bondIdx) to a SMARTS + * expression that matches itself. + * + * @param bondIdx bond index + * @param beg atom index of first atom + * @param end atom index of second atom + * @return SMARTS bond expression + */ + private String encodeBondExpr(int bondIdx, int beg, int end) { + IBond bond = mol.getBond(bondIdx); + if (bond.getOrder() == null) + return ""; + + boolean bArom = bond.isAromatic(); + boolean aArom = mol.getAtom(beg).isAromatic() && mol.getAtom(end).isAromatic(); + switch (bond.getOrder()) { + case SINGLE: + if (bArom) { + return aArom ? "" : ":"; + } else { + return aArom ? "-" : ""; + } + case DOUBLE: + return bArom ? "" : "="; + case TRIPLE: + return "#"; + default: + throw new IllegalArgumentException("Unsupported bond type: " + bond.getOrder()); + } + } +} diff --git a/tool/smarts/src/test/java/org/openscience/cdk/smarts/SubstructureSmartsTest.java b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SubstructureSmartsTest.java new file mode 100644 index 00000000000..642a8a9627f --- /dev/null +++ b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SubstructureSmartsTest.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016 John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. All we ask is that proper credit is given + * for our work, which includes - but is not limited to - adding the above + * copyright notice to the beginning of your source code files, and to any + * copyright notice that you may distribute with programs based on this work. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ + +package org.openscience.cdk.smarts; + +import org.junit.Test; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smiles.SmilesParser; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +public class SubstructureSmartsTest { + + private String generate(String smi, boolean peripheral, int[] idxs) throws Exception { + SmilesParser smipar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smipar.parseSmiles(smi); + SubstructureSmarts subsmarts = new SubstructureSmarts(mol); + subsmarts.setIncludePeripheralBonds(peripheral); + return subsmarts.generate(idxs); + } + + private static int[] makeSeq(int beg, int to) { + int[] a = new int[to-beg]; + for (int i = 0; i < a.length; i++) + a[i] = beg++; + return a; + } + + @Test + public void indole() throws Exception { + String smarts = generate("[nH]1ccc2c1cccc2", + false, + makeSeq(0,4)); + assertThat(smarts, is("nccc")); + } + + @Test + public void indoleWithPeripheral() throws Exception { + String smarts = generate("[nH]1ccc2c1cccc2", + true, + makeSeq(0,4)); + assertThat(smarts, is("n(ccc(:a):a):a")); + } + + @Test + public void biphenylIncludesSingleBond() throws Exception { + String smarts = generate("c1ccccc1-c1ccccc1", + false, + makeSeq(0,12)); + assertThat(smarts, containsString("-")); + } + + @Test + public void fullereneC60() throws Exception { + String smarts = generate("c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41", + false, + makeSeq(0,60)); + assertThat(smarts, + is("c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41")); + } + +} \ No newline at end of file