|
| 1 | +/* $Revision$ $Author$ $Date$ |
| 2 | + * |
| 3 | + * Copyright (C) 2012 Syed Asad Rahman <asad@ebi.ac.uk> |
| 4 | + * |
| 5 | + * |
| 6 | + * Contact: cdk-devel@lists.sourceforge.net |
| 7 | + * |
| 8 | + * This program is free software; you can redistribute it and/or |
| 9 | + * modify it under the terms of the GNU Lesser General Public License |
| 10 | + * as published by the Free Software Foundation; either version 2.1 |
| 11 | + * of the License, or (at your option) any later version. |
| 12 | + * All we ask is that proper credit is given for our work, which includes |
| 13 | + * - but is not limited to - adding the above copyright notice to the beginning |
| 14 | + * of your source code files, and to any copyright notice that you may distribute |
| 15 | + * with programs based on this work. |
| 16 | + * |
| 17 | + * This program is distributed in the hope that it will be useful, |
| 18 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | + * GNU Lesser General Public License for more details. |
| 21 | + * |
| 22 | + * You should have received a copy of the GNU Lesser General Public License |
| 23 | + * along with this program; if not, write to the Free Software |
| 24 | + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 25 | + */ |
| 26 | +package org.openscience.cdk.fingerprint; |
| 27 | + |
| 28 | +import java.io.Serializable; |
| 29 | +import java.util.*; |
| 30 | +import org.openscience.cdk.annotations.TestClass; |
| 31 | +import org.openscience.cdk.annotations.TestMethod; |
| 32 | +import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector; |
| 33 | +import org.openscience.cdk.exception.CDKException; |
| 34 | +import org.openscience.cdk.graph.ConnectivityChecker; |
| 35 | +import org.openscience.cdk.interfaces.*; |
| 36 | +import org.openscience.cdk.ringsearch.SSSRFinder; |
| 37 | +import org.openscience.cdk.tools.ILoggingTool; |
| 38 | +import org.openscience.cdk.tools.LoggingToolFactory; |
| 39 | +import org.openscience.cdk.tools.manipulator.RingSetManipulator; |
| 40 | +import org.openscience.cdk.tools.periodictable.PeriodicTable; |
| 41 | + |
| 42 | +/** |
| 43 | + * Generates a fingerprint for a given {@link IAtomContainer}. Fingerprints are one-dimensional bit arrays, where bits |
| 44 | + * are set according to a the occurrence of a particular structural feature (See for example the Daylight inc. theory |
| 45 | + * manual for more information). Fingerprints allow for a fast screening step to exclude candidates for a substructure |
| 46 | + * search in a database. They are also a means for determining the similarity of chemical structures. |
| 47 | +
|
| 48 | + * <pre> |
| 49 | + * |
| 50 | + * A fingerprint is generated for an AtomContainer with this code: |
| 51 | + * It is recommended to use atomtyped container before generating the fingerprints. |
| 52 | + * |
| 53 | + * For example: AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(atomContainer); |
| 54 | + * |
| 55 | + * AtomContainer molecule = new AtomContainer(); |
| 56 | + * AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(atomContainer); |
| 57 | + * IFingerprinter fingerprinter = new ShortestPathFingerprinter(); |
| 58 | + * IBitFingerprint fingerprint = fingerprinter.getFingerprint(molecule); |
| 59 | + * fingerprint.fingerprintLength(); // returns 1024 by default |
| 60 | + * fingerprint.length(); // returns the highest set bit |
| 61 | + * </pre> |
| 62 | + * |
| 63 | + * <P>The FingerPrinter calculates fingerprint based on the Shortest Paths between two atoms. It also takes into account |
| 64 | + * ring system, charges etc while generating a fingerprint. </P> |
| 65 | + * |
| 66 | + * <p>The FingerPrinter assumes that hydrogens are explicitly given! Furthermore, if pseudo atoms or atoms with |
| 67 | + * malformed symbols are present, their atomic number is taken as one more than the last element currently supported in {@link PeriodicTable}. |
| 68 | + * </P> |
| 69 | + * |
| 70 | + * |
| 71 | + * @author Syed Asad Rahman (2012) |
| 72 | + * @cdk.keyword fingerprint |
| 73 | + * @cdk.keyword similarity |
| 74 | + * @cdk.module standard |
| 75 | + * @cdk.githash |
| 76 | + * |
| 77 | + */ |
| 78 | +@TestClass("org.openscience.cdk.fingerprint.ShortestPathFingerprinterTest") |
| 79 | +public class ShortestPathFingerprinter extends RandomNumber implements IFingerprinter, Serializable { |
| 80 | + |
| 81 | + /** |
| 82 | + * The default length of created fingerprints. |
| 83 | + */ |
| 84 | + public final static int DEFAULT_SIZE = 1024; |
| 85 | + private static final long serialVersionUID = 7867864332244557861L; |
| 86 | + /** |
| 87 | + * The default length of created fingerprints. |
| 88 | + */ |
| 89 | + private int fingerprintLength; |
| 90 | + private static ILoggingTool logger = |
| 91 | + LoggingToolFactory.createLoggingTool(ShortestPathFingerprinter.class); |
| 92 | + |
| 93 | + /** |
| 94 | + * Creates a fingerprint generator of length |
| 95 | + * <code>DEFAULT_SIZE</code> |
| 96 | + */ |
| 97 | + @TestMethod("testFingerprint") |
| 98 | + public ShortestPathFingerprinter() { |
| 99 | + this(DEFAULT_SIZE); |
| 100 | + } |
| 101 | + |
| 102 | + /** |
| 103 | + * Constructs a fingerprint generator that creates fingerprints of the given fingerprintLength, using a generation |
| 104 | + * algorithm with shortest paths. |
| 105 | + * |
| 106 | + * @param fingerprintLength The desired fingerprintLength of the fingerprint |
| 107 | + */ |
| 108 | + public ShortestPathFingerprinter(int fingerprintLength) { |
| 109 | + this.fingerprintLength = fingerprintLength; |
| 110 | + } |
| 111 | + |
| 112 | + /** |
| 113 | + * Generates a shortest path based BitSet fingerprint for the given AtomContainer. |
| 114 | + * |
| 115 | + * @param ac The AtomContainer for which a fingerprint is generated |
| 116 | + * @exception CDKException if there error in aromaticity perception or other CDK functions |
| 117 | + * @return A {@link BitSet} representing the fingerprint |
| 118 | + */ |
| 119 | + @Override |
| 120 | + @TestMethod("testgetBitFingerprint_IAtomContainer") |
| 121 | + public IBitFingerprint getBitFingerprint( |
| 122 | + IAtomContainer ac) |
| 123 | + throws CDKException { |
| 124 | + |
| 125 | + IAtomContainer atomContainer = null; |
| 126 | + try { |
| 127 | + atomContainer = (IAtomContainer) ac.clone(); |
| 128 | + } catch (CloneNotSupportedException ex) { |
| 129 | + logger.error("Failed to clone the molecule:", ex); |
| 130 | + } |
| 131 | + CDKHueckelAromaticityDetector.detectAromaticity(atomContainer); |
| 132 | + BitSet bitSet = new BitSet(fingerprintLength); |
| 133 | + if (!ConnectivityChecker.isConnected(atomContainer)) { |
| 134 | + IAtomContainerSet partitionedMolecules = ConnectivityChecker.partitionIntoMolecules(atomContainer); |
| 135 | + for (IAtomContainer container : partitionedMolecules.atomContainers()) { |
| 136 | + addUniquePath(container, bitSet); |
| 137 | + } |
| 138 | + } else { |
| 139 | + addUniquePath(atomContainer, bitSet); |
| 140 | + } |
| 141 | + return new BitSetFingerprint(bitSet); |
| 142 | + } |
| 143 | + |
| 144 | + /** |
| 145 | + * {@inheritDoc} |
| 146 | + * |
| 147 | + * @param ac The AtomContainer for which a fingerprint is generated |
| 148 | + * @return Map of raw fingerprint paths/features |
| 149 | + * @exception CDKException if there error in aromaticity perception or other CDK functions |
| 150 | + */ |
| 151 | + @Override |
| 152 | + public Map<String, Integer> getRawFingerprint(IAtomContainer ac) throws CDKException { |
| 153 | + IAtomContainer atomContainer = null; |
| 154 | + try { |
| 155 | + atomContainer = (IAtomContainer) ac.clone(); |
| 156 | + } catch (CloneNotSupportedException ex) { |
| 157 | + logger.error("Failed to clone the molecule:", ex); |
| 158 | + } |
| 159 | + CDKHueckelAromaticityDetector.detectAromaticity(atomContainer); |
| 160 | + Map<String, Integer> uniquePaths = new TreeMap<String, Integer>(); |
| 161 | + if (!ConnectivityChecker.isConnected(atomContainer)) { |
| 162 | + IAtomContainerSet partitionedMolecules = ConnectivityChecker.partitionIntoMolecules(atomContainer); |
| 163 | + for (IAtomContainer container : partitionedMolecules.atomContainers()) { |
| 164 | + addUniquePath(container, uniquePaths); |
| 165 | + } |
| 166 | + } else { |
| 167 | + addUniquePath(atomContainer, uniquePaths); |
| 168 | + } |
| 169 | + return uniquePaths; |
| 170 | + } |
| 171 | + |
| 172 | + private void addUniquePath(IAtomContainer container, BitSet bitSet) { |
| 173 | + Integer[] hashes = findPaths(container); |
| 174 | + for (Integer hash : hashes) { |
| 175 | + int position = getRandomNumber(hash); |
| 176 | + bitSet.set(position); |
| 177 | + } |
| 178 | + } |
| 179 | + |
| 180 | + private void addUniquePath(IAtomContainer atomContainer, Map<String, Integer> uniquePaths) { |
| 181 | + Integer[] hashes; |
| 182 | + hashes = findPaths(atomContainer); |
| 183 | + for (Integer hash : hashes) { |
| 184 | + int position = getRandomNumber(hash); |
| 185 | + uniquePaths.put(String.valueOf(position), hash); |
| 186 | + } |
| 187 | + } |
| 188 | + |
| 189 | + /** |
| 190 | + * Get all paths of lengths 0 to the specified length. |
| 191 | + * |
| 192 | + * This method will find all paths upto length N starting from each atom in the molecule and return the unique set |
| 193 | + * of such paths. |
| 194 | + * |
| 195 | + * @param container The molecule to search |
| 196 | + * @return A map of path strings, keyed on themselves |
| 197 | + */ |
| 198 | + private Integer[] findPaths(IAtomContainer container) { |
| 199 | + |
| 200 | + ShortestPathWalker walker = new ShortestPathWalker(container); |
| 201 | + // convert paths to hashes |
| 202 | + List<Integer> paths = new ArrayList<Integer>(); |
| 203 | + int patternIndex = 0; |
| 204 | + |
| 205 | + for (String s : walker.getPaths()) { |
| 206 | + int toHashCode = s.hashCode(); |
| 207 | + paths.add(patternIndex, toHashCode); |
| 208 | + patternIndex++; |
| 209 | + } |
| 210 | + |
| 211 | + /* |
| 212 | + * Add ring information |
| 213 | + */ |
| 214 | + SSSRFinder finder = new SSSRFinder(container); |
| 215 | + IRingSet sssr = finder.findEssentialRings(); |
| 216 | + RingSetManipulator.sort(sssr); |
| 217 | + for (Iterator<IAtomContainer> it = sssr.atomContainers().iterator(); it.hasNext();) { |
| 218 | + IAtomContainer ring = it.next(); |
| 219 | + int toHashCode = String.valueOf(ring.getAtomCount()).hashCode(); |
| 220 | + paths.add(patternIndex, toHashCode); |
| 221 | + patternIndex++; |
| 222 | + } |
| 223 | + /* |
| 224 | + * Check for the charges |
| 225 | + */ |
| 226 | + List<String> l = new ArrayList<String>(); |
| 227 | + for (Iterator<IAtom> it = container.atoms().iterator(); it.hasNext();) { |
| 228 | + IAtom atom = it.next(); |
| 229 | + int charge = atom.getFormalCharge() == null ? 0 : atom.getFormalCharge(); |
| 230 | + if (charge != 0) { |
| 231 | + l.add(atom.getSymbol().concat(String.valueOf(charge))); |
| 232 | + } |
| 233 | + } |
| 234 | + Collections.sort(l); |
| 235 | + int toHashCode = l.hashCode(); |
| 236 | + paths.add(patternIndex, toHashCode); |
| 237 | + patternIndex++; |
| 238 | + |
| 239 | + l = new ArrayList<String>(); |
| 240 | + /* |
| 241 | + * atom stereo parity |
| 242 | + */ |
| 243 | + for (Iterator<IAtom> it = container.atoms().iterator(); it.hasNext();) { |
| 244 | + IAtom atom = it.next(); |
| 245 | + int st = atom.getStereoParity() == null ? 0 : atom.getStereoParity(); |
| 246 | + if (st != 0) { |
| 247 | + l.add(atom.getSymbol().concat(String.valueOf(st))); |
| 248 | + } |
| 249 | + } |
| 250 | + Collections.sort(l); |
| 251 | + toHashCode = l.hashCode(); |
| 252 | + paths.add(patternIndex, toHashCode); |
| 253 | + patternIndex++; |
| 254 | + |
| 255 | + if (container.getSingleElectronCount() > 0) { |
| 256 | + StringBuilder radicalInformation = new StringBuilder(); |
| 257 | + radicalInformation.append("RAD: ").append(String.valueOf(container.getSingleElectronCount())); |
| 258 | + paths.add(patternIndex, radicalInformation.toString().hashCode()); |
| 259 | + patternIndex++; |
| 260 | + } |
| 261 | + if (container.getLonePairCount() > 0) { |
| 262 | + StringBuilder lpInformation = new StringBuilder(); |
| 263 | + lpInformation.append("LP: ").append(String.valueOf(container.getLonePairCount())); |
| 264 | + paths.add(patternIndex, lpInformation.toString().hashCode()); |
| 265 | + patternIndex++; |
| 266 | + } |
| 267 | + return paths.toArray(new Integer[paths.size()]); |
| 268 | + } |
| 269 | + |
| 270 | + @Override |
| 271 | + @TestMethod("testSize") |
| 272 | + public int getSize() { |
| 273 | + return fingerprintLength; |
| 274 | + } |
| 275 | + |
| 276 | + @Override |
| 277 | + public ICountFingerprint getCountFingerprint(IAtomContainer iac) throws CDKException { |
| 278 | + throw new UnsupportedOperationException("Not supported yet."); |
| 279 | + } |
| 280 | + |
| 281 | + /* |
| 282 | + * Returns a random number for a given object |
| 283 | + */ |
| 284 | + private int getRandomNumber(Integer hashValue) { |
| 285 | + return generateMersenneTwisterRandomNumber(fingerprintLength, hashValue); |
| 286 | + } |
| 287 | +} |
0 commit comments