Skip to content

Commit 29aabd3

Browse files
asadegonw
authored andcommitted
added ShortestPathFingerprinter with recommended changes Signed-off-by:Syed Asad Rahman <s9asad@gmail.com>
Signed-off-by: John May <john.wilkinsonmay@gmail.com>
1 parent ca4bd8d commit 29aabd3

4 files changed

Lines changed: 620 additions & 0 deletions

File tree

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/* $Revision$ $Author$ $Date$
2+
*
3+
* Copyright (C) 2012 Syed Asad Rahman <asad@ebi.ac.uk>
4+
*
5+
*
6+
* Contact: cdk-devel@lists.sourceforge.net
7+
*
8+
* This program is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public License
10+
* as published by the Free Software Foundation; either version 2.1
11+
* of the License, or (at your option) any later version.
12+
* All we ask is that proper credit is given for our work, which includes
13+
* - but is not limited to - adding the above copyright notice to the beginning
14+
* of your source code files, and to any copyright notice that you may distribute
15+
* with programs based on this work.
16+
*
17+
* This program is distributed in the hope that it will be useful,
18+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+
* GNU Lesser General Public License for more details.
21+
*
22+
* You should have received a copy of the GNU Lesser General Public License
23+
* along with this program; if not, write to the Free Software
24+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
25+
*/
26+
package org.openscience.cdk.fingerprint;
27+
28+
import java.io.Serializable;
29+
import org.apache.commons.math3.random.MersenneTwister;
30+
import org.apache.commons.math3.random.RandomAdaptor;
31+
import org.apache.commons.math3.random.RandomGenerator;
32+
33+
/**
34+
* @author Syed Asad Rahman (2012)
35+
* @cdk.keyword fingerprint
36+
* @cdk.keyword similarity
37+
* @cdk.module standard
38+
* @cdk.githash
39+
*/
40+
public class RandomNumber implements Serializable {
41+
42+
private static final long serialVersionUID = 23345464573453571L;
43+
44+
/**
45+
* Mersenne Twister Random Number for a hashcode within a range between 0 to maximum
46+
*
47+
* @param maximum
48+
* @param hashCode
49+
* @return
50+
*/
51+
public static int generateMersenneTwisterRandomNumber(int maximum, long hashCode) {
52+
RandomGenerator rg = new RandomAdaptor(new MersenneTwister(hashCode));
53+
return rg.nextInt(maximum);
54+
}
55+
}
Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
/* $Revision$ $Author$ $Date$
2+
*
3+
* Copyright (C) 2012 Syed Asad Rahman <asad@ebi.ac.uk>
4+
*
5+
*
6+
* Contact: cdk-devel@lists.sourceforge.net
7+
*
8+
* This program is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public License
10+
* as published by the Free Software Foundation; either version 2.1
11+
* of the License, or (at your option) any later version.
12+
* All we ask is that proper credit is given for our work, which includes
13+
* - but is not limited to - adding the above copyright notice to the beginning
14+
* of your source code files, and to any copyright notice that you may distribute
15+
* with programs based on this work.
16+
*
17+
* This program is distributed in the hope that it will be useful,
18+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
19+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+
* GNU Lesser General Public License for more details.
21+
*
22+
* You should have received a copy of the GNU Lesser General Public License
23+
* along with this program; if not, write to the Free Software
24+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
25+
*/
26+
package org.openscience.cdk.fingerprint;
27+
28+
import java.io.Serializable;
29+
import java.util.*;
30+
import org.openscience.cdk.annotations.TestClass;
31+
import org.openscience.cdk.annotations.TestMethod;
32+
import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
33+
import org.openscience.cdk.exception.CDKException;
34+
import org.openscience.cdk.graph.ConnectivityChecker;
35+
import org.openscience.cdk.interfaces.*;
36+
import org.openscience.cdk.ringsearch.SSSRFinder;
37+
import org.openscience.cdk.tools.ILoggingTool;
38+
import org.openscience.cdk.tools.LoggingToolFactory;
39+
import org.openscience.cdk.tools.manipulator.RingSetManipulator;
40+
import org.openscience.cdk.tools.periodictable.PeriodicTable;
41+
42+
/**
43+
* Generates a fingerprint for a given {@link IAtomContainer}. Fingerprints are one-dimensional bit arrays, where bits
44+
* are set according to the occurrence of a particular structural feature (See for example the Daylight inc. theory
45+
* manual for more information). Fingerprints allow for a fast screening step to exclude candidates for a substructure
46+
* search in a database. They are also a means for determining the similarity of chemical structures.
47+
48+
* <pre>
49+
*
50+
* A fingerprint is generated for an AtomContainer with this code:
51+
* It is recommended to use atomtyped container before generating the fingerprints.
52+
*
53+
* For example: AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(atomContainer);
54+
*
55+
* AtomContainer molecule = new AtomContainer();
56+
* AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
57+
* IFingerprinter fingerprinter = new ShortestPathFingerprinter();
58+
* IBitFingerprint fingerprint = fingerprinter.getBitFingerprint(molecule);
59+
* fingerprint.fingerprintLength(); // returns 1024 by default
60+
* fingerprint.length(); // returns the highest set bit
61+
* </pre>
62+
*
63+
* <P>The FingerPrinter calculates fingerprint based on the Shortest Paths between two atoms. It also takes into account
64+
* ring system, charges etc while generating a fingerprint. </P>
65+
*
66+
* <p>The FingerPrinter assumes that hydrogens are explicitly given! Furthermore, if pseudo atoms or atoms with
67+
* malformed symbols are present, their atomic number is taken as one more than the last element currently supported in {@link PeriodicTable}.
68+
* </P>
69+
*
70+
*
71+
* @author Syed Asad Rahman (2012)
72+
* @cdk.keyword fingerprint
73+
* @cdk.keyword similarity
74+
* @cdk.module standard
75+
* @cdk.githash
76+
*
77+
*/
78+
@TestClass("org.openscience.cdk.fingerprint.ShortestPathFingerprinterTest")
79+
public class ShortestPathFingerprinter extends RandomNumber implements IFingerprinter, Serializable {
80+
81+
/**
82+
* The default length of created fingerprints.
83+
*/
84+
public final static int DEFAULT_SIZE = 1024;
85+
private static final long serialVersionUID = 7867864332244557861L;
86+
/**
87+
* The default length of created fingerprints.
88+
*/
89+
private int fingerprintLength;
90+
private static ILoggingTool logger =
91+
LoggingToolFactory.createLoggingTool(ShortestPathFingerprinter.class);
92+
93+
/**
94+
* Creates a fingerprint generator of length
95+
* <code>DEFAULT_SIZE</code>
96+
*/
97+
@TestMethod("testFingerprint")
98+
public ShortestPathFingerprinter() {
99+
this(DEFAULT_SIZE);
100+
}
101+
102+
/**
103+
* Constructs a fingerprint generator that creates fingerprints of the given fingerprintLength, using a generation
104+
* algorithm with shortest paths.
105+
*
106+
* @param fingerprintLength The desired fingerprintLength of the fingerprint
107+
*/
108+
public ShortestPathFingerprinter(int fingerprintLength) {
109+
this.fingerprintLength = fingerprintLength;
110+
}
111+
112+
/**
113+
* Generates a shortest path based BitSet fingerprint for the given AtomContainer.
114+
*
115+
* @param ac The AtomContainer for which a fingerprint is generated
116+
* @exception CDKException if there error in aromaticity perception or other CDK functions
117+
* @return A {@link BitSet} representing the fingerprint
118+
*/
119+
@Override
120+
@TestMethod("testgetBitFingerprint_IAtomContainer")
121+
public IBitFingerprint getBitFingerprint(
122+
IAtomContainer ac)
123+
throws CDKException {
124+
125+
IAtomContainer atomContainer = null;
126+
try {
127+
atomContainer = (IAtomContainer) ac.clone();
128+
} catch (CloneNotSupportedException ex) {
129+
logger.error("Failed to clone the molecule:", ex);
130+
}
131+
CDKHueckelAromaticityDetector.detectAromaticity(atomContainer);
132+
BitSet bitSet = new BitSet(fingerprintLength);
133+
if (!ConnectivityChecker.isConnected(atomContainer)) {
134+
IAtomContainerSet partitionedMolecules = ConnectivityChecker.partitionIntoMolecules(atomContainer);
135+
for (IAtomContainer container : partitionedMolecules.atomContainers()) {
136+
addUniquePath(container, bitSet);
137+
}
138+
} else {
139+
addUniquePath(atomContainer, bitSet);
140+
}
141+
return new BitSetFingerprint(bitSet);
142+
}
143+
144+
/**
145+
* {@inheritDoc}
146+
*
147+
* @param ac The AtomContainer for which a fingerprint is generated
148+
* @return Map of raw fingerprint paths/features
149+
* @exception CDKException if there error in aromaticity perception or other CDK functions
150+
*/
151+
@Override
152+
public Map<String, Integer> getRawFingerprint(IAtomContainer ac) throws CDKException {
153+
IAtomContainer atomContainer = null;
154+
try {
155+
atomContainer = (IAtomContainer) ac.clone();
156+
} catch (CloneNotSupportedException ex) {
157+
logger.error("Failed to clone the molecule:", ex);
158+
}
159+
CDKHueckelAromaticityDetector.detectAromaticity(atomContainer);
160+
Map<String, Integer> uniquePaths = new TreeMap<String, Integer>();
161+
if (!ConnectivityChecker.isConnected(atomContainer)) {
162+
IAtomContainerSet partitionedMolecules = ConnectivityChecker.partitionIntoMolecules(atomContainer);
163+
for (IAtomContainer container : partitionedMolecules.atomContainers()) {
164+
addUniquePath(container, uniquePaths);
165+
}
166+
} else {
167+
addUniquePath(atomContainer, uniquePaths);
168+
}
169+
return uniquePaths;
170+
}
171+
172+
private void addUniquePath(IAtomContainer container, BitSet bitSet) {
173+
Integer[] hashes = findPaths(container);
174+
for (Integer hash : hashes) {
175+
int position = getRandomNumber(hash);
176+
bitSet.set(position);
177+
}
178+
}
179+
180+
private void addUniquePath(IAtomContainer atomContainer, Map<String, Integer> uniquePaths) {
181+
Integer[] hashes;
182+
hashes = findPaths(atomContainer);
183+
for (Integer hash : hashes) {
184+
int position = getRandomNumber(hash);
185+
uniquePaths.put(String.valueOf(position), hash);
186+
}
187+
}
188+
189+
/**
190+
* Get all paths of lengths 0 to the specified length.
191+
*
192+
* This method will find all paths upto length N starting from each atom in the molecule and return the unique set
193+
* of such paths.
194+
*
195+
* @param container The molecule to search
196+
* @return A map of path strings, keyed on themselves
197+
*/
198+
private Integer[] findPaths(IAtomContainer container) {
199+
200+
ShortestPathWalker walker = new ShortestPathWalker(container);
201+
// convert paths to hashes
202+
List<Integer> paths = new ArrayList<Integer>();
203+
int patternIndex = 0;
204+
205+
for (String s : walker.getPaths()) {
206+
int toHashCode = s.hashCode();
207+
paths.add(patternIndex, toHashCode);
208+
patternIndex++;
209+
}
210+
211+
/*
212+
* Add ring information
213+
*/
214+
SSSRFinder finder = new SSSRFinder(container);
215+
IRingSet sssr = finder.findEssentialRings();
216+
RingSetManipulator.sort(sssr);
217+
for (Iterator<IAtomContainer> it = sssr.atomContainers().iterator(); it.hasNext();) {
218+
IAtomContainer ring = it.next();
219+
int toHashCode = String.valueOf(ring.getAtomCount()).hashCode();
220+
paths.add(patternIndex, toHashCode);
221+
patternIndex++;
222+
}
223+
/*
224+
* Check for the charges
225+
*/
226+
List<String> l = new ArrayList<String>();
227+
for (Iterator<IAtom> it = container.atoms().iterator(); it.hasNext();) {
228+
IAtom atom = it.next();
229+
int charge = atom.getFormalCharge() == null ? 0 : atom.getFormalCharge();
230+
if (charge != 0) {
231+
l.add(atom.getSymbol().concat(String.valueOf(charge)));
232+
}
233+
}
234+
Collections.sort(l);
235+
int toHashCode = l.hashCode();
236+
paths.add(patternIndex, toHashCode);
237+
patternIndex++;
238+
239+
l = new ArrayList<String>();
240+
/*
241+
* atom stereo parity
242+
*/
243+
for (Iterator<IAtom> it = container.atoms().iterator(); it.hasNext();) {
244+
IAtom atom = it.next();
245+
int st = atom.getStereoParity() == null ? 0 : atom.getStereoParity();
246+
if (st != 0) {
247+
l.add(atom.getSymbol().concat(String.valueOf(st)));
248+
}
249+
}
250+
Collections.sort(l);
251+
toHashCode = l.hashCode();
252+
paths.add(patternIndex, toHashCode);
253+
patternIndex++;
254+
255+
if (container.getSingleElectronCount() > 0) {
256+
StringBuilder radicalInformation = new StringBuilder();
257+
radicalInformation.append("RAD: ").append(String.valueOf(container.getSingleElectronCount()));
258+
paths.add(patternIndex, radicalInformation.toString().hashCode());
259+
patternIndex++;
260+
}
261+
if (container.getLonePairCount() > 0) {
262+
StringBuilder lpInformation = new StringBuilder();
263+
lpInformation.append("LP: ").append(String.valueOf(container.getLonePairCount()));
264+
paths.add(patternIndex, lpInformation.toString().hashCode());
265+
patternIndex++;
266+
}
267+
return paths.toArray(new Integer[paths.size()]);
268+
}
269+
270+
@Override
271+
@TestMethod("testSize")
272+
public int getSize() {
273+
return fingerprintLength;
274+
}
275+
276+
@Override
277+
public ICountFingerprint getCountFingerprint(IAtomContainer iac) throws CDKException {
278+
throw new UnsupportedOperationException("Not supported yet.");
279+
}
280+
281+
/*
282+
* Returns a random number for a given object
283+
*/
284+
private int getRandomNumber(Integer hashValue) {
285+
return generateMersenneTwisterRandomNumber(fingerprintLength, hashValue);
286+
}
287+
}

0 commit comments

Comments
 (0)