/
Fingerprinter.java
428 lines (382 loc) · 15.9 KB
/
Fingerprinter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
/* Copyright (C) 2002-2007 Christoph Steinbeck <steinbeck@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.fingerprint;
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.aromaticity.Aromaticity;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.graph.PathTools;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.ringsearch.AllRingsFinder;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* Generates a fingerprint for a given AtomContainer. Fingerprints are
* one-dimensional bit arrays, where bits are set according to a the
* occurrence of a particular structural feature (See for example the
* Daylight inc. theory manual for more information). Fingerprints allow for
* a fast screening step to exclude candidates for a substructure search in a
* database. They are also a means for determining the similarity of chemical
* structures. <p>
*
* A fingerprint is generated for an AtomContainer with this code: <pre>
* Molecule molecule = new Molecule();
* IFingerprinter fingerprinter = new Fingerprinter();
* IBitFingerprint fingerprint = fingerprinter.getBitFingerprint(molecule);
* fingerprint.size(); // returns 1024 by default
* fingerprint.length(); // returns the highest set bit
* </pre> <p>
*
* The FingerPrinter assumes that hydrogens are explicitly given! Furthermore,
* if pseudo atoms or atoms with malformed symbols are present, their atomic
* number is taken as one more than the last element currently supported in
* {@link org.openscience.cdk.tools.periodictable.PeriodicTable}.
*
* <font color="#FF0000">Warning: The aromaticity detection for this
* FingerPrinter relies on AllRingsFinder, which is known to take very long
* for some molecules with many cycles or special cyclic topologies. Thus,
* the AllRingsFinder has a built-in timeout of 5 seconds after which it
* aborts and throws an Exception. If you want your SMILES generated at any
* expense, you need to create your own AllRingsFinder, set the timeout to a
* higher value, and assign it to this FingerPrinter. In the vast majority of
* cases, however, the defaults will be fine. </font> <p>
*
* <font color="#FF0000">Another Warning : The daylight manual says:
* "Fingerprints are not so definite: if a fingerprint indicates a pattern is
* missing then it certainly is, but it can only indicate a pattern's presence
* with some probability." In the case of very small molecules, the
* probability that you get the same fingerprint for different molecules is
* high. </font>
* </p>
*
* @author steinbeck
* @cdk.created 2002-02-24
* @cdk.keyword fingerprint
* @cdk.keyword similarity
* @cdk.module standard
* @cdk.githash
*/
public class Fingerprinter extends AbstractFingerprinter implements IFingerprinter {
/** Throw an exception if too many paths (per atom) are generated. */
private final static int DEFAULT_PATH_LIMIT = 1500;
/** The default length of created fingerprints. */
public final static int DEFAULT_SIZE = 1024;
/** The default search depth used to create the fingerprints. */
public final static int DEFAULT_SEARCH_DEPTH = 8;
private int size;
private int searchDepth;
private int pathLimit = DEFAULT_PATH_LIMIT;
static int debugCounter = 0;
private static ILoggingTool logger = LoggingToolFactory
.createLoggingTool(Fingerprinter.class);
/**
* Creates a fingerprint generator of length <code>DEFAULT_SIZE</code>
* and with a search depth of <code>DEFAULT_SEARCH_DEPTH</code>.
*/
public Fingerprinter() {
this(DEFAULT_SIZE, DEFAULT_SEARCH_DEPTH);
}
public Fingerprinter(int size) {
this(size, DEFAULT_SEARCH_DEPTH);
}
/**
* Constructs a fingerprint generator that creates fingerprints of
* the given size, using a generation algorithm with the given search
* depth.
*
* @param size The desired size of the fingerprint
* @param searchDepth The desired depth of search
*/
public Fingerprinter(int size, int searchDepth) {
this.size = size;
this.searchDepth = searchDepth;
}
/**
* Generates a fingerprint of the default size for the given AtomContainer.
*
* @param container The AtomContainer for which a Fingerprint is generated
* @param ringFinder An instance of
* {@link org.openscience.cdk.ringsearch.AllRingsFinder}
* @exception CDKException if there is a timeout in ring or aromaticity
* perception
* @return A {@link BitSet} representing the fingerprint
*/
public IBitFingerprint getBitFingerprint(IAtomContainer container, AllRingsFinder ringFinder) throws CDKException {
int position = -1;
logger.debug("Entering Fingerprinter");
logger.debug("Starting Aromaticity Detection");
long before = System.currentTimeMillis();
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(container);
Aromaticity.cdkLegacy().apply(container);
long after = System.currentTimeMillis();
logger.debug("time for aromaticity calculation: " + (after - before) + " milliseconds");
logger.debug("Finished Aromaticity Detection");
BitSet bitSet = new BitSet(size);
encodePaths(container, searchDepth, bitSet, size);
return new BitSetFingerprint(bitSet);
}
/**
* Generates a fingerprint of the default size for the given AtomContainer.
*
*@param container The AtomContainer for which a Fingerprint is generated
*/
@Override
public IBitFingerprint getBitFingerprint(IAtomContainer container) throws CDKException {
return getBitFingerprint(container, null);
}
/** {@inheritDoc} */
@Override
public Map<String, Integer> getRawFingerprint(IAtomContainer iAtomContainer) throws CDKException {
throw new UnsupportedOperationException();
}
private IBond findBond(List<IBond> bonds, IAtom beg, IAtom end) {
for (IBond bond : bonds)
if (bond.contains(beg) && bond.contains(end))
return bond;
return null;
}
private String encodePath(IAtomContainer mol, Map<IAtom, List<IBond>> cache, List<IAtom> path, StringBuilder buffer) {
buffer.setLength(0);
IAtom prev = path.get(path.size()-1);
buffer.append(getAtomSymbol(prev));
for (int i = path.size()-2; i >= 0; i--) {
final IAtom next = path.get(i);
List<IBond> bonds = cache.get(prev);
if (bonds == null) {
bonds = mol.getConnectedBondsList(prev);
cache.put(prev, bonds);
}
IBond bond = findBond(bonds, next, prev);
if (bond == null)
throw new IllegalStateException("FATAL - Atoms in patch were connected?");
buffer.append(getBondSymbol(bond));
buffer.append(getAtomSymbol(next));
prev = next;
}
return buffer.toString();
}
private static final class State {
private int numPaths = 0;
private Random rand = new Random();
private BitSet fp;
private IAtomContainer mol;
private Set<IAtom> visited = new HashSet<>();
private List<IAtom> apath = new ArrayList<>();
private List<IBond> bpath = new ArrayList<>();
private final int maxDepth;
private final int fpsize;
private Map<IAtom,List<IBond>> cache = new IdentityHashMap<>();
public StringBuilder buffer = new StringBuilder();
public State(IAtomContainer mol, BitSet fp, int fpsize, int maxDepth) {
this.mol = mol;
this.fp = fp;
this.fpsize = fpsize;
this.maxDepth = maxDepth;
}
List<IBond> getBonds(IAtom atom) {
List<IBond> bonds = cache.get(atom);
if (bonds == null) {
bonds = mol.getConnectedBondsList(atom);
cache.put(atom, bonds);
}
return bonds;
}
boolean visit(IAtom a) {
return visited.add(a);
}
boolean unvisit(IAtom a) {
return visited.remove(a);
}
void push(IAtom atom, IBond bond) {
apath.add(atom);
if (bond != null)
bpath.add(bond);
}
void pop() {
if (!apath.isEmpty())
apath.remove(apath.size()-1);
if (!bpath.isEmpty())
bpath.remove(bpath.size()-1);
}
void addHash(int x) {
numPaths++;
rand.setSeed(x);
// XXX: fp.set(x % size); would work just as well but would encode a
// different bit
fp.set(rand.nextInt(fpsize));
}
}
private void traversePaths(State state, IAtom beg, IBond prev) throws CDKException {
state.push(beg, prev);
state.addHash(encodeUniquePath(state.mol, state.cache, state.apath, state.buffer));
if (state.numPaths > pathLimit)
throw new CDKException("To many paths!");
if (state.apath.size() < state.maxDepth) {
for (IBond bond : state.getBonds(beg)) {
if (bond == prev)
continue;
final IAtom nbr = bond.getConnectedAtom(beg);
if (state.visit(nbr)) {
traversePaths(state, nbr, prev);
state.unvisit(nbr); // traverse all paths
}
}
}
state.pop();
}
/**
* Get all paths of lengths 0 to the specified length.
*
* This method will find all paths up to length N starting from each
* atom in the molecule and return the unique set of such paths.
*
* @param container The molecule to search
* @param searchDepth The maximum path length desired
* @return A Map of path strings, keyed on themselves
* @deprecated Use {@link #encodePaths(IAtomContainer, int, BitSet, int)}
*/
@Deprecated
protected int[] findPathes(IAtomContainer container, int searchDepth) throws CDKException {
Set<Integer> hashes = new HashSet<>();
Map<IAtom, List<IBond>> cache = new HashMap<>();
StringBuilder buffer = new StringBuilder();
for (IAtom startAtom : container.atoms()) {
List<List<IAtom>> p = PathTools.getLimitedPathsOfLengthUpto(container, startAtom, searchDepth, pathLimit);
for (List<IAtom> path : p) {
hashes.add(encodeUniquePath(container, cache, path, buffer));
}
}
int pos = 0;
int[] result = new int[hashes.size()];
for (Integer hash : hashes)
result[pos++] = hash;
return result;
}
protected void encodePaths(IAtomContainer mol, int depth, BitSet fp, int size) throws CDKException {
State state = new State(mol, fp, size, depth+1);
for (IAtom atom : mol.atoms()) {
state.numPaths = 0;
state.visit(atom);
traversePaths(state, atom, null);
state.unvisit(atom);
}
}
private int encodeUniquePath(IAtomContainer container, Map<IAtom, List<IBond>> cache, List<IAtom> path, StringBuilder buffer) {
int hash = 0;
if (path.size() == 1)
return getAtomSymbol(path.get(0)).hashCode();
String forward = encodePath(container, cache, path, buffer);
Collections.reverse(path);
String reverse = encodePath(container, cache, path, buffer);
Collections.reverse(path);
final int x;
if (reverse.compareTo(forward) < 0)
x = forward.hashCode();
else
x = reverse.hashCode();
return x;
}
private String getAtomSymbol(IAtom atom) {
if (atom.getAtomicNumber() == null ||
atom.getAtomicNumber() == 0 ||
atom instanceof IPseudoAtom){
return "*";
} else {
// XXX: backwards compatibility
// This is completely random, I believe the intention is because
// paths were reversed with string manipulation to de-duplicate
// (only the lowest lexicographically is stored) however this
// doesn't work with multiple atom symbols:
// e.g. Fe-C => C-eF vs C-Fe => eF-C
// A dirty hack is to replace "common" symbols with single letter
// equivalents so the reversing is less wrong
switch (atom.getAtomicNumber()) {
case 17: // Cl
return "X";
case 35: // Br
return "Z";
case 14: // Si
return "Y";
case 33: // As
return "D";
case 3: // Li
return "L";
case 34: // Se
return "E";
case 11: // Na
return "G";
case 20: // Ca
return "J";
case 13: // Al
return "A";
}
return atom.getSymbol();
}
}
/**
* Gets the bondSymbol attribute of the Fingerprinter class
*
*@param bond Description of the Parameter
*@return The bondSymbol value
*/
protected String getBondSymbol(IBond bond) {
String bondSymbol = "";
if (bond.getFlag(CDKConstants.ISAROMATIC)) {
bondSymbol = ":";
} else if (bond.getOrder() == IBond.Order.SINGLE) {
bondSymbol = "-";
} else if (bond.getOrder() == IBond.Order.DOUBLE) {
bondSymbol = "=";
} else if (bond.getOrder() == IBond.Order.TRIPLE) {
bondSymbol = "#";
}
return bondSymbol;
}
public void setPathLimit(int limit) {
this.pathLimit = limit;
}
public int getSearchDepth() {
return searchDepth;
}
@Override
public int getSize() {
return size;
}
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
throw new UnsupportedOperationException();
}
}