diff --git a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java
index bc2855af412..d825dbf5a2a 100644
--- a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java
+++ b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java
@@ -30,6 +30,8 @@
import java.util.ArrayList;
import java.util.BitSet;
+import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.zip.CRC32;
@@ -37,11 +39,14 @@
import javax.vecmath.Point2d;
import javax.vecmath.Point3d;
+import com.google.common.primitives.Ints;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IBond;
+
+
/**
*
Circular fingerprints: for generating fingerprints that are functionally equivalent to ECFP-2/4/6 and FCFP-2/4/6
* fingerprints, which are partially described by Rogers et al. {@cdk.cite Rogers2010}.
@@ -112,6 +117,7 @@ public FP(int hashCode, int iteration, int[] atoms) {
this.atoms = atoms;
}
}
+
// ------------ private members ------------
@@ -1238,7 +1244,7 @@ private int findBond(int a1, int a2) {
if (atomAdj[a1][n] == a2) return bondAdj[a1][n];
return -1;
}
-
+
/*
* for debugging convenience: revive if necessary private void wr(String
* str) {System.out.println(str);} private String arrayStr(int[] val) { if
diff --git a/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java
new file mode 100644
index 00000000000..298a600d284
--- /dev/null
+++ b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016 John May
+ *
+ * Contact: cdk-devel@lists.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version. All we ask is that proper credit is given
+ * for our work, which includes - but is not limited to - adding the above
+ * copyright notice to the beginning of your source code files, and to any
+ * copyright notice that you may distribute with programs based on this work.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
+ */
+package org.openscience.cdk.fingerprint;
+
+import org.junit.Test;
+import org.openscience.cdk.CDKTestCase;
+import org.openscience.cdk.fingerprint.CircularFingerprinter.FP;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.silent.SilentChemObjectBuilder;
+import org.openscience.cdk.smarts.SmartsFragmentExtractor;
+import org.openscience.cdk.smiles.SmilesParser;
+import org.openscience.cdk.tools.ILoggingTool;
+import org.openscience.cdk.tools.LoggingToolFactory;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.hamcrest.CoreMatchers.everyItem;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIn.isIn;
+
+/**
+ * @cdk.module test-standard
+ */
+public class CircularFingerprintSmartsTest extends CDKTestCase {
+
+ private static ILoggingTool logger = LoggingToolFactory
+ .createLoggingTool(CircularFingerprintSmartsTest.class);
+
+ public static SmilesParser parser = new SmilesParser(
+ SilentChemObjectBuilder.getInstance());
+
+    @Test
+    public void testMol1() throws Exception {
+        final String smiles = "CC"; // ethane
+        final String[][] expected = {{"C*"}, {"CC"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol2() throws Exception {
+        final String smiles = "CCC"; // propane
+        final String[][] expected = {{"C*"}, {"C(*)*"},
+                                     {"CC*", "C(*)C"}, {"CCC"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol3() throws Exception {
+        final String smiles = "CCN"; // ethylamine
+        final String[][] expected = {{"C*"}, {"C(*)*"}, {"N*"},
+                                     {"CC*", "C(*)C"}, {"C(*)N", "NC*"},
+                                     {"CCN", "NCC", "C(C)N", "C(N)C"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol4() throws Exception {
+        final String smiles = "C1CC1"; // cyclopropane
+        final String[][] expected = {
+            {"C(*)*"},
+            {"C1CC1", "C(C1)C1"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol5() throws Exception {
+        final String smiles = "C1CCC1"; // cyclobutane
+        final String[][] expected = {
+            {"C(*)*"},
+            {"C(C*)C*", "C(CC*)*", "C(*)CC*"},
+            {"C1CCC1", "C(CC1)C1", "C(C1)CC1"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol6() throws Exception {
+        final String smiles = "CC[C-]"; // includes a charged atom
+        final String[][] expected = {
+            {"C*"}, {"C(*)*"}, {"[C-]*"},
+            {"CC*", "C(*)C"},
+            {"[C-]C*", "C(*)[C-]"},
+            {"CC[C-]", "C(C)[C-]", "[C-]CC", "C([C-])C"}};
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    @Test
+    public void testMol7() throws Exception {
+        final String smiles = "c1ccccc1"; // benzene
+        final String[][] expected = {
+            {"c(a)a"},
+            {"c(a)cca", "c(ca)ca", "c(cca)a"},
+            {"c(a)cccca", "c(ca)ccca", "c(cca)cca",
+             "c(ccca)ca", "c(cccca)a"},
+            {"c1ccccc1", "c(c1)cccc1", "c(cc1)ccc1",
+             "c(ccc1)cc1", "c(cccc1)c1"}
+        };
+        checkFPSmartsForMolecule(smiles, expected);
+    }
+
+    /**
+     * Asserts that every SMARTS generated for a circular fingerprint feature
+     * of the molecule is one of the expected patterns. The expected array is
+     * two-dimensional because several equivalent variants are listed for each
+     * feature, e.g. CCC and C(C)C.
+     */
+    private void checkFPSmartsForMolecule(String moleculeSmiles,
+            String[][] expectedFPSmarts) throws Exception {
+        Set<String> expected = new HashSet<>();
+        for (String[] strs : expectedFPSmarts)
+            Collections.addAll(expected, strs);
+
+        IAtomContainer mol = parser.parseSmiles(moleculeSmiles);
+        CircularFingerprinter circ = new CircularFingerprinter();
+        circ.calculate(mol);
+        SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
+        subsmarts.setMode(SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER);
+        int numFP = circ.getFPCount();
+
+        Set<String> actual = new HashSet<>();
+        for (int i = 0; i < numFP; i++) {
+            FP fp = circ.getFP(i);
+            actual.add(subsmarts.generate(fp.atoms));
+        }
+        assertThat(actual, everyItem(isIn(expected)));
+    }
+}
diff --git a/doc/refs/cheminf.bibx b/doc/refs/cheminf.bibx
index 3dc5681aae7..255e1de6be8 100644
--- a/doc/refs/cheminf.bibx
+++ b/doc/refs/cheminf.bibx
@@ -16,7 +16,7 @@
80-90
-
+
Colin Batchelor and Ken Karapetyan and Valery Tkachenko and Anthony Williams
@@ -26,7 +26,7 @@
http://www.slideshare.net/RSC-Chemistry/20130724-cisrg-sugarsbatchelor
-
+
Bernstein, H.J.
@@ -34,18 +34,6 @@
http://www.openrasmol.org/doc/rasmol.html#cpkcolours
-
-
-
- Jonathan Brecher
- Graphical representation standards for chemical structure diagrams (IUPAC Recommendations 2008)
- Pure Appl. Chem
- 2008
- 80
- 2
- 277–410
-
-
@@ -57,7 +45,29 @@
51-62
-
+
+
+
+ Willighagen, E. and Hutchinson, G. and Niehaus, C. and Buchwald, J. and Pfeiffer, M. and Leidert, D. and Brefort, J.
+ Blue Obelisk Data Repository (version 10)
+ Figshare
+ 2014
+ 10.6084/m9.figshare.1025775.v1
+
+
+
+
+
+ Jonathan Brecher
+ Graphical representation standards for chemical structure diagrams (IUPAC Recommendations 2008)
+ Pure Appl. Chem
+ 2008
+ 80
+ 2
+ 277–410
+
+
+
Cahn, R.S. and Ingold, C. and Prelog, V.
@@ -76,7 +86,7 @@
A New Effective Algorithm for the Unambiguous Identification of the Stereochemical Characteristics of Compounds During Their Registration in Databases
Molecules
2001
- 6
+ 6
915-926
@@ -92,17 +102,17 @@
1107-23
-
-
-
- Clark A.
- Rendering Molecular Sketches for Publication Quality Output
- Molecular Informatics
- 2013
- 32
- 291-301
-
-
+
+
+
+ Clark A.
+ Rendering Molecular Sketches for Publication Quality Output
+ Molecular Informatics
+ 2013
+ 32
+ 291-301
+
+
@@ -126,12 +136,12 @@
359-360
-
+
Berger, F. and Gritzmann, P. and De Vries, S.
Cyclic Invariants for Molecular Graphs
- Lehrstuhl f�r Angewandte Geometrie und Diskrete Mathematik, Technische Universit�t M�nchen
+ Lehrstuhl für Angewandte Geometrie und Diskrete Mathematik, Technische Universität München
2004
http://www-m9.ma.tum.de/dm/cycles/
@@ -156,7 +166,7 @@
Constitutional Formulae generated from Connectivity
Information: the Program MDRAW
Journal of Chemical Research
- 1991
+ 1991
2601-2689
@@ -207,7 +217,7 @@
225-227
-
+
Burden, F.R.
@@ -218,7 +228,7 @@
309-314
-
+
Cordella Luigi P and Foggia Pasquale and Carlo Sansone and Vento Mario
@@ -229,7 +239,7 @@
10
-
+
Cherkasov, A.
@@ -261,8 +271,8 @@
Ertl, P. and Rohde, B. and Selzer, P.
- Fast Calculation of Molecular Polar Surface Area as a Sum of
- Fragment-Based Contributions and Its Application to the Prediction of
+ Fast Calculation of Molecular Polar Surface Area as a Sum of
+ Fragment-Based Contributions and Its Application to the Prediction of
Drug Transport Properties
J. Med. Chem.
2000
@@ -300,7 +310,7 @@
707-720
-
+
Faulon, J. L., Collins, M. J., and Carr, R. D.
@@ -374,14 +384,15 @@
-
+
- Green and Kahn and Savoy and Sprague and Teig
- Chemical Function Queries for 3D Database Search
- Journal of Chemical Information and Computer Science
- 1994
- 34
- 1297-1308
+ Guha, R. and Howard, M.T. and Hutchison, G.R. and Murray-Rust, P. and Rzepa, H. and Steinbeck, S. and Wegner, J. and Willighagen, E.L.
+ The Blue Obelisks - Interoperability in Chemical Informatics
+ J. Chem. Inf. Model.
+ 2006
+ 46
+ 991-998
+ 10.1021/ci050400b
@@ -394,7 +405,62 @@
Thecn. Univ. Munchen
-
+
+
+
+ T. A. Halgren
+ Merck Molecular Force Field. I. Basis, Form, Scope, Parametrization, and Performance of MMFF94
+ J. Comput. Chem
+ 1996
+ 17
+ 490-519
+
+
+
+
+
+ T. A. Halgren
+ Merck Molecular Force Field. II. MMFF94 van der Waals and Electrostatic Parameters for Intermolecular Interactions
+ J. Comput. Chem
+ 1996
+ 17
+ 520-552
+
+
+
+
+
+ T. A. Halgren
+ Merck Molecular Force Field. III. Molecular Geometries and Vibrational Frequencies for MMFF94
+ J. Comput. Chem
+ 1996
+ 17
+ 553-586
+
+
+
+
+
+ T. A. Halgren
+ Merck Molecular Force Field. IV. Conformational Energies and Geometries for MMFF94
+ J. Comput. Chem
+ 1996
+ 17
+ 587-615
+
+
+
+
+
+ T. A. Halgren
+ Merck Molecular Force Field. V. Extension of MMFF94 Using Experimental Data, Additional Computational Data, and Empirical Rules
+ J. Comput. Chem
+ 1996
+ 17
+ 616-641
+
+
+
Tonnelier, C. and Jauffret, Ph. and Hanser, Th. and Jauffret, Ph. and Kaufmann, G.
@@ -420,7 +486,7 @@
Hanser, Th. and Jauffret, Ph. and Kaufmann, G.
- A New Algorithm for Exhaustive Ring Perception in a
+ A New Algorithm for Exhaustive Ring Perception in a
Molecular Graph
J. Chem. Inf. Comput. Sci.
1996
@@ -429,6 +495,21 @@
+
+
+ Hinselmann, and Georg and Rosenbaum, and Lars and Jahn, and Andreas and Fechner, and Nikolas and Zell, and Andreas
+ jCompoundMapper: An open source Java library and command-line tool for chemical fingerprints
+ Journal of Cheminformatics
+ 2011
+ 3
+ 1–14
+ 1
+ http://dx.doi.org/10.1186/1758-2946-3-3
+ 10.1186/1758-2946-3-3
+
+
+
+
Helson, Harold E.
@@ -443,6 +524,17 @@
+
+
+ Berger, F. and Gritzmann, P. and De Vries, S.
+ Minimum cycle bases for network graphs
+ Algorithmica
+ 2004
+ 1
+ 51-62
+
+
+
Stein, S. and Heller, S.
@@ -498,7 +590,7 @@
318
-
+
Wolf Dietrich Ihlenfeldt and Johann Gasteiger
@@ -589,6 +681,19 @@ obtained by accurate mass spectrometry
+
+
+ Highly accurate chemical formula prediction tool utilizing high-resolution mass spectra, MS/MS fragmentation, heuristic rules, and isotope pattern matching
+ Pluskal, Tomas and Uehara, Taisuke and Yanagida, Mitsuhiro
+ 2012
+ Analytical Chemistry
+ 84
+ 10
+ 4396-4403
+
+
+
+
Marston, C.C.
@@ -671,7 +776,7 @@ obtained by accurate mass spectrometry
http://www.csaszar.org/interesting/The_Open_Source_Reader.pdf
-
+
Pearlman, R.S. and Smith, K.M.
@@ -744,17 +849,6 @@ obtained by accurate mass spectrometry
-
-
- Rogers and Hahn
- J. Chem. Inf. Mod.
- 2010
- 50
- 742-754
- 10.1021/ci100050t
-
-
-
SMILES Tutorial
@@ -768,7 +862,7 @@ obtained by accurate mass spectrometry
http://www.daylight.com/dayhtml/smiles/ssmiles.html
-
+
Von Scholley, A.
@@ -833,7 +927,7 @@ obtained by accurate mass spectrometry
http://www.cdk.org/
-
+
Andreas Steffen, Thierry Kogej, Christian Tyrchan and Ola Engkvist
@@ -845,7 +939,7 @@ obtained by accurate mass spectrometry
10.1021/ci800326z
-
+
Shelley CA
@@ -856,7 +950,7 @@ obtained by accurate mass spectrometry
61
-
+
J.A. Grant, J.A. Haigh, B.T. Pickup, A. Nicholls and R.A. Sayle
@@ -954,7 +1048,7 @@ obtained by accurate mass spectrometry
97-101
-
+
Wiener, Harry
@@ -983,7 +1077,7 @@ obtained by accurate mass spectrometry
273-282
-
+
Wessel, M.D. and Jurs, P.C. and Tolan, J.W. and Muskal, S.M.
@@ -1009,7 +1103,7 @@ obtained by accurate mass spectrometry
2323-2329
-
+
Hendlich, M. and Rippmann, F. and Bernickel, G.
@@ -1044,12 +1138,12 @@ obtained by accurate mass spectrometry
2001
4
- This article describes the first opensource Java implementation of import
- filters for the Chemical Markup Language (CML). The filters support CML conventions
+ This article describes the first opensource Java implementation of import
+ filters for the Chemical Markup Language (CML). The filters support CML conventions
and were tested with two opensource project: Jmol, a 3D molecular viewer, and
- JChemPaint, a chemical editor. Furthermore, the use of conventions in CML is
- explained and the reason for using conventions is pointed out. Finally, the
- implementation is compared with two recently developed techniques for handling
+ JChemPaint, a chemical editor. Furthermore, the use of conventions in CML is
+ explained and the reason for using conventions is pointed out. Finally, the
+ implementation is compared with two recently developed techniques for handling
CML data.
XML, CML, Java
@@ -1128,7 +1222,7 @@ obtained by accurate mass spectrometry
A QSPR Analysis of HPLC Column Capacity
Factors for a set of High-Energy Materials Using Electronic Van der Waals
-Surface Property Descriptors Computed by the Transferable Atom Equivalent
+Surface Property Descriptors Computed by the Transferable Atom Equivalent
Method
Journal of Computational Chemistry
1997
@@ -1148,6 +1242,19 @@ Method
+
+
+ Liu, R. and Rallo, R. and George, S. and Ji, Z. and Nair, S. and Nel, A.E. and Cohen, Y.
+ Classification NanoSAR development for cytotoxicity of metal oxide nanoparticles
+ Small
+ 2011
+ 7
+ 8
+ 1118-1126
+ 10.1002/smll.201002366
+
+
+
Liu, S. and Cao, C. and Li, Z.
@@ -1183,6 +1290,7 @@ Method
1986
7
565-577
+ 10.1002/jcc.540070419
@@ -1195,6 +1303,7 @@ Method
1987
27
21-35
+ 10.1021/ci00053a005
@@ -1218,6 +1327,7 @@ Method
1995
35
1039-1045
+ 10.1021/ci00028a014
@@ -1233,6 +1343,7 @@ Method
2004
9
1004-1009
+ 10.3390/91201004
@@ -1246,8 +1357,9 @@ Method
2000
18
464-477
+ 10.1016/S1093-3263(00)00068-1
-
+
@@ -1257,6 +1369,7 @@ Method
2006
78
1897-1970
+ 10.1351/pac200678101897
@@ -1268,6 +1381,7 @@ Method
2009
1
12
+ 10.1186/1758-2946-1-12
@@ -1279,6 +1393,7 @@ Method
2010
ASAP
+ 10.1021/jm1008456
@@ -1290,6 +1405,7 @@ Method
2005
45
386-393
+ 10.1021/ci0496797
@@ -1314,7 +1430,7 @@ Method
2-4
-
+
Zhao, Yuan H. and Abraham, Michael H. and Zissimos, Andreas M.
@@ -1327,7 +1443,7 @@ Method
10.1021/jo034808o
-
+
Thalheim, T. and Vollmer, A. and Ebert, R. and Kuehne, R. and Schuurmann, G.
@@ -1336,11 +1452,12 @@ Method
2010
50
1223-1232
+ 10.1021/ci1001179
-
+
-
+
Rojas-Cherto, Miguel and Kasper, Piotr T and Willighagen, Egon L and Vreeken, R. and Hankemeier, Thomas and Reijmers, Theo
Elemental Composition determination based on MSn
Bioinformatics
@@ -1350,20 +1467,20 @@ Method
10.1093/bioinformatics/btr409
-
+
Klekota, Justin and Roth, Frederick P.
Chemical substructures that enrich for biological activity
- 24
- 21
- 2518-2525
- 2008
+ 24
+ 21
+ 2518-2525
+ 2008
10.1093/bioinformatics/btn479
- http://bioinformatics.oxfordjournals.org/content/24/21/2518.abstract
- http://bioinformatics.oxfordjournals.org/content/24/21/2518.full.pdf+html
+ http://bioinformatics.oxfordjournals.org/content/24/21/2518.abstract
+ http://bioinformatics.oxfordjournals.org/content/24/21/2518.full.pdf+html
Bioinformatics
@@ -1377,15 +1494,15 @@ Method
Journal of the Association for Computing Machinery
23
1
+ 10.1145/321921.321925
-
-
+
+
Vismara, Philippe
Union of all the minimum cycle bases of a graph
- 1997
- 10.1.1.47.3674
+ 1997
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.47.3674
Combinatorics
4
@@ -1416,9 +1533,22 @@ Method
J. Chem. Inf. Comput. Sci
33
812-825
+ 10.1021/ci00016a003
-
-
+
+
+
+
+ Rogers and Hahn
+ Extended-connectivity fingerprints
+ J. Chem. Inf. Mod.
+ 2010
+ 50
+ 742-754
+ 10.1021/ci100050t
+
+
+
Towards a Universal SMILES representation - A standard method to generate canonical SMILES based on the InChI
@@ -1427,9 +1557,10 @@ Method
Journal of Cheminformatics
4
22
+ 10.1186/1758-2946-4-22
-
+
Paths, trees and flowers
@@ -1474,4 +1605,7 @@ Method
http://arxiv.org/abs/1307.7805
+
+
+
diff --git a/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java
new file mode 100644
index 00000000000..1550bed2a08
--- /dev/null
+++ b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2016 John May
+ *
+ * Contact: cdk-devel@lists.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version. All we ask is that proper credit is given
+ * for our work, which includes - but is not limited to - adding the above
+ * copyright notice to the beginning of your source code files, and to any
+ * copyright notice that you may distribute with programs based on this work.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
+ */
+
+package org.openscience.cdk.smarts;
+
+import org.openscience.cdk.interfaces.IAtom;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.interfaces.IBond;
+
+import java.util.Arrays;
+import java.util.Locale;
+
+/**
+ * Utility class to create SMARTS that match part (substructure) of a molecule.
+ * SMARTS are generated by providing the atom indexes. An example use case is
+ * encoding features from a fingerprint.
+ *
+ * The extractor has two modes. {@link #MODE_EXACT} (default) captures the element,
+ * valence, hydrogen count, connectivity, and charge in the SMARTS atom expressions.
+ * The alternative mode, {@link #MODE_JCOMPOUNDMAPPER}, only captures the element,
+ * non-zero charge, and peripheral bonds. Although the latter looks cleaner, the
+ * peripheral bonds intend to capture the connectivity of the terminal atoms but
+ * since the valence is not bounded further substitution is still allowed. This
+ * mirrors functionality from jCompoundMapper {@cdk.cite Hinselmann2011}.
+ *
+ * <p>The difference is easily demonstrated for methyl. Consider the compound
+ * of 2-methylpentane {@code CC(C)CCC}, if we extract one of the methyl atoms
+ * depending on the mode we obtain {@code [CH3v4X4+0]} or {@code C*}. The first
+ * of these patterns (obtained by {@link #MODE_EXACT}) matches the compound in
+ * three places (the three methyl groups). The second matches six
+ * times (every atom) because the substitution on the carbon is not locked.
+ * A further complication is introduced by the inclusion of the peripheral atoms,
+ * for 1H-indole {@code [nH]1ccc2c1cccc2} we can obtain the SMARTS {@code n(ccc(a)a)a}
+ * that doesn't match at all. This is because one of the aromatic atoms ('a')
+ * needs to match the nitrogen.
+ *
+ * <p>Basic Usage:
+ * <pre>{@code
+ *
+ * IChemObjectBuilder bldr = SilentChemObjectBuilder.getInstance();
+ * SmilesParser smipar = new SmilesParser(bldr);
+ *
+ * IAtomContainer mol = smipar.parseSmiles("[nH]1ccc2c1cccc2");
+ * SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
+ *
+ * // smarts=[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]
+ * // hits =1
+ * String smarts = subsmarts.generate(new int[]{0,1,2,3});
+ *
+ * subsmarts.setMode(SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER);
+ * // smarts=n(ccc(a)a)a
+ * // hits = 0 - one of the 'a' atoms needs to match the nitrogen
+ * smarts = subsmarts.generate(new int[]{0,1,2,3});
+ * }</pre>
+ *
+ * @author Nikolay Kochev
+ * @author Nina Jeliazkova
+ * @author John May
+ */
+public final class SmartsFragmentExtractor {
+
+ /**
+ * Sets the mode of the extractor to produce SMARTS similar to JCompoundMapper.
+ */
+ public static final int MODE_JCOMPOUNDMAPPER = 1;
+
+ /**
+ * Sets the mode of the extractor to produce exact SMARTS.
+ */
+ public static final int MODE_EXACT = 2;
+
+ // molecule being selected over
+ private final IAtomContainer mol;
+
+ // fast-access mol graph data structures
+ private final int[][] atomAdj, bondAdj;
+ private final int[] deg;
+
+ // SMARTS atom and bond expressions
+ private final String[] aexpr;
+ private final String[] bexpr;
+
+ // SMARTS traversal/generation
+ private final int[] avisit;
+ private final int[] rbnds;
+ private final int[] rnums;
+ private int numVisit;
+
+ // which mode should SMARTS be encoded in
+ private int mode = MODE_EXACT;
+
+    /**
+     * Creates an extractor bound to the given molecule. Bond expressions, atom
+     * expressions and the adjacency tables are all computed here, once, so
+     * that {@link #generate(int[])} only has to traverse pre-built arrays.
+     *
+     * @param mol molecule the SMARTS fragments are extracted from
+     */
+    public SmartsFragmentExtractor(IAtomContainer mol) {
+        this.mol = mol;
+
+        final int nAtoms = mol.getAtomCount();
+        final int nBonds = mol.getBondCount();
+
+        // neighbour lists start with four slots and grow on demand
+        this.deg = new int[nAtoms];
+        this.atomAdj = new int[nAtoms][4];
+        this.bondAdj = new int[nAtoms][4];
+        this.aexpr = new String[nAtoms];
+        this.bexpr = new String[nBonds];
+        this.avisit = new int[nAtoms];
+        this.rbnds = new int[nBonds];
+        this.rnums = new int[100]; // max 99 in SMILES/SMARTS
+
+        for (int b = 0; b < nBonds; b++) {
+            final IBond bond = mol.getBond(b);
+            final int u = mol.getAtomNumber(bond.getAtom(0));
+            final int v = mol.getAtomNumber(bond.getAtom(1));
+            this.bexpr[b] = encodeBondExpr(b, u, v);
+
+            // grow either neighbour list by two slots when it is full
+            final int du = deg[u];
+            final int dv = deg[v];
+            if (du == atomAdj[u].length) {
+                atomAdj[u] = Arrays.copyOf(atomAdj[u], du + 2);
+                bondAdj[u] = Arrays.copyOf(bondAdj[u], du + 2);
+            }
+            if (dv == atomAdj[v].length) {
+                atomAdj[v] = Arrays.copyOf(atomAdj[v], dv + 2);
+                bondAdj[v] = Arrays.copyOf(bondAdj[v], dv + 2);
+            }
+
+            // record the bond from both of its ends
+            atomAdj[u][du] = v;
+            bondAdj[u][du] = b;
+            atomAdj[v][dv] = u;
+            bondAdj[v][dv] = b;
+            deg[u]++;
+            deg[v]++;
+        }
+
+        // atom expressions read the adjacency tables, so they are built last
+        for (int a = 0; a < nAtoms; a++)
+            this.aexpr[a] = encodeAtomExpr(a);
+    }
+
+    /**
+     * Switches the encoding mode and regenerates every cached atom
+     * expression so that later calls to generate use the new mode.
+     *
+     * @param mode either {@link #MODE_EXACT} or {@link #MODE_JCOMPOUNDMAPPER}
+     * @throws IllegalArgumentException the mode is not one of the two constants
+     */
+    public void setMode(int mode) {
+        final boolean known = mode == MODE_EXACT || mode == MODE_JCOMPOUNDMAPPER;
+        if (!known)
+            throw new IllegalArgumentException("Invalid mode specified!");
+        this.mode = mode;
+
+        // atom expressions are mode dependent, bond expressions are not
+        final int count = mol.getAtomCount();
+        int atomIdx = 0;
+        while (atomIdx < count) {
+            this.aexpr[atomIdx] = encodeAtomExpr(atomIdx);
+            atomIdx++;
+        }
+    }
+
+    /**
+     * Generate a SMARTS for the substructure formed of the provided atoms,
+     * disconnected parts of the selection are separated by '.'.
+     *
+     * @param atomIdxs atom indexes
+     * @return SMARTS, null if an empty array is passed
+     * @throws NullPointerException the index array is null
+     */
+    public String generate(int[] atomIdxs) {
+        if (atomIdxs == null)
+            throw new NullPointerException("No atom indexes provided");
+        if (atomIdxs.length == 0)
+            return null; // documented behaviour, kept for compatibility
+
+        // special case, a lone atom in exact mode has no bonds to write
+        if (atomIdxs.length == 1 && mode == MODE_EXACT)
+            return aexpr[atomIdxs[0]];
+
+        // reset the ring number pool too: an aborted call must not leak numbers
+        Arrays.fill(rnums, 0);
+        Arrays.fill(rbnds, 0);
+        Arrays.fill(avisit, 0);
+        for (int atmIdx : atomIdxs)
+            avisit[atmIdx] = -1;
+
+        // first pass marks the ring closure bonds
+        numVisit = 1;
+        for (int atomIdx : atomIdxs) {
+            if (avisit[atomIdx] < 0)
+                markRings(atomIdx, -1);
+        }
+
+        // reset visit flags (0 = not selected, -1 = selected, not yet visited)
+        numVisit = 1;
+        for (int atmIdx : atomIdxs)
+            avisit[atmIdx] = -1;
+
+        // second pass builds the expression
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < atomIdxs.length; i++) {
+            if (avisit[atomIdxs[i]] < 0) {
+                if (i > 0) sb.append('.');
+                encodeExpr(atomIdxs[i], -1, sb);
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Depth first walk over the selected atoms that flags every back edge
+     * (ring closure bond) by storing -1 in {@link #rbnds}.
+     *
+     * @param idx   atom index
+     * @param bprev bond the atom was reached by, -1 for a traversal root
+     */
+    private void markRings(int idx, int bprev) {
+        avisit[idx] = numVisit++;
+        for (int j = 0, d = deg[idx]; j < d; j++) {
+            final int nbr = atomAdj[idx][j];
+            final int bidx = bondAdj[idx][j];
+            final int state = avisit[nbr];
+            if (state == 0 || bidx == bprev)
+                continue; // not selected, or the bond we arrived by
+            if (state < 0)
+                markRings(nbr, bidx); // selected, not reached yet
+            else if (state < avisit[idx])
+                rbnds[bidx] = -1; // reached earlier, ring closure
+        }
+    }
+
+    /**
+     * Recursively encodes a SMARTS expression into the provided string
+     * builder: the atom, its ring closures, then one branch per remaining bond.
+     *
+     * @param idx   atom index
+     * @param bprev previous bond, -1 when starting a new component
+     * @param sb    destination to write SMARTS to
+     */
+    private void encodeExpr(int idx, int bprev, StringBuilder sb) {
+        avisit[idx] = numVisit++;
+        sb.append(aexpr[idx]);
+        final int d = deg[idx];
+
+        int remain = d;
+        for (int j = 0; j < d; j++) {
+            int nbr = atomAdj[idx][j];
+            int bidx = bondAdj[idx][j];
+
+            // ring open/close: the bond expression precedes the ring number at
+            // both ends, otherwise e.g. a double ring closure bond would be lost
+            if (rbnds[bidx] < 0) {
+                final int rnum = chooseRingNumber(); // open
+                sb.append(bexpr[bidx]);
+                if (rnum > 9) sb.append('%');
+                sb.append(rnum);
+                rbnds[bidx] = rnum;
+            } else if (rbnds[bidx] > 0) {
+                final int rnum = rbnds[bidx]; // close
+                releaseRingNumber(rnum);
+                sb.append(bexpr[bidx]);
+                if (rnum > 9) sb.append('%');
+                sb.append(rnum);
+            }
+
+            // count down the bonds that will not be written as a branch
+            if (mode == MODE_EXACT && avisit[nbr] == 0 ||
+                bidx == bprev ||
+                rbnds[bidx] != 0)
+                remain--;
+        }
+
+        for (int j = 0; j < d; j++) {
+            int nbr = atomAdj[idx][j];
+            int bidx = bondAdj[idx][j];
+            if (mode == MODE_EXACT && avisit[nbr] == 0 ||
+                bidx == bprev ||
+                rbnds[bidx] != 0)
+                continue; // ignored
+            remain--;
+            // every branch except the last one is wrapped in parentheses
+            if (remain > 0) sb.append('(');
+            sb.append(bexpr[bidx]);
+            if (avisit[nbr] == 0) {
+                // peripheral bond
+                sb.append(mol.getAtom(nbr).isAromatic() ? 'a' : '*');
+            } else {
+                encodeExpr(nbr, bidx, sb);
+            }
+            if (remain > 0) sb.append(')');
+        }
+    }
+
+    /**
+     * Allocates the lowest ring number not currently in use and marks it taken.
+     *
+     * @return ring number
+     * @throws IllegalStateException all ring numbers are used
+     */
+    private int chooseRingNumber() {
+        int rnum = 1; // slot 0 is never handed out
+        while (rnum < rnums.length && rnums[rnum] != 0)
+            rnum++;
+        if (rnum >= rnums.length)
+            throw new IllegalStateException("No more ring numbers available!");
+        rnums[rnum] = 1;
+        return rnum;
+    }
+
+ /**
+ * Marks a ring number as free again so that a later ring opening may reuse it.
+ *
+ * @param rnum ring number to free; used as an index into the rnums pool
+ */
+ private void releaseRingNumber(int rnum) {
+ rnums[rnum] = 0;
+ }
+
+    /**
+     * Builds the SMARTS atom expression for the atom at index (atmIdx). In
+     * {@link #MODE_EXACT} the total hydrogen count (H), valence (v),
+     * connectivity (X) and charge are written and the expression is always
+     * bracketed; otherwise only the symbol and a non-zero charge are written.
+     *
+     * @param atmIdx atom index
+     * @return SMARTS atom expression
+     */
+    private String encodeAtomExpr(int atmIdx) {
+        final IAtom atom = mol.getAtom(atmIdx);
+        final boolean exact = mode == MODE_EXACT;
+        final StringBuilder expr = new StringBuilder();
+        boolean bracket = exact; // exact expressions always need brackets
+
+        final int elem = atom.getAtomicNumber();
+        if (elem == 0) {
+            expr.append('*');
+        } else {
+            // aromatic atoms are written with a lower case symbol
+            expr.append(atom.isAromatic() ? atom.getSymbol().toLowerCase(Locale.ROOT)
+                                          : atom.getSymbol());
+            switch (elem) {
+                case 5:  // B
+                case 6:  // C
+                case 7:  // N
+                case 8:  // O
+                case 9:  // F
+                case 15: // P
+                case 16: // S
+                case 17: // Cl
+                case 35: // Br
+                case 53: // I
+                    break; // may be written without brackets
+                default:
+                    bracket = true;
+                    break;
+            }
+        }
+
+        // H, v and X all start from the implicit hydrogen count
+        if (exact) {
+            final int implH = atom.getImplicitHydrogenCount();
+            int totalH = implH;
+            int valence = implH;
+            int connections = implH;
+
+            for (int i = 0, n = deg[atmIdx]; i < n; i++) {
+                final IBond bond = mol.getBond(bondAdj[atmIdx][i]);
+                final Integer nbrElem = bond.getConnectedAtom(atom).getAtomicNumber();
+                if (nbrElem != null && nbrElem == 1)
+                    totalH++; // explicit hydrogen neighbour
+                final int bord = bond.getOrder() != null ? bond.getOrder().numeric() : 0;
+                if (bord == 0)
+                    throw new IllegalArgumentException("Molecule had unsupported zero-order or unset bonds!");
+                valence += bord;
+                connections++;
+            }
+
+            expr.append('H').append(totalH)
+                .append('v').append(valence)
+                .append('X').append(connections);
+        }
+
+        // formal charge, an unset charge is treated as neutral
+        final Integer formal = atom.getFormalCharge();
+        final int chg = formal == null ? 0 : formal;
+        if (chg != 0) {
+            expr.append(chg > 0 ? '+' : '-');
+            final int magnitude = Math.abs(chg);
+            if (magnitude > 1) expr.append(magnitude);
+            bracket = true;
+        } else if (exact) {
+            expr.append("+0");
+        }
+
+        return bracket ? '[' + expr.toString() + ']' : expr.toString();
+    }
+
+    /**
+     * Encodes the bond at index (bondIdx) to a SMARTS bond expression that
+     * matches itself; an empty string stands for the implicit SMARTS bond.
+     * An aromatic bond is encoded the same way whichever Kekule order it has.
+     *
+     * @param bondIdx bond index
+     * @param beg     atom index of first atom
+     * @param end     atom index of second atom
+     * @return SMARTS bond expression
+     * @throws IllegalArgumentException the bond order is not single, double or triple
+     */
+    private String encodeBondExpr(int bondIdx, int beg, int end) {
+        IBond bond = mol.getBond(bondIdx);
+        if (bond.getOrder() == null)
+            return "";
+
+        boolean bArom = bond.isAromatic();
+        boolean aArom = mol.getAtom(beg).isAromatic() && mol.getAtom(end).isAromatic();
+        switch (bond.getOrder()) {
+            case SINGLE:
+                if (bArom) return aArom ? "" : ":";
+                return aArom ? "-" : "";
+            case DOUBLE:
+                // aromatic bond between non-aromatic atoms needs ':', as for SINGLE
+                return bArom ? (aArom ? "" : ":") : "=";
+            case TRIPLE:
+                return "#";
+            default:
+                throw new IllegalArgumentException("Unsupported bond type: " + bond.getOrder());
+        }
+    }
+}
diff --git a/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java
new file mode 100644
index 00000000000..7ce2d52aa73
--- /dev/null
+++ b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016 John May
+ *
+ * Contact: cdk-devel@lists.sourceforge.net
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version. All we ask is that proper credit is given
+ * for our work, which includes - but is not limited to - adding the above
+ * copyright notice to the beginning of your source code files, and to any
+ * copyright notice that you may distribute with programs based on this work.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
+ */
+
+package org.openscience.cdk.smarts;
+
+import org.junit.Test;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.silent.SilentChemObjectBuilder;
+import org.openscience.cdk.smiles.SmilesParser;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+public class SmartsFragmentExtractorTest {
+
+    // Parses the SMILES and returns the SMARTS generated for the given atoms in the given mode.
+    private String generate(String smi, int mode, int[] idxs) throws Exception {
+        IAtomContainer mol = new SmilesParser(SilentChemObjectBuilder.getInstance()).parseSmiles(smi);
+        SmartsFragmentExtractor extractor = new SmartsFragmentExtractor(mol);
+        extractor.setMode(mode);
+        return extractor.generate(idxs);
+    }
+
+ private static int[] makeSeq(int beg, int to) {
+ int[] a = new int[to-beg];
+ for (int i = 0; i < a.length; i++)
+ a[i] = beg++;
+ return a;
+ }
+
+    @Test
+    public void methylExact() throws Exception {
+        // only the first atom of the SMILES is selected
+        final int[] atoms = makeSeq(0, 1);
+        final String smarts = generate("CC(C)CCC", SmartsFragmentExtractor.MODE_EXACT, atoms);
+        assertThat(smarts, is("[CH3v4X4+0]"));
+    }
+
+    @Test
+    public void methylForJCompoundMap() throws Exception {
+        // same selection as methylExact, the peripheral bond is written as '*'
+        final int[] atoms = makeSeq(0, 1);
+        final String smarts = generate("CC(C)CCC", SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER, atoms);
+        assertThat(smarts, is("C*"));
+    }
+
+    @Test
+    public void indole() throws Exception {
+        // first four atoms of the SMILES
+        final int[] atoms = makeSeq(0, 4);
+        final String smarts = generate("[nH]1ccc2c1cccc2", SmartsFragmentExtractor.MODE_EXACT, atoms);
+        assertThat(smarts, is("[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]"));
+    }
+
+    @Test
+    public void indoleForJCompoundMap() throws Exception {
+        // same selection as indole, peripheral aromatic atoms are written as 'a'
+        final int[] atoms = makeSeq(0, 4);
+        final String smarts = generate("[nH]1ccc2c1cccc2", SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER, atoms);
+        assertThat(smarts, is("n(ccc(a)a)a"));
+    }
+
+    @Test
+    public void biphenylIncludesSingleBond() throws Exception {
+        // all twelve atoms are selected; the output must contain an explicit '-'
+        final int[] atoms = makeSeq(0, 12);
+        final String smarts = generate("c1ccccc1-c1ccccc1", SmartsFragmentExtractor.MODE_EXACT, atoms);
+        assertThat(smarts, containsString("-"));
+    }
+
+    @Test
+    public void fullereneC60() throws Exception {
+        // all sixty atoms selected, exercises two digit (%nn) ring numbers
+        final String smi = "c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41";
+        final String expected = "[cH0v4X3+0]12[cH0v4X3+0]3[cH0v4X3+0]4[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]2[cH0v4X3+0]3[cH0v4X3+0]41";
+        final String smarts = generate(smi, SmartsFragmentExtractor.MODE_EXACT, makeSeq(0, 60));
+        assertThat(smarts, is(expected));
+    }
+
+}
\ No newline at end of file