diff --git a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java index bc2855af412..d825dbf5a2a 100644 --- a/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java +++ b/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java @@ -30,6 +30,8 @@ import java.util.ArrayList; import java.util.BitSet; +import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.zip.CRC32; @@ -37,11 +39,14 @@ import javax.vecmath.Point2d; import javax.vecmath.Point3d; +import com.google.common.primitives.Ints; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; + + /** *

Circular fingerprints: for generating fingerprints that are functionally equivalent to ECFP-2/4/6 and FCFP-2/4/6 * fingerprints, which are partially described by Rogers et al. {@cdk.cite Rogers2010}. @@ -112,6 +117,7 @@ public FP(int hashCode, int iteration, int[] atoms) { this.atoms = atoms; } } + // ------------ private members ------------ @@ -1238,7 +1244,7 @@ private int findBond(int a1, int a2) { if (atomAdj[a1][n] == a2) return bondAdj[a1][n]; return -1; } - + /* * for debugging convenience: revive if necessary private void wr(String * str) {System.out.println(str);} private String arrayStr(int[] val) { if diff --git a/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java new file mode 100644 index 00000000000..298a600d284 --- /dev/null +++ b/descriptor/fingerprint/src/test/java/org/openscience/cdk/fingerprint/CircularFingerprintSmartsTest.java @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016 John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. All we ask is that proper credit is given + * for our work, which includes - but is not limited to - adding the above + * copyright notice to the beginning of your source code files, and to any + * copyright notice that you may distribute with programs based on this work. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ +package org.openscience.cdk.fingerprint; + +import org.junit.Test; +import org.openscience.cdk.CDKTestCase; +import org.openscience.cdk.fingerprint.CircularFingerprinter.FP; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smarts.SmartsFragmentExtractor; +import org.openscience.cdk.smiles.SmilesParser; +import org.openscience.cdk.tools.ILoggingTool; +import org.openscience.cdk.tools.LoggingToolFactory; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.CoreMatchers.everyItem; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.collection.IsIn.isIn; + +/** + * @cdk.module test-standard + */ +public class CircularFingerprintSmartsTest extends CDKTestCase { + + private static ILoggingTool logger = LoggingToolFactory + .createLoggingTool(CircularFingerprintSmartsTest.class); + + public static SmilesParser parser = new SmilesParser( + SilentChemObjectBuilder.getInstance()); + + @Test + public void testMol1() throws Exception { + String molSmiles = "CC"; + String expectedFPSmarts[][] = { { "C*" }, { "CC" } }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + @Test + public void testMol2() throws Exception { + String molSmiles = "CCC"; + String expectedFPSmarts[][] = { { "C*" }, { "C(*)*" }, + { "CC*", "C(*)C" }, { "CCC" }, }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + @Test + public void testMol3() throws Exception { + String molSmiles = "CCN"; + String expectedFPSmarts[][] = { { "C*" }, { "C(*)*" }, { "N*" }, + { "CC*", "C(*)C" }, { "C(*)N", "NC*" }, + { "CCN", "NCC", "C(C)N", "C(N)C" }, }; + checkFPSmartsForMolecule(molSmiles, 
expectedFPSmarts); + } + + @Test + public void testMol4() throws Exception { + String molSmiles = "C1CC1"; + String expectedFPSmarts[][] = { + + { "C(*)*" }, { "C1CC1", "C(C1)C1" } }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + @Test + public void testMol5() throws Exception { + String molSmiles = "C1CCC1"; + String expectedFPSmarts[][] = { + + { "C(*)*" }, { "C(C*)C*", "C(CC*)*", "C(*)CC*" }, + { "C1CCC1", "C(CC1)C1", "C(C1)CC1" } }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + @Test + public void testMol6() throws Exception { + String molSmiles = "CC[C-]"; + String expectedFPSmarts[][] = { + + { "C*" }, { "C(*)*" }, { "[C-]*" }, { "CC*", "C(*)C" }, + { "[C-]C*", "C(*)[C-]" }, + { "CC[C-]", "C(C)[C-]", "[C-]CC", "C([C-])C" } }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + @Test + public void testMol7() throws Exception { + String molSmiles = "c1ccccc1"; + String expectedFPSmarts[][] = { + + { "c(a)a" }, + { "c(a)cca", "c(ca)ca", "c(cca)a" }, + { "c(a)cccca", "c(ca)ccca", "c(cca)cca", "c(ccca)ca", + "c(cccca)a" }, + { "c1ccccc1", "c(c1)cccc1", "c(cc1)ccc1", "c(ccc1)cc1", + "c(cccc1)c1" } }; + checkFPSmartsForMolecule(molSmiles, expectedFPSmarts); + } + + private void checkFPSmartsForMolecule(String moleculeSmiles, + String expectedFPSmarts[][]) throws Exception { + + Set expected = new HashSet<>(); + for (String[] strs : expectedFPSmarts) + Collections.addAll(expected, strs); + + // expectedFPSmarts[][] is a double array because for each smarts + // several equivalent variants + // of the smarts are given e.g. 
CCC C(C)C + IAtomContainer mol = parser.parseSmiles(moleculeSmiles); + + CircularFingerprinter circ = new CircularFingerprinter(); + circ.calculate(mol); + SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol); + subsmarts.setMode(SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER); + int numFP = circ.getFPCount(); + + Set actual = new HashSet<>(); + for (int i = 0; i < numFP; i++) { + FP fp = circ.getFP(i); + actual.add(subsmarts.generate(fp.atoms)); + } + + assertThat(actual, everyItem(isIn(expected))); + } +} diff --git a/doc/refs/cheminf.bibx b/doc/refs/cheminf.bibx index 3dc5681aae7..255e1de6be8 100644 --- a/doc/refs/cheminf.bibx +++ b/doc/refs/cheminf.bibx @@ -16,7 +16,7 @@ 80-90 - + Colin Batchelor and Ken Karapetyan and Valery Tkachenko and Anthony Williams @@ -26,7 +26,7 @@ http://www.slideshare.net/RSC-Chemistry/20130724-cisrg-sugarsbatchelor - + Bernstein, H.J. @@ -34,18 +34,6 @@ http://www.openrasmol.org/doc/rasmol.html#cpkcolours - - - - Jonathan Brecher - Graphical representation standards for chemical structure diagrams (IUPAC Recommendations 2008) - Pure Appl. Chem - 2008 - 80 - 2 - 277–410 - - @@ -57,7 +45,29 @@ 51-62 - + + + + Willighagen, E. and Hutchinson, G. and Niehaus, C. and Buchwald, J. and Pfeiffer, M. and Leidert, D. and Brefort, J. + Blue Obelisk Data Repository (version 10) + Figshare + 2014 + 10.6084/m9.figshare.1025775.v1 + + + + + + Jonathan Brecher + Graphical representation standards for chemical structure diagrams (IUPAC Recommendations 2008) + Pure Appl. Chem + 2008 + 80 + 2 + 277–410 + + + Cahn, R.S. and Ingold, C. and Prelog, V. @@ -76,7 +86,7 @@ A New Effective Algorithm for the Unambiguous Identification of the Stereochemical Characteristics of Compounds During Their Registration in Databases Molecules 2001 - 6 + 6 915-926 @@ -92,17 +102,17 @@ 1107-23 - - - - Clark A. - Rendering Molecular Sketches for Publication Quality Output - Molecular Informatics - 2013 - 32 - 291-301 - - + + + + Clark A. 
+ Rendering Molecular Sketches for Publication Quality Output + Molecular Informatics + 2013 + 32 + 291-301 + + @@ -126,12 +136,12 @@ 359-360 - + Berger, F. and Gritzmann, P. and De Vries, S. Cyclic Invariants for Molecular Graphs - Lehrstuhl f�r Angewandte Geometrie und Diskrete Mathematik, Technische Universit�t M�nchen + Lehrstuhl für Angewandte Geometrie und Diskrete Mathematik, Technische Universität München 2004 http://www-m9.ma.tum.de/dm/cycles/ @@ -156,7 +166,7 @@ Constitutional Formulae generated from Connectivity Information: the Program MDRAW Journal of Chemical Research - 1991 + 1991 2601-2689 @@ -207,7 +217,7 @@ 225-227 - + Burden, F.R. @@ -218,7 +228,7 @@ 309-314 - + Cordella Luigi P and Foggia Pasquale and Carlo Sansone and Vento Mario @@ -229,7 +239,7 @@ 10 - + Cherkasov, A. @@ -261,8 +271,8 @@ Ertl, P. and Rohde, B. and Selzer, P. - Fast Calculation of Molecular Polar Surface Area as a Sum of - Fragment-Based Contributions and Its Application to the Prediction of + Fast Calculation of Molecular Polar Surface Area as a Sum of + Fragment-Based Contributions and Its Application to the Prediction of Drug Transport Properties J. Med. Chem. 2000 @@ -300,7 +310,7 @@ 707-720 - + Faulon, J. L., Collins, M. J., and Carr, R. D. @@ -374,14 +384,15 @@ - + - Green and Kahn and Savoy and Sprague and Teig - Chemical Function Queries for 3D Database Search - Journal of Chemical Information and Computer Science - 1994 - 34 - 1297-1308 + Guha, R. and Howard, M.T. and Hutchison, G.R. and Murray-Rust, P. and Rzepa, H. and Steinbeck, S. and Wegner, J. and Willighagen, E.L. + The Blue Obelisks - Interoperability in Chemical Informatics + J. Chem. Inf. Model. + 2006 + 46 + 991-998 + 10.1021/ci050400b @@ -394,7 +405,62 @@ Thecn. Univ. Munchen - + + + + T. A. Halgren + Merck Molecular Force Field. I. Basis, Form, Scope, Parametrization, and Performance of MMFF94 + J. Comput. Chem + 1996 + 17 + 490-519 + + + + + + T. A. Halgren + Merck Molecular Force Field. II. 
MMFF94 van der Waals and Electrostatic Parameters for Intermolecular Interactions + J. Comput. Chem + 1996 + 17 + 520-552 + + + + + + T. A. Halgren + Merck Molecular Force Field. III. Molecular Geometries and Vibrational Frequencies for MMFF94 + J. Comput. Chem + 1996 + 17 + 553-586 + + + + + + T. A. Halgren + Merck Molecular Force Field. IV. Conformational Energies and Geometries for MMFF94 + J. Comput. Chem + 1996 + 17 + 587-615 + + + + + + T. A. Halgren + Merck Molecular Force Field. V. Extension of MMFF94 Using Experimental Data, Additional Computational Data, and Empirical Rules + J. Comput. Chem + 1996 + 17 + 616-641 + + + Tonnelier, C. and Jauffret, Ph. and Hanser, Th. and Jauffret, Ph. and Kaufmann, G. @@ -420,7 +486,7 @@ Hanser, Th. and Jauffret, Ph. and Kaufmann, G. - A New Algorithm for Exhaustive Ring Perception in a + A New Algorithm for Exhaustive Ring Perception in a Molecular Graph J. Chem. Inf. Comput. Sci. 1996 @@ -429,6 +495,21 @@ + + + Hinselmann, and Georg and Rosenbaum, and Lars and Jahn, and Andreas and Fechner, and Nikolas and Zell, and Andreas + jCompoundMapper: An open source Java library and command-line tool for chemical fingerprints + Journal of Cheminformatics + 2011 + 3 + 1–14 + 1 + http://dx.doi.org/10.1186/1758-2946-3-3 + 10.1186/1758-2946-3-3 + + + + Helson, Harold E. @@ -443,6 +524,17 @@ + + + Berger, F. and Gritzmann, P. and De Vries, S. + Minimum cycle bases for network graphs + Algorithmica + 2004 + 1 + 51-62 + + + Stein, S. and Heller, S. @@ -498,7 +590,7 @@ 318 - + Wolf Dietrich Ihlenfeldt and Johann Gasteiger @@ -589,6 +681,19 @@ obtained by accurate mass spectrometry + + + Highly accurate chemical formula prediction tool utilizing high-resolution mass spectra, MS/MS fragmentation, heuristic rules, and isotope pattern matching + Pluskal, Tomas and Uehara, Taisuke and Yanagida, Mitsuhiro + 2012 + Analytical Chemistry + 84 + 10 + 4396-4403 + + + + Marston, C.C. 
@@ -671,7 +776,7 @@ obtained by accurate mass spectrometry http://www.csaszar.org/interesting/The_Open_Source_Reader.pdf - + Pearlman, R.S. and Smith, K.M. @@ -744,17 +849,6 @@ obtained by accurate mass spectrometry - - - Rogers and Hahn - J. Chem. Inf. Mod. - 2010 - 50 - 742-754 - 10.1021/ci100050t - - - SMILES Tutorial @@ -768,7 +862,7 @@ obtained by accurate mass spectrometry http://www.daylight.com/dayhtml/smiles/ssmiles.html - + Von Scholley, A. @@ -833,7 +927,7 @@ obtained by accurate mass spectrometry http://www.cdk.org/ - + Andreas Steffen, Thierry Kogej, Christian Tyrchan and Ola Engkvist @@ -845,7 +939,7 @@ obtained by accurate mass spectrometry 10.1021/ci800326z - + Shelley CA @@ -856,7 +950,7 @@ obtained by accurate mass spectrometry 61 - + J.A. Grant, J.A. Haigh, B.T. Pickup, A. Nicholls and R.A. Sayle @@ -954,7 +1048,7 @@ obtained by accurate mass spectrometry 97-101 - + Wiener, Harry @@ -983,7 +1077,7 @@ obtained by accurate mass spectrometry 273-282 - + Wessel, M.D. and Jurs, P.C. and Tolan, J.W. and Muskal, S.M. @@ -1009,7 +1103,7 @@ obtained by accurate mass spectrometry 2323-2329 - + Hendlich, M. and Rippmann, F. and Bernickel, G. @@ -1044,12 +1138,12 @@ obtained by accurate mass spectrometry 2001 4 - This article describes the first opensource Java implementation of import - filters for the Chemical Markup Language (CML). The filters support CML conventions + This article describes the first opensource Java implementation of import + filters for the Chemical Markup Language (CML). The filters support CML conventions and were tested with two opensource project: Jmol, a 3D molecular viewer, and - JChemPaint, a chemical editor. Furthermore, the use of conventions in CML is - explained and the reason for using conventions is pointed out. Finally, the - implementation is compared with two recently developed techniques for handling + JChemPaint, a chemical editor. 
Furthermore, the use of conventions in CML is + explained and the reason for using conventions is pointed out. Finally, the + implementation is compared with two recently developed techniques for handling CML data. XML, CML, Java @@ -1128,7 +1222,7 @@ obtained by accurate mass spectrometry A QSPR Analysis of HPLC Column Capacity Factors for a set of High-Energy Materials Using Electronic Van der Waals -Surface Property Descriptors Computed by the Transferable Atom Equivalent +Surface Property Descriptors Computed by the Transferable Atom Equivalent Method Journal of Computational Chemistry 1997 @@ -1148,6 +1242,19 @@ Method + + + Liu, R. and Rallo, R. and George, S. and Ji, Z. and Nair, S. and Nel, A.E. and Cohen, Y. + Classification NanoSAR development for cytotoxicity of metal oxide nanoparticles + Small + 2011 + 7 + 8 + 1118-1126 + 10.1002/smll.201002366 + + + Liu, S. and Cao, C. and Li, Z. @@ -1183,6 +1290,7 @@ Method 1986 7 565-577 + 10.1002/jcc.540070419 @@ -1195,6 +1303,7 @@ Method 1987 27 21-35 + 10.1021/ci00053a005 @@ -1218,6 +1327,7 @@ Method 1995 35 1039-1045 + 10.1021/ci00028a014 @@ -1233,6 +1343,7 @@ Method 2004 9 1004-1009 + 10.3390/91201004 @@ -1246,8 +1357,9 @@ Method 2000 18 464-477 + 10.1016/S1093-3263(00)00068-1 - + @@ -1257,6 +1369,7 @@ Method 2006 78 1897-1970 + 10.1351/pac200678101897 @@ -1268,6 +1381,7 @@ Method 2009 1 12 + 10.1186/1758-2946-1-12 @@ -1279,6 +1393,7 @@ Method 2010 ASAP + 10.1021/jm1008456 @@ -1290,6 +1405,7 @@ Method 2005 45 386-393 + 10.1021/ci0496797 @@ -1314,7 +1430,7 @@ Method 2-4 - + Zhao, Yuan H. and Abraham, Michael H. and Zissimos, Andreas M. @@ -1327,7 +1443,7 @@ Method 10.1021/jo034808o - + Thalheim, T. and Vollmer, A. and Ebert, R. and Kuehne, R. and Schuurmann, G. @@ -1336,11 +1452,12 @@ Method 2010 50 1223-1232 + 10.1021/ci1001179 - + - + Rojas-Cherto, Miguel and Kasper, Piotr T and Willighagen, Egon L and Vreeken, R. 
and Hankemeier, Thomas and Reijmers, Theo Elemental Composition determination based on MSn Bioinformatics @@ -1350,20 +1467,20 @@ Method 10.1093/bioinformatics/btr409 - + Klekota, Justin and Roth, Frederick P. Chemical substructures that enrich for biological activity - 24 - 21 - 2518-2525 - 2008 + 24 + 21 + 2518-2525 + 2008 10.1093/bioinformatics/btn479 - http://bioinformatics.oxfordjournals.org/content/24/21/2518.abstract - http://bioinformatics.oxfordjournals.org/content/24/21/2518.full.pdf+html + http://bioinformatics.oxfordjournals.org/content/24/21/2518.abstract + http://bioinformatics.oxfordjournals.org/content/24/21/2518.full.pdf+html Bioinformatics @@ -1377,15 +1494,15 @@ Method Journal of the Association for Computing Machinery 23 1 + 10.1145/321921.321925 - - + + Vismara, Philippe Union of all the minimum cycle bases of a graph - 1997 - 10.1.1.47.3674 + 1997 http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.47.3674 Combinatorics 4 @@ -1416,9 +1533,22 @@ Method J. Chem. Inf. Comput. Sci 33 812-825 + 10.1021/ci00016a003 - - + + + + + Rogers and Hahn + Extended-connectivity fingerprints + J. Chem. Inf. Mod. 
+ 2010 + 50 + 742-754 + 10.1021/ci100050t + + + Towards a Universal SMILES representation - A standard method to generate canonical SMILES based on the InChI @@ -1427,9 +1557,10 @@ Method Journal of Cheminformatics 4 22 + 10.1186/1758-2946-4-22 - + Paths, trees and flowers @@ -1474,4 +1605,7 @@ Method http://arxiv.org/abs/1307.7805 + + + diff --git a/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java new file mode 100644 index 00000000000..1550bed2a08 --- /dev/null +++ b/tool/smarts/src/main/java/org/openscience/cdk/smarts/SmartsFragmentExtractor.java @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2016 John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. All we ask is that proper credit is given + * for our work, which includes - but is not limited to - adding the above + * copyright notice to the beginning of your source code files, and to any + * copyright notice that you may distribute with programs based on this work. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ + +package org.openscience.cdk.smarts; + +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; + +import java.util.Arrays; +import java.util.Locale; + +/** + * Utility class to create SMARTS that match part (substructure) of a molecule. + * SMARTS are generated by providing the atom indexes. An example use cases is + * encoding features from a fingerprint. + *

+ * The extractor has two modes. {@link #MODE_EXACT} (default) captures the element, + * valence, hydrogen count, connectivity, and charge in the SMARTS atom expressions. + * The alternative mode, {@link #MODE_JCOMPOUNDMAPPER}, only captures the element, + * non-zero charge, and peripheral bonds. Although the latter looks cleaner, the + * peripheral bonds are intended to capture the connectivity of the terminal atoms, but + * since the valence is not bounded, further substitution is still allowed. This + * mirrors functionality from jCompoundMapper {@cdk.cite Hinselmann2011}. + * + *

The difference is easily demonstrated for methyl. Consider the compound + * 2-methylpentane {@code CC(C)CCC}: if we extract one of the methyl atoms, + * depending on the mode we obtain {@code [CH3v4X4+0]} or {@code C*}. The first + * of these patterns (obtained by {@link #MODE_EXACT}) matches the compound in + * three places (the three methyl groups). The second matches six + * times (every atom) because the substitution on the carbon is not locked. + * A further complication is introduced by the inclusion of the peripheral atoms: + * for 1H-indole {@code [nH]1ccc2c1cccc2} we can obtain the SMARTS {@code n(ccc(a)a)a} + * that doesn't match at all. This is because one of the aromatic atoms ('a') + * needs to match the nitrogen. + * + *

Basic Usage:

+ *
{@code
+ *
+ * IChemObjectBuilder      bldr      = SilentChemObjectBuilder.getInstance();
+ * SmilesParser            smipar    = new SmilesParser(bldr);
+ *
+ * IAtomContainer          mol       = smipar.parseSmiles("[nH]1ccc2c1cccc2");
+ * SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
+ *
+ * // smarts=[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]
+ * // hits  =1
+ * String             smarts    = subsmarts.generate(new int[]{0,1,2,3});
+ *
+ * subsmarts.setMode(SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER);
+ * // smarts=n(ccc(a)a)a
+ * // hits  = 0 - one of the 'a' atoms needs to match the nitrogen
+ * smarts                       = subsmarts.generate(new int[]{0,1,2,3});
+ * }
+ * + * @author Nikolay Kochev + * @author Nina Jeliazkova + * @author John May + */ +public final class SmartsFragmentExtractor { + + /** + * Sets the mode of the extractor to produce SMARTS similar to JCompoundMapper. + */ + public static final int MODE_JCOMPOUNDMAPPER = 1; + + /** + * Sets the mode of the extractor to produce exact SMARTS. + */ + public static final int MODE_EXACT = 2; + + // molecule being selected over + private final IAtomContainer mol; + + // fast-access mol graph data structures + private final int[][] atomAdj, bondAdj; + private final int[] deg; + + // SMARTS atom and bond expressions + private final String[] aexpr; + private final String[] bexpr; + + // SMARTS traversal/generation + private final int[] avisit; + private final int[] rbnds; + private final int[] rnums; + private int numVisit; + + // which mode should SMARTS be encoded in + private int mode = MODE_EXACT; + + /** + * Create a new instance over the provided molecule. + * + * @param mol molecule + */ + public SmartsFragmentExtractor(IAtomContainer mol) { + this.mol = mol; + + final int numAtoms = mol.getAtomCount(); + final int numBonds = mol.getBondCount(); + + // build fast access + this.deg = new int[numAtoms]; + this.atomAdj = new int[numAtoms][4]; + this.bondAdj = new int[numAtoms][4]; + this.aexpr = new String[numAtoms]; + this.bexpr = new String[numBonds]; + this.avisit = new int[numAtoms]; + this.rbnds = new int[numBonds]; + this.rnums = new int[100]; // max 99 in SMILES/SMARTS + + // index adjacency information and bond expressions for quick + // reference and traversal + for (int bondIdx = 0; bondIdx < numBonds; bondIdx++) { + IBond bond = mol.getBond(bondIdx); + IAtom beg = bond.getAtom(0); + IAtom end = bond.getAtom(1); + int begIdx = mol.getAtomNumber(beg); + int endIdx = mol.getAtomNumber(end); + this.bexpr[bondIdx] = encodeBondExpr(bondIdx, begIdx, endIdx); + + // make sufficient space + if (deg[begIdx] == atomAdj[begIdx].length) { + atomAdj[begIdx] = 
Arrays.copyOf(atomAdj[begIdx], deg[begIdx] + 2); + bondAdj[begIdx] = Arrays.copyOf(bondAdj[begIdx], deg[begIdx] + 2); + } + if (deg[endIdx] == atomAdj[endIdx].length) { + atomAdj[endIdx] = Arrays.copyOf(atomAdj[endIdx], deg[endIdx] + 2); + bondAdj[endIdx] = Arrays.copyOf(bondAdj[endIdx], deg[endIdx] + 2); + } + + atomAdj[begIdx][deg[begIdx]] = endIdx; + bondAdj[begIdx][deg[begIdx]] = bondIdx; + atomAdj[endIdx][deg[endIdx]] = begIdx; + bondAdj[endIdx][deg[endIdx]] = bondIdx; + + deg[begIdx]++; + deg[endIdx]++; + } + + // pre-generate atom expressions + for (int atomIdx = 0; atomIdx < numAtoms; atomIdx++) + this.aexpr[atomIdx] = encodeAtomExpr(atomIdx); + } + + /** + * Set the mode of SMARTS substructure selection + * + * @param mode the mode + */ + public void setMode(int mode) { + // check arg + switch (mode) { + case MODE_EXACT: + case MODE_JCOMPOUNDMAPPER: + break; + default: + throw new IllegalArgumentException("Invalid mode specified!"); + } + this.mode = mode; + + // re-gen atom expressions + int numAtoms = mol.getAtomCount(); + for (int atomIdx = 0; atomIdx < numAtoms; atomIdx++) + this.aexpr[atomIdx] = encodeAtomExpr(atomIdx); + } + + /** + * Generate a SMARTS for the substructure formed of the provided + * atoms. + * + * @param atomIdxs atom indexes + * @return SMARTS, null if an empty array is passed + */ + public String generate(int[] atomIdxs) { + + if (atomIdxs == null) + throw new NullPointerException("No atom indexes provided"); + if (atomIdxs.length == 0) + return null; // makes sense? 
+ + // special case + if (atomIdxs.length == 1 && mode == MODE_EXACT) + return aexpr[atomIdxs[0]]; + + // initialize traversal information + Arrays.fill(rbnds, 0); + Arrays.fill(avisit, 0); + for (int atmIdx : atomIdxs) + avisit[atmIdx] = -1; + + // first visit marks ring information + numVisit = 1; + for (int atomIdx : atomIdxs) { + if (avisit[atomIdx] < 0) + markRings(atomIdx, -1); + } + + // reset visit flags and generate + numVisit = 1; + for (int atmIdx : atomIdxs) + avisit[atmIdx] = -1; + + // second pass builds the expression + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < atomIdxs.length; i++) { + if (avisit[atomIdxs[i]] < 0) { + if (i > 0) sb.append('.'); + encodeExpr(atomIdxs[i], -1, sb); + } + } + + return sb.toString(); + } + + /** + * Recursively marks ring closures (back edges) in the {@link #rbnds} + * array in a depth first order. + * + * @param idx atom index + * @param bprev previous bond + */ + private void markRings(int idx, int bprev) { + avisit[idx] = numVisit++; + final int d = deg[idx]; + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + if (avisit[nbr] == 0 || bidx == bprev) + continue; // ignored + else if (avisit[nbr] < 0) + markRings(nbr, bidx); + else if (avisit[nbr] < avisit[idx]) + rbnds[bidx] = -1; // ring closure + } + } + + /** + * Recursively encodes a SMARTS expression into the provides + * string builder. 
+ * + * @param idx atom index + * @param bprev previous bond + * @param sb destition to write SMARTS to + */ + private void encodeExpr(int idx, int bprev, StringBuilder sb) { + avisit[idx] = numVisit++; + sb.append(aexpr[idx]); + final int d = deg[idx]; + + int remain = d; + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + + // ring open/close + if (rbnds[bidx] < 0) { + // open + final int rnum = chooseRingNumber(); + if (rnum > 9) sb.append('%'); + sb.append(rnum); + rbnds[bidx] = rnum; + } else if (rbnds[bidx] > 0) { + // close + final int rnum = rbnds[bidx]; + releaseRingNumber(rnum); + if (rnum > 9) sb.append('%'); + sb.append(rnum); + } + + if (mode == MODE_EXACT && avisit[nbr] == 0 || + bidx == bprev || + rbnds[bidx] != 0) + remain--; + } + + for (int j = 0; j < d; j++) { + int nbr = atomAdj[idx][j]; + int bidx = bondAdj[idx][j]; + if (mode == MODE_EXACT && avisit[nbr] == 0 || + bidx == bprev || + rbnds[bidx] != 0) + continue; // ignored + remain--; + if (avisit[nbr] == 0) { + // peripheral bond + if (remain > 0) sb.append('('); + sb.append(bexpr[bidx]); + sb.append(mol.getAtom(nbr).isAromatic() ? 'a' : '*'); + if (remain > 0) sb.append(')'); + } else { + if (remain > 0) sb.append('('); + sb.append(bexpr[bidx]); + encodeExpr(nbr, bidx, sb); + if (remain > 0) sb.append(')'); + } + } + } + + /** + * Select the lowest ring number for use in SMARTS. + * + * @return ring number + * @throws IllegalStateException all ring numbers are used + */ + private int chooseRingNumber() { + for (int i = 1; i < rnums.length; i++) { + if (rnums[i] == 0) { + rnums[i] = 1; + return i; + } + } + throw new IllegalStateException("No more ring numbers available!"); + } + + /** + * Releases a ring number allowing it to be reused. + * + * @param rnum ring number + */ + private void releaseRingNumber(int rnum) { + rnums[rnum] = 0; + } + + /** + * Encodes the atom at index (atmIdx) to a SMARTS + * expression that matches itself. 
+ * + * @param atmIdx atom index + * @return SMARTS atom expression + */ + private String encodeAtomExpr(int atmIdx) { + final IAtom atom = mol.getAtom(atmIdx); + + boolean complex = mode == MODE_EXACT; + + StringBuilder sb = new StringBuilder(); + + switch (atom.getAtomicNumber()) { + case 0: // * + sb.append('*'); + break; + case 5: // B + case 6: // C + case 7: // N + case 8: // O + case 15: // P + case 16: // S + case 9: // F + case 17: // Cl + case 35: // Br + case 53: // I + sb.append(atom.isAromatic() ? atom.getSymbol().toLowerCase(Locale.ROOT) + : atom.getSymbol()); + break; + default: + complex = true; + sb.append(atom.isAromatic() ? atom.getSymbol().toLowerCase(Locale.ROOT) + : atom.getSymbol()); + break; + } + + if (mode == MODE_EXACT) { + + int hcount = atom.getImplicitHydrogenCount(); + int valence = hcount; + int connections = hcount; + + int atmDeg = this.deg[atmIdx]; + for (int i = 0; i < atmDeg; i++) { + IBond bond = mol.getBond(bondAdj[atmIdx][i]); + IAtom nbr = bond.getConnectedAtom(atom); + if (nbr.getAtomicNumber() != null && nbr.getAtomicNumber() == 1) + hcount++; + int bord = bond.getOrder() != null ? bond.getOrder().numeric() : 0; + if (bord == 0) + throw new IllegalArgumentException("Molecule had unsupported zero-order or unset bonds!"); + valence += bord; + connections++; + } + + sb.append('H').append(hcount); + sb.append('v').append(valence); + sb.append('X').append(connections); + } + + Integer chg = atom.getFormalCharge(); + if (chg == null) chg = 0; + + + if (chg <= -1 || chg >= +1) { + if (chg >= 0) sb.append('+'); + else sb.append('-'); + int abs = Math.abs(chg); + if (abs > 1) sb.append(abs); + complex = true; + } else if (mode == MODE_EXACT) { + sb.append("+0"); + } + + return complex ? '[' + sb.toString() + ']' : sb.toString(); + } + + /** + * Encodes the bond at index (bondIdx) to a SMARTS + * expression that matches itself. 
+ * + * @param bondIdx bond index + * @param beg atom index of first atom + * @param end atom index of second atom + * @return SMARTS bond expression + */ + private String encodeBondExpr(int bondIdx, int beg, int end) { + IBond bond = mol.getBond(bondIdx); + if (bond.getOrder() == null) + return ""; + + boolean bArom = bond.isAromatic(); + boolean aArom = mol.getAtom(beg).isAromatic() && mol.getAtom(end).isAromatic(); + switch (bond.getOrder()) { + case SINGLE: + if (bArom) { + return aArom ? "" : ":"; + } else { + return aArom ? "-" : ""; + } + case DOUBLE: + return bArom ? "" : "="; + case TRIPLE: + return "#"; + default: + throw new IllegalArgumentException("Unsupported bond type: " + bond.getOrder()); + } + } +} diff --git a/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java new file mode 100644 index 00000000000..7ce2d52aa73 --- /dev/null +++ b/tool/smarts/src/test/java/org/openscience/cdk/smarts/SmartsFragmentExtractorTest.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. All we ask is that proper credit is given + * for our work, which includes - but is not limited to - adding the above + * copyright notice to the beginning of your source code files, and to any + * copyright notice that you may distribute with programs based on this work. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ + +package org.openscience.cdk.smarts; + +import org.junit.Test; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smiles.SmilesParser; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +public class SmartsFragmentExtractorTest { + + private String generate(String smi, int mode, int[] idxs) throws Exception { + SmilesParser smipar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smipar.parseSmiles(smi); + SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol); + subsmarts.setMode(mode); + return subsmarts.generate(idxs); + } + + private static int[] makeSeq(int beg, int to) { + int[] a = new int[to-beg]; + for (int i = 0; i < a.length; i++) + a[i] = beg++; + return a; + } + + @Test + public void methylExact() throws Exception { + String smarts = generate("CC(C)CCC", + SmartsFragmentExtractor.MODE_EXACT, + makeSeq(0,1)); + assertThat(smarts, is("[CH3v4X4+0]")); + } + + @Test + public void methylForJCompoundMap() throws Exception { + String smarts = generate("CC(C)CCC", + SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER, + makeSeq(0,1)); + assertThat(smarts, is("C*")); + } + + @Test + public void indole() throws Exception { + String smarts = generate("[nH]1ccc2c1cccc2", + SmartsFragmentExtractor.MODE_EXACT, + makeSeq(0,4)); + assertThat(smarts, is("[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]")); + } + + @Test + public void indoleForJCompoundMap() throws Exception { + String smarts = generate("[nH]1ccc2c1cccc2", + SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER, + makeSeq(0,4)); + assertThat(smarts, 
is("n(ccc(a)a)a")); + } + + @Test + public void biphenylIncludesSingleBond() throws Exception { + String smarts = generate("c1ccccc1-c1ccccc1", + SmartsFragmentExtractor.MODE_EXACT, + makeSeq(0,12)); + assertThat(smarts, containsString("-")); + } + + @Test + public void fullereneC60() throws Exception { + String smarts = generate("c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41", + SmartsFragmentExtractor.MODE_EXACT, + makeSeq(0,60)); + assertThat(smarts, + is("[cH0v4X3+0]12[cH0v4X3+0]3[cH0v4X3+0]4[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]2[cH0v4X3+0]3[cH0v4X3+0]41")); + } + +} \ No newline at end of file