diff --git a/src/main/org/openscience/cdk/hash/HashGeneratorMaker.java b/src/main/org/openscience/cdk/hash/HashGeneratorMaker.java index 65d4ce72aa3..8cb044c9897 100644 --- a/src/main/org/openscience/cdk/hash/HashGeneratorMaker.java +++ b/src/main/org/openscience/cdk/hash/HashGeneratorMaker.java @@ -2,7 +2,9 @@ import org.openscience.cdk.annotations.TestClass; import org.openscience.cdk.annotations.TestMethod; +import org.openscience.cdk.hash.equivalent.EquivalentSetFinder; import org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSet; +import org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetUnion; import org.openscience.cdk.hash.seed.AtomEncoder; import org.openscience.cdk.hash.seed.BasicAtomEncoder; import org.openscience.cdk.hash.seed.ConjugatedAtomEncoder; @@ -75,7 +77,7 @@ public class HashGeneratorMaker { private List stereoEncoders = new ArrayList(); /* whether we want to use perturbed hash generators */ - private boolean perturbed = false; + private EquivalentSetFinder equivSetFinder = null; /** * Specify the depth of the hash generator. Larger values discriminate more @@ -167,14 +169,61 @@ public HashGeneratorMaker chiral() { } /** - * Discriminate atoms experiencing uniform environments. + * Discriminate atoms experiencing uniform environments. This method uses + * {@link MinimumEquivalentCyclicSet} to break symmetry but depending on + * application one may need a more comprehensive method. Please refer to + * {@link #perturbWith(EquivalentSetFinder)} for further configuration + * details. * * @return fluent API reference (self) - * @throws UnsupportedOperationException not yet implemented + * @see MinimumEquivalentCyclicSet + * @see #perturbWith(EquivalentSetFinder) */ @TestMethod("testPerturbed") public HashGeneratorMaker perturbed() { - perturbed = true; + return perturbWith(new MinimumEquivalentCyclicSet()); + } + + /** + * Discriminate atoms experiencing uniform environments using the provided + * method. 
Depending on the level of identity required one can choose how + * the atoms are perturbed in an attempt to break symmetry. As with all + * hashing there is always a probability of collision but some of these + * collisions may be due to an insufficiency in the algorithm as opposed to a + * random chance of collision. Currently there are three strategies but one + * should choose either to use the fast, but good, heuristic {@link + * MinimumEquivalentCyclicSet} or the exact {@link org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSet}. + * In practice {@link MinimumEquivalentCyclicSet} is good enough for most + * applications but it is important to understand the potential trade off. + * The {@link MinimumEquivalentCyclicSetUnion} is provided for demonstration + * only, and as such, is deprecated. + * + * + * + * At the time of writing (Feb, 2013) the number of known false positives + * found in PubChem-Compound (approx. 46,000,000 structures) is as follows: + * + * + * + * @param equivSetFinder equivalent set finder, used to determine which + * atoms will be perturbed to try and break symmetry. 
+ * @return fluent API reference (self) + * @see org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSet + * @see MinimumEquivalentCyclicSet + * @see MinimumEquivalentCyclicSetUnion + */ + @TestMethod("testPerturbedWith") + public HashGeneratorMaker perturbWith(EquivalentSetFinder equivSetFinder) { + this.equivSetFinder = equivSetFinder; return this; } @@ -266,11 +315,11 @@ public AtomHashGenerator atomic() { AtomEncoder encoder = new ConjugatedAtomEncoder(encoders); - if (perturbed) { + if (equivSetFinder != null) { return new PerturbedAtomHashGenerator(new SeedGenerator(encoder), new Xorshift(), makeStereoEncoderFactory(), - new MinimumEquivalentCyclicSet(), + equivSetFinder, depth); } diff --git a/src/main/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSet.java b/src/main/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSet.java new file mode 100644 index 00000000000..430e6a0d884 --- /dev/null +++ b/src/main/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSet.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2013. John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * All we ask is that proper credit is given for our work, which includes + * - but is not limited to - adding the above copyright notice to the beginning + * of your source code files, and to any copyright notice that you may distribute + * with programs based on this work. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +package org.openscience.cdk.hash.equivalent; + +import org.openscience.cdk.annotations.TestClass; +import org.openscience.cdk.annotations.TestMethod; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.ringsearch.RingSearch; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +/** + * Finds the set of equivalent values that are members of a ring. This class is + * intended to drive the systematic perturbation of the {@link + * org.openscience.cdk.hash.PerturbedAtomHashGenerator}. This {@link + * EquivalentSetFinder} provides the highest probability of avoiding collisions due + * to uniform atom environments but is much more demanding than the simpler + * {@link MinimumEquivalentCyclicSet}. + * + *


The easiest way to use this class is with the {@link + * org.openscience.cdk.hash.HashGeneratorMaker}. + *

+ * MoleculeHashGenerator generator =
+ *   new HashGeneratorMaker().depth(6)
+ *                           .elemental()
+ *                           .perturbWith(new AllEquivalentCyclicSet())
+ *                           .molecular();
+ * 
+ * + * @author John May + * @cdk.module hash + * @see org.openscience.cdk.hash.PerturbedAtomHashGenerator + * @see MinimumEquivalentCyclicSet + * @see MinimumEquivalentCyclicSetUnion + */ +@TestClass("org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSetTest") +public final class AllEquivalentCyclicSet implements EquivalentSetFinder { + + /** + * @inheritDoc + */ + @Override + @TestMethod("testFind,testFind_Distinct,testScenario") + public Set find(long[] invariants, IAtomContainer container, int[][] graph) { + + int n = invariants.length; + + // find cyclic vertices using DFS + RingSearch ringSearch = new RingSearch(container, graph); + + // ordered map of the set of vertices for each value + Map> equivalent = new TreeMap>(); + + // divide the invariants into equivalent indexed and ordered sets + for (int i = 0; i < invariants.length; i++) { + + Long invariant = invariants[i]; + Set set = equivalent.get(invariant); + + if (set == null) { + if (ringSearch.cyclic(i)) { + set = new HashSet(n / 2); + set.add(i); + equivalent.put(invariant, set); + } + } else { + set.add(i); + } + } + + // find the smallest set of equivalent cyclic vertices + Set set = new TreeSet(); + for (Map.Entry> e : equivalent.entrySet()) { + Set vertices = e.getValue(); + if (vertices.size() > 1) { + set.addAll(vertices); + } + } + + return set; + } +} diff --git a/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSet.java b/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSet.java index cfd7ba076eb..26810bfe4fa 100644 --- a/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSet.java +++ b/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSet.java @@ -15,17 +15,40 @@ * Finds the smallest set of equivalent values are members of a ring. If there * are multiple smallest sets then the set with the lowest invariant value is * returned. 
This class is intended to drive the systematic perturbation of the - * {@link org.openscience.cdk.hash.PerturbedAtomHashGenerator}. The method is - * different from the original publication {@cdk.cite Ihlenfeldt93} where only - * non-terminally removable vertices are considered. The method differs as it - * allows us to make the code more modular. In reality, ring perception - * provided by {@link RingSearch} is very computationally cheap.


A - * alternative and (potentially) more robust way may be use the union of all - * minimum equivalent cyclic sets. + * {@link org.openscience.cdk.hash.PerturbedAtomHashGenerator}. + * + * This method will not distinguish all possible molecules but represents a good + * enough approximation to quickly narrow down an identity search. At the time + * of writing (Feb, 2013) there are only 128 molecules (64 false positives) in + * PubChem-Compound (46E6 molecules) which are not separated. In many data sets + * this method will suffice however the exact {@link AllEquivalentCyclicSet} is + * provided.


+ * + * This method is currently the default used by the {@link + * org.openscience.cdk.hash.HashGeneratorMaker} but can also be explicitly + * specified.

+ *
+ * MoleculeHashGenerator generator =
+ *   new HashGeneratorMaker().depth(6)
+ *                           .elemental()
+ *                           .perturbed() // use this class by default
+ *                           .molecular();
+ *
+ * // explicitly specify the method
+ * MoleculeHashGenerator generator =
+ *   new HashGeneratorMaker().depth(6)
+ *                           .elemental()
+ *                           .perturbWith(new MinimumEquivalentCyclicSet())
+ *                           .molecular();
+ * 
+ *
* * @author John May * @cdk.module hash * @see org.openscience.cdk.hash.PerturbedAtomHashGenerator + * @see MinimumEquivalentCyclicSetUnion + * @see AllEquivalentCyclicSet + * @see org.openscience.cdk.hash.HashGeneratorMaker */ @TestClass("org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetTest") public final class MinimumEquivalentCyclicSet implements EquivalentSetFinder { @@ -48,8 +71,8 @@ public Set find(long[] invariants, IAtomContainer container, int[][] gr // divide the invariants into equivalent indexed and ordered sets for (int i = 0; i < invariants.length; i++) { - Long invariant = invariants[i]; - Set set = equivalent.get(invariant); + Long invariant = invariants[i]; + Set set = equivalent.get(invariant); if (set == null) { if (ringSearch.cyclic(i)) { @@ -63,12 +86,12 @@ public Set find(long[] invariants, IAtomContainer container, int[][] gr } // find the smallest set of equivalent cyclic vertices - int minSize = Integer.MAX_VALUE; - Set min = Collections.emptySet(); + int minSize = Integer.MAX_VALUE; + Set min = Collections.emptySet(); for (Map.Entry> e : equivalent.entrySet()) { Set vertices = e.getValue(); - if (vertices.size() < minSize) { - min = vertices; + if (vertices.size() < minSize && vertices.size() > 1) { + min = vertices; minSize = vertices.size(); } } diff --git a/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnion.java b/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnion.java new file mode 100644 index 00000000000..f5ffb6269c8 --- /dev/null +++ b/src/main/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnion.java @@ -0,0 +1,110 @@ +package org.openscience.cdk.hash.equivalent; + +import org.openscience.cdk.annotations.TestClass; +import org.openscience.cdk.annotations.TestMethod; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.ringsearch.RingSearch; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; 
+import java.util.Set; +import java.util.TreeMap; + +/** + * The union of all the smallest sets of equivalent values that are members of a ring. + * This class is intended to drive the systematic perturbation of the {@link + * org.openscience.cdk.hash.PerturbedAtomHashGenerator}. The method is more + * comprehensive than a single {@link MinimumEquivalentCyclicSet} and not as + * computationally demanding as {@link AllEquivalentCyclicSet}. In reality one + * should choose either to use the fast (but good) heuristic {@link + * MinimumEquivalentCyclicSet} or the exact {@link AllEquivalentCyclicSet}. This + * method is provided for demonstration only. + * + * As with the {@link MinimumEquivalentCyclicSet} perturbation, this method does + * not guarantee that all molecules will be distinguished. At the time of + * writing (Feb 2013) there are only 8 structures in PubChem-Compound which need + * the more comprehensive perturbation method ({@link AllEquivalentCyclicSet}); + * these are listed below. + * + * + * + * + * + *
CID 144432 + * and CID 15584856
CID 138898 + * and CID 241107
CID 9990759 + * and CID 10899923
CID 5460768 + * and CID 20673269
+ * + *


The easiest way to use this class is with the {@link + * org.openscience.cdk.hash.HashGeneratorMaker}. + *

+ * MoleculeHashGenerator generator =
+ *   new HashGeneratorMaker().depth(6)
+ *                           .elemental()
+ *                           .perturbWith(new MinimumEquivalentCyclicSetUnion())
+ *                           .molecular();
+ * 
+ * + * @author John May + * @cdk.module hash + * @see org.openscience.cdk.hash.PerturbedAtomHashGenerator + * @see MinimumEquivalentCyclicSet + * @see AllEquivalentCyclicSet + * @deprecated provided for to demonstrate a relatively robust but ultimately + * incomplete approach + */ +@Deprecated +@TestClass("org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetUnionTest") +public final class MinimumEquivalentCyclicSetUnion + implements EquivalentSetFinder { + + /** + * @inheritDoc + */ + @TestMethod("testFind,testFind_Distinct,testScenario") + @Override + public Set find(long[] invariants, IAtomContainer container, int[][] graph) { + + int n = invariants.length; + + // find cyclic vertices using DFS + RingSearch ringSearch = new RingSearch(container, graph); + + // ordered map of the set of vertices for each value + Map> equivalent = new TreeMap>(); + + // divide the invariants into equivalent indexed and ordered sets + for (int i = 0; i < invariants.length; i++) { + + Long invariant = invariants[i]; + Set set = equivalent.get(invariant); + + if (set == null) { + if (ringSearch.cyclic(i)) { + set = new HashSet(n / 2); + set.add(i); + equivalent.put(invariant, set); + } + } else { + set.add(i); + } + } + + // find the smallest set of equivalent cyclic vertices + int minSize = Integer.MAX_VALUE; + Set min = Collections.emptySet(); + for (Map.Entry> e : equivalent.entrySet()) { + Set vertices = e.getValue(); + if (vertices.size() < minSize && vertices.size() > 1) { + min = vertices; + minSize = vertices.size(); + } else if (vertices.size() == minSize) { + min.addAll(vertices); + } + } + + return min; + } +} diff --git a/src/test/org/openscience/cdk/hash/HashGeneratorMakerTest.java b/src/test/org/openscience/cdk/hash/HashGeneratorMakerTest.java index b8d63ad54f5..25b6a06e206 100644 --- a/src/test/org/openscience/cdk/hash/HashGeneratorMakerTest.java +++ b/src/test/org/openscience/cdk/hash/HashGeneratorMakerTest.java @@ -26,6 +26,7 @@ import org.junit.Assert; 
import org.junit.Test; +import org.openscience.cdk.hash.equivalent.EquivalentSetFinder; import org.openscience.cdk.hash.seed.AtomEncoder; import org.openscience.cdk.hash.seed.ConjugatedAtomEncoder; import org.openscience.cdk.hash.stereo.factory.StereoEncoderFactory; @@ -36,6 +37,7 @@ import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.not; +import static org.hamcrest.CoreMatchers.sameInstance; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; @@ -115,6 +117,20 @@ public class HashGeneratorMakerTest { assertTrue(g1 instanceof PerturbedAtomHashGenerator); } + @Test public void testPerturbedWith() throws NoSuchFieldException, + IllegalAccessException { + EquivalentSetFinder mock = mock(EquivalentSetFinder.class); + AtomHashGenerator g1 = new HashGeneratorMaker().depth(0) + .elemental() + .perturbWith(mock) + .atomic(); + + assertTrue(g1 instanceof PerturbedAtomHashGenerator); + Field field = g1.getClass().getDeclaredField("finder"); + field.setAccessible(true); + assertThat((EquivalentSetFinder) field.get(g1), is(sameInstance(mock))); + } + @Test public void testOrdering() { AtomHashGenerator g1 = new HashGeneratorMaker().depth(0) diff --git a/src/test/org/openscience/cdk/hash/PerturbedAtomHashGeneratorTest.java b/src/test/org/openscience/cdk/hash/PerturbedAtomHashGeneratorTest.java index 7f129c6ac8b..46757fa114c 100644 --- a/src/test/org/openscience/cdk/hash/PerturbedAtomHashGeneratorTest.java +++ b/src/test/org/openscience/cdk/hash/PerturbedAtomHashGeneratorTest.java @@ -29,7 +29,9 @@ import org.openscience.cdk.Atom; import org.openscience.cdk.AtomContainer; import org.openscience.cdk.Bond; +import org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSet; import org.openscience.cdk.hash.seed.BasicAtomEncoder; +import org.openscience.cdk.hash.stereo.factory.StereoEncoderFactory; import org.openscience.cdk.interfaces.IAtom; import 
org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; @@ -59,6 +61,8 @@ public void testGenerate() throws Exception { 8)); MoleculeHashGenerator perturb = new BasicMoleculeHashGenerator(new PerturbedAtomHashGenerator(seeding, pseudorandom, + StereoEncoderFactory.EMPTY, + new MinimumEquivalentCyclicSet(), 8)); // basic encoding should say these are the same assertThat(basic.generate(m1), is(basic.generate(m2))); @@ -74,6 +78,8 @@ public void testCombine() throws Exception { Xorshift prng = new Xorshift(); PerturbedAtomHashGenerator generator = new PerturbedAtomHashGenerator(new SeedGenerator(BasicAtomEncoder.ATOMIC_NUMBER), prng, + StereoEncoderFactory.EMPTY, + new MinimumEquivalentCyclicSet(), 8); long[][] perturbed = new long[][]{ {1, 2, 3, 4}, diff --git a/src/test/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSetTest.java b/src/test/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSetTest.java new file mode 100644 index 00000000000..3bb0df2f9d1 --- /dev/null +++ b/src/test/org/openscience/cdk/hash/equivalent/AllEquivalentCyclicSetTest.java @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2013. John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * All we ask is that proper credit is given for our work, which includes + * - but is not limited to - adding the above copyright notice to the beginning + * of your source code files, and to any copyright notice that you may distribute + * with programs based on this work. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U + */ + +package org.openscience.cdk.hash.equivalent; + +import org.junit.Test; +import org.openscience.cdk.Atom; +import org.openscience.cdk.AtomContainer; +import org.openscience.cdk.Bond; +import org.openscience.cdk.hash.HashGeneratorMaker; +import org.openscience.cdk.hash.MoleculeHashGenerator; +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; + +import java.util.Set; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.not; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + +/** + * @author John May + * @cdk.module test-hash + */ +public class AllEquivalentCyclicSetTest { + + @Test + public void testFind() throws Exception { + IAtomContainer dummy = mock(IAtomContainer.class); + int[][] g = new int[][]{{1, 5, 6}, + {0, 2}, + {1, 3}, + {2, 4, 7}, + {3, 5}, + {0, 4}, + {0}, + {3}}; + + // this mock the invariants + long[] values = new long[]{ + 1, + 0, + 0, + 1, + 0, + 0, + 2, + 2 + }; + + EquivalentSetFinder finder = new AllEquivalentCyclicSet(); + Set set = finder.find(values, dummy, g); + + assertThat(set.size(), is(6)); + // the first size vertex are all in a cycle + assertTrue(set.contains(0)); + assertTrue(set.contains(1)); + assertTrue(set.contains(2)); + assertTrue(set.contains(3)); + assertTrue(set.contains(4)); + assertTrue(set.contains(5)); + + } + + @Test + public void testFind_Distinct() throws Exception { + IAtomContainer dummy = mock(IAtomContainer.class); + int[][] g = new int[][]{{1, 5, 6}, + {0, 2}, + {1, 3}, + {2, 4, 7}, + {3, 5}, + {0, 4}, + {0}, + {3}}; + 
+ // all values distinct + long[] values = new long[]{ + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17 + }; + + EquivalentSetFinder finder = new AllEquivalentCyclicSet(); + Set set = finder.find(values, dummy, g); + + assertThat(set.size(), is(0)); + } + + /** + * Test the method at perturbing the 2D representations of CID 138898 and + * CID 241107. These molecules are very similar but distinct. To tell these + * apart we must use {@link AllEquivalentCyclicSet} opposed to the faster + * methods. This test also serves to demonstrates the basic equivalent set + * finder does not tell them apart but that this more complex finder does. + */ + @Test + public void testScenario() throws Exception { + + IAtomContainer cid138898 = cid138898(); + IAtomContainer cid241107 = cid241107(); + + MoleculeHashGenerator basic = new HashGeneratorMaker().depth(12) + .elemental() + .perturbed() + .molecular(); + // basic equivalence method can't tell these apart + assertThat(basic.generate(cid138898), is(basic.generate(cid241107))); + + MoleculeHashGenerator cmplx = new HashGeneratorMaker().depth(12) + .elemental() + .perturbWith(new AllEquivalentCyclicSet()) + .molecular(); + + // complex equivalence method can tell these apart + assertThat(cmplx.generate(cid138898), is(not(cmplx.generate(cid241107)))); + + } + + /** + * PubChem-Compound CID 241107 CC12CC3(SC(S3)(CC(S1)(S2)C)C)C + * + * @cdk.inchi InChI=1S/C10H16S4/c1-7-5-8(2)13-10(4,14-8)6-9(3,11-7)12-7/h5-6H2,1-4H3 + */ + private IAtomContainer cid241107() { + IAtomContainer m = new AtomContainer(14, 16, 0, 0); + IAtom[] as = new IAtom[]{ + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("S"), + new Atom("C"), + new Atom("S"), + new Atom("C"), + new Atom("C"), + new Atom("S"), + new Atom("S"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + }; + IBond[] bs = new IBond[]{ + new Bond(as[1], as[0]), + new Bond(as[2], as[1]), + new Bond(as[3], as[2]), + new Bond(as[4], as[3]), + new Bond(as[5], as[4]), + 
new Bond(as[6], as[5]), + new Bond(as[6], as[3]), + new Bond(as[7], as[5]), + new Bond(as[8], as[7]), + new Bond(as[9], as[8]), + new Bond(as[9], as[1]), + new Bond(as[10], as[8]), + new Bond(as[10], as[1]), + new Bond(as[11], as[8]), + new Bond(as[12], as[5]), + new Bond(as[13], as[3]), + }; + m.setAtoms(as); + m.setBonds(bs); + return m; + } + + /** + * PubChem-Compound CID 138898 CC12CC3(SC(S1)(CC(S2)(S3)C)C)C + * + * @cdk.inchi InChI=1S/C10H16S4/c1-7-5-8(2)13-9(3,11-7)6-10(4,12-7)14-8/h5-6H2,1-4H3 + */ + private IAtomContainer cid138898() { + IAtomContainer m = new AtomContainer(14, 16, 0, 0); + IAtom[] as = new IAtom[]{ + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("S"), + new Atom("C"), + new Atom("S"), + new Atom("C"), + new Atom("C"), + new Atom("S"), + new Atom("S"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + }; + IBond[] bs = new IBond[]{ + new Bond(as[1], as[0]), + new Bond(as[2], as[1]), + new Bond(as[3], as[2]), + new Bond(as[4], as[3]), + new Bond(as[5], as[4]), + new Bond(as[6], as[5]), + new Bond(as[6], as[1]), + new Bond(as[7], as[5]), + new Bond(as[8], as[7]), + new Bond(as[9], as[8]), + new Bond(as[9], as[1]), + new Bond(as[10], as[8]), + new Bond(as[10], as[3]), + new Bond(as[11], as[8]), + new Bond(as[12], as[5]), + new Bond(as[13], as[3]), + }; + m.setAtoms(as); + m.setBonds(bs); + return m; + } + +} diff --git a/src/test/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnionTest.java b/src/test/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnionTest.java new file mode 100644 index 00000000000..45a115eae80 --- /dev/null +++ b/src/test/org/openscience/cdk/hash/equivalent/MinimumEquivalentCyclicSetUnionTest.java @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2013. 
John May + * + * Contact: cdk-devel@lists.sourceforge.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * All we ask is that proper credit is given for our work, which includes + * - but is not limited to - adding the above copyright notice to the beginning + * of your source code files, and to any copyright notice that you may distribute + * with programs based on this work. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +package org.openscience.cdk.hash.equivalent; + +import org.junit.Test; +import org.openscience.cdk.Atom; +import org.openscience.cdk.AtomContainer; +import org.openscience.cdk.Bond; +import org.openscience.cdk.hash.HashGeneratorMaker; +import org.openscience.cdk.hash.MoleculeHashGenerator; +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; + +import java.util.Set; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.CoreMatchers.not; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.openscience.cdk.interfaces.IBond.Order.DOUBLE; + +/** + * @author John May + * @cdk.module test-hash + */ +public class MinimumEquivalentCyclicSetUnionTest { + + @Test + public void testFind() throws 
Exception { + IAtomContainer dummy = mock(IAtomContainer.class); + int[][] g = new int[][]{{1, 5, 6}, + {0, 2}, + {1, 3}, + {2, 4, 7}, + {3, 5}, + {0, 4}, + {0}, + {3}}; + + // mock invariants: cyclic vertices {0, 3} share value 1 and {2, 4} share value 3 + long[] values = new long[]{ + 1, + 4, + 3, + 1, + 3, + 5, + 7, + 8 + }; + + EquivalentSetFinder finder = new MinimumEquivalentCyclicSetUnion(); + Set<Integer> set = finder.find(values, dummy, g); + + assertThat(set.size(), is(4)); + // both minimum-size (2) equivalent cyclic sets are merged: {0, 3} and {2, 4} + assertTrue(set.contains(0)); + assertTrue(set.contains(2)); + assertTrue(set.contains(3)); + assertTrue(set.contains(4)); + } + + @Test + public void testFind_Distinct() throws Exception { + IAtomContainer dummy = mock(IAtomContainer.class); + int[][] g = new int[][]{{1, 5, 6}, + {0, 2}, + {1, 3}, + {2, 4, 7}, + {3, 5}, + {0, 4}, + {0}, + {3}}; + + // mock invariants: all values distinct, so no equivalent set has more than one member + long[] values = new long[]{ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + }; + + EquivalentSetFinder finder = new MinimumEquivalentCyclicSetUnion(); + Set<Integer> set = finder.find(values, dummy, g); + + assertThat(set.size(), is(0)); + } + + /** + * Test the method at perturbing the 2D representations of CID 44333798 and + * CID 57170558. These molecules are similar but distinct. To tell these + * apart we must use {@link MinimumEquivalentCyclicSetUnion} as opposed to the + * faster method. This test serves to demonstrate that the basic equivalent set + * finder does not tell them apart but that a more comprehensive set finder + * does. 
+ */ + @Test public void testScenario() { + + IAtomContainer cid44333798 = cid44333798(); + IAtomContainer cid57170558 = cid57170558(); + + MoleculeHashGenerator basic = new HashGeneratorMaker().depth(12) + .elemental() + .perturbed() + .molecular(); + // basic equivalence method can't tell these apart + assertThat(basic.generate(cid44333798), is(basic.generate(cid57170558))); + + MoleculeHashGenerator cmplx = new HashGeneratorMaker().depth(12) + .elemental() + .perturbWith(new MinimumEquivalentCyclicSetUnion()) + .molecular(); + + // complex equivalence method can tell these apart + assertThat(cmplx.generate(cid44333798), is(not(cmplx.generate(cid57170558)))); + } + + /** + * PubChem-Compound CID 44333798 CC1=CC=C(C=C1)N2C3CCC2CC3 + * + * @cdk.inchi InChI=1S/C13H17N/c1-10-2-4-11(5-3-10)14-12-6-7-13(14)9-8-12/h2-5,12-13H,6-9H2,1H3 + */ + private IAtomContainer cid44333798() { + IAtomContainer m = new AtomContainer(14, 16, 0, 0); + IAtom[] as = new IAtom[]{ + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("N"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + }; + IBond[] bs = new IBond[]{ + new Bond(as[1], as[0]), + new Bond(as[2], as[1], DOUBLE), + new Bond(as[3], as[2]), + new Bond(as[4], as[3], DOUBLE), + new Bond(as[5], as[4]), + new Bond(as[6], as[5], DOUBLE), + new Bond(as[6], as[1]), + new Bond(as[7], as[4]), + new Bond(as[8], as[7]), + new Bond(as[9], as[8]), + new Bond(as[10], as[9]), + new Bond(as[11], as[10]), + new Bond(as[11], as[7]), + new Bond(as[12], as[11]), + new Bond(as[13], as[12]), + new Bond(as[13], as[8]), + }; + m.setAtoms(as); + m.setBonds(bs); + return m; + } + + /** + * PubChem-Compound CID 57170558 CC1=CC=C(C=C1)N(C2CC2)C3CC3 + * + * @cdk.inchi InChI=1S/C13H17N/c1-10-2-4-11(5-3-10)14(12-6-7-12)13-8-9-13/h2-5,12-13H,6-9H2,1H3 + */ + private IAtomContainer cid57170558() { + IAtomContainer m = new AtomContainer(14, 16, 0, 0); + IAtom[] as = new IAtom[]{ + new 
Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("N"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + new Atom("C"), + }; + IBond[] bs = new IBond[]{ + new Bond(as[1], as[0]), + new Bond(as[2], as[1], DOUBLE), + new Bond(as[3], as[2]), + new Bond(as[4], as[3], DOUBLE), + new Bond(as[5], as[4]), + new Bond(as[6], as[5], DOUBLE), + new Bond(as[6], as[1]), + new Bond(as[7], as[4]), + new Bond(as[8], as[7]), + new Bond(as[9], as[8]), + new Bond(as[10], as[9]), + new Bond(as[10], as[8]), + new Bond(as[11], as[7]), + new Bond(as[12], as[11]), + new Bond(as[13], as[12]), + new Bond(as[13], as[11]), + }; + m.setAtoms(as); + m.setBonds(bs); + return m; + } +}