Skip to content

Commit

Permalink
alternate methods to breaking the symmetry of uniform hashed atom env…
Browse files Browse the repository at this point in the history
…ironments.

Signed-off-by: Egon Willighagen <egonw@users.sourceforge.net>
  • Loading branch information
johnmay authored and egonw committed Apr 19, 2013
1 parent 74de143 commit d9f6722
Show file tree
Hide file tree
Showing 8 changed files with 809 additions and 19 deletions.
61 changes: 55 additions & 6 deletions src/main/org/openscience/cdk/hash/HashGeneratorMaker.java
Expand Up @@ -2,7 +2,9 @@

import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.hash.equivalent.EquivalentSetFinder;
import org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSet;
import org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetUnion;
import org.openscience.cdk.hash.seed.AtomEncoder;
import org.openscience.cdk.hash.seed.BasicAtomEncoder;
import org.openscience.cdk.hash.seed.ConjugatedAtomEncoder;
Expand Down Expand Up @@ -75,7 +77,7 @@ public class HashGeneratorMaker {
private List<StereoEncoderFactory> stereoEncoders = new ArrayList<StereoEncoderFactory>();

/* whether we want to use perturbed hash generators */
private boolean perturbed = false;
private EquivalentSetFinder equivSetFinder = null;

/**
* Specify the depth of the hash generator. Larger values discriminate more
Expand Down Expand Up @@ -167,14 +169,61 @@ public HashGeneratorMaker chiral() {
}

/**
* Discriminate atoms experiencing uniform environments.
* Discriminate atoms experiencing uniform environments. This method uses
* {@link MinimumEquivalentCyclicSet} to break symmetry but depending on
* application one may need a more comprehensive method. Please refer to
* {@link #perturbWith(EquivalentSetFinder)} for further configuration
* details.
*
* @return fluent API reference (self)
* @throws UnsupportedOperationException not yet implemented
* @see MinimumEquivalentCyclicSet
* @see #perturbWith(EquivalentSetFinder)
*/
@TestMethod("testPerturbed")
public HashGeneratorMaker perturbed() {
perturbed = true;
return perturbWith(new MinimumEquivalentCyclicSet());
}

/**
* Discriminate atoms experiencing uniform environments using the provided
* method. Depending on the level of identity required one can choose how
* the atoms are perturbed in an attempt to break symmetry. As with all
* hashing there is always a probability of collision but some of these
* collisions may be due to an insufficiency in the algorithm as opposed to
* a random chance of collision. Currently there are three strategies but
* one should choose either to use the fast, but good, heuristic {@link
* MinimumEquivalentCyclicSet} or the exact {@link org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSet}.
* In practice {@link MinimumEquivalentCyclicSet} is good enough for most
* applications but it is important to understand the potential trade off.
* The {@link MinimumEquivalentCyclicSetUnion} is provided for demonstration
* only, and as such, is deprecated.
*
* <ul> <li>MinimumEquivalentCyclicSet - fastest, attempts to break symmetry
* by changing a single smallest set of the equivalent atoms which occur in
* a ring</li> <li><strike>MinimumEquivalentCyclicSetUnion</strike>
* (deprecated) - distinguishes more molecules by changing all smallest sets
* of the equivalent atoms which occur in a ring. This method is provided
* for example only</li> <li>AllEquivalentCyclicSet - slowest,
* systematically perturbs all equivalent atoms that occur in a ring</li>
* </ul>
*
* At the time of writing (Feb, 2013) the number of known false positives
* found in PubChem-Compound (approx. 46,000,000 structures) are as follows:
*
* <ul> <li>MinimumEquivalentCyclicSet - 128 molecules, 64 false positives
* (128/2)</li> <li>MinimumEquivalentCyclicSetUnion - 8 molecules, 4 false
* positives (8/2)</li> <li>AllEquivalentCyclicSet - 0 molecules</li> </ul>
*
* The finder is stored as given; this method performs no validation of the
* argument and replaces any finder configured by an earlier call.
*
* @param equivSetFinder equivalent set finder, used to determine which
* atoms will be perturbed to try and break symmetry.
* @return fluent API reference (self)
* @see org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSet
* @see MinimumEquivalentCyclicSet
* @see MinimumEquivalentCyclicSetUnion
*/
@TestMethod("testPerturbedWith")
public HashGeneratorMaker perturbWith(EquivalentSetFinder equivSetFinder) {
// remember the strategy; it is only consulted later when a generator is built
this.equivSetFinder = equivSetFinder;
return this;
}

Expand Down Expand Up @@ -266,11 +315,11 @@ public AtomHashGenerator atomic() {

AtomEncoder encoder = new ConjugatedAtomEncoder(encoders);

if (perturbed) {
if (equivSetFinder != null) {
return new PerturbedAtomHashGenerator(new SeedGenerator(encoder),
new Xorshift(),
makeStereoEncoderFactory(),
new MinimumEquivalentCyclicSet(),
equivSetFinder,
depth);
}

Expand Down
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2013. John May <jwmay@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
*/

package org.openscience.cdk.hash.equivalent;

import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.ringsearch.RingSearch;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Finds the set of equivalent values that are members of a ring. This class is
 * intended to drive the systematic perturbation of the {@link
 * org.openscience.cdk.hash.PerturbedAtomHashGenerator}. This {@link
 * EquivalentSetFinder} provides the highest probability of avoiding collisions
 * due to uniform atom environments but is much more demanding than the simpler
 * {@link MinimumEquivalentCyclicSet}.
 *
 * <p>The easiest way to use this class is with the {@link
 * org.openscience.cdk.hash.HashGeneratorMaker}.
 * <blockquote><pre>
 * MoleculeHashGenerator generator =
 *   new HashGeneratorMaker().depth(6)
 *                           .elemental()
 *                           .perturbWith(new AllEquivalentCyclicSet())
 *                           .molecular();
 * </pre></blockquote>
 *
 * @author John May
 * @cdk.module hash
 * @see org.openscience.cdk.hash.PerturbedAtomHashGenerator
 * @see MinimumEquivalentCyclicSet
 * @see MinimumEquivalentCyclicSetUnion
 */
@TestClass("org.openscience.cdk.hash.equivalent.AllEquivalentCyclicSetTest")
public final class AllEquivalentCyclicSet implements EquivalentSetFinder {

    /**
     * Collects every cyclic vertex whose invariant value is shared with at
     * least one other cyclic vertex.
     *
     * {@inheritDoc}
     *
     * @param invariants invariant value of each vertex, indexed by vertex
     * @param container  the structure the vertices belong to, handed to the
     *                   {@link RingSearch}
     * @param graph      adjacency list representation of the container,
     *                   handed to the {@link RingSearch}
     * @return ascending-ordered set of the indices of all cyclic vertices
     *         which share their invariant with another cyclic vertex; empty
     *         if there are none
     */
    @Override
    @TestMethod("testFind,testFind_Distinct,testScenario")
    public Set<Integer> find(long[] invariants, IAtomContainer container, int[][] graph) {

        int n = invariants.length;

        // find cyclic vertices using DFS
        RingSearch ringSearch = new RingSearch(container, graph);

        // ordered map of the set of vertices for each value
        Map<Long, Set<Integer>> equivalent = new TreeMap<Long, Set<Integer>>();

        // divide the invariants of the cyclic vertices into equivalent sets
        for (int i = 0; i < n; i++) {

            // Every vertex is tested for ring membership. Testing only the
            // vertex which creates a set would let acyclic vertices join an
            // existing set (or be dropped) depending purely on the order in
            // which the vertices are visited.
            if (!ringSearch.cyclic(i)) {
                continue;
            }

            Long invariant = invariants[i];
            Set<Integer> set = equivalent.get(invariant);

            if (set == null) {
                set = new HashSet<Integer>(n / 2);
                equivalent.put(invariant, set);
            }

            set.add(i);
        }

        // union of all sets of equivalent cyclic vertices, a vertex with a
        // unique invariant has no symmetry to break and is left out
        Set<Integer> union = new TreeSet<Integer>();
        for (Map.Entry<Long, Set<Integer>> e : equivalent.entrySet()) {
            Set<Integer> vertices = e.getValue();
            if (vertices.size() > 1) {
                union.addAll(vertices);
            }
        }

        return union;
    }
}
Expand Up @@ -15,17 +15,40 @@
* Finds the smallest set of equivalent values that are members of a ring. If there
* are multiple smallest sets then the set with the lowest invariant value is
* returned. This class is intended to drive the systematic perturbation of the
* {@link org.openscience.cdk.hash.PerturbedAtomHashGenerator}. The method is
* different from the original publication {@cdk.cite Ihlenfeldt93} where only
* non-terminally removable vertices are considered. The method differs as it
* allows us to make the code more modular. In reality, ring perception
* provided by {@link RingSearch} is very computationally cheap. <p/><br/> A
* alternative and (potentially) more robust way may be use the union of all
* minimum equivalent cyclic sets.
* {@link org.openscience.cdk.hash.PerturbedAtomHashGenerator}.
*
* This method will not distinguish all possible molecules but represents a good
* enough approximation to quickly narrow down an identity search. At the time
* of writing (Feb, 2013) there are only 128 molecules (64 false positives) in
* PubChem-Compound (46E6 molecules) which are not separated. In many data sets
* this method will suffice however the exact {@link AllEquivalentCyclicSet} is
* provided. <p/><br/>
*
* This method is currently the default used by the {@link
* org.openscience.cdk.hash.HashGeneratorMaker} but can also be explicitly
* specified. <blockquote>
* <pre>
* MoleculeHashGenerator generator =
* new HashGeneratorMaker().depth(6)
* .elemental()
* .perturbed() // use this class by default
* .molecular();
*
* // explicitly specify the method
* MoleculeHashGenerator generator =
* new HashGeneratorMaker().depth(6)
* .elemental()
* .perturbWith(new MinimumEquivalentCyclicSet())
* .molecular();
* </pre>
* </blockquote>
*
* @author John May
* @cdk.module hash
* @see org.openscience.cdk.hash.PerturbedAtomHashGenerator
* @see MinimumEquivalentCyclicSetUnion
* @see AllEquivalentCyclicSet
* @see org.openscience.cdk.hash.HashGeneratorMaker
*/
@TestClass("org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetTest")
public final class MinimumEquivalentCyclicSet implements EquivalentSetFinder {
Expand All @@ -48,8 +71,8 @@ public Set<Integer> find(long[] invariants, IAtomContainer container, int[][] gr
// divide the invariants into equivalent indexed and ordered sets
for (int i = 0; i < invariants.length; i++) {

Long invariant = invariants[i];
Set<Integer> set = equivalent.get(invariant);
Long invariant = invariants[i];
Set<Integer> set = equivalent.get(invariant);

if (set == null) {
if (ringSearch.cyclic(i)) {
Expand All @@ -63,12 +86,12 @@ public Set<Integer> find(long[] invariants, IAtomContainer container, int[][] gr
}

// find the smallest set of equivalent cyclic vertices
int minSize = Integer.MAX_VALUE;
Set<Integer> min = Collections.emptySet();
int minSize = Integer.MAX_VALUE;
Set<Integer> min = Collections.emptySet();
for (Map.Entry<Long, Set<Integer>> e : equivalent.entrySet()) {
Set<Integer> vertices = e.getValue();
if (vertices.size() < minSize) {
min = vertices;
if (vertices.size() < minSize && vertices.size() > 1) {
min = vertices;
minSize = vertices.size();
}
}
Expand Down
@@ -0,0 +1,110 @@
package org.openscience.cdk.hash.equivalent;

import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.ringsearch.RingSearch;

import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * The union of all the smallest sets of equivalent values that are members of
 * a ring. This class is intended to drive the systematic perturbation of the
 * {@link org.openscience.cdk.hash.PerturbedAtomHashGenerator}. The method is
 * more comprehensive than a single {@link MinimumEquivalentCyclicSet} and not
 * as computationally demanding as {@link AllEquivalentCyclicSet}. In reality
 * one should choose either to use the fast (but good) heuristic {@link
 * MinimumEquivalentCyclicSet} or the exact {@link AllEquivalentCyclicSet}. This
 * method is provided for demonstration only.
 *
 * <p>As with the {@link MinimumEquivalentCyclicSet} perturbation, this method
 * does not guarantee that all molecules will be distinguished. At the time of
 * writing (Feb 2013) there are only 8 structures in PubChem-Compound which
 * need the more comprehensive perturbation method ({@link
 * AllEquivalentCyclicSet}), these are listed below.
 *
 * <table><tr><td>CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=144432">144432</a>
 * and CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=15584856">15584856</a></td></tr>
 * <tr><td>CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=138898">138898</a>
 * and CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=241107">241107</a></td></tr>
 * <tr><td>CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=9990759">9990759</a>
 * and CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=10899923">10899923</a></td></tr>
 * <tr><td>CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=5460768">5460768</a>
 * and CID <a href="http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=20673269">20673269</a></td></tr>
 * </table>
 *
 * <p>The easiest way to use this class is with the {@link
 * org.openscience.cdk.hash.HashGeneratorMaker}.
 * <blockquote><pre>
 * MoleculeHashGenerator generator =
 *   new HashGeneratorMaker().depth(6)
 *                           .elemental()
 *                           .perturbWith(new MinimumEquivalentCyclicSetUnion())
 *                           .molecular();
 * </pre></blockquote>
 *
 * @author John May
 * @cdk.module hash
 * @see org.openscience.cdk.hash.PerturbedAtomHashGenerator
 * @see MinimumEquivalentCyclicSet
 * @see AllEquivalentCyclicSet
 * @deprecated provided to demonstrate a relatively robust but ultimately
 *             incomplete approach
 */
@Deprecated
@TestClass("org.openscience.cdk.hash.equivalent.MinimumEquivalentCyclicSetUnionTest")
public final class MinimumEquivalentCyclicSetUnion
        implements EquivalentSetFinder {

    /**
     * Collects the cyclic vertices belonging to any of the smallest groups
     * (of more than one vertex) that share an invariant value.
     *
     * {@inheritDoc}
     *
     * @param invariants invariant value of each vertex, indexed by vertex
     * @param container  the structure the vertices belong to, handed to the
     *                   {@link RingSearch}
     * @param graph      adjacency list representation of the container,
     *                   handed to the {@link RingSearch}
     * @return indices of the cyclic vertices in the union of all
     *         smallest-sized equivalent sets; an empty set if no two cyclic
     *         vertices share an invariant
     */
    @TestMethod("testFind,testFind_Distinct,testScenario")
    @Override
    public Set<Integer> find(long[] invariants, IAtomContainer container, int[][] graph) {

        int n = invariants.length;

        // find cyclic vertices using DFS
        RingSearch ringSearch = new RingSearch(container, graph);

        // ordered map of the set of vertices for each value
        Map<Long, Set<Integer>> equivalent = new TreeMap<Long, Set<Integer>>();

        // divide the invariants of the cyclic vertices into equivalent sets
        for (int i = 0; i < n; i++) {

            // Every vertex is tested for ring membership. Testing only the
            // vertex which creates a set would let acyclic vertices join an
            // existing set (or be dropped) depending purely on the order in
            // which the vertices are visited.
            if (!ringSearch.cyclic(i)) {
                continue;
            }

            Long invariant = invariants[i];
            Set<Integer> set = equivalent.get(invariant);

            if (set == null) {
                set = new HashSet<Integer>(n / 2);
                equivalent.put(invariant, set);
            }

            set.add(i);
        }

        // Walk the sets in ascending invariant order: a strictly smaller set
        // (with more than one member) restarts the result, a set of the same
        // size as the current minimum is merged into it. 'minSize' tracks the
        // size of a single minimum set, not the size of the growing union.
        int minSize = Integer.MAX_VALUE;
        Set<Integer> min = Collections.emptySet();
        for (Map.Entry<Long, Set<Integer>> e : equivalent.entrySet()) {
            Set<Integer> vertices = e.getValue();
            if (vertices.size() < minSize && vertices.size() > 1) {
                min = vertices;
                minSize = vertices.size();
            } else if (vertices.size() == minSize) {
                // 'min' is one of the (local) map values here, never the
                // immutable empty set, so it is safe to add to
                min.addAll(vertices);
            }
        }

        return min;
    }
}

0 comments on commit d9f6722

Please sign in to comment.