Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raw and count path fingerprints #834

Merged
merged 6 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
Expand Down Expand Up @@ -189,8 +188,27 @@ public IBitFingerprint getBitFingerprint(IAtomContainer container) throws CDKExc

/** {@inheritDoc} */
@Override
public Map<String, Integer> getRawFingerprint(IAtomContainer iAtomContainer) throws CDKException {
throw new UnsupportedOperationException();
public Map<String, Integer> getRawFingerprint(IAtomContainer container) throws CDKException {
    // Atom typing and legacy aromaticity perception are only run when the
    // container holds no pseudo atoms.
    final boolean pseudoFree = !hasPseudoAtom(container.atoms());
    if (pseudoFree) {
        AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(container);
        Aromaticity.cdkLegacy().apply(container);
    }

    // The traversal state still needs a bit set to write hashes into, but
    // only the feature map (encoded path -> count) is returned from here.
    final Map<String, Integer> features = new HashMap<>();
    final State walker = new State(container, new BitSet(size), size, searchDepth + 1);
    walker.setFeatureMap(features);

    // Enumerate paths starting from every atom; the path counter is reset
    // per start atom before it is compared against the path limit.
    for (IAtom start : container.atoms()) {
        walker.numPaths = 0;
        walker.visit(start);
        traversePaths(walker, start, null);
        walker.unvisit(start);
    }
    return features;
}

/**
 * Builds a count fingerprint by wrapping the raw feature map produced by
 * {@link #getRawFingerprint(IAtomContainer)}.
 *
 * @param container the structure to fingerprint
 * @return a count fingerprint backed by the raw path features
 * @throws CDKException if the raw fingerprint could not be generated
 */
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
    final Map<String, Integer> rawFeatures = getRawFingerprint(container);
    return new IntArrayCountFingerprint(rawFeatures);
}

private IBond findBond(List<IBond> bonds, IAtom beg, IAtom end) {
Expand All @@ -200,19 +218,13 @@ private IBond findBond(List<IBond> bonds, IAtom beg, IAtom end) {
return null;
}

private String encodePath(IAtomContainer mol, Map<IAtom, List<IBond>> cache, List<IAtom> path, StringBuilder buffer) {
private String encodePath(IAtomContainer mol, List<IAtom> path, StringBuilder buffer) {
buffer.setLength(0);
IAtom prev = path.get(0);
buffer.append(getAtomSymbol(prev));
for (int i = 1; i < path.size(); i++) {
final IAtom next = path.get(i);
List<IBond> bonds = cache.get(prev);

if (bonds == null) {
bonds = mol.getConnectedBondsList(prev);
cache.put(prev, bonds);
}

List<IBond> bonds = mol.getConnectedBondsList(prev);
IBond bond = findBond(bonds, next, prev);
if (bond == null)
throw new IllegalStateException("FATAL - Atoms in patch were connected?");
Expand All @@ -236,7 +248,24 @@ private String encodePath(List<IAtom> apath, List<IBond> bpath, StringBuilder bu
return buffer.toString();
}

private int appendHash(int hash, String str) {
private String encodeRevPath(List<IAtom> apath, List<IBond> bpath, StringBuilder buffer) {
    // Writes the path back-to-front into the (reused) buffer.
    //   atoms = [a0, a1, a2, a3], bonds = [b0, b1, b2]
    //   forward: a0 | b0 a1 | b1 a2 | b2 a3
    //   reverse: a3 | b2 a2 | b1 a1 | b0 a0
    // i.e. bond i always sits between atom i and atom i+1.
    buffer.setLength(0);
    int idx = apath.size() - 1;
    buffer.append(getAtomSymbol(apath.get(idx)));
    while (--idx >= 0) {
        final IAtom atom = apath.get(idx);
        final IBond bond = bpath.get(idx);
        buffer.append(getBondSymbol(bond)).append(getAtomSymbol(atom));
    }
    return buffer.toString();
}

private static int appendHash(int hash, String str) {
int len = str.length();
for (int i = 0; i < len; i++)
hash = 31 * hash + str.charAt(0);
Expand Down Expand Up @@ -268,17 +297,17 @@ private int hashRevPath(List<IAtom> apath, List<IBond> bpath) {
return hash;
}

private static final class State {
private final class State {
private int numPaths = 0;
private Random rand = new Random();
private BitSet fp;
private Map<String,Integer> feats;
private IAtomContainer mol;
private Set<IAtom> visited = new HashSet<>();
private List<IAtom> apath = new ArrayList<>();
private List<IBond> bpath = new ArrayList<>();
private final int maxDepth;
private final int fpsize;
private Map<IAtom,List<IBond>> cache = new IdentityHashMap<>();
public StringBuilder buffer = new StringBuilder();

public State(IAtomContainer mol, BitSet fp, int fpsize, int maxDepth) {
Expand All @@ -288,13 +317,12 @@ public State(IAtomContainer mol, BitSet fp, int fpsize, int maxDepth) {
this.maxDepth = maxDepth;
}

/**
 * Sets the map that receives per-path counts. While no map is set
 * (the field is {@code null}) {@code storeFeat} does nothing and only
 * the hashed bits are recorded.
 *
 * @param feats map of encoded path to occurrence count, updated in place
 */
public void setFeatureMap(Map<String,Integer> feats) {
this.feats = feats;
}

List<IBond> getBonds(IAtom atom) {
List<IBond> bonds = cache.get(atom);
if (bonds == null) {
bonds = mol.getConnectedBondsList(atom);
cache.put(atom, bonds);
}
return bonds;
return mol.getConnectedBondsList(atom);
}

boolean visit(IAtom a) {
Expand Down Expand Up @@ -325,13 +353,64 @@ void addHash(int x) {
// different bit
fp.set(rand.nextInt(fpsize));
}

/**
 * Increments the count of an encoded path in the feature map; a path
 * seen for the first time is stored with a count of one. Does nothing
 * when no feature map has been set.
 *
 * @param path the encoded path used as the map key
 */
private void storeFeat(String path) {
    if (feats != null)
        feats.merge(path, 1, Integer::sum);
}

/**
 * Records the current path read front-to-back: its hash is always added,
 * and its string encoding is counted only when a feature map is present
 * (this avoids building the string when nobody will read it).
 */
private void storeForward() {
    addHash(hashPath(apath, bpath));
    if (feats == null)
        return;
    storeFeat(encodePath(apath, bpath, buffer));
}

/**
 * Records the current path read back-to-front: its hash is always added,
 * and its string encoding is counted only when a feature map is present
 * (this avoids building the string when nobody will read it).
 */
private void storeReverse() {
    addHash(hashRevPath(apath, bpath));
    if (feats == null)
        return;
    storeFeat(encodeRevPath(apath, bpath, buffer));
}

/**
 * Optimisation - decides whether the current path should be encoded at
 * all. The path is later compared forwards and backwards and the lower
 * form is stored, so that more expensive step only needs to happen for
 * one of the two directions in which a path is presumably encountered.
 * The direction is picked by comparing the identity hash codes of the
 * first and the last atom of the path; an atom index would also work but
 * may be a linear time lookup (at least in the old IAtomContainer
 * implementation).
 *
 * NOTE(review): identity hash codes are not guaranteed to be distinct
 * for distinct objects. If the two end atoms ever share one, this
 * returns false for both directions - confirm that is acceptable.
 *
 * @return true - do encode/false - skip encoding
 */
public boolean isOrderedPath() {
    final IAtom first = apath.get(0);
    final IAtom last = apath.get(apath.size() - 1);
    return System.identityHashCode(first) < System.identityHashCode(last);
}

/**
 * Stores the path currently held in {@code apath}/{@code bpath}.
 * A path without bonds (a single atom) is stored by its atom symbol.
 * Longer paths are skipped unless {@link #isOrderedPath()} accepts them,
 * and are then stored forwards or reversed depending on the result of
 * {@code compare(apath, bpath)}.
 */
public void storePath() {
    if (bpath.isEmpty()) {
        final IAtom only = apath.get(0);
        addHash(getAtomSymbol(only).hashCode());
        storeFeat(getAtomSymbol(only));
        return;
    }
    if (!isOrderedPath())
        return;
    if (compare(apath, bpath) < 0)
        storeReverse();
    else
        storeForward();
}
}

private void traversePaths(State state, IAtom beg, IBond prev) throws CDKException {
if (!hashPseudoAtoms && isPseudo(beg))
return;
state.push(beg, prev);
state.addHash(encodeUniquePath(state.apath, state.bpath, state.buffer));
state.storePath();
if (state.numPaths > pathLimit)
throw new CDKException("Too many paths! Structure is likely a cage, reduce path length or increase path limit");
if (state.apath.size() < state.maxDepth) {
Expand Down Expand Up @@ -364,13 +443,12 @@ protected int[] findPathes(IAtomContainer container, int searchDepth) throws CDK

Set<Integer> hashes = new HashSet<>();

Map<IAtom, List<IBond>> cache = new HashMap<>();
StringBuilder buffer = new StringBuilder();
for (IAtom startAtom : container.atoms()) {
List<List<IAtom>> p = PathTools.getLimitedPathsOfLengthUpto(container, startAtom, searchDepth, pathLimit);
for (List<IAtom> path : p) {
if (hashPseudoAtoms || !hasPseudoAtom(path))
hashes.add(encodeUniquePath(container, cache, path, buffer));
hashes.add(encodeUniquePath(container, path, buffer));
}
}

Expand Down Expand Up @@ -403,12 +481,15 @@ private static boolean hasPseudoAtom(Iterable<IAtom> path) {
return false;
}

private int encodeUniquePath(IAtomContainer container, Map<IAtom, List<IBond>> cache, List<IAtom> path, StringBuilder buffer) {
private int encodeUniquePath(IAtomContainer container,
List<IAtom> path,
StringBuilder buffer) {
if (path.size() == 1)
return getAtomSymbol(path.get(0)).hashCode();
String forward = encodePath(container, cache, path, buffer);

String forward = encodePath(container, path, buffer);
Collections.reverse(path);
String reverse = encodePath(container, cache, path, buffer);
String reverse = encodePath(container, path, buffer);
Collections.reverse(path);

final int x;
Expand All @@ -425,7 +506,7 @@ private int encodeUniquePath(IAtomContainer container, Map<IAtom, List<IBond>> c
* @param b atom b
* @return comparison &lt;0 a is less than b, &gt;0 a is more than b
*/
private int compare(IAtom a, IAtom b) {
private static int compare(IAtom a, IAtom b) {
final int elemA = getElem(a);
final int elemB = getElem(b);
if (elemA == elemB)
Expand Down Expand Up @@ -470,26 +551,14 @@ private int compare(List<IAtom> apath, List<IBond> bpath) {
return 0;
}

/**
 * Hashes a path in a direction-independent way: a single-atom path hashes
 * to the hash code of its atom symbol, otherwise the forward or reverse
 * path hash is chosen from the result of {@code compare(apath, bpath)}.
 *
 * @param apath  atoms of the path, in traversal order
 * @param bpath  bonds of the path; empty for a single-atom path
 * @param buffer not used by this method
 * @return the hash of the path
 */
private int encodeUniquePath(List<IAtom> apath, List<IBond> bpath, StringBuilder buffer) {
    if (bpath.isEmpty())
        return getAtomSymbol(apath.get(0)).hashCode();
    return compare(apath, bpath) >= 0
            ? hashPath(apath, bpath)
            : hashRevPath(apath, bpath);
}

/**
 * Atomic number of an atom, with an unset (null) value read as zero.
 *
 * @param atom the atom to query
 * @return the atomic number, or 0 when the atom has none set
 */
private static int getElem(IAtom atom) {
    final Integer atomicNumber = atom.getAtomicNumber();
    return atomicNumber != null ? atomicNumber : 0;
}

private String getAtomSymbol(IAtom atom) {
private static String getAtomSymbol(IAtom atom) {
// XXX: backwards compatibility
// This is completely random, I believe the intention is because
// paths were reversed with string manipulation to de-duplicate
Expand Down Expand Up @@ -566,10 +635,4 @@ public int getSearchDepth() {
public int getSize() {
return size;
}

/**
 * Not supported by this version of the method: every call throws.
 *
 * @param container ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
throw new UnsupportedOperationException();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import org.openscience.cdk.interfaces.IBond;

import java.util.BitSet;
import java.util.Random;

/**
* Specialized version of the {@link Fingerprinter} which does not take bond orders
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IRingSet;
import org.openscience.cdk.ringsearch.RingPartitioner;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import org.openscience.cdk.tools.manipulator.MolecularFormulaManipulator;

import java.util.BitSet;
Expand Down Expand Up @@ -104,8 +105,29 @@ public IBitFingerprint getBitFingerprint(IAtomContainer container) throws CDKExc
* {@inheritDoc}
*/
@Override
public Map<String, Integer> getRawFingerprint(IAtomContainer iAtomContainer) throws CDKException {
throw new UnsupportedOperationException();
public Map<String, Integer> getRawFingerprint(IAtomContainer container) throws CDKException {
    // Start from the path features of the wrapped fingerprinter and add
    // the mass and ring features on top of them.
    final Map<String, Integer> features = fingerprinter.getRawFingerprint(container);

    // One feature for every multiple of 100 (100 .. 1000) that the value
    // reported by AtomContainerManipulator.getMass exceeds.
    final double mass = AtomContainerManipulator.getMass(container);
    for (int bin = 1; bin <= 10; bin++) {
        if (mass > 100 * bin)
            features.put("MASS_RANGE:" + bin, 1);
    }

    // set the ring features - this is a little odd
    final IRingSet rings = Cycles.sssr(container).toRingSet();
    final List<IRingSet> ringSystems = RingPartitioner.partitionRings(rings);

    // One feature per ring, capped at seven.
    final int ringCount = rings.getAtomContainerCount();
    for (int n = 1; n <= 7; n++) {
        if (ringCount >= n)
            features.compute("RNCT:" + n, (key, old) -> old == null ? 1 : old + 1);
    }

    // Size (in rings) of the largest partition, capped at nine features.
    int largest = 0;
    for (IRingSet system : ringSystems)
        largest = Math.max(largest, system.getAtomContainerCount());
    final int capped = Math.min(largest, 9);
    for (int n = 1; n <= capped; n++)
        features.put("RCNT_MAX:" + n, 1);

    return features;
}

/**
Expand Down Expand Up @@ -148,15 +170,15 @@ public IBitFingerprint getBitFingerprint(IAtomContainer atomContainer, IRingSet
rslist = RingPartitioner.partitionRings(ringSet);
}
for (int i = 0; i < 7; i++) {
if (ringSet.getAtomContainerCount() > i) fingerprint.set(size - 15 + i); // 15 := RESERVED_BITS+1+10 mass bits
if (ringSet.getAtomContainerCount() > i)
fingerprint.set(size - 15 + i); // 15 := RESERVED_BITS+1+10 mass bits
}
int maximumringsystemsize = 0;
for (int i = 0; i < rslist.size(); i++) {
if (((IRingSet) rslist.get(i)).getAtomContainerCount() > maximumringsystemsize)

maximumringsystemsize = ((IRingSet) rslist.get(i)).getAtomContainerCount();
int maxrcnt = 0;
for (IRingSet iRingSet : rslist) {
if (iRingSet.getAtomContainerCount() > maxrcnt)
maxrcnt = iRingSet.getAtomContainerCount();
}
for (int i = 0; i < maximumringsystemsize && i < 9; i++) {
for (int i = 0; i < maxrcnt && i < 9; i++) {
fingerprint.set(size - 8 + i - 3);
}
return fingerprint;
Expand All @@ -175,7 +197,7 @@ public int getSize() {
*/
@Override
public ICountFingerprint getCountFingerprint(IAtomContainer container) throws CDKException {
throw new UnsupportedOperationException();
return new IntArrayCountFingerprint(getRawFingerprint(container));
}

@Override
Expand Down