Skip to content

Commit

Permalink
Finalise patch: rename to make it clear what it is doing and is easy …
Browse files Browse the repository at this point in the history
…to find (start with 'Smarts' prefix). Introduce the MODE_EXACT to specify setting the exact valence. We use int constants rather than enums to avoid adding two classes to the public API.
  • Loading branch information
johnmay committed Oct 7, 2016
1 parent adff265 commit 144f4d7
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import org.openscience.cdk.fingerprint.CircularFingerprinter.FP;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.silent.SilentChemObjectBuilder;
import org.openscience.cdk.smarts.SubstructureSmarts;
import org.openscience.cdk.smarts.SmartsFragmentExtractor;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
Expand Down Expand Up @@ -111,7 +111,7 @@ private void checkFPSmartsForMolecule(String moleculeSmiles,

CircularFingerprinter circ = new CircularFingerprinter();
circ.calculate(mol);
SubstructureSmarts subsmarts = new SubstructureSmarts(mol);
SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
subsmarts.setIncludePeripheralBonds(true);
int numFP = circ.getFPCount();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,42 +28,72 @@
import org.openscience.cdk.interfaces.IBond;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
* Utility class to create SMARTS that match part (substructure) of a molecule.
* SMARTS are generated by providing the atom indexes. An example use cases is
* encoding features from a fingerprint.
* <p>
* The extractor has two modes. {@link #MODE_EXACT} (default) captures the element,
* valence, hydrogen count, connectivity, and charge in the SMARTS atom expressions.
* The alternative mode, {@link #MODE_JCOMPOUNDMAPPER}, only captures the element,
* non-zero charge, and peripheral bonds. Although the later looks cleaner, the
* peripheral bonds intend to capture the connectivity of the terminal atoms but
* since the valence is not bounded further substitution is still allowed.
*
* <p>The difference is easily demonstrated for methyl. Consider the compound
* of 2-methylpentane {@code CC(C)CCC}, if we extract one of the methyl atoms
* depending on the mode we obtain {@code [CH3v4X4+0]} or {@code C*}. The first
* of these patterns (obtained by {@link #MODE_EXACT}) matches the compound in
* <b>three places</b> (the three methyl groups). The second matches <b>six</b>
* times (every atom) because the substituion on the carbon is not locked.
* A further complication is introduced by the inclusion of the peripheral atoms,
* for 1H-indole {@code [nH]1ccc2c1cccc2} we can obtain the SMARTS {@code n(ccc(a)a)a}
* that doesn't match at all. This is because one of the aromatic atoms ('a')
* needs to match the nitrogen.
*
* <p><b>Basic Usage:</b></p>
* <pre>{@code
* IChemObjectBuilder bldr = SilentChemObjectBuilder.getInstance();
* SmilesParser smipar = new SmilesParser(bldr);
*
* IAtomContainer mol = smipar.parseSmiles("[nH]1ccc2c1cccc2");
* SubstructureSmarts subsmarts = new SubstructureSmarts(mol);
* IChemObjectBuilder bldr = SilentChemObjectBuilder.getInstance();
* SmilesParser smipar = new SmilesParser(bldr);
*
* IAtomContainer mol = smipar.parseSmiles("[nH]1ccc2c1cccc2");
* SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
*
* // smarts=nccc
* // smarts=[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]
* // hits =1
* String smarts = mol.generate(new int[]{0,1,3,4});
*
* subsmarts.setIncludePeripheralBonds(true);
* subsmarts.setMode(MODE_JCOMPOUNDMAPPER);
* // smarts=n(ccc(a)a)a
* // hits = 0 - one of the 'a' atoms needs to match the nitrogen
* String smarts = mol.generate(new int[]{0,1,3,4});
* }</pre>
*
* @author Nikolay Kochev
* @author Nina Jeliazkova
* @author John May
*/
public final class SubstructureSmarts {
public final class SmartsFragmentExtractor {

/**
* Sets the mode of the extractor to produce SMARTS similar to JCompoundMapper.
*/
public static final int MODE_JCOMPOUNDMAPPER = 1;

/**
* Sets the mode of the extractor to produce exact SMARTS.
*/
public static final int MODE_EXACT = 2;

// molecule being selected over
private final IAtomContainer mol;

// fast-access mol graph data structures
private final int[][] atomAdj, bondAdj;
private final int[] deg;
private final int[] deg;

// SMARTS atom and bond expressions
private final String[] aexpr;
Expand All @@ -75,15 +105,15 @@ public final class SubstructureSmarts {
private final int[] rnums;
private int numVisit;

// option whether to include bonds in which only one atom is in the substructure
private boolean peripherals = false;
// which mode should SMARTS be encoded in
private int mode = MODE_EXACT;

/**
* Create a new instance over the provided molecule.
*
* @param mol molecule
*/
public SubstructureSmarts(IAtomContainer mol) {
public SmartsFragmentExtractor(IAtomContainer mol) {
this.mol = mol;

final int numAtoms = mol.getAtomCount();
Expand Down Expand Up @@ -134,14 +164,25 @@ public SubstructureSmarts(IAtomContainer mol) {
}

/**
* Option sets whether peripheral bonds should be written. Peripheral
* bonds are bonds where only one atom is in the requested substructure.
* The atoms not in the substructure are written as '*' or 'a'.
* Set the mode of SMARTS substructure selection
*
* @param val value true/false
* @param mode the mode
*/
public void setIncludePeripheralBonds(boolean val) {
this.peripherals = val;
public void setMode(int mode) {
// check arg
switch (mode) {
case MODE_EXACT:
case MODE_JCOMPOUNDMAPPER:
break;
default:
throw new IllegalArgumentException("Invalid mode specified!");
}
this.mode = mode;

// re-gen atom expressions
int numAtoms = mol.getAtomCount();
for (int atomIdx = 0; atomIdx < numAtoms; atomIdx++)
this.aexpr[atomIdx] = encodeAtomExpr(atomIdx);
}

/**
Expand All @@ -159,7 +200,7 @@ public String generate(int[] atomIdxs) {
return null; // makes sense?

// special case
if (atomIdxs.length == 1 && !peripherals)
if (atomIdxs.length == 1 && mode == MODE_EXACT)
return aexpr[atomIdxs[0]];

// initialize traversal information
Expand Down Expand Up @@ -196,7 +237,7 @@ public String generate(int[] atomIdxs) {
* Recursively marks ring closures (back edges) in the {@link #rbnds}
* array in a depth first order.
*
* @param idx atom index
* @param idx atom index
* @param bprev previous bond
*/
private void markRings(int idx, int bprev) {
Expand All @@ -218,9 +259,9 @@ else if (avisit[nbr] < avisit[idx])
* Recursively encodes a SMARTS expression into the provides
* string builder.
*
* @param idx atom index
* @param idx atom index
* @param bprev previous bond
* @param sb destition to write SMARTS to
* @param sb destition to write SMARTS to
*/
private void encodeExpr(int idx, int bprev, StringBuilder sb) {
avisit[idx] = numVisit++;
Expand All @@ -247,7 +288,7 @@ private void encodeExpr(int idx, int bprev, StringBuilder sb) {
sb.append(rnum);
}

if (!peripherals && avisit[nbr] == 0 ||
if (mode == MODE_EXACT && avisit[nbr] == 0 ||
bidx == bprev ||
rbnds[bidx] != 0)
remain--;
Expand All @@ -256,7 +297,7 @@ private void encodeExpr(int idx, int bprev, StringBuilder sb) {
for (int j = 0; j < d; j++) {
int nbr = atomAdj[idx][j];
int bidx = bondAdj[idx][j];
if (!peripherals && avisit[nbr] == 0 ||
if (mode == MODE_EXACT && avisit[nbr] == 0 ||
bidx == bprev ||
rbnds[bidx] != 0)
continue; // ignored
Expand All @@ -278,6 +319,7 @@ private void encodeExpr(int idx, int bprev, StringBuilder sb) {

/**
* Select the lowest ring number for use in SMARTS.
*
* @return ring number
* @throws IllegalStateException all ring numbers are used
*/
Expand All @@ -293,6 +335,7 @@ private int chooseRingNumber() {

/**
* Releases a ring number allowing it to be reused.
*
* @param rnum ring number
*/
private void releaseRingNumber(int rnum) {
Expand All @@ -309,7 +352,7 @@ private void releaseRingNumber(int rnum) {
private String encodeAtomExpr(int atmIdx) {
final IAtom atom = mol.getAtom(atmIdx);

boolean complex = false;
boolean complex = mode == MODE_EXACT;

StringBuilder sb = new StringBuilder();

Expand Down Expand Up @@ -337,13 +380,41 @@ private String encodeAtomExpr(int atmIdx) {
break;
}

if (mode == MODE_EXACT) {

int hcount = atom.getImplicitHydrogenCount();
int valence = hcount;
int connections = hcount;

int atmDeg = this.deg[atmIdx];
for (int i = 0; i < atmDeg; i++) {
IBond bond = mol.getBond(bondAdj[atmIdx][i]);
IAtom nbr = bond.getConnectedAtom(atom);
if (nbr.getAtomicNumber() != null && nbr.getAtomicNumber() == 1)
hcount++;
int bord = bond.getOrder() != null ? bond.getOrder().numeric() : 0;
if (bord == 0)
throw new IllegalArgumentException("Molecule had unsupported zero-order or unset bonds!");
valence += bord;
connections++;
}

sb.append('H').append(hcount);
sb.append('v').append(valence);
sb.append('X').append(connections);
}

Integer chg = atom.getFormalCharge();
if (chg != null && chg != 0) {
if (chg == null) chg = 0;


if (chg < -1 || chg > +1) {
if (chg >= 0) sb.append('+');
else sb.append('-');
if (chg < -1 || chg > +1)
sb.append(Math.abs(chg));
sb.append(Math.abs(chg));
complex = true;
} else if (mode == MODE_EXACT) {
sb.append("+0");
}

return complex ? '[' + sb.toString() + ']' : sb.toString();
Expand All @@ -354,8 +425,8 @@ private String encodeAtomExpr(int atmIdx) {
* expression that matches itself.
*
* @param bondIdx bond index
* @param beg atom index of first atom
* @param end atom index of second atom
* @param beg atom index of first atom
* @param end atom index of second atom
* @return SMARTS bond expression
*/
private String encodeBondExpr(int bondIdx, int beg, int end) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

public class SubstructureSmartsTest {
public class SmartsFragmentExtractorTest {

private String generate(String smi, boolean peripheral, int[] idxs) throws Exception {
private String generate(String smi, int mode, int[] idxs) throws Exception {
SmilesParser smipar = new SmilesParser(SilentChemObjectBuilder.getInstance());
IAtomContainer mol = smipar.parseSmiles(smi);
SubstructureSmarts subsmarts = new SubstructureSmarts(mol);
subsmarts.setIncludePeripheralBonds(peripheral);
SmartsFragmentExtractor subsmarts = new SmartsFragmentExtractor(mol);
subsmarts.setMode(mode);
return subsmarts.generate(idxs);
}

Expand All @@ -49,37 +49,53 @@ private static int[] makeSeq(int beg, int to) {
return a;
}

@Test
public void methylExact() throws Exception {
String smarts = generate("CC(C)CCC",
SmartsFragmentExtractor.MODE_EXACT,
makeSeq(0,1));
assertThat(smarts, is("[CH3v4X4+0]"));
}

@Test
public void methylForJCompoundMap() throws Exception {
String smarts = generate("CC(C)CCC",
SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER,
makeSeq(0,1));
assertThat(smarts, is("C*"));
}

@Test
public void indole() throws Exception {
String smarts = generate("[nH]1ccc2c1cccc2",
false,
SmartsFragmentExtractor.MODE_EXACT,
makeSeq(0,4));
assertThat(smarts, is("nccc"));
assertThat(smarts, is("[nH1v3X3+0][cH1v4X3+0][cH1v4X3+0][cH0v4X3+0]"));
}

@Test
public void indoleWithPeripheral() throws Exception {
public void indoleForJCompoundMap() throws Exception {
String smarts = generate("[nH]1ccc2c1cccc2",
true,
SmartsFragmentExtractor.MODE_JCOMPOUNDMAPPER,
makeSeq(0,4));
assertThat(smarts, is("n(ccc(a)a)a"));
}

@Test
public void biphenylIncludesSingleBond() throws Exception {
String smarts = generate("c1ccccc1-c1ccccc1",
false,
SmartsFragmentExtractor.MODE_EXACT,
makeSeq(0,12));
assertThat(smarts, containsString("-"));
}

@Test
public void fullereneC60() throws Exception {
String smarts = generate("c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41",
false,
SmartsFragmentExtractor.MODE_EXACT,
makeSeq(0,60));
assertThat(smarts,
is("c12c3c4c5c1c1c6c7c2c2c8c3c3c9c4c4c%10c5c5c1c1c6c6c%11c7c2c2c7c8c3c3c8c9c4c4c9c%10c5c5c1c1c6c6c%11c2c2c7c3c3c8c4c4c9c5c1c1c6c2c3c41"));
is("[cH0v4X3+0]12[cH0v4X3+0]3[cH0v4X3+0]4[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]7[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]8[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]9[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]%10[cH0v4X3+0]5[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]6[cH0v4X3+0]%11[cH0v4X3+0]2[cH0v4X3+0]2[cH0v4X3+0]7[cH0v4X3+0]3[cH0v4X3+0]3[cH0v4X3+0]8[cH0v4X3+0]4[cH0v4X3+0]4[cH0v4X3+0]9[cH0v4X3+0]5[cH0v4X3+0]1[cH0v4X3+0]1[cH0v4X3+0]6[cH0v4X3+0]2[cH0v4X3+0]3[cH0v4X3+0]41"));
}

}

0 comments on commit 144f4d7

Please sign in to comment.