Skip to content

Commit

Permalink
Don't stop the world if a single invalid molecule is found. The itera…
Browse files Browse the repository at this point in the history
…tor is meant for parsing large data sets and should not stop early if an invalid structure is found. If the SMILES could not be parsed, the iterator now returns an empty container and sets the attempted input as a property.

Signed-off-by: Egon Willighagen <egonw@users.sourceforge.net>
  • Loading branch information
johnmay authored and egonw committed Jan 25, 2014
1 parent ade0307 commit df9a307
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.io.formats.IResourceFormat;
Expand All @@ -44,7 +45,9 @@
* Iterating SMILES file reader. It allows iterating over all molecules
* in the SMILES file without reading them all into memory. Suitable
* for very large SMILES files. These SMILES files are expected to have one
* molecule on each line.
* molecule on each line. If a line could not be parsed, an empty molecule is
* returned and the property {@link #BAD_SMILES_INPUT} is set to the attempted
* input. The error is also logged.
*
* <p>For parsing each SMILES it still uses the normal SMILESReader.
*
Expand All @@ -71,6 +74,10 @@ public class IteratingSMILESReader
private boolean nextAvailableIsKnown;
private boolean hasNext;
private IAtomContainer nextMolecule;
private final IChemObjectBuilder builder;

/** Store the problem input as a property. */
public static final String BAD_SMILES_INPUT = "bad.smiles.input";

/**
* Constructs a new IteratingSMILESReader that can read Molecule from a given Reader.
Expand All @@ -84,6 +91,7 @@ public class IteratingSMILESReader
public IteratingSMILESReader(Reader in, IChemObjectBuilder builder) {
    // Keep a handle on the builder for later container creation.
    this.builder = builder;
    // One parser instance, created from the same builder.
    this.sp = new SmilesParser(builder);
    // Attach the character source to read from.
    this.setReader(in);
}

/**
Expand Down Expand Up @@ -121,17 +129,19 @@ public boolean hasNext() {

final String line = input.readLine();

if (line == null)
if (line == null) {
nextAvailableIsKnown = true;
return false;
}

hasNext = true;
final String suffix = suffix(line);

nextMolecule = sp.parseSmiles(line);
nextMolecule = readSmiles(line);
nextMolecule.setProperty(CDKConstants.TITLE, suffix);

} catch (Exception exception) {
logger.error("Error while reading next molecule: ", exception.getMessage());
logger.error("Unexpeced problem: ", exception.getMessage());
logger.debug(exception);
hasNext = false;
}
Expand All @@ -157,6 +167,24 @@ private String suffix(final String line) {
return "";
}

/**
 * Parse a single input line as SMILES. If the parser rejects the line with a
 * {@link CDKException}, the failure is logged and an empty container is
 * returned instead, with the rejected line stored under
 * {@link #BAD_SMILES_INPUT}.
 *
 * @param line input line
 * @return the parsed container, or an empty container if parsing failed
 */
private IAtomContainer readSmiles(final String line) {
    IAtomContainer container;
    try {
        container = sp.parseSmiles(line);
    } catch (CDKException parseFailure) {
        logger.error("Error while reading the SMILES from: " + line + ", ", parseFailure);
        // Fall back to an empty container and record the input that failed.
        container = builder.newInstance(IAtomContainer.class, 0, 0, 0, 0);
        container.setProperty(BAD_SMILES_INPUT, line);
    }
    return container;
}

/**
* Get the next molecule from the stream.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
* */
package org.openscience.cdk.io.iterator;

import com.google.common.collect.Iterables;
import org.hamcrest.CoreMatchers;
import org.junit.Assert;
import org.junit.Test;
Expand Down Expand Up @@ -211,4 +210,28 @@ public void testRemove() {
CoreMatchers.is("empty2"));
assertFalse(smis.hasNext());
}

/**
 * An unparseable line must not end the iteration: the reader is expected to
 * hand back an empty container carrying the rejected input, and the lines
 * after it must still be available.
 */
@Test public void problemSmiles() {

    Reader reader = new StringReader(" okay\nn1cccc1 bad\n okay");
    IteratingSMILESReader smis = new IteratingSMILESReader(reader,
                                                           SilentChemObjectBuilder.getInstance());

    // Record 1: nothing precedes the title, so the container has no atoms.
    assertTrue(smis.hasNext());
    IAtomContainer m1 = smis.next();
    assertThat(m1.getAtomCount(), is(0));
    assertThat(m1.getProperty(CDKConstants.TITLE, String.class),
               is("okay"));

    // Record 2: the problem input; expect an empty container that still has
    // its title and records the whole rejected line.
    assertTrue(smis.hasNext());
    IAtomContainer m2 = smis.next();
    assertThat(m2.getAtomCount(), is(0));
    assertThat(m2.getProperty(CDKConstants.TITLE, String.class),
               is("bad"));
    assertThat(m2.getProperty(IteratingSMILESReader.BAD_SMILES_INPUT, String.class),
               is("n1cccc1 bad"));

    // Record 3: iteration must continue past the bad line, so check
    // hasNext() explicitly, as for the first two records.
    assertTrue(smis.hasNext());
    IAtomContainer m3 = smis.next();
    assertThat(m3.getAtomCount(), is(0));
    assertThat(m3.getProperty(CDKConstants.TITLE, String.class),
               is("okay"));

    assertFalse(smis.hasNext());
}
}

0 comments on commit df9a307

Please sign in to comment.