Skip to content

Commit

Permalink
Improved composite abbreviations for mixtures, use interpunct dot rather than slash, possible option in future.
Browse files Browse the repository at this point in the history
  • Loading branch information
johnmay committed Oct 5, 2016
1 parent 4ac5e5e commit 7041beb
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import org.openscience.cdk.smiles.SmilesGenerator;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import uk.ac.ebi.beam.Element;

import java.io.BufferedReader;
import java.io.File;
Expand All @@ -64,6 +65,7 @@
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
Expand Down Expand Up @@ -124,6 +126,11 @@ public class Abbreviations implements Iterable<String> {

private static final int MAX_FRAG = 50;

/**
* Symbol for joining disconnected fragments. When the abbreviations of
* several disconnected parts are merged into one composite label this
* symbol is placed between the individual labels, e.g. "EDCI·HCl".
*/
private static final String INTERPUNCT = "·";

private final Map<String, String> connectedAbbreviations = new LinkedHashMap<>();
private final Map<String, String> disconnectedAbbreviations = new LinkedHashMap<>();
private final Set<String> labels = new LinkedHashSet<>();
Expand Down Expand Up @@ -349,38 +356,54 @@ public List<Sgroup> generate(final IAtomContainer mol) {
sgroup.addAtom(atom);
return Collections.singletonList(sgroup);
} else if (cansmi.contains(".")) {
List<Sgroup> newSgroups = new ArrayList<>();
List<Sgroup> complexAbbr = new ArrayList<>(4); // e.g. NEt3
List<Sgroup> simpleAbbr = new ArrayList<>(4); // e.g. HCl
for (IAtomContainer part : ConnectivityChecker.partitionIntoMolecules(mol).atomContainers()) {
cansmi = usmigen.create(part);
label = disconnectedAbbreviations.get(cansmi);
if (label != null && !disabled.contains(label)) {
Sgroup sgroup = new Sgroup();
sgroup.setType(SgroupType.CtabAbbreviation);
sgroup.setSubscript(label);
for (IAtom atom : part.atoms())
if (part.getAtomCount() == 1) {
IAtom atom = part.getAtom(0);
label = getBasicElementSymbol(atom);
if (label != null) {
Sgroup sgroup = new Sgroup();
sgroup.setType(SgroupType.CtabAbbreviation);
sgroup.setSubscript(label);
sgroup.addAtom(atom);
newSgroups.add(sgroup);
simpleAbbr.add(sgroup);
}
} else {
cansmi = usmigen.create(part);
label = disconnectedAbbreviations.get(cansmi);
if (label != null && !disabled.contains(label)) {
Sgroup sgroup = new Sgroup();
sgroup.setType(SgroupType.CtabAbbreviation);
sgroup.setSubscript(label);
for (IAtom atom : part.atoms())
sgroup.addAtom(atom);
complexAbbr.add(sgroup);
}
}
}
if (!newSgroups.isEmpty()) {
// merge together
if (newSgroups.size() > 1) {
if (!complexAbbr.isEmpty()) {
// merge together the abbreviations, iff there is at least
// one complex abbr
if (complexAbbr.size() > 0 &&
complexAbbr.size() + simpleAbbr.size() > 1) {
Sgroup combined = new Sgroup();
label = null;
for (Sgroup sgroup : newSgroups) {
complexAbbr.addAll(simpleAbbr);
for (Sgroup sgroup : complexAbbr) {
if (label == null)
label = sgroup.getSubscript();
else
label += "/" + sgroup.getSubscript();
label += INTERPUNCT + sgroup.getSubscript();
for (IAtom atom : sgroup.getAtoms())
combined.addAtom(atom);
}
combined.setSubscript(label);
combined.setType(SgroupType.CtabAbbreviation);
newSgroups.clear();
newSgroups.add(combined);
complexAbbr.clear();
complexAbbr.add(combined);
}
return newSgroups;
return complexAbbr;
}
}

Expand Down Expand Up @@ -783,6 +806,33 @@ private static String getSmilesSuffix(String line) {
return "";
}

/**
 * Builds a plain "element plus hydrogens" label for a single atom, for
 * example {@code HCl}, {@code H2O} or {@code NH3}.
 *
 * <p>No label is produced (the result is {@code null}) when the atom
 * carries a non-zero formal charge, has a non-zero mass number set, has
 * no atomic number (or one below 1), or has an unset implicit hydrogen
 * count.
 *
 * @param atom the atom to label
 * @return the label, or {@code null} if the atom is not a basic element
 */
private static String getBasicElementSymbol(IAtom atom) {
    final Integer charge = atom.getFormalCharge();
    if (charge != null && charge != 0) {
        return null;
    }

    final Integer massNumber = atom.getMassNumber();
    if (massNumber != null && massNumber != 0) {
        return null;
    }

    final Integer atomicNumber = atom.getAtomicNumber();
    if (atomicNumber == null || atomicNumber < 1) {
        return null;
    }

    final Integer hydrogenCount = atom.getImplicitHydrogenCount();
    if (hydrogenCount == null) {
        return null;
    }

    final Elements element = Elements.ofNumber(atomicNumber);

    // "" for no hydrogens, "H" for exactly one, "H<n>" for more than one
    final String hydrogens;
    if (hydrogenCount <= 0) {
        hydrogens = "";
    } else if (hydrogenCount == 1) {
        hydrogens = "H";
    } else {
        hydrogens = "H" + hydrogenCount;
    }

    // These elements are written with the hydrogens in front (HCl, H2O);
    // everything else has them trailing (NH3).
    // see HydrogenPosition for canonical list
    final boolean hydrogensFirst;
    switch (element) {
        case Oxygen:
        case Sulfur:
        case Selenium:
        case Tellurium:
        case Fluorine:
        case Chlorine:
        case Bromine:
        case Iodine:
            hydrogensFirst = true;
            break;
        default:
            hydrogensFirst = false;
            break;
    }

    return hydrogensFirst
            ? hydrogens + element.symbol()
            : element.symbol() + hydrogens;
}

private int loadSmiles(final InputStream in) throws IOException {
int count = 0;
try (BufferedReader brdr = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ public void dontOverwriteExistingSgroups() throws Exception {
IAtomContainer mol = smi("c1ccccc1N(Cl)C(=O)OC(C)(C)C");
List<Sgroup> sgroups = factory.generate(mol);
assertThat(sgroups.size(), is(1));
assertThat(sgroups.get(0).getSubscript(), is("N(Cl)Boc"));
assertThat(sgroups.get(0).getSubscript(), is("NClBoc"));
assertThat(sgroups.get(0).getBonds().size(), is(1));
assertThat(sgroups.get(0).getAtoms().size(), is(9));
}
Expand Down Expand Up @@ -247,6 +247,15 @@ public void dontOverwriteExistingSgroups() throws Exception {
assertThat(sgroups.get(1).getSubscript(), is("SO3-"));
}

/**
 * A registered abbreviation (EDCI) together with a disconnected chlorine
 * atom is expected to collapse into a single composite label joined by
 * an interpunct.
 */
@Test
public void hclSaltOfEdci() throws Exception {
    final Abbreviations abbreviations = new Abbreviations();
    abbreviations.add("CCN=C=NCCCN(C)C EDCI");

    final List<Sgroup> generated = abbreviations.generate(smi("CCN=C=NCCCN(C)C.Cl"));

    assertThat(generated.size(), is(1));
    final Sgroup composite = generated.get(0);
    assertThat(composite.getSubscript(), is("EDCI·HCl"));
}

@Test
public void loadFromFile() throws Exception {
Abbreviations factory = new Abbreviations();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ static boolean parse(String label, List<String> tokens) {
continue;
}

if (c == '/') {
if (c == '/' || c == '·') {
tokens.add(Character.toString(c));
i++;
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,27 @@ public void formatTBu() {
assertThat(texts.get(1).style, is(AbbreviationLabel.STYLE_NORMAL));
}

/**
 * Parses a composite label containing an interpunct and checks both the
 * raw tokens and the formatted text segments after reduction.
 */
@Test
public void NEt3DotHCl() {
    final List<String> parsed = new ArrayList<>();
    assertTrue(AbbreviationLabel.parse("NEt3·HCl", parsed));

    // the interpunct must come out as a token of its own
    final String[] expectedTokens = {"N", "Et3", "·", "H", "Cl"};
    assertThat(parsed.size(), is(expectedTokens.length));
    for (int idx = 0; idx < expectedTokens.length; idx++) {
        assertThat(parsed.get(idx), is(expectedTokens[idx]));
    }

    final List<AbbreviationLabel.FormattedText> segments = AbbreviationLabel.format(parsed);
    AbbreviationLabel.reduce(segments, 0, segments.size());
    assertThat(segments.size(), is(3));

    final AbbreviationLabel.FormattedText head = segments.get(0);
    assertThat(head.text, is("NEt"));
    assertThat(head.style, is(AbbreviationLabel.STYLE_NORMAL));

    final AbbreviationLabel.FormattedText count = segments.get(1);
    assertThat(count.text, is("3"));
    assertThat(count.style, is(AbbreviationLabel.STYLE_SUBSCRIPT));

    final AbbreviationLabel.FormattedText tail = segments.get(2);
    assertThat(tail.text, is("·HCl"));
    assertThat(tail.style, is(AbbreviationLabel.STYLE_NORMAL));
}

@Test
public void formatOPO3H2() {
List<String> tokens = Arrays.asList("O", "P", "O3", "H2");
Expand Down

0 comments on commit 7041beb

Please sign in to comment.