In [1]:
import automol
import polars as pl
from automech.util import df_

# Load the species table that the rest of the notebook filters and extends
spc_df0 = pl.read_json("1expand.json")

# Every field name of the `formula` struct column that is not C, H or O is
# collected here; the set is used further down to drop non-CHO species
include_symbols = {"C", "H", "O"}
exclude_symbols = set(spc_df0["formula"].struct.fields).difference(include_symbols)
print(exclude_symbols)

{'He', 'S', 'Be', 'N', 'Ar', 'Na', 'Mg', 'P', 'F', 'Si', 'Ne', 'Al', 'B', 'Li', 'Cl'}


In [2]:
# Filter out non-canonical enantiomers: keep only the rows where the `canon`
# column is true. This starts from the unfiltered `spc_df0`, so re-running the
# cell resets `spc_df` to the state before the later filtering/mapping cells.
spc_df = spc_df0.filter(pl.col("canon"))

In [3]:
# Filter out non-CHO species: drop every row whose `formula` struct has a
# positive count for any excluded symbol (null counts are treated as zero).
# When there are no excluded symbols there is nothing to drop, so the filter is
# skipped rather than handing `pl.any_horizontal` an empty expression list.
if exclude_symbols:
    spc_df = spc_df.filter(
        ~pl.any_horizontal(
            [
                pl.col("formula").struct.field(symbol).fill_null(0) > 0
                for symbol in exclude_symbols
            ]
        )
    )

In [4]:
# Apply `automol.amchi.chi_` to the "amchi" column via `df_.map_`.
# NOTE(review): "inchi" is presumably the name of the new output column (it is
# read by later cells) -- confirm against the `df_.map_` signature.
# `bar=True` displays a progress bar while mapping.
spc_df = df_.map_(spc_df, "amchi", "inchi", automol.amchi.chi_, bar=True)

  0%|          | 0/1002 [00:00<?, ?it/s]

In [5]:
# Apply `automol.chi.inchi_key` to the "inchi" column via `df_.map_`.
# NOTE(review): "inchikey" is presumably the name of the new output column (it
# is selected for export later) -- confirm against the `df_.map_` signature.
# `bar=True` displays a progress bar while mapping.
spc_df = df_.map_(spc_df, "inchi", "inchikey", automol.chi.inchi_key, bar=True)

  0%|          | 0/1002 [00:00<?, ?it/s]

In [6]:
# Add a "mult" column equal to "spin" + 1.
# NOTE(review): presumably "spin" is the number of unpaired electrons, making
# this the spin multiplicity -- confirm against the upstream data.
spc_df = spc_df.with_columns(mult=pl.col("spin") + 1)

In [7]:
# Apply `automol.chi.canonical_enantiomer` to the "inchi" column via `df_.map_`.
# NOTE(review): "canon_enant_ich" is presumably the name of the new output
# column (it is selected for export later) -- confirm against `df_.map_`.
# `bar=True` displays a progress bar while mapping.
spc_df = df_.map_(
    spc_df, "inchi", "canon_enant_ich", automol.chi.canonical_enantiomer, bar=True
)

  0%|          | 0/1002 [00:00<?, ?it/s]

In [8]:
from rdkit import Chem

def sanitize_smiles(smi: str) -> str:
    smi = smi.replace("=/", "=").replace("=\\", "=")
    mol = Chem.MolFromSmiles(smi)
    return Chem.MolToSmiles(mol, canonical=True)

# Apply `sanitize_smiles` to the "smiles" column via `df_.map_`.
# NOTE(review): "rdkit_smiles" is presumably the name of the new output column
# (it is selected for export later) -- confirm against the `df_.map_` signature.
# `bar=True` displays a progress bar while mapping.
spc_df = df_.map_(spc_df, "smiles", "rdkit_smiles", sanitize_smiles, bar=True)

  0%|          | 0/1002 [00:00<?, ?it/s]

In [9]:
# Keep only the columns that go into the exported species table, in this order
mech_spc_df = spc_df.select(
    "name",
    "smiles",
    "inchi",
    "inchikey",
    "mult",
    "charge",
    "canon_enant_ich",
    "rdkit_smiles",
)
# Write the table as CSV, quoting every non-numeric field with single quotes
mech_spc_df.write_csv("2sanitize.csv", quote_char="'", quote_style="non_numeric")