In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import os

In [2]:
qdf= pd.read_csv('../data/from-smiles/SAMPL.csv')
smiles = qdf['smiles']
dataset = 'FreeSolv'

three_ds = []
failed_three_ds = []
three_ds_path = f'../data/from-smiles/{dataset}-3d/'
opt_three_ds = []
failed_opt_three_ds = []
opt_three_ds_path = f'../data/from-smiles/{dataset}-opt-3d/'
two_ds = []
two_ds_path = f'../data/from-smiles/{dataset}-2d/'
sdf_file = f'{dataset}.sdf'
csv_file = f'{dataset}.sdf.csv'

target_columns = ['expt']

for smile in tqdm(smiles):
    try:
        mol = Chem.MolFromSmiles(smile)
        mol = Chem.AddHs(mol)
        mol.SetProp('_Name', smile)
        
        AllChem.Compute2DCoords(mol)

        # Save a copy of the 2D molecule
        two_ds.append(mol.__copy__())

        if (AllChem.EmbedMolecule(mol, randomSeed = 0xf00d, maxAttempts = 100) == -1):
            failed_three_ds.append(smile)
            continue
        # Save 3D molecule
        three_ds.append(mol.__copy__())

        if AllChem.MMFFOptimizeMolecule(mol) == -1:
            failed_opt_three_ds.append(smile)
            continue

        # Save optimized 3D molecule
        opt_three_ds.append(mol.__copy__())
    except Exception as e:
        print(e)
        continue

if not os.path.exists(three_ds_path):
    os.makedirs(three_ds_path)

if not os.path.exists(opt_three_ds_path):
    os.makedirs(opt_three_ds_path)

if not os.path.exists(two_ds_path):
    os.makedirs(two_ds_path)

with Chem.SDWriter(two_ds_path + sdf_file) as writer:
    for mol in two_ds:
        writer.write(mol)
qdf.drop(columns=qdf.columns.difference(target_columns), inplace=False).to_csv(two_ds_path + csv_file, index=False)

with Chem.SDWriter(three_ds_path + sdf_file) as writer:
    for mol in three_ds:
        writer.write(mol)
qdf.drop(columns=qdf.columns.difference(target_columns), inplace=False).to_csv(three_ds_path + csv_file, index=False)


with Chem.SDWriter(opt_three_ds_path + sdf_file) as writer:
    for mol in opt_three_ds:
        writer.write(mol)
qdf.drop(columns=qdf.columns.difference(target_columns), inplace=False).to_csv(opt_three_ds_path + csv_file, index=False)


100%|██████████| 642/642 [00:03<00:00, 176.42it/s]


Handling accurate structure of some of the datasets

In [4]:
dataset = 'esol'
csv_path = f'../data/from-smiles/{dataset}.csv'
sdf_dir_path = f'../data/from-smiles/{dataset}/'
storing_path = f'../data/from-smiles/{dataset}-accurate/'
indices = []
mols = []

for file in os.listdir(sdf_dir_path):
    if file.endswith('.sdf'):
        suppl = Chem.SDMolSupplier(sdf_dir_path + file, removeHs = False, sanitize = False)
        mols.append(suppl[0])
        indices.append(int(file.split('_')[-1].split('.')[0]))

csv_file = pd.read_csv(csv_path)
csv_file = csv_file.iloc[indices]
csv_file_name = f'{dataset}.sdf.csv'
sdf_file_name = f'{dataset}.sdf'

if not os.path.exists(storing_path):
    os.makedirs(storing_path)

with Chem.SDWriter(storing_path + sdf_file_name) as writer:
    for mol in mols:
        writer.write(mol)

csv_file.to_csv(storing_path + csv_file_name, index=False)

[20:57:55] Failed to find a good bond to set as UP or DOWN for an atropisomer - atoms are: 0 8


In [6]:
mols_sample = mols[34:63]
mols_sample_smiles = [mol.GetProp('_Name') for mol in mols_sample]

csv_file_sample = csv_file.iloc[34:63]
csv_file_sample_smiles = csv_file_sample['smiles'].tolist()
# # this is idiotic!
# for smile in csv_file_sample['smiles']:
#     Chem.Kekulize(Chem.MolFromSmiles(smile))
#     csv_file_sample_smiles.append(smile)

In [18]:
csv_file.iloc[:,1:-1]

Unnamed: 0,smiles,expt
338,C(F)Cl,-0.77
310,CCC(C)(C)O,-4.43
476,c1cc(ccc1[N+](=O)[O-])O,-10.64
462,c1cc(cc(c1)[N+](=O)[O-])N,-8.84
304,Cc1cc2ccccc2cc1C,-2.78
...,...,...
309,c1cc(cnc1)C=O,-7.10
321,CC(C)Br,-0.48
447,C(CBr)Br,-2.33
453,CCS,-1.14
