In [1]:
from rdkit import Chem
from ccdc_rdkit_connector import CcdcRdkitConnector
from ccdc.conformer import ConformerGenerator
from tqdm import tqdm

import pickle
import os
import pandas as pd

In [2]:
# Ask RDKit to include every property category when molecules are pickled, so
# that conformer properties (e.g. the 'Generator' entry looked up when building
# conf_list further down) are kept when the library is saved with pickle.dump.
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps) 

In [3]:
# Root folder of every file read or written by this notebook (relative path,
# so it is resolved against the kernel's working directory).
data_dir_path = 'data/'

In [4]:
# Load the full conformer-ensemble library from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only open files produced by
# this pipeline.
all_cel_path = os.path.join(data_dir_path, 'raw', 'all_conf_ensemble_library.p')
with open(all_cel_path, 'rb') as library_file :
    all_CEL = pickle.load(library_file)

In [5]:
# Bridge object between RDKit and the CCDC API; used below through
# rdkit_conf_to_ccdc_mol and ccdc_conformers_to_rdkit_mol.
ccdc_rdkit_connector = CcdcRdkitConnector()

In [6]:
# Per-molecule table. The first CSV column is used as the index; the cells
# below rely on its 'smiles' column and its 'included' column (used as a mask).
smiles_df = pd.read_csv(os.path.join(data_dir_path, 'smiles_df.csv'), index_col=0)

In [7]:
# SMILES of the molecules flagged as included.
# Only the 'smiles' column is selected: taking .values of the whole filtered
# frame yields a 2-D array of every column, so a membership test would compare
# against unrelated cells too. A set also makes each `smiles in included_smiles`
# lookup constant-time instead of a full array scan.
included_smiles = set(smiles_df.loc[smiles_df['included'], 'smiles'])

In [8]:
# Convert every included molecule of the library into a CCDC molecule.
# The two lists are parallel: position k of corresponding_ce_mols holds the
# RDKit molecule that produced position k of initial_ccdc_mols.
initial_ccdc_mols = []
corresponding_ce_mols = []
for smiles, conf_ensemble in tqdm(all_CEL.get_unique_molecules()) :
    if smiles not in included_smiles :
        # molecule is not flagged as included: skip it
        continue
    rdkit_mol = conf_ensemble.mol
    ccdc_mol = ccdc_rdkit_connector.rdkit_conf_to_ccdc_mol(rdkit_mol)
    # Sanity check: the conversion must keep the same number of atoms
    assert rdkit_mol.GetNumAtoms() == len(ccdc_mol.atoms)
    initial_ccdc_mols.append(ccdc_mol)
    corresponding_ce_mols.append(rdkit_mol)

100%|███████████████████████████████████| 15949/15949 [00:14<00:00, 1084.65it/s]


In [9]:
# NOTE: despite its name, this is the number of *molecules* passed to the
# conformer generator per call (the chunk size), not a number of conformers.
n_conf_per_chunk = 100
# Lists of consecutive molecule indices, one list per chunk. The end of each
# range is clamped to the list length so the last chunk never contains an
# index that does not exist in initial_ccdc_mols.
chunk_idxs = [list(range(start, min(start + n_conf_per_chunk, len(initial_ccdc_mols))))
              for start in range(0, len(initial_ccdc_mols), n_conf_per_chunk)]

In [10]:
# Number of molecules that were converted and will be sent to the generator
len(initial_ccdc_mols)

15666

In [13]:
# CCDC conformer generator shared by every chunk, created with 8 threads.
# NOTE(review): the execution counters show this cell was last run after the
# generation loop below; on a fresh kernel it must run first, as positioned here.
ccdc_conformer_generator = ConformerGenerator(nthreads=8)
# Upper bound on the number of conformers requested from the generator
# (presumably per input molecule - confirm against the CCDC documentation).
ccdc_conformer_generator.settings.max_conformers = 100

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


In [12]:
# Generate conformers chunk by chunk and hand them to the connector together
# with the RDKit molecule they belong to. The connector's return value is not
# needed here; presumably it stores the conformers on ce_mol (and therefore in
# all_CEL, which is pickled afterwards) - confirm in CcdcRdkitConnector.
for chunk_idx in tqdm(chunk_idxs) :

    # Index the lists directly instead of scanning all molecules for every
    # chunk. Indices past the end of the list are ignored, which is what the
    # previous enumerate-based filter did implicitly.
    valid_idx = [i for i in chunk_idx if i < len(initial_ccdc_mols)]
    mol_list = [initial_ccdc_mols[i] for i in valid_idx]
    ce_mols = [corresponding_ce_mols[i] for i in valid_idx]

    # One entry per input molecule, in the same order as mol_list
    conformers_per_mol = ccdc_conformer_generator.generate(mol_list)

    # Distinct loop-variable name so the list being zipped is not shadowed
    for ce_mol, mol_conformers in zip(ce_mols, conformers_per_mol) :
        try :
            ccdc_rdkit_connector.ccdc_conformers_to_rdkit_mol(mol_conformers, ce_mol)
        except Exception as e :
            # Best effort: report the failure and carry on with the next molecule
            print(e)

  0%|                                                   | 0/157 [00:00<?, ?it/s]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


  1%|▎                                        | 1/157 [00:33<1:27:20, 33.59s/it]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


  1%|▌                                        | 2/157 [01:02<1:18:57, 30.56s/it]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


  2%|▊                                        | 3/157 [01:31<1:16:59, 29.99s/it]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


  3%|█                                        | 4/157 [01:56<1:11:31, 28.05s/it]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul


  3%|█▎                                       | 5/157 [03:07<1:35:06, 37.55s/it]

INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul
INFO: /home/baillifb/CCDC/CSD_2021/bin/mogul





KeyboardInterrupt: 

In [None]:
# Save the library, now holding the generated conformers, before any filtering
unfiltered_path = os.path.join(data_dir_path, 'raw', 'ccdc_generated_conf_ensemble_library_unfiltered.p')
with open(unfiltered_path, 'wb') as library_file :
    pickle.dump(all_CEL, library_file)
    
# At this point we only have the molecules that are parsed identically by RDKit (from mol2) and by CSD (from SMILES)

In [None]:
# Two ruthenium-containing molecules that have to be excluded from the dataset
faulty_smiles = ['O=C[Ru+9]12345(C6=C1C2C3=C64)n1c2ccc(O)cc2c2c3c(c4ccc[n+]5c4c21)C(=O)NC3=O',
 'Cc1cc2c3c(c4c5ccccc5n5c4c2[n+](c1)[Ru+9]51246(Cl)C5=C(C(=O)[O-])C1=C2C4=C56)C(=O)NC3=O']
# These molecules cannot be pickled: pickling fails with an error about their
# number of radical electrons

In [None]:
# Flag the faulty molecules as not included (modifies smiles_df in place)
smiles_df.loc[smiles_df['smiles'].isin(faulty_smiles), 'included'] = False

In [None]:
# Persist the updated flags, overwriting the CSV file smiles_df was read from.
# The index is written as the first column (to_csv default).
smiles_df.to_csv(os.path.join(data_dir_path, 'smiles_df.csv'))

In [None]:
# SMILES of every molecule whose 'included' flag is False
excluded_smiles = smiles_df.loc[~smiles_df['included'], 'smiles'].values

In [None]:
# Number of molecules flagged as not included
len(excluded_smiles)

In [None]:
# Remove the excluded molecules from the library.
# A default is passed to pop() so that re-running this cell, or an excluded
# SMILES that is not in the library, does not raise KeyError
# (assumes all_CEL.library is a dict keyed by SMILES - confirm).
for smiles in excluded_smiles :
    all_CEL.library.pop(smiles, None)

In [None]:
# Library size once the excluded molecules have been removed
all_CEL.get_num_molecules()

In [None]:
# Save the filtered library; this is the file reloaded in the next section
filtered_path = os.path.join(data_dir_path, 'raw', 'ccdc_generated_conf_ensemble_library.p')
with open(filtered_path, 'wb') as library_file :
    pickle.dump(all_CEL, library_file)

# Generate the conformer table used to create the PyTorch Geometric data files

In [6]:
# Reload the filtered library that contains the generated conformers.
# NOTE: pickle.load can execute arbitrary code; only open files produced by
# this pipeline.
generated_cel_path = os.path.join(data_dir_path, 'raw', 'ccdc_generated_conf_ensemble_library.p')
with open(generated_cel_path, 'rb') as library_file :
    all_CEL = pickle.load(library_file)

In [7]:
# Reload the molecule table. index_col=0 matches the first read of this file:
# to_csv wrote the index as the first column, and without this argument it
# would come back as a spurious 'Unnamed: 0' data column.
smiles_df = pd.read_csv(os.path.join(data_dir_path, 'smiles_df.csv'), index_col=0)

In [11]:
# Build one [smiles, generated] record per conformer of every included
# molecule. A conformer counts as generated when its property dictionary
# contains the 'Generator' key.
conf_list = [
    [smiles, 'Generator' in conf.GetPropsAsDict()]
    for smiles in smiles_df[smiles_df['included']]['smiles'].values
    for conf in all_CEL.get_conf_ensemble(smiles).mol.GetConformers()
]

In [12]:
# Total number of conformer records collected
len(conf_list)

1165920

In [13]:
# One row per conformer: the molecule SMILES and whether the conformer
# carries the 'Generator' property
conf_df = pd.DataFrame(conf_list, columns=['smiles', 'generated'])

In [15]:
# Save the conformer table; the index is written as the first CSV column
# (to_csv default)
conf_df.to_csv(os.path.join(data_dir_path, 'conf_df.csv'))