In [48]:
import pandas as pd
from rdkit.Chem import Mol, AllChem
from rdkit import Chem
import copy

In [46]:
def generate_3d_structure(mol: Mol) -> Mol:
    # Generate an initial 3D conformation
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    # Optimize the 3D conformation to minimize energy
    AllChem.UFFOptimizeMolecule(mol)
    return mol

In [47]:
def mol_from_smiles(smiles: str) -> Mol:
    mol = Chem.MolFromSmiles(smiles)
    return mol

In [49]:
def conformers_from_smiles(smiles: str) -> Mol:
    mol = mol_from_smiles(smiles)
    mol = generate_3d_structure(mol)
    return mol

In [55]:
def expand_row(row):
    # ['molecule_chembl_id', 'SMILES_nostereo', 'ID', 'symmetry_number','chiral_centers']
    # ['ID', 'SMILES_nostereo', 'rdkit_mol_cistrans_stereo', 'RS_label', 'RS_label_binary']
    expanded_rows = []
    try:
        for i in range(6):
            new_row = {
                'ID': row['ID'],
                'SMILES_nostereo': row['SMILES_nostereo'],
                'rdkit_mol_cistrans_stereo': conformers_from_smiles(row['ID']),
                'RSA_label': 'A',
                'RSA_label_one_hot': [0,0,1]
            }
            expanded_rows.append(new_row)
        return expanded_rows
    except: return []

In [2]:
achiral_dataset_organic = pd.read_csv("achiral_dataset_organic.csv")
test_RS_classification_enantiomers = pd.read_pickle("final_data_splits/test_RS_classification_enantiomers_MOL_69719_11680_5840.pkl")
train_RS_classification_enantiomers = pd.read_pickle("final_data_splits/train_RS_classification_enantiomers_MOL_326865_55084_27542.pkl")
validation_RS_classification_enantiomers = pd.read_pickle("final_data_splits/validation_RS_classification_enantiomers_MOL_70099_11748_5874.pkl")

In [31]:
a_size = achiral_dataset_organic.shape[0]
number_of_stereoisomers = train_RS_classification_enantiomers.ID.nunique()
print (f'Size of achiral organic dataset: {a_size}')
print (f'Size of conformers in achiral organic dataset: {a_size*5}')
print (f'70% split of dataset: {int(a_size*0.7)}')
print (f'15% split of dataset: {int(a_size*0.15)} (one of the datasets has to have 1 more)')
print (f'70 + 15 + 15 split of dataset: {int(a_size*0.7) + 2*int(a_size*0.15)}')
print ("-"*20)
print (f'Proportion RS dataset number of stereoisomers/number of achiral molecules (training set): {int(a_size*0.7)/(number_of_stereoisomers)}')
print (f'Proportion RS dataset number of enantiomers/number of achiral molecules (training set): {int(a_size*0.7)/(number_of_stereoisomers/2)}')
print (f'Achiral dataset is {1/(int(a_size*0.7)/(number_of_stereoisomers/2))}x smaller than it should be')
proportion = int(a_size*0.7)/(number_of_stereoisomers/2)
train_final_size =int(train_RS_classification_enantiomers.ID.nunique() * proportion)
test_final_size = int(test_RS_classification_enantiomers.ID.nunique() * proportion)
validation_final_size = int(validation_RS_classification_enantiomers.ID.nunique() * proportion)
total_rs_final_size = train_final_size + test_final_size + validation_final_size
print ("-"*20)
print (f'So the we should select this number of enantiomers:')
print (f'Training: {train_final_size}, proportion={train_final_size/total_rs_final_size}')
print (f'Test: {test_final_size}, proportion={test_final_size/total_rs_final_size}')
print (f'Validation: {validation_final_size}, proportion={validation_final_size/total_rs_final_size}')
print ("-"*20)
print (f"Dataset size after joining should be:")
print (f'Training: {(train_final_size + int(a_size*0.7))}, proportion={(train_final_size + int(a_size*0.7))/(total_rs_final_size + a_size)}')
print (f'Test: {(test_final_size + int(a_size*0.15))}, proportion={(test_final_size + int(a_size*0.15))/(total_rs_final_size + a_size)}')
print (f'Validation: {(validation_final_size + int(a_size*0.15))}, proportion={(validation_final_size + int(a_size*0.15))/(total_rs_final_size + a_size)}')

Size of achiral organic dataset: 5734
Size of conformers in achiral organic dataset: 28670
70% split of dataset: 4013
15% split of dataset: 860 (one of the datasets has to have 1 more)
70 + 15 + 15 split of dataset: 5733
--------------------
Proportion RS dataset number of stereoisomers/number of achiral molecules (training set): 0.07285237092440636
Proportion RS dataset number of enantiomers/number of achiral molecules (training set): 0.1457047418488127
Achiral dataset is 6.863194617493147x smaller than it should be
--------------------
So the we should select this number of enantiomers:
Training: 8025, proportion=0.7016700183614584
Test: 1701, proportion=0.14872781323773718
Validation: 1711, proportion=0.1496021684008044
--------------------
Dataset size after joining should be:
Training: 12038, proportion=0.7010657503931047
Test: 2561, proportion=0.1491468173082523
Validation: 2571, proportion=0.1497291945722439


In [35]:
train_RS_classification_enantiomers_selected = pd.concat([group for _, group in train_RS_classification_enantiomers.groupby("ID")][:train_final_size], ignore_index=True)
test_RS_classification_enantiomers_selected = pd.concat([group for _, group in test_RS_classification_enantiomers.groupby("ID")][:test_final_size], ignore_index=True)
validation_RS_classification_enantiomers_selected = pd.concat([group for _, group in validation_RS_classification_enantiomers.groupby("ID")][:validation_final_size], ignore_index=True)

In [37]:
print(train_RS_classification_enantiomers_selected.shape[0])
print(train_RS_classification_enantiomers_selected.ID.nunique())

46541
8025


In [45]:
print(achiral_dataset_organic.columns)
print(train_RS_classification_enantiomers_selected.columns)

Index(['molecule_chembl_id', 'SMILES_nostereo', 'ID', 'symmetry_number',
       'chiral_centers'],
      dtype='object')
Index(['ID', 'SMILES_nostereo', 'rdkit_mol_cistrans_stereo', 'RS_label',
       'RS_label_binary'],
      dtype='object')


In [39]:
train_achiral_dataset_organic = achiral_dataset_organic[:int(a_size*0.7)]
test_achiral_dataset_organic = achiral_dataset_organic[int(a_size*0.7):(int(a_size*0.7)+int(a_size*0.15))]
validation_achiral_dataset_organic = achiral_dataset_organic[(int(a_size*0.7)+int(a_size*0.15)):(int(a_size*0.7)+2*int(a_size*0.15))]

In [40]:
print(train_achiral_dataset_organic.shape[0])
print(test_achiral_dataset_organic.shape[0])
print(validation_achiral_dataset_organic.shape[0])

4013
860
860


In [56]:
train_achiral_dataset_organic_3d_conformers = pd.DataFrame([new_row for _, row in train_achiral_dataset_organic.iterrows() for new_row in expand_row(row)])
test_achiral_dataset_organic_3d_conformers = pd.DataFrame([new_row for _, row in test_achiral_dataset_organic.iterrows() for new_row in expand_row(row)])
validation_achiral_dataset_organic_3d_conformers = pd.DataFrame([new_row for _, row in validation_achiral_dataset_organic.iterrows() for new_row in expand_row(row)])

[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling 

[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling AddHs()
[20:23:53] Molecule does not have explicit Hs. Consider calling 

In [58]:
train_RS_classification_enantiomers_selected['RSA_label'] = train_RS_classification_enantiomers_selected['RS_label']
test_RS_classification_enantiomers_selected['RSA_label'] = test_RS_classification_enantiomers_selected['RS_label']
validation_RS_classification_enantiomers_selected['RSA_label'] = validation_RS_classification_enantiomers_selected['RS_label']

In [61]:
print(train_achiral_dataset_organic_3d_conformers.ID.nunique())

4004


In [65]:
def map_to_list(value):
    return [1, 0, 0] if value == 0 else [0, 1, 0]
train_RS_classification_enantiomers_selected.iloc[0]

ID                                             BrC/C(=C\[C@@H]1CCCO1)C1CCCCC1
SMILES_nostereo                                       BrCC(=CC1CCCO1)C1CCCCC1
rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x7acebdc4ebd0>
RS_label                                                                    S
RS_label_binary                                                             1
RSA_label                                                                   S
Name: 0, dtype: object

In [66]:
train_RS_classification_enantiomers_selected['RSA_label_one_hot'] = train_RS_classification_enantiomers_selected['RS_label_binary'].apply(map_to_list)
test_RS_classification_enantiomers_selected['RSA_label_one_hot'] = test_RS_classification_enantiomers_selected['RS_label_binary'].apply(map_to_list)
validation_RS_classification_enantiomers_selected['RSA_label_one_hot'] = validation_RS_classification_enantiomers_selected['RS_label_binary'].apply(map_to_list)

In [67]:
train_final = pd.concat([train_achiral_dataset_organic_3d_conformers, train_RS_classification_enantiomers_selected], ignore_index=True, join='inner')
test_final = pd.concat([test_achiral_dataset_organic_3d_conformers, test_RS_classification_enantiomers_selected], ignore_index=True, join='inner')
validation_final = pd.concat([validation_achiral_dataset_organic_3d_conformers, validation_RS_classification_enantiomers_selected], ignore_index=True, join='inner')

In [68]:
train_final.to_pickle('train_final_RSA.pkl')
test_final.to_pickle('test_final_RSA.pkl')
validation_final.to_pickle('validation_final_RSA.pkl')

In [69]:
print(train_final.shape[0])
print(test_final.shape[0])
print(validation_final.shape[0])

70565
15078
15169


In [70]:
print(train_final.ID.nunique())
print(test_final.ID.nunique())
print(validation_final.ID.nunique())

12029
2560
2569
