### Find chiral molecules

In [24]:
from rdkit import Chem
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [42]:
# Directory the input files are read from and results are written to;
# the path is relative to the kernel's current working directory.
data_dir = Path('data/geom/')

In [43]:
# File names only; they are joined with data_dir at the point of use.
save_file = Path("geom_drugs_30.npy")  # array loaded with np.load below
smiles_list_file = Path('geom_drugs_smiles.txt')  # SMILES list read with pd.read_csv below
number_atoms_file = Path("geom_drugs_n_30.npy")  # not referenced by the visible cells

In [47]:
# Load the flat per-atom table (one row per atom, 5 columns).
# NOTE(review): column 0 is used as the molecule id below; the remaining
# columns are presumably (atomic_number, x, y, z) -- confirm against the
# script that produced this file.
all_data = np.load(data_dir / save_file)  # 2d array: num_atoms x 5
mol_id = all_data[:, 0].astype(int)  # first column, cast to int (this makes a copy)
conformers = all_data[:, 1:]  # every column after the first (basic slice, so a view)
print(f'all_data shape: {all_data.shape}')

all_data shape: (322877623, 5)


In [22]:
# Read the header-less SMILES list and label its single column 'smiles'.
# set_axis raises if the file does not parse to exactly one column, just as a
# direct assignment to .columns would.
df_smile = pd.read_csv(
    data_dir / smiles_list_file,
    sep=' ',
    header=None,
).set_axis(['smiles'], axis=1)
df_smile.head()

Unnamed: 0,smiles
0,C=CCn1c(CSc2nc3ccccc3s2)nnc1SCC(=O)NCc1ccco1
1,Nc1nc2nc(-c3ccc(OCc4ccccc4)cc3)cc(C(F)(F)F)n2n1
2,CCOc1ccc(OC(=O)C(CCS(C)(=O)=O)N2C(=O)c3ccccc3C...
3,Clc1ccc(-c2nnc3n2CCCS3)cc1
4,CN(C(=O)/C(N=Nc1cccc(C(F)(F)F)c1)=C(\O)C(F)(F)...


In [28]:
def is_chiral_from_smiles(smiles):
    """Report whether a SMILES string has at least one stereocenter.

    Fallback check for when 3D information is missing: the molecule is parsed
    from SMILES and RDKit's ``Chem.FindMolChiralCenters`` is asked for its
    centers with ``includeUnassigned=True``, so centers whose configuration is
    not specified in the SMILES are counted as well.

    Args:
        smiles: SMILES string. Non-string values (e.g. ``None`` or a NaN that
            pandas produced for a missing entry) are treated as unparseable.

    Returns:
        bool: True if at least one center is reported; False if none is
        reported or the input cannot be parsed into a molecule.

    Note:
        This is a stereocenter test, not a proof of molecular chirality: a
        molecule with stereocenters can still be achiral (meso compounds).
    """
    # RDKit's parser expects a str; reject anything else here so one bad row
    # yields False instead of aborting a scan over the whole dataset.
    if not isinstance(smiles, str):
        return False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse.
        return False

    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    return len(chiral_centers) > 0

def process_smiles_file(df, smiles_column='smiles'):
    """Collect the rows of a DataFrame whose SMILES string is flagged as chiral.

    Args:
        df (pd.DataFrame): Table with one SMILES string per row.
        smiles_column (str): Name of the column holding the SMILES strings.

    Returns:
        list[tuple]: ``(index_label, smiles)`` pairs, in row order, for every
        row where ``is_chiral_from_smiles`` returns True.
    """
    # Iterate the column directly: df.iterrows() builds a full Series for every
    # row, which is wasted work when only one column is read.
    smiles_items = df[smiles_column].items()
    return [
        (idx, smiles)
        for idx, smiles in tqdm(smiles_items, total=len(df))
        if is_chiral_from_smiles(smiles)
    ]

def save_chiral_molecules(chiral_mols, output_file='chiral_molecules.csv'):
    """Write the chiral-molecule list to a CSV file and report how many were saved.

    Args:
        chiral_mols: Sequence of ``(index, smiles)`` pairs.
        output_file: Destination path of the CSV; it gets the two columns
            ``Index`` and ``SMILES`` and no pandas row index.
    """
    table = pd.DataFrame(chiral_mols, columns=['Index', 'SMILES'])
    table.to_csv(output_file, index=False)

    n_saved = len(chiral_mols)
    print(f"Saved {n_saved} chiral molecules to {output_file}")

In [29]:
# Flag every SMILES entry, then persist the chiral subset next to the inputs.
output_file = 'chiral_molecules.csv'
chiral_molecules = process_smiles_file(df_smile, smiles_column='smiles')
save_chiral_molecules(chiral_molecules, output_file=data_dir / output_file)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 292035/292035 [00:47<00:00, 6136.85it/s]


Saved 78634 chiral molecules to data/geom/chiral_molecules.csv


In [33]:
# Share of all SMILES entries that were flagged as chiral.
print(f'Percentage of molecules that are chiral: {len(chiral_molecules) / len(df_smile) * 100}%')

Percentage of molecules that are chiral: 26.926224596366872%


### Create artificial mirror images

In [None]:
import numpy as np
from pathlib import Path 

def reflect_dataset(input_file, output_file, axis=0):
    """
    Mirror every molecule in a dataset by negating one Cartesian coordinate.

    Negating a single coordinate reflects the structure through the plane
    perpendicular to that axis, which yields its mirror image.

    Args:
        input_file (str | Path): Path to the original conformers (.npy file).
        output_file (str | Path): Path to save the reflected conformers.
        axis (int): Coordinate to negate: 0 = x, 1 = y, 2 = z.

    Raises:
        ValueError: If ``axis`` is not 0, 1 or 2.
    """
    # Validate up front: with the +2 column offset a negative axis would hit a
    # non-coordinate column (axis=-1 -> atomic_number, axis=-2 -> mol_id) and
    # silently corrupt it instead of failing.
    if axis not in (0, 1, 2):
        raise ValueError(f"axis must be 0 (x), 1 (y) or 2 (z), got {axis!r}")

    print(f"Loading data from {input_file}...")
    dataset = np.load(input_file)

    print(f"Original dataset shape: {dataset.shape}")
    # The array was just loaded and is owned by this function, so it is negated
    # in place; copying it first would double peak memory for no benefit.
    reflected_dataset = dataset

    # axis+2 because dataset format is (mol_id, atomic_number, x, y, z)
    reflected_dataset[:, axis + 2] *= -1.0

    print(f"Saving reflected dataset to {output_file}...")
    np.save(output_file, reflected_dataset)
    print("Done.")

# Example usage: write an x-mirrored copy of the chiral conformer file.
# NOTE(review): this cell uses '../data/geom/' while the earlier cells use
# 'data/geom/', and running it in the notebook rebinds the notebook-level
# `data_dir` -- confirm which working directory is intended.
if __name__ == "__main__":
    data_dir = Path('../data/geom/')
    input_filename = Path("geom_drugs_chiral_30.npy")
    output_filename = Path("geom_drugs_chiral_30_mirrored.npy")

    # axis=0 negates the x coordinate
    reflect_dataset(data_dir / input_filename, data_dir / output_filename, axis=0)

### Pass the molecules through the model

In [7]:

import pickle
# Restore the pickled run arguments stored in the model output directory.
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# files that come from a trusted source.
model_path = Path('outputs/drugs_latent2/')
with (model_path / 'args.pickle').open('rb') as f:
    args = pickle.load(f)

In [8]:
# Display the restored arguments object to inspect the run configuration.
args

Namespace(exp_name='rld_fixsig_enc1_latent2_geom_drugs', train_diffusion=True, ae_path=None, trainable_ae=True, latent_nf=2, kl_weight=0.01, model='egnn_dynamics', probabilistic_model='diffusion', diffusion_steps=1000, diffusion_noise_schedule='polynomial_2', diffusion_loss_type='l2', diffusion_noise_precision=1e-05, n_epochs=3000, batch_size=32, lr=0.0001, break_train_epoch=False, dp=True, condition_time=True, clip_grad=True, trace='hutch', n_layers=4, inv_sublayers=1, nf=256, tanh=True, attention=True, norm_constant=1, sin_embedding=False, ode_regularization=0.001, dataset='geom', filter_n_atoms=None, dequantization='argmax_variational', n_report_steps=50, wandb_usr=None, no_wandb=False, online=True, no_cuda=False, save_model=True, generate_epochs=1, num_workers=0, test_epochs=1, data_augmentation=False, conditioning=[], resume=None, start_epoch=0, ema_decay=0.9999, augment_noise=0, n_stability_samples=500, normalize_factors=[1, 4, 10], remove_h=False, include_charges=False, visualiz