# Eval MolDecod

In this notebook, we evaluate the model on molecular metrics for different temperature levels.

Results on 10,000 generated molecules:

| Temperature | Validity | Uniqueness | Diversity | Novelty | KL Divergence | Fragment Similarity | Scaffold Diversity |
|-------------|----------|------------|-----------|---------|---------------|---------------------|--------------------|
| 0.1         | 1.00     | 0.04       | 0.76      | 0.9455  | 6.4742        | 0.0545              | 0.0148             |
| 0.25        | 1.00     | 0.49       | 0.81      | 0.8347  | 4.3664        | 0.1653              | 0.1398             |
| 0.5         | 0.98     | 0.95       | 0.85      | 0.8768  | 5.7033        | 0.1237              | 0.4556             |
| 0.7         | 0.96     | 0.95       | 0.87      | 0.9240  | 5.6936        | 0.0778              | 0.6540             |
| 0.9         | 0.88     | 0.88       | 0.88      | 0.9562  | 5.3179        | 0.0502              | 0.7524             |



## Setup

In [None]:
# pip install torch scikit-learn PyTDC rdkit-pypi sentencepiece

In [2]:
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from rdkit import Chem
import sentencepiece as spm

# Check if GPU is available
cuda_available = torch.cuda.is_available()

if cuda_available:
    print("CUDA is available.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")

    for i in range(torch.cuda.device_count()):
        print(f"Device {i+1}: {torch.cuda.get_device_name(i)}")
        print(f"  - Total Memory: {torch.cuda.get_device_properties(i).total_memory / (1024 ** 3):.2f} GB")
        print(f"  - Multiprocessors: {torch.cuda.get_device_properties(i).multi_processor_count}")
        print(f"  - Compute Capability: {torch.cuda.get_device_properties(i).major}.{torch.cuda.get_device_properties(i).minor}")
else:
    print("CUDA is not available.")

# Use GPU if available, CPU if not
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Pre-define seed for reproducibility
seed = 42
torch.manual_seed(seed)
if cuda_available:
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

CUDA is available.
Number of GPUs available: 1
Device 1: NVIDIA A100 80GB PCIe
  - Total Memory: 79.32 GB
  - Multiprocessors: 108
  - Compute Capability: 8.0
Using device: cuda


## Import and split data

In [None]:
from tdc.generation import MolGen

# Get MolGen datasets, and split 70/15/15 (train/validation/test)
def load_and_prepare_data(source_name, test_size=0.15, val_size=0.1765, random_state=42):
    # Initialize and get data splits
    data_source = MolGen(name=source_name)
    split_data = data_source.get_split()

    # Combine train, valid, and test into a single DataFrame and drop NaNs
    df = pd.concat([split_data['train'], split_data['valid'], split_data['test']], ignore_index=True)
    df.dropna(subset=['smiles'], inplace=True)

    # Train-validation-test split
    train_val, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train, valid = train_test_split(train_val, test_size=val_size, random_state=random_state)
    
    return train, valid, test

# Load and prepare datasets
train_moses, valid_moses, test_moses = load_and_prepare_data('MOSES')
train_chembl, valid_chembl, test_chembl = load_and_prepare_data('ChEMBL')

# Concatenate the two sources
train_df = pd.concat([train_moses, train_chembl], ignore_index=True)
valid_df = pd.concat([valid_moses, valid_chembl], ignore_index=True)
test_df = pd.concat([test_moses, test_chembl], ignore_index=True)

# Display the first few entries and shapes of datasets
print(train_df.head())
print(f"Train set shape: {train_df.shape}")
print(f"Validation set shape: {valid_df.shape}")
print(f"Test set shape: {test_df.shape}")

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


                                   smiles
0           COCCOCC1NCCc2cc3ccc(O)cc3cc21
1     Cc1ccc(NC(=O)N2CCCC2C(O)C(F)(F)F)s1
2  COc1ccc(C(C)NC(=O)CCn2cc(C)cn2)c(OC)c1
3   COc1ccc(Cl)cc1-c1ccc(C#N)c(=O)n1C(C)C
4         O=C(c1cc(-n2cccn2)ccn1)N1CCOCC1
Train set shape: (2714758, 1)
Validation set shape: (581853, 1)
Test set shape: (581756, 1)


## Tokenizer and data loaders

In [None]:
# Special tokens
SOS_TOKEN = '<SOS>'  # start of sequence
EOS_TOKEN = '<EOS>'  # end of sequence
PAD_TOKEN = '<PAD>'  # padding

In [5]:
# Load SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('smiles.model')

# Step 3: Update Dataset Class
class SMILESDataset(Dataset):
    def __init__(self, dataframe, sp_model):
        self.smiles = dataframe['smiles'].tolist()
        self.sp = sp_model

        # Process SMILES using SentencePiece
        self.processed_smiles = []
        for smile in self.smiles:
            mol = Chem.MolFromSmiles(smile)
            if mol is not None:  # Ensure valid SMILES
                canonical_smile = Chem.MolToSmiles(mol, canonical=True)
                self.processed_smiles.append(canonical_smile)
            else:
                print(f"Invalid SMILES string: {smile}")

    def __len__(self):
        return len(self.processed_smiles)

    def __getitem__(self, idx):
        smile = self.processed_smiles[idx]
        # Tokenize with SentencePiece
        tokens = [self.sp.piece_to_id(SOS_TOKEN)] + self.sp.encode(smile) + [self.sp.piece_to_id(EOS_TOKEN)]
        return torch.tensor(tokens, dtype=torch.long)

# Custom collate function for padding sequences
def collate_fn(batch):
    batch = [item for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=sp.piece_to_id(PAD_TOKEN))
    return batch

# Dataset setup
train_dataset = SMILESDataset(train_df, sp)
valid_dataset = SMILESDataset(valid_df, sp)
test_dataset = SMILESDataset(test_df, sp)



## Define model architecture

In [6]:
class RotaryPositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(RotaryPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)
        
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        self.register_buffer('sin_pos', torch.sin(position * div_term))
        self.register_buffer('cos_pos', torch.cos(position * div_term))

    def forward(self, x):
        seq_len = x.size(1)
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        x = torch.cat([
            x1 * self.cos_pos[:seq_len] - x2 * self.sin_pos[:seq_len],
            x1 * self.sin_pos[:seq_len] + x2 * self.cos_pos[:seq_len]
        ], dim=-1)
        return self.dropout(x)

class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dropout=0.1):
        super(DecoderOnlyTransformer, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = RotaryPositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=4*d_model, dropout=dropout, activation='gelu', batch_first=True, norm_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, src, src_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, mask=src_mask)
        output = self.fc_out(output)
        return self.dropout(output)

In [7]:
# Model setup
vocab_size = sp.get_piece_size()  # Use SentencePiece to get the vocabulary size
d_model = 256
nhead = 4
num_encoder_layers = 4
dropout = 0.25

model = DecoderOnlyTransformer(vocab_size, d_model, nhead, num_layers=num_encoder_layers, dropout=dropout)
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

## Load and evaluate model

### Load model and define generation functions

In [8]:
# Load the best model checkpoint
checkpoint = torch.load('best_model_mid_save.pth')
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

def create_mask(size):
    mask = torch.triu(torch.ones(size, size) * float('-inf'), diagonal=1)
    return mask

def generate_molecule(model, start_seq, sp_model, max_length=150, temperature=0.7):
    model.eval()
    with torch.no_grad():
        current_seq = start_seq.to(device).unsqueeze(0)  # Add batch dimension
        for _ in range(max_length):
            src_mask = create_mask(current_seq.size(1)).to(device)
            output = model(current_seq, src_mask)
            logits = output[0, -1, :] / temperature  # Select last time step
            next_token_idx = torch.multinomial(torch.softmax(logits, dim=-1), 1).item()
            
            if next_token_idx == sp_model.piece_to_id('<EOS>'):
                break

            next_token_tensor = torch.tensor([[next_token_idx]], device=device)
            current_seq = torch.cat([current_seq, next_token_tensor], dim=1)
    
    # Decode using SentencePiece
    generated_sequence = sp_model.decode_ids(current_seq[0].cpu().tolist())
    return generated_sequence.replace('<SOS>', '', 1)

# Example: Generate a molecule starting with a carbon atom
start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
generated_molecule = generate_molecule(model, start_seq, sp)
print("Generated molecule:", generated_molecule)

Generated molecule: COc1cc2cc(OC)c(OC)cc2cc1C(c1ccccc1)c1ccccc1


### Define evaluation functions

In [14]:
from tdc import Evaluator
from rdkit.Chem import AllChem, DataStructs, Descriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
from scipy.stats import entropy
from collections import Counter

# Validity
validity_evaluator = Evaluator(name='Validity')

# Uniqueness
uniqueness_evaluator = Evaluator(name='Uniqueness')

# Diversity
diversity_evaluator = Evaluator(name='Diversity')

# Convert SMILES to RDKit molecules
def smiles_to_mol(smiles_list):
    return [Chem.MolFromSmiles(smiles) for smiles in smiles_list if Chem.MolFromSmiles(smiles)]

# Novelty
def calculate_novelty(generated_mols, train_mols):
    train_smiles_set = set(Chem.MolToSmiles(mol) for mol in train_mols)
    novel_count = sum(1 for mol in generated_mols if Chem.MolToSmiles(mol) not in train_smiles_set)
    return novel_count / len(generated_mols)

# KL Divergence (using molecular weight as an example property)
def calculate_kl_divergence(generated_mols, train_mols):
    def get_mol_weights(mols):
        return [Chem.Descriptors.ExactMolWt(mol) for mol in mols]
    
    gen_weights = get_mol_weights(generated_mols)
    train_weights = get_mol_weights(train_mols)
    
    # Create histograms
    hist_gen, _ = np.histogram(gen_weights, bins=50, density=True)
    hist_train, _ = np.histogram(train_weights, bins=50, density=True)
    
    # Add small epsilon to avoid division by zero
    hist_gen = hist_gen + 1e-10
    hist_train = hist_train + 1e-10
    
    return entropy(hist_gen, hist_train)

# Fragment Similarity
def calculate_fragment_similarity(generated_mols, train_mols):
    def get_fragments(mol):
        return set(Chem.MolToSmiles(frag) for frag in Chem.GetMolFrags(mol, asMols=True))
    
    train_fragments = set.union(*[get_fragments(mol) for mol in train_mols])
    
    similarities = []
    for mol in generated_mols:
        gen_fragments = get_fragments(mol)
        if gen_fragments:
            similarity = len(gen_fragments.intersection(train_fragments)) / len(gen_fragments)
            similarities.append(similarity)
    
    return np.mean(similarities)

# Scaffold Diversity
def calculate_scaffold_diversity(mols):
    scaffolds = [Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol)) for mol in mols]
    scaffold_counts = Counter(scaffolds)
    return len(scaffold_counts) / len(mols)

# Get the training and test set SMILES
train_smiles = train_dataset.processed_smiles
test_smiles = test_dataset.processed_smiles

train_mols = smiles_to_mol(train_smiles)

### Temperature = 0.1

In [9]:
generated_smiles = []
for i in range(10000):
    start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
    generated_molecule = generate_molecule(model, start_seq, sp, max_length=150, temperature=0.1)  # Pass the SentencePiece model
    generated_molecule = generated_molecule  # Remove start of sequence token
    generated_smiles.append(generated_molecule)
    
    # Print every 1000th molecule
    if (i + 1) % 1000 == 0:
        print(f"Generated molecule {i + 1}: {generated_molecule}")

Generated molecule 1000: C(C)(C)NC(=O)C1CCN(C(=O)C2CCCO2)CC1
Generated molecule 2000: C(C)C(NC(=O)c1ccc(C)c(C)c1)C(=O)N1CCC(C)CC1
Generated molecule 3000: C(C)C(NC(=O)C1CC1)c1ccc(OC)cc1
Generated molecule 4000: C(C)C(C)NC(=O)C1CCN(C(=O)c2ccc(F)cc2)CC1
Generated molecule 5000: C(C)C(=O)Nc1ccc(C(=O)Nc2ccc(C)c(C)c2)cc1
Generated molecule 6000: C(C)C(=O)Nc1ccc(NC(=O)C2CC2)cc1
Generated molecule 7000: C(C)C(NC(=O)c1ccc(C)c(C)c1)C(=O)N(C)C
Generated molecule 8000: C(C)(C)NC(=O)C1CCN(C(=O)c2ccc(C)c(F)c2)CC1
Generated molecule 9000: C(C)C(NC(=O)c1ccc(C)c(C)c1)C(=O)N(C)C
Generated molecule 10000: C(C)C(NC(=O)c1ccc(C)c(C)c1)C(=O)N1CCCC1


In [10]:
# Validity
validity_score = validity_evaluator(generated_smiles)
print(f"Validity: {validity_score:.2f}")

# Uniqueness
uniqueness_score = uniqueness_evaluator(generated_smiles)
print(f"Uniqueness: {uniqueness_score:.2f}")

# Diversity
diversity_score = diversity_evaluator(generated_smiles)
print(f"Diversity: {diversity_score:.2f}")

Validity: 1.00
Uniqueness: 0.04
Diversity: 0.76


In [11]:
generated_mols = smiles_to_mol(generated_smiles)

# Novelty
novelty = calculate_novelty(generated_mols, train_mols)
print(f"Novelty: {novelty:.4f}")

# KL Divergence (using molecular weight as an example property)
kl_divergence = calculate_kl_divergence(generated_mols, train_mols)
print(f"KL Divergence: {kl_divergence:.4f}")

# Fragment Similarity
fragment_similarity = calculate_fragment_similarity(generated_mols, train_mols)
print(f"Fragment Similarity: {fragment_similarity:.4f}")

# Scaffold Diversity
scaffold_diversity = calculate_scaffold_diversity(generated_mols)
print(f"Scaffold Diversity: {scaffold_diversity:.4f}")



Novelty: 0.9455
KL Divergence: 6.4742
Fragment Similarity: 0.0545
Scaffold Diversity: 0.0148


### Temperature = 0.25

In [10]:
generated_smiles = []
for i in range(10000):
    start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
    generated_molecule = generate_molecule(model, start_seq, sp, max_length=150, temperature=0.25)  # Pass the SentencePiece model
    generated_molecule = generated_molecule  # Remove start of sequence token
    generated_smiles.append(generated_molecule)
    
    # Print every 1000th molecule
    if (i + 1) % 1000 == 0:
        print(f"Generated molecule {i + 1}: {generated_molecule}")

Generated molecule 1000: C(C)C(NC(=O)c1ccc(C)c(C)c1)C(=O)NC(C)C
Generated molecule 2000: C(C)(C)C(=O)Nc1cccc(C(=O)NCC2CCCO2)c1
Generated molecule 3000: Cc1nc(CN2CCOC(c3ccc(C)cc3)C2)no1
Generated molecule 4000: C(C)C(=O)Nc1ccc(C(=O)Nc2cc(C)on2)cc1
Generated molecule 5000: C(C)(C)NC(=O)CC1CCN(C(=O)c2ccc(C)c(F)c2)CC1
Generated molecule 6000: C(C)NC(=O)c1ccc(C)c(S(=O)(=O)N2CCOCC2)c1
Generated molecule 7000: C(C)C(NC(=O)C1CC(=O)N(C)C1)c1ccccc1
Generated molecule 8000: C(C)(C)NC(=O)CN1C(=O)COc2ccc(N)cc21
Generated molecule 9000: C(C)C(=O)Nc1ccc(S(=O)(=O)N2CCCC2)cc1
Generated molecule 10000: C(C)NC(=O)C(C)Sc1nc2ccccc2s1


In [11]:
# Validity
validity_score = validity_evaluator(generated_smiles)
print(f"Validity: {validity_score:.2f}")

# Uniqueness
uniqueness_score = uniqueness_evaluator(generated_smiles)
print(f"Uniqueness: {uniqueness_score:.2f}")

# Diversity
diversity_score = diversity_evaluator(generated_smiles)
print(f"Diversity: {diversity_score:.2f}")

Validity: 1.00
Uniqueness: 0.49
Diversity: 0.81


In [15]:
generated_mols = smiles_to_mol(generated_smiles)

# Novelty
novelty = calculate_novelty(generated_mols, train_mols)
print(f"Novelty: {novelty:.4f}")

# KL Divergence (using molecular weight as an example property)
kl_divergence = calculate_kl_divergence(generated_mols, train_mols)
print(f"KL Divergence: {kl_divergence:.4f}")

# Fragment Similarity
fragment_similarity = calculate_fragment_similarity(generated_mols, train_mols)
print(f"Fragment Similarity: {fragment_similarity:.4f}")

# Scaffold Diversity
scaffold_diversity = calculate_scaffold_diversity(generated_mols)
print(f"Scaffold Diversity: {scaffold_diversity:.4f}")



Novelty: 0.8347
KL Divergence: 4.3664
Fragment Similarity: 0.1653
Scaffold Diversity: 0.1398


### Temperature = 0.5

In [17]:
generated_smiles = []
for i in range(10000):
    start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
    generated_molecule = generate_molecule(model, start_seq, sp, max_length=150, temperature=0.5)  # Pass the SentencePiece model
    generated_molecule = generated_molecule  # Remove start of sequence token
    generated_smiles.append(generated_molecule)
    
    # Print every 1000th molecule
    if (i + 1) % 1000 == 0:
        print(f"Generated molecule {i + 1}: {generated_molecule}")

Generated molecule 1000: C(C)(C)C(=O)Nc1ccc(C(=O)Nc2ccc(C)cc2)cc1
Generated molecule 2000: C1(CC2CCN(C(=O)c3ccc(F)cc3)CC2)CC1
Generated molecule 3000: Cc1nc(CN2CCN(C(=O)C(C)(C)C)CC2)no1
Generated molecule 4000: C(C)N(C)C(=O)CN1CCN(C(=O)C(C)C)CC1
Generated molecule 5000: C(C)C(=O)N1CCC(n2ncc3c(NCc4ccccc4)ncnc32)CC1
Generated molecule 6000: Cc1ccc(C(=O)Nc2cc(C)no2)cc1
Generated molecule 7000: C(=O)N1CCc2nc(-c3ccccc3)cc(NC(=O)Nc3ccc(F)c(C(F)(F)F)c3)c2C1
Generated molecule 8000: C(C)(C)N(C)C(=O)CCc1nc2ccccc2s1
Generated molecule 9000: Cc1c(C(=O)NCc2ccccn2)cnn1-c1ccccn1
Generated molecule 10000: C(CC(=O)N1CCC(CCC(=O)O)CC1)c1ccc(F)cc1


In [18]:
# Validity
validity_score = validity_evaluator(generated_smiles)
print(f"Validity: {validity_score:.2f}")

# Uniqueness
uniqueness_score = uniqueness_evaluator(generated_smiles)
print(f"Uniqueness: {uniqueness_score:.2f}")

# Diversity
diversity_score = diversity_evaluator(generated_smiles)
print(f"Diversity: {diversity_score:.2f}")

Validity: 0.98
Uniqueness: 0.95
Diversity: 0.85


In [19]:
generated_mols = smiles_to_mol(generated_smiles)

# Novelty
novelty = calculate_novelty(generated_mols, train_mols)
print(f"Novelty: {novelty:.4f}")

# KL Divergence (using molecular weight as an example property)
kl_divergence = calculate_kl_divergence(generated_mols, train_mols)
print(f"KL Divergence: {kl_divergence:.4f}")

# Fragment Similarity
fragment_similarity = calculate_fragment_similarity(generated_mols, train_mols)
print(f"Fragment Similarity: {fragment_similarity:.4f}")

# Scaffold Diversity
scaffold_diversity = calculate_scaffold_diversity(generated_mols)
print(f"Scaffold Diversity: {scaffold_diversity:.4f}")

Novelty: 0.8768
KL Divergence: 5.7033
Fragment Similarity: 0.1237
Scaffold Diversity: 0.4556


### Temperature = 0.7

In [20]:
generated_smiles = []
for i in range(10000):
    start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
    generated_molecule = generate_molecule(model, start_seq, sp, max_length=150, temperature=0.7)  # Pass the SentencePiece model
    generated_molecule = generated_molecule  # Remove start of sequence token
    generated_smiles.append(generated_molecule)
    
    # Print every 1000th molecule
    if (i + 1) % 1000 == 0:
        print(f"Generated molecule {i + 1}: {generated_molecule}")

Generated molecule 1000: C1Cc2cccc3cccc(c23)C1C3=O
Generated molecule 2000: C(C(=O)N1CCC(c2nc(C)no2)CC1)n1cccn1
Generated molecule 3000: Cc1cc(NC(=O)c2cc(C)n(C)c2Cl)n(C)n1
Generated molecule 4000: C(C)NC(=O)CC1Sc2ccccc2NC1=O
Generated molecule 5000: C(=O)Nc1sc2c(c1C(N)=O)CCCC2
Generated molecule 6000: C1Nc2ccccc2C(c2cccnc2)N1c1ccc(Cl)cc1
Generated molecule 7000: C(=O)N1CCC(c2c(C)oc3cc(C)ccc23)CC1
Generated molecule 8000: Cc1cc2cccn2[nH]1
Generated molecule 9000: C1CN(C(=O)C2(c3ccc(Cl)cc3)CC2)CC(C)O1
Generated molecule 10000: C(=O)N1CCN(C(=O)c2cc(OC)cc(OC)c2)CC1


In [21]:
# Validity
validity_score = validity_evaluator(generated_smiles)
print(f"Validity: {validity_score:.2f}")

# Uniqueness
uniqueness_score = uniqueness_evaluator(generated_smiles)
print(f"Uniqueness: {uniqueness_score:.2f}")

# Diversity
diversity_score = diversity_evaluator(generated_smiles)
print(f"Diversity: {diversity_score:.2f}")

[00:05:51] Conflicting single bond directions around double bond at index 3.
[00:05:51]   BondStereo set to STEREONONE and single bond directions set to NONE.


Validity: 0.96


[00:05:52] Conflicting single bond directions around double bond at index 3.
[00:05:52]   BondStereo set to STEREONONE and single bond directions set to NONE.


Uniqueness: 0.95


[00:05:54] Conflicting single bond directions around double bond at index 3.
[00:05:54]   BondStereo set to STEREONONE and single bond directions set to NONE.


Diversity: 0.87


In [22]:
generated_mols = smiles_to_mol(generated_smiles)

# Novelty
novelty = calculate_novelty(generated_mols, train_mols)
print(f"Novelty: {novelty:.4f}")

# KL Divergence (using molecular weight as an example property)
kl_divergence = calculate_kl_divergence(generated_mols, train_mols)
print(f"KL Divergence: {kl_divergence:.4f}")

# Fragment Similarity
fragment_similarity = calculate_fragment_similarity(generated_mols, train_mols)
print(f"Fragment Similarity: {fragment_similarity:.4f}")

# Scaffold Diversity
scaffold_diversity = calculate_scaffold_diversity(generated_mols)
print(f"Scaffold Diversity: {scaffold_diversity:.4f}")

[00:07:24] Conflicting single bond directions around double bond at index 3.
[00:07:24]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:07:24] Conflicting single bond directions around double bond at index 3.
[00:07:24]   BondStereo set to STEREONONE and single bond directions set to NONE.


Novelty: 0.9240
KL Divergence: 5.6936
Fragment Similarity: 0.0778
Scaffold Diversity: 0.6540


### Temperature = 0.9

In [23]:
# Generate a large set of molecules (e.g., 10,000 or 30,000)
generated_smiles = []
for i in range(10000):
    start_seq = torch.tensor([sp.piece_to_id('<SOS>'), sp.piece_to_id('C')], device=device)  # Start with <SOS> and a carbon atom
    generated_molecule = generate_molecule(model, start_seq, sp, max_length=150, temperature=0.9)  # Pass the SentencePiece model
    generated_molecule = generated_molecule  # Remove start of sequence token
    generated_smiles.append(generated_molecule)
    
    # Print every 1000th molecule
    if (i + 1) % 1000 == 0:
        print(f"Generated molecule {i + 1}: {generated_molecule}")

Generated molecule 1000: Cc1cc(C(=O)N[C@]2(C(=O)O)C[C@H]3C[C@@](C)(C(=O)O)[C@H](C)CC[C@H]32)cs1
Generated molecule 2000: Cn1c(-c2ccc(NC(=O)c3ccc4c(c3)CCO4)cc2)nc2n(c1=O)CCCCC2
Generated molecule 3000: Cc1cc(C(=O)N2CCCC2c2ccnc(-n3ccnc3C)c2)co1
Generated molecule 4000: C(C#N)Oc1ccc(CP(=O)(O)O)cc1
Generated molecule 5000: C1CCCN(C(=O)CCCCCNC(=O)c2sc3c(c2C)CCCC3)C1
Generated molecule 6000: C1(C#N)CCCN(C(=O)NCCCO)C1
Generated molecule 7000: C(=O)Nc1cc2ncnc(Sc3cccc(NC(=O)c4ccccc4Cl)c3)c2cc1Cl
Generated molecule 8000: C1CCN(C2CCc3c2n(Cc2ccccc4C)c(=O)c3C(=O)NC2CCc3c(C)noc32)CC1
Generated molecule 9000: C[C@H](O[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@H]1O)C(=O)OC
Generated molecule 10000: Cc1cc( ⁇ )ccc1OC(=O)c1c(C)nc(NCC(C)C)nc1O


In [24]:
# Validity
validity_score = validity_evaluator(generated_smiles)
print(f"Validity: {validity_score:.2f}")

# Uniqueness
uniqueness_score = uniqueness_evaluator(generated_smiles)
print(f"Uniqueness: {uniqueness_score:.2f}")

# Diversity
diversity_score = diversity_evaluator(generated_smiles)
print(f"Diversity: {diversity_score:.2f}")

[00:37:34] Conflicting single bond directions around double bond at index 40.
[00:37:34]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:37:35] Conflicting single bond directions around double bond at index 1.
[00:37:35]   BondStereo set to STEREONONE and single bond directions set to NONE.


Validity: 0.88


[00:37:35] Conflicting single bond directions around double bond at index 40.
[00:37:35]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:37:36] Conflicting single bond directions around double bond at index 1.
[00:37:36]   BondStereo set to STEREONONE and single bond directions set to NONE.


Uniqueness: 0.88


[00:37:37] Conflicting single bond directions around double bond at index 40.
[00:37:37]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:37:38] Conflicting single bond directions around double bond at index 1.
[00:37:38]   BondStereo set to STEREONONE and single bond directions set to NONE.


Diversity: 0.88


In [25]:
generated_mols = smiles_to_mol(generated_smiles)

# Novelty
novelty = calculate_novelty(generated_mols, train_mols)
print(f"Novelty: {novelty:.4f}")

# KL Divergence (using molecular weight as an example property)
kl_divergence = calculate_kl_divergence(generated_mols, train_mols)
print(f"KL Divergence: {kl_divergence:.4f}")

# Fragment Similarity
fragment_similarity = calculate_fragment_similarity(generated_mols, train_mols)
print(f"Fragment Similarity: {fragment_similarity:.4f}")

# Scaffold Diversity
scaffold_diversity = calculate_scaffold_diversity(generated_mols)
print(f"Scaffold Diversity: {scaffold_diversity:.4f}")

[00:38:53] Conflicting single bond directions around double bond at index 40.
[00:38:53]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:38:53] Conflicting single bond directions around double bond at index 40.
[00:38:53]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:38:55] Conflicting single bond directions around double bond at index 1.
[00:38:55]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:38:55] Conflicting single bond directions around double bond at index 1.
[00:38:55]   BondStereo set to STEREONONE and single bond directions set to NONE.


Novelty: 0.9562
KL Divergence: 5.3179
Fragment Similarity: 0.0502
Scaffold Diversity: 0.7524
