<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/Smiles_generation_LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets rdkit pandas tqdm


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [28]:
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
from tqdm import tqdm
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from rdkit import Chem

In [29]:

# ========== Step 1: Load Dataset ==========
# Example: Load SMILES strings from CSV or .smi file
def load_smiles(file_path, max_length=100):
    df = pd.read_csv(file_path)
    smiles_list = df['smiles'].dropna().unique().tolist()
    smiles_list = [s for s in smiles_list if len(s) <= max_length and Chem.MolFromSmiles(s)]
    return smiles_list

# Example dataset
smiles = [
    "CC(=O)OC1=CC=CC=C1C(=O)O",
    "C1=CC=C(C=C1)C=O",
    "CCN(CC)CCOC(=O)C1=CC=CC=C1Cl",
    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
    "CCOC(=O)C1=CC=CC=C1OC",
    "CC(C)C1=CC=C(C=C1)C(C)C(=O)NC",
    "COC1=CC=CC=C1OC",
    "CC(C)C(=O)NC1=CC=C(C=C1)Cl",
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    "CCC(CC)COC(=O)C1=CC=CC=C1",
    "CC1=CC(=O)NC(=O)N1",
    "CC(C)OC(=O)C1=CC=CC=C1Cl",
    "CN(C)C(=O)C1=CC=C(C=C1)Cl",
    "COC1=CC=CC=C1C(=O)O",
    "C1=CC=C(C=C1)N",
    "CCOC(=O)C1=CC=CC=C1F",
    "CN(C)C(=O)C1=CC=C(C=C1)OC",
    "CCC(=O)OC1=CC=CC=C1C(=O)O",
    "CC(C)NC1=CC=C(C=C1)OC",
    "C1=CC(=CC=C1C=O)O",
    "CC1=CC(=O)NC(=O)N1C",
    "CCC(C)OC(=O)C1=CC=CC=C1",
    "CCOC(=O)C1=CC=CC=C1Cl",
    "CN1C=NC2=C1C(=O)NC(=O)N2",
    "CC(C)OC(=O)C1=CC=CC=C1F",
    "C1=CC=C2C(=C1)C=CC=C2",
    "C1=CC(=CC=C1C=O)Cl",
    "CCN(CC)CCOC(=O)C1=CC=CC=C1F",
    "CC(C)C1=CC=C(C=C1)O",
    "COC1=CC=C(C=C1)C=O",
    "CCOC(=O)C1=CC=CC=C1NO",
    "CC(C)OC(=O)C1=CC=CC=C1Br",
    "CCC(=O)OC1=CC=CC=C1F",
    "COC1=CC=CC(=C1)C=O",
    "CCC(C)OC(=O)C1=CC=CC=C1Cl",
    "CN1C=NC2=C1C(=O)N(C(=O)N2)C",
    "CC(C)CC1=CC=CC=C1O",
    "CCOC(=O)C1=CC=CC=C1Br",
    "COC1=CC=C(C=C1)C(=O)O",
    "CC(C)OC(=O)C1=CC=CC=C1N",
    "CN(C)C(=O)C1=CC=C(C=C1)F",
    "CCC(=O)OC1=CC=CC=C1NO",
    "COC1=CC=CC=C1C=O",
    "CCOC(=O)C1=CC=CC=C1NO2",
    "CN1C=NC2=C1C(=O)NC(=O)N2C",
    "CC(C)OC(=O)C1=CC=CC=C1CN",
    "C1=CC=C(C=C1)C(=O)O",
    "C1=CC=C(C=C1)Br",
    "COC1=CC=CC=C1OC",
]

# Save to file for training
with open("smiles.txt", "w") as f:
    for s in smiles:
        f.write(s + "\n")

In [30]:
# ========== Step 2: Tokenizer ==========
# GPT-2 uses byte-level BPE tokenizer; we adapt it to SMILES
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Needed for batching
tokenizer.add_special_tokens({'bos_token': '<bos>', 'eos_token': '<eos>'})

2

In [31]:
# ========== Step 3: Prepare Dataset ==========
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="smiles.txt",
    block_size=128,
)
print(f"Loaded {len(dataset)} samples")


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)


Loaded 49 samples




In [32]:
# ========== Step 4: Load GPT2 and Fine-Tune ==========
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


Embedding(50259, 768)

In [33]:

training_args = TrainingArguments(
    output_dir="./smiles-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=50,
    save_steps=5000,
    save_total_limit=1,
    logging_steps=5000,
    prediction_loss_only=True,
    learning_rate=5e-4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

# ========== Step 5: Save Model ==========
#trainer.save_model("./smiles-gpt2")
#tokenizer.save_pretrained("./smiles-gpt2")



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=5, training_loss=3.1902097702026366, metrics={'train_runtime': 141.7194, 'train_samples_per_second': 1.729, 'train_steps_per_second': 0.035, 'total_flos': 3250840320000.0, 'train_loss': 3.1902097702026366, 'epoch': 5.0})

In [34]:

# ========== Step 6: Generate Novel SMILES ==========

def generate_smiles(model, tokenizer, prompt="C", num_return_sequences=5, max_length=64, retry_limit=5):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    smiles_list = []

    for _ in range(retry_limit):
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,  # Adjust temperature for better results
            pad_token_id=tokenizer.eos_token_id,
        )

        for i in range(num_return_sequences):
            decoded = tokenizer.decode(output[i], skip_special_tokens=True)
            smiles = decoded.strip().split("\n")[0].strip()

            if smiles:
                mol = Chem.MolFromSmiles(smiles)
                if mol:
                    smiles_list.append(smiles)
                else:
                    print(f"Invalid SMILES skipped: {smiles}")

        if smiles_list:  # If at least one valid SMILES is generated, break loop
            break

    return smiles_list



generated_smiles = generate_smiles(model, tokenizer, num_return_sequences=10)

# ========== Step 7: Validate and Print ==========
valid_smiles = [s for s in generated_smiles if Chem.MolFromSmiles(s)]
print("Generated Valid SMILES:")
for s in valid_smiles:
    print(s)

Invalid SMILES skipped: COC=CCCCCCCC1CCCC=CC1CC=CC=CC=CC(CC1)CC(CC1=CC)CC(CC1)1)CC1C1O(CC(CC(CC1)C)1)CC1(CC1(CCCC1
Invalid SMILES skipped: C(((CC)CC)(O)CCCC(CC1)(CCCC1=CC)O)CC)CC(C(C)CC(CC)CC1CC1CC)CCCC)(1CC1(C1)CC1CC1CC)1(CC
Invalid SMILES skipped: C(N1(C1)O)(1C1)1(C(1(1)C((1)N1))(CC(CC(CC)CC1(C(C(CC))CC11O)C1(1)1(CC1)
Invalid SMILES skipped: C(C)CC1C((CC)CC1CC(CCCC)CC1CCCC(CC)CC1C(CC)(C)1C(1)CC1)CC(CC)O1(1CC)CC)C1CC(1)CC1CC1
Invalid SMILES skipped: C()O)CC1(CC=CC(CC)CC)CC1C1CC1CC=CC1CCCC)C1C)CC1CC1(CC)1)CC1CC1CCCCCCCCCC(1)CC1CC1CC(CC)CCCC
Invalid SMILES skipped: COC=OC(=CC1)CC1C1(CC)CCCC1CC1CC(CC1CC1C1C1(CC1)CC=CC11)CC1CC1C1O1CC(CC(=CC(O)CC)CC(11
Invalid SMILES skipped: C(1)C((1=1=CC)O)O)CC((CC1CC)CC)CC1CC1)CC(CC)1(C1)CC1)1CC((1(CC1)C1(1CC)CC1CC)CC=
Invalid SMILES skipped: C,=CC=CC(CC1=CC)C1(CC=CC)CC1CC=CC(CC1=CCCC(CC)O)CC1CC(CC1CC1CC1CC1CC)C(CC)C1C1(1(CC)CC(
Invalid SMILES skipped: C1C1CC1CC=CCCC=CC(C(CC1CC)1CC)C(CC1=CC(CC)O)CC(CC1CC)CCCC(1CC)CC1C(CC(CC)(1)CC(CC)1CC1CC
Generated Val

[20:54:48] SMILES Parse Error: extra close parentheses while parsing: COC=CCCCCCCC1CCCC=CC1CC=CC=CC=CC(CC1)CC(CC1=CC)CC(CC1)1)CC1C1O(CC(CC(CC1)C)1)CC1(CC1(CCCC1
[20:54:48] SMILES Parse Error: check for mistakes around position 56:
[20:54:48] 1)CC(CC1=CC)CC(CC1)1)CC1C1O(CC(CC(CC1)C)1
[20:54:48] ~~~~~~~~~~~~~~~~~~~~^
[20:54:49] SMILES Parse Error: Failed parsing SMILES 'COC=CCCCCCCC1CCCC=CC1CC=CC=CC=CC(CC1)CC(CC1=CC)CC(CC1)1)CC1C1O(CC(CC(CC1)C)1)CC1(CC1(CCCC1' for input: 'COC=CCCCCCCC1CCCC=CC1CC=CC=CC=CC(CC1)CC(CC1=CC)CC(CC1)1)CC1C1O(CC(CC(CC1)C)1)CC1(CC1(CCCC1'
[20:54:49] SMILES Parse Error: syntax error while parsing: C(((CC)CC)(O)CCCC(CC1)(CCCC1=CC)O)CC)CC(C(C)CC(CC)CC1CC1CC)CCCC)(1CC1(C1)CC1CC1CC)1(CC
[20:54:49] SMILES Parse Error: check for mistakes around position 3:
[20:54:49] C(((CC)CC)(O)CCCC(CC1)(CCCC1=CC)O)CC)CC(C
[20:54:49] ~~^
[20:54:49] SMILES Parse Error: Failed parsing SMILES 'C(((CC)CC)(O)CCCC(CC1)(CCCC1=CC)O)CC)CC(C(C)CC(CC)CC1CC1CC)CCCC)(1CC1(C1)CC1CC1CC)1(CC' for inpu

In [44]:
"""
Enhanced SMILES Generator using Chemistry-Aware Language Models
"""
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Descriptors
import warnings
warnings.filterwarnings('ignore')

# Disable WANDB
os.environ["WANDB_DISABLED"] = "true"

# Check CUDA availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class SMILESDataset(Dataset):
    """Custom dataset for SMILES strings"""
    def __init__(self, smiles_list, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.inputs = []

        for smiles in tqdm(smiles_list, desc="Tokenizing SMILES"):
            # Add special tokens for better learning
            text = f"<smiles>{smiles}</smiles>"
            encodings = tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length",
                return_tensors="pt"
            )
            self.inputs.append({
                'input_ids': encodings['input_ids'][0],
                'attention_mask': encodings['attention_mask'][0]
            })

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx]

def download_chembl_smiles(max_compounds=10000, min_atoms=5, max_atoms=50):
    """Download and filter ChEMBL-like drug compounds"""
    # For demonstration, let's use a larger sample dataset
    # In a real implementation, you'd want to download from ChEMBL or use a proper dataset

    # Example drug-like SMILES (expanded from your original dataset)
    sample_smiles = [
        "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
        "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
        "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
        "CN1C=NC2=C1C(=O)NC(=O)N2",  # Theophylline
        "CCN(CC)CCOC(=O)C1=CC=CC=C1Cl",  # Diethyl procainamide
        "CC(C)OC(=O)C1=CC=CC=C1C(=O)O",  # Ketoprofen
        "CCOC(=O)C1=CC=CC=C1N",  # Procaine
        "CC(=O)NC1=CC=C(C=C1)O",  # Acetaminophen
        "COC1=CC=CC=C1OC",  # Dimethoxybenzene
        "CN1C=NC2=C1C(=O)N(C(=O)N2)C",  # Theobromine
        "CC1=CC(=O)NC(=O)N1",  # 5-Methyluracil
        "CC(C)OC(=O)C1=CC=CC=C1Cl",  # Chlorbenzoxamine
        "CN(C)C(=O)C1=CC=C(C=C1)Cl",  # Chlorpheniramine
        "COC1=CC=CC=C1C(=O)O",  # Methyl salicylate
        "C1=CC=C(C=C1)N",  # Aniline
        "C1=CC=C2C(=C1)C=CC=C2",  # Naphthalene
        "COC1=CC=C(CC(=O)O)C=C1",  # 4-Methoxyphenylacetic acid
        "CC(C)C1=CC=C(C=C1)O",  # 4-isopropylphenol
        "CC1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1",  # Tolbutamide
        "CN1CCN(CC1)C1=CC=C(Cl)C=C1",  # Chlorpromazine derivative
        "OC1=CC=CC=C1C(=O)NN",  # Salicylhydrazide
        "C1CC(=O)NC(=O)C1",  # Glutarimide
        "CC1=CC=C(C=C1)NC(=O)C",  # 4-Methylacetanilide
        "CC1=CC=CC=C1O",  # o-Cresol
        "CC1=CC=CC=C1N",  # o-Toluidine
        "CC(=O)OC1=CC=CC=C1",  # Phenyl acetate
        "CC1=CN=C(C=C1)C(=O)N",  # Nicotinamide derivative
        "C1=CC=C(C=C1)C(=O)C=O",  # Phenylglyoxal
        "COC1=CC=C(C=C1)CCN",  # 4-Methoxyphenethylamine
        "CC1=CC=CC=C1CC(=O)O",  # 2-Methylphenylacetic acid
        "CC1=CC=CC(=C1)C(=O)O",  # 3-Methylbenzoic acid
        "CC1=CC=CC=C1C(=O)O",  # 2-Methylbenzoic acid
        "CC1=CC=C(C=C1)C(=O)O",  # 4-Methylbenzoic acid
        "CC1=CC=C(O)C=C1",  # p-Cresol
        "CC1=CC=C(C=C1)C(C)N",  # 4-Methylamphetamine
        "CC1=CC=C(C=C1)S(=O)(=O)N",  # Toluenesulfonamide
        "C1=CC=C(C=C1)CC(=O)O",  # Phenylacetic acid
        "C1=CC=C(C=C1)CCCC(=O)O",  # 4-Phenylbutyric acid
        "C1=CC=C(C=C1)C(=O)N",  # Benzamide
        "CC(=O)NC1=CC=CC=C1",  # Acetanilide
        "CC(C)(C)C1=CC=C(C=C1)O",  # 4-tert-butylphenol
        "CC(C)(C)C1=CC=CC=C1",  # tert-butylbenzene
        "CC1=CC(=CC=C1)N",  # m-Toluidine
    ]

    # For a real application, you would download from ChEMBL:
    # import deepchem as dc
    # dc.molnet.load_chembl25(featurizer='ECFP', split='random')

    # Validate and filter SMILES
    valid_smiles = []
    for smi in sample_smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            # Apply filters (e.g., size, drug-likeness)
            num_atoms = mol.GetNumAtoms()
            if min_atoms <= num_atoms <= max_atoms:
                # Calculate properties to ensure drug-likeness
                logp = Descriptors.MolLogP(mol)
                if -0.4 <= logp <= 5.6:  # Lipinski's rule of 5 range
                    valid_smiles.append(smi)

    # Add more compounds if needed (for a real implementation)
    if len(valid_smiles) < max_compounds:
        # In a real implementation, get more from a database
        pass

    return valid_smiles[:max_compounds]

def preprocess_dataset():
    """Preprocess SMILES strings and prepare datasets"""
    # Get SMILES data
    smiles_list = download_chembl_smiles(max_compounds=2000)
    print(f"Total valid SMILES: {len(smiles_list)}")

    # Split into train/validation sets
    np.random.shuffle(smiles_list)
    split_idx = int(len(smiles_list) * 0.9)
    train_smiles = smiles_list[:split_idx]
    val_smiles = smiles_list[split_idx:]

    return train_smiles, val_smiles

def create_tokenizer_and_model(model_name="gpt2-medium"):
    """Initialize tokenizer and model"""
    # Use a chemistry-aware model if available
    # For demo, we'll fine-tune GPT-2-medium which has better capacity than base GPT-2
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Add special tokens for SMILES
    special_tokens = {
        'pad_token': '[PAD]',
        'bos_token': '<smiles>',
        'eos_token': '</smiles>',
        'additional_special_tokens': ['[C]', '[O]', '[N]', '[S]', '[Cl]', '[F]', '[Br]', '[I]']
    }

    # Add special tokens to tokenizer
    tokenizer.add_special_tokens(special_tokens)

    # Initialize model
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))

    return tokenizer, model

def train_smiles_generator(model, tokenizer, train_smiles, val_smiles, output_dir="./smiles-generator"):
    """Train the SMILES generator model"""
    # Create datasets
    train_dataset = SMILESDataset(train_smiles, tokenizer)
    val_dataset = SMILESDataset(val_smiles, tokenizer)

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    # Configure training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjust based on dataset size
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_steps=500,
        save_steps=500,
        warmup_steps=500,
        prediction_loss_only=False,
        logging_dir=f"{output_dir}/logs",
        logging_steps=500,
        load_best_model_at_end=True,
        save_total_limit=1,
        fp16=True if torch.cuda.is_available() else False,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,  # Lower learning rate for fine-tuning
    )

    # Data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We're doing causal language modeling, not masked
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Save model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

    return model, tokenizer

def generate_novel_smiles(model, tokenizer, num_sequences=25, max_length=100, temperature=0.7, top_p=0.9):
    """Generate novel SMILES strings"""
    # Set model to evaluation mode
    model.eval()
    model.to(device)

    # Generation parameters
    generation_config = {
        "do_sample": True,
        "top_p": top_p,
        "temperature": temperature,
        "top_k": 50,
        "max_length": max_length,
        "num_return_sequences": num_sequences,
        "pad_token_id": tokenizer.pad_token_id,
        "bos_token_id": tokenizer.bos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    # Generate sequences
    input_ids = tokenizer("<smiles>", return_tensors="pt").input_ids.to(device)

    # Generate with guidance
    outputs = model.generate(input_ids, **generation_config)

    # Decode generated SMILES
    generated_smiles = []
    for output in outputs:
        decoded = tokenizer.decode(output, skip_special_tokens=True)
        # Extract SMILES string between tags if present
        if "<smiles>" in decoded and "</smiles>" in decoded:
            smiles = decoded.split("<smiles>")[1].split("</smiles>")[0].strip()
        else:
            smiles = decoded.strip()
        generated_smiles.append(smiles)

    return generated_smiles

def validate_smiles(smiles_list):
    """Validate generated SMILES and calculate properties"""
    valid_mols = []
    valid_smiles = []
    properties = []

    for smiles in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                valid_mols.append(mol)
                valid_smiles.append(smiles)

                # Calculate basic molecular properties
                properties.append({
                    'SMILES': smiles,
                    'MolWeight': round(Descriptors.MolWt(mol), 2),
                    'LogP': round(Descriptors.MolLogP(mol), 2),
                    'NumHDonors': Descriptors.NumHDonors(mol),
                    'NumHAcceptors': Descriptors.NumHAcceptors(mol),
                    'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
                    'TPSA': round(Descriptors.TPSA(mol), 2)
                })
        except:
            continue

    print(f"Generated {len(smiles_list)} SMILES, {len(valid_smiles)} valid ({len(valid_smiles)/len(smiles_list)*100:.1f}%)")

    return valid_mols, valid_smiles, properties

def visualize_molecules(mols, n_per_row=5, max_mols=10):
    """Visualize generated molecules"""
    if len(mols) > max_mols:
        mols = mols[:max_mols]

    img = Draw.MolsToGridImage(
        mols,
        molsPerRow=n_per_row,
        subImgSize=(250, 250),
        legends=[f"Mol {i+1}" for i in range(len(mols))]
    )
    return img

def analyze_properties(properties):
    """Analyze properties of generated molecules"""
    if not properties:
        return "No valid molecules to analyze"

    df = pd.DataFrame(properties)

    # Check Lipinski's Rule of 5
    df['Lipinski_Violations'] = (
        (df['MolWeight'] > 500).astype(int) +
        (df['LogP'] > 5).astype(int) +
        (df['NumHDonors'] > 5).astype(int) +
        (df['NumHAcceptors'] > 10).astype(int)
    )

    summary = {
        'Total_Molecules': len(df),
        'Rule_of_5_Compliant': (df['Lipinski_Violations'] <= 1).sum(),
        'Avg_MolWeight': df['MolWeight'].mean(),
        'Avg_LogP': df['LogP'].mean(),
        'Avg_TPSA': df['TPSA'].mean(),
    }

    print(f"Property Analysis Summary:")
    for k, v in summary.items():
        print(f"  {k}: {v}")

    return df



Using device: cpu


In [45]:
    """Main execution function"""
    # Process dataset
    train_smiles, val_smiles = preprocess_dataset()

    # Initialize tokenizer and model
    tokenizer, model = create_tokenizer_and_model()

    # Train model
    model, tokenizer = train_smiles_generator(model, tokenizer, train_smiles, val_smiles)

    # Generate novel molecules
    print("\nGenerating novel SMILES...")
    generated_smiles = generate_novel_smiles(model, tokenizer, num_sequences=50)

    # Validate and analyze generated molecules
    mols, valid_smiles, properties = validate_smiles(generated_smiles)

    # Analyze properties
    props_df = analyze_properties(properties)

    # Display results
    print(f"\nExample valid SMILES generated:")
    for i, smiles in enumerate(valid_smiles[:5]):
        print(f"{i+1}. {smiles}")


Total valid SMILES: 39


Tokenizing SMILES: 100%|██████████| 35/35 [00:00<00:00, 1776.15it/s]
Tokenizing SMILES: 100%|██████████| 4/4 [00:00<00:00, 753.93it/s]

Train dataset size: 35
Validation dataset size: 4





ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: IntervalStrategy.NO
- Save strategy: SaveStrategy.STEPS