<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/Diffusion_molecule_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rdkit import Chem
from rdkit.Chem import Draw
import random

# Prepare SMILES dataset (You can use any dataset here, for simplicity, we use a small example)
smiles_data = [
    'CCO', 'CCN', 'CCC', 'CCCC', 'CCCO', 'CCCN', 'CC=O', 'CCOCC', 'CCNCC', 'CC=C'
]

# Create a dictionary of unique characters in the SMILES
all_smiles = ''.join(smiles_data)
chars = sorted(set(all_smiles))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

# Convert SMILES strings to integer sequences
sequences = [[char_to_idx[char] for char in smile] for smile in smiles_data]
max_length = max(len(seq) for seq in sequences)

# Pad sequences to ensure equal length
sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Define the diffusion model (simplified)
def build_model(vocab_size, seq_length, hidden_dim=128):
    model = models.Sequential([
        layers.InputLayer(input_shape=(seq_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=hidden_dim),
        layers.LSTM(hidden_dim, return_sequences=True),
        layers.LSTM(hidden_dim),
        layers.Dense(vocab_size, activation='softmax')
    ])
    return model

# Build the model
vocab_size = len(chars)
model = build_model(vocab_size, max_length)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare the data for training
X = sequences_padded[:, :-1]
y = sequences_padded[:, 1:]

# Train the model
model.fit(X, np.expand_dims(y, -1), epochs=50, batch_size=2)

# Function to generate new molecules from the model
def generate_molecule(model, seed, max_length, temperature=1.0):
    molecule = list(seed)
    seed_idx = [char_to_idx[char] for char in seed]
    seed_idx = pad_sequences([seed_idx], maxlen=max_length, padding='post')

    for _ in range(max_length - len(seed)):
        predictions = model.predict(seed_idx, verbose=0)
        predictions = predictions[0, -1, :]

        # Apply temperature scaling
        predictions = np.log(predictions + 1e-7) / temperature
        predictions = np.exp(predictions) / np.sum(np.exp(predictions))

        # Sample from the predictions
        next_char_idx = np.random.choice(len(predictions), p=predictions)
        next_char = idx_to_char[next_char_idx]
        molecule.append(next_char)

        # Update the seed sequence
        seed_idx = np.roll(seed_idx, shift=-1, axis=1)
        seed_idx[0, -1] = next_char_idx

    return ''.join(molecule)

# Generate a new molecule (seed with "C")
generated_smiles = generate_molecule(model, seed='C', max_length=max_length)
print("Generated SMILES:", generated_smiles)

# Convert SMILES to molecule and visualize
mol = Chem.MolFromSmiles(generated_smiles)
if mol:
    img = Draw.MolToImage(mol)
    img.show()
else:
    print("Invalid SMILES string")




Epoch 1/50


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(2, 4), output.shape=(2, 4)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rdkit import Chem
from rdkit.Chem import Draw
import random

# SMILES dataset (replace with a larger dataset for real-world use)
smiles_data = ['CCO', 'CCN', 'CCC', 'CCCC', 'CCCO', 'CCCN', 'CC=O', 'CCOCC', 'CCNCC', 'CC=C']

# Create a dictionary of unique characters in the SMILES
all_smiles = ''.join(smiles_data)
chars = sorted(set(all_smiles))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

# Convert SMILES strings to integer sequences
sequences = [[char_to_idx[char] for char in smile] for smile in smiles_data]
max_length = max(len(seq) for seq in sequences)

# Pad sequences to ensure equal length
sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Diffusion forward process (adds noise)
def forward_diffusion(x, t, noise_level=0.1):
    noise = np.random.normal(0, noise_level, size=x.shape)
    return x + noise * t

# Diffusion reverse process model (denoising)
def build_diffusion_model(vocab_size, seq_length, hidden_dim=128):
    model = models.Sequential([
        layers.InputLayer(input_shape=(seq_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=hidden_dim),
        layers.LSTM(hidden_dim, return_sequences=True),
        layers.LSTM(hidden_dim),
        layers.Dense(vocab_size, activation='softmax')
    ])
    return model

# Build the diffusion model
vocab_size = len(chars)
diffusion_model = build_diffusion_model(vocab_size, max_length)
diffusion_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare the data for training
X = sequences_padded[:, :-1]
y = sequences_padded[:, 1:]

# Train the model
diffusion_model.fit(X, np.expand_dims(y, -1), epochs=50, batch_size=2)

# Function to generate new molecules using reverse diffusion
def generate_molecule_reverse(diffusion_model, seed, max_length, noise_level=0.1, temperature=1.0):
    molecule = list(seed)
    seed_idx = [char_to_idx[char] for char in seed]
    seed_idx = pad_sequences([seed_idx], maxlen=max_length, padding='post')

    for _ in range(max_length - len(seed)):
        # Add noise (forward diffusion)
        noisy_input = forward_diffusion(seed_idx, t=noise_level)

        # Predict next character (denoising, reverse diffusion)
        predictions = diffusion_model.predict(noisy_input, verbose=0)
        predictions = predictions[0, -1, :]

        # Apply temperature scaling
        predictions = np.log(predictions + 1e-7) / temperature
        predictions = np.exp(predictions) / np.sum(np.exp(predictions))

        # Sample from the predictions
        next_char_idx = np.random.choice(len(predictions), p=predictions)
        next_char = idx_to_char[next_char_idx]
        molecule.append(next_char)

        # Update the seed sequence
        seed_idx = np.roll(seed_idx, shift=-1, axis=1)
        seed_idx[0, -1] = next_char_idx

    return ''.join(molecule)

# Generate a new molecule (seed with "C")
generated_smiles = generate_molecule_reverse(diffusion_model, seed='C', max_length=max_length)
print("Generated SMILES:", generated_smiles)

# Convert SMILES to molecule and visualize
mol = Chem.MolFromSmiles(generated_smiles)
if mol:
    img = Draw.MolToImage(mol)
    img.show()
else:
    print("Invalid SMILES string")
