<a href="https://colab.research.google.com/github/ananya21/Drug-Discovery-Generative-Learning/blob/main/Drug_Discovery_Generative_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install --upgrade tensorflow

import tensorflow as tf
from tensorflow.keras.layers import LSTM



In [5]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM, Dense, Reshape

def build_generator(latent_dim, sequence_length, vocab_size):
    model = tf.keras.Sequential([
        # Start with a Dense layer to map the latent noise vector to a shape that can be reshaped into what the LSTM expects
        layers.Dense(sequence_length * vocab_size, input_dim=latent_dim),
        layers.Reshape((sequence_length, vocab_size)),
        layers.LSTM(256, return_sequences=True),
        layers.LSTM(256, return_sequences=True),
        layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax')),
    ])
    return model

def build_discriminator(sequence_length, vocab_size):
    model = tf.keras.Sequential([
        layers.Input(shape=(sequence_length, 31)),
        layers.LSTM(256, return_sequences=True),
        layers.LSTM(256),
        layers.Dense(1, activation='sigmoid')
    ])
    return model


In [6]:
latent_dim = 217
sequence_length = 217
vocab_size = 31

# Instantiate and compile models
generator = build_generator(latent_dim, sequence_length, vocab_size)
discriminator = build_discriminator(sequence_length, vocab_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
generator.summary()

In [8]:
discriminator.summary()

In [9]:
def compile_gan(generator, discriminator, latent_dim):
    # Make the discriminator not trainable when we are training the generator.
    discriminator.trainable = False

    # Input for the generator
    gan_input = layers.Input(shape=(latent_dim,))

    generator_output = generator(gan_input)

    # Output of the generator
    gan_output = discriminator(generator_output)
    # GAN model
    gan = tf.keras.models.Model(gan_input, gan_output)

    # Compile the discriminator
    discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Compile the GAN model
    gan.compile(optimizer='adam', loss='binary_crossentropy')

    return gan

gan = compile_gan(generator, discriminator, latent_dim)

In [11]:
!pip install --upgrade pubchempy
!pip install pandas
!pip install rdkit



In [12]:
import pubchempy as pcp
import pandas as pd

# Search for a compound by name
compound = pcp.get_compounds('Aspirin', 'name')[0]  # This returns a list; we'll just use the first result

# Print some properties
print(f"Name: {compound.iupac_name}")
print(f"Molecular Formula: {compound.molecular_formula}")
print(f"SMILES String: {compound.canonical_smiles}")

Name: 2-acetyloxybenzoic acid
Molecular Formula: C9H8O4
SMILES String: CC(=O)OC1=CC=CC=C1C(=O)O


In [13]:
# Get a compound by CID
compound = pcp.Compound.from_cid(2244) # (compound IDs)

# Print some properties
print(f"Name: {compound.iupac_name}")
print(f"SMILES: {compound.canonical_smiles}")

Name: 2-acetyloxybenzoic acid
SMILES: CC(=O)OC1=CC=CC=C1C(=O)O


In [14]:
!curl -I https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/benzene/record/JSON

HTTP/2 400 
[1mstrict-transport-security[0m: max-age=31536000; includeSubDomains; preload
[1mreferrer-policy[0m: origin-when-cross-origin
[1mcontent-security-policy[0m: upgrade-insecure-requests
[1mdate[0m: Mon, 08 Apr 2024 14:12:44 GMT
[1mserver[0m: Apache
[1mcache-control[0m: private
[1mexpires[0m: Mon, 08 Apr 2024 15:12:44 GMT
[1mncbi-phid[0m: 90C5CA20613FB5C10000000000000001.m_2
[1mncbi-sid[0m: 90C5CA20613FB5C1_0000SID
[1mx-throttling-control[0m: Request Count status: Green (1%), Request Time status: Green (0%), Service status: Green (40%)
[1mcontent-type[0m: application/json
[1mset-cookie[0m: ncbi_sid=90C5CA20613FB5C1_0000SID; domain=.nih.gov; path=/; expires=Tue, 08 Apr 2025 14:12:44 GMT
[1mx-ua-compatible[0m: IE=Edge
[1mx-xss-protection[0m: 1; mode=block
[1maccess-control-allow-origin[0m: *



In [15]:
!ping -c 1 google.com

/bin/bash: line 1: ping: command not found


In [16]:
import requests

# Define the compound name
name = 'Aspirin'

# Define the properties
properties = 'MolecularFormula,MolecularWeight,CanonicalSMILES'

# Construct the URL
url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/{properties}/JSON'

# Make the HTTP GET request
response = requests.get(url)

# Check the response
if response.status_code == 200:
    data = response.json()
    print(data)
else:
    print("Failed to fetch data:", response.status_code)


{'PropertyTable': {'Properties': [{'CID': 2244, 'MolecularFormula': 'C9H8O4', 'MolecularWeight': '180.16', 'CanonicalSMILES': 'CC(=O)OC1=CC=CC=C1C(=O)O'}]}}


In [17]:
# Get a compound by CID
compound = pcp.Compound.from_cid(962) # (compound IDs)

# Print some properties
print(f"Name: {compound.iupac_name}")
print(f"SMILES: {compound.canonical_smiles}")

Name: oxidane
SMILES: O


In [18]:
import time
import pubchempy as pcp
from rdkit import Chem

# Batching utility function
def chunker(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# Define substructure patterns with RDKit
carboxylic_acid = Chem.MolFromSmarts('C(=O)O')
amine = Chem.MolFromSmarts('N')
phenol = Chem.MolFromSmarts('c1ccccc1O')

results = []

# Settings
batch_size = 100  # Adjust based on what you find works best
retries = 5  # Number of retries per batch
delay = 2  # Delay between retries in seconds
input_vector = []

# Main processing loop with batching and retries
for cid_chunk in chunker(range(2000, 10000), batch_size):
    attempt = 0
    while attempt < retries:
        try:
            compounds = pcp.get_compounds(list(cid_chunk), 'cid')
            for compound in compounds:
                if compound is None: continue  # Skip if no compound was fetched
                # Fetch the compound by CID and get its canonical SMILES
                smiles = compound.canonical_smiles

                # Convert the SMILES string to an RDKit molecule object
                compound_mol = Chem.MolFromSmiles(smiles)

                # Check for the presence of each substructure
                has_carboxylic_acid = compound_mol.HasSubstructMatch(carboxylic_acid)
                has_amine = compound_mol.HasSubstructMatch(amine)
                has_phenol = compound_mol.HasSubstructMatch(phenol)

                if has_carboxylic_acid and has_amine and has_phenol:
                    results.append(compound.cid)
                    input_vector.append(smiles)
            break  # Break the retry loop upon successful processing
        except Exception as e:
            print(f"Error processing batch starting with CID {cid_chunk[0]}: {e}")
            if 'PUGREST.ServerBusy' in str(e):
                attempt += 1
                print(f"Server busy, retrying batch (Attempt {attempt+1}/{retries})...")
                time.sleep(delay)
            else:
                # Break on other errors to avoid infinite loop
                break

print(f"Results: {results}")
print(input_vector)


[14:13:24] Explicit valence for atom # 22 Cl, 3, is greater than permitted


Error processing batch starting with CID 2300: 'NoneType' object has no attribute 'HasSubstructMatch'
Results: [2004, 2008, 2054, 2064, 2074, 2101, 2102, 2110, 2112, 2171, 2199, 2204, 2236, 2245, 2257, 2258, 2276, 2278, 2290, 2291, 2293, 2314, 2316, 2329, 2339, 2342, 2436, 2437, 2439, 2440, 2454, 2471, 2488, 2521, 2536, 2563, 2566, 2575, 2601, 2610, 2629, 2630, 2642, 2646, 2680, 2724, 2870, 2912, 2932, 2982, 3074, 3075, 3076, 3083, 3084, 3088, 3105, 3125, 3142, 3171, 3175, 3176, 3178, 3179, 3190, 3199, 3268, 3347, 3447, 3549, 3567, 3592, 3628, 3683, 3710, 3722, 3750, 3800, 3889, 3894, 3895, 3896, 3903, 3933, 3943, 3944, 3990, 4012, 4031, 4039, 4075, 4097, 4102, 4103, 4107, 4138, 4230, 4241, 4258, 4259, 4260, 4271, 4294, 4302, 4310, 4315, 4322, 4334, 4337, 4338, 4339, 4413, 4414, 4415, 4456, 4502, 4503, 4504, 4505, 4526, 4544, 4545, 4546, 4547, 4574, 4580, 4583, 4622, 4633, 4649, 4681, 4682, 4730, 4731, 4758, 4759, 4769, 4776, 4811, 4849, 4862, 4863, 4864, 4918, 4921, 4934, 4935, 4944, 

In [19]:
print(len(results))
print(len(input_vector))

257
257


In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def tokenize_sequences(smile_compounds):
  tokenizer = Tokenizer(char_level=True)
  tokenizer.fit_on_texts(smile_compounds)
  sequences = tokenizer.texts_to_sequences(smile_compounds)
  max_len = max(map(len, sequences))
  num_unique_chars = len(tokenizer.word_index)

  # Find the length of the longest SMILES string
  max_len = max([len(s) for s in smile_compounds])
  print(num_unique_chars)
  print(max_len)
  padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
  return padded_sequences

In [21]:
print(len(tokenize_sequences(input_vector)))
print(input_vector[0])
print(len(tokenize_sequences(input_vector[4])))

31
217
257
CCOC(=O)C(CC1=CC=C(C=C1)O)NC(=O)C
11
1
84


In [22]:
tokenize_sequences(input_vector).shape

31
217


(257, 217)

In [23]:
def one_hot_encoding(dataset, vocab_size):
  num_samples, sequence_length = dataset.shape
  one_hot_dataset = tf.one_hot(dataset, depth=vocab_size)
  return one_hot_dataset

In [24]:
import tensorflow as tf
import numpy as np

def train_gan(generator, discriminator, gan, dataset, latent_dim, epochs=10000, batch_size=100):
    # Assuming dataset is already a tf.data.Dataset object
    # Prepare the dataset for training: batching and prefetching
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    for epoch in range(epochs):
        # Iterate over batches in the dataset
        for real_data in dataset:
            # Ensure real_data has the correct shape for discriminator
            # real_data shape should be (batch_size, sequence_length, vocab_size)

            current_batch_size = real_data.shape[0]

            # Generate random noise
            noise = np.random.normal(0, 1, size=[current_batch_size, latent_dim])

            # Generate fake data from noise
            generated_data = generator.predict(noise)

            # Labels for real and fake data
            real_labels = np.ones((current_batch_size, 1))
            fake_labels = np.zeros((current_batch_size, 1))

            # Train discriminator on real and fake data
            d_loss_real = discriminator.train_on_batch(real_data, real_labels)
            d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Generate noise for another batch
            noise = np.random.normal(0, 1, size=[batch_size, latent_dim])

            # Train generator (via combined model)
            g_loss = gan.train_on_batch(noise, np.ones(batch_size))

        # Print progress
        print(f"Epoch: {epoch} \t Discriminator Loss: {d_loss} \t Generator Loss: {g_loss}")

        if epoch % 3 == 0:
            # Save models and/or generate and save sample outputs
            generator.save('save/generator.keras')
            discriminator.save('save/discriminator.keras')
            gan.save('save/gan.keras')
            pass

In [25]:
final_results = one_hot_encoding(tokenize_sequences(input_vector), 31)
print(final_results.shape)
final_results = tf.data.Dataset.from_tensor_slices(final_results)
# final_results = final_results.numpy()
# print("Second type of final_results, ", type(final_results))
# train_gan(generator, discriminator, gan, final_results, 217)

31
217
(257, 217, 31)


In [26]:
import torch

def save_checkpoint(generator, discriminator, optimizer_G, optimizer_D, epoch, filename="gan_checkpoint.pth"):
    """Save a checkpoint at the current state."""
    torch.save({
        'epoch': epoch,
        'generator_state_dict': generator.state_dict(),
        'discriminator_state_dict': discriminator.state_dict(),
        'optimizer_G_state_dict': optimizer_G.state_dict(),
        'optimizer_D_state_dict': optimizer_D.state_dict(),
    }, filename)

def load_checkpoint(filename, generator, discriminator, optimizer_G, optimizer_D):
    """Load a checkpoint and resume training."""
    checkpoint = torch.load(filename)
    generator.load_state_dict(checkpoint['generator_state_dict'])
    discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
    optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
    optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
    return checkpoint['epoch']


In [27]:
def train_gan(generator, discriminator, gan, final_results, epochs, checkpoint_interval=20):
    start_epoch = 0
    checkpoint_filename = "gan_checkpoint.pth"

    # Try to load the latest checkpoint if it exists
    try:
        start_epoch = load_checkpoint(checkpoint_filename, generator, discriminator, optimizer_G, optimizer_D) + 1
        print(f"Resuming training from epoch {start_epoch}")
    except FileNotFoundError:
        print("No checkpoint found, starting training from scratch")

    for epoch in range(start_epoch, epochs):
        # Your training logic here
        # Iterate over batches in the dataset
        for real_data in dataset:
            # Ensure real_data has the correct shape for discriminator
            # real_data shape should be (batch_size, sequence_length, vocab_size)

            current_batch_size = real_data.shape[0]

            # Generate random noise
            noise = np.random.normal(0, 1, size=[current_batch_size, latent_dim])

            # Generate fake data from noise
            generated_data = generator.predict(noise)

            # Labels for real and fake data
            real_labels = np.ones((current_batch_size, 1))
            fake_labels = np.zeros((current_batch_size, 1))

            # Train discriminator on real and fake data
            d_loss_real = discriminator.train_on_batch(real_data, real_labels)
            d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Generate noise for another batch
            noise = np.random.normal(0, 1, size=[batch_size, latent_dim])

            # Train generator (via combined model)
            g_loss = gan.train_on_batch(noise, np.ones(batch_size))

        # Print progress
        print(f"Epoch: {epoch} \t Discriminator Loss: {d_loss} \t Generator Loss: {g_loss}")

        # Save checkpoint at the specified interval
        if (epoch + 1) % checkpoint_interval == 0:
            save_checkpoint(generator, discriminator, optimizer_G, optimizer_D, epoch, filename=checkpoint_filename)
            print(f"Checkpoint saved at epoch {epoch}")

train_gan(generator, discriminator, gan, final_results, 10000, checkpoint_interval=20)

NameError: name 'optimizer_G' is not defined