In [1]:
import csv
import pandas as pd
from rdkit import Chem
from tensorflow.keras.layers import LSTM, Dense, RepeatVector

In [2]:
from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
from tensorflow import keras

from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU
from keras.layers import Conv2D, Conv2DTranspose, Reshape, Flatten
from keras.optimizers import Adam
from keras import initializers
from keras.utils import plot_model
from keras import backend as K
import pickle

In [3]:
def load_vocabulary(vocab_file_path):
    """Read the two vocabulary mappings stored together in a pickle file.

    Args:
        vocab_file_path: Path to a pickle file whose payload unpacks into
            exactly two objects.

    Returns:
        tuple: ``(char_to_int, int_to_char)``, in the order they were stored.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files from a trusted source.
    """
    with open(vocab_file_path, 'rb') as vocab_handle:
        stored_mappings = pickle.load(vocab_handle)
    # Unpacking enforces that the payload holds exactly two items.
    forward_map, reverse_map = stored_mappings
    return forward_map, reverse_map

In [4]:
def decode_generated_molecules(generated_numerical_representations, int_to_char):
    """Convert per-position score vectors back into strings.

    Args:
        generated_numerical_representations: Iterable of sequences; each
            sequence is an iterable of score vectors (one per position).
        int_to_char: Mapping from an index to the character it stands for.

    Returns:
        list[str]: One string per input sequence, built by taking the
        highest-scoring index (``np.argmax``) at every position and looking
        it up in ``int_to_char``.
    """
    decoded_strings = []
    for sequence in generated_numerical_representations:
        symbols = []
        for position_scores in sequence:
            best_index = np.argmax(position_scores)
            symbols.append(int_to_char[best_index])
        decoded_strings.append(''.join(symbols))
    return decoded_strings

In [5]:
def read_data_npy(file_path):
    """Load a NumPy file from ``file_path`` and return what ``np.load`` yields."""
    return np.load(file_path)

In [6]:
def build_generator(latent_dim, vocabulary_size, seq_length):
    """Assemble the (uncompiled) generator network.

    The stack is: a 64-unit ``Dense`` layer on the latent vector, a
    ``RepeatVector`` that copies it ``seq_length`` times, three sequence-
    returning ``LSTM`` layers (128, 256, 512 units), and a final softmax
    ``Dense`` layer with ``vocabulary_size`` units.

    Args:
        latent_dim: Length of the input noise vector.
        vocabulary_size: Number of units in the softmax output layer.
        seq_length: Number of time steps produced by ``RepeatVector``.

    Returns:
        A ``Sequential`` model; ``compile`` is not called here.
    """
    generator_layers = [
        Dense(64, input_shape=(latent_dim,)),
        RepeatVector(seq_length),
    ]
    # Widening recurrent stack; every layer keeps the time axis.
    for lstm_units in (128, 256, 512):
        generator_layers.append(LSTM(lstm_units, return_sequences=True))
    generator_layers.append(Dense(vocabulary_size, activation='softmax'))
    return Sequential(generator_layers)

In [7]:
def build_discriminator(seq_length, vocabulary_size):
    """Assemble and compile the discriminator network.

    The stack is: two sequence-returning ``LSTM`` layers (128 and 256
    units), ``Flatten``, a 128-unit ReLU ``Dense`` layer and a single-unit
    sigmoid ``Dense`` output.

    Args:
        seq_length: Number of time steps in each input sequence.
        vocabulary_size: Size of the last axis of each input sequence.

    Returns:
        A ``tf.keras.Sequential`` model compiled with binary cross-entropy
        loss and the ``'adam'`` optimizer.
    """
    critic = tf.keras.Sequential([
        LSTM(128, input_shape=(seq_length, vocabulary_size), return_sequences=True),
        LSTM(256, return_sequences=True),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    critic.compile(loss='binary_crossentropy', optimizer='adam')
    return critic


In [8]:
def build_gan(generator, discriminator):
    """Stack generator and discriminator into one compiled model.

    ``discriminator.trainable`` is set to ``False`` before the combined
    model is compiled. This mutates the discriminator object that was
    passed in.

    Args:
        generator: Model placed first in the stack.
        discriminator: Model placed second in the stack.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss and
        the ``'adam'`` optimizer.
    """
    discriminator.trainable = False
    stacked = Sequential([generator, discriminator])
    stacked.compile(loss='binary_crossentropy', optimizer='adam')
    return stacked


In [9]:
def train_gan(generator, discriminator, gan, data, latent_dim, epochs=100, batch_size=128):
    """Alternate discriminator and generator updates, then sample the generator.

    Note that an "epoch" here is a single batch update of each model, not a
    full pass over ``data``.

    Args:
        generator: Model exposing ``predict``; turns noise into sequences.
        discriminator: Compiled model exposing ``train_on_batch`` and a
            ``trainable`` attribute.
        gan: Compiled stacked model exposing ``train_on_batch``.
        data: Array of real samples indexed along axis 0. Rows are drawn
            uniformly at random with replacement.
        latent_dim: Length of each noise vector fed to the generator.
        epochs: Number of update iterations to run.
        batch_size: Number of real and of generated samples per iteration.

    Returns:
        The generator's output for a fresh batch of ``batch_size`` noise
        vectors drawn *after* the last update, so the samples reflect the
        final weights. A batch is returned even when ``epochs`` is 0.

    Raises:
        ValueError: If ``data`` holds no samples.
    """
    num_samples = data.shape[0]
    if num_samples == 0:
        raise ValueError("data contains no samples to train on")

    # The labels never change between iterations, so build them once:
    # first half real (1), second half generated (0); all-ones for the
    # generator step because it tries to make the discriminator say "real".
    y_discriminator = np.concatenate([np.ones(batch_size), np.zeros(batch_size)])
    y_gan = np.ones(batch_size)

    for epoch in range(epochs):
        # Sample random noise and let the generator produce a fake batch.
        # verbose=0 keeps predict() from emitting a progress bar every step.
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        fake_molecules = generator.predict(noise, verbose=0)

        # Sample real molecules from the dataset (with replacement).
        idx = np.random.randint(0, num_samples, batch_size)
        real_molecules = data[idx]

        # Real samples first, generated samples second, matching the labels.
        X_discriminator = np.concatenate([real_molecules, fake_molecules], axis=0)

        # NOTE(review): Keras documents the trainable flag as being captured
        # when compile() is called; these toggles are kept from the original
        # code, but confirm they have the intended effect on the installed
        # Keras version.
        discriminator.trainable = True
        d_loss = discriminator.train_on_batch(X_discriminator, y_discriminator)

        # Train the generator through the stacked model on fresh noise.
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        discriminator.trainable = False
        g_loss = gan.train_on_batch(noise, y_gan)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Discriminator Loss: {d_loss}, Generator Loss: {g_loss}")

    # Sample only after training has finished: the batch generated inside the
    # loop predates the last weight update and does not exist when epochs == 0.
    final_noise = np.random.normal(0, 1, (batch_size, latent_dim))
    return generator.predict(final_noise, verbose=0)


In [10]:
if __name__ == "__main__":
    # Load the one-hot encoded dataset (the recorded run reports a 3-D array).
    file_path = 'encoded_data.npy'
    # np.asarray passes an existing ndarray through unchanged instead of
    # duplicating the whole dataset in memory the way np.array would.
    numerical_representations = np.asarray(read_data_npy(file_path))
    print("Shape of numerical_representations:", numerical_representations.shape)

    # Define hyperparameters; sequence length and vocabulary size are taken
    # from axes 1 and 2 of the data.
    latent_dim = 100
    seq_length = numerical_representations.shape[1]
    vocabulary_size = numerical_representations.shape[2]
    print("Vocabulary size:", vocabulary_size)

    # Build the models. build_discriminator() compiles the discriminator
    # while it is still trainable, and build_gan() then freezes it and
    # compiles the stacked model.
    generator = build_generator(latent_dim, vocabulary_size, seq_length)
    discriminator = build_discriminator(seq_length, vocabulary_size)
    gan = build_gan(generator, discriminator)

    # Deliberately no further compile() calls here. Keras documents the
    # trainable state as captured at compile time, so recompiling the
    # discriminator after build_gan() has set trainable=False would compile
    # the stand-alone discriminator in its frozen state. The generator is
    # only ever trained through `gan`, so it needs no compile of its own.

    # Train the GAN and generate molecules
    generated_molecules = train_gan(generator, discriminator, gan, numerical_representations, latent_dim)
    print("Generated molecules:", generated_molecules)

Shape of numerical_representations: (4100, 807, 52)
Vocabulary size: 52
Epoch 0, Discriminator Loss: 0.6947059035301208, Generator Loss: 0.6933190226554871
Epoch 10, Discriminator Loss: 0.7153347730636597, Generator Loss: 0.6521602869033813
Epoch 20, Discriminator Loss: 0.7144851684570312, Generator Loss: 0.655400812625885
Epoch 30, Discriminator Loss: 0.7131431102752686, Generator Loss: 0.6571226716041565
Epoch 40, Discriminator Loss: 0.714958667755127, Generator Loss: 0.6543375253677368
Epoch 50, Discriminator Loss: 0.7149303555488586, Generator Loss: 0.6538726091384888
Epoch 60, Discriminator Loss: 0.7152073383331299, Generator Loss: 0.6535819172859192
Epoch 70, Discriminator Loss: 0.7149627208709717, Generator Loss: 0.6534917950630188
Epoch 80, Discriminator Loss: 0.714867115020752, Generator Loss: 0.6538876891136169
Epoch 90, Discriminator Loss: 0.7151801586151123, Generator Loss: 0.6541132926940918
Generated molecules: [[[1.82238352e-02 1.67428050e-02 1.87272280e-02 ... 1.8916601

In [11]:

# Load the vocabulary mappings needed to turn the generator output into text.
vocab_file_path = "vocabulary.pkl"
char_to_int, int_to_char = load_vocabulary(vocab_file_path)

# NOTE(review): this rebinds `generated_molecules` from the numeric generator
# output to a list of decoded strings, so the cell cannot simply be re-run
# without first re-running the training cell. The pickling cell further down
# saves this decoded list.
generated_molecules = decode_generated_molecules(generated_molecules, int_to_char)

# Prints every decoded string in full.
print("Generated molecules:", generated_molecules)

Generated molecules: ['BBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC', 'BBBBBBBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

In [12]:
# Serialize the current value of `generated_molecules` to disk with pickle.
import pickle

with open('generated_molecules.pkl', 'wb') as output_handle:
    pickle.dump(generated_molecules, output_handle)