<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/PHRED_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.utils import Sequence
import numpy as np
import random
import os
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
!pip install -q biopython
from Bio import SeqIO
from datetime import datetime
from tqdm import tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h

### GPU

In [None]:
# Enable on-demand GPU memory allocation for every visible GPU, echoing each
# device so the cell output shows which accelerators were found.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    print(device)
    tf.config.experimental.set_memory_growth(device, True)

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


### Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset, ZIP archive and output
# paths used in the cells below all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path (on the mounted Google Drive) of the ZIP archive to unpack
zip_file_path = '/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/filtered_reads.zip'

# Directory on the Colab VM that receives the extracted archive contents
extract_dir = '/content/filtered_reads'

In [None]:
# Unpack every member of the archive into extract_dir; the context manager
# closes the archive handle when extraction finishes.
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extract_dir)

### Hyperparameters

In [None]:
# --- Data / model sizes ---
BATCH_SIZE = 32
BUFFER_SIZE = 1000
LATENT_DIM = 100      # length of the noise vector fed to the generator
SEQ_LENGTH = 300      # number of quality scores per sequence

# --- Optimisation ---
LEARNING_RATE = 1e-4
NUM_EPOCHS = 20
PATIENCE = 5          # epochs without improvement before early stopping

NUM_SAMPLES = 10 # to be generated from the GAN

# --- Logging ---
# LOG_DIR is the TensorBoard root; log_dir is a per-run, timestamped
# sub-directory so that separate runs do not overwrite each other.
LOG_DIR = './logs/gan'
log_dir = os.path.join(LOG_DIR, datetime.now().strftime("%Y%m%d-%H%M%S"))

# Keras callbacks are not used: GAN.fit() is a custom training loop that does
# its own checkpointing, early stopping and TensorBoard logging. The snippet
# is kept as a comment (not a bare string literal, which Jupyter would echo
# as the cell's output) for reference only:
#
# CALLBACKS = [
#     tf.keras.callbacks.ModelCheckpoint(filepath='gan_checkpoint.h5', save_best_only=True),
#     tf.keras.callbacks.EarlyStopping(patience=3, monitor='discriminator_loss', restore_best_weights=True),
#     tf.keras.callbacks.TensorBoard(log_dir=log_dir)  # TensorBoard callback
# ]

"\nCALLBACKS = [\n    tf.keras.callbacks.ModelCheckpoint(filepath='gan_checkpoint.h5', save_best_only=True),\n    tf.keras.callbacks.EarlyStopping(patience=3, monitor='discriminator_loss', restore_best_weights=True),\n    tf.keras.callbacks.TensorBoard(log_dir=log_dir)  # TensorBoard callback\n]\n"

### Load Dataset.

In [None]:
# Directory on Google Drive that holds the saved tf.data dataset
tf_dataset_dir = '/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET'
# Restore the tf.data.Dataset that was previously written to that directory
tf_dataset = tf.data.Dataset.load(tf_dataset_dir)
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [None]:
# Pull a single element from the dataset and print it as a sanity check of
# its shape, dtype and value range.
iterator = iter(tf_dataset)
first_element = next(iterator)
print(first_element)

tf.Tensor(
[[[34.]
  [34.]
  [34.]
  ...
  [36.]
  [18.]
  [20.]]

 [[31.]
  [34.]
  [34.]
  ...
  [27.]
  [26.]
  [27.]]

 [[34.]
  [34.]
  [34.]
  ...
  [37.]
  [37.]
  [37.]]

 ...

 [[34.]
  [34.]
  [34.]
  ...
  [34.]
  [34.]
  [24.]]

 [[34.]
  [34.]
  [34.]
  ...
  [36.]
  [37.]
  [37.]]

 [[34.]
  [34.]
  [34.]
  ...
  [ 7.]
  [ 7.]
  [ 7.]]], shape=(32, 300, 1), dtype=float32)


In [None]:
def plot_quality_distributions(tf_dataset):
    """Draw a 20-bin histogram of quality scores for every sequence.

    Walks through each batch yielded by ``tf_dataset`` and, for every row of
    the batch, plots the values found at ``[row, :, 0]`` in a figure of its
    own, titled with the row index and the batch index.

    Note: one figure is shown per sequence, so calling this on a large
    dataset produces a very large number of plots.
    """
    for batch_idx, scores in enumerate(tf_dataset):
        sequences_in_batch = scores.shape[0]
        for seq_idx in range(sequences_in_batch):
            sequence_scores = scores[seq_idx, :, 0]
            plt.hist(sequence_scores, bins=20)
            plt.title(f"Quality Score Distribution for Sequence {seq_idx} in Batch {batch_idx}")
            plt.xlabel("Quality Score")
            plt.ylabel("Frequency")
            plt.show()

#plot_quality_distributions(tf_dataset)

### Generator

In [None]:
class Generator(tf.keras.Model):
    """Fully connected generator mapping a latent vector to a sequence.

    Architecture: Dense(256) -> LeakyReLU -> Dense(512) -> LeakyReLU ->
    Dense(seq_length) with no activation, reshaped to ``(seq_length, 1)``.

    Args:
        seq_length: number of positions in each generated sequence.
        latent_dim: size of the input noise vector.
    """

    def __init__(self, seq_length, latent_dim):
        super().__init__()
        self.seq_length = seq_length
        self.latent_dim = latent_dim

        keras_layers = tf.keras.layers
        # Hidden block 1
        self.dense1 = keras_layers.Dense(256, input_shape=(latent_dim,))
        self.leaky_relu1 = keras_layers.LeakyReLU()
        # Hidden block 2
        self.dense2 = keras_layers.Dense(512)
        self.leaky_relu2 = keras_layers.LeakyReLU()
        # Linear output projection, then add the trailing channel axis
        self.dense3 = keras_layers.Dense(seq_length)
        self.reshape = keras_layers.Reshape((seq_length, 1))

    def call(self, x):
        """Run the forward pass and return the reshaped output of the last Dense layer."""
        hidden = self.leaky_relu1(self.dense1(x))
        hidden = self.leaky_relu2(self.dense2(hidden))
        return self.reshape(self.dense3(hidden))

### Discriminator

In [None]:
class Discriminator(tf.keras.Model):
    """Fully connected discriminator scoring a sequence as real or fake.

    Architecture: Flatten -> Dense(512) -> LeakyReLU -> Dense(256) ->
    LeakyReLU -> Dense(1, sigmoid). Because of the final sigmoid, the output
    is a probability in (0, 1), not a logit.

    Args:
        seq_length: number of positions in each input sequence.
    """

    def __init__(self, seq_length):
        super().__init__()
        self.seq_length = seq_length

        keras_layers = tf.keras.layers
        # Collapse (seq_length, 1) into a flat feature vector
        self.flatten = keras_layers.Flatten(input_shape=(seq_length, 1))
        # Hidden block 1
        self.dense1 = keras_layers.Dense(512)
        self.leaky_relu1 = keras_layers.LeakyReLU()
        # Hidden block 2
        self.dense2 = keras_layers.Dense(256)
        self.leaky_relu2 = keras_layers.LeakyReLU()
        # Single sigmoid unit
        self.dense3 = keras_layers.Dense(1, activation='sigmoid')

    def call(self, x):
        """Run the forward pass and return the sigmoid output of the last layer."""
        features = self.flatten(x)
        features = self.leaky_relu1(self.dense1(features))
        features = self.leaky_relu2(self.dense2(features))
        return self.dense3(features)

### GAN

In [None]:
class GAN(tf.keras.Model):
    """Couples a generator and a discriminator and trains them adversarially.

    Args:
        generator: model mapping ``(batch, latent_dim)`` noise to fake samples.
        discriminator: model mapping samples to a probability of being real
            (its last layer is expected to apply a sigmoid).
        latent_dim: size of the noise vector sampled for the generator.
        lr: base learning rate for the default optimizers created here.
    """

    def __init__(self, generator, discriminator, latent_dim=LATENT_DIM, lr=LEARNING_RATE):
        super(GAN, self).__init__()
        self.generator = generator
        self.discriminator = discriminator
        self.latent_dim = latent_dim
        self.lr = lr

        # Default optimizers (generator at twice the base rate). compile()
        # replaces both with the optimizers it is given.
        self.gen_optimizer = tf.keras.optimizers.Adam(2*lr) # Train generator at a higher rate
        self.disc_optimizer = tf.keras.optimizers.Adam(lr)

    def compile(self, disc_optimizer, gen_optimizer, loss_fn):
        """Attach the optimizers and loss object used for training.

        Args:
            disc_optimizer: optimizer applied to the discriminator variables.
            gen_optimizer: optimizer applied to the generator variables.
            loss_fn: stored as ``self.loss_fn``. train_step() does not use it:
                it computes binary cross-entropy on probabilities itself,
                matching a discriminator that ends in a sigmoid.
        """
        super(GAN, self).compile()
        self.disc_optimizer = disc_optimizer
        self.gen_optimizer = gen_optimizer
        self.loss_fn = loss_fn

    @tf.function
    def train_step(self, real_data):
        """Run one adversarial update of both networks on a batch of real data.

        Args:
            real_data: batch of real samples; its first dimension is the batch size.

        Returns:
            dict with scalar tensors under the keys "Generator Loss" and
            "Discriminator Loss".
        """
        batch_size = tf.shape(real_data)[0]
        noise = tf.random.normal([batch_size, self.latent_dim])
        # The discriminator outputs probabilities, so from_logits stays False.
        bce = tf.keras.losses.BinaryCrossentropy()

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            fake_data = self.generator(noise, training=True)

            real_pred = self.discriminator(real_data, training=True)
            fake_pred = self.discriminator(fake_data, training=True)

            # Adversarial generator objective: make the discriminator label
            # fakes as real. (The previous MSE against the real batch never
            # involved the discriminator, so the generator got no adversarial
            # signal and was simply regressed onto unrelated real samples.)
            gen_loss = bce(tf.ones_like(fake_pred), fake_pred)

            # Discriminator objective: real -> 1, fake -> 0.
            disc_loss_real = bce(tf.ones_like(real_pred), real_pred)
            disc_loss_fake = bce(tf.zeros_like(fake_pred), fake_pred)
            disc_loss = disc_loss_real + disc_loss_fake

        gen_gradients = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        disc_gradients = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        self.gen_optimizer.apply_gradients(zip(gen_gradients, self.generator.trainable_variables))
        self.disc_optimizer.apply_gradients(zip(disc_gradients, self.discriminator.trainable_variables))

        return {
            "Generator Loss": gen_loss,
            "Discriminator Loss": disc_loss
        }

    def fit(self, dataset, num_epochs, log_dir, patience=5):
        """Custom training loop with TensorBoard logging, checkpoints and early stopping.

        Args:
            dataset: iterable of batches, each passed to train_step().
            num_epochs: maximum number of passes over ``dataset``.
            log_dir: directory that receives the TensorBoard summaries.
            patience: stop after this many consecutive epochs in which the
                mean discriminator loss did not reach a new minimum.

        Returns:
            dict with the keys "Generator Loss" and "Discriminator Loss", each
            holding a list with one mean loss value (float) per completed epoch.
        """
        # Create a summary writer for TensorBoard
        summary_writer = tf.summary.create_file_writer(log_dir)
        checkpoint_dir = './checkpoints'
        os.makedirs(checkpoint_dir, exist_ok=True)

        # len() raises TypeError for datasets of unknown/infinite cardinality;
        # in that case the progress bar simply runs without a total.
        try:
            steps_per_epoch = len(dataset)
        except TypeError:
            steps_per_epoch = None

        history = {"Generator Loss": [], "Discriminator Loss": []}

        # Early stopping variables
        best_disc_loss = float('inf')
        epochs_without_improvement = 0

        # Running batch counter used as the TensorBoard step, so logging does
        # not depend on knowing the dataset length.
        global_step = 0

        for epoch in range(num_epochs):
            print(f"Epoch {epoch + 1}/{num_epochs}")

            gen_loss_total = 0.0
            disc_loss_total = 0.0
            batches_seen = 0

            # Context manager guarantees the bar is closed, even on interrupt.
            with tqdm(total=steps_per_epoch, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as epoch_progress:
                for qualities_batch in dataset:
                    loss_dict = self.train_step(qualities_batch)

                    # Convert once; reused for the bar, the summaries and the epoch mean.
                    gen_loss = float(loss_dict["Generator Loss"])
                    disc_loss = float(loss_dict["Discriminator Loss"])
                    gen_loss_total += gen_loss
                    disc_loss_total += disc_loss
                    batches_seen += 1

                    # Update progress bar with loss values
                    epoch_progress.set_postfix(gen_loss=gen_loss, disc_loss=disc_loss)
                    epoch_progress.update(1)

                    # Log the losses to TensorBoard
                    with summary_writer.as_default():
                        tf.summary.scalar('Generator Loss', gen_loss, step=global_step)
                        tf.summary.scalar('Discriminator Loss', disc_loss, step=global_step)
                    global_step += 1

            if batches_seen == 0:
                # Nothing was trained on; there are no losses to report or compare.
                print("Dataset yielded no batches; stopping training.")
                break

            mean_gen_loss = gen_loss_total / batches_seen
            mean_disc_loss = disc_loss_total / batches_seen
            history["Generator Loss"].append(mean_gen_loss)
            history["Discriminator Loss"].append(mean_disc_loss)

            # Print the (epoch-mean) losses after each epoch
            print(f'Epoch {epoch + 1}/{num_epochs} completed.')
            print(f'Generator Loss: {mean_gen_loss}, Discriminator Loss: {mean_disc_loss}')

            # Save model checkpoints manually
            self.generator.save_weights(os.path.join(checkpoint_dir, f'generator_epoch_{epoch+1}.ckpt'))
            self.discriminator.save_weights(os.path.join(checkpoint_dir, f'discriminator_epoch_{epoch+1}.ckpt'))

            # Early stopping check on the epoch mean rather than on the last
            # batch, whose loss is too noisy to compare between epochs.
            # NOTE(review): a falling discriminator loss is not necessarily a
            # sign of GAN progress - confirm this is the intended criterion.
            if mean_disc_loss < best_disc_loss:
                best_disc_loss = mean_disc_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print(f"Early stopping triggered after {patience} epochs without improvement.")
                    break

        summary_writer.flush()
        return history

### Model Initialization

In [None]:
# Optimizers for the generator and the discriminator
gen_optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
disc_optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)

# Define Loss Function
# The Discriminator's last layer already applies a sigmoid, so its outputs are
# probabilities and the loss must NOT treat them as logits.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=False)

In [None]:
# Instantiate Generator and Discriminator with the sizes set in the
# Hyperparameters cell (LATENT_DIM noise inputs, SEQ_LENGTH-long sequences)
generator = Generator(latent_dim=LATENT_DIM, seq_length=SEQ_LENGTH)
discriminator = Discriminator(seq_length=SEQ_LENGTH)

In [None]:
# Instantiate the GAN model
gan_model = GAN(generator, discriminator, LATENT_DIM, LEARNING_RATE)

# compile() overrides the default optimizers created in GAN.__init__ with the
# ones defined above and stores the loss object on the model.
gan_model.compile(
    disc_optimizer=disc_optimizer,
    gen_optimizer=gen_optimizer,
    loss_fn=cross_entropy
)

### Model Training

In [None]:
# Log to this run's timestamped directory (log_dir, a sub-directory of LOG_DIR)
# so successive runs do not mix their TensorBoard curves, and use the
# configured early-stopping patience.
training_history = gan_model.fit(tf_dataset, NUM_EPOCHS, log_dir, patience=PATIENCE)

Epoch 1/20


Epoch 1/20: 100%|██████████| 75081/75081 [13:21<00:00, 93.63batch/s, disc_loss=1.61e-5, gen_loss=5.8] 


Epoch 1/20 completed.
Generator Loss: 5.802612781524658, Discriminator Loss: 1.6096091712825e-05
Epoch 2/20


Epoch 2/20: 100%|██████████| 75081/75081 [13:21<00:00, 93.63batch/s, disc_loss=0.000107, gen_loss=5.97] 


Epoch 2/20 completed.
Generator Loss: 5.965404510498047, Discriminator Loss: 0.0001069755235221237
Epoch 3/20


Epoch 3/20: 100%|██████████| 75081/75081 [12:21<00:00, 101.20batch/s, disc_loss=1.45e-5, gen_loss=5.58]


Epoch 3/20 completed.
Generator Loss: 5.580908298492432, Discriminator Loss: 1.4499837561743334e-05
Epoch 4/20


Epoch 4/20: 100%|██████████| 75081/75081 [12:39<00:00, 98.80batch/s, disc_loss=2.92e-5, gen_loss=4.24]


Epoch 4/20 completed.
Generator Loss: 4.237154006958008, Discriminator Loss: 2.9229006031528115e-05
Epoch 5/20


Epoch 5/20: 100%|██████████| 75081/75081 [14:21<00:00, 87.11batch/s, disc_loss=0.000121, gen_loss=4.47] 


Epoch 5/20 completed.
Generator Loss: 4.471123218536377, Discriminator Loss: 0.00012096053978893906
Epoch 6/20


Epoch 6/20:   1%|          | 718/75081 [00:07<12:21, 100.23batch/s, disc_loss=4.52e-6, gen_loss=66]  

KeyboardInterrupt: 

### Visualize training results.

In [None]:
# Load the TensorBoard notebook extension and open it on the GAN log root;
# this is the directory given by LOG_DIR in the Hyperparameters cell.
%load_ext tensorboard
%tensorboard --logdir ./logs/gan

### Sample Generation

In [None]:
# Sample NUM_SAMPLES latent vectors. Use LATENT_DIM (not a hard-coded 100) so
# the noise always matches the input size the generator was built with.
noise = tf.random.normal([NUM_SAMPLES, LATENT_DIM])
generated_samples = generator(noise)

In [None]:
# Inspect the first generated sequence
print(generated_samples[0])

In [None]:
# Drop the trailing channel axis: (NUM_SAMPLES, SEQ_LENGTH, 1) -> (NUM_SAMPLES, SEQ_LENGTH)
generated_samples_np = generated_samples.numpy().reshape(NUM_SAMPLES, SEQ_LENGTH)

save_samples_dir = '/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET'

# Timestamped file name so successive runs do not overwrite each other
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = f"generated_samples_{timestamp}.npy"
save_samples_path = os.path.join(save_samples_dir, filename)

# Save to the file inside the directory. Passing the directory itself to
# np.save would write a single "<directory>.npy" file next to it and ignore
# the timestamped name.
np.save(save_samples_path, generated_samples_np)
print(f"Generated samples saved to {save_samples_path}")