<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/DNA_seq_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [1]:
!pip install -q tensorflow-gpu
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.utils import Sequence
import numpy as np
import random
import os
import zipfile
import matplotlib.pyplot as plt
from google.colab import drive
!pip install -q biopython
from Bio import SeqIO

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [2]:
device_name = tf.test.gpu_device_name()
print(tf.config.list_physical_devices('GPU'))

[]


In [3]:
# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Path to your ZIP file
zip_file_path = '/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/filtered_reads.zip'

# Directory to extract the ZIP file contents
extract_dir = '/content/filtered_reads'

In [5]:
# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Now you can access the extracted data
data_dir = os.path.join(extract_dir, 'filtered_reads')

# Define data directory containing FASTQ files
data_dir = '/content/filtered_reads'

## Dataset

In [7]:
class FastqDataset(Sequence):
    def __init__(self, data_dir, batch_size=4, shuffle=True):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_list = self.get_file_list()
        self.indexes = list(range(len(self.file_list)))
        if self.file_list:
            random.shuffle(self.indexes)

    def __len__(self):
        return len(self.file_list) // self.batch_size

    def __getitem__(self, index):
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_files = [self.file_list[i] for i in batch_indexes]
        batch_data = self.load_batch(batch_files)

        # Filter out batches with 0 size
        batch_data = [data for data in batch_data if data]

        return batch_data

    def get_file_list(self):
        file_list = []
        for filename in os.listdir(self.data_dir):
            if filename.endswith(".fastq"):
                file_path = os.path.join(self.data_dir, filename)
                file_list.append(file_path)

        return file_list

    def load_batch(self, batch_files):
        batch_data = []
        for file_path in batch_files:
            reads, qualities = self.parse_fastq(file_path)
            batch_data.extend(zip(reads, qualities))
        return batch_data

    def parse_fastq(self, file_path):
        reads, qualities = [], []

        for record in SeqIO.parse(file_path, 'fastq'):
            reads.append(str(record.seq))
            qualities.append(record.letter_annotations['phred_quality'])

        return reads, qualities


In [8]:
# Create an instance of the custom dataset
dataset = FastqDataset(data_dir)

# Example usage of the dataset
for batch_data in dataset:
    # Process batch_data as needed
    print("Batch Size:", len(batch_data))

Batch Size: 189675
Batch Size: 153468
Batch Size: 149071
Batch Size: 303648
Batch Size: 78490
Batch Size: 281819
Batch Size: 18271
Batch Size: 174967
Batch Size: 277538
Batch Size: 466566
Batch Size: 172910


## Generator

In [9]:
class Generator(tf.keras.Model):
    def __init__(self, latent_dim, seq_length):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.seq_length = seq_length
        self.fc1 = layers.Dense(128, activation='relu')
        self.fc2 = layers.Dense(256, activation='relu')
        self.fc3 = layers.Dense(seq_length * 4, activation='relu')
        self.fc4 = layers.Dense (seq_length)

    def call(self, inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        return x

## Discriminator

In [10]:
class Discriminator(tf.keras.Model):
    def __init__(self, sequence_length):
        super(Discriminator, self).__init__()
        self.fc1 = layers.Dense(256, activation='relu')
        self.fc2 = layers.Dense(128, activation='relu')
        self.fc3 = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

## GAN

In [11]:
# Define hyperparameters
LATENT_DIM = 100
LEARNING_RATE = 1e-4
SEQ_LENGTH = 300

In [12]:
class GAN(tf.keras.Model):
    def __init__(self, generator, discriminator, latent_dim=LATENT_DIM, lr=LEARNING_RATE):
        super(GAN, self).__init__()
        self.generator = generator
        self.discriminator = discriminator
        self.latent_dim = latent_dim
        self.lr = lr

        self.gen_optimizer = tf.keras.optimizers.Adam(lr)
        self.disc_optimizer = tf.keras.optimizers.Adam(lr)

    def compile(self):
        self.generator.compile(optimizer=self.gen_optimizer)
        self.discriminator.compile(optimizer=self.disc_optimizer)

    def train_step(self, real_data):
        batch_size = tf.shape(real_data)[0]
        noise = tf.random.normal([batch_size, self.latent_dim])

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            fake_data = self.generator(noise, training=True)

            real_pred = self.discriminator(real_data, training=True)
            fake_pred = self.discriminator(fake_data, training=True)

            gen_loss = tf.keras.losses.MeanSquaredError()(real_data, fake_data)
            disc_loss = tf.keras.losses.BinaryCrossentropy()(tf.ones_like(real_pred), real_pred) + \
                        tf.keras.losses.BinaryCrossentropy()(tf.zeros_like(fake_pred), fake_pred)

        gen_gradients = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        disc_gradients = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        self.gen_optimizer.apply_gradients(zip(gen_gradients, self.generator.trainable_variables))
        self.disc_optimizer.apply_gradients(zip(disc_gradients, self.discriminator.trainable_variables))

        return {"Generator Loss": gen_loss, "Discriminator Loss": disc_loss}

## Model Initialization

In [13]:
# Instantiate Generator and Discriminator
generator = Generator(latent_dim=100, seq_length=SEQ_LENGTH)
discriminator = Discriminator(sequence_length=SEQ_LENGTH)

In [14]:
# Instanciate GAN model
gan_model = GAN(generator, discriminator)

gan_model.compile()

## Model Training

In [17]:
# Define a dictionary mapping nucleotides to integers
nucleotide_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def encode_sequence(sequence):
    encoded_sequence = np.zeros((len(sequence), 4))
    for i, nucleotide in enumerate(sequence):
        if nucleotide in nucleotide_to_index:
            index = nucleotide_to_index[nucleotide]
            encoded_sequence[i, index] = 1
    return encoded_sequence

# Example usage
sequence = "GCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGTCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCTGAACAACTGGACTTTNTTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCCTG"
encoded_sequence = encode_sequence(sequence)
print(encoded_sequence)


[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 ...
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]


In [19]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Training loop
NUM_EPOCHS = 4
for epoch in range(NUM_EPOCHS):
    for batch_data in dataset:
        # Filter out batches with zero size
        if len(batch_data) == 0:
            continue

        # Convert each element of the batch to a TensorFlow tensor
        batch_tensors = []
        for data in batch_data:
            reads_tensor = tf.convert_to_tensor(encode_sequence(data[0]), dtype=tf.float32)
            qualities_tensor = tf.convert_to_tensor(data[1], dtype=tf.float32)
            batch_tensors.append((reads_tensor, qualities_tensor))

        # Perform forward pass
        with tf.device('/device:GPU:0'):
            loss_dict = gan_model.train_step(batch_tensors)

        # Perform backward pass and update weights
        gen_gradients = loss_dict['Generator Loss']
        disc_gradients = loss_dict['Discriminator Loss']

        optimizer.apply_gradients(zip(gen_gradients, gan_model.generator.trainable_variables))
        optimizer.apply_gradients(zip(disc_gradients, gan_model.discriminator.trainable_variables))

    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}, Generator Loss: {loss_dict["Generator Loss"]}, Discriminator Loss: {loss_dict["Discriminator Loss"]}')


InvalidArgumentError: {{function_node __wrapped__Pack_N_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Shapes of all inputs must match: values[0].shape = [300,4] != values[1].shape = [300] [Op:Pack] name: 0

## Sample Generation

In [None]:
NUM_SAMPLES = 5
noise = tf.random.normal([NUM_SAMPLES, 100])
generated_samples = generator(noise)

In [None]:
for sample in generated_samples:
    print(sample)