<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/FASTQ_DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the Dataset.

### Connect with Google Drive.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Libraries.

In [3]:
import os
import random
import shutil
import zipfile
import tarfile
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.utils import Sequence
!pip install -q Bio
from Bio import SeqIO

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Unzip the dataset folder and import it into the Colab notebook.

In [4]:
def extract_or_create_zip(source_path, extract_path):
    if os.path.isdir(source_path):
        shutil.make_archive(extract_path, "zip", source_path)
        print(f"Created ZIP file from directory: {source_path}")

    elif os.path.isfile(source_path) and source_path.lower().endswith('.zip'):
        with zipfile.ZipFile(source_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
            print(f"Extraction completed successfully: {source_path}")

    else:
        print(f"Error: Invalid source path '{source_path}'. Must be a directory or a ZIP file.")

In [5]:
# Define source and extraction paths
source_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/FASTQ_FILES.zip"
extract_path = "/content/fastq_dataset"

# Extract or create ZIP file based on source path
extract_or_create_zip(source_path, extract_path)

Extraction completed successfully: /content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/FASTQ_FILES.zip


## Build the DataLoader.

### Create a FASTQ Dataset Class. (not used)

In [6]:
class FastqDataset:
    def __init__(self, data_dir, batch_size, shuffle=True):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_list = self.get_file_list()
        self.indexes = list(range(len(self.file_list)))
        self.num_samples = self.calculate_num_samples()
        self.num_batches = self.num_samples // self.batch_size

    def calculate_num_samples(self):
        total_samples = 0
        for file_path in self.file_list:
            for record in SeqIO.parse(file_path, 'fastq'):
                total_samples += 1
        return total_samples

    def get_file_list(self):
        return [os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir) if f.endswith(".fastq")]

    def parse_fastq(self, file_path):
        reads, qualities = [], []
        for record in SeqIO.parse(file_path, 'fastq'):
            reads.append(str(record.seq))
            qualities.append(record.letter_annotations['phred_quality'])
        return reads, qualities

    def load_batch(self, batch_files):
        batch_data = []
        for file_path in batch_files:
            reads, qualities = self.parse_fastq(file_path)
            batch_data.extend(qualities)
        return batch_data

    def pad_batch(self, batch_data, batch_size):
        padding_data = batch_data.copy()
        while len(padding_data) < batch_size:
            padding_data.append(batch_data[len(padding_data) % len(batch_data)])
        return padding_data

    def data_generator(self, num_samples=None):
        if self.shuffle:
            random.shuffle(self.indexes)

        total_samples = num_samples if num_samples is not None else len(self.indexes)
        for start in range(0, total_samples, self.batch_size):
            end = min(start + self.batch_size, total_samples)
            batch_indexes = self.indexes[start:end]
            batch_files = [self.file_list[i] for i in batch_indexes]
            batch_data = self.load_batch(batch_files)

            if len(batch_data) < self.batch_size:
                batch_data = self.pad_batch(batch_data, self.batch_size)

            # Convert to tensor and ensure the right shape
            batch_data = tf.ragged.constant(batch_data, dtype=tf.int32).to_tensor(shape=(self.batch_size, -1, 1))
            yield batch_data

    def create_tf_dataset(self, num_samples=None):
        output_signature = tf.TensorSpec(shape=(self.batch_size, None, 1), dtype=tf.int32)
        dataset = tf.data.Dataset.from_generator(
            lambda: self.data_generator(num_samples),
            output_signature=output_signature
        )
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return dataset

    def get_num_batches(self):
        return self.num_batches

    def __iter__(self):
        return self.data_generator()

# Create the FASTQ dataset obect
#fastq_dataset = FastqDataset(data_dir=DATA_DIR, batch_size=BATCH_SIZE)

### Create a Quality Score Dataset Class.

In [7]:
class QualityScoreDataset:
    def __init__(self, data_dir, buffer_size=10000, batch_size=32, shuffle=True, seed=None):
        self.data_dir = data_dir
        self.shuffle = shuffle
        self.seed = seed
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.file_list = self.get_file_list()

    def get_file_list(self):
        return [os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir) if f.endswith(".fastq")]

    def parse_fastq(self, file_path):
        qualities = []
        for record in SeqIO.parse(file_path, 'fastq'):
            qualities.append(record.letter_annotations['phred_quality'])
        return qualities

    def quality_scores_generator(self):
        for file_path in self.file_list:
            qualities = self.parse_fastq(file_path)
            for quality in qualities:
                if len(quality) == 300:
                    yield tf.expand_dims(tf.convert_to_tensor(quality, dtype=tf.int32), axis=-1)

    def create_tf_dataset(self):
        dataset = tf.data.Dataset.from_generator(
            self.quality_scores_generator,
            output_signature=tf.TensorSpec(shape=(300, 1), dtype=tf.int32)
        )
        if self.shuffle:
            dataset = dataset.shuffle(buffer_size=self.buffer_size, seed=self.seed)
        dataset = dataset.batch(self.batch_size).prefetch(tf.data.experimental.AUTOTUNE)
        return dataset

    def calculate_sequence_insights(self, sample_size=None, seed=None):
        insights = []
        quality_scores = list(self.quality_scores_generator())

        if sample_size:
            if seed is not None:
                random.seed(seed)

            sampled_indices = random.sample(range(len(quality_scores)), min(sample_size, len(quality_scores)))
            sampled_qualities = [quality_scores[i].numpy().flatten() for i in sampled_indices]

            for seq_num, quality in zip(sampled_indices, sampled_qualities):
                quality_array = np.array(quality)
                mean_val = np.mean(quality_array)
                std_val = np.std(quality_array)
                min_val = np.min(quality_array)
                max_val = np.max(quality_array)
                insights.append({
                    'Sequence Number': seq_num,
                    'Mean': mean_val,
                    'Standard Deviation': std_val,
                    'Minimum': min_val,
                    'Maximum': max_val
                })
        else:
            for seq_num, quality in enumerate(quality_scores):
                quality_array = quality.numpy().flatten()
                mean_val = np.mean(quality_array)
                std_val = np.std(quality_array)
                min_val = np.min(quality_array)
                max_val = np.max(quality_array)
                insights.append({
                    'Sequence Number': seq_num,
                    'Mean': mean_val,
                    'Standard Deviation': std_val,
                    'Minimum': min_val,
                    'Maximum': max_val
                })

        return insights

    def display_insights(self, insights):
        print(f"{'Sequence':<15} {'Mean':<15} {'Std Dev':<15} {'Min':<10} {'Max':<10}")
        print("="*65)
        for insight in insights:
            print(f"{insight['Sequence Number']:<15} {insight['Mean']:<15.2f} {insight['Standard Deviation']:<15.2f} {insight['Minimum']:<10} {insight['Maximum']:<10}")

### Create a dataset object.

Define hyperparameters and create the object.

In [8]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
SEED = 42

quality_dataset = QualityScoreDataset(data_dir=extract_path, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, shuffle=True, seed=SEED)

Calculate and display the insights of the dataset.

In [10]:
#sequence_insights = quality_dataset.calculate_sequence_insights(sample_size=5, seed=42)
#quality_dataset.display_insights(sequence_insights)

### Create a Tensor dataset.

In [9]:
tf_dataset = quality_dataset.create_tf_dataset()

## Save the Tensor dataset in Drive.

In [11]:
# Directory to save the dataset
save_dir = '/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET'
save_name = f'phred_dataset_tf_{BATCH_SIZE}'
full_save_path = os.path.join(save_dir, save_name)

tf.data.Dataset.save(tf_dataset, full_save_path)
print(f"Dataset saved to {save_dir}")

spec_path = os.path.join(save_dir, f'{save_name}_spec')
tf.data.Dataset.save_spec(tf_dataset, spec_path)
print(f"Dataset spec saved to {spec_path}")

Dataset saved to /content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET


AttributeError: type object 'DatasetV2' has no attribute 'save_spec'