<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/FASTQ_DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the Dataset.

### Connect with Google Drive.

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset archive used below
# is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Libraries.

In [None]:
import os
import random
import shutil
import tarfile
import zipfile

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

### Unzip the dataset archive into the Colab runtime.

In [None]:
def extract_or_create_zip(source_path, extract_path):
    """Zip a directory, or unzip an existing ZIP archive.

    * If ``source_path`` is a directory, its contents are packed into
      ``<extract_path>.zip``.
    * If ``source_path`` is a file whose name ends in ``.zip`` (any letter
      case), it is extracted into ``extract_path``.
    * Anything else only prints an error message.

    Args:
        source_path: Directory to archive, or ZIP file to extract.
        extract_path: Archive base name (directory case) or destination
            directory (ZIP case).
    """
    # Directory input: build an archive out of it and stop.
    if os.path.isdir(source_path):
        shutil.make_archive(extract_path, "zip", source_path)
        print(f"Created ZIP file from directory: {source_path}")
        return

    # Everything that is not an existing *.zip file is rejected.
    is_zip_file = os.path.isfile(source_path) and source_path.lower().endswith('.zip')
    if not is_zip_file:
        print(f"Error: Invalid source path '{source_path}'. Must be a directory or a ZIP file.")
        return

    # ZIP input: unpack it into the destination directory.
    with zipfile.ZipFile(source_path, 'r') as archive:
        archive.extractall(extract_path)
        print(f"Extraction completed successfully: {source_path}")

In [None]:
# Define source and extraction paths
# source_path points at the dataset ZIP on the mounted Drive; extract_path is the
# local directory that receives its contents.
source_path = "/content/drive/MyDrive/THESIS_KECHAGIAS/DATA/DATASET/FASTQ_FILES.zip"
extract_path = "/content/fastq_dataset"

# Extract or create ZIP file based on source path
# (source_path ends in .zip, so the extraction branch is the one expected to run)
extract_or_create_zip(source_path, extract_path)

Extraction completed successfully: /content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/FASTQ_FILES.zip


### Extract any nested archives (.zip / .tar), if needed.

In [None]:
def extract_compressed_files(source_path, extract_dir):
    """Extract every ``.zip`` / ``.tar`` archive found directly in a directory.

    Only the top level of ``source_path`` is scanned. Archive suffixes are
    matched case-insensitively, so ``DATA.ZIP`` is picked up as well. Each
    matching entry is handed to ``extract_file``.

    Args:
        source_path: Directory to scan for archives.
        extract_dir: Directory that receives the extracted contents; created
            if it does not exist yet.

    Returns:
        None. Problems with ``source_path`` are reported with ``print``.
    """
    # Check if the source path exists
    if not os.path.exists(source_path):
        print(f"Error: Source path '{source_path}' not found.")
        return

    # os.listdir would raise on a regular file; report it like the case above.
    if not os.path.isdir(source_path):
        print(f"Error: Source path '{source_path}' is not a directory.")
        return

    # Create the extraction directory if it doesn't exist
    os.makedirs(extract_dir, exist_ok=True)

    # The listing is taken once, before anything is extracted, so files that
    # land in the same directory during extraction are not revisited.
    # Sorting makes the processing order deterministic.
    for file in sorted(os.listdir(source_path)):
        if file.lower().endswith(('.zip', '.tar')):
            extract_file(os.path.join(source_path, file), extract_dir)

def extract_file(source_path, extract_dir):
    """Extract a single ``.zip`` or ``.tar`` archive into ``extract_dir``.

    The archive type is chosen from the lower-cased file extension. A file
    with any other extension is reported as skipped rather than announced as
    successfully extracted. Exceptions raised while extracting are caught and
    printed, so one bad archive does not abort the caller.

    Args:
        source_path: Path of the archive to extract.
        extract_dir: Directory that receives the archive contents.

    Returns:
        None. The outcome is reported with ``print``.
    """
    try:
        # Get the file extension
        file_extension = os.path.splitext(source_path)[1].lower()

        # Extract based on file type
        if file_extension == '.zip':
            with zipfile.ZipFile(source_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

        elif file_extension == '.tar':
            with tarfile.open(source_path, 'r') as tar_ref:
                # Use the 'data' extraction filter when this Python provides it:
                # it refuses members that would be written outside extract_dir.
                if hasattr(tarfile, 'data_filter'):
                    tar_ref.extractall(extract_dir, filter='data')
                else:
                    tar_ref.extractall(extract_dir)

        else:
            # Nothing was extracted, so do not print the success message.
            print(f"Skipped unsupported file type: {source_path}")
            return

        print(f"Extraction completed successfully: {source_path}")

    except Exception as e:
        print(f"Error extracting {source_path}: {e}")

In [None]:
# Extract the compressed file
# Source and destination are the same directory: archives found directly inside
# extract_path are unpacked in place.
extract_compressed_files(extract_path, extract_path)

Extraction completed successfully: /content/fastq_dataset/FASTQ_FILES.zip


## Build the DataLoader.

### Import Libraries.

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.utils import Sequence
!pip install -q Bio
from Bio import SeqIO

### Initialize the Dataset.

In [None]:
# Root directory that holds the extracted FASTQ dataset
dataset_path = "/content/fastq_dataset"

# Keep every entry of the dataset directory whose full path ends with 'fastq'
directory_paths = [
    entry_path
    for entry_path in (
        os.path.join(dataset_path, entry_name)
        for entry_name in os.listdir(dataset_path)
    )
    if entry_path.endswith('fastq')
]

print(directory_paths)

['/content/fastq_dataset/ERR5960500_2.fastq', '/content/fastq_dataset/ERR5885031_1.fastq', '/content/fastq_dataset/ERR5885024_2.fastq', '/content/fastq_dataset/ERR5885026_2.fastq', '/content/fastq_dataset/ERR6608944_2.fastq', '/content/fastq_dataset/ERR6155196_1.fastq', '/content/fastq_dataset/ERR5885029_1.fastq', '/content/fastq_dataset/ERR6608942_2.fastq', '/content/fastq_dataset/ERR6155195_1.fastq', '/content/fastq_dataset/ERR5960498_2.fastq', '/content/fastq_dataset/ERR5885030_2.fastq', '/content/fastq_dataset/ERR5960498_1.fastq', '/content/fastq_dataset/ERR6155196_2.fastq', '/content/fastq_dataset/ERR5885024_1.fastq', '/content/fastq_dataset/ERR6053338_2.fastq', '/content/fastq_dataset/ERR5885033_2.fastq', '/content/fastq_dataset/ERR5885028_1.fastq', '/content/fastq_dataset/ERR5885027_1.fastq', '/content/fastq_dataset/ERR6608942_1.fastq', '/content/fastq_dataset/ERR5885025_2.fastq', '/content/fastq_dataset/ERR6053337_1.fastq', '/content/fastq_dataset/ERR5885031_2.fastq', '/content

### Create a DataLoader class.

In [None]:
class FastqDataset(Sequence):
    """Keras ``Sequence`` that serves FASTQ records in batches of *files*.

    A batch covers up to ``batch_size`` FASTQ files from ``data_dir``. The item
    returned for a batch is a flat list of ``(read, qualities)`` tuples
    collected from every record of those files, so its length is the number of
    reads, not the number of files.

    Args:
        data_dir: Directory scanned (non-recursively) for files whose name
            ends in ``.fastq``.
        batch_size: Number of files per batch; must be positive.
        shuffle: When true, the file order is shuffled at construction time
            and again at the end of every epoch. When false, files are served
            in sorted-name order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, data_dir, batch_size=32, shuffle=True):
        # Harmless for base classes without their own initializer; newer Keras
        # dataset base classes expect subclasses to run it.
        super().__init__()

        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.data_dir = data_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.file_list = self.get_file_list()
        self.indexes = list(range(len(self.file_list)))
        # Honor the shuffle flag instead of always shuffling.
        if self.shuffle:
            random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        # Ceiling division: a plain floor would drop the trailing files and
        # report zero batches when there are fewer files than batch_size.
        return (len(self.file_list) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the list of ``(read, qualities)`` tuples for batch ``index``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``. This also
                terminates plain ``for`` loops that fall back to indexing.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"batch index {index} out of range for {len(self)} batches")

        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        batch_files = [self.file_list[i] for i in batch_indexes]
        batch_data = self.load_batch(batch_files)

        return batch_data

    def on_epoch_end(self):
        """Reshuffle the file order between epochs when shuffling is enabled."""
        if self.shuffle:
            random.shuffle(self.indexes)

    def get_file_list(self):
        """Return the paths of all ``.fastq`` files directly inside ``data_dir``."""
        # Sorted so that the unshuffled order does not depend on the
        # filesystem's listing order.
        return [
            os.path.join(self.data_dir, filename)
            for filename in sorted(os.listdir(self.data_dir))
            if filename.endswith(".fastq")
        ]

    def load_batch(self, batch_files):
        """Parse every file in ``batch_files`` into one flat list of pairs."""
        batch_data = []
        for file_path in batch_files:
            reads, qualities = self.parse_fastq(file_path)
            batch_data.extend(zip(reads, qualities))
        return batch_data

    def parse_fastq(self, file_path):
        """Read one FASTQ file.

        Args:
            file_path: Path of the FASTQ file to parse.

        Returns:
            A ``(reads, qualities)`` pair of parallel lists: each read as a
            string, and the matching ``phred_quality`` annotation of the record.
        """
        reads, qualities = [], []

        for record in SeqIO.parse(file_path, 'fastq'):
            reads.append(str(record.seq))
            qualities.append(record.letter_annotations['phred_quality'])

        return reads, qualities

In [None]:
# Define data directory containing FASTQ files
data_dir = '/content/fastq_dataset'

# Create an instance of the custom dataset
# (default arguments: batch_size=32 files per batch, shuffle=True)
dataset = FastqDataset(data_dir)

# Example usage of the dataset
# Each batch_data is a list of (read, qualities) tuples, so the number printed
# below counts reads in the batch, not files.
for batch_data in dataset:
    # Process batch_data as needed
    print("Batch Size:", len(batch_data))