<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/FASTQ_Data_Miner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive. Later cells read the source archive
# from, and write the final zip to, paths under /content/drive/MyDrive.
from google.colab import drive

drive.mount('/content/drive')

## Extract the XML file that contains the FASTQ file-report links.

In [None]:
!pip install rarfile

import rarfile
import os

# Location of the RAR archive on the mounted Drive
dataset_drive_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/SOURCES/ena_sra-experiment_20231030-2024.rar"
# Folder that receives the extracted archive members
xml_path = "/content/"

# Make sure the target folder is present before unpacking into it
os.makedirs(xml_path, exist_ok=True)

# Unpack every member of the archive into the target folder
with rarfile.RarFile(dataset_drive_path) as archive:
    archive.extractall(path=xml_path)

Collecting rarfile
  Downloading rarfile-4.1-py3-none-any.whl (28 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.1


## Create a txt file with the report links.

In [None]:
from lxml import etree

# Create an XMLParser with error recovery, so malformed markup is skipped
# instead of aborting the whole parse.
parser = etree.XMLParser(recover=True)

xml_path = "/content/ena_sra-experiment_20231030-2024.xml"

# Try to parse the XML file. On failure the error is reported and `root` is
# left as None, so the cell still runs and writes an empty links file.
try:
    tree = etree.parse(xml_path, parser=parser)
    root = tree.getroot()

except Exception as e:
    print("An error occurred while parsing the XML file:")
    print(e)
    root = None

# Collect the ID of every XREF_LINK whose DB is 'ENA-FASTQ-FILES'.
# Nothing is downloaded here; the IDs are only gathered into a list.
fastq_links = []

if root is not None:
    for experiment in root.findall('.//EXPERIMENT'):
        experiment_links = experiment.findall('.//EXPERIMENT_LINKS/EXPERIMENT_LINK/XREF_LINK')
        for link in experiment_links:
            db = link.find('DB')
            if db is None or (db.text or '').strip() != 'ENA-FASTQ-FILES':
                continue

            fastq_link = link.find('ID')
            # An <ID> element without text has .text == None; skipping it keeps
            # the '\n'.join(...) below from raising TypeError.
            link_text = (fastq_link.text or '').strip() if fastq_link is not None else ''
            if link_text:
                fastq_links.append(link_text)

# Persist the links, one per line
fastq_links_file = "/content/fastq_links.txt"
with open(fastq_links_file, 'w') as file:
    file.write('\n'.join(fastq_links))

## Download the file reports.

In [None]:
import requests
import random

def download_reports(fastq_links, download_dir, num_files=None, seed=None, timeout=60):
    """Download the files behind `fastq_links` into `download_dir`.

    Args:
        fastq_links: Iterable of URLs. It is never modified; shuffling and
            truncation are applied to an internal copy, so calling the
            function again with the same seed selects the same links.
        download_dir: Target folder; created if it does not exist.
        num_files: If given, only the first `num_files` links (after the
            optional shuffle) are downloaded.
        seed: If given, the global `random` generator is seeded with it and
            the copied list is shuffled before truncation.
        timeout: Seconds to wait for the server before a request is abandoned.

    Returns:
        None. A summary line with the number of successful downloads is printed.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Copy first: shuffling the caller's list in place would make a second
    # call with the same seed start from an already shuffled order.
    selected = list(fastq_links)

    if seed is not None:
        random.seed(seed)
        random.shuffle(selected)

    if num_files is not None:
        selected = selected[:num_files]

    downloaded = 0
    for link in selected:
        file_name = link.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)
        # Stream into a side file and rename on success, so an interrupted
        # transfer never leaves a truncated file under the final name.
        partial_path = file_path + ".part"

        try:
            with requests.get(link, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Skipped {link} (HTTP status {response.status_code})")
                    continue

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        f.write(chunk)

            os.replace(partial_path, file_path)
            downloaded += 1

        except (requests.RequestException, OSError) as e:
            print(f"Failed to download {link}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"Downloaded {downloaded} of {len(selected)} report files to {download_dir}.")

In [None]:
# Clear the folder from previous downloads (optional)
def clear_folder(folder_path):
    """Delete every file directly inside `folder_path`.

    Subdirectories are left untouched (`os.remove` cannot delete them), and a
    folder that does not exist is reported instead of raising.

    Args:
        folder_path: Folder whose files are removed.

    Returns:
        None. The number of files found and removed is printed.
    """
    if not os.path.isdir(folder_path):
        print(f"{folder_path} does not exist; nothing to clear.")
        return

    # List the directory once and keep only entries os.remove can delete
    candidate_paths = [os.path.join(folder_path, entry) for entry in os.listdir(folder_path)]
    file_paths = [path for path in candidate_paths
                  if os.path.isfile(path) or os.path.islink(path)]
    print(f"{len(file_paths)} files found.")

    for file_path in file_paths:
        os.remove(file_path)

    print(f"{len(file_paths)} removed from {folder_path}.")

In [None]:
#clear_folder('/content/report_files')

In [None]:
# Download settings (SEED is also used later to name the archive saved to Drive)
DOWNLOAD_DIR = "/content/report_files"  # folder that receives the report files
NUM_OF_FILES_TO_DOWNLOAD = 50  # number of links kept after shuffling
SEED = 2 # shuffle seed for this run; notes for the previous downloads: 12,10,6,8,14,42,38

# Download a seeded random sample of the report links collected above
download_reports(fastq_links, DOWNLOAD_DIR, num_files=NUM_OF_FILES_TO_DOWNLOAD, seed=SEED)

Downloaded 50 FASTQ files to /content/report_files.


## Download FASTQ files from FTP links.

The files are compressed (`.fastq.gz`). A maximum file size (in MB) is set before downloading, and any file larger than that limit is skipped.

In [None]:
# Define the maximum file size to download (MB)
MAX_FILE_SIZE = 50

# Specify the directory path to download files
download_dir = "/content/fastq_files/"

# Check file size before downloading and download a file from the ftp link
def check_and_download_file(ftp_link, timeout=60):
    """Download `ftp_link` into `download_dir` unless it exceeds `MAX_FILE_SIZE`.

    The size is read from the `Content-Length` header of a HEAD request. Every
    outcome (downloaded, skipped, already present, failed) is printed; network
    errors are reported rather than raised, so a batch loop keeps going.

    Args:
        ftp_link: File location, with or without an http(s):// prefix. When the
            prefix is missing, "https://" is prepended.
        timeout: Seconds to wait for the server before a request is abandoned.

    Returns:
        None.
    """
    if not ftp_link.startswith(("http://", "https://")):
        ftp_link = "https://" + ftp_link

    # Send a HEAD request to get the file metadata. Redirects are followed so
    # the HEAD request resolves to the same resource the later GET fetches.
    try:
        response = requests.head(ftp_link, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        print(f"Failed to retrieve metadata for file: {ftp_link}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve metadata for file: {ftp_link}")
        return

    # Without a usable Content-Length the limit cannot be checked, so the file
    # is skipped instead of being treated as 0 MB and downloaded unbounded.
    size_header = response.headers.get("Content-Length")
    if size_header is None or not size_header.strip().isdigit():
        print(f"Skipping file: {ftp_link} (Size unknown)")
        return

    # Convert content length to megabytes
    content_length_mb = int(size_header) / (1024 * 1024)

    if content_length_mb > MAX_FILE_SIZE:
        print(f"Skipping file: {ftp_link} (Size exceeds maximum file size)")
        # Based on this case, there could be an additional option to save these
        # links in order to build a different dataset that contains such large
        # files, e.g. skipped_files.append(ftp_link)
        return

    file_name = ftp_link.split('/')[-1]
    file_path = os.path.join(download_dir, file_name)

    # Check if the file already exists
    if os.path.exists(file_path):
        print(f"File already exists: {file_name}")
        return

    os.makedirs(download_dir, exist_ok=True)

    # Stream into a side file and rename only after a complete, successful
    # transfer. A failed download therefore never leaves an empty or truncated
    # file under the final name that a later run would report as existing.
    partial_path = file_path + ".part"
    try:
        with requests.get(ftp_link, stream=True, timeout=timeout) as file_response:
            file_response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in file_response.iter_content(chunk_size=1024):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    except (requests.RequestException, OSError) as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download file: {ftp_link} ({e})")
        return

    print(f"Downloaded: {file_name}")

In [None]:
# Create the download folder; exist_ok keeps the cell re-runnable
os.makedirs("/content/fastq_files", exist_ok=True)

In [None]:
# Clear the folder from previous downloads (optional)
#clear_folder('/content/fastq_files')

In [None]:
# Define the directory path containing the FTP links
directory_path = "/content/report_files/"

# Define the directory path in which the files will be downloaded
download_dir = "/content/fastq_files/"

# List all files in the directory
all_files = os.listdir(directory_path)

# Filter files that contain the necessary information
info_files = [file for file in all_files if "filereport" in file]

# Iterate through each info file
for info_file in info_files:
    file_path = os.path.join(directory_path, info_file)

    # Read the report, then close it before any download starts
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # The first line is treated as a header and the second as the data line.
    # A report with fewer than two lines has no data to use.
    if len(lines) < 2:
        print(f"No data line in report: {info_file}")
        continue

    data_line = lines[1]
    columns = data_line.strip().split('\t')

    # The links are read from the second tab-separated column
    # (assumed to be the FASTQ link column -- TODO confirm against the report header).
    if len(columns) < 2:
        continue

    ftp_link = columns[1]
    # Several links are separated by ';' -- take only the first one
    ftp_link_1 = ftp_link.split(";")[0].strip()

    if not ftp_link_1:
        print(f"No FASTQ link in report: {info_file}")
        continue

    check_and_download_file(ftp_link_1) # Use ftp_link instead to download all files

print("Downloaded FASTQ files.")

Downloaded: SRR14167530_1.fastq.gz
Downloaded: SRR16316187.fastq.gz
Downloaded: SRR13841556_1.fastq.gz
Downloaded: SRR14187989_1.fastq.gz
Downloaded: SRR7781714_1.fastq.gz
Skipping file: https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR128/069/SRR12824069/SRR12824069.fastq.gz (Size exceeds maximum file size)
Downloaded: SRR14592499_1.fastq.gz
Downloaded: SRR7151338.fastq.gz
Skipping file: https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR801/009/ERR8017479/ERR8017479_1.fastq.gz (Size exceeds maximum file size)
Downloaded: SRR4446771.fastq.gz
Skipping file: https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR103/046/SRR10314346/SRR10314346.fastq.gz (Size exceeds maximum file size)
Skipping file: https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR317/007/SRR3170977/SRR3170977_1.fastq.gz (Size exceeds maximum file size)
Downloaded: SRR14592962_1.fastq.gz
Skipping file: https://ftp.sra.ebi.ac.uk/vol1/fastq/ERR809/001/ERR8093081/ERR8093081_1.fastq.gz (Size exceeds maximum file size)
Downloaded: SRR7151162.fastq.gz
Downloaded: SRR79

## Decompress .FASTQ.GZ files.

In [None]:
import gzip
import shutil

def decompress_gz_files(directory_path):
    """Decompress every `.gz` file in `directory_path` next to the original.

    Each archive is written to the same path without the `.gz` suffix and the
    archive is deleted once decompression succeeds. An archive that cannot be
    decompressed is reported and kept, and its partial output is removed.

    Args:
        directory_path: Folder that is scanned (non-recursively) for `.gz` files.

    Returns:
        None. Errors and a completion message are printed.
    """
    import zlib  # local import: only needed for the exception type below

    # List all files in the directory and keep only .gz files
    gz_files = [file for file in os.listdir(directory_path) if file.endswith(".gz")]

    failed = 0
    for gz_file in gz_files:
        gz_file_path = os.path.join(directory_path, gz_file)

        # Determine the output file name by removing the '.gz' extension
        output_file_path = os.path.splitext(gz_file_path)[0]

        try:
            with gzip.open(gz_file_path, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # gzip.BadGzipFile is an OSError subclass. A truncated archive raises
        # EOFError and corrupt data raises zlib.error, which are not OSErrors.
        except (OSError, EOFError, zlib.error) as e:
            failed += 1
            print(f"Error decompressing {gz_file}: {e}")
            # Do not leave a half-written output behind
            if os.path.exists(output_file_path):
                os.remove(output_file_path)
            continue

        # Remove the original .gz file after decompression
        try:
            os.remove(gz_file_path)
        except OSError as e:
            print(f"Error decompressing {gz_file}: {e}")

    if failed:
        print(f"{failed} of {len(gz_files)} .gz files could not be decompressed.")

    print("Decompression complete.")

In [None]:
# Folder that holds the downloaded .fastq.gz files
directory_path = "/content/fastq_files/"

# Decompress every .gz file in that folder; each archive is deleted once it
# has been decompressed successfully
decompress_gz_files(directory_path)

Decompression complete.


## Save to Drive.

In [None]:
from datetime import datetime

# Folder whose contents go into the archive
source_folder = '/content/fastq_files'

# Build the zip next to the folder; make_archive appends the '.zip' suffix
shutil.make_archive(source_folder, 'zip', source_folder)
archive_path = f"{source_folder}.zip"

# Name the archive after the shuffle seed and today's date
timestamp = datetime.now().strftime("%Y%m%d")
seed = str(SEED)
new_filename = f'fastq_files_seed_{seed}_{timestamp}.zip'

# Move the zip file to Google Drive under its final name
destination_path = f'/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/{new_filename}'
shutil.move(archive_path, destination_path)

print(f"Folder '{source_folder}' has been zipped and uploaded to Google Drive with seed {seed}.")

Folder '/content/fastq_files' has been zipped and uploaded to Google Drive with seed 2.
