<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/FASTQ_Data_Miner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Connect Google Drive.

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Libraries.

In [None]:
import os
import shutil
!pip install rarfile
import rarfile
import zipfile
import gzip
import csv
import numpy as np
import pandas as pd
import requests
import re
import random
import requests
!pip install Bio
from Bio import SeqIO
from datetime import datetime

### Import XML file with report file links.

The XML file contains links for one or more report files. These report files contain the FASTQ files to be downloaded.

In [3]:
def extract_files(source_path, destination_path):
    try:
        # Create the destination directory if it doesn't exist
        os.makedirs(destination_path, exist_ok=True)

        # Check the file extension and extract or copy, accordingly
        if source_path.endswith('.rar'):
            with rarfile.RarFile(source_path) as rf:
                rf.extractall(destination_path)

        elif source_path.endswith('.zip'):
            with zipfile.ZipFile(source_path, 'r') as zf:
                zf.extractall(destination_path)

        elif source_path.endswith('.xml'):
            shutil.copy(source_path, destination_path)

        else:
            raise ValueError("Unsupported file type. Only RAR, ZIP, and XML files are supported.")

        print(f"Extraction completed successfully to: {destination_path}")

    except Exception as e:
        print(f"Error during extraction: {e}")

In [4]:
# Define the source and destination paths
dataset_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/SOURCES/PRJEB44548.xml"
xml_path = "/content/"

# Extract the files
extract_files(dataset_path, xml_path)

Extraction completed successfully to: /content/


### Create a CSV file with the report file links.

In [5]:
def extract_report_file_links(text):
    report_files = []
    lines = text.split('\n')
    index = 0

    while index < len(lines):
        line = lines[index]

        if "<DB>ENA-FASTQ-FILES</DB>" in line:
            # Search for the link below this line
            next_line = lines[index + 1]

            if "<ID>" in next_line:
                link_start_index = next_line.find("<ID><![CDATA[")

                if link_start_index != -1:
                    link_end_index = next_line.find("]]></ID>")

                    if link_end_index != -1:
                        fastq_link = next_line[link_start_index+len("<ID><![CDATA["):link_end_index]
                        report_files.append(fastq_link)

            # Move to the next line after the link
            index += 2
        else:
            index += 1

    return report_files

def read_xml_file(xml_path):
    try:
        with open(xml_path, 'r') as f:
            xml_text = f.read()

        return xml_text

    except Exception as e:
        print(f"Error reading XML file: {e}")

        return None

def save_links_to_csv(links, xml_path):
    # Extract CSV file name from XML file name
    base_name = os.path.basename(xml_path)
    csv_filename = os.path.splitext(base_name)[0] + "_links.csv"
    csv_path = os.path.join("/content/", csv_filename)

    try:
        with open(csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Report file Links"])

            for link in links:
                writer.writerow([link])

        print(f"Report file links saved to '{csv_path}' successfully.")

        return csv_path

    except Exception as e:
        print(f"Error saving CSV file: {e}")

In [6]:
# Read the XML file
xml_path += os.path.basename(dataset_path)
xml_text = read_xml_file(xml_path)

if xml_text:
    report_files = extract_report_file_links(xml_text)

    if report_files:
        report_csv_path = save_links_to_csv(report_files, xml_path)

    else:
        print("No report file links found in the XML file.")
else:
    print("Error reading XML file. Check the file path.")

Report file links saved to '/content/PRJEB44548_links.csv' successfully.


Display the csv file information.

In [7]:
'''
def display_csv_content(csv_path):
    try:
        # Create a Pandas DataFrame
        df = pd.read_csv(csv_path)

        # Display the DataFrame head and info
        print(df.head())
        print(df.info())

    except Exception as e:
        print(f"Error reading CSV file: {e}")

display_csv_content(csv_path)
'''

'\ndef display_csv_content(csv_path):\n    try:\n        # Create a Pandas DataFrame\n        df = pd.read_csv(csv_path)\n\n        # Display the DataFrame head and info\n        print(df.head())\n        print(df.info())\n\n    except Exception as e:\n        print(f"Error reading CSV file: {e}")\n\ndisplay_csv_content(csv_path)\n'

### Download the report files.

Extract the desired number of file reports from the CSV file that contains the links.

In [8]:
def download_reports(report_links, download_dir, num_files=None, seed=None):
    os.makedirs(download_dir, exist_ok=True)

    if seed is not None:
        random.seed(seed)
        random.shuffle(report_links)

    if num_files is not None:
        report_links = report_links[:num_files]

    for link in report_links:
        file_name = link.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)

        response = requests.get(link, stream=True)
        if response.status_code == 200:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    f.write(chunk)

Define the hyperparameters and call the function to download the reports.

In [9]:
df = pd.read_csv(report_csv_path)
REPORT_LINKS = df['Report file Links']
DOWNLOAD_DIR = "/content/REPORT_FILES/"
NUM_FILES = 50
SEED = 2

download_reports(REPORT_LINKS, DOWNLOAD_DIR, NUM_FILES, SEED)

### Extract the FTP links.

In [10]:
for report_file in os.listdir(DOWNLOAD_DIR):
    report_file_path = os.path.join(DOWNLOAD_DIR, report_file)

    with open(report_file_path, 'r') as file:
        # Skip the first line (header)
        next(file)

        # Initialize a list to store FTP links
        ftp_links = []

        # Iterate through each line in the file
        for line in file:
            # Split the line based on tab ('\t') delimiter
            line_parts = line.strip().split('\t')

            # Get the FTP links from the second column
            ftp_links.extend(line_parts[1].split(';'))

# Print the number of FTP links
print(f"{len(ftp_links)} FTPs saved.")

46 FTPs saved.


### Download FASTQ.GZ files from FTP links.

In [11]:
def check_and_download_file(ftp_link, max_file_size, download_dir):
    if not ftp_link.startswith("http://") and not ftp_link.startswith("https://"):
        ftp_link = "https://" + ftp_link

    # Send a HEAD request to get the file metadata
    response = requests.head(ftp_link)

    if response.status_code == 200:
        # Extract the content length from the response headers
        content_length = int(response.headers.get("Content-Length", 0))

        # Convert content length to megabytes
        content_length_mb = content_length / (1024 * 1024)

        if content_length_mb <= max_file_size:
            # Download the file
            file_name = ftp_link.split('/')[-1]
            file_path = os.path.join(download_dir, file_name)

            # Check if the file already exists
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    # Download the file
                    file_response = requests.get(ftp_link, stream=True)
                    for chunk in file_response.iter_content(chunk_size=1024):
                        f.write(chunk)

                print(f"Downloaded: {file_name}")
            else:
                print(f"File already exists: {file_name}")
        else:
            print(f"Skipping file: {ftp_link} (Size exceeds maximum file size)")
            '''
            Based on this case, there could be an additional option to save these links
            in order to use them for the creation of a different dataset that contains such large files:

            skipped_files.append(ftp_link)

            '''
    else:
        print(f"Failed to retrieve metadata for file: {ftp_link}")

In [12]:
MAX_FILE_SIZE = 500

for ftp_link in ftp_links:
  check_and_download_file(ftp_link, MAX_FILE_SIZE, DOWNLOAD_DIR)

Downloaded: ERR5885024_1.fastq.gz
Downloaded: ERR5885024_2.fastq.gz
Downloaded: ERR5960498_1.fastq.gz
Downloaded: ERR5960498_2.fastq.gz
Downloaded: ERR5960500_1.fastq.gz
Downloaded: ERR5960500_2.fastq.gz
Downloaded: ERR6053338_1.fastq.gz
Downloaded: ERR6053338_2.fastq.gz
Downloaded: ERR5885023_1.fastq.gz
Downloaded: ERR5885023_2.fastq.gz
Downloaded: ERR5885025_1.fastq.gz
Downloaded: ERR5885025_2.fastq.gz
Downloaded: ERR5885028_1.fastq.gz
Downloaded: ERR5885028_2.fastq.gz
Downloaded: ERR5885029_1.fastq.gz
Downloaded: ERR5885029_2.fastq.gz
Downloaded: ERR5885030_1.fastq.gz
Downloaded: ERR5885030_2.fastq.gz
Downloaded: ERR5885032_1.fastq.gz
Downloaded: ERR5885032_2.fastq.gz
Downloaded: ERR5960499_1.fastq.gz
Downloaded: ERR5960499_2.fastq.gz
Downloaded: ERR6053339_1.fastq.gz
Downloaded: ERR6053339_2.fastq.gz
Downloaded: ERR6608942_1.fastq.gz
Downloaded: ERR6608942_2.fastq.gz
Downloaded: ERR6608944_1.fastq.gz
Downloaded: ERR6608944_2.fastq.gz
Downloaded: ERR6608943_1.fastq.gz
Downloaded: ER

### Decompress FASTQ.GZ files.

In [14]:
def decompress_fastq_gz(input_file, output_file):
    with gzip.open(input_file, 'rb') as f_in:
        with open(output_file, 'wb') as f_out:
            f_out.write(f_in.read())

In [16]:
input_dir = "/content/REPORT_FILES"
output_dir = "/content/FASTQ_FILES"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Iterate through files in the input directory
for file_name in os.listdir(input_dir):
    if file_name.endswith('.fastq.gz'):
        input_file_path = os.path.join(input_dir, file_name)
        output_file_path = os.path.join(output_dir, file_name[:-3])  # Remove the .gz extension

        # Decompress the file
        decompress_fastq_gz(input_file_path, output_file_path)

        print(f"Decompressed: {file_name}")

Decompressed: ERR6053337_1.fastq.gz
Decompressed: ERR5960500_2.fastq.gz
Decompressed: ERR5885028_2.fastq.gz
Decompressed: ERR6155196_1.fastq.gz
Decompressed: ERR5885029_2.fastq.gz
Decompressed: ERR5960499_2.fastq.gz
Decompressed: ERR6155196_2.fastq.gz
Decompressed: ERR6053338_1.fastq.gz
Decompressed: ERR6155194_1.fastq.gz
Decompressed: ERR6155194_2.fastq.gz
Decompressed: ERR5885030_1.fastq.gz
Decompressed: ERR5960498_2.fastq.gz
Decompressed: ERR5885030_2.fastq.gz
Decompressed: ERR6608944_1.fastq.gz
Decompressed: ERR6053339_1.fastq.gz
Decompressed: ERR6608944_2.fastq.gz
Decompressed: ERR5885033_2.fastq.gz
Decompressed: ERR6608943_2.fastq.gz
Decompressed: ERR6608942_2.fastq.gz
Decompressed: ERR5885031_2.fastq.gz
Decompressed: ERR5885023_1.fastq.gz
Decompressed: ERR6155195_2.fastq.gz
Decompressed: ERR6053339_2.fastq.gz
Decompressed: ERR5960500_1.fastq.gz
Decompressed: ERR5885033_1.fastq.gz
Decompressed: ERR5885025_2.fastq.gz
Decompressed: ERR5885027_2.fastq.gz
Decompressed: ERR5960499_1.f

### Get the meta data of the FASTQ files.

In [22]:
def extract_metadata_from_fastq(directory):
    metadata = []
    for filename in os.listdir(directory):
        if filename.endswith(".fastq"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r") as handle:
                for record in SeqIO.parse(handle, "fastq"):
                    # Extract relevant metadata from each record
                    record_metadata = {
                        "file_name": filename,
                        "sequence_id": record.id,
                        "sequence_length": len(record.seq),
                        "quality_scores": record.letter_annotations["phred_quality"]
                    }
                    metadata.append(record_metadata)
    return metadata

In [None]:
# Specify the directory containing FASTQ files
fastq_directory = '/content/FASTQ_FILES'

# Extract metadata from FASTQ files
fastq_metadata = extract_metadata_from_fastq(fastq_directory)

# Example printing of metadata
for record_metadata in fastq_metadata:
    print(record_metadata)

## Compress and save to Drive.

In [None]:
# Define the source folder and its contents
source_folder = '/content/FASTQ_FILES'

# Zip the folder
shutil.make_archive(source_folder, 'zip', source_folder)

# Move the zip file to Google Drive
timestamp = datetime.now().strftime("%Y%m%d")
seed = str(SEED)
new_filename = f'fastq_files_seed_{seed}_{timestamp}.zip'

shutil.move(source_folder + '.zip', f'/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/{new_filename}')

print(f"Folder '{source_folder}' has been zipped and uploaded to Google Drive with seed {seed}.")

Folder '/content/fastq_files' has been zipped and uploaded to Google Drive with seed 2.
