<a href="https://colab.research.google.com/github/andrkech/GENERATIVE-METHODS-IN-GENOMICS/blob/main/FASTQ_Data_Miner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Connect Google Drive.

In [1]:
# Mount Google Drive inside the Colab filesystem. Later cells read the source
# XML from, and write the final archive to, paths under '/content/drive/'.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Libraries.

In [2]:
import os
import shutil
!pip install rarfile
import rarfile
import zipfile
import gzip
import csv
import numpy as np
import pandas as pd
import requests
import re
import random
import requests

Collecting rarfile
  Downloading rarfile-4.2-py3-none-any.whl (29 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.2


### Import XML file with report file links.

The XML file contains links to one or more report files. Each report file lists the download links of the FASTQ files to be fetched.

In [3]:
def extract_files(source_path, destination_path):
    """Extract an archive, or copy a plain XML file, into ``destination_path``.

    The file type is taken from the extension of ``source_path`` and matched
    case-insensitively, so ``data.ZIP`` is handled like ``data.zip``:
    ``.rar`` and ``.zip`` archives are extracted, ``.xml`` files are copied.

    Args:
        source_path: Path (``str`` or path-like) of the RAR/ZIP archive or
            XML file.
        destination_path: Directory that receives the extracted or copied
            files. It is created when it does not exist.

    Returns:
        None. Success and failure are both reported with ``print``; any
        exception (including the ``ValueError`` raised for an unsupported
        extension) is caught and printed rather than propagated.
    """
    try:
        # Normalise once so that path-like objects and upper-case extensions work
        source_path = os.fspath(source_path)
        extension = os.path.splitext(source_path)[1].lower()

        # Create the destination directory if it doesn't exist
        os.makedirs(destination_path, exist_ok=True)

        # Extract or copy according to the file extension
        if extension == '.rar':
            with rarfile.RarFile(source_path) as rf:
                rf.extractall(destination_path)

        elif extension == '.zip':
            with zipfile.ZipFile(source_path, 'r') as zf:
                zf.extractall(destination_path)

        elif extension == '.xml':
            shutil.copy(source_path, destination_path)

        else:
            raise ValueError("Unsupported file type. Only RAR, ZIP, and XML files are supported.")

        print(f"Extraction completed successfully to: {destination_path}")

    except Exception as e:
        print(f"Error during extraction: {e}")

In [4]:
# Source XML on Google Drive, and the local folder it is copied into
dataset_path = "/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/SOURCES/PRJEB44548.xml"
xml_path = "/content/"

# Bring the XML file into the Colab working directory
extract_files(source_path=dataset_path, destination_path=xml_path)

Extraction completed successfully to: /content/


### Create a CSV file with the report file links.

In [5]:
def extract_report_file_links(text):
    """Collect the report-file links that follow each ENA-FASTQ-FILES marker.

    The text is scanned line by line. Whenever a line contains
    ``<DB>ENA-FASTQ-FILES</DB>``, the line directly below it is inspected and,
    if it holds ``<ID><![CDATA[ ... ]]></ID>``, the text between those two
    delimiters is recorded as a link.

    Args:
        text: Content of the XML file as a single string.

    Returns:
        List of link strings in the order they appear; empty when none is
        found (including when ``text`` is empty or the marker is on the very
        last line).
    """
    marker = "<DB>ENA-FASTQ-FILES</DB>"
    link_prefix = "<ID><![CDATA["
    link_suffix = "]]></ID>"

    report_files = []
    lines = text.split('\n')
    index = 0

    while index < len(lines):
        if marker not in lines[index]:
            index += 1
            continue

        # The link is expected on the line below the marker; a marker on the
        # last line has no such line and is simply ignored.
        if index + 1 < len(lines):
            next_line = lines[index + 1]
            link_start_index = next_line.find(link_prefix)

            if link_start_index != -1:
                link_start_index += len(link_prefix)
                # Look for the closing tag only after the opening one
                link_end_index = next_line.find(link_suffix, link_start_index)

                if link_end_index != -1:
                    report_files.append(next_line[link_start_index:link_end_index])

        # Move past the marker and the line that was inspected for the link
        index += 2

    return report_files

def read_xml_file(xml_path):
    """Return the whole content of ``xml_path`` as text.

    Args:
        xml_path: Path of the file to read.

    Returns:
        The file content as a string, or None when the file could not be
        read (the error is printed, not raised).
    """
    xml_text = None

    try:
        with open(xml_path, 'r') as handle:
            xml_text = handle.read()

    except Exception as err:
        print(f"Error reading XML file: {err}")

    return xml_text

def save_links_to_csv(links, xml_path, output_dir="/content/"):
    """Write the report-file links to a one-column CSV file.

    The CSV is named after the XML file: ``<xml base name>_links.csv``. It has
    a single header row, ``Report file Links``, followed by one link per row.

    Args:
        links: Iterable of link strings.
        xml_path: Path of the XML file the links came from; only its base
            name is used, to build the CSV file name.
        output_dir: Directory in which the CSV is written. Defaults to
            ``/content/``; it is created when it does not exist.

    Returns:
        Path of the written CSV file, or None when writing failed (the error
        is printed, not raised).
    """
    # Extract CSV file name from XML file name
    base_name = os.path.basename(xml_path)
    csv_filename = os.path.splitext(base_name)[0] + "_links.csv"
    csv_path = os.path.join(output_dir, csv_filename)

    try:
        os.makedirs(output_dir, exist_ok=True)

        with open(csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Report file Links"])
            writer.writerows([link] for link in links)

    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return None

    print(f"Report file links saved to '{csv_path}' successfully.")

    return csv_path

In [6]:
# Read the XML file.
# The file path is built in its own variable instead of appending to
# `xml_path`, so re-running this cell does not append the file name twice.
xml_file_path = os.path.join(xml_path, os.path.basename(dataset_path))
xml_text = read_xml_file(xml_file_path)

# Reset on every run so a failed run cannot leave a stale path from an earlier one
report_csv_path = None

if xml_text:
    report_files = extract_report_file_links(xml_text)

    if report_files:
        report_csv_path = save_links_to_csv(report_files, xml_file_path)

    else:
        print("No report file links found in the XML file.")
else:
    print("Error reading XML file. Check the file path.")

Report file links saved to '/content/PRJEB44548_links.csv' successfully.


Display the CSV file information (optional; the cell below is currently disabled).

In [7]:
# Disabled helper: the entire cell body is one string literal, so none of the
# code inside it runs; the notebook only echoes the string as the cell output.
# NOTE(review): the call at the bottom passes `csv_path`, which is not assigned
# at notebook level in the visible cells (the CSV path is kept in
# `report_csv_path`) — confirm the intended argument before re-enabling.
'''
def display_csv_content(csv_path):
    try:
        # Create a Pandas DataFrame
        df = pd.read_csv(csv_path)

        # Display the DataFrame head and info
        print(df.head())
        print(df.info())

    except Exception as e:
        print(f"Error reading CSV file: {e}")

display_csv_content(csv_path)
'''

'\ndef display_csv_content(csv_path):\n    try:\n        # Create a Pandas DataFrame\n        df = pd.read_csv(csv_path)\n\n        # Display the DataFrame head and info\n        print(df.head())\n        print(df.info())\n\n    except Exception as e:\n        print(f"Error reading CSV file: {e}")\n\ndisplay_csv_content(csv_path)\n'

### Download the report files.

Download the desired number of report files, using the links stored in the CSV file.

In [8]:
def download_reports(report_links, download_dir, num_files=None, seed=None):
    """Download report files, optionally a seeded random subset of them.

    Args:
        report_links: Iterable of URL strings (a list or a pandas Series).
            It is never modified: shuffling is done on a private copy.
        download_dir: Directory that receives the files; created if missing.
            Each file is named after the last ``/``-separated part of its URL.
        num_files: When given, only the first ``num_files`` links (after the
            optional shuffle) are downloaded.
        seed: When given, the links are shuffled with a generator seeded with
            this value before the subset is taken. The resulting order is the
            same one ``random.seed(seed)`` followed by ``random.shuffle``
            produces, but the module-level generator is left untouched.

    Returns:
        None. Links whose request does not answer with HTTP 200 are reported
        with ``print`` and skipped.
    """
    os.makedirs(download_dir, exist_ok=True)

    # Private copy: shuffling the caller's sequence in place would silently
    # reorder (and, for a Series taken from a DataFrame, rewrite) their data.
    links = list(report_links)

    if seed is not None:
        random.Random(seed).shuffle(links)

    if num_files is not None:
        links = links[:num_files]

    for link in links:
        file_name = link.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)

        # The context manager releases the connection even when the body is
        # not read; the timeout keeps a stalled server from hanging the cell.
        with requests.get(link, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"Failed to download report (HTTP {response.status_code}): {link}")
                continue

            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

Define the parameters and call the function to download the reports.

In [9]:
# Load the report links that were saved to CSV
df = pd.read_csv(report_csv_path)

# Hand over a plain list: any reordering done while sampling the links then
# cannot write back into `df`.
REPORT_LINKS = df['Report file Links'].tolist()

DOWNLOAD_DIR = "/content/REPORT_FILES/"  # folder that receives the report files
NUM_FILES = 50                           # maximum number of reports to download
SEED = 2                                 # seed that selects the random subset

download_reports(REPORT_LINKS, DOWNLOAD_DIR, NUM_FILES, SEED)

### Extract the FTP links.

In [10]:
# Collect the download links from every report file in DOWNLOAD_DIR.
# The list is created once, before the loop, so the links of all report files
# are kept (creating it inside the loop kept only the last file's links).
ftp_links = []

for report_file in os.listdir(DOWNLOAD_DIR):
    report_file_path = os.path.join(DOWNLOAD_DIR, report_file)

    # Skip sub-directories and the '.gz' archives that a later cell downloads
    # into this same folder: neither is a tab-separated report file.
    if not os.path.isfile(report_file_path) or report_file.endswith('.gz'):
        continue

    with open(report_file_path, 'r') as file:
        # Skip the first line (header); the default avoids StopIteration on an empty file
        next(file, None)

        # Iterate through each data line in the file
        for line in file:
            # Split the line based on tab ('\t') delimiter
            line_parts = line.strip().split('\t')

            # Blank or truncated rows have no second column
            if len(line_parts) < 2:
                continue

            # The second column holds ';'-separated links; drop empty entries
            ftp_links.extend(link for link in line_parts[1].split(';') if link)

# Print the number of FTP links
print(f"{len(ftp_links)} FTPs saved.")

46 FTPs saved.


In [11]:
def check_and_download_file(ftp_link, max_file_size, download_dir):
    """Download one file unless its advertised size exceeds ``max_file_size``.

    A HEAD request reads the ``Content-Length`` header first. The file is then
    streamed to a temporary ``<name>.part`` file and renamed only after the
    transfer finished, so an interrupted or failed download never leaves a
    file that a later run would report as "already exists".

    Args:
        ftp_link: Link of the file. ``https://`` is prepended when the link
            does not start with ``http://`` or ``https://``.
        max_file_size: Maximum accepted size in MB (1 MB = 1024 * 1024 bytes).
            A response without ``Content-Length`` is treated as size 0 and is
            therefore downloaded.
        download_dir: Directory that receives the file; created if missing.

    Returns:
        True when the file is present in ``download_dir`` after the call
        (downloaded now or already there), False when it was skipped or the
        server did not answer with HTTP 200.
    """
    if not ftp_link.startswith(("http://", "https://")):
        ftp_link = "https://" + ftp_link

    # Send a HEAD request to get the file metadata
    response = requests.head(ftp_link, timeout=60)

    if response.status_code != 200:
        print(f"Failed to retrieve metadata for file: {ftp_link}")
        return False

    # Content length in bytes, converted to megabytes
    content_length = int(response.headers.get("Content-Length", 0))
    content_length_mb = content_length / (1024 * 1024)

    if content_length_mb > max_file_size:
        # These links could be collected to build a separate dataset of large files
        print(f"Skipping file: {ftp_link} (Size exceeds maximum file size)")
        return False

    file_name = ftp_link.split('/')[-1]
    file_path = os.path.join(download_dir, file_name)

    # Check if the file already exists
    if os.path.exists(file_path):
        print(f"File already exists: {file_name}")
        return True

    os.makedirs(download_dir, exist_ok=True)
    partial_path = file_path + ".part"

    try:
        with requests.get(ftp_link, stream=True, timeout=60) as file_response:
            # Do not store an error page under the FASTQ file name
            if file_response.status_code != 200:
                print(f"Failed to download file: {ftp_link} (HTTP {file_response.status_code})")
                return False

            with open(partial_path, 'wb') as f:
                for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)

        # Publish the file under its final name only once it is complete
        os.replace(partial_path, file_path)

    finally:
        # Left over only when the transfer was interrupted
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded: {file_name}")
    return True

In [12]:
# Upper bound, in MB, on the size of each file that will be fetched
MAX_FILE_SIZE = 500

# Try every collected link in turn
for ftp_link in ftp_links:
    check_and_download_file(ftp_link, MAX_FILE_SIZE, DOWNLOAD_DIR)

Downloaded: ERR5885024_1.fastq.gz
Downloaded: ERR5885024_2.fastq.gz
Downloaded: ERR5960498_1.fastq.gz
Downloaded: ERR5960498_2.fastq.gz
Downloaded: ERR5960500_1.fastq.gz
Downloaded: ERR5960500_2.fastq.gz
Downloaded: ERR6053338_1.fastq.gz
Downloaded: ERR6053338_2.fastq.gz
Downloaded: ERR5885023_1.fastq.gz
Downloaded: ERR5885023_2.fastq.gz
Downloaded: ERR5885025_1.fastq.gz
Downloaded: ERR5885025_2.fastq.gz
Downloaded: ERR5885028_1.fastq.gz
Downloaded: ERR5885028_2.fastq.gz
Downloaded: ERR5885029_1.fastq.gz
Downloaded: ERR5885029_2.fastq.gz
Downloaded: ERR5885030_1.fastq.gz
Downloaded: ERR5885030_2.fastq.gz
Downloaded: ERR5885032_1.fastq.gz
Downloaded: ERR5885032_2.fastq.gz
Downloaded: ERR5960499_1.fastq.gz
Downloaded: ERR5960499_2.fastq.gz
Downloaded: ERR6053339_1.fastq.gz
Downloaded: ERR6053339_2.fastq.gz
Downloaded: ERR6608942_1.fastq.gz
Downloaded: ERR6608942_2.fastq.gz
Downloaded: ERR6608944_1.fastq.gz
Downloaded: ERR6608944_2.fastq.gz
Downloaded: ERR6608943_1.fastq.gz
Downloaded: ER

In [None]:
# Folder scanned for '.fastq.gz' archives, and folder that receives the
# decompressed files.
# NOTE(review): no visible cell writes to "/content/FILEREPORTS_FILES" (the
# earlier cells download into "/content/REPORT_FILES/") — confirm that this is
# the intended input folder.
input_dir = "/content/FILEREPORTS_FILES"
output_dir = "/content/FASTQ_FILES"

def decompress_fastq_gz(input_file, output_file):
    """Decompress one gzip file into ``output_file``.

    The data is streamed in chunks, so the decompressed content is never held
    in memory as a whole.

    Args:
        input_file: Path of the gzip-compressed file.
        output_file: Path of the file to write; overwritten if it exists.
    """
    with gzip.open(input_file, 'rb') as f_in, open(output_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Make sure the destination folder is present before writing into it
os.makedirs(output_dir, exist_ok=True)

# Decompress every '.fastq.gz' archive found in the input folder
for file_name in os.listdir(input_dir):
    if not file_name.endswith('.fastq.gz'):
        continue

    input_file_path = os.path.join(input_dir, file_name)

    # Output keeps the same name minus the trailing '.gz'
    output_file_path = os.path.join(output_dir, file_name[:-len('.gz')])

    decompress_fastq_gz(input_file_path, output_file_path)

    print(f"Decompressed: {file_name}")

Decompressed: ERR5885029_2.fastq.gz


In [13]:
# Clear the folder from previous downloads (optional)
def clear_folder(folder_path):
    """Delete everything inside ``folder_path``; the folder itself is kept.

    Files and symbolic links are removed with ``os.remove``; sub-directories
    are removed together with their content.

    Args:
        folder_path: Directory to empty. It must exist.
    """
    # List once so the reported count matches what is actually removed
    entries = os.listdir(folder_path)
    files_in_folder = len(entries)
    print(f"{files_in_folder} files found.")

    for entry in entries:
        entry_path = os.path.join(folder_path, entry)

        # os.remove cannot delete a directory; a link to one is removed as a link
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

    print(f"{files_in_folder} removed from {folder_path}.")

In [None]:
#clear_folder('/content/downloaded_files')

46 files found.
46 removed from /content/downloaded_files.


In [None]:
# Define the hyperparameters
# NOTE(review): DOWNLOAD_DIR and SEED are reassigned here; an earlier cell set
# DOWNLOAD_DIR = "/content/REPORT_FILES/", so cells run after this one see the
# new folder.
DOWNLOAD_DIR = "/content/report_files"
NUM_OF_FILES_TO_DOWNLOAD = 50
SEED = 2 # 12,10,6,8,14,42,38 (Notes for the previous downloads)

# Call the function
# NOTE(review): `fastq_links` is not assigned in any visible cell, so this call
# raises NameError on a fresh kernel unless it is defined elsewhere — confirm
# which list of report links is meant (the visible cells build `report_files`
# and `REPORT_LINKS`).
download_reports(fastq_links, DOWNLOAD_DIR, num_files=NUM_OF_FILES_TO_DOWNLOAD, seed=SEED)

Downloaded 1 FASTQ files to /content/report_files.


## Download FASTQ files from FTP links.

The files are compressed (.fastq.gz) and a maximum file size is defined before downloading.

In [None]:
# Define the maximum file size to download (MB)
MAX_FILE_SIZE = 50

# Specify the directory path to download files
download_dir = "/content/fastq_files/"

# Check file size before downloading and download a file from the ftp link
def check_and_download_file(ftp_link, max_file_size=None, download_dir=None):
    """Download one file unless its advertised size exceeds the size limit.

    This definition replaces the function of the same name declared in an
    earlier cell with the parameters ``(ftp_link, max_file_size,
    download_dir)``. Both call styles are accepted here, so re-running the
    earlier three-argument call after this cell keeps working.

    A HEAD request reads the ``Content-Length`` header first. The file is then
    streamed to a temporary ``<name>.part`` file and renamed only after the
    transfer finished, so an interrupted or failed download never leaves a
    file that a later run would report as "already exists".

    Args:
        ftp_link: Link of the file. ``https://`` is prepended when the link
            does not start with ``http://`` or ``https://``.
        max_file_size: Maximum accepted size in MB (1 MB = 1024 * 1024 bytes).
            Defaults to the notebook-level ``MAX_FILE_SIZE`` at call time.
            A response without ``Content-Length`` is treated as size 0 and is
            therefore downloaded.
        download_dir: Directory that receives the file; created if missing.
            Defaults to the notebook-level ``download_dir`` at call time.

    Returns:
        True when the file is present in the download directory after the
        call (downloaded now or already there), False when it was skipped or
        the server did not answer with HTTP 200.
    """
    # Fall back to the notebook-level settings, read at call time
    if max_file_size is None:
        max_file_size = MAX_FILE_SIZE
    if download_dir is None:
        # The parameter shadows the global of the same name, hence globals()
        download_dir = globals()["download_dir"]

    if not ftp_link.startswith(("http://", "https://")):
        ftp_link = "https://" + ftp_link

    # Send a HEAD request to get the file metadata
    response = requests.head(ftp_link, timeout=60)

    if response.status_code != 200:
        print(f"Failed to retrieve metadata for file: {ftp_link}")
        return False

    # Content length in bytes, converted to megabytes
    content_length = int(response.headers.get("Content-Length", 0))
    content_length_mb = content_length / (1024 * 1024)

    if content_length_mb > max_file_size:
        # These links could be collected to build a separate dataset of large files
        print(f"Skipping file: {ftp_link} (Size exceeds maximum file size)")
        return False

    file_name = ftp_link.split('/')[-1]
    file_path = os.path.join(download_dir, file_name)

    # Check if the file already exists
    if os.path.exists(file_path):
        print(f"File already exists: {file_name}")
        return True

    os.makedirs(download_dir, exist_ok=True)
    partial_path = file_path + ".part"

    try:
        with requests.get(ftp_link, stream=True, timeout=60) as file_response:
            # Do not store an error page under the FASTQ file name
            if file_response.status_code != 200:
                print(f"Failed to download file: {ftp_link} (HTTP {file_response.status_code})")
                return False

            with open(partial_path, 'wb') as f:
                for chunk in file_response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)

        # Publish the file under its final name only once it is complete
        os.replace(partial_path, file_path)

    finally:
        # Left over only when the transfer was interrupted
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded: {file_name}")
    return True

In [None]:
!mkdir /content/fastq_files

In [None]:
# Clear the folder from previous downloads (optional)
#clear_folder('/content/fastq_files')

In [None]:
# Define the directory path containing the FTP links
directory_path = "/content/report_files/"

# Define the directory path in which the files will be downloaded
download_dir = "/content/fastq_files/"

# List all files in the directory
all_files = os.listdir(directory_path)

# Filter files that contain the necessary information
info_files = [file for file in all_files if "filereport" in file]

# Iterate through each info file
for info_file in info_files:
    file_path = os.path.join(directory_path, info_file)

    # Read the content of the file
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # A report with only a header line (or no line at all) has no data line
    if len(lines) < 2:
        print(f"No data line in report file: {info_file}")
        continue

    # Take only the first data line; its second column holds the links
    data_line = lines[1]
    columns = data_line.strip().split('\t')

    if len(columns) < 2:
        continue

    # Links are ';'-separated; only the first one is downloaded
    ftp_link = columns[1]
    ftp_link_1 = ftp_link.split(";")[0]

    if not ftp_link_1:
        continue

    # To download every file of the run, loop over ftp_link.split(";") instead
    check_and_download_file(ftp_link_1)

print("Downloaded FASTQ files.")

Downloaded: ERR5885024_1.fastq.gz
Downloaded FASTQ files.


## Decompress .FASTQ.GZ files.

In [None]:
import gzip
import shutil

def decompress_gz_files(directory_path):
    """Decompress every ``.gz`` file of a directory in place.

    Each ``name.gz`` is decompressed to ``name`` in the same directory and the
    archive is deleted afterwards. An archive that cannot be decompressed
    (unreadable, truncated or corrupt) is reported with ``print`` and kept;
    the incomplete output written for it is removed, and processing continues
    with the next archive.

    Args:
        directory_path: Directory that contains the ``.gz`` files.
    """
    # Function-scope import: only needed to name the corrupt-stream error below
    import zlib

    # Filter files to keep only .gz files
    gz_files = [file for file in os.listdir(directory_path) if file.endswith(".gz")]

    for gz_file in gz_files:
        gz_file_path = os.path.join(directory_path, gz_file)

        # Determine the output file name by removing the '.gz' extension
        output_file_path = os.path.splitext(gz_file_path)[0]

        try:
            with gzip.open(gz_file_path, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # OSError covers gzip.BadGzipFile; EOFError is raised for a truncated
        # archive and zlib.error for corrupt compressed data.
        except (OSError, EOFError, zlib.error) as e:
            print(f"Error decompressing {gz_file}: {e}")

            # Do not leave a partial file that looks like a finished result
            if os.path.exists(output_file_path):
                os.remove(output_file_path)

            continue

        # Remove the original .gz file only after a complete decompression
        try:
            os.remove(gz_file_path)

        except OSError as e:
            print(f"Error decompressing {gz_file}: {e}")

    print("Decompression complete.")

In [None]:
# Folder that holds the downloaded '.gz' archives
directory_path = "/content/fastq_files/"

# Decompress every archive found in that folder
decompress_gz_files(directory_path=directory_path)

Decompression complete.


## Save to Drive.

In [None]:
from datetime import datetime

# Folder whose content is archived
source_folder = '/content/fastq_files'

# Zip the folder; the archive is written next to it as '<source_folder>.zip'
shutil.make_archive(source_folder, 'zip', source_folder)
zip_path = source_folder + '.zip'

# Name the archive after the seed and the current date (YYYYMMDD)
timestamp = datetime.now().strftime("%Y%m%d")
seed = str(SEED)
new_filename = f'fastq_files_seed_{seed}_{timestamp}.zip'

# Move the zip file to Google Drive
drive_destination = f'/content/drive/MyDrive/BIOINFORMATICS/THESIS_KECHAGIAS/DATA/DATASET/{new_filename}'
shutil.move(zip_path, drive_destination)

print(f"Folder '{source_folder}' has been zipped and uploaded to Google Drive with seed {seed}.")

Folder '/content/fastq_files' has been zipped and uploaded to Google Drive with seed 2.
