<a href="https://colab.research.google.com/github/atulchander/Concatenate_RNA/blob/main/Concatenate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that files stored under it
# (e.g. /content/drive/MyDrive/...) can be accessed by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Remove duplicate files that may have been downloaded by mistake

In [None]:
import filecmp
import glob
import os
import re
import shutil

# Define the folder path where your FASTQ files are located
folder_path = '/content/drive/MyDrive/Google colab/xyz'


def remove_duplicate_fastq_files(folder, dry_run=False):
    """Delete byte-identical duplicate FASTQ files located directly in `folder`.

    Two files are treated as duplicates only when all of the following hold:
      * their names share the same text before the first '.fastq'
        (so 'x.fastq.gz' and 'x.fastq (1).gz' are candidates),
      * they have the same size, and
      * their contents compare equal byte for byte.

    Files are examined shortest-name-first, so the copy that is kept is the
    one with the plain name rather than a '(1)'-style re-download.

    Args:
        folder: Directory to scan (not recursive).
        dry_run: When True, report duplicates without deleting anything.

    Returns:
        A `(kept, removed)` tuple of path lists. `removed` lists the files
        that were deleted (or would be deleted when `dry_run` is True).
    """
    # glob.escape keeps characters such as '[' in the folder name literal.
    pattern = os.path.join(glob.escape(folder), '*.fastq*')
    candidates = sorted(
        (path for path in glob.glob(pattern) if os.path.isfile(path)),
        key=lambda path: (len(os.path.basename(path)), path),
    )

    kept_by_key = {}  # (base name, size) -> paths already kept for that key
    kept, removed = [], []
    for path in candidates:
        # Base name is everything before the first '.fastq'
        base_name = os.path.basename(path).split('.fastq')[0]
        key = (base_name, os.path.getsize(path))
        earlier_copies = kept_by_key.setdefault(key, [])

        # Equal name and size is only a hint; compare the bytes before
        # deleting anything (shallow=False forces a content comparison).
        is_duplicate = any(
            filecmp.cmp(earlier, path, shallow=False) for earlier in earlier_copies
        )
        if is_duplicate:
            action = "would remove" if dry_run else "removing"
            print(f"Duplicate found: {path} ({action})")
            if not dry_run:
                os.remove(path)
            removed.append(path)
        else:
            earlier_copies.append(path)
            kept.append(path)
    return kept, removed


unique_files, removed_files = remove_duplicate_fastq_files(folder_path)

# Every file that was examined, whether it was kept or removed
fastq_files = unique_files + removed_files

print(f"Processed {len(fastq_files)} files. Kept {len(unique_files)} unique files.")

Processed 8 files. Kept 8 unique files.


## Concatenate R1 and R2 RNA-seq reads from different lanes

In [None]:
# Concatenated outputs go into a dedicated subfolder of the input folder
output_folder = os.path.join(folder_path, 'concatenated_files')

# The checkpoint file is stored inside that same subfolder
checkpoint_file = os.path.join(output_folder, 'concatenation_checkpoint.txt')

# Make sure the subfolder exists (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)

# Load the keys recorded by earlier runs
def read_checkpoint():
    """Return the lines of the checkpoint file as a list of keys.

    Returns an empty list when the checkpoint file does not exist.
    """
    if not os.path.exists(checkpoint_file):
        return []
    with open(checkpoint_file, 'r') as handle:
        return handle.read().splitlines()

# Record one finished key so a later run can skip it
def update_checkpoint(key):
    """Append `key`, followed by a newline, to the checkpoint file."""
    line = "".join((key, "\n"))
    with open(checkpoint_file, 'a') as handle:
        handle.write(line)

# Illumina-style file name:
#   <sample>_S<n>_L<lane>_<R1|R2|I1|I2>_<chunk>.fastq[.gz]
# The sample part may itself contain underscores, so the fixed fields are
# anchored at the end of the name instead of being picked by position.
FASTQ_NAME_PATTERN = re.compile(
    r'^(?P<sample>.+)_S\d+_L\d+_(?P<read>[RI]\d)_\d+\.fastq(?:\.gz)?$'
)


def parse_fastq_name(file_name):
    """Split a FASTQ file name into its sample ID and read direction.

    Args:
        file_name: Base name of the file (no directory part).

    Returns:
        A `(sample_id, read_direction)` tuple, e.g. `('XYZ1', 'R1')`, or
        None when the name does not follow the expected pattern.
    """
    match = FASTQ_NAME_PATTERN.match(file_name)
    if match is None:
        return None
    return match.group('sample'), match.group('read')


# Get a list of all the FASTQ files in the folder (sorted for a stable order)
fastq_files = sorted(glob.glob(os.path.join(folder_path, '*.fastq*')))

# Group files by sample ID and read direction (R1 or R2), ignoring lane
file_groups = {}
for file in fastq_files:
    parsed = parse_fastq_name(os.path.basename(file))
    if parsed is None:
        # Leftover copies such as 'x.fastq (1).gz' or unrelated files must not
        # end up in a group, otherwise their reads would be concatenated too.
        print(f"Skipping {file}: name does not match "
              "<sample>_S<n>_L<lane>_<read>_<chunk>.fastq[.gz]")
        continue
    sample_id, read_direction = parsed

    # Create a key to group by sample ID and read direction
    key = f"{sample_id}_{read_direction}"

    # Add file to the corresponding group
    file_groups.setdefault(key, []).append(file)

# Keys that earlier runs already finished, as recorded in the checkpoint file
completed_files = read_checkpoint()

# Report every group before concatenating: the file count, then each file
# (in sorted order) together with its size in MB
for group_key, group_files in file_groups.items():
    print(f"\nSample {group_key}: {len(group_files)} files")
    for path in sorted(group_files):
        size_in_mb = os.path.getsize(path) / (1024 * 1024)  # bytes -> MB
        print(f" - {path} ({size_in_mb:.2f} MB)")

def combined_extension(input_paths):
    """Choose the extension for the combined file from its inputs.

    Returns '.fastq.gz' when every input name ends in '.gz', '.fastq' when
    none does, and None for an empty or mixed list (a mixed list cannot be
    byte-concatenated into one valid file).
    """
    gz_flags = {path.lower().endswith('.gz') for path in input_paths}
    if gz_flags == {True}:
        return '.fastq.gz'
    if gz_flags == {False}:
        return '.fastq'
    return None


def concatenate_files(input_paths, output_path, chunk_size=16 * 1024 * 1024):
    """Append the bytes of each input file, in order, into `output_path`.

    Data is copied in chunks of `chunk_size` bytes, so memory use does not
    grow with file size. The result is written to '<output_path>.part' and
    renamed only after every input has been copied, so an interrupted run
    never leaves a truncated file under the final name.

    Raises:
        OSError: if an input cannot be read or the output cannot be written.
    """
    partial_path = output_path + '.part'
    try:
        with open(partial_path, 'wb') as outfile:
            for input_path in input_paths:
                with open(input_path, 'rb') as infile:
                    shutil.copyfileobj(infile, outfile, chunk_size)
        os.replace(partial_path, output_path)
    except BaseException:
        # Also covers a kernel interrupt: drop the partial file, then re-raise.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise


# Concatenate files in each group and save them in the subfolder
completed_keys = set(completed_files)
skipped_keys = []
for key, files in file_groups.items():
    if key in completed_keys:
        print(f"\nSkipping {key} as it's already concatenated.")
        continue

    ordered_files = sorted(files)  # Sorting to ensure consistent order

    # Byte-level concatenation is only valid when all inputs share one format
    # (gzip files appended to each other still form a valid gzip stream).
    extension = combined_extension(ordered_files)
    if extension is None:
        print(f"\nSkipping {key}: inputs mix compressed (.gz) and uncompressed files.")
        skipped_keys.append(key)
        continue

    # Define output file name in the new subfolder
    output_file = os.path.join(output_folder, f"{key}_combined{extension}")

    concatenate_files(ordered_files, output_file)

    # Update checkpoint only after the output file is complete
    update_checkpoint(key)

    print(f"\nConcatenated {len(files)} files into {output_file}")

if skipped_keys:
    print(f"\nFinished, but {len(skipped_keys)} group(s) were skipped: {', '.join(skipped_keys)}")
else:
    print("\nAll files have been concatenated and stored in:", output_folder)


Sample EPC1_R2: 4 files
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S1_L001_R2_001.fastq.gz (626.91 MB)
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S25_L002_R2_001.fastq.gz (657.66 MB)
 - /content/drive/MyDrive/Google colab/EPC1_S49_L003_R2_001.fastq.gz (593.23 MB)
 - /content/drive/MyDrive/Google colab/EPC1_S73_L004_R2_001.fastq.gz (685.75 MB)

Sample EPC1_R1: 4 files
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S1_L001_R1_001.fastq.gz (668.08 MB)
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S25_L002_R1_001.fastq.gz (700.89 MB)
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S49_L003_R1_001.fastq.gz (636.44 MB)
 - /content/drive/MyDrive/Google colab/EPC1/EPC1_S73_L004_R1_001.fastq.gz (713.80 MB)

Concatenated 4 files into /content/drive/MyDrive/Google colab/EPC1/concatenated_files/EPC1_R2_combined.fastq.gz

Concatenated 4 files into /content/drive/MyDrive/Google colab/EPC1/concatenated_files/EPC1_R1_combined.fastq.gz

All files have been concatenated and stored in: