<a href="https://colab.research.google.com/github/astrid12345/recyclo/blob/Combine-datasets/Combined_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#Reading 3 zipped datasets from Google Drive and unzipping them

from google.colab import drive
drive.mount('/content/drive')

import zipfile
import os

# Specify the path to the folder in your Google Drive after adding a shortcut
drive_folder_path = '/content/drive/MyDrive/Recyclo/datasets'


def extract_zip_archives(source_dir, dest_root='/content'):
    """Extract every .zip archive found directly inside `source_dir`.

    Each archive `<stem>.zip` is unpacked into `<dest_root>/<stem>_extracted`.

    Args:
        source_dir: Directory to scan (non-recursively) for zip archives.
        dest_root: Directory under which the per-archive folders are created.

    Returns:
        A `(found, extracted)` tuple. `found` lists the archive file names in
        sorted order; `extracted` lists the destination folders of the
        archives that were unpacked successfully (failed archives are left out).
    """
    found = []
    extracted = []

    # os.listdir() returns entries in arbitrary order; sort for repeatable runs.
    for filename in sorted(os.listdir(source_dir)):
        # splitext() removes only the trailing extension, unlike
        # str.replace('.zip', '') which would also eat '.zip' inside the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.zip':
            continue

        found.append(filename)
        zip_path = os.path.join(source_dir, filename)
        extract_path = os.path.join(dest_root, f'{stem}_extracted')

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Create the target only once the archive has opened cleanly,
                # so invalid archives do not leave empty folders behind.
                os.makedirs(extract_path, exist_ok=True)
                zip_ref.extractall(extract_path)
        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file.")
        except FileNotFoundError:
            print(f"Error: Zip file not found at {zip_path}. This is unexpected after listing the directory.")
        except OSError as e:
            print(f"Error: Could not extract {filename}: {e}")
        else:
            # Record the folder only after a successful extraction so later
            # cells never see a path whose contents are missing.
            extracted.append(extract_path)
            print(f"Extracted {filename} to {extract_path}")

    return found, extracted


# Names of the zip files found, and the folders they were extracted to
found_zip_files = []
extracted_dataset_names = []

# Check that the Drive folder is available before listing its contents
if not os.path.isdir(drive_folder_path):
    print(f"Error: The directory '{drive_folder_path}' was not found.")
    print("Please ensure you have added a shortcut to the shared folder in your 'My Drive' and the path is correct.")
else:
    found_zip_files, extracted_dataset_names = extract_zip_archives(drive_folder_path)

# Report what was found and what was actually extracted
print("\nFound the following zip files:")
for zip_name in found_zip_files:
    print(f"- {zip_name}")

print("\nExtracted dataset names:")
for name in extracted_dataset_names:
    print(f"- {name}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted 20250602z_mju-waste_yolo.zip to /content/20250602z_mju-waste_yolo_extracted
Extracted 20250603_TACO_yolo_1500.zip to /content/20250603_TACO_yolo_1500_extracted
Extracted 20250610_aquatrash_yolo.zip to /content/20250610_aquatrash_yolo_extracted

Found the following zip files:
- 20250602z_mju-waste_yolo.zip
- 20250603_TACO_yolo_1500.zip
- 20250610_aquatrash_yolo.zip

Extracted dataset names:
- /content/20250602z_mju-waste_yolo_extracted
- /content/20250603_TACO_yolo_1500_extracted
- /content/20250610_aquatrash_yolo_extracted


In [6]:
import os
import shutil

# Cleanup is destructive, so it is opt-in: set this to True and run the cell
# to remove the extracted datasets and the merged output.
RUN_CLEANUP = False

# List of directories to delete
directories_to_delete = [
    '/content/20250602z_mju-waste_yolo_extracted',
    '/content/20250610_aquatrash_yolo_extracted',
    '/content/20250603_TACO_yolo_1500_extracted',
    '/content/merged_yolo_dataset'
] # Replace with your actual list of folder paths


def delete_directories(directory_paths):
    """Recursively delete each directory in `directory_paths`.

    Missing directories and deletion errors are reported and skipped rather
    than raised, so one failure does not stop the remaining deletions.

    Args:
        directory_paths: Iterable of directory paths to remove.

    Returns:
        List of the paths that were actually deleted.
    """
    deleted = []
    for directory_path in directory_paths:
        if not os.path.exists(directory_path):
            print(f"Directory not found: {directory_path}")
            continue
        try:
            shutil.rmtree(directory_path)
        except OSError as e:
            print(f"Error deleting directory {directory_path}: {e}")
        else:
            print(f"Deleted directory and its contents: {directory_path}")
            deleted.append(directory_path)
    return deleted


if RUN_CLEANUP:
    delete_directories(directories_to_delete)

Deleted directory and its contents: /content/20250602z_mju-waste_yolo_extracted
Deleted directory and its contents: /content/20250610_aquatrash_yolo_extracted
Deleted directory and its contents: /content/20250603_TACO_yolo_1500_extracted
Deleted directory and its contents: /content/merged_yolo_dataset


In [9]:
"""
- Merges 3 YOLO datasets while preserving the original train, val, and test splits.
- Maps all labels to a single one, "trash".
- Keeps dataset name prefixes in all image and label names.
- Creates a .yaml file for the merged dataset.
- Skips images with no label files.
"""

# %%
import os
import shutil
import yaml
# No need for 'random' if not reshuffling
# import random

# --- Configuration ---

# Where the merged YOLO dataset is written.
merged_output_dir = '/content/merged_yolo_dataset'

# Extracted datasets whose YOLO layout sits at the top of the extraction folder.
extracted_yolo_dataset_paths = [
    '/content/20250602z_mju-waste_yolo_extracted',
    '/content/20250603_TACO_yolo_1500_extracted',
]

# AquaTrash keeps its YOLO layout nested inside the extraction folder.
aquatrash_extracted_base_path = '/content/20250610_aquatrash_yolo_extracted'
aquatrash_yolo_path = os.path.join(
    aquatrash_extracted_base_path,
    'kaggle',
    'working',
    'AquaTrash_yolo',
)

# Original class name -> class name used in the merged dataset.
# Every class listed here is folded into the single 'trash' class.
class_name_mapping = dict.fromkeys(
    ['Rubbish', 'trash', 'glass', 'metal', 'paper', 'plastic'],
    'trash',
)

# --- Merging Process ---

# Top-level image and label folders of the merged dataset
merged_images_base = os.path.join(merged_output_dir, 'images')
merged_labels_base = os.path.join(merged_output_dir, 'labels')

# Make sure every split folder exists under both images/ and labels/
for subdir in ['train', 'val', 'test']:
    for split_parent in (merged_images_base, merged_labels_base):
        os.makedirs(os.path.join(split_parent, subdir), exist_ok=True)

# Accumulators for the merged class list and its name -> ID lookup
class_names_set = set()
combined_class_names = []
class_name_to_id = {}

# Every dataset that takes part in the merge, AquaTrash last
datasets_to_process = [*extracted_yolo_dataset_paths, aquatrash_yolo_path]

# --- Helpers ---

def load_class_id_to_name(data_yaml_path):
    """Read a YOLO data.yaml and return its classes as {class_id: class_name}.

    The `names` entry may be a list (position = class ID) or a mapping of
    class ID -> name; both layouts are accepted. An empty file or a missing
    `names` entry yields an empty dict.

    Args:
        data_yaml_path: Path to the dataset's data.yaml.

    Returns:
        Dict of int class ID -> str class name, or None (after printing a
        warning) when the file is missing or cannot be parsed.
    """
    if not os.path.exists(data_yaml_path):
        print(f"  Warning: data.yaml not found at {data_yaml_path}. Cannot get class names from this dataset.")
        return None
    try:
        with open(data_yaml_path, 'r') as f:
            # An empty YAML file parses to None; treat it as having no keys.
            loaded_yaml = yaml.safe_load(f) or {}
        names = loaded_yaml.get('names') or []
        if isinstance(names, dict):
            return {int(class_id): str(name) for class_id, name in names.items()}
        return {class_id: str(name) for class_id, name in enumerate(names)}
    except Exception as e:
        print(f"  Error loading data.yaml from {data_yaml_path}: {e}. Skipping class processing for this dataset.")
        return None


def resolve_dataset_prefix(dataset_path):
    """Return the filename prefix that identifies a dataset in the merged output."""
    known_prefixes = (
        ('AquaTrash_yolo', 'aquatrash_yolo_'),
        ('mju-waste_yolo', 'mju_waste_yolo_'),
        ('TACO_yolo_1500', 'taco_yolo_1500_'),
    )
    for path_marker, prefix in known_prefixes:
        if path_marker in dataset_path:
            return prefix
    # Fallback prefix if the path doesn't match expected patterns
    return os.path.basename(dataset_path).replace('_extracted', '').replace('-', '_').lower() + '_'


def remap_label_lines(label_path, class_id_to_name, name_mapping, merged_name_to_id, label_display_name):
    """Rewrite the class IDs of one YOLO label file for the merged class map.

    Only 5-field lines (class ID plus four box values) are kept. Lines that
    are malformed, or whose class cannot be resolved, are reported and dropped;
    blank lines are dropped silently.

    Args:
        label_path: Path of the original label file to read.
        class_id_to_name: Class ID -> name map of the source dataset.
        name_mapping: Original class name -> merged class name.
        merged_name_to_id: Merged class name -> merged class ID.
        label_display_name: Name used for this file in warning messages.

    Returns:
        List of rewritten annotation lines (without line terminators).
    """
    remapped_lines = []
    with open(label_path, 'r') as infile:
        for raw_line in infile:
            stripped_line = raw_line.strip()
            parts = stripped_line.split()
            if not parts:
                continue  # blank line: nothing to warn about
            if len(parts) != 5:
                print(f"    Warning: Skipping malformed line in {label_display_name}: '{stripped_line}'")
                continue
            try:
                original_class_id = int(parts[0])
            except ValueError:
                print(f"    Warning: Invalid class ID format in label file {label_display_name} line: '{stripped_line}'. Skipping this line.")
                continue

            original_class_name = class_id_to_name.get(original_class_id)
            if original_class_name is None:
                print(f"    Warning: Original class ID {original_class_id} not found in original class map for {label_display_name}. Skipping annotation line: '{stripped_line}'")
                continue

            mapped_class_name = name_mapping.get(original_class_name, original_class_name)
            new_class_id = merged_name_to_id.get(mapped_class_name)
            if new_class_id is None:
                # Indicates an issue with class mapping or data.yaml loading
                print(f"    Warning: Mapped class name '{mapped_class_name}' (original: '{original_class_name}') not found in combined class map for {label_display_name}. Skipping annotation line: '{stripped_line}'")
                continue

            remapped_lines.append(" ".join([str(new_class_id)] + parts[1:]))
    return remapped_lines


# --- Step 1: Collect and Map Class Names from all datasets ---
# The final class map must exist before any label file can be rewritten.
print("Collecting and mapping class names from all data.yaml files...")

# Each data.yaml is parsed once here and reused in Step 2.
dataset_class_maps = {}

for dataset_path in datasets_to_process:
    print(f"Processing data.yaml for: {dataset_path}")
    loaded_class_map = load_class_id_to_name(os.path.join(dataset_path, 'data.yaml'))
    dataset_class_maps[dataset_path] = loaded_class_map or {}
    if loaded_class_map is None:
        continue

    current_class_names = [loaded_class_map[class_id] for class_id in sorted(loaded_class_map)]
    print(f"  Found original classes in {os.path.basename(dataset_path)}: {current_class_names}")

    # Update combined class names based on mapping
    for original_class_name in current_class_names:
        mapped_class_name = class_name_mapping.get(original_class_name, original_class_name)
        if mapped_class_name not in class_names_set:
            class_names_set.add(mapped_class_name)
            combined_class_names.append(mapped_class_name)

    print(f"  Combined unique mapped classes so far: {combined_class_names}")

# Sort so class IDs do not depend on the order datasets were processed in
combined_class_names.sort()
class_name_to_id = {name: i for i, name in enumerate(combined_class_names)}
print("\nFinal Combined Class Names and IDs:")
print(combined_class_names)
print(class_name_to_id)
print(f"Total combined classes: {len(combined_class_names)}")
if not combined_class_names:
    print("Warning: No class names were collected; every annotation line will be skipped.")


# --- Step 2: Copy and Process Files while preserving splits ---
print("\nCopying and processing files to merged dataset while preserving splits...")

for dataset_path in datasets_to_process:
    print(f"Processing dataset: {dataset_path} for file copying")

    dataset_prefix = resolve_dataset_prefix(dataset_path)
    current_class_id_to_name = dataset_class_maps[dataset_path]
    if not current_class_id_to_name:
        print(f"  Warning: No class names available for {dataset_path}; its annotation lines cannot be remapped and will be skipped.")

    # Iterate through original splits and copy to corresponding merged splits
    for split_subdir in ['train', 'val', 'test']:
        original_images_path = os.path.join(dataset_path, 'images', split_subdir)
        original_labels_path = os.path.join(dataset_path, 'labels', split_subdir)
        merged_split_images_path = os.path.join(merged_images_base, split_subdir)
        merged_split_labels_path = os.path.join(merged_labels_base, split_subdir)

        if not os.path.exists(original_images_path):
            print(f"  Info: Image directory not found for {split_subdir} in {os.path.basename(dataset_path)}: {original_images_path}. Skipping this split.")
            continue
        if not os.path.exists(original_labels_path):
            print(f"  Info: Label directory not found for {split_subdir} in {os.path.basename(dataset_path)}: {original_labels_path}. Skipping this split.")
            continue

        print(f"  Copying {split_subdir} images and labels from {os.path.basename(dataset_path)}...")

        copied_pairs = 0
        # Sorted for a repeatable processing (and log) order
        for filename in sorted(os.listdir(original_images_path)):
            img_name, _ = os.path.splitext(filename)
            label_filename = img_name + '.txt'

            original_image_file = os.path.join(original_images_path, filename)
            original_label_file = os.path.join(original_labels_path, label_filename)

            # Images without a label file are left out of the merged dataset
            if not os.path.exists(original_label_file):
                print(f"    Warning: Label file not found for image {filename} at {original_labels_path}. Skipping this image and its missing label.")
                continue

            # Construct new filenames with dataset prefix
            new_img_filename = f"{dataset_prefix}{filename}"
            new_label_filename = f"{dataset_prefix}{label_filename}"

            merged_dest_image_path = os.path.join(merged_split_images_path, new_img_filename)
            merged_dest_label_path = os.path.join(merged_split_labels_path, new_label_filename)

            try:
                remapped_lines = remap_label_lines(
                    original_label_file,
                    current_class_id_to_name,
                    class_name_mapping,
                    class_name_to_id,
                    new_label_filename,
                )
                # Write the label first and copy the image last: if anything
                # fails, the merged set never holds an image without its label.
                with open(merged_dest_label_path, 'w') as outfile:
                    outfile.write("\n".join(remapped_lines))
                shutil.copy2(original_image_file, merged_dest_image_path)
                copied_pairs += 1
            except Exception as e:
                print(f"    Error processing file {filename}: {e}")

        print(f"    Copied {copied_pairs} image-label pairs for {split_subdir}.")

print("\nYOLO dataset merging complete.")

# --- Create Merged data.yaml ---
print("Creating merged data.yaml...")

# Image folder of each split, relative to the dataset root stored under 'path'
split_image_dirs = {split: os.path.join('images', split) for split in ('train', 'val', 'test')}

merged_data_yaml = {
    'path': merged_output_dir,
    **split_image_dirs,
    'nc': len(combined_class_names),
    'names': combined_class_names,
}

# Write the file next to the merged images/ and labels/ folders.
# sort_keys=False keeps the keys in the order they were defined above.
merged_data_yaml_path = os.path.join(merged_output_dir, 'data.yaml')
with open(merged_data_yaml_path, 'w') as yaml_file:
    yaml.dump(merged_data_yaml, yaml_file, sort_keys=False)

for summary_line in (
    f"Merged data.yaml saved to: {merged_data_yaml_path}",
    f"Total classes in merged dataset: {len(combined_class_names)}",
    f"Combined class names: {combined_class_names}",
):
    print(summary_line)

# --- Next Steps ---
print("\nNext Steps:")
for next_step in (
    f"1. Your merged YOLO dataset is located at: {merged_output_dir}",
    f"2. The data.yaml file is in: {merged_data_yaml_path}",
    "3. This dataset contains images and labels from the original train, val, and test splits, merged into corresponding subdirectories, with original dataset prefixes.",
    "4. You can now use this merged dataset for training with YOLO.",
):
    print(next_step)

Collecting and mapping class names from all data.yaml files...
Processing data.yaml for: /content/20250602z_mju-waste_yolo_extracted
  Found original classes in 20250602z_mju-waste_yolo_extracted: ['Rubbish']
  Combined unique mapped classes so far: ['trash']
Processing data.yaml for: /content/20250603_TACO_yolo_1500_extracted
  Found original classes in 20250603_TACO_yolo_1500_extracted: ['trash']
  Combined unique mapped classes so far: ['trash']
Processing data.yaml for: /content/20250610_aquatrash_yolo_extracted/kaggle/working/AquaTrash_yolo
  Found original classes in AquaTrash_yolo: ['trash']
  Combined unique mapped classes so far: ['trash']

Final Combined Class Names and IDs:
['trash']
{'trash': 0}
Total combined classes: 1

Copying and processing files to merged dataset while preserving splits...
Processing dataset: /content/20250602z_mju-waste_yolo_extracted for file copying
  Copying train images and labels from 20250602z_mju-waste_yolo_extracted...
  Copying val images and

In [10]:
# Summarizing the % of images from all 3 datasets in train, val, test datasets
# %%
import os

# Root of the merged dataset; must match the output directory of the merge step
merged_output_dir = '/content/merged_yolo_dataset'

# Filename prefixes that were given to each source dataset during merging
dataset_prefixes = [
    'mju_waste_yolo_',
    'aquatrash_yolo_',
    'taco_yolo_1500_',
]

# {split: {prefix: image count}}
image_counts_by_dataset = {}
# {split: image count}
total_images_per_split = {}
# Running total across all splits
total_image_label_pairs_collected = 0

print("\nCounting images from each original dataset in train, val, and test splits...")

for split_subdir in ['train', 'val', 'test']:
    split_images_path = os.path.join(merged_output_dir, 'images', split_subdir)

    # Every split starts at zero so it is reported even when it has no images
    counts_for_split = dict.fromkeys(dataset_prefixes, 0)
    image_counts_by_dataset[split_subdir] = counts_for_split
    total_images_per_split[split_subdir] = 0

    if not os.path.exists(split_images_path):
        print(f"  Info: Image directory for {split_subdir} not found at {split_images_path}")
        continue

    print(f"Processing split: {split_subdir}")
    for filename in os.listdir(split_images_path):
        # Only files with a known image extension are counted
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue

        # The first prefix the filename starts with wins
        matched_prefix = next(
            (candidate for candidate in dataset_prefixes if filename.startswith(candidate)),
            None,
        )
        if matched_prefix is None:
            print(f"  Warning: Image '{filename}' in {split_subdir} does not match any known dataset prefix.")
            continue

        counts_for_split[matched_prefix] += 1
        total_images_per_split[split_subdir] += 1
        total_image_label_pairs_collected += 1

print(f"\nTotal image-label pairs collected across all merged splits: {total_image_label_pairs_collected}")


# Share of each source dataset inside every split
print("\n--- Image Counts Summary (per dataset within each split) ---")
for split_name, prefix_counts in image_counts_by_dataset.items():
    print(f"\nSplit: {split_name}")
    total_in_split = sum(prefix_counts.values())
    if total_in_split <= 0:
        print(f"  No images found in the {split_name} split.")
        continue
    for prefix, count in prefix_counts.items():
        # Turn e.g. 'mju_waste_yolo_' into 'mju waste' for display
        display_name = prefix.strip('_').replace('_yolo', '').replace('_', ' ')
        print(f"  - {display_name}: {count} images ({count/total_in_split:.1%})")
    print(f"  Total images in {split_name} split: {total_in_split}")

# Share of each split in the whole merged dataset
print("\n--- Overall Train/Val/Test Split Summary ---")

if total_image_label_pairs_collected > 0:
    for split_name, count in total_images_per_split.items():
        percentage = (count / total_image_label_pairs_collected) * 100
        print(f"  - {split_name}: {count} images ({percentage:.1f}%)")
else:
    print("No images found in the merged dataset to calculate split percentages.")


print("\n--- End of Summary ---")


Counting images from each original dataset in train, val, and test splits...
Processing split: train
Processing split: val
Processing split: test

Total image-label pairs collected across all merged splits: 4336

--- Image Counts Summary (per dataset within each split) ---

Split: train
  - mju waste: 1477 images (49.7%)
  - aquatrash: 295 images (9.9%)
  - taco 1500: 1200 images (40.4%)
  Total images in train split: 2972

Split: val
  - mju waste: 248 images (57.0%)
  - aquatrash: 37 images (8.5%)
  - taco 1500: 150 images (34.5%)
  Total images in val split: 435

Split: test
  - mju waste: 742 images (79.9%)
  - aquatrash: 37 images (4.0%)
  - taco 1500: 150 images (16.1%)
  Total images in test split: 929

--- Overall Train/Val/Test Split Summary ---
  - train: 2972 images (68.5%)
  - val: 435 images (10.0%)
  - test: 929 images (21.4%)

--- End of Summary ---


In [14]:
# Summarizing the total number of pairs in each of 3 datasets
# %%
import os

# Locations of the original extracted YOLO datasets.
# These must match the paths used by the merge step.
original_dataset_paths = [
    '/content/20250602z_mju-waste_yolo_extracted',
    '/content/20250603_TACO_yolo_1500_extracted',
    '/content/20250610_aquatrash_yolo_extracted/kaggle/working/AquaTrash_yolo',  # AquaTrash is nested inside its extraction folder
]

# {dataset folder name: number of image-label pairs}
original_dataset_counts = {}

print("Counting image-label pairs in original datasets...")

for dataset_path in original_dataset_paths:
    dataset_name = os.path.basename(dataset_path)
    print(f"Processing original dataset: {dataset_name}")

    total_pairs_in_dataset = 0
    for split_subdir in ('train', 'val', 'test'):
        original_images_path = os.path.join(dataset_path, 'images', split_subdir)
        original_labels_path = os.path.join(dataset_path, 'labels', split_subdir)

        # A split contributes only when both its image and label folders exist
        if not (os.path.exists(original_images_path) and os.path.exists(original_labels_path)):
            continue

        # A pair is an image file whose same-named .txt label exists
        total_pairs_in_dataset += sum(
            1
            for filename in os.listdir(original_images_path)
            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))
            and os.path.exists(
                os.path.join(original_labels_path, os.path.splitext(filename)[0] + '.txt')
            )
        )

    original_dataset_counts[dataset_name] = total_pairs_in_dataset
    print(f"  Found {total_pairs_in_dataset} image-label pairs in {dataset_name}")


Counting image-label pairs in original datasets...
Processing original dataset: 20250602z_mju-waste_yolo_extracted
  Found 2467 image-label pairs in 20250602z_mju-waste_yolo_extracted
Processing original dataset: 20250603_TACO_yolo_1500_extracted
  Found 1500 image-label pairs in 20250603_TACO_yolo_1500_extracted
Processing original dataset: AquaTrash_yolo
  Found 369 image-label pairs in AquaTrash_yolo
