In [3]:
#Reading 3 zipped datasets from Google Drive and unzipping them

from google.colab import drive
drive.mount('/content/drive')

import zipfile
import os

# Specify the path to the folder in your Google Drive after adding a shortcut
drive_folder_path = '/content/drive/MyDrive/Recyclo/datasets'


def extract_zip_datasets(source_dir, dest_root='/content'):
    """Extract every ``.zip`` archive that sits directly inside ``source_dir``.

    Each archive ``<name>.zip`` is unpacked into ``<dest_root>/<name>_extracted``.

    Args:
        source_dir: Directory to scan (not recursive). Must exist.
        dest_root: Directory under which the per-archive folders are created.

    Returns:
        A ``(found, extracted)`` tuple. ``found`` lists every ``.zip`` file name
        seen; ``extracted`` lists only the output folders of archives that were
        unpacked successfully, so callers never receive a folder for a broken
        archive.
    """
    found = []
    extracted = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the dataset
    # order (and everything derived from it downstream) reproducible.
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith(".zip"):
            continue
        found.append(filename)

        zip_path = os.path.join(source_dir, filename)
        # Strip only the trailing extension; str.replace would also remove a
        # '.zip' that happens to appear in the middle of the name.
        safe_filename = os.path.splitext(filename)[0]
        extract_path = os.path.join(dest_root, f'{safe_filename}_extracted')

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Create the target only once the archive opened correctly, so
                # an invalid zip does not leave an empty folder behind.
                os.makedirs(extract_path, exist_ok=True)
                zip_ref.extractall(extract_path)
        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file.")
            continue
        except FileNotFoundError:
            print(f"Error: Zip file not found at {zip_path}. This is unexpected after listing the directory.")
            continue

        print(f"Extracted {filename} to {extract_path}")
        extracted.append(extract_path)

    return found, extracted


# Names of the zip files found, and the folders they were extracted to.
# Both stay empty when the Drive folder is missing.
found_zip_files = []
extracted_dataset_names = []

# Check if the directory exists before listing its contents
if not os.path.exists(drive_folder_path):
    print(f"Error: The directory '{drive_folder_path}' was not found.")
    print("Please ensure you have added a shortcut to the shared folder in your 'My Drive' and the path is correct.")
else:
    found_zip_files, extracted_dataset_names = extract_zip_datasets(drive_folder_path)

# Optional: Print the list of found zip files after the loop
print("\nFound the following zip files:")
for zip_name in found_zip_files:
    print(f"- {zip_name}")

print("\nExtracted dataset names:")
for name in extracted_dataset_names:
    print(f"- {name}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted 20250602_aquatrash_yolo.zip to /content/20250602_aquatrash_yolo_extracted
Extracted 20250602z_mju-waste_yolo.zip to /content/20250602z_mju-waste_yolo_extracted
Extracted 20250603_TACO_yolo_1500.zip to /content/20250603_TACO_yolo_1500_extracted

Found the following zip files:
- 20250602_aquatrash_yolo.zip
- 20250602z_mju-waste_yolo.zip
- 20250603_TACO_yolo_1500.zip

Extracted dataset names:
- /content/20250602_aquatrash_yolo_extracted
- /content/20250602z_mju-waste_yolo_extracted
- /content/20250603_TACO_yolo_1500_extracted


In [2]:
import shutil
import os

# Working folders to wipe so extraction and merging can start from a clean slate.
# NOTE(review): these are the folders the extraction cell prints plus the merge
# output folder. In the notebook this cell sits between the extraction cell and
# the merge cell, so a top-to-bottom run would remove the extracted folders
# before the merge reads them -- confirm the intended position/order.
directories_to_delete = [
    '/content/20250602z_mju-waste_yolo_extracted',
    '/content/20250602_aquatrash_yolo_extracted',
    '/content/20250603_TACO_yolo_1500_extracted',
    '/content/merged_yolo_dataset',
] # Replace with your actual list of folder paths

for target in directories_to_delete:
    # Guard clause: nothing to do for folders that are already gone.
    if not os.path.exists(target):
        print(f"Directory not found: {target}")
        continue

    # Remove the whole tree; report (rather than raise) on OS-level failures
    # so the remaining folders are still processed.
    try:
        shutil.rmtree(target)
        print(f"Deleted directory and its contents: {target}")
    except OSError as err:
        print(f"Error deleting directory {target}: {err}")

Deleted directory and its contents: /content/20250602z_mju-waste_yolo_extracted
Deleted directory and its contents: /content/20250602_aquatrash_yolo_extracted
Deleted directory and its contents: /content/20250603_TACO_yolo_1500_extracted
Deleted directory and its contents: /content/merged_yolo_dataset


In [4]:
# %%
import os
import shutil
import yaml # You might need to install PyYAML: pip install PyYAML

# --- Configuration ---
# Paths of the extracted YOLOv8 datasets to merge, processed in list order.
# `extracted_dataset_names` is created by the extraction cell earlier in the
# notebook, so that cell must have been run in the current kernel first
# (otherwise this line raises NameError). This is an alias of the same list
# object, not a copy.
extracted_yolo_dataset_paths = extracted_dataset_names

# Relative image / label folders expected inside every YOLOv8 dataset.
# Both lists are in train, val, test order; the merge loop pairs them by position.
yolo_image_subdirs = [f'images/{split}' for split in ('train', 'val', 'test')]
yolo_label_subdirs = [f'labels/{split}' for split in ('train', 'val', 'test')]

# Root folder that receives the merged YOLOv8 dataset
merged_output_dir = '/content/merged_yolo_dataset'

# Class-name translation applied while merging.
#   key   -> class name as written in a source dataset's data.yaml
#   value -> class name to use in the merged dataset
# The lookup is an exact (case-sensitive) dictionary match; a name that is not
# listed here keeps its original name. Every class below collapses into the
# single class 'trash' ('trash' -> 'trash' is listed explicitly for clarity).
class_name_mapping = {
    original_name: 'trash'
    for original_name in ('Rubbish', 'trash', 'glass', 'metal', 'paper', 'plastic')
}
# To map other classes, add further entries, e.g.:
# class_name_mapping.update({'plastic_bottle': 'bottle', 'glass_bottle': 'bottle'})


# --- Merging Process ---

# Top-level 'images' and 'labels' folders of the merged dataset
merged_images_base, merged_labels_base = (
    os.path.join(merged_output_dir, kind) for kind in ('images', 'labels')
)

# Make sure every split folder exists under both roots (no-op if already there)
for split_root in (merged_images_base, merged_labels_base):
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(split_root, split), exist_ok=True)

# Bookkeeping for the merged class list, filled in while datasets are processed:
#   class_names_set    - merged names seen so far (fast membership test)
#   merged_class_names - merged names in first-seen order (index == merged id)
#   class_name_to_id   - merged name -> merged id
#   next_class_id      - id handed to the next previously unseen merged name
# Classes are matched across datasets by (mapped) name, not by original index.
class_names_set, merged_class_names, class_name_to_id = set(), [], {}
next_class_id = 0

# Records files that were renamed because of a name clash in the destination:
# original image file name -> new image file name
renamed_files_map = {}

print("Starting YOLO dataset merging...")

def load_dataset_class_map(data_yaml_path):
    """Read a dataset's data.yaml and return its classes as ``{class_id: name}``.

    The ``names`` entry may be a list (position = class id) or a mapping of
    class id -> name; both forms are normalised to a dict ordered by class id.
    A missing or empty ``names`` entry (or an empty file) yields an empty dict.

    Raises:
        Whatever ``open`` / ``yaml.safe_load`` raise, plus ``ValueError`` /
        ``AttributeError`` for malformed content; the caller reports these.
    """
    with open(data_yaml_path, 'r') as f:
        dataset_config = yaml.safe_load(f) or {}
    names = dataset_config.get('names') or []
    if isinstance(names, dict):
        # Enumerating a dict would yield its keys (the ids) instead of the
        # names, so the mapping form has to be handled explicitly.
        return {int(class_id): names[class_id] for class_id in sorted(names, key=int)}
    return dict(enumerate(names))


def remap_label_lines(label_lines, id_to_name, name_mapping, merged_name_to_id, label_filename):
    """Rewrite the class id of each YOLO annotation line to its merged class id.

    Args:
        label_lines: Iterable of raw annotation lines (``<class_id> <values...>``).
        id_to_name: Source dataset's ``{class_id: class_name}`` map.
        name_mapping: Source class name -> merged class name (unlisted names are kept).
        merged_name_to_id: Merged class name -> merged class id.
        label_filename: File name used in warning messages only.

    Returns:
        List of rewritten lines without trailing newlines. Blank lines are
        dropped silently; lines whose class cannot be resolved are dropped
        with a printed warning.
    """
    updated_lines = []
    for line in label_lines:
        parts = line.strip().split()
        if not parts:
            continue
        try:
            original_class_id = int(parts[0])
            original_class_name = id_to_name.get(original_class_id)

            if original_class_name is None:
                print(f"    Warning: Original class ID {original_class_id} not found in original class map for {label_filename}. Skipping this annotation line.")
                continue

            merged_class_name = name_mapping.get(original_class_name, original_class_name)
            if merged_class_name not in merged_name_to_id:
                print(f"    Warning: Mapped class name '{merged_class_name}' (original: '{original_class_name}') not found in merged class map for {label_filename}. Skipping this annotation line.")
                continue

            # Swap in the merged id; the remaining fields are left untouched.
            parts[0] = str(merged_name_to_id[merged_class_name])
            updated_lines.append(" ".join(parts))
        except ValueError:
            print(f"    Warning: Invalid class ID format in label file {label_filename} line: '{line.strip()}'. Skipping this line.")
        except Exception as line_e:
            print(f"    Error processing line '{line.strip()}' in {label_filename}: {line_e}. Skipping this line.")
    return updated_lines


for dataset_path in extracted_yolo_dataset_paths:
    print(f"Processing dataset: {dataset_path}")

    # --- Process Class Names ---
    # Load data.yaml from the current dataset to get its class id -> name map
    data_yaml_path = os.path.join(dataset_path, 'data.yaml')
    current_class_names = []
    current_class_id_to_name = {}

    if os.path.exists(data_yaml_path):
        try:
            current_class_id_to_name = load_dataset_class_map(data_yaml_path)
        except Exception as e:
            print(f"  Error loading data.yaml from {data_yaml_path}: {e}. Skipping class processing for this dataset.")
            current_class_id_to_name = {}
        else:
            current_class_names = list(current_class_id_to_name.values())

            # --- Apply Class Name Mapping and Build Merged Class List ---
            for original_class_name in current_class_names:
                # Use the original name when it is not in the mapping
                merged_class_name = class_name_mapping.get(original_class_name, original_class_name)

                if merged_class_name not in class_names_set:
                    class_names_set.add(merged_class_name)
                    merged_class_names.append(merged_class_name)
                    class_name_to_id[merged_class_name] = next_class_id
                    next_class_id += 1
            print(f"  Found {len(current_class_names)} original classes. Merged classes now: {len(merged_class_names)}")

            if current_class_id_to_name:
                print(f"  Original classes found in {os.path.basename(dataset_path)}:")
                for original_id, name in current_class_id_to_name.items():
                    # Also show how they are mapped
                    mapped_name = class_name_mapping.get(name, name)
                    print(f"    - Original ID: {original_id}, Original Name: {name} -> Mapped Name: {mapped_name}")
    else:
        print(f"  Warning: data.yaml not found at {data_yaml_path}. Cannot get class names from this dataset.")

    # --- Copy Images and Labels ---
    # zip() pairs each split with its configured sub-folders and simply stops
    # at the shortest list, so shortening the configured lists (e.g. dropping
    # 'test') skips that split instead of raising IndexError.
    for subdir, image_subdir, label_subdir in zip(['train', 'val', 'test'], yolo_image_subdirs, yolo_label_subdirs):
        original_images_path = os.path.join(dataset_path, image_subdir)
        original_labels_path = os.path.join(dataset_path, label_subdir)
        merged_images_path = os.path.join(merged_images_base, subdir)
        merged_labels_path = os.path.join(merged_labels_base, subdir)

        if not os.path.exists(original_images_path):
            print(f"  Warning: Image directory not found: {original_images_path}. Skipping {subdir} for this dataset.")
            continue
        if not os.path.exists(original_labels_path):
            print(f"  Warning: Label directory not found: {original_labels_path}. Skipping {subdir} for this dataset.")
            continue

        print(f"  Copying {subdir} images and labels...")

        # Sorted so that conflict renames are assigned reproducibly
        # (os.listdir order is arbitrary).
        for filename in sorted(os.listdir(original_images_path)):
            img_name, _ = os.path.splitext(filename)
            label_filename = img_name + '.txt'

            original_image_file = os.path.join(original_images_path, filename)
            original_label_file = os.path.join(original_labels_path, label_filename)

            # Images without a label file are not copied
            if not os.path.exists(original_label_file):
                print(f"    Warning: Label file not found for image {filename} at {original_labels_path}. Skipping this image and its missing label.")
                continue

            # --- Handle File Name Conflicts ---
            new_img_filename = filename
            new_label_filename = label_filename
            merged_dest_image_path = os.path.join(merged_images_path, new_img_filename)
            merged_dest_label_path = os.path.join(merged_labels_path, new_label_filename)

            # Simple conflict check: is the name already taken in the destination?
            # (Name-based only; file contents are not compared.)
            if os.path.exists(merged_dest_image_path) or os.path.exists(merged_dest_label_path):
                # Start from the same counter-based prefix as before, but keep
                # bumping it until BOTH target names are free, so a rename can
                # never overwrite a file that is already in the merged dataset.
                rename_index = len(renamed_files_map)
                while True:
                    prefix = f"{rename_index}_"
                    new_img_filename = prefix + filename
                    new_label_filename = prefix + label_filename
                    merged_dest_image_path = os.path.join(merged_images_path, new_img_filename)
                    merged_dest_label_path = os.path.join(merged_labels_path, new_label_filename)
                    if not (os.path.exists(merged_dest_image_path) or os.path.exists(merged_dest_label_path)):
                        break
                    rename_index += 1

                renamed_files_map[filename] = new_img_filename  # original -> new name
                print(f"    Conflict for {filename}. Renaming to {new_img_filename}")

            # --- Copy Files ---
            try:
                shutil.copy2(original_image_file, merged_dest_image_path)
                shutil.copy2(original_label_file, merged_dest_label_path)

                # --- Update Class Indices in Label File ---
                # Only possible when class names were loaded for this dataset;
                # otherwise the label file is left exactly as copied (its
                # class ids are then NOT translated to merged ids).
                if current_class_id_to_name:
                    with open(merged_dest_label_path, 'r') as f:
                        updated_lines = remap_label_lines(
                            f,
                            current_class_id_to_name,
                            class_name_mapping,
                            class_name_to_id,
                            label_filename,
                        )

                    # Write the updated lines back to the label file
                    with open(merged_dest_label_path, 'w') as f:
                        f.write("\n".join(updated_lines))

            except FileNotFoundError:
                print(f"    Error copying files for {filename}. Image or label file not found unexpectedly.")
            except Exception as e:
                print(f"    Error processing file {filename}: {e}")


print("YOLO dataset merging complete.")

# --- Create Merged data.yaml ---
print("Creating merged data.yaml...")

# Dataset description for the merged dataset. Key order is kept as written
# because the file is dumped with sort_keys=False.
merged_data_yaml = dict(
    path=merged_output_dir,
    train=os.path.join(merged_images_base, 'train'),
    val=os.path.join(merged_images_base, 'val'),
    # test=os.path.join(merged_images_base, 'test'),  # Include if you have test data
    nc=len(merged_class_names),
    names=merged_class_names,
)

# Write the description next to the merged images/labels folders
merged_data_yaml_path = os.path.join(merged_output_dir, 'data.yaml')
with open(merged_data_yaml_path, 'w') as yaml_file:
    yaml.dump(merged_data_yaml, yaml_file, sort_keys=False) # sort_keys=False keeps the key order above

# Summary of what was written, followed by usage hints
summary_lines = (
    f"Merged data.yaml saved to: {merged_data_yaml_path}",
    f"Total classes in merged dataset: {len(merged_class_names)}",
    f"Merged class names: {merged_class_names}",
    # --- Next Steps ---
    "\nNext Steps:",
    f"1. Your merged YOLOv8 dataset is located at: {merged_output_dir}",
    f"2. The data.yaml file is in: {merged_data_yaml_path}",
    "3. You can now use this merged dataset for training with YOLOv8.",
    "   Point your YOLOv8 training command to this data.yaml file.",
)
for summary_line in summary_lines:
    print(summary_line)

Starting YOLO dataset merging...
Processing dataset: /content/20250602_aquatrash_yolo_extracted
  Found 4 original classes. Merged classes now: 1
  Original classes found in 20250602_aquatrash_yolo_extracted:
    - Original ID: 0, Original Name: glass -> Mapped Name: trash
    - Original ID: 1, Original Name: metal -> Mapped Name: trash
    - Original ID: 2, Original Name: paper -> Mapped Name: trash
    - Original ID: 3, Original Name: plastic -> Mapped Name: trash
  Copying train images and labels...
  Copying val images and labels...
Processing dataset: /content/20250602z_mju-waste_yolo_extracted
  Found 1 original classes. Merged classes now: 1
  Original classes found in 20250602z_mju-waste_yolo_extracted:
    - Original ID: 0, Original Name: Rubbish -> Mapped Name: trash
  Copying train images and labels...
  Copying val images and labels...
  Copying test images and labels...
Processing dataset: /content/20250603_TACO_yolo_1500_extracted
  Found 1 original classes. Merged classe