# Merge train/valid/test to one folder

In [None]:
import os
import shutil
from tqdm import tqdm

sets = ["train", 'valid', 'test']

data_path = r''

merged_images = os.path.join(data_path, "merged", "images")
merged_labels = os.path.join(data_path, "merged", "labels")

os.makedirs(merged_images, exist_ok=True)
os.makedirs(merged_labels, exist_ok=True)

for s in sets:
  img_dir = os.path.join(data_path, s, "images")
  labels_dir = os.path.join(data_path, s, "labels")

  for filename in tqdm(os.listdir(img_dir), desc = f"copynig images {s}"):

    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        src_img = os.path.join(img_dir, filename)
        dst_img = os.path.join(merged_images, filename)
        shutil.copy(src_img, dst_img)

        # Match label file (.txt with same name)
        label_name = os.path.splitext(filename)[0] + ".txt"
        src_lbl = os.path.join(labels_dir, label_name)
        dst_lbl = os.path.join(merged_labels, label_name)

        # Copy label if exists
        if os.path.exists(src_lbl):
            shutil.copy(src_lbl, dst_lbl)

# Handle Images Without Labels Or Vice Versa

In [None]:
import os
import pandas as pd

# Paths
images_folder = r"D:\roboflow\merged-2\images"
labels_folder = r"D:\roboflow\merged-2\labels"

# Files
image_files = {os.path.splitext(f)[0] for f in os.listdir(images_folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))}
label_files = {os.path.splitext(f)[0] for f in os.listdir(labels_folder) if f.lower().endswith('.txt')}

# ‚úÖ Images that do not have a corresponding label
no_label = image_files - label_files

# ‚ö†Ô∏è Labels that do not have a corresponding image
no_image = label_files - image_files

print(f"‚úÖ Total images: {len(image_files)}")
print(f"‚úÖ Total labels: {len(label_files)}")
print(f"‚ö†Ô∏è Images without labels: {len(no_label)}")
print(f"‚ö†Ô∏è Labels without images: {len(no_image)}")

if no_label:
    print("\nüî∏ Example (image with no label):", list(no_label)[:5])
if no_image:
    print("\nüî∏ Example (label with no image):", list(no_image)[:5])

In [None]:
import os
image_folder = r"D:\roboflow\merged-2\images"

no_label_list = list(no_label) # Convert set to list for iteration

for name in no_label_list:
    for ext in ['.jpg', '.jpeg', '.png']:
        img_path = os.path.join(image_folder, name + ext)
        if os.path.exists(img_path):
            os.remove(img_path)
            print(f"üóëÔ∏è Deleted {img_path}")

In [None]:
import os

# üìÇ Paths
images_folder = r"D:\roboflow\merged-2\images"
labels_folder = r"D:\roboflow\merged-2\labels"

# üìã Get filenames without extensions
image_files = {os.path.splitext(f)[0] for f in os.listdir(images_folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))}
label_files = {os.path.splitext(f)[0] for f in os.listdir(labels_folder) if f.lower().endswith('.txt')}

# ‚ö†Ô∏è Identify labels that do not have a corresponding image
no_image = label_files - image_files

print(f"üìÑ Total labels without image: {len(no_image)}")

# üóëÔ∏è Delete them
deleted = 0
for name in no_image:
    label_path = os.path.join(labels_folder, name + ".txt")
    if os.path.exists(label_path):
        try:
            os.remove(label_path)
            deleted += 1
            print(f"üóëÔ∏è Deleted: {label_path}")
        except Exception as e:
            print(f"‚ö†Ô∏è Error deleting {label_path}: {e}")

print(f"\n‚úÖ Done! Deleted {deleted} labels with no matching image.")

# Handle duplicated

In [None]:
import os
import imagehash
from PIL import Image
from tqdm import tqdm
import pandas as pd

# üìÇ Full path to the folder containing the images
image_folder = r"D:\roboflow\merged\images"
# Change the path according to your device

hash_dict = {}

# üîç Loop through all images inside the folder
for filename in tqdm(os.listdir(image_folder)):
    if filename.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff')):
        filepath = os.path.join(image_folder, filename)
        try:
            img = Image.open(filepath).convert('RGB')
            hash_value = str(imagehash.phash(img))  # You can change it to dhash or ahash
            if hash_value in hash_dict:
                hash_dict[hash_value].append(filename)
            else:
                hash_dict[hash_value] = [filename]
        except Exception as e:
            print(f"‚ö†Ô∏è Error in {filename}: {e}")

# üìã Extract duplicate images
duplicates = {h: files for h, files in hash_dict.items() if len(files) > 1}

print(f"‚úÖ Found {len(duplicates)} duplicate groups")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89660/89660 [23:05<00:00, 64.70it/s] 


‚úÖ Found 1948 duplicate groups


In [None]:
# import matplotlib.pyplot as plt
# for i, (h, files) in enumerate(duplicates.items(), 1):
#     print(f"\nüîÅ Group {i} (hash={h}): {files}")
#     imgs = [Image.open(os.path.join(image_folder, f)) for f in files]
    
#   # Display images side by side
#     plt.figure(figsize=(4 * len(imgs), 4))
#     for j, img in enumerate(imgs):
#         plt.subplot(1, len(imgs), j + 1)
#         plt.imshow(img)
#         plt.title(files[j])
#         plt.axis("off")
#     plt.show()

In [None]:
# calculate total duplicate images
total_duplicate_images = sum(len(files) for files in duplicates.values())

print(f"üì∏ Total duplicate images found: {total_duplicate_images}")

üì∏ Total duplicate images found: 4160


# save results to remove dublicated images labels

In [None]:
# Save Results As CSV
rows = []
for h, files in duplicates.items():
    for f in files:
        rows.append({"hash": h, "filename": f})

df = pd.DataFrame(rows)
df.to_csv("duplicates_6.csv", index=False)
print("üíæ Saved results to duplicates.csv")

üíæ Saved results to duplicates.csv


# remove duplicated images

In [7]:
def delete_duplicates(duplicates_dict, image_folder, dry_run = True):

  deleted = []
  for h, file in duplicates_dict.items():

    for f in file[1:]:
      path = os.path.join(image_folder, f)
      if dry_run:
        print(f'[DRY RUN] deleted {path}')
      else:

        try:

          os.remove(path)
          deleted.append(path)

        except Exception as e:
          print(f"Error in deleting{path}:{e}")
  if not dry_run:
    print("the images was deleted")
  else:
    print("No images deleted")

In [10]:
delete_duplicates(duplicates_dict=duplicates, image_folder=image_folder, dry_run=False)

Error in deletingD:\roboflow\merged\images\-80-038-_-_9_-_-_-_jpg.rf.ca42f0556778b500405123e00ff86576.jpg:[WinError 2] The system cannot find the file specified: 'D:\\roboflow\\merged\\images\\-80-038-_-_9_-_-_-_jpg.rf.ca42f0556778b500405123e00ff86576.jpg'
Error in deletingD:\roboflow\merged\images\019_jpg.rf.6dc6604e512cd9e69c1c58dc2f3b28ce.jpg:[WinError 2] The system cannot find the file specified: 'D:\\roboflow\\merged\\images\\019_jpg.rf.6dc6604e512cd9e69c1c58dc2f3b28ce.jpg'
Error in deletingD:\roboflow\merged\images\056_jpg.rf.99712e02c53aaa7f1a06310e7ee451e2.jpg:[WinError 2] The system cannot find the file specified: 'D:\\roboflow\\merged\\images\\056_jpg.rf.99712e02c53aaa7f1a06310e7ee451e2.jpg'
Error in deletingD:\roboflow\merged\images\056_jpg.rf.1aede8e9ba48b912015b96b48a2eadbd.jpg:[WinError 2] The system cannot find the file specified: 'D:\\roboflow\\merged\\images\\056_jpg.rf.1aede8e9ba48b912015b96b48a2eadbd.jpg'
Error in deletingD:\roboflow\merged\images\COCO_train2014_0000

# remove dublicated images labels

In [None]:
import os
import pandas as pd

# üìÇ paths
images_folder = r"D:\roboflow\merged\images"
labels_folder = r"D:\roboflow\merged\labels"
duplicates_csv = r"duplicates_6.csv"  # file generated from the hashing code

# üß† read the duplicates file
dups_df = pd.read_csv(duplicates_csv)

# convert to a Python set for faster lookup
duplicate_files = set(dups_df["filename"].apply(lambda x: os.path.splitext(x)[0]).tolist())

deleted_labels = []
missing_labels = []

# üîÅ loop through each duplicate filename
for base_name in duplicate_files:
    label_path = os.path.join(labels_folder, base_name + ".txt")
    
    if os.path.exists(label_path):
        os.remove(label_path)
        deleted_labels.append(label_path)
    else:
        missing_labels.append(label_path)

print(f"‚úÖ Deleted {len(deleted_labels)} label files")
print(f"‚ö†Ô∏è Missing {len(missing_labels)} labels (not found)")

‚úÖ Deleted 1948 label files
‚ö†Ô∏è Missing 2212 labels (not found)
