# Preprocesamiento I

## 1. Lectura de archivos

### 1.1 Module 1: read_images.py

Handles reading image names from the folders.

In [None]:
import os

def read_image_names(folders):
    """
    Collect the entries found under ``<folder>/<split>/images`` for every folder.

    Args:
        folders (list): Dataset root folders to scan.

    Returns:
        dict: ``{folder: {'test': set, 'train': set, 'valid': set}}`` where each
              set holds the directory entries (name plus extension) of that split.
              A split whose ``images`` path does not exist keeps an empty set.
    """
    split_names = ('test', 'train', 'valid')
    image_data = {}

    for folder in folders:
        per_split = {}
        for split in split_names:
            names = set()
            images_path = os.path.join(folder, split, 'images')
            # Missing splits are tolerated: they simply stay empty.
            if os.path.exists(images_path):
                names.update(os.listdir(images_path))
            per_split[split] = names
        image_data[folder] = per_split

    return image_data


### 1.2 Module 2: count_images.py

Counts the number of images in each split.

In [None]:
def count_images(image_data):
    """
    Print, folder by folder, how many images each split contains.

    Args:
        image_data (dict): Mapping ``{folder: {split: collection of image names}}``.
    """
    for folder in image_data:
        print(f"\nFolder: {folder}")
        per_split = image_data[folder]
        for split in per_split:
            total = len(per_split[split])
            print(f"  {split.capitalize()}: {total} images")


### 1.3 Module 3: find_repeated.py

Finds image names that are repeated across folders.

In [None]:
def find_repeated_images(image_data):
    """
    Return the image names that occur more than once in ``image_data``.

    Every (folder, split) collection is tallied, so a name counts as repeated
    when it shows up in two folders or in two splits of the same folder.

    Args:
        image_data (dict): Mapping ``{folder: {split: collection of image names}}``.

    Returns:
        set: Names seen in more than one (folder, split) collection.
    """
    occurrences = {}

    for splits in image_data.values():
        for images in splits.values():
            for name in images:
                occurrences[name] = occurrences.get(name, 0) + 1

    return {name for name in occurrences if occurrences[name] > 1}


### 1.4 Module 4: unique_train_images.py

Finds unique image names in the train split for each folder.

In [None]:
def find_unique_train_images(image_data):
    """
    Finds, for each folder, the `train` image names that no other folder has.

    A name is unique to a folder when it appears in that folder's `train`
    split and in no other folder's `train` split.

    Args:
        image_data (dict): Dictionary containing image data organized by folders
            and splits. Every folder entry must have a 'train' key.

    Returns:
        dict: Dictionary with folder names as keys and sets of unique image names as values.

    Raises:
        KeyError: If a folder entry has no 'train' split.
    """
    # Count in how many folders each train image name appears. Converting to
    # a set first guarantees one vote per folder even if a list is passed.
    folder_counts = {}
    for splits in image_data.values():
        for name in set(splits['train']):
            folder_counts[name] = folder_counts.get(name, 0) + 1

    # The previous expression, train - (all_train - train), always reduced to
    # train itself; keep only the names that exactly one folder contains.
    return {
        folder: {name for name in splits['train'] if folder_counts[name] == 1}
        for folder, splits in image_data.items()
    }

### 1.5 Module 5: display_extensions.py

Displays the file extensions of images in the test, train, and valid splits for each folder.

In [None]:
from collections import defaultdict

def display_image_extensions(image_data):
    """
    Collect the lower-cased file extensions present in each split of each folder.

    Args:
        image_data (dict): Mapping ``{folder: {split: collection of image names}}``;
            splits are expected to be 'test', 'train' and 'valid'.

    Returns:
        dict: ``{folder: {'test': set, 'train': set, 'valid': set}}`` with the
              extensions (leading dot included, '' when a name has none).
    """
    extensions = {}

    for folder, splits in image_data.items():
        per_split = {'test': set(), 'train': set(), 'valid': set()}
        for split, images in splits.items():
            # splitext keeps the leading dot; lower() merges ".JPG" with ".jpg".
            per_split[split].update(
                os.path.splitext(name)[1].lower() for name in images
            )
        extensions[folder] = per_split

    return extensions


### 1.6 Module 6: display_dimensions.py

Displays the dimensions of images in the test, train, and valid splits for each folder.

In [None]:
from PIL import Image
import os

def display_unique_dimensions(folders):
    """
    Gather the distinct image sizes found in each split of each folder.

    Every entry under ``<folder>/<split>/images`` is opened with PIL; entries
    that cannot be opened are reported on stdout and skipped.

    Args:
        folders (list): Dataset root folders to scan.

    Returns:
        dict: ``{folder: {'test': set, 'train': set, 'valid': set}}`` where each
              set holds ``(width, height)`` tuples.
    """
    split_names = ('test', 'train', 'valid')
    unique_dimensions = {}

    for folder in folders:
        sizes_by_split = {split: set() for split in split_names}
        unique_dimensions[folder] = sizes_by_split

        for split in split_names:
            images_path = os.path.join(folder, split, 'images')
            if not os.path.exists(images_path):
                continue  # split not present in this dataset

            for entry in os.listdir(images_path):
                image_path = os.path.join(images_path, entry)
                try:
                    with Image.open(image_path) as img:
                        sizes_by_split[split].add(img.size)  # (width, height)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    print(f"Could not read dimensions for {image_path}: {e}")

    return unique_dimensions


### 1.7 Main Script: main.py

Combine all modules and execute the steps.

In [None]:

if __name__ == "__main__":
    # Step 1: dataset root folders to inspect
    folders = [
        'ANPR2.v1i.yolov8',
        'NumberPlates.v1i.yolov8',
        'Peru License Plate.v7i.yolov8',
        'Peru Plate Numbers.v3i.yolov8',
    ]

    # Step 2: collect the image names of every split
    image_data = read_image_names(folders)

    # Step 3: report how many images each split holds
    print("Image counts:")
    count_images(image_data)

    # Step 4: names that appear in more than one place
    repeated_images = find_repeated_images(image_data)
    print(f"\nRepeated images across folders: {len(repeated_images)}")
    print(repeated_images)

    # Step 5: train images per folder as returned by find_unique_train_images
    unique_train_images = find_unique_train_images(image_data)
    print("\nUnique train images per folder:")
    for folder in unique_train_images:
        print(f"  {folder}: {len(unique_train_images[folder])} unique images")

    # Step 6: file extensions found in each split
    print("\nImage extensions:")
    extensions = display_image_extensions(image_data)
    for folder in extensions:
        print(f"Folder: {folder}")
        for split, ext_set in extensions[folder].items():
            print(f"  {split.capitalize()}: {ext_set}")

    # Step 7: distinct (width, height) pairs found in each split
    print("\nUnique image dimensions:")
    unique_dimensions = display_unique_dimensions(folders)
    for folder in unique_dimensions:
        print(f"Folder: {folder}")
        for split, dimensions in unique_dimensions[folder].items():
            print(f"  {split.capitalize()}: {dimensions}")


## 2. Limpieza de imágenes con aumento de datos de los datasets públicos

### 2.1 Dataset "ANPR2"

#### i. Primer filtro de imagenes

In [None]:
import os
import shutil
from collections import defaultdict

In [None]:
# Source images and the folder that receives one image per prefix
image_dir = "ANPR2.v1i.yolov8/train/images"
filtered_dir = "ANPR2.v1i.yolov8/train_filter_v2/images"

# Create the destination folder when it is missing
os.makedirs(filtered_dir, exist_ok=True)

# Group the .jpg files by the text that precedes ".rf."
grouped_files = defaultdict(list)
for filename in os.listdir(image_dir):
    if not filename.endswith(".jpg"):
        continue
    grouped_files[filename.split(".rf.")[0]].append(filename)

# Pick one candidate per prefix: the first name in sorted order.
# NOTE(review): this is a positional guess at which file is the original.
original_images = []
for files in grouped_files.values():
    files.sort()  # in-place so the stored groups stay ordered too
    original_images.append(files[0])

# Copy the selected candidates into the filtered folder
for img in original_images:
    shutil.copy(os.path.join(image_dir, img), os.path.join(filtered_dir, img))

# Report what was copied
print("Original Images copied to filtered directory:")
for img in original_images:
    print(img)


#### ii. Stage 2: Depuración de imágenes no originales

In [None]:
import os
import shutil
import pandas as pd
from collections import defaultdict

# Locations used by this stage
image_dir = "ANPR2.v1i.yolov8/train/images"
invalid_file = "limpieza_dataset1_stage1.csv"
output_dir = "ANPR2.v1i.yolov8/train_stage2_options/images"

# Create the destination folder when it is missing
os.makedirs(output_dir, exist_ok=True)

# Step 1: sorted listing of every file in the train folder
image_files = sorted(os.listdir(image_dir))

# Step 2: prefixes named in the CSV (first column, no header row)
invalid_prefixes = set()
if os.path.exists(invalid_file):
    df_invalid = pd.read_csv(invalid_file, header=None)
    invalid_prefixes.update(name.split(".rf.")[0] for name in df_invalid[0])

# Step 3: group the files that contain ".rf." by the text before it
grouped_images = defaultdict(list)
for image in image_files:
    prefix, marker, _ = image.partition(".rf.")
    if marker:
        grouped_images[prefix].append(image)

# Step 4: for every flagged prefix copy one file of its group for review
for prefix, files in grouped_images.items():
    if prefix not in invalid_prefixes:
        continue

    # Every file in a group shares the prefix, so the whole group is the
    # candidate list; the last name in sorted order is an arbitrary choice.
    # NOTE(review): the file already rejected in the CSV is not excluded here.
    selected_file = files[-1]

    shutil.copy2(
        os.path.join(image_dir, selected_file),
        os.path.join(output_dir, selected_file),
    )

print("Images for stage 2 review copied successfully.")

#### iii. Stage 3: Depuracion

In [None]:
import os
import shutil
import pandas as pd
from collections import defaultdict

# Define paths
image_dir = "ANPR2.v1i.yolov8/train/images"
invalid_file1 = "limpieza_dataset1_stage1.csv"
invalid_file2 = "limpieza_dataset_placas_stage-stage2.csv"
output_dir = "ANPR2.v1i.yolov8/train_stage3_options/images"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Read all image filenames
image_files = sorted(os.listdir(image_dir))
print(f"Total images found: {len(image_files)}")

# Step 2: Load the rejected names from both CSVs (first column, no header row).
# invalid_fullnames keeps the entries exactly as written in the CSVs. It is
# built once here rather than inside the selection loop, so it is always
# defined even when a CSV is missing.
invalid_fullnames = set()

invalid_prefixes_stage1 = set()
if os.path.exists(invalid_file1):
    df_invalid1 = pd.read_csv(invalid_file1, header=None)
    stage1_names = df_invalid1[0].tolist()
    invalid_fullnames.update(stage1_names)
    invalid_prefixes_stage1.update(name.split(".rf.")[0] for name in stage1_names)
    print(f"Invalid prefixes from stage 1: {len(invalid_prefixes_stage1)}")

invalid_prefixes_stage2 = set()
if os.path.exists(invalid_file2):
    df_invalid2 = pd.read_csv(invalid_file2, header=None)
    stage2_names = df_invalid2[0].tolist()
    invalid_fullnames.update(stage2_names)
    invalid_prefixes_stage2.update(name.split(".rf.")[0] for name in stage2_names)
    print(f"Invalid prefixes from stage 2: {len(invalid_prefixes_stage2)}")

# Find prefixes mentioned in both invalid lists
common_invalid_prefixes = invalid_prefixes_stage1.intersection(invalid_prefixes_stage2)
print(f"Common invalid prefixes (intersection): {len(common_invalid_prefixes)}")

# Step 3: Group images by prefix (text before ".rf.")
grouped_images = defaultdict(list)
for image in image_files:
    if ".rf." in image:
        prefix = image.split(".rf.")[0]  # Extract the shared prefix
        grouped_images[prefix].append(image)

print(f"Total prefixes grouped: {len(grouped_images)}")

# Step 4: Select the only remaining option for each prefix.
# Sorted iteration keeps the log output reproducible between runs.
copied_count = 0
for prefix in sorted(common_invalid_prefixes):
    if prefix not in grouped_images:
        print(f"No files grouped for prefix {prefix}, skipping...")
        continue

    files = grouped_images[prefix]
    print(f"\nChecking prefix: {prefix}")
    print(f"Files under prefix {prefix}: {files}")

    # Drop the files already rejected. The CSV entries are compared against
    # the file name without its extension; splitext removes only the real
    # extension, whatever its case, and never a ".jpg" inside the stem.
    remaining_files = [
        f for f in files if os.path.splitext(f)[0] not in invalid_fullnames
    ]

    print(f"Remaining files for prefix {prefix}: {remaining_files}")

    # If only one option remains, copy it to output directory
    if len(remaining_files) == 1:
        selected_file = remaining_files[0]
        print(f"Selected file for prefix {prefix}: {selected_file}")

        src_path = os.path.join(image_dir, selected_file)
        dest_path = os.path.join(output_dir, selected_file)
        shutil.copy2(src_path, dest_path)
        copied_count += 1
    else:
        print(f"Multiple or no options found for prefix {prefix}, skipping...")

print(f"\nImages copied for stage 3 review: {copied_count}")
print("Process completed.")


#### iv. Stage 4: Union

In [None]:
import os
import pandas as pd

# Folders produced by the previous stages
filter_v2_dir = "ANPR2.v1i.yolov8/train_filter_v2/images"
stage2_dir = "ANPR2.v1i.yolov8/train_stage2_options/images"
stage3_dir = "ANPR2.v1i.yolov8/train_stage3_options/images"

# Review CSVs and the file this stage writes
filter_v2_invalid_file = "limpieza_dataset1_stage1.csv"
stage2_invalid_file = "limpieza_dataset_placas_stage-stage2.csv"
output_csv = "original_images.csv"

# Step 1: Helper that extracts the prefix of a file name
def get_prefix(filename):
    """Return the text before the first ".rf." (the whole name if absent)."""
    head, _, _ = filename.partition(".rf.")
    return head

# Step 2: prefixes named in each review CSV (first column, no header row)
filter_v2_invalid_prefixes = {
    get_prefix(name)
    for name in pd.read_csv(filter_v2_invalid_file, header=None)[0]
}
print(f"Invalid prefixes from filter_v2: {len(filter_v2_invalid_prefixes)}")

stage2_invalid_prefixes = {
    get_prefix(name)
    for name in pd.read_csv(stage2_invalid_file, header=None)[0]
}
print(f"Invalid prefixes from stage 2: {len(stage2_invalid_prefixes)}")

# Step 3: filter_v2 images whose prefix was not flagged in the first CSV
filter_v2_images = set(os.listdir(filter_v2_dir))
filter_v2_original = set()
for img in filter_v2_images:
    if get_prefix(img) not in filter_v2_invalid_prefixes:
        filter_v2_original.add(img)
print(f"Original images from filter_v2: {len(filter_v2_original)}")

# Step 4: stage 2 images whose prefix was not flagged in the second CSV
stage2_images = set(os.listdir(stage2_dir))
stage2_original = set()
for img in stage2_images:
    if get_prefix(img) not in stage2_invalid_prefixes:
        stage2_original.add(img)
print(f"Original images from stage 2: {len(stage2_original)}")

# Step 5: stage 3 images whose prefix is not already covered by steps 3 and 4
stage3_images = set(os.listdir(stage3_dir))
already_accounted_prefixes = {
    get_prefix(img) for img in filter_v2_original | stage2_original
}
stage3_original = set()
for img in stage3_images:
    if get_prefix(img) not in already_accounted_prefixes:
        stage3_original.add(img)
print(f"Original images from stage 3: {len(stage3_original)}")

# Step 6: merge the three selections into one sorted list
all_original_images = sorted(filter_v2_original | stage2_original | stage3_original)
print(f"Total original images collected: {len(all_original_images)}")

# Step 7: write the list as a one-column CSV
df_original = pd.DataFrame(all_original_images, columns=["original_images"])
df_original.to_csv(output_csv, index=False)
print(f"Original images saved to {output_csv}")


In [None]:
#copy paste
import os
import shutil
import pandas as pd

# Step 1: input list, source folder and destination folder
original_images_file = "original_images.csv"
train_images_dir = "ANPR2.v1i.yolov8/train/images"
output_dir = "ANPR2.v1i.yolov8/train_stage4_options/images"

# Step 2: create the destination folder when it is missing
os.makedirs(output_dir, exist_ok=True)

# Step 3: read the selected file names (column "original_images")
df = pd.read_csv(original_images_file)
original_images = set(df["original_images"])
print(f"Total selected original images: {len(original_images)}")

# Step 4: copy every listed image that exists in the train folder
copied_count = 0
for image in original_images:
    src_path = os.path.join(train_images_dir, image)

    if not os.path.exists(src_path):
        print(f"Image not found: {image}")
        continue

    shutil.copy2(src_path, os.path.join(output_dir, image))
    copied_count += 1

print(f"Total images copied: {copied_count}")
print(f"Images saved to: {output_dir}")


#### v. Eliminación de imágenes de motos y placas obstruidas

In [None]:
import os
import csv

# Folder to clean and the folder whose prefixes decide what is kept
stage6_dir = "ANPR2.v1i.yolov8/train_stage6_options/images"
filter_dir = "ANPR2.v1i.yolov8/train_del/train_filter_v2/images"

def get_prefixes(directory):
    """
    Group the entries of ``directory`` by the text before their first ".rf".

    Entries that do not contain ".rf" are ignored.

    Returns:
        dict: ``{prefix: [filenames sharing that prefix]}``.
    """
    prefix_to_files = {}
    for entry in os.listdir(directory):
        prefix, marker, _ = entry.partition(".rf")
        if not marker:
            continue  # no ".rf" in this name
        if prefix not in prefix_to_files:
            prefix_to_files[prefix] = []
        prefix_to_files[prefix].append(entry)
    return prefix_to_files

# Get prefix mappings from both directories
stage6_prefixes = get_prefixes(stage6_dir)
filter_prefixes = get_prefixes(filter_dir)

# Determine unique prefixes in stage6 that are not in filter_dir
unique_stage6_prefixes = set(stage6_prefixes.keys()) - set(filter_prefixes.keys())

# Save the prefixes to a CSV file BEFORE deleting anything: the removal is
# irreversible, so the record must exist even if a deletion fails midway.
# Rows are sorted so the file is identical between runs.
csv_file = "deleted_prefixes.csv"
with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["prefix"])
    for prefix in sorted(unique_stage6_prefixes):
        writer.writerow([prefix])

# Delete files from stage6 with the unique prefixes
for prefix in sorted(unique_stage6_prefixes):
    for filename in stage6_prefixes[prefix]:
        file_path = os.path.join(stage6_dir, filename)
        os.remove(file_path)

# Print the number of unique prefixes that were deleted
print(f"Deleted {len(unique_stage6_prefixes)} unique prefixes. CSV saved to {csv_file}.")

#### vi. Comparacion de prefijos

In [None]:
import os
from PIL import Image

# Directories to compare
# NOTE(review): stage4_dir points at the train_stage5_options folder.
stage4_dir = "ANPR2.v1i.yolov8/train_stage5_options/images"
#train_dir = "ANPR2.v1i.yolov8/train/images"
train_dir = "ANPR2.v1i.yolov8/train_del/train_filter_v2/images"

def get_unique_prefixes(directory):
    """Return the set of prefixes (text before the first '.rf') in ``directory``.

    Entries that do not contain '.rf' are ignored.
    """
    return {
        entry.split('.rf')[0]
        for entry in os.listdir(directory)
        if '.rf' in entry
    }

def get_unique_extensions(directory):
    """Return the lower-cased extensions (dot included) of the entries in ``directory``.

    An entry without an extension contributes the empty string.
    """
    return {os.path.splitext(entry)[1].lower() for entry in os.listdir(directory)}

def get_unique_dimensions(directory):
    """Return the set of ``img.size`` tuples of the images in ``directory``.

    Entries that PIL cannot open are skipped without any message.
    """
    sizes = set()
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        try:
            with Image.open(candidate) as img:
                sizes.add(img.size)
        except Exception:
            # Best effort: anything that is not a readable image is ignored.
            continue
    return sizes

def _print_sorted(title, values):
    """Print ``title`` followed by each value of ``values`` in sorted order."""
    print(title)
    for value in sorted(values):
        print(value)


# Prefixes present in each directory and those missing from the first one
prefixes_stage4 = get_unique_prefixes(stage4_dir)
prefixes_train = get_unique_prefixes(train_dir)
missing_prefixes = prefixes_train.difference(prefixes_stage4)

print("Count of unique prefixes in stage images directory:", len(prefixes_stage4))
print("Count of unique prefixes in mod images directory:", len(prefixes_train))
_print_sorted("\nPrefixes missing in stage4 images directory:", missing_prefixes)

# Extensions and dimensions found in the stage4 directory
extensions_stage4 = get_unique_extensions(stage4_dir)
dimensions_stage4 = get_unique_dimensions(stage4_dir)

_print_sorted("\nUnique extensions in stage4 images directory:", extensions_stage4)
_print_sorted("\nUnique dimensions in stage4 images directory:", dimensions_stage4)

# Extensions and dimensions found in the train directory
extensions_train = get_unique_extensions(train_dir)
dimensions_train = get_unique_dimensions(train_dir)

_print_sorted("\nUnique extensions in train images directory:", extensions_train)
_print_sorted("\nUnique dimensions in train images directory:", dimensions_train)


### 2.2 Dataset "Peru Plate Numbers"

#### i. Primer filtro de imagenes

In [None]:
import os
import shutil
from collections import defaultdict

# Source images and the folder that receives one image per prefix
source_dir = "Peru Plate Numbers.v3i.yolov8/train/images"
target_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v2/images"

# Create the destination folder when it is missing
os.makedirs(target_dir, exist_ok=True)

# Sorted listing of the source folder
image_files = sorted(os.listdir(source_dir))

# Group the files that contain ".rf." by the text before it
grouped_images = defaultdict(list)
for image in image_files:
    if ".rf." not in image:
        continue
    grouped_images[image.split(".rf.")[0]].append(image)

# Copy one candidate per prefix
for group in grouped_images.values():
    group.sort()  # keep each group ordered

    # The last name in sorted order is assumed to be the original.
    original_image = group[-1]

    shutil.copy2(
        os.path.join(source_dir, original_image),
        os.path.join(target_dir, original_image),
    )

print("Original images copied successfully.")


#### ii. Imprimir numero de imagenes

In [None]:
import os

# Folders whose image counts are compared
source_dir = "Peru Plate Numbers.v3i.yolov8/train/images"
target_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v2/images"

# NOTE(review): this redefines count_images with a different signature than
# the count_images(image_data) helper of section 1.2.
def count_images(directory):
    """Return how many .jpg/.jpeg/.png entries (any case) ``directory`` holds.

    A directory that does not exist counts as 0.
    """
    if not os.path.exists(directory):
        return 0
    image_suffixes = ('.jpg', '.jpeg', '.png')
    return sum(
        1 for entry in os.listdir(directory)
        if entry.lower().endswith(image_suffixes)
    )

# Count the images on both sides
source_count = count_images(source_dir)
target_count = count_images(target_dir)

# Report both totals
print(f"Number of images in source directory ({source_dir}): {source_count}")
print(f"Number of images in target directory ({target_dir}): {target_count}")


#### iii. Stage 3: Depuracion

In [None]:
import os
import csv
import shutil

# Input folders, output folder and CSV files of this stage
train_dir = "Peru Plate Numbers.v3i.yolov8/train/images"
filter_v2_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v2/images"
filter_v3_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v3/images"
csv_input = "limpieza_dataset_placas_stage-dataset2.csv"
csv_output = "dataset2_stage2_remaining.csv"

def get_prefix(filename):
    """Return the text before the first ".rf" (a trailing "_jpg" is kept).

    The whole name is returned when it contains no ".rf".
    """
    head, _, _ = filename.partition(".rf")
    return head

def build_prefix_mapping(directory):
    """Map each prefix to the list of entries in ``directory`` that share it.

    Entries that do not contain ".rf" are ignored.
    """
    mapping = {}
    for entry in os.listdir(directory):
        if ".rf" not in entry:
            continue
        prefix = get_prefix(entry)
        if prefix not in mapping:
            mapping[prefix] = []
        mapping[prefix].append(entry)
    return mapping

# Build mappings for train and filter_v2 directories.
train_mapping = build_prefix_mapping(train_dir)
filter_v2_mapping = build_prefix_mapping(filter_v2_dir)

# Process CSV file and build set of prefixes to review
# (first column of every non-empty row, text before ".rf").
review_prefixes = set()
with open(csv_input, newline="") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row:
            entry = row[0].strip()
            prefix_csv = entry.split(".rf")[0]
            review_prefixes.add(prefix_csv)

# Save the review prefixes to a CSV file.
# Sorted so the file content does not depend on set iteration order.
with open(csv_output, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["prefix"])
    for prefix in sorted(review_prefixes):
        writer.writerow([prefix])
print(f"Saved {len(review_prefixes)} prefixes to {csv_output}")

# Ensure destination folder exists (exist_ok avoids the check-then-create race).
os.makedirs(filter_v3_dir, exist_ok=True)

# For each prefix needing review, select one remaining image from train
# (excluding whatever filter_v2 already holds for that prefix).
selected_count = 0
for prefix in sorted(review_prefixes):
    train_images = train_mapping.get(prefix, [])
    if not train_images:
        print(f"No train images found for prefix: {prefix}")
        continue
    # filter_v2 should hold one image per prefix, but exclude all of them so
    # a stale extra file there can never be offered for review again.
    already_chosen = set(filter_v2_mapping.get(prefix, []))
    remaining = [img for img in train_images if img not in already_chosen]
    if not remaining:
        print(f"No remaining images for prefix: {prefix}")
        continue
    # First name in sorted order among the remaining candidates.
    selected_img = min(remaining)
    src_path = os.path.join(train_dir, selected_img)
    dst_path = os.path.join(filter_v3_dir, selected_img)
    shutil.copy(src_path, dst_path)
    selected_count += 1

print(f"Copied {selected_count} images to {filter_v3_dir}")


#### iv. Stage 4: Depuracion

In [None]:
import os
import csv
import shutil

# Input folders, output folder and review CSV of this stage
train_dir = "Peru Plate Numbers.v3i.yolov8/train/images"
filter_v2_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v2/images"
filter_v3_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v3/images"
filter_v4_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v4/images"
csv_d2stage3 = "limpieza_dataset_placas_stage-d2stage3.csv"

def get_prefix(filename):
    """Return the text before the first ".rf" (the whole name if absent)."""
    return filename.partition(".rf")[0]

def build_prefix_mapping(directory):
    """
    Group the entries of ``directory`` that contain ".rf" by their prefix.

    Returns:
        dict: ``{prefix: [filenames sharing that prefix]}``.
    """
    mapping = {}
    for entry in os.listdir(directory):
        if ".rf" not in entry:
            continue
        key = get_prefix(entry)
        bucket = mapping.get(key)
        if bucket is None:
            bucket = mapping[key] = []
        bucket.append(entry)
    return mapping

# 1. prefix -> list of train filenames
train_mapping = build_prefix_mapping(train_dir)

# prefix -> the filter_v2 filename (the last one listed wins if several share a prefix)
v2_mapping = {
    get_prefix(name): name
    for name in os.listdir(filter_v2_dir)
    if ".rf" in name
}

# prefix -> the filter_v3 filename (same rule)
v3_mapping = {
    get_prefix(name): name
    for name in os.listdir(filter_v3_dir)
    if ".rf" in name
}

# 2. Prefixes to review: first column of every non-empty CSV row
review_prefixes = set()
with open(csv_d2stage3, newline="") as csvfile:
    for row in csv.reader(csvfile):
        if not row:
            continue
        review_prefixes.add(row[0].strip().split(".rf")[0])
print(f"Found {len(review_prefixes)} prefixes to review from CSV.")

# 3. Copy the single candidate left for each prefix into filter_v4
if not os.path.exists(filter_v4_dir):
    os.makedirs(filter_v4_dir)

copied_count = 0
for prefix in review_prefixes:
    candidates = train_mapping.get(prefix, [])
    if not candidates:
        print(f"No train images found for prefix: {prefix}")
        continue

    # Files already offered in filter_v2 / filter_v3 for this prefix
    # (None when a stage has no file for it, which matches no filename).
    already_reviewed = {v2_mapping.get(prefix), v3_mapping.get(prefix)}
    imgs = [img for img in candidates if img not in already_reviewed]

    # Exactly one file should be left; anything else is only reported.
    if len(imgs) != 1:
        print(f"Unexpected number of remaining images for prefix '{prefix}': {imgs}")
        continue

    shutil.copy(
        os.path.join(train_dir, imgs[0]),
        os.path.join(filter_v4_dir, imgs[0]),
    )
    copied_count += 1

print(f"Copied {copied_count} images to {filter_v4_dir}")


#### v. Stage 5: Unir

In [None]:
import os
import csv
import shutil

# Folders produced by the previous stages and the folder this stage fills
filter_v2_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v2/images"
filter_v3_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v3/images"
filter_v4_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v4/images"
filter_v5_dir = "Peru Plate Numbers.v3i.yolov8/train_filter_v5/images"

# Review CSVs
csv_v2 = "limpieza_dataset_placas_stage-dataset2.csv"
csv_v3 = "limpieza_dataset_placas_stage-d2stage3.csv"

def load_csv_set(csv_file):
    """Return the stripped first column of every non-empty row of ``csv_file``.

    Each row is expected to carry one image identifier (a filename without
    extension), e.g. ``20231009_192443_jpg.rf.da5d595c40e613d9878567c3c34f2c1d``.
    """
    with open(csv_file, newline="") as f:
        return {row[0].strip() for row in csv.reader(f) if row}

# Identifiers listed in each review CSV
csv_v2_set = load_csv_set(csv_v2)
csv_v3_set = load_csv_set(csv_v3)

# Create the destination folder when it is missing
os.makedirs(filter_v5_dir, exist_ok=True)

copied_count = 0

# Each source folder is paired with the identifiers to leave out:
#   Subset 1: filter_v2 images not listed in csv_v2
#   Subset 2: filter_v3 images not listed in csv_v3
#   Subset 3: every filter_v4 image (nothing is left out)
subsets = (
    (filter_v2_dir, csv_v2_set),
    (filter_v3_dir, csv_v3_set),
    (filter_v4_dir, frozenset()),
)

for subset_dir, listed in subsets:
    for filename in os.listdir(subset_dir):
        if not filename.lower().endswith(".jpg"):
            continue
        # CSV identifiers are filenames without the extension.
        if os.path.splitext(filename)[0] in listed:
            continue
        shutil.copy(
            os.path.join(subset_dir, filename),
            os.path.join(filter_v5_dir, filename),
        )
        copied_count += 1

print(f"Copied {copied_count} images to {filter_v5_dir}")

#### vi. Eliminar imágenes de motocicletas y placas obstruidas

In [None]:
import os
import csv

# Define directories and CSV output file.
# The paths are built with os.path.join instead of raw strings with
# backslashes, which only resolve on Windows; on Windows the resulting
# strings are unchanged.
filter_v6_dir = os.path.join("Peru Plate Numbers.v3i.yolov8", "train_filter_v6", "images")
del_dir = os.path.join("Peru Plate Numbers.v3i.yolov8", "train_del", "images")
csv_output = "deleted_prefixes_dataset2.csv"

def get_prefix(filename):
    """Return the part of ``filename`` before the first ".rf".

    The whole name is returned when it contains no ".rf".
    """
    return filename.split(".rf")[0]

def collect_prefixes(directory):
    """Return the set of prefixes of the entries in ``directory`` that contain ".rf"."""
    prefixes = set()
    for f in os.listdir(directory):
        if ".rf" in f:
            prefixes.add(get_prefix(f))
    return prefixes

# Prefixes present in each directory
v6_prefixes = collect_prefixes(filter_v6_dir)
del_prefixes = collect_prefixes(del_dir)

# Prefixes found in filter_v6 but absent from del_dir
prefixes_to_delete = v6_prefixes.difference(del_prefixes)

# Record the prefixes (sorted) before removing anything
with open(csv_output, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["prefix"])
    writer.writerows([prefix] for prefix in sorted(prefixes_to_delete))
print(f"Saved {len(prefixes_to_delete)} prefixes to {csv_output}")

# Remove every filter_v6 file whose prefix was recorded
deleted_count = 0
for filename in os.listdir(filter_v6_dir):
    if ".rf" not in filename:
        continue
    if get_prefix(filename) not in prefixes_to_delete:
        continue
    os.remove(os.path.join(filter_v6_dir, filename))
    deleted_count += 1

print(f"Deleted {deleted_count} images from {filter_v6_dir}")
