### Compare already validated but unused files with newly selected ones and copy-replace

This code is to be used from iteration 3 onwards, as from that point there will be validated clips coming from top-scoring selections that have not yet been used for training. To avoid validating repeated clips, this code reads the files within the `unused_validated_clips` folder and compares them with the clips selected for each of the classes, so it recursively scans all the `class_folders` inside the `segments_validation/uncertainty/` folder. The `class_folders` are located inside this last directory; they can be the terminal folders, in which case they contain the `.WAV` files directly. It can also happen that, for well-performing classes, the `class_folder` contains two other folders named `top_scoring` and `for_training`; in this case, the code should scan *only* the `for_training` folder.

I need two functions. The first extracts the core filename and is already defined as `extract_core_filename`. The second should take files from the validated_files directory and place them into the corresponding folders that contain a clip with the same core name; note that this is not actually a move but a copy-and-replace in the destination folder.

In [6]:
import os
import shutil
import glob
import pandas as pd
from pathlib import Path

In [7]:
# Input locations for this iteration.
# - validated_clips_dir: folder holding the validated clips that were not yet used
# - selected_uncertainty_dir: folder whose sub-directories are the class folders
validated_clips_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/unused_validated_clips/'
selected_uncertainty_dir = '/mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/it_3/uncertainty/not_validated/'

# The tracker CSV sits directly inside the validated-clips folder
tracker_path = os.path.join(
    validated_clips_dir,
    'unused_validated_clips_tracker.csv',
)


In [8]:
def extract_core_filename(filename):
    """Return the comparison key of a clip filename.

    The extension is dropped, the remaining name is split on underscores,
    and fields 1 through 5 (zero-based, so the first field is skipped) are
    joined back together with underscores. Any field after the fifth is
    ignored as well.

    Example: '1.0_84_CH18_SN13_20220805_013000.WAV'
             -> '84_CH18_SN13_20220805_013000'
    """
    stem = Path(filename).stem      # name without directory and extension
    fields = stem.split('_')
    core_fields = fields[1:6]       # slicing tolerates names with fewer fields
    return '_'.join(core_fields)

In [9]:
# Load tracker data and create mapping from core_filename to full filename.
# The tracker CSV must provide the columns 'core_filename' and 'file'.
if not os.path.exists(tracker_path):
    # Stop with a non-zero status and name the missing path. `exit()` is the
    # interactive helper from `site` and, called without arguments, would
    # report success even though nothing was loaded.
    raise SystemExit(f"Tracker file not found: {tracker_path}")

tracker_df = pd.read_csv(tracker_path)

# dict(zip(...)) keeps only the last row for a repeated key, so make that
# visible instead of silently dropping the earlier entries.
duplicate_mask = tracker_df['core_filename'].duplicated(keep=False)
if duplicate_mask.any():
    n_duplicated = tracker_df.loc[duplicate_mask, 'core_filename'].nunique()
    print(
        f"Warning: {n_duplicated} core filename(s) appear more than once in "
        f"the tracker; the last listed file is used for each."
    )

validated_map = dict(zip(tracker_df['core_filename'], tracker_df['file']))

In [10]:
# Replace newly selected clips with their already-validated counterparts.
#
# For every class folder under `selected_uncertainty_dir`, the 'for_training'
# and 'top_scoring' subfolders are searched when they exist; otherwise the
# class folder itself is searched. Each '*.WAV' clip whose core filename is
# present in `validated_map` is deleted and the validated file is moved from
# `validated_clips_dir` into the same folder, keeping the validated file's name.
#
# NOTE(review): the notebook description says that only 'for_training' should
# be scanned when both subfolders exist, and that the operation should be a
# copy rather than a move. The behaviour below (both subfolders, move) is left
# as it was - confirm which one is intended.
subfolders = ['for_training', 'top_scoring']
replaced_count = 0
error_count = 0

for class_folder in os.listdir(selected_uncertainty_dir):
    class_path = os.path.join(selected_uncertainty_dir, class_folder)
    if not os.path.isdir(class_path):
        continue

    search_paths = [
        os.path.join(class_path, subfolder)
        for subfolder in subfolders
        if os.path.isdir(os.path.join(class_path, subfolder))
    ]

    # If no subfolders, search the class folder itself
    if not search_paths:
        search_paths = [class_path]

    for search_path in search_paths:
        # glob.escape keeps folder names containing [, ], * or ? literal.
        selected_clips = glob.glob(os.path.join(glob.escape(search_path), '*.WAV'))

        # Group the clips by their exact core filename. This replaces the
        # former substring glob (f"*{core}*.WAV"), which also matched other
        # clips whose core merely ends with the same text (e.g. core
        # '4_CH18_...' matching clip '84_CH18_...'), and it guarantees every
        # core is handled once per folder, so a validated file that was just
        # moved in can never be deleted by a later iteration.
        clips_by_core = {}
        for selected_clip in selected_clips:
            core_filename = extract_core_filename(Path(selected_clip).name)
            clips_by_core.setdefault(core_filename, []).append(selected_clip)

        for core_filename, existing_files in clips_by_core.items():
            if core_filename not in validated_map:
                continue

            validated_file = validated_map[core_filename]
            src_path = os.path.join(validated_clips_dir, validated_file)

            # Keep original filename (don't just use core_filename)
            dest_path = os.path.join(search_path, Path(validated_file).name)

            # Never delete a selected clip unless its replacement is really
            # available (the tracker may be stale, or the file may already
            # have been moved into another subfolder).
            if not os.path.isfile(src_path):
                print(f"Skipping {core_filename}: validated file not found: {src_path}")
                error_count += 1
                continue

            # BEFORE MOVING: DELETE EXISTING MATCHING FILE(S)
            for existing_file in existing_files:
                try:
                    os.remove(existing_file)
                    print(f"Deleted existing file: {existing_file}")
                except OSError as e:
                    print(f"Error deleting {existing_file}: {e}")
                    error_count += 1

            try:
                # Move the validated file to the destination folder
                shutil.move(src_path, dest_path)
                print(f"Moved and replaced: {src_path} → {dest_path}")
                replaced_count += 1
            except OSError as e:
                print(f"Error moving {src_path} to {dest_path}: {e}")
                error_count += 1

# Only claim success when nothing went wrong.
if error_count == 0:
    print("✅ All matching files have been replaced and moved successfully.")
else:
    print(f"⚠️ Finished with {error_count} problem(s); {replaced_count} file(s) were replaced. See the messages above.")
                

Deleted existing file: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/test_contrast_files/uncertainty/Tropical Screech-Owl/top_scoring/1.0_84_CH18_SN13_20220805_013000.WAV
Moved and replaced: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/test_contrast_files/unused_validated_clips/0.9999_84_CH18_SN13_20220805_013000_M.choliba.WAV → /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/test_contrast_files/uncertainty/Tropical Screech-Owl/top_scoring/0.9999_84_CH18_SN13_20220805_013000_M.choliba.WAV
Deleted existing file: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/test_contrast_files/uncertainty/Tropical Screech-Owl/top_scoring/1.0_42_CH18_SN07_20220729_201000.WAV
Moved and replaced: /mnt/d/retraining_BirdNET_2025/iterative_training/segments_validation/test_contrast_files/unused_validated_clips/0.9999_42_CH18_SN07_20220729_201000_M.choliba.WAV → /mnt/d/retraining_BirdNET_2025/iterative_training/segme