### Count unique source files

In [3]:
import os # import os module which provides a way of interacting with the operating system, 
# allowing me to access and manipulate the file system

#The purpose of this function is to count the number of unique source files 
# in a given directory based on the filenames.

def get_unique_sources(directory):
    unique_sources = set() #This line initializes an empty set called unique_sources. 
    # A set is a collection type in Python that only stores unique items. It automatically handles duplicates.

#loop that iterates over every file in the specified directory
    for filename in os.listdir(directory):
    # os.listdir(directory) function returns a list of all the files and subdirectories in the given directory    
        if filename.endswith(".wav"):
            # Split the filename into parts based on underscores
            parts = filename.split('_')
            
            # Ensure there are enough parts to access the required elements
            if len(parts) > 2:
                # Extract the relevant parts (3rd, 4th, 5th, and 6th)
                source_id = f"{parts[3]}_{parts[4]}_{parts[5]}_{parts[6]}"
                # creates a string source_id that concatenates the parts of the filename that follow the second 
                # underscore. The f"{...}" syntax is called an f-string, which allows you to embed expressions 
                #inside string literals. The variables inside the curly braces {} are evaluated and inserted into the
                # string. Parts 2 here refer two the part with index 2, considering elements start being index with 0.
                # Add the source identifier to the set
                unique_sources.add(source_id)
                #This line adds the source_id to the unique_sources set. Since sets only store unique items, 
                #any duplicate source_id values will automatically be ignored.
            else:
                print(f"Skipping file with unexpected format: {filename}")
                #This line starts an else block, which is executed if the if condition (len(parts) > 2) is not met.
    # Return the number of unique sources
    return len(unique_sources)
    # The len() function returns the number of elements in the unique_sources set, which corresponds to the number of
    # unique source_id values
# Example usage:
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Noise/"
num_unique_sources = get_unique_sources(directory_path)
print(f"Number of different source files: {num_unique_sources}")


IndexError: list index out of range

### Compare filenames to avoid duplicate validation

In [1]:
import os
import shutil

In [2]:
# Define the directories
dir1= '/mnt/e/retraining_BirdNET/iterative_training/segments_validation/1st_model/Barn Owl/'
dir2= '/mnt/e/retraining_BirdNET/iterative_training/segments_validation/2nd_model/Barn Owl/'

In [3]:
def extract_relevant_parts(filename):
    """Extract parts indexed as 1, 2, 3, 4, 5, 6 from the filename."""
    parts = filename.split('_')
    return '_'.join(parts[1:7])

def move_previously_detected_files(dir1, dir2):
    # Create a set to store the relevant parts of filenames from directory 1
    dir1_filenames = set()

    # Loop through all items in dir1
    for filename in os.listdir(dir1):
        filepath = os.path.join(dir1, filename)
        
        # Check if the item is a file (not a directory) and is a .wav file
        if os.path.isfile(filepath) and filename.endswith(".wav"):
            relevant_part = extract_relevant_parts(filename)
            dir1_filenames.add(relevant_part)

                                  
    # Prepare the subdirectory in directory 2 for moving files
    previously_detected_dir = os.path.join(dir2, 'previously_detected')
    if not os.path.exists(previously_detected_dir):
        os.makedirs(previously_detected_dir)

    # Loop through all files in directory 2
    for filename in os.listdir(dir2):
        if filename.endswith(".wav"):
            relevant_part = extract_relevant_parts(filename)
            if relevant_part in dir1_filenames:
                # Move the file to the 'previously_detected' directory
                src_file_path = os.path.join(dir2, filename)
                dest_file_path = os.path.join(previously_detected_dir, filename)
                shutil.move(src_file_path, dest_file_path)

    print(f"Files moved to '{previously_detected_dir}'.")


In [4]:
# Run the function
move_previously_detected_files(dir1, dir2)

Files moved to '/mnt/e/retraining_BirdNET/iterative_training/segments_validation/2nd_model/Barn Owl/previously_detected'.


In [5]:
import os
import shutil

def extract_relevant_parts(filename):
    """Extract parts indexed as 1, 2, 3, 4, 5, 6 from the filename."""
    parts = filename.split('_')
    return '_'.join(parts[1:7])

def move_previously_detected_files(subdir1, subdir2):
    # Create a set to store the relevant parts of filenames from subdir1
    subdir1_filenames = set()

    # Loop through all files in subdir1 (ignoring further subdirectories)
    for filename in os.listdir(subdir1):
        filepath = os.path.join(subdir1, filename)
        if os.path.isfile(filepath) and filename.endswith(".wav"):
            relevant_part = extract_relevant_parts(filename)
            subdir1_filenames.add(relevant_part)

    # Prepare the subdirectory in subdir2 for moving files
    previously_detected_dir = os.path.join(subdir2, 'previously_detected')
    if not os.path.exists(previously_detected_dir):
        os.makedirs(previously_detected_dir)

    # Loop through all files in subdir2 (only .wav files should be here)
    for filename in os.listdir(subdir2):
        filepath = os.path.join(subdir2, filename)
        if os.path.isfile(filepath) and filename.endswith(".wav"):
            relevant_part = extract_relevant_parts(filename)
            if relevant_part in subdir1_filenames:
                # Move the file to the 'previously_detected' directory
                src_file_path = os.path.join(subdir2, filename)
                dest_file_path = os.path.join(previously_detected_dir, filename)
                shutil.move(src_file_path, dest_file_path)

def process_all_subdirectories(base_dir1, base_dir2):
    # Loop through all subdirectories in dir1
    for subdir in os.listdir(base_dir1):
        subdir1 = os.path.join(base_dir1, subdir)
        subdir2 = os.path.join(base_dir2, subdir)
        
        # Check if the corresponding subdirectory exists in dir2 and both are directories
        if os.path.isdir(subdir1) and os.path.isdir(subdir2):
            print(f"Processing folder: {subdir}")
            move_previously_detected_files(subdir1, subdir2)
        else:
            print(f"Skipping {subdir} because it is not a directory or does not exist in both locations.")

# Define the base directories
base_dir1 = r'/mnt/e/retraining_BirdNET/iterative_training/segments_validation/1st_model/'
base_dir2 = r'/mnt/e/retraining_BirdNET/iterative_training/segments_validation/2nd_model/'

# Run the function to process all subdirectories
process_all_subdirectories(base_dir1, base_dir2)


Processing folder: Barn Owl
Processing folder: Black-banded Owl
Processing folder: Black-capped Screech-Owl
Processing folder: Brown Tinamou
Processing folder: Buff-fronted Owl
Processing folder: Burrowing Owl
Processing folder: Common Pauraque
Processing folder: Common Potoo
Processing folder: Ferruginous Pygmy-Owl
Processing folder: Little Nightjar
Processing folder: Long-tailed Potoo
Processing folder: Long-tufted Screech-Owl
Processing folder: Mottled Owl_call
Processing folder: Mottled Owl_song
Processing folder: Ocellated Poorwill
Processing folder: Rufous Nightjar
Processing folder: Rufous-capped Motmot
Processing folder: Rusty-barred Owl_call
Processing folder: Rusty-barred Owl_call1
Processing folder: Rusty-barred Owl_song
Processing folder: Short-tailed Nighthawk
Processing folder: Silky-tailed Nightjar
Processing folder: Spot-winged Wood-Quail
Processing folder: Striped Owl_call
Processing folder: Striped Owl_song
Processing folder: Stygian Owl_call
Processing folder: Stygia