### Count unique source files

In [79]:
import os # import os module which provides a way of interacting with the operating system, 
# allowing me to access and manipulate the file system

#The purpose of this function is to count the number of unique source files 
# in a given directory based on the filenames.

def get_unique_sources(directory, key_parts=2):
    """Count the distinct source recordings among the .wav files of a directory.

    Each filename is split on underscores and its first ``key_parts`` fields
    are joined back together to form the source identifier. Files that share
    an identifier are counted once.

    Args:
        directory: Path of the directory to scan (its subdirectories are not
            descended into).
        key_parts: Number of leading underscore-separated fields that make up
            the source identifier. Defaults to 2, i.e. "<field0>_<field1>".

    Returns:
        The number of distinct source identifiers found.

    Raises:
        ValueError: If ``key_parts`` is smaller than 1.
        OSError: Whatever ``os.listdir`` raises, e.g. FileNotFoundError when
            ``directory`` does not exist.

    Note:
        The extension is not stripped before splitting, so when the identifier
        reaches the last field of a name it still contains ".wav".
    """
    if key_parts < 1:
        raise ValueError("key_parts must be at least 1")

    # A set keeps one copy of each identifier, so duplicates vanish on insert.
    unique_sources = set()

    # os.listdir returns the names of both files and subdirectories.
    for filename in os.listdir(directory):
        # Case-insensitive extension check, so ".WAV" files are counted too.
        if not filename.lower().endswith('.wav'):
            continue

        parts = filename.split('_')
        if len(parts) < key_parts:
            # Too few fields to build an identifier: report it and move on.
            print(f"Skipping file with unexpected format: {filename}")
            continue

        unique_sources.add('_'.join(parts[:key_parts]))

    return len(unique_sources)
# Example usage: count the distinct sources in one class folder of the
# training set. The path is a hard-coded absolute location; change it to
# point at the folder to inspect.
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Athene cunicularia_Burrowing Owl_call/"
num_unique_sources = get_unique_sources(directory_path)
print(f"Number of different source files: {num_unique_sources}")


Number of different source files: 22


In [11]:
import os

def count_unique_sources(directory):
    """Count distinct source recordings among the .wav files of a directory.

    The source identifier of a file is its name with the last two
    underscore-separated fields removed, e.g. "rec_A_0.0_3.0.wav" -> "rec_A".
    Files that share an identifier are counted once.

    Args:
        directory: Path of the directory to scan (its subdirectories are not
            descended into).

    Returns:
        The number of distinct, non-empty source identifiers found.

    Raises:
        OSError: Whatever ``os.listdir`` raises, e.g. FileNotFoundError when
            ``directory`` does not exist.
    """
    unique_names = set()

    for filename in os.listdir(directory):
        # Case-insensitive extension check, so ".WAV" files are counted too.
        if not filename.lower().endswith('.wav'):
            continue

        # Drop the last two underscore-separated fields; what is left
        # identifies the source.
        unique_part = '_'.join(filename.split('_')[:-2])

        if not unique_part:
            # Names with fewer than three fields leave nothing behind. Without
            # this guard they would all share the key '' and be counted as one
            # bogus source.
            print(f"Skipping file with unexpected format: {filename}")
            continue

        unique_names.add(unique_part)

    return len(unique_names)

# Folder to inspect: a hard-coded absolute path; change it to point at the
# folder of interest.
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Other/"

# Count the distinct sources in that folder and report the total.
unique_count = count_unique_sources(directory_path)
print(f'Number of unique source files: {unique_count}')


Number of unique source files: 96


In [12]:

def count_files_starting_with_xc(directory):
    """Return how many entries of ``directory`` have a name starting with 'XC_'.

    Every name returned by ``os.listdir`` is considered, whatever its
    extension. The prefix comparison is case-sensitive.
    """
    return sum(1 for entry in os.listdir(directory) if entry.startswith('XC_'))

# Folder to inspect: a hard-coded absolute path.
# NOTE: the recorded run of this cell ended in FileNotFoundError because this
# path did not exist; point it at an existing folder before re-running.
directory_path = '/mnt/c/Users/agos-/OneDrive/Escritorio/ESCRITORIO/Doctorado/ANALISIS_DOCTORADO/procesamiento_audios/retraining_BirdNET/model_train/1st_model/Other/'

# Count the entries whose name starts with "XC_" and report the total.
# This cell has no `import os` of its own, so `os` must already have been
# imported by an earlier cell.
xc_count = count_files_starting_with_xc(directory_path)
print(f'Number of files starting with "XC_": {xc_count}')


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Users/agos-/OneDrive/Escritorio/ESCRITORIO/Doctorado/ANALISIS_DOCTORADO/procesamiento_audios/retraining_BirdNET/model_train/1st_model/Other/'

In [80]:
import os

def count_unique_xc_files(directory):
    """Count the distinct second fields among the 'XC_'-prefixed names.

    A name contributes only if it starts with 'XC_' (case-sensitive) and
    splits into more than two underscore-separated fields; the field at
    index 1 is then used as its identifier.

    Returns:
        The number of distinct identifiers collected.
    """
    xc_fields = (
        entry.split('_')
        for entry in os.listdir(directory)
        if entry.startswith('XC_')
    )
    # Names with only two fields (e.g. "XC_123.wav") are ignored.
    xc_ids = {fields[1] for fields in xc_fields if len(fields) > 2}
    return len(xc_ids)

# Folder to inspect: a hard-coded absolute path; change it to point at the
# folder of interest.
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Athene cunicularia_Burrowing Owl_call/"

# Count the distinct identifiers among the "XC_"-prefixed names and report it.
unique_xc_count = count_unique_xc_files(directory_path)
print(f'Number of unique XC files: {unique_xc_count}')


Number of unique XC files: 0


In [36]:
import os

def count_unique_non_xc_files(directory):
    """Count distinct identifiers among the names that do not start with 'XC_'.

    A name contributes only if it splits into more than four
    underscore-separated fields. Its identifier is built from the fields at
    indices 0, 1, 3 and 4; the field at index 2 is not part of it.

    Returns:
        The number of distinct identifiers collected.
    """
    seen = set()

    for entry in os.listdir(directory):
        if entry.startswith('XC_'):
            continue

        fields = entry.split('_')
        if len(fields) <= 4:
            # Not enough fields to read index 4.
            continue

        seen.add('_'.join((fields[0], fields[1], fields[3], fields[4])))

    return len(seen)

# Folder to inspect: a hard-coded absolute path; change it to point at the
# folder of interest.
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Noise/"

# Count the distinct identifiers among the non-"XC_" names and report it.
unique_non_xc_count = count_unique_non_xc_files(directory_path)
print(f'Number of unique non-XC files: {unique_non_xc_count}')


Number of unique non-XC files: 77


In [34]:
import os
from collections import Counter

def count_unique_files(directory):
    """Collect the distinct two-field prefixes of the names in a directory.

    Every name returned by ``os.listdir`` that contains an underscore is
    reduced to its first two underscore-separated fields joined by '_'.
    Names without an underscore are ignored.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A tuple ``(unique_files, total_unique_files)``: the set of distinct
        prefixes and the size of that set.

    Raises:
        OSError: Whatever ``os.listdir`` raises, e.g. FileNotFoundError when
            ``directory`` does not exist.
    """
    # Building the set directly removes duplicates; no separate occurrence
    # count is needed because only the distinct prefixes are returned.
    unique_files = {
        '_'.join(name.split('_')[:2])
        for name in os.listdir(directory)
        if '_' in name
    }

    return unique_files, len(unique_files)

# Example usage: collect the distinct name prefixes found in the folder below
# (a hard-coded absolute path).
directory_path = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/Noise/"# Replace with the path to your directory
unique_files, total_unique_files = count_unique_files(directory_path)

# Print each distinct prefix; unique_files is a set, so the order of the
# printed lines is arbitrary.
for unique_file in unique_files:
    print(unique_file)

# Print how many distinct prefixes were found.
print(f"Total number of unique file names: {total_unique_files}")



20220909_034000
20220919_050000
20220909_210000
20220909_064000
20220909_201000
20220910_204000
20220909_200000
20220909_185000
20220909_180000
20220909_051000
20220909_213000
20220909_050000
20220909_195000
20220715_034000
20220917_011000
20220712_032000
M7_20220713
ARD3_CO17
20220711_202000
20220713_051000
20220909_070000
20220712_003000
20220909_193000
20220909_032000
20220714_032000
20220922_052000
20220714_070000
20220712_000000
20220909_222000
20220909_063000
20220909_015000
M39_SN03
20220909_041000
20220909_040000
20220910_051000
20220910_194000
20220910_060000
20220715_234000
20220910_042000
20220910_043000
20220919_064000
20220910_041000
20220909_024000
ARD3_BO07
20220713_061000
20220716_202000
20220909_182000
20220910_014000
20220910_033000
20220909_205000
20220714_051000
20220910_054000
20220909_035000
M31_SN09
20220909_060000
20220716_170000
20220909_194000
20220910_005000
20220910_193000
M31_SN01
20220910_050000
20220713_054000
20220909_044000
20220909_203000
20220909_0450