## Random Sampling

Select 248 random clips in total, i.e. 8 random clips per class across 31 classes. Although I am counting 31 classes so that random samples make up 20% of the clips, I am avoiding clips labeled as Red Junglefowl: there are so many of them that I do not want to label clips from this class. The same applies to other confounding sounds. These classes will only be included in uncertainty sampling.

Global Random Sampling:

Perform random sampling across all remaining predictions without class constraints.
Adjust sampling to exclude clips belonging to any unwanted classes (e.g., Gallus_gallus), if needed in the future.

In [None]:
import pandas as pd
from pathlib import Path
import random
from opensoundscape import Audio

# --- Sampling settings ---
random_clip_count = 248  # Number of random clips to sample
iteration_number = 1  # Current iteration number

# --- Input locations ---
# BirdNET selection table holding the model predictions
annotations_file = '/mnt/d/retraining_BirdNET/iterative_training_2025/output_results_2025/model_0/BirdNET_SelectionTable.txt'
# CSV tracking which files already belong to the training set
tracker_file = '/mnt/d/retraining_BirdNET/model_train_2025/training_set_tracker.csv'
# Folder with the source audio recordings
audio_files_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/input_20%/'

# --- Output location ---
# Root folder where the extracted clips are written (one sub-folder per class)
output_dir = '/mnt/d/retraining_BirdNET/iterative_training_2025/segments_validation_2025/model_0/'

In [None]:
# Load BirdNET predictions.
# Maps the selection-table headers we need to the short names used in the
# rest of the notebook; only these columns are read from the file.
column_names = {
    'Begin Path': 'file_path',
    'File Offset (s)': 'offset',
    'Common Name': 'class',
    'Confidence': 'score',
}
selected_columns = ['Begin Path', 'File Offset (s)', 'Common Name', 'Confidence']

# The selection table is tab-separated
predictions = pd.read_csv(
    annotations_file, sep='\t', usecols=selected_columns
).rename(columns=column_names)

In [None]:
# Construct `offset_filename` using ONLY offset and filename with extension.
# The column is built column-wise instead of with a row-wise
# `DataFrame.apply(axis=1)`: on an empty predictions table row-wise apply
# returns an empty DataFrame rather than a Series, which breaks the column
# assignment, and it is also considerably slower on large tables.
predictions['offset_filename'] = [
    f"{int(offset)}_{Path(file_path).name}"  # Keep original extension
    for offset, file_path in zip(predictions['offset'], predictions['file_path'])
]

In [None]:
# Load training set tracker
def extract_core_filename(filename):
    """Return the first five underscore-separated fields of a file name.

    The extension is removed (via ``Path.stem``) before splitting, so
    ``"a_b_c_d_e_f.wav"`` becomes ``"a_b_c_d_e"``. Names with fewer than
    five fields are returned whole, minus the extension.
    """
    parts = Path(filename).stem.split('_')  # Remove extension and split filename
    return '_'.join(parts[:5])  # Keep only first 5 elements


# NOTE: the helper above is deliberately defined outside the `if` below.
# It is also applied to the predictions table, so it must exist even when
# the tracker file is missing (otherwise that step fails with NameError).
if Path(tracker_file).exists():
    tracker = pd.read_csv(tracker_file)

    # Only consider files added in iteration 1 and onwards.
    # `.copy()` makes the filtered frame independent so that adding a column
    # below does not trigger pandas' SettingWithCopyWarning.
    tracker = tracker[tracker['iteration'] >= 1].copy()

    # Core names of the files that are already part of the training set
    tracker['core_filename'] = tracker['file'].apply(lambda x: extract_core_filename(Path(x).name))
    tracker_filenames = tracker['core_filename'].tolist()
else:
    # No tracker yet: nothing has to be excluded
    tracker_filenames = []

In [None]:
# Derive the comparable "core" name for every prediction, using the same
# helper that is applied to the tracker entries
predictions['core_filename'] = predictions['offset_filename'].map(extract_core_filename)

In [None]:
# Classes that must never be drawn by random sampling
unwanted_classes = ["Gallus gallus_Red Junglefowl"]

# Flag the rows to drop: clips of an unwanted class, and clips whose core
# filename is already present in the training set tracker
is_unwanted_class = predictions["class"].isin(unwanted_classes)
already_in_training_set = predictions["core_filename"].isin(tracker_filenames)

# Keep only rows that are flagged by neither condition
predictions = predictions[~(is_unwanted_class | already_in_training_set)]

# Display result for verification
print(f"Filtered predictions: {len(predictions)} clips remaining")

In [None]:
# Draw the random subset, capped at the number of predictions still
# available; the fixed seed makes the draw repeatable.
sample_size = min(random_clip_count, len(predictions))
random_samples = predictions.sample(sample_size, random_state=42)

output_root = Path(output_dir)
audio_root = Path(audio_files_dir)

# Export one clip per sampled prediction into a folder named after its class
for _, sample in random_samples.iterrows():
    target_dir = output_root / sample["class"]
    target_dir.mkdir(parents=True, exist_ok=True)

    # Clip name layout: <score>_<offset>_<original file name>
    confidence = sample['score']
    start = int(sample['offset'])
    source_name = Path(sample['file_path']).name
    clip_name = f"{confidence}_{start}_{source_name}"

    # The source recording is looked up by file name inside the audio folder
    source_audio = audio_root / source_name
    try:
        # Load the segment starting at the prediction offset (duration=3)
        # with the OpenSoundscape Audio class and write it to disk
        clip = Audio.from_file(source_audio, offset=start, duration=3)
        clip.save(target_dir / clip_name)
    except Exception as e:
        # Best effort: report the failing file and continue with the next clip
        print(f"Error processing {source_audio}: {e}")

print("Random sampling complete. Clips saved.")
