### Long format training-set tracker df

A function that restores the training set to its state at any earlier iteration is also included below.

In [2]:
import os
import pandas as pd

def update_training_set_tracker(directory, tracker_path, current_iteration):
    """
    Record the current state of the training set in a long-format CSV tracker.

    Clips are identified by file name (basename), so a clip that is moved to a
    different class folder is recognised as the same clip.

    - If no tracker exists at ``tracker_path``, one is created containing every
      clip found under ``directory`` and the function returns.
    - Otherwise a new row is appended for every clip that is new, or whose
      class folder differs from the class in its most recent tracker row.
      For moved clips the 'moved' column holds the previous class. Earlier
      rows are kept so the history is preserved.

    Parameters
    ----------
    directory : str
        Training-set root; each sub-directory is treated as one class.
    tracker_path : str
        Path of the CSV tracker to create or update.
    current_iteration : int
        Iteration number stamped on the rows written by this call.

    Returns
    -------
    None
        The tracker CSV is written to disk and a summary line is printed.
    """
    columns = ["file", "class", "iteration", "moved"]

    # Scan <directory>/<class>/<clip>. Sorting makes the row order deterministic
    # (os.listdir returns entries in arbitrary order).
    current_training_clips = []
    for class_folder in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_folder)
        if not os.path.isdir(class_path):
            continue
        for clip in sorted(os.listdir(class_path)):
            if clip.endswith((".wav", ".WAV")):
                current_training_clips.append({
                    "file": os.path.join(class_folder, clip),  # path relative to `directory`
                    "class": class_folder,
                    "iteration": current_iteration,
                    "moved": None,  # filled in only when a class change is detected
                })

    # Explicit columns keep the CSV schema intact even when no clips were found.
    current_df = pd.DataFrame(current_training_clips, columns=columns)

    # First-time initialization: everything on disk becomes the baseline.
    if not os.path.exists(tracker_path):
        current_df.to_csv(tracker_path, index=False)
        print(f"Tracker initialized with {len(current_df)} clips (Iteration {current_iteration}).")
        return

    tracker_df = pd.read_csv(tracker_path)

    # Most recent class per clip name. The tracker is stored sorted by class,
    # so the "latest" row must be chosen by iteration, not by row position.
    # Exact basename comparison avoids the regex/substring false matches that
    # str.contains() produces (e.g. "clip1.wav" vs "xclip1.wav").
    latest_rows = (
        tracker_df
        .assign(_clip_name=tracker_df["file"].astype(str).map(os.path.basename))
        .sort_values("iteration", kind="stable")
        .drop_duplicates("_clip_name", keep="last")
    )
    last_class_by_name = dict(
        zip(latest_rows["_clip_name"], latest_rows["class"].astype(str))
    )

    # Collect rows for clips that are new or whose class has changed.
    rows_to_append = []
    for record in current_training_clips:
        previous_class = last_class_by_name.get(os.path.basename(record["file"]))
        if previous_class is None:
            rows_to_append.append(record)
        elif previous_class != record["class"]:
            rows_to_append.append({**record, "moved": previous_class})

    if rows_to_append:
        rows_to_append_df = pd.DataFrame(rows_to_append, columns=columns)
        tracker_df = pd.concat([tracker_df, rows_to_append_df], ignore_index=True)

        # Sort by class, then iteration, for readability of the CSV
        tracker_df = tracker_df.sort_values(["class", "iteration"])

    # Save the updated tracker
    tracker_df.to_csv(tracker_path, index=False)
    print(f"Tracker updated. Total files tracked: {len(tracker_df)}")

# Example run: point the tracker at the training-set folder and its CSV log.
training_directory = "/mnt/d/retraining_BirdNET_2025/model_train/train_set/"
tracker_csv_path = "/mnt/d/retraining_BirdNET_2025/model_train/training_set_tracker.csv"
current_iteration = 0  # bump this before each new iteration

update_training_set_tracker(
    directory=training_directory,
    tracker_path=tracker_csv_path,
    current_iteration=current_iteration,
)


Tracker initialized with 4875 clips (Iteration 0).


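To see how many clips were recorded in each class at each iteration (the tracker only adds a row when a clip is new or has changed class, so counts for later iterations are additions, not full snapshots):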

In [None]:
import pandas as pd

# Count tracker rows for every (iteration, class) pair and show the table.
tracker_df = pd.read_csv("/mnt/d/retraining_BirdNET_2025/model_train/training_set_tracker.csv")
summary = (
    tracker_df
    .groupby(["iteration", "class"])
    .size()
    .reset_index(name="clip_count")
)
print(summary)


To Restore a Training Set from a Specific Iteration:

In [None]:
import pandas as pd
import shutil
import os

def restore_training_set(tracker_path, restore_iteration, restore_dir, source_dir=None):
    """
    Rebuild the training set as it was at a given iteration.

    Clips are identified by file name (basename). For every clip that has a
    tracker row at or before ``restore_iteration``, the clip is copied exactly
    once into ``restore_dir/<class>``, where ``<class>`` is the class from its
    latest row within that range. Clips first recorded after
    ``restore_iteration`` are not copied.

    Parameters
    ----------
    tracker_path : str
        Path to the CSV tracking file.
    restore_iteration : int
        Iteration to restore.
    restore_dir : str
        Directory that receives the restored training set (created if needed).
    source_dir : str, optional
        Root directory that the tracker's 'file' paths are relative to.
        When omitted, the paths are used as stored, i.e. resolved against the
        current working directory.

    Raises
    ------
    FileNotFoundError
        If a clip's source file cannot be found.
    """
    tracker_df = pd.read_csv(tracker_path)

    # Order the history by iteration; the stable sort keeps CSV order for ties.
    history = (
        tracker_df
        .assign(_clip_name=tracker_df["file"].astype(str).map(os.path.basename))
        .sort_values("iteration", kind="stable")
    )

    # A moved clip no longer exists at its old path, so always copy from the
    # most recent location recorded in the tracker.
    newest_rows = history.drop_duplicates("_clip_name", keep="last")
    current_path_by_name = dict(zip(newest_rows["_clip_name"], newest_rows["file"]))

    # One row per clip: its state as of the requested iteration. Keeping every
    # row <= restore_iteration would copy moved clips into several classes.
    snapshot = (
        history[history["iteration"] <= restore_iteration]
        .drop_duplicates("_clip_name", keep="last")
    )

    for clip_name, class_folder in zip(snapshot["_clip_name"], snapshot["class"]):
        file_path = current_path_by_name[clip_name]
        if source_dir is not None:
            file_path = os.path.join(source_dir, file_path)
        dest_folder = os.path.join(restore_dir, str(class_folder))
        os.makedirs(dest_folder, exist_ok=True)
        shutil.copy(file_path, dest_folder)

    print(f"Training set for iteration {restore_iteration} has been restored in {restore_dir}")

# Example usage: rebuild the iteration-2 training set in a separate folder.
# The paths below are placeholders.
restore_training_set(
    restore_iteration=2,
    restore_dir="/path/to/restored_training_set",
    tracker_path="/path/to/training_set_tracker.csv",
)
