### Long-format training-set tracker DataFrame

We also include a function that restores the training set to its state at any tracked iteration.

In [None]:
import os
import pandas as pd

def update_training_set_tracker(directory, tracker_path, current_iteration):
    """
    Record the current contents of the training directory in the tracker CSV.

    Every ``.wav`` clip found in a class sub-folder of ``directory`` becomes one
    long-format row with the columns ``file`` (path relative to ``directory``),
    ``class`` (sub-folder name), ``iteration`` and ``previous_class``.
    ``previous_class`` is filled in only when the clip was last recorded, in an
    earlier iteration, under a different class.

    - directory: Folder containing one sub-folder of clips per class
    - tracker_path: Path of the tracker CSV (created if it does not exist)
    - current_iteration: Iteration number stored with every row of this run

    Returns None; the result is written to ``tracker_path`` and a short status
    line is printed.

    NOTE: moves are detected by clip filename, so filenames are assumed to be
    unique across class folders.
    """
    columns = ["file", "class", "iteration", "previous_class"]

    # Collect one record per clip; sorting keeps the row order reproducible.
    training_clips = []
    for class_folder in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_folder)
        if not os.path.isdir(class_path):
            continue
        for clip in sorted(os.listdir(class_path)):
            if clip.endswith('.wav'):
                training_clips.append({
                    "file": os.path.join(class_folder, clip),  # Relative path
                    "class": class_folder,
                    "iteration": current_iteration,
                    "previous_class": None,  # Filled in below if the clip moved
                })

    # Explicit columns keep the CSV header valid even when no clips were found.
    new_clips_df = pd.DataFrame(training_clips, columns=columns)

    # First run: the snapshot itself becomes the tracker.
    if not os.path.exists(tracker_path):
        new_clips_df.to_csv(tracker_path, index=False)
        print(f"Tracker initialized with {len(new_clips_df)} clips.")
        return

    tracker_df = pd.read_csv(tracker_path)

    # Map clip filename -> class it had in the most recent EARLIER iteration.
    # The relative path cannot be used as the key because it contains the class
    # folder and therefore changes whenever a clip is moved. Rows of the
    # current iteration are ignored so that re-running an iteration does not
    # erase a move that was already recorded for it.
    last_class_by_name = {}
    if not tracker_df.empty:
        earlier_df = tracker_df[tracker_df["iteration"] < current_iteration]
        earlier_df = earlier_df.sort_values("iteration", kind="stable")
        for rel_path, old_class in zip(earlier_df["file"], earlier_df["class"]):
            last_class_by_name[os.path.basename(str(rel_path))] = old_class

    # Assign the whole column at once: values set on the rows yielded by
    # DataFrame.iterrows() are not written back to the DataFrame.
    previous_classes = []
    for rel_path, new_class in zip(new_clips_df["file"], new_clips_df["class"]):
        old_class = last_class_by_name.get(os.path.basename(rel_path))
        has_moved = pd.notna(old_class) and old_class != new_class
        previous_classes.append(old_class if has_moved else None)
    new_clips_df["previous_class"] = previous_classes

    # Append the new snapshot; re-running an iteration replaces its own rows.
    updated_tracker_df = pd.concat(
        [tracker_df, new_clips_df], ignore_index=True
    ).drop_duplicates(subset=["file", "class", "iteration"], keep="last")

    # Save the updated tracker
    updated_tracker_df.to_csv(tracker_path, index=False)
    print(f"Tracker updated. Total files tracked: {len(updated_tracker_df)}")

# Root folder that holds one sub-folder of training clips per class
training_directory = "/mnt/d/DiscoTrainEval_Backup/retraining_BirdNET/model_train/"

# CSV file in which the tracker history is stored
tracker_csv_path = "/mnt/d/DiscoTrainEval_Backup/retraining_BirdNET/training_set_tracker.csv"

# Iteration number recorded for this run (bump it before each new iteration)
current_iteration = 0

# Record the current state of the training directory in the tracker
update_training_set_tracker(
    directory=training_directory,
    tracker_path=tracker_csv_path,
    current_iteration=current_iteration,
)


To See Class Distribution Per Iteration

In [None]:
import pandas as pd

# Load the tracker CSV (replace the placeholder path with the real location)
tracker_df = pd.read_csv("/path/to/training_set_tracker.csv")

# Count the tracked clips for every (iteration, class) pair
summary = (
    tracker_df
    .groupby(["iteration", "class"])
    .size()
    .reset_index(name="clip_count")
)
print(summary)


To Restore a Training Set from a Specific Iteration:

In [None]:
import pandas as pd
import shutil
import os

def restore_training_set(tracker_path, restore_iteration, restore_dir, source_dir=None):
    """
    Restore the training set as it was recorded at a specific iteration.

    Only the rows of a single snapshot are used: the latest iteration recorded
    in the tracker that is not newer than ``restore_iteration``. Combining all
    earlier iterations would put moved clips into both their old and new class
    and bring back clips that had been removed.

    - tracker_path: Path to the CSV tracking file
    - restore_iteration: Iteration to restore
    - restore_dir: Directory to copy the restored training set into
    - source_dir: Optional folder the tracked ``file`` paths are relative to.
      When omitted, the paths are used exactly as stored in the tracker, i.e.
      they are resolved against the current working directory.

    Raises FileNotFoundError, before anything is copied, if a recorded clip
    cannot be located.

    NOTE: when ``source_dir`` is given and a clip is no longer at its recorded
    location, it is looked up by filename in the other class folders; this
    assumes clip filenames are unique across classes.
    """
    tracker_df = pd.read_csv(tracker_path)

    # Pick the snapshot to restore: the newest iteration <= restore_iteration.
    eligible = tracker_df.loc[tracker_df["iteration"] <= restore_iteration, "iteration"]
    if eligible.empty:
        print(f"No tracker entries at or before iteration {restore_iteration}; nothing restored.")
        return
    restore_df = tracker_df[tracker_df["iteration"] == eligible.max()]

    # Index where each clip currently lives, so clips that were moved to a
    # different class folder after the snapshot can still be found.
    current_locations = {}
    if source_dir is not None:
        for class_folder in sorted(os.listdir(source_dir)):
            class_path = os.path.join(source_dir, class_folder)
            if os.path.isdir(class_path):
                for clip in sorted(os.listdir(class_path)):
                    current_locations.setdefault(clip, os.path.join(class_path, clip))

    # Resolve every source first so a missing clip cannot leave a half-restored set.
    copy_plan = []
    missing = []
    for file_path, class_folder in zip(restore_df["file"], restore_df["class"]):
        file_path = str(file_path)
        source = file_path if source_dir is None else os.path.join(source_dir, file_path)
        if not os.path.isfile(source):
            source = current_locations.get(os.path.basename(file_path))
        if source is None:
            missing.append(file_path)
        else:
            copy_plan.append((source, os.path.join(restore_dir, str(class_folder))))

    if missing:
        preview = ", ".join(missing[:5])
        raise FileNotFoundError(
            f"{len(missing)} tracked clip(s) could not be found, e.g.: {preview}"
        )

    for source, dest_folder in copy_plan:
        os.makedirs(dest_folder, exist_ok=True)
        shutil.copy(source, dest_folder)

    print(f"Training set for iteration {restore_iteration} has been restored in {restore_dir}")

# Example usage: replace the placeholder paths with real locations before running.
restore_training_set(
    tracker_path="/path/to/training_set_tracker.csv",  # tracker CSV to read
    restore_iteration=2,  # iteration number to restore
    restore_dir="/path/to/restored_training_set"  # destination for the copied clips
)
