In [None]:
import glob
import pandas as pd
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src")))
from utils import Utils
from labelUtils import LabelUtils
from reportFunctions import Reports
import matplotlib.pyplot as plt
from overlayFunctions import Overlays
from utils import Utils
import numpy as np
import shutil
from tqdm import tqdm
from pathlib import Path
import zipfile
import os

In [None]:
# Functions
def zip_batch_pred_labels(batch_folder, zip_it=True):
    """Return the batch's ``yolo_labels`` folder path, zipping the folder first if asked.

    Args:
        batch_folder: Directory of a single batch.
        zip_it: When true and the folder exists, it is handed to ``Utils.zip_folder``.

    Returns:
        The path ``<batch_folder>/yolo_labels``; it is returned whether or not
        the folder exists on disk.
    """
    pred_dir = os.path.join(batch_folder, "yolo_labels")
    should_zip = zip_it and os.path.exists(pred_dir)
    if should_zip:
        Utils.zip_folder(pred_dir)
    return pred_dir

def copy_zip_batch_gt_labels(batch_folder, image_list, zip_it=True):
    """Copy the ground-truth label file of every listed image into ``<batch_folder>/gt_labels``.

    The label path is derived from the image path by swapping the image's
    parent folder for a sibling ``labels`` folder and the extension for
    ``.txt`` (``<root>/<any>/<name>.<ext>`` -> ``<root>/labels/<name>.txt``).

    Args:
        batch_folder: Directory of a single batch.
        image_list: Iterable of image paths.
        zip_it: When true, the populated folder is passed to ``Utils.zip_folder``.

    Returns:
        The path of the ``gt_labels`` folder.
    """
    gt_dir = os.path.join(batch_folder, "gt_labels")

    # Derive every label path up front so a malformed image path fails
    # before anything is created on disk.
    label_sources = [
        str(Path(img).parents[1] / "labels" / Path(img).with_suffix('.txt').name)
        for img in image_list
    ]

    os.makedirs(gt_dir, exist_ok=True)
    Utils.copy_files_lst(label_sources, gt_dir)

    if zip_it:
        Utils.zip_folder(gt_dir)
    return gt_dir

def copy_zip_batch_images_labels(batch_folder, zip_it=True, gt=False):
    """Materialise a batch's images (and labels) next to its ``images.txt`` and optionally zip them.

    Steps:
      1. Read ``<batch_folder>/images.txt`` and copy every listed image into
         ``<batch_folder>/images``.
      2. Pick the label folder: the existing ``yolo_labels`` predictions
         (``gt=False``) or a freshly copied ``gt_labels`` folder (``gt=True``).
      3. Print the image/label counts (a mismatch is reported, not fatal,
         because background images may legitimately have no label file).
      4. When ``zip_it`` is true, zip both folders and delete the unzipped
         copies; in GT mode a leftover ``yolo_labels`` folder is deleted too.

    Args:
        batch_folder: Directory of a single batch; must contain ``images.txt``.
        zip_it: Zip the image/label folders and remove the originals.
        gt: Use ground-truth labels copied from the source dataset instead of
            the predicted ``yolo_labels``.

    Raises:
        FileNotFoundError: If ``gt`` is false and ``yolo_labels`` does not exist.
    """
    # 1. Load image list and copy the images
    image_list = Utils.read_list_txt(os.path.join(batch_folder, "images.txt"))
    img_folder = os.path.join(batch_folder, "images")
    os.makedirs(img_folder, exist_ok=True)
    Utils.copy_files_lst(image_list, img_folder)

    # 2. Resolve the label folder
    if gt:
        # Zipping is deferred to step 4 so the folder is archived exactly once
        # (the helper used to zip it and this function zipped it again).
        target_lbl_folder = copy_zip_batch_gt_labels(batch_folder, image_list, zip_it=False)
    else:
        # Use the yolo_labels generated by the previous pipeline step
        target_lbl_folder = os.path.join(batch_folder, "yolo_labels")
        if not os.path.isdir(target_lbl_folder):
            raise FileNotFoundError(
                f"Predicted label folder not found: {target_lbl_folder}"
            )

    # 3. Report counts before anything is zipped/deleted
    img_count = len(os.listdir(img_folder))
    lbl_count = len(os.listdir(target_lbl_folder))
    print(f"Batch {batch_folder}: Images({img_count}) vs Labels({lbl_count})")
    if img_count != lbl_count:
        print("   WARNING: image and label counts differ (background images may have no label file).")

    # 4. Zip and clean up
    if zip_it:
        Utils.zip_folder(img_folder)
        Utils.zip_folder(target_lbl_folder)

        # Only remove the folders once they have been zipped
        shutil.rmtree(img_folder)
        shutil.rmtree(target_lbl_folder)

        # In GT mode a yolo_labels folder may also be present; drop it
        pred_folder = os.path.join(batch_folder, "yolo_labels")
        if gt and os.path.exists(pred_folder):
            shutil.rmtree(pred_folder)
            
def clean_empty_images_from_batch(batch_folder):
    """Drop images that have no annotation in either label archive from a zipped batch.

    An image is kept when a non-empty ``.txt`` file with the same stem exists
    in ``yolo_labels.zip`` or ``gt_labels.zip``. ``images.zip`` is rebuilt with
    only the kept images, and ``images.txt`` / ``labels.txt`` (when present) are
    filtered to the same stems.

    The original ``images.zip`` is only replaced once the new archive exists,
    so a failed re-zip leaves the batch untouched.

    Args:
        batch_folder: Batch directory containing ``yolo_labels.zip``,
            ``gt_labels.zip`` and ``images.zip``. If any is missing the batch
            is skipped.
    """
    pred_zip_path = os.path.join(batch_folder, "yolo_labels.zip")
    gt_zip_path = os.path.join(batch_folder, "gt_labels.zip")
    img_zip_path = os.path.join(batch_folder, "images.zip")

    # All three archives are required
    if not all(os.path.exists(f) for f in [pred_zip_path, gt_zip_path, img_zip_path]):
        print(f"   Skipping {batch_folder}: Missing ZIPs.")
        return

    # 1. Identify "valid" images: stems with a non-empty label in either archive
    valid_stems = set()
    for zip_path in (pred_zip_path, gt_zip_path):
        with zipfile.ZipFile(zip_path, 'r') as z:
            valid_stems.update(
                # Path(...).stem ignores any folder prefix inside the zip
                Path(info.filename).stem
                for info in z.infolist()
                if info.filename.endswith('.txt') and info.file_size > 0
            )

    print(f"   Identified {len(valid_stems)} images with annotations.")

    # 2. Extract, filter and re-zip the images in a scratch folder
    temp_img_folder = os.path.join(batch_folder, "temp_images_extract")
    new_zip_name = temp_img_folder + ".zip"
    if os.path.exists(temp_img_folder):
        shutil.rmtree(temp_img_folder)
    os.makedirs(temp_img_folder, exist_ok=True)

    kept_count = 0
    try:
        with zipfile.ZipFile(img_zip_path, 'r') as z:
            z.extractall(temp_img_folder)

        # Walk the extracted tree (handles nested folders inside the zip)
        for root, _dirs, files in os.walk(temp_img_folder):
            for file in files:
                if Path(file).stem in valid_stems:
                    kept_count += 1
                else:
                    os.remove(os.path.join(root, file))

        # A stale archive from an earlier failed run must not be mistaken
        # for the result of this run.
        if os.path.exists(new_zip_name):
            os.remove(new_zip_name)
        Utils.zip_folder(temp_img_folder)

        if not os.path.exists(new_zip_name):
            print(f"   WARNING: {new_zip_name} was not created; {img_zip_path} left unchanged.")
            return

        # 3. Swap the new archive in only after it exists (os.replace overwrites)
        os.replace(new_zip_name, img_zip_path)
    finally:
        shutil.rmtree(temp_img_folder, ignore_errors=True)

    # 4. Update images.txt and labels.txt
    # The existing lines are kept verbatim (drive letters, separators, ...)
    # and only filtered by the stems that survived.
    for txt_file in ["images.txt", "labels.txt"]:
        path = os.path.join(batch_folder, txt_file)
        if os.path.exists(path):
            with open(path, 'r') as f:
                lines = f.readlines()

            filtered_lines = [l for l in lines if Path(l.strip()).stem in valid_stems]

            with open(path, 'w') as f:
                f.writelines(filtered_lines)

    print(f"   Done. Kept {kept_count} images and updated .txt lists.")


def _index_label_files(root_dir):
    """Map each file name under ``root_dir`` to its path (first hit in ``os.walk`` order wins)."""
    index = {}
    for root, _, files in os.walk(root_dir):
        for file_name in files:
            index.setdefault(file_name, os.path.join(root, file_name))
    return index


def create_shared_labels_for_batch(batch_folder):
    """Build ``share_labels.zip`` with exactly one label file per image in ``images.zip``.

    For every image stem the label is chosen with this priority:
      1. the ground-truth label from ``gt_labels.zip``, if present and non-empty;
      2. otherwise the predicted label from ``yolo_labels.zip``, if present and non-empty;
      3. otherwise an empty ``<stem>.txt`` is created so the image/label ratio stays 1:1.

    Either label archive may be missing. If ``images.zip`` is missing the
    function returns without doing anything.

    Args:
        batch_folder: Batch directory containing ``images.zip`` and, optionally,
            ``gt_labels.zip`` / ``yolo_labels.zip``.
    """
    img_zip_path = os.path.join(batch_folder, "images.zip")
    gt_zip_path = os.path.join(batch_folder, "gt_labels.zip")
    pred_zip_path = os.path.join(batch_folder, "yolo_labels.zip")

    if not os.path.exists(img_zip_path):
        return

    # 1. Stems of the images actually present in images.zip
    with zipfile.ZipFile(img_zip_path, 'r') as z:
        current_image_stems = {
            Path(name).stem for name in z.namelist() if not name.endswith('/')  # skip directories
        }

    # 2. Fresh temporary extraction directories
    temp_gt = os.path.join(batch_folder, "temp_gt")
    temp_pred = os.path.join(batch_folder, "temp_pred")
    share_labels_dir = os.path.join(batch_folder, "share_labels")

    for d in [temp_gt, temp_pred, share_labels_dir]:
        if os.path.exists(d):
            shutil.rmtree(d)
        os.makedirs(d, exist_ok=True)

    count_gt = 0
    count_pred = 0
    try:
        # 3. Extract source labels
        if os.path.exists(gt_zip_path):
            with zipfile.ZipFile(gt_zip_path, 'r') as z:
                z.extractall(temp_gt)

        if os.path.exists(pred_zip_path):
            with zipfile.ZipFile(pred_zip_path, 'r') as z:
                z.extractall(temp_pred)

        # Index both trees once instead of re-walking them for every image
        gt_index = _index_label_files(temp_gt)
        pred_index = _index_label_files(temp_pred)

        # 4. Prioritised copy: GT first, then supplement with predictions
        for stem in current_image_stems:
            target_name = f"{stem}.txt"
            destination = os.path.join(share_labels_dir, target_name)

            gt_file = gt_index.get(target_name)
            if gt_file and os.path.getsize(gt_file) > 0:
                shutil.copy2(gt_file, destination)
                count_gt += 1
                continue

            pred_file = pred_index.get(target_name)
            if pred_file and os.path.getsize(pred_file) > 0:
                shutil.copy2(pred_file, destination)
                count_pred += 1
                continue

            # No usable label anywhere: empty file keeps the 1:1 ratio
            open(destination, 'a').close()

        # 5. Zip; only drop the folder once the archive exists
        Utils.zip_folder(share_labels_dir)
        if os.path.exists(share_labels_dir + ".zip"):
            shutil.rmtree(share_labels_dir)
    finally:
        # Scratch folders are removed even when extraction/zipping fails
        shutil.rmtree(temp_gt, ignore_errors=True)
        shutil.rmtree(temp_pred, ignore_errors=True)

    print(f"   Success: {count_gt} GT labels, {count_pred} Pred labels used.")


In [None]:
# --- Execution ---
# High-confidence HNM batches (001-013, folder suffix conf 0.83 down to 0.27).
# The loop below is kept commented out; when enabled it packages each batch
# with the predicted yolo_labels (gt=False).
high_conf_batch_folders = [
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_001_conf_0.83",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_002_conf_0.58",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_003_conf_0.52",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_004_conf_0.48",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_005_conf_0.45",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_006_conf_0.42",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_007_conf_0.39",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_008_conf_0.37",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_009_conf_0.34",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_010_conf_0.32",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_011_conf_0.30",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_012_conf_0.29",
r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_013_conf_0.27",
]
# for batch in high_conf_batch_folders:
#     copy_zip_batch_images_labels(batch, zip_it=True, gt=False)

In [None]:
## At and below confidence=0.25 the predictions diverge more from the ground-truth labels,
## so for these batches we prioritize the ground truths.
low_conf_batch_folders = [
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_014_conf_0.25",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_015_conf_0.24",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_016_conf_0.22",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_017_conf_0.21",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_018_conf_0.20",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_019_conf_0.19",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_020_conf_0.18",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_021_conf_0.17",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_022_conf_0.16",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_023_conf_0.15",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_024_conf_0.14",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_025_conf_0.13",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_026_conf_0.12",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_027_conf_0.12",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_028_conf_0.11",
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\HNM_batch_029_conf_0.11",
]
# The loop body is intentionally a no-op; uncomment the steps below to run the
# three-stage pipeline (copy GT labels + images -> drop unannotated images ->
# merge GT/pred labels) for each low-confidence batch.
for batch in low_conf_batch_folders:
    pass
    # print(f"Copying original labels and images from {batch}...")
    # copy_zip_batch_images_labels(batch, zip_it=True, gt=True)  # NOTE: was `batch_folder`, which is undefined here
    # print(f"Processing {batch}...")
    # clean_empty_images_from_batch(batch)
    # print(f"Merging Labels for {batch}...")
    # create_shared_labels_for_batch(batch)

In [None]:
# Load the combined annotation metadata (first CSV column is the index).
all_meta_nf = pd.read_csv(r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\all_annotated_meta_splits_filtered_20251030.csv", index_col=0, low_memory=False)
# Keep rows with a valid fish count (n_fish >= 0) and split them by image height:
# imh > 2176 -> "Abiss2" subset, imh == 2176 -> "Abiss1" subset.
df_abs2_meta_nf = all_meta_nf[(all_meta_nf.n_fish >= 0)  & (all_meta_nf.imh > 2176)]
df_abs1_meta_nf = all_meta_nf[(all_meta_nf.n_fish >= 0)  & (all_meta_nf.imh == 2176)]
abiss2_len = len(df_abs2_meta_nf) 
abiss1_len = len(df_abs1_meta_nf)  
# export_paths(df_abs1_meta_nf[df_abs1_meta_nf.split == "test"], run13_folder, subfolder="abiss1_test")
print("usable Abiss1 images:", abiss1_len)
print("usable Abiss2 images:", abiss2_len) # an earlier run printed 14225; see the recorded output below
# Abiss2 images with fewer than two fish (n_fish is 0 or 1 after the filter above).
few_fish_df = df_abs2_meta_nf[(df_abs2_meta_nf.n_fish < 2)]
# NOTE(review): new_path is not used anywhere in the visible cells.
new_path = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets\one_fish\images"
output_dir = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets\one_fish\overlays"
# Overlays.plot_label_overlays(few_fish_df.image_path, few_fish_df.label_path, output_dir, overwrite=False)
# Recorded output of a previous run (the Abiss2 count differs from the 14225 noted above - TODO confirm which is current):
# usable Abiss1 images: 33356
# usable Abiss2 images: 13549

In [None]:
def get_tiles_paths_from_tile_names(TILE_NAMES, BASE_DIR):
    """Resolve tile names to full image and label paths found under ``BASE_DIR``.

    Matching is done on the text before the first ``.`` of each name, so the
    extension of the requested names is irrelevant. Names with no match are
    silently dropped (independently for images and labels).

    Args:
        TILE_NAMES: Iterable of tile file names.
        BASE_DIR: Dataset root passed to ``Utils.get_all_img_lbl_pths``.

    Returns:
        ``(image_paths, label_paths)``, each sorted.
    """
    def _index_by_stem(paths):
        lookup = {}
        for full_path in paths:
            lookup[os.path.basename(full_path).split(".")[0]] = full_path
        return lookup

    master_images, master_labels = Utils.get_all_img_lbl_pths(BASE_DIR)
    images_by_stem = _index_by_stem(master_images)
    labels_by_stem = _index_by_stem(master_labels)

    wanted_stems = [tile_name.split(".")[0] for tile_name in TILE_NAMES]

    matched_images = sorted(images_by_stem[s] for s in wanted_stems if s in images_by_stem)
    matched_labels = sorted(labels_by_stem[s] for s in wanted_stems if s in labels_by_stem)
    return matched_images, matched_labels

In [None]:
def get_tiles_paths_from_batch_dir(BATCH_DIR, BASE_DIR, SPLITS):
    """Look up the master-dataset paths of the tiles stored in a batch directory.

    The file names in ``BATCH_DIR\\images`` and ``BATCH_DIR\\labels`` are
    matched (by the text before their first ``.``) against the image/label
    paths that ``Utils.get_all_img_lbl_pths(BASE_DIR, SPLITS)`` returns.
    Files with no match are dropped; the match counts are printed.

    Args:
        BATCH_DIR: Batch folder holding ``images`` and ``labels`` subfolders.
        BASE_DIR: Dataset root passed to ``Utils.get_all_img_lbl_pths``.
        SPLITS: Split selection passed to ``Utils.get_all_img_lbl_pths``.

    Returns:
        ``(image_paths, label_paths)``, each sorted.
    """
    def _stem(name):
        return name.split(".")[0]

    master_images, master_labels = Utils.get_all_img_lbl_pths(BASE_DIR, SPLITS)

    # Stems of the files physically present in the batch
    batch_image_stems = [_stem(f) for f in os.listdir(BATCH_DIR+"\\"+"images")]
    batch_label_stems = [_stem(f) for f in os.listdir(BATCH_DIR+"\\"+"labels")]

    # stem -> full path dictionaries give constant-time lookups
    images_by_stem = {}
    for full_path in master_images:
        images_by_stem[_stem(os.path.basename(full_path))] = full_path
    labels_by_stem = {}
    for full_path in master_labels:
        labels_by_stem[_stem(os.path.basename(full_path))] = full_path

    batch_tiled_im_fp = [images_by_stem[s] for s in batch_image_stems if s in images_by_stem]
    batch_tiled_lb_fp = [labels_by_stem[s] for s in batch_label_stems if s in labels_by_stem]

    # Report how many batch files could be resolved
    print(f"Total full paths found for images in batch: {len(batch_tiled_im_fp)}")
    print(f"Total full paths found for labels in batch: {len(batch_tiled_lb_fp)}")

    batch_tiled_im_fp.sort()
    batch_tiled_lb_fp.sort()
    return batch_tiled_im_fp, batch_tiled_lb_fp

# BATCH_DIR = r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\transects\20240804_001_Iver3069_ABS2\20240804_001_Iver3069_ABS2_batch_00"
# BASE_DIR = r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\transects\20240804_001_Iver3069_ABS2"
# batch_tiled_im_fp, batch_tiled_lb_fp = get_tiles_paths_from_batch_dir(BATCH_DIR = BATCH_DIR, BASE_DIR = BASE_DIR, SPLITS = None)

# Utils.write_list_txt(sorted(batch_tiled_im_fp), BATCH_DIR+"\\"+"images.txt")
# Utils.write_list_txt(sorted(batch_tiled_lb_fp), BATCH_DIR+"\\"+"labels.txt")

In [None]:
def get_full_paths_from_tiled_images(tiled_images):
    """Map tiled image paths back to the full-size images/labels under ``D:\\datasets\\full``.

    Each tile path is reduced to a basename with
    ``Utils.convert_tile_img_pth_to_basename`` and looked up (by the text before
    the first ``.`` of the file name) among the globbed full images (``.png`` /
    ``.jpg``) and labels (``.txt``).

    Args:
        tiled_images: Iterable of tiled image paths.

    Returns:
        ``(full_image_paths, full_label_paths)``; order follows set iteration
        and is therefore not sorted.

    Raises:
        AssertionError: If the globbed image and label counts differ, or the
            number of matched images and labels differ.
    """
    wanted_basenames = {Utils.convert_tile_img_pth_to_basename(tile) for tile in tiled_images}

    all_full_imgs = []
    for pattern in (r"D:\datasets\full\*\images\*.png", r"D:\datasets\full\*\images\*.jpg"):
        all_full_imgs.extend(glob.glob(pattern))
    all_full_lbls = glob.glob(r"D:\datasets\full\*\labels\*.txt")

    print("full images", len(all_full_imgs))
    print("full labels", len(all_full_lbls))
    assert len(all_full_imgs) == len(all_full_lbls)

    def _stem(path):
        return os.path.basename(path).split(".")[0]

    images_by_stem = {_stem(p): p for p in all_full_imgs}
    labels_by_stem = {_stem(p): p for p in all_full_lbls}

    batch_full_im_fp = []
    batch_full_lb_fp = []
    for name in wanted_basenames:
        if name in images_by_stem:
            batch_full_im_fp.append(images_by_stem[name])
    for name in wanted_basenames:
        if name in labels_by_stem:
            batch_full_lb_fp.append(labels_by_stem[name])

    assert len(batch_full_im_fp) == len(batch_full_lb_fp), "number of images and labels do not match"
    print("full image paths from tiled paths", len(batch_full_lb_fp))
    return batch_full_im_fp, batch_full_lb_fp
    
# batch_full_im_fp, batch_full_lb_fp = get_full_paths_from_tiled_images(batch_tiled_im_fp)   
# Utils.write_list_txt(batch_full_im_fp, r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\relabel\2025_goby_relabel_batch_01\full\images.txt")
# Utils.write_list_txt(batch_full_lb_fp, r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\relabel\2025_goby_relabel_batch_01\full\labels.txt")

In [None]:
def create_conf_batches(scores_df: pd.DataFrame, pred_df: pd.DataFrame, output_directory: str, batch_size: int = 1500):
    """
    Bins the dataframe samples into batches of EXACTLY 'batch_size' unique images (except the final batch).

    Images are ordered by descending 'conf' (first occurrence of each
    'Filename' in the sorted scores). For every batch a folder
    ``HNM_batch_<NNN>_conf_<max conf>`` is created under ``output_directory``
    holding ``scores_df.csv`` (the batch's rows of ``scores_df``) and the YOLO
    labels written by ``LabelUtils.convert_predict_df_to_yolo_labels`` from the
    batch's rows of ``pred_df``.

    Args:
        scores_df: Scored detections; needs 'Filename' and 'conf' columns.
        pred_df: All predictions; needs 'Filename' and 'tile_path' columns.
        output_directory: Parent directory for the batch folders.
        batch_size: Number of unique images per batch (the last batch may be smaller).

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Starting batch creation for {len(scores_df['Filename'].unique())} unique images based on scores_df...")

    # 1. Sort the scores DataFrame by 'conf'
    df_scored_sorted = scores_df.sort_values(by='conf', ascending=False)

    # Unique image filenames in sorted order
    unique_filenames_array = df_scored_sorted['Filename'].drop_duplicates().to_numpy()
    num_unique_images = len(unique_filenames_array)

    # 2. Cut consecutive slices of exactly 'batch_size' filenames.
    # (np.array_split would instead produce near-EQUAL parts, e.g. 3/2/2 for
    # 7 images at batch_size=3, which contradicts the documented contract.)
    batch_filename_splits = [
        unique_filenames_array[start:start + batch_size]
        for start in range(0, num_unique_images, batch_size)
    ]
    num_batches = len(batch_filename_splits)

    if num_batches == 0:
        print("No images to batch; nothing written.")
        return

    base_output_path = Path(output_directory)

    # 3. Iterate through batches
    for i, batch_filenames in enumerate(batch_filename_splits):

        # Scored rows for the batch (saved as metadata)
        scores_df_isin = df_scored_sorted[df_scored_sorted['Filename'].isin(batch_filenames)].copy()

        # ALL prediction rows for the batch (used for YOLO label generation)
        batch_df_isin = pred_df[pred_df['Filename'].isin(batch_filenames)].copy()

        # 4. Confidence range of the batch; the max is used in the folder name
        conf_min = scores_df_isin['conf'].min()
        conf_max = scores_df_isin['conf'].max()

        # 5. Create batch output directory
        folder_name = f"HNM_batch_{i+1:03d}_conf_{conf_max:.2f}"
        batch_output_path = base_output_path / folder_name
        os.makedirs(batch_output_path, exist_ok=True)

        # Save the scored rows for this specific batch
        scores_df_isin.to_csv(os.path.join(batch_output_path, "scores_df.csv"), index=False)

        # 6. Tile paths of the batch (only counted here; writing images.txt is disabled)
        image_paths = batch_df_isin['tile_path'].unique().tolist()

        # image_list_filename = batch_output_path / "images.txt"
        # Utils.write_list_txt(image_paths, str(image_list_filename))

        # 7. Save YOLO prediction labels for the batch
        LabelUtils.convert_predict_df_to_yolo_labels(batch_df_isin, confidence_thresh=0.25, batch_output_dir=str(batch_output_path))

        # Progress line with the actual size of the batch
        print(f"✅ Batch {i+1}/{num_batches} (Images: {len(image_paths)}, Conf Range: [{conf_min:.2f}, {conf_max:.2f}]) created in: {batch_output_path}", end="  \r")

    print("\nBatch creation complete.")

In [None]:
# Get HNM tiles - Tiles with False positive predictions with high confidence
# thresh = 0.1
# run13_HNM_test_score = pd.read_csv(r"D:\ageglio-1\gobyfinder_yolov8\output\test_runs\run13-tile-test\scores.csv", index_col=0)

# run13_HNM_test_score_fp = run13_HNM_test_score[run13_HNM_test_score.fp==1]
# HNM_fp_objects = run13_HNM_test_score_fp[run13_HNM_test_score_fp.conf>=thresh].copy()
# HNM_fp_tiles = HNM_fp_objects.Filename.unique()
# HNM_fp_full_images = HNM_fp_objects.Filename.apply(lambda x: Utils.convert_tile_img_pth_to_basename(x)).unique()

# pred_df = pd.read_csv(r"D:\ageglio-1\gobyfinder_yolov8\output\test_runs\run13-tile-test\predictions.csv", index_col=0)
# pred_df_filt = pred_df.copy()
# pred_df_filt["tile_path"] = get_tiles_paths_from_tile_names(pred_df_filt.Filename, BASE_DIR= r"D:\datasets\tiled")
# assert pred_df_filt.tile_path.notna().all()

# print("n fp objects", len(HNM_fp_objects))
# print("n fp tiles", len(HNM_fp_tiles))
# print("n fp full imgs", len(HNM_fp_full_images))

# assert len(pred_df_filt[pred_df_filt.Filename.isin(HNM_fp_tiles)].Filename.unique()) == len(HNM_fp_tiles)

# # write the yolo labels and filepath data using the create_conf_batches function
# # Bins the dataframe samples into batches of EXACTLY 'batch_size' unique images (except the final batch).
# output_dir = r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM"
# create_conf_batches(HNM_fp_objects, pred_df_filt, output_dir, batch_size=1500)

In [None]:
def export_batch_paths_tile_relabel(all_images_df, all_sampled_HNM_image_lst):
    """
    Exports full and tiled image/label paths for relabeling batches.
    Prevents resampling and re-exporting of batches that already have an 'images.txt' list.

    Images are removed from the sampling pool when they were already used in
    the HNM batches, in relabel batch 00 / 01, or in any later batch that has
    already been exported. The remaining pool is then split into new batches of
    up to 100 images, and for each one ``full/images.txt``, ``full/labels.txt``,
    ``tiled/images.txt`` and ``tiled/labels.txt`` are written.

    A batch counts as "already exported" when its ``tiled/images.txt`` exists
    OR its ``tiled/images`` folder contains image files. Checking the list file
    matters because the image folder is deleted once a batch has been zipped,
    and such a batch must not be resampled and overwritten.

    Args:
        all_images_df: Pool of full images; needs 'Filename', 'image_path' and
            'label_path' columns. Not modified.
        all_sampled_HNM_image_lst: Tiled image paths already used in HNM batches.
    """

    # --- Setup ---
    BASE_DIR = r"D:\datasets\tiled"
    SPLITS = ["train", "test", "validation"]
    all_tiled_image_paths, all_tiled_label_paths = Utils.get_all_img_lbl_pths(BASE_DIR, SPLITS)

    BATCH_SIZE = 100
    innodata_update = r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\relabel"

    # ============================================================
    # --- STEP 0: REMOVE ALL IMAGES ALREADY USED IN THE HNM BATCHES
    # ============================================================

    # Convert tiled image paths -> full-image basenames
    already_used_basenames = {
        Utils.convert_tile_img_pth_to_basename(p) for p in all_sampled_HNM_image_lst
    }

    # Drop them from the pool (drop returns a new frame; the input is untouched)
    used_indices = all_images_df[all_images_df.Filename.isin(already_used_basenames)].index
    remaining_images_df = all_images_df.drop(used_indices)

    print(f"Removed {len(used_indices)} images already used in HNM.")

    # ============================================================
    # --- STEP 1: Handle the Initial Samples (Batch 00)
    # ============================================================

    name_initial = "2025_goby_relabel_batch_00"
    initial_full_dir = os.path.join(innodata_update, name_initial, "full")
    initial_images_list_path = os.path.join(initial_full_dir, "images.txt")

    if os.path.exists(initial_images_list_path):
        initial_images = Utils.read_list_txt(initial_images_list_path)
        initial_bn = {os.path.basename(x).split(".")[0] for x in initial_images}

        # Remove batch 00 images (safe even if already removed above)
        initial_indices = remaining_images_df[remaining_images_df.Filename.isin(initial_bn)].index
        remaining_images_df = remaining_images_df.drop(initial_indices)
        print(f"Batch 00: Removed {len(initial_indices)} images from remaining pool.")
    else:
        print(f"Error: Initial sample list not found at {initial_images_list_path}. Exiting.")
        return

    # ============================================================
    # --- STEP 1B: Handle Batch 01 (also already completed)
    # ============================================================

    name_batch01 = "2025_goby_relabel_batch_01"
    batch01_full_dir = os.path.join(innodata_update, name_batch01, "full")
    batch01_images_list_path = os.path.join(batch01_full_dir, "images.txt")

    if os.path.exists(batch01_images_list_path):
        batch01_images = Utils.read_list_txt(batch01_images_list_path)
        batch01_bn = {os.path.basename(x).split(".")[0] for x in batch01_images}

        batch01_indices = remaining_images_df[remaining_images_df.Filename.isin(batch01_bn)].index
        remaining_images_df = remaining_images_df.drop(batch01_indices)
        print(f"Batch 01: removed {len(batch01_indices)} images from remaining pool.")
    else:
        print("WARNING: Batch 01 images.txt not found — skipping explicit removal.")

    # ============================================================
    # --- STEP 2: Process Subsequent Batches (Start at Batch 02)
    # ============================================================

    batch_number = 2   # <-- CRITICAL: start at batch 02
    total_new_batches = 0

    while len(remaining_images_df) > 0:
        name = f"2025_goby_relabel_batch_{batch_number:02d}"

        tile_dir = os.path.join(innodata_update, name, "tiled")
        tile_img_dir = os.path.join(tile_dir, "images")
        existing_tiled_txt = os.path.join(tile_dir, "images.txt")

        tiled_imgs = []
        if os.path.isdir(tile_img_dir):
            tiled_imgs = [
                f for f in os.listdir(tile_img_dir) if f.endswith((".png", ".jpg"))
            ]

        # --------------------------------------------------------
        # If batch already exists -> skip it, but remove its images
        # from the pool so they cannot be sampled again
        # --------------------------------------------------------
        if os.path.exists(existing_tiled_txt) or len(tiled_imgs) > 0:
            print(f"Skipping {name}: {len(tiled_imgs)} already exist. Removing images from remaining pool.")

            # The list/folder consistency check only makes sense while the
            # unzipped image folder is still present.
            if len(tiled_imgs) > 0:
                Utils.check_txt_file_vs_images(existing_tiled_txt, tile_img_dir)

            # Convert tiled paths -> basenames
            existing_tiled_list = Utils.read_list_txt(existing_tiled_txt)
            existing_basenames = {
                Utils.convert_tile_img_pth_to_basename(p) for p in existing_tiled_list
            }

            sampled_indices = remaining_images_df[
                remaining_images_df.Filename.isin(existing_basenames)
            ].index

            remaining_images_df = remaining_images_df.drop(sampled_indices)

        # --------------------------------------------------------
        # Create a new batch
        # --------------------------------------------------------
        else:
            total_new_batches += 1
            print(f"Creating new batch: {name}")

            current_batch_size = min(BATCH_SIZE, len(remaining_images_df))
            # Seeded with the batch number so a given pool yields the same sample
            subset = remaining_images_df.sample(n=current_batch_size, random_state=batch_number)

            # Export full images
            full_dir = os.path.join(innodata_update, name, "full")
            os.makedirs(full_dir, exist_ok=True)
            Utils.write_list_txt(subset.image_path.values, os.path.join(full_dir, "images.txt"))
            Utils.write_list_txt(subset.label_path.values, os.path.join(full_dir, "labels.txt"))

            # Export tiled images
            tiled_images, tiled_labels = Utils.list_tiled_set(
                subset.Filename.values, all_tiled_image_paths, all_tiled_label_paths
            )
            os.makedirs(tile_dir, exist_ok=True)
            Utils.write_list_txt(tiled_images, os.path.join(tile_dir, "images.txt"))
            Utils.write_list_txt(tiled_labels, os.path.join(tile_dir, "labels.txt"))

            # Remove sampled items
            remaining_images_df = remaining_images_df.drop(subset.index)

        batch_number += 1

    print(f"Total batches checked: {batch_number - 1}")
    print(f"Total NEW batches created: {total_new_batches}")

In [None]:
# Collect every tile path already listed in an HNM batch (one images.txt per batch folder).
all_sampled_HNM_image_lst_files = glob.glob(
    r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\HNM\*\images.txt"
)
all_sampled_HNM_image_lst = []
for file in all_sampled_HNM_image_lst_files:
    all_sampled_HNM_image_lst.extend(Utils.read_list_txt(file))
print("all HNM tiles list", len(all_sampled_HNM_image_lst))

# The relabel pool is the Abiss2 subset (df_abs2_meta_nf is defined in the metadata cell above,
# which must have been run first).
all_images_df = df_abs2_meta_nf
# export_batch_paths_tile_relabel(all_images_df, all_sampled_HNM_image_lst)

In [None]:
# # Copy Batches
# Relabel batch numbers that have already been copied/zipped; extend this list as batches are completed.
completed_batches = ["00", "01", "02", "03","04","05","06","07","08","09","10","11","12"] # Change this to the desired batch number
# for batch in completed_batches:  # NOTE: was `batches`, which is undefined here
#     path = f"Z:\\__AdvancedTechnologyBackup\\04_ProjectData\\Innodata_2025\\Goby\\relabel\\2025_goby_relabel_batch_{batch}\\tiled"
#     copy_zip_batch_images_labels(path, zip_it=True, gt=True)

In [None]:
def copy_subsampled_transect_tiled_images_labels_with_objects_for_labeling(collect_id, run_copy=True):
    """Select the tiles of a transect's subsampled full images and copy the annotated ones for labeling.

    Reads the locally stored inference run of ``collect_id``, resolves the
    tiles that belong to the subsampled full images
    (``subsampled_images_0.3/images.txt``), records the selection as two CSV
    files next to the tiles, and copies every tile whose label file exists and
    is non-empty (together with that label) to the Innodata project folder.

    Args:
        collect_id: Transect/collect identifier used in all paths.
        run_copy: When false, only the selection CSVs are written and nothing
            is created or copied in the project folder.

    Raises:
        AssertionError: If image/label counts differ or no tiles are selected.
    """
    # assuming the collect_id inference run is stored locally
    tiled_root = f"..\\output\\inference\\{collect_id}\\tiled"
    all_tiled_image_paths = glob.glob(f"{tiled_root}\\images\\*.png")
    all_tiled_label_paths = glob.glob(f"{tiled_root}\\labels\\*.txt")
    print(len(all_tiled_image_paths), len(all_tiled_label_paths))
    assert len(all_tiled_image_paths) == len(all_tiled_label_paths), "Mismatch between image and label counts."

    full_imgs = Utils.read_list_txt(f"..\\output\\transects\\{collect_id}\\subsampled_images_0.3\\images.txt")
    tiled_images, tiled_labels  = Utils.list_tiled_set(full_imgs, all_tiled_image_paths, all_tiled_label_paths)
    assert len(tiled_images) > 0, "No tiled images found"
    assert len(tiled_images) == len(tiled_labels), "Mismatch between number of images and labels"
    print(f"Number of tiled images from full_imgs: {len(tiled_images)}")

    # Record the selection next to the tiles
    pd.Series(tiled_images).to_csv(f"{tiled_root}\\selected_tiled_images.csv", header=False, index=False)
    pd.Series(tiled_labels).to_csv(f"{tiled_root}\\selected_tiled_labels.csv", header=False, index=False)

    # The flag used to be ignored; the default (True) keeps the old behavior
    if not run_copy:
        return

    # Write the tiled labels to the dataset folder if they are not empty
    innodata_project_folder = f"Z:\\__AdvancedTechnologyBackup\\04_ProjectData\\Innodata_2025\\Goby\\transects\\{collect_id}"
    img_folder = f"{innodata_project_folder}\\images"
    lbl_folder = f"{innodata_project_folder}\\labels"
    os.makedirs(img_folder, exist_ok=True)
    os.makedirs(lbl_folder, exist_ok=True)

    # total= lets tqdm show a real progress bar (zip has no length)
    for img, lbl in tqdm(zip(tiled_images, tiled_labels), total=len(tiled_images)):
        if os.path.exists(lbl) and os.path.getsize(lbl) > 0:
            shutil.copy2(lbl, lbl_folder)
            shutil.copy2(img, img_folder)
# Transect tiling and preparation for labeling: pick the collect to process.
collect_id = "20240618_001_Iver3069_ABS2"
# collect_id = "20240804_001_Iver3069_ABS2"
# copy_subsampled_transect_tiled_images_labels_with_objects_for_labeling(collect_id, run_copy=True)

In [None]:
def export_batch_paths_tile_transect(collect_id, batch_size=100):
    """
    Export tiled image/label path lists for transect labeling batches.

    Tiles found in the collect's ``images`` and ``labels`` folders are paired by
    file stem and grouped by the basename of the full image they were cut from.
    Full images are then sampled at random into batches so that every tile of a
    full image lands in the same batch. A batch folder that already contains an
    ``images.txt`` is left untouched and its full images are removed from the
    sampling pool, so re-running only creates batches for images that have not
    been exported yet.

    Args:
        collect_id: Collect identifier; used as the folder name under the
            transects root and as the prefix of every batch folder name.
        batch_size: Maximum number of full images (not tiles) per new batch.

    Returns:
        The number of batch folders visited (existing plus newly created), or 0
        when no image/label tile pairs were found.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive size would sample nothing and never drain the pool.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    transect_update = f"Z:\\__AdvancedTechnologyBackup\\04_ProjectData\\Innodata_2025\\Goby\\transects\\{collect_id}"

    # Step 1: collect every tile path that exists for this collect
    all_image_tiles = glob.glob(os.path.join(transect_update, "images", "*.png"))
    all_label_tiles = glob.glob(os.path.join(transect_update, "labels", "*.txt"))

    # Check for empty data before proceeding
    if not all_image_tiles or not all_label_tiles:
        print(f"No tiled data found for {collect_id}. Exiting.")
        return 0

    # Pair each image with its label by file stem. The two glob results are
    # independent lists in arbitrary order, so pairing them by position would
    # mismatch tiles (or fail outright) whenever the counts or orders differ.
    images_by_stem = {os.path.splitext(os.path.basename(p))[0]: p for p in all_image_tiles}
    labels_by_stem = {os.path.splitext(os.path.basename(p))[0]: p for p in all_label_tiles}
    paired_stems = sorted(images_by_stem.keys() & labels_by_stem.keys())

    n_unpaired = len(images_by_stem) + len(labels_by_stem) - 2 * len(paired_stems)
    if n_unpaired:
        print(f"Warning: ignoring {n_unpaired} tile file(s) without a matching image/label.")
    if not paired_stems:
        print(f"No tiled data found for {collect_id}. Exiting.")
        return 0

    df = pd.DataFrame(
        {
            "image_path": [images_by_stem[stem] for stem in paired_stems],
            "label_path": [labels_by_stem[stem] for stem in paired_stems],
        }
    )

    # Full-image basename = tile label file name with its last two '_' fields removed,
    # e.g. 'full_image_name_x000_y000.txt' -> 'full_image_name'
    df["basename"] = df["label_path"].apply(lambda x: os.path.basename(x).rsplit("_", 2)[0])

    # The sampling pool is the set of unique full-image basenames
    unbatched_basenames = pd.Series(df["basename"].unique())

    batch_number = 0
    total_new_batches = 0
    total_tiles = 0

    while len(unbatched_basenames) > 0:
        name = f"{collect_id}_batch_{batch_number:02d}"
        transect_path = os.path.join(transect_update, name)
        images_txt_path = os.path.join(transect_path, "images.txt")

        if os.path.exists(images_txt_path):
            # --- Existing batch: never resample it, only remove its images from the pool ---
            print(f"Skipping existing batch: {name}")

            exported_tiled_paths = Utils.read_list_txt(images_txt_path)
            total_tiles += len(exported_tiled_paths)
            print(f"Number of tiled images in existing batch: {len(exported_tiled_paths)}")

            # NOTE(review): assumes Utils.convert_tile_img_pth_to_basename yields the same
            # basename as the rsplit('_', 2) rule used above - confirm against Utils.
            exported_basenames = {Utils.convert_tile_img_pth_to_basename(x) for x in exported_tiled_paths}
            print(f"Number of unique basenames in existing batch: {len(exported_basenames)}")

            unbatched_basenames = unbatched_basenames[~unbatched_basenames.isin(exported_basenames)]
        else:
            # --- New batch ---
            total_new_batches += 1
            print(f"Creating new batch: {name}")

            current_batch_size = min(batch_size, len(unbatched_basenames))

            # Unseeded on purpose: every run draws a different random sample.
            subset_basenames = unbatched_basenames.sample(
                n=current_batch_size, random_state=None, replace=False
            )
            print(f"Sampled {len(subset_basenames)} basenames for new batch.")

            # Every tile that belongs to one of the sampled full images
            subset_df = df[df["basename"].isin(subset_basenames.tolist())]
            print(f"Number of TILE images/labels in new batch: {len(subset_df)}")
            total_tiles += len(subset_df)

            # Export the list of TILE paths
            os.makedirs(transect_path, exist_ok=True)
            Utils.write_list_txt(subset_df.image_path.tolist(), images_txt_path)
            Utils.write_list_txt(subset_df.label_path.tolist(), os.path.join(transect_path, "labels.txt"))

            # Remove the sampled basenames so later batches cannot reuse them
            unbatched_basenames = unbatched_basenames.drop(subset_basenames.index)

        batch_number += 1

    print(f"Total batches checked: {batch_number}")
    print(f"Total NEW batches created: {total_new_batches}")
    print(f"Total TILE images processed: {total_tiles}")
    return batch_number

In [None]:
# Transect batches
# Exports the batch path lists for the selected collect and keeps the value returned by
# export_batch_paths_tile_transect in n_batches. The other collect is kept commented out
# for manual switching.
collect_id = "20240804_001_Iver3069_ABS2"
# collect_id = "20240618_001_Iver3069_ABS2"
n_batches = export_batch_paths_tile_transect(collect_id = collect_id)

In [None]:
def copy_transect_batch_image_labels(collect_id, batch = 0, copy=False):
    """
    Copy the tiles listed for one transect batch into that batch's folder.

    Reads ``images.txt`` and ``labels.txt`` from the batch folder
    ``<collect_id>_batch_<NN>`` and, when ``copy`` is true, copies the listed
    files into the batch's ``images`` and ``labels`` sub-folders. Afterwards
    the image list file and the ``images`` folder are handed to
    ``Utils.check_txt_file_vs_images``.

    Args:
        collect_id: Collect identifier; names the collect folder and prefixes
            the batch folder name.
        batch: Batch number, formatted with two digits in the folder name.
        copy: When false, nothing is copied and only the final check runs.
    """
    transect_update = f"Z:\\__AdvancedTechnologyBackup\\04_ProjectData\\Innodata_2025\\Goby\\transects\\{collect_id}"
    # Built with os.path.join like the other batch helpers in this notebook
    path = os.path.join(transect_update, f"{collect_id}_batch_{batch:02d}")

    images_txt = os.path.join(path, "images.txt")
    images = Utils.read_list_txt(images_txt)
    labels = Utils.read_list_txt(os.path.join(path, "labels.txt"))
    img_folder, lbl_folder = os.path.join(path, "images"), os.path.join(path, "labels")

    if copy:
        # Create the destinations first, as the other copy helpers in this notebook do
        # before calling Utils.copy_files_lst.
        os.makedirs(img_folder, exist_ok=True)
        os.makedirs(lbl_folder, exist_ok=True)
        Utils.copy_files_lst(images, img_folder)
        Utils.copy_files_lst(labels, lbl_folder)

    # NOTE(review): presumably verifies that the folder content matches images.txt - confirm in Utils.
    Utils.check_txt_file_vs_images(images_txt, img_folder)

# Copy the tiles of each listed batch number of this collect into its batch folder
# (copy=True performs the copy; the list currently holds only batch 8).
for b in [8]:
    copy_transect_batch_image_labels(collect_id="20240618_001_Iver3069_ABS2", batch = b, copy=True)

In [None]:
# --- Path Construction ---
def get_im_lb_folders(batch, collect_id):
    """
    Build the image and predicted-label folder paths of one transect batch.

    Args:
        batch: Batch number; rendered with at least two digits in the folder name.
        collect_id: Collect identifier; names the collect folder and prefixes
            the batch folder name.

    Returns:
        Tuple ``(img_folder, lbl_folder)`` pointing at the batch's ``images``
        and ``yolo_labels`` sub-folders. Nothing is created on disk.
    """
    batch_dir = os.path.join(
        f"Z:\\__AdvancedTechnologyBackup\\04_ProjectData\\Innodata_2025\\Goby\\transects\\{collect_id}",
        f"{collect_id}_batch_{batch:02d}",
    )
    return tuple(os.path.join(batch_dir, sub) for sub in ("images", "yolo_labels"))


# batches = [8]
# collect_id = "20240618_001_Iver3069_ABS2"
# for batch in batches:
#     img_folder, lbl_folder = get_im_lb_folders(batch, collect_id)
#     Utils.zip_folder(img_folder)
#     Utils.zip_folder(lbl_folder)


In [None]:
# Load the run13 curve CSVs, one per test subset, together with each subset's support
# (the row count of the matching metadata frame).
# NOTE(review): df_abs2_meta_nf and df_abs1_meta_nf are not defined in this cell; they must
# already exist in the kernel when it runs.

# Lowest count bin: rows with n_fish < 2.
# NOTE(review): this filter would also match n_fish == 0 if such rows exist, while the file
# name and the plot legend say "1 fish" - confirm.
curve_path_0n = r"..\output\validation\detect\test_sept_1fish\test_run13_1n_curves.csv"
support_0n = df_abs2_meta_nf[(df_abs2_meta_nf.n_fish < 2)].shape[0]
# Middle count bin: 2 <= n_fish <= 3
curve_path_2n = r"..\output\validation\detect\test_sept_2-3fish\test_run13_2-3n_curves.csv"
support_2n = df_abs2_meta_nf[(df_abs2_meta_nf.n_fish >= 2) & (df_abs2_meta_nf.n_fish <= 3)].shape[0]
# Highest count bin: n_fish >= 4
curve_path_4n = r"..\output\validation\detect\test_sept_4fish+\test_run13_4n+_curves.csv"
support_4n = df_abs2_meta_nf[(df_abs2_meta_nf.n_fish >= 4)].shape[0]
# Whole ABISS2 and ABISS1 test sets (support = all rows of the respective metadata frame)
curve_path_abs2 = r"..\output\validation\detect\test_sept_abiss2\test_run13_abiss2_curves.csv"
support_abs2 = df_abs2_meta_nf.shape[0]
curve_path_abs1 = r"..\output\validation\detect\test_sept_abiss1\test_run13_abiss1_curves.csv"
support_abs1 = df_abs1_meta_nf.shape[0]
# Reports.get_metrics is unpacked into five values per curve file. The first is used by the
# plotting cells through its .precision and .recall attributes.
# NOTE(review): the other four look like max-F1 / confidence / equal-point values judging by
# their names only - confirm in Reports.
df_0n, fmax_0n, cmax_0n, c_eq_0n, pr_eq_0n = Reports.get_metrics(curve_path_0n) 
df_2n, fmax_2n, cmax_2n, c_eq_2n, pr_eq_2n = Reports.get_metrics(curve_path_2n) 
df_4n, fmax_4n, cmax_4n, c_eq_4n, pr_eq_4n = Reports.get_metrics(curve_path_4n) 
dfabs2, fmaxabs2, cmaxabs2, c_eqabs2, pr_eqabs2 = Reports.get_metrics(curve_path_abs2) 
dfabs1, fmaxabs1, cmaxabs1, c_eqabs1, pr_eqabs1 = Reports.get_metrics(curve_path_abs1) 

In [None]:
# Precision-recall curves per goby-count bin, plus the ABISS1 test set for comparison.
# Recall goes on the x axis and precision on the y axis so the data matches the axis
# labels below (the arguments were previously passed the other way round).
# Each ABISS2 legend entry shows the bin's support and its share of all ABISS2 test images.
plt.plot(df_0n.recall, df_0n.precision, label=f"ABISS2 only 1 fish, {support_0n}, {support_0n/support_abs2*100:0.1f}%", color="purple")
plt.plot(df_2n.recall, df_2n.precision, label=f"ABISS2 At 2-3 fish, {support_2n}, {support_2n/support_abs2*100:0.1f}%", color="green")
plt.plot(df_4n.recall, df_4n.precision, label=f"ABISS2 At least 4 fish, {support_4n}, {support_4n/support_abs2*100:0.1f}%", color="orange")
plt.plot(dfabs1.recall, dfabs1.precision, label=f"ABISS1 Test images, {support_abs1}", color="red")
plt.xlabel('Recall') 
plt.ylabel('Precision') 
plt.title('Goby count bins Precision-Recall Curve')
plt.legend() # 1461

In [None]:
# Precision-recall curves of the full ABISS2 and ABISS1 test sets.
# Recall goes on the x axis and precision on the y axis so the data matches the axis
# labels below (the arguments were previously passed the other way round).
plt.plot(dfabs2.recall, dfabs2.precision, label=f"ABISS2 Test images, {support_abs2}")
plt.plot(dfabs1.recall, dfabs1.precision, label=f"ABISS1 Test images, {support_abs1}", color="red")
plt.xlabel('Recall') 
plt.ylabel('Precision') 
plt.title('ABISS1 vs ABISS2 Precision-Recall Curve')
plt.legend()

In [None]:
# Overlay the COCO boxes of one relabelled sample on its image.
# glob returns paths in arbitrary order, so both lists are sorted to make the picked
# sample stable between runs.
# NOTE(review): image and JSON are still matched by position only; this assumes both
# folders hold the same samples and that their names sort in the same order - confirm.
json_files = sorted(glob.glob(r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\relabel\2025_goby_relabel_initial_sample\full\Innodata Output\Initial Re-Label Full\json\*.json"))
image_lst = sorted(glob.glob(r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Innodata_2025\Goby\relabel\2025_goby_relabel_initial_sample\full\original\images\images\*.png"))
image_path = image_lst[2]
json_path = json_files[2]
Overlays.plot_coco_boxes(image_path, json_path)