In [None]:
import glob
import pandas as pd
import os
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
from tqdm import tqdm
from utils import Utils
import os
import pandas as pd
import numpy as np

In [None]:
# Every input/output location used by this notebook, looked up by a short key
PATHS = dict(
    op_table=r"Z:\__AdvancedTechnologyBackup\07_Database\OP_TABLE.xlsx",
    metadata=r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\all_annotated_meta_splits_20250915.csv",
    output_dir=r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets",
    local_dir=r"D:\datasets",
    addl_species=r"z:\__AdvancedTechnologyBackup\07_Database\addl_species_log.xlsx",
)

# collect_ids assigned to each non-training split (one per line, grouped by year).
# get_split() treats any collect_id that is not listed here as 'train'.
COLLECT_IDS = {
    "transects": {
        # 2020
        "20200806_001_Iver3069_ABS1",
        "20200816_001_Iver3069_ABS1",
        # 2021
        "20210825_001_Iver3069_ABS1",
        "20210720_001_Iver3069_ABS1",
    },
    "test": {
        # 2020
        "20200809_001_Iver3069_ABS1",
        "20200818_001_Iver3069_ABS1",
        "20200902_001_Iver3069_ABS1",
        "20200820_001_Iver3069_ABS1",
        "20200821_001_Iver3069_ABS1",
        "20200823_001_Iver3069_ABS1",
        # 2021
        "20210811_001_Iver3069_ABS1",
        "20210812_001_Iver3069_ABS1",
        "20210812_002_Iver3069_ABS1",
        "20210719_001_Iver3069_ABS1",
        "20210829_001_Iver3069_ABS1",
        "20210911_001_Iver3069_ABS1",
        "20210911_002_Iver3069_ABS1",
        "20210925_001_Iver3069_ABS1",
        # 2022
        "20220624_001_Iver3069_ABS1",
        "20220714_002_Iver3069_ABS1",
        "20220727_001_Iver3069_ABS2",
        "20220811_002_Iver3098_ABS2",
        "20220807_003_Iver3069_ABS2",
        "20220901_001_Iver3069_ABS2",
        "20220814_001_Iver3069_ABS2",
        "20220814_002_Iver3069_ABS2",
        # 2023
        "20230710_001_Iver3098_ABS2",
        "20230909_001_Iver3069_ABS2",
        "20230810_002_Iver3098_ABS2",
        "20230727_001_Iver3098_ABS2",
    },
    "validation": {
        # 2020
        "20200916_001_Iver3069_ABS1",
        "20200922_002_Iver3069_ABS1",
        "20200923_002_Iver3069_ABS1",
        # 2021
        "20210712_001_Iver3069_ABS1",
        "20210909_001_Iver3069_ABS1",
        "20210920_001_Iver3069_ABS1",
        "20210707_001_Iver3069_ABS1",
        "20210912_001_Iver3069_ABS1",
        "20210912_002_Iver3069_ABS1",
        "20210913_001_Iver3069_ABS1",
        # 2022
        "20220711_002_Iver3069_ABS1",
        "20220714_003_Iver3069_ABS1",
        "20220717_001_Iver3098_ABS2",
        "20220825_001_Iver3098_ABS2",
        "20220914_002_Iver3069_ABS2",
        "20220902_001_Iver3069_ABS2",
        # 2023
        "20230802_001_Iver3098_ABS2",
        "20230625_001_Iver3098_ABS2",
        "20230718_002_Iver3098_ABS2",
        "20230811_001_Iver3098_ABS2",
        "20230715_001_Iver3098_ABS2",
    },
}

# Ensure both destination folders exist. Split lists and the stats CSV are
# written under "local_dir" as well as "output_dir", so a missing local folder
# would otherwise only surface as an error at write time.
for _dir_key in ("output_dir", "local_dir"):
    os.makedirs(PATHS[_dir_key], exist_ok=True)

# --- 1. Load and classify all metadata in one pass ---
print("Loading and preparing metadata...")
all_annotated_meta = pd.read_csv(PATHS["metadata"], low_memory=False)

def get_split(collect_id):
    """Return the name of the split that ``collect_id`` belongs to.

    The ``COLLECT_IDS`` sets are checked in their definition order and the
    first one containing ``collect_id`` wins. A collect that is listed in none
    of them is training data, so ``'train'`` is returned.
    """
    matching_splits = (
        name for name, members in COLLECT_IDS.items() if collect_id in members
    )
    return next(matching_splits, 'train')

# Classify each row with its split
all_annotated_meta['split'] = all_annotated_meta['collect_id'].apply(get_split)

# Filenames logged as containing additional species; these are excluded below
addl_species_Filenames = pd.read_excel(PATHS["addl_species"]).Filename

# Keep only rows that are "Usable" and have a DistanceToBottom_m value.
# 2021 rows additionally need at least 2 fish; 2020, 2022 and 2023 rows have
# no fish-count requirement. Any other year is dropped.
filtered_meta = all_annotated_meta.query(
    'Usability == "Usable" and ((year == 2021 and n_fish >= 2) or (year in [2020, 2022, 2023])) and not DistanceToBottom_m.isnull()'
)
filtered_meta = filtered_meta[~filtered_meta['Filename'].isin(addl_species_Filenames)]
print("Filtered Usable data", filtered_meta.shape)
filtered_meta.to_csv(r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\all_annotated_meta_splits_filtered_20251030.csv")

# --- 2. Generate and save the text files for each split ---
for split_name in ['train', 'test', 'validation', 'transects']:
    # Get filenames for the current split
    filenames = filtered_meta[filtered_meta['split'] == split_name]['Filename'].tolist()

    # One filename per line, no trailing newline
    split_listing = '\n'.join(filenames)

    # Write the list to BOTH locations. Previously the shared-folder path was
    # immediately overwritten by the local one, so the shared copy was never
    # refreshed even though write_split_txt_full() reads {split}.txt from the
    # shared folder. The stats CSV is already saved to both places.
    for target_dir in (PATHS["output_dir"], PATHS["local_dir"]):
        output_path = os.path.join(target_dir, f"{split_name}.txt")
        with open(output_path, 'w') as f:
            f.write(split_listing)
        print(f"Saved {len(filenames)} filenames to {output_path}")

print("\n--- Generating statistics reports ---")

# --- 3. One row per (collect_id, split): image count, fish count, fish per image ---
df_grouped = (
    filtered_meta
    .groupby(["collect_id", "split"])
    .agg(n_images=('Filename', 'count'), n_fish_p_collect=('n_fish', 'sum'))
    .reset_index()
    .assign(
        n_fish_p_image=lambda d: d['n_fish_p_collect'] / d['n_images'],
        # year = first 4 characters of collect_id, camera = its last 4 characters
        year=lambda d: d['collect_id'].str[:4].astype(int),
        camera=lambda d: d['collect_id'].str[-4:],
    )
)

# --- 4. Attach lake / mission / port / position from the op table ---
# Left join: a collect without an op-table row is kept, with empty op columns.
op_table = pd.read_excel(PATHS["op_table"])
df_stats = pd.merge(
    df_grouped,
    op_table[["COLLECT_ID", "LAKE_NAME", "MISSION_NAME", "PORT_NAME", "LATITUDE", "LONGITUDE"]],
    how='left',
    left_on="collect_id",
    right_on="COLLECT_ID",
).drop(columns='COLLECT_ID')

# Save the per-collect stats to the shared folder and to the local folder
for stats_dir in (PATHS['output_dir'], PATHS['local_dir']):
    df_stats.to_csv(f"{stats_dir}/Run13_collect_stats.csv", index=False)

# --- 5. Report the number of images in each split ---
images_per_split = df_stats.groupby('split')['n_images'].sum()
for split_name, n_split_images in images_per_split.items():
    print(f"Total images for {split_name}:", n_split_images)

print("\nAll stats CSVs generated successfully.")

--- Generating statistics reports ---
Total images for test: 4460
Total images for train: 31324
Total images for transects: 7388
Total images for validation: 3741

In [None]:
import pandas as pd
import numpy as np
# --- Initial Data Loading ---
# Per-collect stats CSV (one row per collect_id and split) saved by the stats cell
summary = pd.read_csv(r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets\Run13_collect_stats.csv")

# --- Initial Data Cleanup/Prep ---
# The camera tag is the last "_"-separated field of collect_id
summary['camera'] = summary['collect_id'].str.split('_').str[-1]

# --- Aggregate and Pivot ---

# Sum n_images per (year, camera), with one column per split.
# fill_value=0: a (year, camera) pair that has no collect in some split gets 0
# images instead of NaN. With NaN the total below becomes NaN and the integer
# casts raise "cannot convert float NaN to integer".
df_pivot = summary.pivot_table(
    index=['year', 'camera'],
    columns='split',
    values='n_images',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Rename the three dataset splits for clarity. Any other split present in the
# CSV (e.g. transects) keeps its name, stays in the table and is NOT counted in
# the totals below.
df_pivot = df_pivot.rename(columns={
    'train': 'n images train',
    'test': 'n images test',
    'validation': 'n images validation'
})

# Total images (train + test + validation) and the test/validation shares in percent
total_col = (
    df_pivot['n images train'] +
    df_pivot['n images test'] +
    df_pivot['n images validation']
)
df_pivot['n_images_total'] = total_col.astype(int)
df_pivot['test (%)'] = (df_pivot['n images test'] / total_col * 100).round(2)
df_pivot['validation (%)'] = (df_pivot['n images validation'] / total_col * 100).round(2)

# --- Final Calculations ---

# n_tiles: 9 tiles per image when the camera tag contains 'ABS2', otherwise 6
df_pivot['n_tiles'] = np.where(
    df_pivot['camera'].str.contains('ABS2', na=False),
    total_col * 9,
    total_col * 6
).astype(int)

# Drop the individual split columns; the totals and percentages remain
summary_run13 = df_pivot.drop(
    columns=['n images train', 'n images test', 'n images validation']
)
summary_run13

In [None]:
# Tile totals from the year/camera summary: overall, then broken down by camera
print("n tiles", summary_run13["n_tiles"].sum())
tiles_by_camera = summary_run13.groupby("camera")["n_tiles"].sum()
print("n tiles by camera", tiles_by_camera)

In [None]:
# Tiles per collect: 9 per image when the camera tag contains 'ABS2', otherwise 6
is_abs2_camera = df_stats['camera'].str.contains('ABS2', na=False)
df_stats['n_tiles'] = np.where(
    is_abs2_camera,
    df_stats.n_images * 9,
    df_stats.n_images * 6
).astype(int)


def _tiles_in_split(split_name):
    """Sum ``n_tiles`` over the collects assigned to ``split_name``."""
    return df_stats.loc[df_stats['split'] == split_name, 'n_tiles'].sum()


train_tiles = _tiles_in_split('train')
print(f"Total tiles in train set: {train_tiles}")
test_tiles = _tiles_in_split('test')
print(f"Total tiles in test set: {test_tiles}")
validation_tiles = _tiles_in_split('validation')
print(f"Total tiles in validation set: {validation_tiles}")
transect_tiles = _tiles_in_split('transects')
print(f"Total tiles in transects: {transect_tiles}")
# Recorded output of the last run:
#   Total tiles in train set: 220317
#   Total tiles in test set: 31266
#   Total tiles in validation set: 26217
#   Total tiles in transects: 44328

In [None]:
# Write full-image txt files
def write_split_txt_full(split, write=True):
    """List the full-size images and labels of one split, optionally saving the lists.

    The split's filename list is read from the shared AUV_datasets folder, the
    png/jpg images and txt labels copied locally for the split are globbed, and
    all three are handed to ``Utils.list_full_set``, whose two return values are
    used as the image and label path lists (presumably the local files that
    match the split list -- confirm in Utils).

    Args:
        split: Split name, used in the list file name and in the local folders.
        write: When true, save the lists as images.txt / labels.txt under the
            run13 "full" dataset folder for this split.

    Returns:
        Tuple ``(images, labels)`` as returned by ``Utils.list_full_set``.

    Raises:
        AssertionError: If the two returned lists differ in length.
    """
    dataset_folder = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\full"
    imgs = Utils.read_list_txt(f"Z:\\__Organized_Directories_InProgress\\GobyFinderDatasets\\AUV_datasets\\{split}.txt")

    # Everything that has been copied locally for this split
    local_split_dir = f"D:\\datasets\\full\\{split}"
    all_image_paths = [
        path
        for extension in ("png", "jpg")
        for path in glob.glob(f"{local_split_dir}\\images\\*.{extension}")
    ]
    all_label_paths = glob.glob(f"{local_split_dir}\\labels\\*.txt")
    images, labels  = Utils.list_full_set(imgs, all_image_paths, all_label_paths)

    # Informational only: compare the split list with what is on the local disk
    n_listed, n_local = len(imgs), len(all_image_paths)
    if n_listed < n_local:
        print("there are more images locally than in the split")
    elif n_listed > n_local:
        print("There are more images in the split than copied locally")

    assert len(images) == len(labels), "Mismatch between number of images and labels"
    print(f"Number of full images for {split}: {len(images)}")

    if write:
        for listing, txt_name in ((images, "images.txt"), (labels, "labels.txt")):
            Utils.write_list_txt(listing, f"{dataset_folder}\\{split}\\{txt_name}")
    return images, labels

In [None]:
# Recorded output of the last run:
#   Number of full images for train: 31324
#   Number of full images for test: 4460
#   Number of full images for validation: 3741
# Splits to list; edit as needed (e.g. add "transects")
splits = ["train", "test", "validation"]
for split in splits:
    write_split_txt_full(split, write=True)

In [None]:
# We have to convert validation images to jpg for the dataloader
def convert_val_to_hq_jpg(VAL_OUTPUT_DIR = r"D:\datasets\tiled\validation\jpg", move_pngs = False):
    """Convert every PNG found in ``VAL_OUTPUT_DIR`` to a JPEG next to it.

    Each ``<name>.png`` is converted with
    ``Utils.convert_png_to_highest_quality_jpeg`` and saved as ``<name>.jpg``
    in ``VAL_OUTPUT_DIR``.

    NOTE(review): the PNGs are read from the same folder the JPEGs are written
    to -- confirm that this is the intended source folder.

    Args:
        VAL_OUTPUT_DIR: Folder that holds the PNGs and receives the JPEGs.
            Created if it does not exist.
        move_pngs: When true, the globbed PNG files are afterwards passed to
            ``Utils.MOVE_files_lst`` with the validation png/images folder as
            destination.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(VAL_OUTPUT_DIR, exist_ok=True)
    validation_images = glob.glob(VAL_OUTPUT_DIR+"\\"+"*png")

    # Iterate and convert all files
    for png_path in tqdm(validation_images):
        if not png_path.lower().endswith('.png'):
            continue
        # Build the output name from the bare file name. Joining the full
        # source path onto VAL_OUTPUT_DIR only worked for absolute paths
        # (os.path.join then discards the first argument); with a relative
        # VAL_OUTPUT_DIR the directory part ended up duplicated.
        jpg_filename = os.path.splitext(os.path.basename(png_path))[0] + '.jpg'
        jpg_path = os.path.join(VAL_OUTPUT_DIR, jpg_filename)
        Utils.convert_png_to_highest_quality_jpeg(png_path, jpg_path)
    if move_pngs:
        Utils.MOVE_files_lst(validation_images, r"D:\datasets\tiled\validation\png\images")

In [None]:
# write tiled txt file paths
def write_split_txt_tiled(split, dataset_folder=None, write=True):
    """List the tiled images and labels of one split, optionally saving the lists.

    Images are the png files followed by the jpg files of the split's local
    tiled images folder; labels are the txt files of its labels folder. The two
    lists come from independent globs, so they are NOT guaranteed to be the
    same length or in matching order.

    Args:
        split: Split name, used in the local folder paths and output paths.
        dataset_folder: Folder under which ``<split>\\images.txt`` and
            ``<split>\\labels.txt`` are written. Required when ``write`` is true.
        write: When true, save both lists with ``Utils.write_list_txt``.

    Returns:
        Tuple ``(images, labels)`` of path lists.

    Raises:
        ValueError: If ``write`` is true but no ``dataset_folder`` was given
            (the lists would otherwise be written under a literal "None" folder).
    """
    if write and dataset_folder is None:
        raise ValueError("dataset_folder is required when write=True")

    image_folder = f'D:\\datasets\\tiled\\{split}\\images'
    label_folder = f'D:\\datasets\\tiled\\{split}\\labels'
    images = glob.glob(image_folder+"\\"+"*.png")
    images += glob.glob(image_folder+"\\"+"*.jpg")
    labels = glob.glob(label_folder+"\\"+"*.txt")
    n_images = len(images)
    n_labels = len(labels)
    # A mismatch is reported rather than asserted, so listing still succeeds
    if n_images != n_labels:
        print(f"Warning: {n_images} images but {n_labels} labels for {split}")
    print(n_images, split)
    if write:
        Utils.write_list_txt(images, f"{dataset_folder}\\{split}\\images.txt")
        Utils.write_list_txt(labels, f"{dataset_folder}\\{split}\\labels.txt")
    return images, labels
# Recorded tile counts of the last run:
#   220419 train
#   31266 test
#   26217 validation
dataset_folder = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled"
splits = ["train", "test", "validation"]
for split in splits:
    write_split_txt_tiled(split, dataset_folder=dataset_folder, write=True)

In [None]:
def return_tiled_meta_csv(split="train"):
    """Build the per-tile metadata table for one split.

    Every tile image of the split is paired with its label file, mapped back to
    the full image it was cut from (``Utils.convert_tile_img_pth_to_basename``)
    and inner-joined with ``filtered_meta`` on that full-image ``Filename``.

    In the result ``Filename`` holds the tile name (tile file name up to its
    first ".") and ``basename`` holds the full-image name.

    Args:
        split: Split name passed to ``write_split_txt_tiled``.

    Returns:
        DataFrame with one row per tile whose full image is in ``filtered_meta``.

    Raises:
        ValueError: If a tile image has no label file with the same file stem.
    """
    img_pths, lbls_pths = write_split_txt_tiled(split, write=False)

    def _stem(path):
        # File name without directory and without its last extension
        return os.path.splitext(os.path.basename(path))[0]

    # Pair image and label by file stem. The two lists come from independent
    # globs (png + jpg images vs. txt labels), so pairing them by position would
    # silently attach the wrong label whenever their order differs.
    label_by_stem = {_stem(p): p for p in lbls_pths}
    imgs_pths_df = pd.DataFrame({"image_path": list(img_pths)})
    imgs_pths_df["label_path"] = [label_by_stem.get(_stem(p)) for p in img_pths]

    unlabeled = imgs_pths_df["label_path"].isna()
    if unlabeled.any():
        examples = imgs_pths_df.loc[unlabeled, "image_path"].head(3).tolist()
        raise ValueError(
            f"{int(unlabeled.sum())} tile images in split '{split}' have no matching label file, e.g. {examples}"
        )

    imgs_pths_df['Tilename'] = imgs_pths_df.image_path.apply(lambda x: os.path.basename(x).split(".")[0])
    imgs_pths_df['Filename'] = imgs_pths_df.image_path.apply(lambda x: Utils.convert_tile_img_pth_to_basename(x))

    # Replace the full-image path columns of the metadata with the tile paths
    filtered_meta_tiles = filtered_meta.drop(columns=["image_path", "label_path"])
    filtered_meta_tiles = pd.merge(imgs_pths_df, filtered_meta_tiles, on="Filename", how="inner")
    filtered_meta_tiles = filtered_meta_tiles.rename(columns={"Filename":"basename", "Tilename": "Filename"})

    # NOTE(review): one fixed tile size is written for every row -- confirm it
    # holds for tiles from both camera types.
    filtered_meta_tiles['imw'] = 1672
    filtered_meta_tiles['imh'] = 1307
    return filtered_meta_tiles
    

def return_full_meta_csv(split="train"):
    """Join the full-image paths of one split onto the filtered metadata.

    The image and label lists from ``write_split_txt_full`` are placed side by
    side row by row, so this relies on both lists being in matching order
    (presumably guaranteed by ``Utils.list_full_set`` -- confirm). ``Filename``
    is the image file name up to its first "." and is the join key.

    Args:
        split: Split name passed to ``write_split_txt_full``.

    Returns:
        DataFrame with one row per listed image that is also in ``filtered_meta``.
    """
    img_pths, lbls_pths = write_split_txt_full(split, write=False)

    path_columns = ["image_path", "label_path"]
    imgs_pths_df = pd.DataFrame(np.c_[img_pths, lbls_pths], columns=path_columns)
    imgs_pths_df['Filename'] = imgs_pths_df.image_path.map(
        lambda image_path: os.path.basename(image_path).split(".")[0]
    )

    # The metadata's own path columns are replaced by the local paths above
    meta_without_paths = filtered_meta.drop(columns=path_columns)
    return imgs_pths_df.merge(meta_without_paths, on="Filename", how="inner")
    

# filtered_meta_tiles = return_tiled_meta_csv(split="test")
# filtered_meta_tiles.to_csv(f"Z:\\__AdvancedTechnologyBackup\\07_Database\\MetadataCombined\\Run13_tiles_metadata_{split}.csv")
# filtered_meta_full = return_full_meta_csv(split="test")
# filtered_meta_full.to_csv(f"Z:\\__AdvancedTechnologyBackup\\07_Database\\MetadataCombined\\Run13_full_metadata_{split}.csv")

In [31]:
# Tile metadata of the three dataset splits, stacked into one table
filtered_meta_test = return_tiled_meta_csv(split="test")
filtered_meta_validation = return_tiled_meta_csv(split="validation")
filtered_meta_train = return_tiled_meta_csv(split="train")
filtered_meta_all = pd.concat([filtered_meta_test, filtered_meta_validation, filtered_meta_train], ignore_index=True)

# Tiles whose collect_id contains "ABS2". na=False treats a missing collect_id
# as "no match" instead of producing a NaN mask that cannot be used for indexing.
filtered_meta_all_abs2 = filtered_meta_all[filtered_meta_all.collect_id.str.contains("ABS2", na=False)]

# Save their image and label paths as a separate test set. The selection is
# used directly instead of through a variable named `filter`, which shadowed
# the Python builtin for the rest of the notebook.
abiss2_dir = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\test_sets\\ABISS2_tiles"
Utils.write_list_txt(filtered_meta_all_abs2.image_path, f"{abiss2_dir}\\images.txt")
Utils.write_list_txt(filtered_meta_all_abs2.label_path, f"{abiss2_dir}\\labels.txt")

31266 test
26217 validation
220419 train


In [32]:
# Save the combined tile metadata (test + validation + train) to the database folder
filtered_meta_all.to_csv(
    r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\Run13_tiles_metadata_all.csv"
)

In [None]:
# Sanity check for proportions: share of the listed tiles in each split, in percent
tiled_lists_dir = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled"
train = Utils.read_list_txt(f"{tiled_lists_dir}\\train\\images.txt")
test = Utils.read_list_txt(f"{tiled_lists_dir}\\test\\images.txt")
valid = Utils.read_list_txt(f"{tiled_lists_dir}\\validation\\images.txt")

n_tiles_listed = len(train) + len(test) + len(valid)
for split_label, split_paths in (("Train", train), ("Test", test), ("Validation", valid)):
    print(split_label, len(split_paths) / n_tiles_listed * 100)

In [None]:
## Hard-Negative mining (HNM) of the training set (Test run of training data completed with 2048 weights)

hnm_run_dir = r"D:\ageglio-1\gobyfinder_yolov8\output\test_runs\Labeled data tiled 2048 HNM"


def _n_tiles(frame):
    """Number of distinct Filename values in ``frame``."""
    return len(frame.Filename.unique())


# --- Label report: rows with a conf value are treated as labeled (positive) tiles ---
lbl_report = pd.read_csv(hnm_run_dir + r"\label_box_results.csv")
has_label = ~lbl_report.conf.isna()
# NOTE: this figure counts distinct Filename values, although the message says "objects"
print("n total objects", _n_tiles(lbl_report))
print("n total background tiles", _n_tiles(lbl_report[~has_label]))
positive_filenames = lbl_report[has_label].Filename.unique()
print("n postitive tiles (with labels)", len(positive_filenames))

# --- Scores: detections without a ground_truth_id are treated as false positives ---
scores = pd.read_csv(hnm_run_dir + r"\scores.csv", index_col=0)
is_false_positive = scores.ground_truth_id.isna()
fp_conf_01 = scores[is_false_positive & (scores.conf >= 0.1)]
print("Total objects on background tiles", len(fp_conf_01), "conf>=0.1")
print("n postitive tiles (with labels)", len(positive_filenames))
print("Total Background Tiles with FPs (conf 0.1)", _n_tiles(fp_conf_01))

# False positives with 0.1 <= conf < 0.2. A tile counted here can also hold a
# detection at conf >= 0.2: the recorded counts below (22689 + 25188 > 41102)
# show that the two groups overlap.
medium_negatives = fp_conf_01[fp_conf_01.conf < 0.2]
print("'Medium-Only' Negative Tiles", _n_tiles(medium_negatives))

# False positives with conf >= 0.2: the tiles kept as hard negatives
hardest_negatives = scores[is_false_positive & (scores.conf >= 0.2)]
hardest_negative_filenames = hardest_negatives.Filename.unique()
print("Hardest Negative Tiles (conf 0.2)", len(hardest_negative_filenames))

# Final list = labeled tiles + hard-negative tiles.
# (Author's note: 'scores' alone cannot be used because it is missing the 456
# pure-FN tiles where the predictor missed the whole image.)
# NOTE(review): a labeled tile that also has an unmatched detection at
# conf >= 0.2 would be listed twice -- presumably the two groups are disjoint; verify.
all_positive_filenames_list = [*positive_filenames]
all_hardest_negative_filenames_list = [*hardest_negative_filenames]
all_filenames_to_keep = all_positive_filenames_list + all_hardest_negative_filenames_list

print("Total tiles to keep in training:", len(all_filenames_to_keep))
# This output should now be 133360

'''
n total objects 220419
n total background tiles 112247
n postitive tiles (with labels) 108172
Total objects on background tiles 57633 conf>=0.1
n postitive tiles (with labels) 108172
Total Background Tiles with FPs (conf 0.1) 41102
'Medium-Only' Negative Tiles 22689
Hardest Negative Tiles (conf 0.2) 25188
Total tiles to keep in training: 133360
'''

In [None]:
## Apply HNM filter for train set

split = "train"
tile_dir = "D:\\datasets\\tiled\\train\\images"
lbl_dir = "D:\\datasets\\tiled\\train\\labels"
dataset_folder = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled"

# Build the image and label path of every kept tile: <name>.png and <name>.txt
all_tiles_to_keep = []
all_labels_to_keep = []
for tile_name in all_filenames_to_keep:
    all_tiles_to_keep.append(os.path.join(tile_dir, tile_name + ".png"))
    all_labels_to_keep.append(os.path.join(lbl_dir, tile_name + ".txt"))
assert len(all_tiles_to_keep) == len(all_labels_to_keep)
print(len(all_tiles_to_keep)) # 133360

# This replaces the train images.txt / labels.txt written for the unfiltered tiled set
Utils.write_list_txt(all_tiles_to_keep, f"{dataset_folder}\\{split}\\images.txt")
Utils.write_list_txt(all_labels_to_keep, f"{dataset_folder}\\{split}\\labels.txt")

# Read the list back and check it has the expected number of tiles
sanity_check = Utils.read_list_txt(r"D:\ageglio-1\gobyfinder_yolov8\datasets\AUV_datasets\run13\tiled\train\images.txt")
assert len(sanity_check) == 133360