In [8]:
import glob
import pandas as pd
import os
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
from tqdm import tqdm
from utils import Utils
import os
import pandas as pd
import numpy as np

In [4]:
# Use a dictionary to map file paths for cleaner code
PATHS = {
    "op_table": r"Z:\__AdvancedTechnologyBackup\07_Database\OP_TABLE.xlsx",
    "metadata": r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\all_annotated_meta_splits_20250915.csv",
    "output_dir": r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets",
    "local_dir": r"D:\datasets",
    "addl_species": r"z:\__AdvancedTechnologyBackup\07_Database\addl_species_log.xlsx"
}

# Define the sets of collect_ids for each split
COLLECT_IDS = {
    "transects": {"20200806_001_Iver3069_ABS1", "20200816_001_Iver3069_ABS1",
                  "20210825_001_Iver3069_ABS1", "20210720_001_Iver3069_ABS1"},
    "test": {"20200809_001_Iver3069_ABS1", "20200818_001_Iver3069_ABS1", "20200902_001_Iver3069_ABS1", "20200820_001_Iver3069_ABS1", "20200821_001_Iver3069_ABS1", "20200823_001_Iver3069_ABS1",
             "20210811_001_Iver3069_ABS1", "20210812_001_Iver3069_ABS1", "20210812_002_Iver3069_ABS1", "20210719_001_Iver3069_ABS1", "20210829_001_Iver3069_ABS1", "20210911_001_Iver3069_ABS1", "20210911_002_Iver3069_ABS1", "20210925_001_Iver3069_ABS1",
             "20220624_001_Iver3069_ABS1", "20220714_002_Iver3069_ABS1", "20220727_001_Iver3069_ABS2", "20220811_002_Iver3098_ABS2", "20220807_003_Iver3069_ABS2", "20220901_001_Iver3069_ABS2", "20220814_001_Iver3069_ABS2", "20220814_002_Iver3069_ABS2",
             "20230710_001_Iver3098_ABS2", "20230909_001_Iver3069_ABS2", "20230810_002_Iver3098_ABS2", "20230727_001_Iver3098_ABS2"},
    "validation": {"20200916_001_Iver3069_ABS1", "20200922_002_Iver3069_ABS1", "20200923_002_Iver3069_ABS1",
                   "20210712_001_Iver3069_ABS1", "20210909_001_Iver3069_ABS1", "20210920_001_Iver3069_ABS1", "20210707_001_Iver3069_ABS1", "20210912_001_Iver3069_ABS1", "20210912_002_Iver3069_ABS1", "20210913_001_Iver3069_ABS1",
                   "20220711_002_Iver3069_ABS1", "20220714_003_Iver3069_ABS1", "20220717_001_Iver3098_ABS2", "20220825_001_Iver3098_ABS2", "20220914_002_Iver3069_ABS2", "20220902_001_Iver3069_ABS2",
                   "20230802_001_Iver3098_ABS2", "20230625_001_Iver3098_ABS2", "20230718_002_Iver3098_ABS2", "20230811_001_Iver3098_ABS2", "20230715_001_Iver3098_ABS2"}
}

# Ensure the output directory exists
os.makedirs(PATHS["output_dir"], exist_ok=True)

# --- 1. Load and classify all metadata in one pass ---
print("Loading and preparing metadata...")
all_annotated_meta = pd.read_csv(PATHS["metadata"], low_memory=False)

def get_split(collect_id):
    for split_name, ids in COLLECT_IDS.items():
        if collect_id in ids:
            return split_name
    return 'train'

# Classify each row with its split
all_annotated_meta['split'] = all_annotated_meta['collect_id'].apply(get_split)

# Apply initial filters and exclude NaNs in 'DistanceToBottom_m' and any additional species
addl_species_Filenames = pd.read_excel(PATHS["addl_species"]).Filename

filtered_meta = all_annotated_meta.query(
    'Usability == "Usable" and ((year == 2021 and n_fish >= 2) or (year in [2020, 2022, 2023])) and not DistanceToBottom_m.isnull()'
)
filtered_meta = filtered_meta[~filtered_meta['Filename'].isin(addl_species_Filenames)]
print("Filtered Usable data", filtered_meta.shape)
filtered_meta.to_csv(r"Z:\__AdvancedTechnologyBackup\07_Database\MetadataCombined\all_annotated_meta_splits_filtered_20251030.csv")
# --- 2. Generate and save the text files for each split ---
for split_name in ['train', 'test', 'validation', 'transects']:
    # Get filenames for the current split
    filenames = filtered_meta[filtered_meta['split'] == split_name]['Filename'].tolist()
    
    # Write each filename on a new line
    output_path = os.path.join(PATHS["output_dir"], f"{split_name}.txt")
    output_path = os.path.join(PATHS["local_dir"], f"{split_name}.txt")
    with open(output_path, 'w') as f:
        # The key change is here: use '\n'.join()
        f.write('\n'.join(filenames))
        
    print(f"Saved {len(filenames)} filenames to {output_path}")

print("\n--- Generating statistics reports ---")

# --- 3. Group and calculate summary statistics ---
df_grouped = filtered_meta.groupby(["collect_id", "split"]).agg(
    n_images=('Filename', 'count'),
    n_fish_p_collect=('n_fish', 'sum')
).reset_index()

df_grouped['n_fish_p_image'] = df_grouped['n_fish_p_collect'] / df_grouped['n_images']
df_grouped['year'] = df_grouped['collect_id'].str[:4].astype(int)
df_grouped['camera'] = df_grouped['collect_id'].str[-4:]

# --- 4. Merge with op_table for additional info ---
op_table = pd.read_excel(PATHS["op_table"])
df_stats = df_grouped.merge(
    op_table[["COLLECT_ID", "LAKE_NAME", "MISSION_NAME", "PORT_NAME", "LATITUDE", "LONGITUDE"]],
    left_on="collect_id",
    right_on="COLLECT_ID",
    how='left'
).drop(columns='COLLECT_ID')

df_stats.to_csv(f"{PATHS['output_dir']}/Run13_collect_stats.csv", index=False)
df_stats.to_csv(f"{PATHS['local_dir']}/Run13_collect_stats.csv", index=False)
# --- 5. Export individual and combined summary CSVs ---
for split_name, df_split in df_stats.groupby('split'):
    print(f"Total images for {split_name}:", df_split['n_images'].sum())

# combined_df.to_csv(f"{PATHS['output_dir']}/yearly_split_stats.csv", index=False)
print("\nAll stats CSVs generated successfully.")

Loading and preparing metadata...
Filtered Usable data (46913, 170)
Saved 31324 filenames to D:\datasets\train.txt
Saved 4460 filenames to D:\datasets\test.txt
Saved 3741 filenames to D:\datasets\validation.txt
Saved 7388 filenames to D:\datasets\transects.txt

--- Generating statistics reports ---
Total images for test: 4460
Total images for train: 31324
Total images for transects: 7388
Total images for validation: 3741

All stats CSVs generated successfully.


--- Generating statistics reports ---
Total images for test: 4460
Total images for train: 31324
Total images for transects: 7388
Total images for validation: 3741

In [3]:
import pandas as pd
import numpy as np
# --- Initial Data Loading ---
# Note: The original path uses a Windows-style path string, which is fine, 
# but using a raw string (r"...") can prevent potential issues with backslashes.
summary = pd.read_csv(r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\AUV_datasets\Run13_collect_stats.csv")

# --- Initial Data Cleanup/Prep ---
# Simplify the camera column extraction immediately
summary['camera'] = summary['collect_id'].str.split('_').str[-1]

# --- Core Refactoring: Aggregate and Pivot ---

# 1. Group by year, camera, and split, then sum n_images
# 2. Pivot the 'split' column to make 'train', 'test', 'validation' separate columns
df_pivot = summary.pivot_table(
    index=['year', 'camera'],
    columns='split',
    values='n_images',
    aggfunc='sum'
).reset_index()

# 3. Rename columns for clarity (matching your original final names)
df_pivot = df_pivot.rename(columns={
    'train': 'n images train',
    'test': 'n images test',
    'validation': 'n images validation'
})

# 4. Calculate total images and percentages
total_col = (
    df_pivot['n images train'] + 
    df_pivot['n images test'] + 
    df_pivot['n images validation']
)
df_pivot['n_images_total'] = total_col.astype(int)
df_pivot['test (%)'] = (df_pivot['n images test'] / total_col * 100).round(2)
df_pivot['validation (%)'] = (df_pivot['n images validation'] / total_col * 100).round(2)

# --- Final Calculations (Based on the original logic) ---

# Calculate n_tiles (now a single operation using np.where)
# The logic: n_images * 9 if 'ABS2' is in 'camera', else n_images * 6
df_pivot['n_tiles'] = np.where(
    df_pivot['camera'].str.contains('ABS2', na=False),
    total_col * 9,
    total_col * 6
).astype(int)

# 5. Drop the individual split columns and rename the total column
summary_run13 = df_pivot.drop(
    columns=['n images train', 'n images test', 'n images validation']
)
summary_run13

split,year,camera,transects,n_images_total,test (%),validation (%),n_tiles
0,2020,ABS1,4570.0,10104,11.22,9.82,60624
1,2021,ABS1,2818.0,13705,11.63,8.9,82230
2,2022,ABS1,,2166,10.62,12.56,12996
3,2022,ABS2,,5120,12.68,7.85,46080
4,2023,ABS2,,8430,10.12,10.14,75870


In [4]:
print("n tiles", summary_run13.n_tiles.sum())
print("n tiles by camera", summary_run13.groupby(by="camera").n_tiles.sum())
# summary_run13.groupby(by="camera").n_images_total.sum()

n tiles 277800
n tiles by camera camera
ABS1    155850
ABS2    121950
Name: n_tiles, dtype: int32


In [5]:
n_im_col = df_stats.n_images
df_stats['n_tiles'] = np.where(
    df_stats['camera'].str.contains('ABS2', na=False),
    n_im_col * 9,
    n_im_col * 6
).astype(int)
train_tiles = df_stats[df_stats['split'] == 'train'].n_tiles.sum()
print(f"Total tiles in train set: {train_tiles}")
test_tiles = df_stats[df_stats['split'] == 'test'].n_tiles.sum()
print(f"Total tiles in test set: {test_tiles}")
validation_tiles = df_stats[df_stats['split'] == 'validation'].n_tiles.sum()
print(f"Total tiles in validation set: {validation_tiles}")

Total tiles in train set: 220317
Total tiles in test set: 31266
Total tiles in validation set: 26217


In [6]:
# write tiled txt file paths
def write_split_txt_tiled(split, write=True):
    dataset_folder = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled"
    new_image_folder = f'D:\\datasets\\tiled\\{split}\\images'
    new_label_folder = f'D:\\datasets\\tiled\\{split}\\labels'
    images = glob.glob(new_image_folder+"\\"+"*.png")
    images += glob.glob(new_image_folder+"\\"+"*.jpg")
    labels = glob.glob(new_label_folder+"\\"+"*.txt")
    n_images = len(images)
    n_labels = len(labels)
    # assert n_images == n_labels
    print(n_images, split)
    if write:
        Utils.write_list_txt(images, f"{dataset_folder}\\{split}\\images.txt")
        Utils.write_list_txt(labels, f"{dataset_folder}\\{split}\\labels.txt")
    return images, labels

# Write Flull images txt files
def write_split_txt_full(split, write=True):
    dataset_folder = "D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\full"
    imgs = Utils.read_list_txt(f"Z:\\__Organized_Directories_InProgress\\GobyFinderDatasets\\AUV_datasets\\{split}.txt")
    all_image_paths = glob.glob(f"D:\\datasets\\full\\{split}\\images\\*.png")
    all_image_paths += glob.glob(f"D:\\datasets\\full\\{split}\\images\\*.jpg")
    all_label_paths = glob.glob(f"D:\\datasets\\full\\{split}\\labels\\*.txt")
    images, labels  = Utils.list_full_set(imgs, all_image_paths, all_label_paths)
    if len(imgs) < len(all_image_paths):
        print("there are more images locally than in the split")
    elif len(imgs) > len(all_image_paths):
        print("There are more images in the split than copied locally")
    assert len(images) == len(labels), "Mismatch between number of images and labels"
    print(f"Number of full images for {split}: {len(images)}")
    if write:
        Utils.write_list_txt(images, f"{dataset_folder}\\{split}\\images.txt")
        Utils.write_list_txt(labels, f"{dataset_folder}\\{split}\\labels.txt")
    return images, labels

In [None]:
# Number of full images for train: 31324
# Number of full images for test: 4460
# Number of full images for validation: 3741
# splits = ["train", "test", "validation"]  # Change this to 'train', 'test', or 'transects' as needed
# for split in splits:
#     write_split_txt_full(split)
# 26217 validation
# 220419 train
# 31266 test
# splits = ["train", "test", "validation"]
# for split in splits:
#     write_split_txt_tiled(split)

In [12]:
def save_tiled_meta_csv(split="train"):
    train_imgs, train_lbls = write_split_txt_tiled(split, write=False)
    train_imgs_df = pd.DataFrame(np.c_[train_imgs, train_lbls], columns=["image_path", "label_path"])
    train_imgs_df['Tilename'] = train_imgs_df.image_path.apply(lambda x: os.path.basename(x).split(".")[0])
    train_imgs_df['Filename'] = train_imgs_df.image_path.apply(lambda x: Utils.convert_tile_img_pth_to_basename(x))
    filtered_meta_tiles = filtered_meta.drop(columns=["image_path", "label_path"])
    filtered_meta_tiles = pd.merge(train_imgs_df, filtered_meta_tiles, on="Filename", how="inner")
    filtered_meta_tiles = filtered_meta_tiles.rename(columns={"Filename":"basename", "Tilename": "Filename"})
    filtered_meta_tiles['imw'] = 1672
    filtered_meta_tiles['imh'] = 1307
    filtered_meta_tiles.to_csv(f"D:\\ageglio-1\\gobyfinder_yolov8\\output\\test_runs\\Labeled data tiled 2048 HNM\\tiles_metadata_{split}.csv")
save_tiled_meta_csv(split="validation")

26217 validation


In [8]:
# Moving the tiles to train test split
split = "Train"
def move_all_tiled_imgs_to_splits(split):
     dst_im_folder = r"D:\datasets\full\unusable\tiled\images"
     dst_lb_folder = r"D:\datasets\full\unusable\tiled\labels"
     # ... all your variable declarations ...
     all_tiled_image_paths = glob.glob(f"D:\\datasets\\full\\tiled\\images\\*.png")
     all_tiled_label_paths = glob.glob(f"D:\\datasets\\full\\tiled\\labels\\*.txt")
     all_image_paths = glob.glob(f"D:\\datasets\\full\\{split}\\images\\*.png")
     all_label_paths = glob.glob(f"D:\\datasets\\full\\{split}\\labels\\*.txt")
     # Make sure all your destination directories exist
     os.makedirs(dst_im_folder, exist_ok=True)
     os.makedirs(dst_lb_folder, exist_ok=True)

     imgs = Utils.read_list_txt(f"Z:\\__Organized_Directories_InProgress\\GobyFinderDatasets\\AUV_datasets\\{split}.txt")
     b_images = set(imgs) # Use a set for faster lookups!

     for im_path, lb_path in zip(all_tiled_image_paths, all_tiled_label_paths):
          basename_tile_w_ext = os.path.basename(lb_path)
          img_basename = basename_tile_w_ext.rsplit('_', 2)[0]
          # Check if the image basename (e.g., "001.png") is NOT in the desired list
          if img_basename not in b_images:
               # Construct the label path using the image's basename (minus extension)
               base_name_no_ext = os.path.splitext(basename_tile_w_ext)[0]
               #    lb_path = os.path.join(label_dir, base_name_no_ext + ".txt")

               #    # Move files
               shutil.move(im_path, dst_im_folder)
               
               #    # Add a check to ensure the corresponding label file actually exists before moving
               if os.path.exists(lb_path):
                    shutil.move(lb_path, dst_lb_folder)
               else:
                    print(f"Warning: Label file not found for {basename_tile_w_ext} at {lb_path}")


In [9]:
# We have to convert validation images to jpg for the dataloader

def convert_val_to_hq_jpg(VAL_OUTPUT_DIR = r"D:\datasets\tiled\validation\jpg", move_pngs = False):

    validation_images = glob.glob(VAL_OUTPUT_DIR+"\\"+"*png")
    # Create the output directory if it doesn't exist
    os.makedirs(VAL_OUTPUT_DIR, exist_ok=True)

    # Iterate and convert all files
    for filename in tqdm(validation_images):
        if filename.lower().endswith('.png'):
            png_path = filename
            jpg_filename = os.path.splitext(filename)[0] + '.jpg'
            jpg_path = os.path.join(VAL_OUTPUT_DIR, jpg_filename)
            Utils.convert_png_to_highest_quality_jpeg(png_path, jpg_path)
    if move_pngs: 
        Utils.MOVE_files_lst(validation_images, r"D:\datasets\tiled\validation\png\images")

In [None]:
# final sanity check
train = Utils.read_list_txt("D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled\\train\\images.txt")
test = Utils.read_list_txt("D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled\\test\\images.txt")
valid = Utils.read_list_txt("D:\\ageglio-1\\gobyfinder_yolov8\\datasets\\AUV_datasets\\run13\\tiled\\validation\\images.txt")
# Train sest
print("Train", len(train)/(len(train)+len(test)+len(valid))*100)
# Test set
print("Test", len(test)/(len(train)+len(test)+len(valid))*100)
# Validation set
print("Validation", len(valid)/(len(train)+len(test)+len(valid))*100)

Train 79.31537016646156
Test 11.250728674136926
Validation 9.433901159401517
