In [None]:
import glob
import os
import sys

import numpy as np
import pandas as pd

# Make the project-local "src" folder importable before importing from it.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
from utils import Utils

# GoPro QAQC round 2

## Dataframe of all images and associated labels

In [None]:
# Gather the label (.txt) and image (.jpg) file paths for the LMBS and GLSC GoPro datasets.
# NOTE(review): Utils.list_files is project-local; presumably it returns a list of paths with
# the given extension. Confirm its ordering, because a later cell compares label IDs and
# image IDs position by position.
lmbs_label_path = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\LMBS\labels"
lmbs_label_files = Utils.list_files(lmbs_label_path, ".txt")

lmbs_image_path = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\LMBS\images"
lmbs_image_files = Utils.list_files(lmbs_image_path, ".jpg")
# Only the counts are compared here; matching of individual file names is checked in a later cell.
assert len(lmbs_label_files) == len(lmbs_image_files), "Label and image file counts do not match."

glsc_label_path = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\GLSC\GLSC_unique_qaqc\labels"
glsc_label_files = Utils.list_files(glsc_label_path, ".txt")

glsc_image_path = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\GLSC\GLSC_unique_qaqc\images"
glsc_image_files = Utils.list_files(glsc_image_path, ".jpg")
assert len(glsc_label_files) == len(glsc_image_files), "Label and image file counts do not match."
# The bare string below is a no-op expression; presumably it records the two counts
# printed underneath (LMBS, then GLSC) from an earlier run — TODO confirm.
'''
1075
2173'''
print(len(lmbs_image_files))
print(len(glsc_image_files))

In [None]:
import re

# Month abbreviation -> month number. Both 'sep' and 'sept' map to 9 because
# the filenames use either spelling for September.
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'sept': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

# Create dataframe from LMBS image file paths (one row per image)
lmbs_df = pd.DataFrame({'image_path': lmbs_image_files})

# Extract the date string (e.g., "01aug23" or "01sept23") from the filename
def extract_date_from_path(path):
    """Return the date token embedded in a file name, or None if there is none.

    Only the base name of ``path`` is searched. The token is two digits, three
    or four letters (any case), then two digits, e.g. "01aug23" or "01sept23".
    The first such token is returned exactly as it appears in the name.
    """
    filename = os.path.basename(path)
    found = re.search(r'(\d{2}[a-z]{3,4}\d{2})', filename, re.IGNORECASE)
    if found is None:
        return None
    return found.group(1)

# One date token per image; None where the file name contains no date.
lmbs_df['date_str'] = lmbs_df['image_path'].apply(extract_date_from_path)

# Parse day, month, year from the date string
def parse_date(date_str):
    """Split a compact date token such as "01aug23" or "01sept23" into numbers.

    The expected layout is two day digits, a 3- or 4-letter month name and two
    year digits (the shape produced by ``extract_date_from_path``). The year is
    expanded by prefixing "20".

    Args:
        date_str: Date token, or a falsy value (``None`` / ``""``) when the
            file name contained no date.

    Returns:
        ``(day, month, year)`` as ints, or ``(None, None, None)`` for a falsy input.

    Raises:
        KeyError: If the month letters are not in ``month_map``, even after
            falling back to their first three letters.
        ValueError: If the day or year characters are not digits.
    """
    if not date_str:
        return None, None, None

    # Day and year have fixed widths at either end; whatever sits between them
    # is the month, so 4-letter months no longer shift the year slice.
    day = int(date_str[:2])
    year = int('20' + date_str[-2:])
    month_token = date_str[2:-2].lower()

    # Exact spelling first ('sept'); otherwise use the 3-letter prefix so other
    # 4-letter spellings such as 'june' or 'july' resolve as well.
    if month_token in month_map:
        month = month_map[month_token]
    else:
        month = month_map[month_token[:3]]
    return day, month, year

# Expand each (day, month, year) tuple returned by parse_date into the three
# columns; rows without a date token receive None in all three.
lmbs_df[['day', 'month', 'year']] = lmbs_df['date_str'].apply(lambda x: pd.Series(parse_date(x)))

lmbs_df.head()

In [None]:
# Show the LMBS images whose file-name date falls in May 2022.
lmbs_df[(lmbs_df.year==2022)&(lmbs_df.month==5)]

In [None]:
## round 2 checks

# Pool the LMBS and GLSC image paths into one list (LMBS first)
imgs_all = lmbs_image_files + glsc_image_files

# Pool the label paths the same way, in the same dataset order
lbls_all = lmbs_label_files + glsc_label_files

# A file's ID is its base name up to the first "." (extension dropped)
imgs_all_ids = [os.path.basename(img_path).split(".")[0] for img_path in imgs_all]
lbls_all_ids = [os.path.basename(lbl_path).split(".")[0] for lbl_path in lbls_all]

# No label ID may appear twice across the two datasets
assert len(set(lbls_all_ids)) == len(lbls_all_ids)

# No image ID may appear twice across the two datasets
assert len(set(imgs_all_ids)) == len(imgs_all_ids)

# Labels and images must line up ID for ID, in the same order
assert imgs_all_ids == lbls_all_ids

# Report the total number of image files
print(len(imgs_all)) # 3248

In [None]:
## return a dataframe of all images and labels to perform a train/test/validation split
# gopro_all = generate_splits(img_pth_lst = imgs_all, bbox_pths=lbls_all, mer_pths=None).return_merged()
# functions to check the image size distributions to determine the best size to train YOLO
def get_im_w(image_path):
    """Return the width, in pixels, of the image stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        The first element of the opened image's ``size`` tuple (the width).
    """
    # Imported here because the notebook only imports PIL in a later cell, so a
    # top-to-bottom run would otherwise fail with a NameError on ``Image``.
    from PIL import Image

    # The context manager closes the file handle that Image.open() keeps open.
    with Image.open(image_path) as im:
        width, _height = im.size
    return width
def get_im_h(image_path):
    """Return the height, in pixels, of the image stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        The second element of the opened image's ``size`` tuple (the height).
    """
    # Imported here because the notebook only imports PIL in a later cell, so a
    # top-to-bottom run would otherwise fail with a NameError on ``Image``.
    from PIL import Image

    # The context manager closes the file handle that Image.open() keeps open.
    with Image.open(image_path) as im:
        _width, height = im.size
    return height
# Thin wrappers around get_im_w / get_im_h; only referenced by the commented-out
# .apply() line below, which computed the imh / imw columns.
lim_w = lambda f: get_im_w(f)
lim_h = lambda f: get_im_h(f)
# gopro_all["imh"], gopro_all["imw"] = gopro_all.image_path.apply(lim_h), gopro_all.image_path.apply(lim_w)
# Load the previously saved table of image/label paths instead of rebuilding it.
# NOTE(review): later cells read the Filename, imh, imw, image_path and bbox_path
# columns from this frame, so the CSV must contain them.
gopro_all = pd.read_csv(r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\COMBINED\GLSC_LMBS_dataset - round2\all_images_label_paths.csv")
gopro_all
# gopro_all.groupby(by=["imh","imw"]).count()

        | imh  | imw  | image_path | Filename | image_id | bbox_path | mer_path | year |
        |------|------|------------|----------|----------|-----------|----------|------|
        | 1080 | 1920 | 40         | 40       | 40       | 40        | 0        | 0    |
        | 1440 | 1920 | 67         | 67       | 67       | 67        | 0        | 0    |
        | 2304 | 3072 | 83         | 83       | 83       | 83        | 0        | 0    |
        | 2760 | 3680 | 277        | 277      | 277      | 277       | 0        | 0    |
        | 2880 | 3840 | 14         | 14       | 14       | 14        | 0        | 0    |
        | 3000 | 4000 | 2033       | 2033     | 2033     | 2033      | 0        | 0    |
        | 3888 | 5184 | 734        | 734      | 734      | 734       | 0        | 0    |


## Create Train Test Split

In [None]:
## Do a train/test/valid split on the dataframe
def do_train_test_valid_split(df, train_split=0.7, valid_split=0.2):
    """Randomly partition ``df`` into train, validation and test subsets.

    The train and validation sizes are ``int(split * len(df))``; every row not
    drawn for either becomes test data. Both draws use ``random_state=42``, so
    the partition is repeatable for the same input frame.

    Args:
        df: Frame with ``image_path`` and ``bbox_path`` columns.
        train_split: Fraction of all rows sampled for training.
        valid_split: Fraction of all rows sampled for validation.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test, train_df,
        valid_df, test_df)``, where each X is the subset's ``image_path`` values
        and each y its ``bbox_path`` values.
    """
    n_rows = len(df)
    print("n samples:", n_rows)

    # Draw train first, then draw validation from whatever is left.
    train_df = df.sample(int(train_split * n_rows), random_state=42)
    remaining = df.drop(train_df.index)
    valid_df = remaining.sample(int(valid_split * n_rows), random_state=42)
    test_df = remaining.drop(valid_df.index)

    def _paths(subset):
        # Image paths and their matching label paths as arrays.
        return subset.image_path.values, subset.bbox_path.values

    X_train, y_train = _paths(train_df)
    X_valid, y_valid = _paths(valid_df)
    X_test, y_test = _paths(test_df)

    print(f"training, testing, validation, {X_train.shape[0]}, {X_test.shape[0]},{X_valid.shape[0]}")
    return X_train, y_train, X_valid, y_valid, X_test, y_test, train_df, valid_df, test_df
# 80% train, 10% validation; the rows left over form the test set.
X_train, y_train, X_valid, y_valid, X_test, y_test, train_df, valid_df, test_df = do_train_test_valid_split(gopro_all, train_split=0.8, valid_split=0.1)

In [None]:
## copy images and labels
import shutil
def cpy_lbls(set, dst):
    """Copy every file in ``set`` into the directory ``dst``, creating it if needed.

    Args:
        set: Iterable of source file paths. The name shadows the built-in
            ``set``; it is kept so existing callers are unaffected.
        dst: Destination directory; missing parent directories are created too.
    """
    # exist_ok=True replaces the separate exists() check: it matches how
    # create_image_label_batches makes its folders and has no check-then-create race.
    os.makedirs(dst, exist_ok=True)
    for item in set:
        # copy2 copies the file contents and attempts to preserve file metadata.
        shutil.copy2(item, dst)
root =  r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Proj_GobyFinder\gobyfinder_yolov8\datasets\GoPro_datasets\COMBINED\GLSC_LMBS_dataset - round2"
# cpy_lbls(y_train, os.path.join(root, "train\\labels"))
# cpy_lbls(y_valid, os.path.join(root, "validation\\labels"))
# cpy_lbls(y_test, os.path.join(root, "test\\labels"))
# cpy_lbls(X_train, os.path.join(root, "train\\images"))
# cpy_lbls(X_valid, os.path.join(root, "validation\\images"))
# cpy_lbls(X_test, os.path.join(root, "test\\images"))

## Batches for MakeSense.ai QAQC

In [None]:
root = r"datasets\GoPro_datasets\GLSC_unique_qaqc"
def create_image_label_batches(root, batch_size=100):
    """Copy a dataset's images and labels into fixed-size batch folders.

    Reads ``<root>/images/*.jpg`` and ``<root>/labels/*.txt``, pairs them in
    sorted order, and copies each group of ``batch_size`` pairs into
    ``<root>/batches/batch_<idx>/images`` and ``.../labels``. Each batch folder
    also receives a ``labels.txt`` file listing the label file names of that
    batch, one per line.

    Args:
        root: Dataset directory containing ``images`` and ``labels`` sub-folders.
        batch_size: Maximum number of image/label pairs per batch.

    Raises:
        AssertionError: If the image and label counts differ, or if the sorted
            images and labels do not share the same file names (ignoring the
            extension). Nothing is copied in either case.
    """
    batch_folder = "batches"
    images = sorted(glob.glob(os.path.join(root, "images", "*.jpg")))
    labels = sorted(glob.glob(os.path.join(root, "labels", "*.txt")))
    assert len(images) == len(labels), "Number of images and labels do not match"

    # Equal counts do not prove the files pair up: images and labels are matched
    # purely by sorted position, so verify the names agree before copying anything.
    def file_stem(path):
        return os.path.splitext(os.path.basename(path))[0]

    mismatched = [(img, lbl) for img, lbl in zip(images, labels) if file_stem(img) != file_stem(lbl)]
    assert not mismatched, f"Image/label names do not pair up, first mismatch: {mismatched[0]}"

    # Helper function to split a list into chunks
    def chunk_list(lst, chunk_size):
        return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

    # Create chunks of images and labels (same boundaries, so pairs stay together)
    image_chunks = chunk_list(images, batch_size)
    label_chunks = chunk_list(labels, batch_size)

    for idx, (image_chunk, label_chunk) in enumerate(zip(image_chunks, label_chunks)):
        batch_dir = os.path.join(root, batch_folder, f"batch_{idx}")
        image_dst = os.path.join(batch_dir, "images")
        label_dst = os.path.join(batch_dir, "labels")

        # Create directories if they don't exist
        os.makedirs(image_dst, exist_ok=True)
        os.makedirs(label_dst, exist_ok=True)

        # Copy images and labels to their respective batch directories
        for img_src, lbl_src in zip(image_chunk, label_chunk):
            shutil.copy2(img_src, image_dst)
            shutil.copy2(lbl_src, label_dst)

        # Create a labels.txt file listing all label files in the batch
        labels_list_path = os.path.join(batch_dir, "labels.txt")
        label_files = [os.path.basename(lbl) for lbl in label_chunk]
        with open(labels_list_path, "w") as f:
            f.write("\n".join(label_files))

## LMBS Cage Goby Analysis

In [None]:
# Parse box text files (fish labels or cage boxes) into one long-format dataframe
def process_labels(label_list):
    """Read box text files into a dataframe with one row per box.

    Each file is expected to hold lines of five whitespace-separated fields:
    ``cls x y w h``. Lines with any other number of fields (including blank
    lines) are skipped. A file that yields no valid line still contributes one
    row, with NaN in every box column, so its ``Filename`` is not lost.

    Args:
        label_list: Iterable of paths to the text files.

    Returns:
        DataFrame with columns ``Filename, cls, x, y, w, h``. ``Filename`` is
        the base name up to its first "."; ``cls`` is kept as the string read
        from the file; ``x, y, w, h`` are floats.

    Raises:
        ValueError: If a five-field line has a non-numeric coordinate.
    """
    columns = ["Filename", "cls", "x", "y", "w", "h"]
    missing = float("nan")  # same value as np.nan, without needing numpy here

    # Collect plain dicts and build the frame once; concatenating inside the
    # loop re-copies every earlier row on each iteration.
    records = []
    for label in label_list:
        Filename = os.path.basename(label).split(".")[0]
        # read the label file
        with open(label, "r") as f:
            lines = f.readlines()

        file_records = []
        for line in lines:
            parts = line.strip().split()
            # Skip malformed lines rather than reusing values left over from the previous line.
            if len(parts) != 5:
                continue
            name, x, y, w, h = parts
            file_records.append({"Filename": Filename, "cls": name, "x": float(x), "y": float(y), "w": float(w), "h": float(h)})

        # Empty file (or no usable line): keep a placeholder row for this file.
        if not file_records:
            file_records.append({"Filename": Filename, "cls": missing, "x": missing, "y": missing, "w": missing, "h": missing})
        records.extend(file_records)

    return pd.DataFrame(records, columns=columns)

def intersection_df(fish_box_df, cage_box_df, img_label_filepath_df, input_type="bboxes"):
    """Attach the cage box to every fish box and score how much of the fish lies inside it.

    Args:
        fish_box_df: One row per fish box, with ``Filename, cls, x, y, w, h``.
            Unless ``input_type == "labels"``, it must also already carry
            ``imh``, ``imw``, ``conf`` and an ``f"{input_type}_id"`` column,
            because those are selected in the result.
        cage_box_df: Cage boxes with ``Filename, cls, x, y, w, h``. More than one
            cage row per ``Filename`` trips the row-count assertion.
        img_label_filepath_df: Frame with ``Filename, imh, imw``; only used when
            ``input_type == "labels"``.
        input_type: ``"labels"`` for annotated boxes: image size is merged in, a
            ``labels_id`` of the form ``<Filename>_<n>`` is generated and
            ``conf`` is set to 1.0. Any other value (the notebook uses
            ``"detect"``) only names the id column expected in ``fish_box_df``.

    Returns:
        DataFrame with columns ``Filename, cls_f, x_f, y_f, w_f, h_f, x_b, y_b,
        w_b, h_b, imh, imw, <input_type>_id, intersection, inside, conf``.
        ``intersection`` is NaN for rows containing any null value; ``inside``
        is 1 where ``intersection > 0.5`` and 0 otherwise (including NaN).

    Raises:
        AssertionError: If the merge with ``cage_box_df`` changes the row count.
        KeyError: If a required column is missing.
    """
    # Left join keeps every fish box; suffixes mark fish (_f) versus cage (_b) columns.
    df_all = pd.merge(fish_box_df, cage_box_df, on="Filename", how="left", suffixes=("_f", "_b"))

    # Ensure the number of rows in the merged dataframe matches the number of rows in the labels dataframe
    assert len(df_all) == len(fish_box_df), "lengths do not match"

    if input_type == "labels":
        # Bring in image height and width for each file
        df_all = pd.merge(df_all, img_label_filepath_df, on="Filename", how="left")
        # create a unique fish ID for each fish in the image (running count per file)
        fish_id = df_all.groupby("Filename").cumcount()
        df_all[f"{input_type}_id"] = df_all["Filename"] + "_" + fish_id.astype(str)
        # Annotated boxes carry no model confidence, so they are given full confidence.
        df_all["conf"] = 1.0

    # NOTE(review): calculate_intersection is not defined or imported in this
    # notebook; it must already exist in the kernel — confirm where it comes from.
    # Rows with any missing value (e.g. no cage box, or an image without fish) are left as NaN.
    df_all['intersection'] = df_all.apply(
        lambda row: calculate_intersection().get_intersection(row) if not row.isnull().any() else np.nan,
        axis=1
    )

    # A fish counts as inside the cage when the intersection score exceeds 0.5.
    df_all["inside"] = np.where(df_all.intersection>0.5, 1, 0)
    df_all = df_all[['Filename', 'cls_f', 'x_f', 'y_f', 'w_f', 'h_f', 'x_b', 'y_b', 'w_b', 'h_b', 'imh', 'imw', f'{input_type}_id', 'intersection', 'inside', 'conf']]
    return df_all

In [None]:
### Annotated Labels analysis
# Set before the intersection_df() call below: "labels" selects the branch that
# merges in image sizes, generates the per-fish IDs and sets conf to 1.0.
input_type = "labels"

# IDs (file names without extension) of the images held out for testing
test_set = [os.path.basename(x).split(".")[0] for x in X_test]
# Define the root directory for the LMBS dataset
root = r"Z:\__Organized_Directories_InProgress\GobyFinderDatasets\GoPro_datasets\LMBS"

# Get a sorted list of all label files in the directory
bbox_list = sorted(glob.glob(os.path.join(root, "labels", "*.txt")))

# Get a sorted list of all cage bounding box files in the directory
cage_box_list = sorted(glob.glob(os.path.join(root, "cages", "all_boxes", "*.txt")))

# Image height/width per file name, taken from the combined GoPro table
img_label_filepath_df = gopro_all[["Filename", "imh", "imw"]]

# Process the label files into a dataframe
label_box_df = process_labels(bbox_list)

# Process the cage bounding box files into a dataframe
cage_box_df = process_labels(cage_box_list)

# Perform the intersection analysis for the annotated labels
df_gopro_label_analysis = intersection_df(label_box_df, cage_box_df, img_label_filepath_df, input_type=input_type)
# Flag the rows whose image belongs to the held-out test split
df_gopro_label_analysis["test_set"] = np.where(df_gopro_label_analysis["Filename"].isin(test_set), 1, 0)

# df_gopro_label_analysis.to_csv(os.path.join(root, f"{input_type}_with_cages_intersection.csv"), index=False)
df_gopro_label_analysis


In [None]:
### GobyFinder GoPro prediction analysis
# IDs (file names without extension) of the images held out for testing
test_set = [os.path.basename(x).split(".")[0] for x in X_test]

# NOTE(review): `root` is not assigned in this cell. It holds whatever an earlier cell
# last set (the LMBS root from the annotated-label cell when run top to bottom),
# and it decides both where the cage boxes are read and where the CSV is written.
# Get a sorted list of all cage bounding box files in the directory
cage_box_list = sorted(glob.glob(os.path.join(root, "cages", "all_boxes", "*.txt")))

# Image height/width per file name, taken from the combined GoPro table
img_label_filepath_df = gopro_all[["Filename", "imh", "imw"]]

# Load the GobyFinder predictions from CSV (the first CSV column becomes the index)
prediction_box_df = pd.read_csv(r"Z:\__AdvancedTechnologyBackup\04_ProjectData\Proj_GobyFinder\gobyfinder_yolov8\inference\GoPro_Round2_Inference\GoPro_Round2_Inference_predictions.csv", index_col=0)

# set the min confidence level to filter the predictions
prediction_box_df = prediction_box_df[prediction_box_df.conf>=0.1]

# Process the cage bounding box files into a dataframe
cage_box_df = process_labels(cage_box_list)


# Perform the intersection analysis for the predictions, and save the results to a CSV file.
# With input_type="detect" intersection_df adds no columns of its own, so the predictions
# CSV presumably already provides imh, imw, detect_id and conf — TODO confirm.
input_type="detect"
df_gopro_prediction_analysis = intersection_df(prediction_box_df, cage_box_df, img_label_filepath_df, input_type=input_type)
# Flag the rows whose image belongs to the held-out test split
df_gopro_prediction_analysis["test_set"] = np.where(df_gopro_prediction_analysis["Filename"].isin(test_set), 1, 0)
df_gopro_prediction_analysis.to_csv(os.path.join(root, f"{input_type}_with_cages_intersection.csv"), index=False)
df_gopro_prediction_analysis

## Start here to calculate LMBS numbers

In [None]:
import matplotlib.pyplot as plt

# Create a histogram for the intersection column of the annotated-label analysis
plt.figure(figsize=(4, 3))
# df_all is an alias of the label-analysis frame, not a copy.
df_all = df_gopro_label_analysis
# Only partial overlaps are plotted: values of exactly 0 or 1 (and NaN) are excluded.
df_all[(df_all.intersection > 0) & (df_all.intersection < 1)].intersection.hist(bins=30, color='skyblue', edgecolor='black')

# Add title and labels
plt.title('Distribution of Intersection Values', fontsize=14)
plt.xlabel('Intersection', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Customize ticks
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Add grid for better readability (horizontal grid lines, dashed)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Save the plot as a high-resolution image (optional)
# plt.savefig('intersection_histogram.png', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

Overall Validation

In [None]:
import glob
import matplotlib.pyplot as plt
from PIL import Image
import random as rand

# Sort the glob results: glob returns files in arbitrary order, so the seeded
# sample below is only repeatable when the candidate list itself is ordered.
images = sorted(glob.glob(r"C:\Users\ageglio\ageglio-1\gobyfinder_yolov8\Gopro_plot_imgs\imgs\*"))
seed=1984
rand.seed(seed)  # Set the random seed for reproducibility
# Take at most 16 images; rand.sample raises ValueError if asked for more than exist.
images = rand.sample(images, min(16, len(images)))

fig, ax = plt.subplots(4, 4, figsize=(9, 7))
for i, img_path in enumerate(images):
    # Open inside a context manager so the source file is closed once the crop is taken.
    with Image.open(img_path) as src:
        w = int(src.size[0] / 2)
        h = int(src.size[1] / 2)
        hor = rand.randint(0, src.size[0] - w)
        vert = rand.randint(0, src.size[1] - h)
        # Take a random zoomed sample of the image (half width, half height),
        # then bring every sample to the same 840x640 size.
        img = src.crop((hor, vert, hor + w, vert + h)).resize((840, 640))
    ax[i // 4, i % 4].imshow(img)
    ax[i // 4, i % 4].axis('off')

# Blank out any panels left unused when fewer than 16 images were available.
for unused_ax in ax.flat[len(images):]:
    unused_ax.axis('off')

plt.tight_layout()

# Save the figure with 600 dpi
plt.savefig(f"Gopro_plot_imgs\\sampled_images{seed}.jpg", dpi=600, bbox_inches='tight')
