In [1]:
import os
import glob
import json
import pandas as pd
from pycocotools.coco import COCO
from PIL import Image
import numpy as np
import skimage.io as io
from matplotlib import pyplot as plt
from pprint import pprint
import cv2
from pycocotools import mask as coco_mask 
%matplotlib inline
import math

LOADING AND EXPLORING DATA

In [2]:
with open('dataSet/turtles-data/data/annotations.json', 'r', encoding='utf8') as file:
    annotations = json.load(file)

In [3]:
# initialise COCO API for annotations
coco = COCO('dataSet/turtles-data/data/annotations.json')

loading annotations into memory...
Done (t=30.23s)
creating index...
index created!


In [4]:
# Retrieve all image IDs
img_ids = coco.getImgIds()
cat_ids = coco.getCatIds()

In [7]:
def get_batch_img_mask(img_ids):
    images_and_masks = []

    # Load batch of images
    imgs = coco.loadImgs(img_ids)
    anns_ids_batch = coco.getAnnIds(imgIds=img_ids, catIds=cat_ids, iscrowd=None)
    anns_batch = coco.loadAnns(anns_ids_batch)

    # Prepare a dictionary to hold masks for each image ID
    mask_dict = {img['id']: np.zeros((img['height'], img['width']), dtype=np.uint8) for img in imgs}

    # Generate the masks by adding each annotation to the appropriate image's mask
    for ann in anns_batch:
        img_id = ann['image_id']
        mask_dict[img_id] = np.maximum(mask_dict[img_id], coco.annToMask(ann))

    # Load images and pair with masks
    for img in imgs:
        file_name = f"dataSet/turtles-data/data/{img['file_name']}"
        try:
            image = np.array(Image.open(file_name))
            mask = mask_dict[img['id']]
            images_and_masks.append((image, mask))
        except FileNotFoundError:
            # Skip if image file is missing
            continue

    return images_and_masks

In [None]:
def create_splits_array(type, df, batch_size=10):
    # Filter rows based on the specified type
    filtered_df = df[df['split_open'] == type]
    if type == "train":
        filtered_df = filtered_df.sample(n=2500, random_state=42)
    elif type == "test":
        filtered_df = filtered_df.sample(n=1080, random_state=42)
    elif type == "valid":
        filtered_df = filtered_df.sample(n=527, random_state=42)
    
    img_ids = filtered_df['id'].tolist()

    data = []

    img_len = math.floor(len(img_ids))

    # Process in batches
    for i in range(0, img_len, batch_size):
        batch_img_ids = img_ids[i:i + batch_size]
        batch_data = get_batch_img_mask(batch_img_ids)
        
        # Add only valid images and masks
        for result in batch_data:
            if result != -1:
                data.append(result)

    return data

In [9]:
# Load your CSV file
df = pd.read_csv("dataSet/turtles-data/data/metadata_splits.csv")
train_data = create_splits_array("train", df)

In [10]:
test_data = create_splits_array("test", df)

In [11]:
valid_data = create_splits_array("valid", df)

In [12]:
len(valid_data)
len(train_data)
len(test_data)

560