In [1]:
import cv2
import numpy as np
from ultralytics import SAM
import torch
from tqdm import tqdm
from PIL import Image, ImageOps
import uuid
import os
import json
import pandas as pd
import deltalake as dl
import boto3
import s3fs
from deltalake.exceptions import TableNotFoundError

# Release any cached CUDA allocator blocks left over from earlier work in this kernel.
torch.cuda.empty_cache()

# NOTE(review): machine-specific absolute path; relative paths used later in the
# notebook (e.g. the SAM checkpoint file) resolve against this directory.
os.chdir('/home/jack/Documents')

# Resolve AWS credentials from the local 'default' profile and freeze them so the
# key, secret and (optional) session token are read as one consistent snapshot.
session = boto3.Session(profile_name='default')
credentials = session.get_credentials()
if credentials is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise RuntimeError("No AWS credentials found for profile 'default'")
credentials = credentials.get_frozen_credentials()

# Options passed to deltalake for every table read and write in this notebook.
storage_options = {
    'AWS_REGION': 'us-west-1',
    'AWS_ACCESS_KEY_ID': credentials.access_key,
    'AWS_SECRET_ACCESS_KEY': credentials.secret_key,
    # Lets deltalake commit to S3 without a locking provider; only safe while a
    # single writer touches a table at a time.
    'AWS_S3_ALLOW_UNSAFE_RENAME': 'true'
}

# Temporary credentials (SSO, assumed roles, MFA) are only valid together with
# their session token, so forward it whenever the profile provides one.
if credentials.token:
    storage_options['AWS_SESSION_TOKEN'] = credentials.token

# Filesystem handle used to stream the raw image bytes out of S3.
s3 = s3fs.S3FileSystem(
    anon=False,
    # Use HTTPS: with SSL disabled all image data crosses the network in plain text.
    use_ssl=True,
    key=storage_options['AWS_ACCESS_KEY_ID'],
    secret=storage_options['AWS_SECRET_ACCESS_KEY'],
    token=credentials.token,
    client_kwargs={
        'region_name': storage_options['AWS_REGION']
    }
)

In [2]:
# Pull the raw-image and raw-annotation tables from the lake into pandas.
images_table = dl.DeltaTable(
    table_uri='s3a://coffee-dataset/lake/raw_images_v2',
    storage_options=storage_options
)
raw_images_df = images_table.to_pandas()

annos_table = dl.DeltaTable(
    table_uri='s3a://coffee-dataset/lake/raw_annotations',
    storage_options=storage_options
)
raw_annos_df = annos_table.to_pandas()

# The SAM output table does not exist before the first write; in that case start
# from an empty frame that has the expected columns.
try:
    segs_table = dl.DeltaTable(
        table_uri='s3a://coffee-dataset/lake/raw_image_sam_segmentations_v2',
        storage_options=storage_options
    )
    sam_segged_df = segs_table.to_pandas(columns=['image_path'])
except TableNotFoundError:
    sam_segged_df = pd.DataFrame(columns=['image_path', 'box', 'area', 'contour', 'smooth_box', 'smooth_area', 'smoothed_contour'])

# An image still needs work only if it has neither a manual annotation nor an
# existing SAM segmentation.
is_annotated = raw_images_df.image_path.isin(raw_annos_df.image_path.unique())
is_segmented = raw_images_df.image_path.isin(sam_segged_df.image_path.unique())

# Keep just the path column, shuffle the rows, and renumber them from zero.
unannos_df = (
    raw_images_df.loc[~(is_annotated | is_segmented), ['image_path']]
    .sample(frac=1)
    .reset_index(drop=True)
)

unannos_df

[90m[[0m2024-09-24T19:16:24Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind: "HTTP connect", duration: 1s }), connection: Unknown } }) }))
[90m[[0m2024-09-24T19:16:25Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind

Unnamed: 0,image_path
0,coffee-dataset/raw_images/milolii_luis_farm/20...
1,coffee-dataset/raw_images/mountain_thunder_mix...
2,coffee-dataset/raw_images/mountain_thunder_mix...
3,coffee-dataset/raw_images/mountain_thunder_mix...
4,coffee-dataset/raw_images/mountain_thunder_nor...
...,...
1482,coffee-dataset/raw_images/mountain_thunder_nor...
1483,coffee-dataset/raw_images/mountain_thunder_mix...
1484,coffee-dataset/raw_images/mountain_thunder_mix...
1485,coffee-dataset/raw_images/mountain_thunder_sho...


In [3]:
# Build the SAM model from the 'sam2_b.pt' checkpoint, then move it onto the GPU.
sam = SAM('sam2_b.pt')
sam = sam.cuda()

# Return cached-but-unused allocator blocks to the device after the transfer.
torch.cuda.empty_cache()

In [4]:
# Segment the unannotated images with SAM in groups of 20 and append each finished
# group to the Delta table before starting the next one.
batch_size = 20
num_batches = (len(unannos_df) + batch_size - 1) // batch_size  # ceiling division


def _load_image_bgr(image_path):
    """Read one image from S3 and return it as an HxWx3 uint8 array in BGR order.

    The EXIF orientation is applied so the pixel grid matches how the photo is
    displayed. Raises whatever s3fs / Pillow raise for missing, truncated or
    unidentifiable files; the caller decides how to handle that.
    """
    with s3.open(image_path, 'rb') as f:
        image = ImageOps.exif_transpose(Image.open(f))
        # Force three channels: grayscale, palette or RGBA files would otherwise
        # decode to arrays with a different shape.
        rgb = np.array(image.convert('RGB'))
    # Pillow decodes to RGB, but Ultralytics interprets numpy inputs as BGR.
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def _segmentation_records(image_path, image_shape, result):
    """Turn one SAM result into a list of row dicts, one per usable segment.

    Returns an empty list when the result carries no masks. A segment is skipped
    when its polygon has fewer than three points or when smoothing removes it
    entirely, because no smoothed contour can be stored for it.
    """
    if result.masks is None or result.boxes is None:
        return []

    contours = result.masks.xy
    boxes = result.boxes.xywh.cpu().numpy().astype(float)

    blank = np.zeros(image_shape[:2], dtype=np.uint8)
    erode_kernel = np.ones((5, 5), np.uint8)

    records = []
    for contour, box in zip(contours, boxes):
        contour = contour.astype(float)
        points = contour.astype(np.int32)
        if len(points) < 3:
            # Not a polygon; there is nothing to fill or measure.
            continue

        # Rasterise the polygon, then median-blur and erode it to smooth the outline.
        cmask = blank.copy()
        cv2.fillPoly(cmask, [points], 255)
        cmask = cv2.medianBlur(cmask, 17)
        cmask = cv2.erode(cmask, erode_kernel, iterations=1)

        candidates = cv2.findContours(cmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        if len(candidates) == 0:
            # Small segments can vanish under the blur + erosion; skip them rather
            # than letting max() raise on an empty sequence and abort the run.
            continue
        smooth_contour = max(candidates, key=cv2.contourArea)

        records.append({
            'image_path': image_path,
            'box': box.tolist(),
            'area': cv2.contourArea(points),
            'contour': contour.flatten().tolist(),
            'smooth_box': list(cv2.boundingRect(smooth_contour)),
            'smooth_area': cv2.contourArea(smooth_contour),
            'smoothed_contour': smooth_contour.flatten().tolist()
        })

    return records


for batch_idx in range(num_batches):
    rows = unannos_df.iloc[batch_idx * batch_size:(batch_idx + 1) * batch_size]

    batch = []

    # The context manager closes the bar even if an image raises unexpectedly.
    with tqdm(total=len(rows), desc=f'Processing batch {batch_idx + 1}/{num_batches}', position=0, leave=True) as loader:
        for image_path in rows['image_path']:
            try:
                image = _load_image_bgr(image_path)
            except Exception as e:
                # Unreadable files are reported and skipped so one bad upload does
                # not stop the whole run.
                print(f'Error reading image {image_path}: {e}')
                loader.update(1)
                continue

            torch.cuda.empty_cache()

            results = sam(image, labels=[1], verbose=False, device='cuda', imgsz=1024)

            batch.extend(_segmentation_records(image_path, image.shape, results[0]))

            # Count every processed image, including those that produced no segments.
            loader.update(1)

    if batch:

        # Append to Delta Lake
        dl.write_deltalake(
            table_or_uri='s3a://coffee-dataset/lake/raw_image_sam_segmentations_v2',
            data=pd.DataFrame(batch),
            mode='append',
            storage_options=storage_options,
            custom_metadata={
                'catalog_name': 'Raw Image SAM Segmentations',
                'catalog_description': 'Segmentations generated by SAM for raw images'
            }
        )

Processing batch 1/75: 100%|██████████| 20/20 [07:11<00:00, 21.57s/it]
  dl.write_deltalake(
Processing batch 2/75: 100%|██████████| 20/20 [06:24<00:00, 19.24s/it]
  dl.write_deltalake(
Processing batch 3/75:  60%|██████    | 12/20 [03:49<02:03, 15.38s/it]

Error reading image coffee-dataset/raw_images/mountain_thunder_mixed/20240316_092939.jpg: image file is truncated (104 bytes not processed)


Processing batch 3/75: 100%|██████████| 20/20 [06:40<00:00, 20.02s/it]
  dl.write_deltalake(
Processing batch 4/75: 100%|██████████| 20/20 [06:51<00:00, 20.59s/it]
  dl.write_deltalake(
Processing batch 5/75: 100%|██████████| 20/20 [06:10<00:00, 18.54s/it]
  dl.write_deltalake(
Processing batch 6/75: 100%|██████████| 20/20 [06:22<00:00, 19.11s/it]
  dl.write_deltalake(
Processing batch 7/75: 100%|██████████| 20/20 [06:38<00:00, 19.91s/it]
  dl.write_deltalake(
Processing batch 8/75: 100%|██████████| 20/20 [07:01<00:00, 21.09s/it]
  dl.write_deltalake(
Processing batch 9/75: 100%|██████████| 20/20 [07:06<00:00, 21.33s/it]
  dl.write_deltalake(
Processing batch 10/75: 100%|██████████| 20/20 [06:26<00:00, 19.31s/it]
  dl.write_deltalake(
Processing batch 11/75: 100%|██████████| 20/20 [06:31<00:00, 19.59s/it]
  dl.write_deltalake(
Processing batch 12/75: 100%|██████████| 20/20 [07:00<00:00, 21.03s/it]
  dl.write_deltalake(
Processing batch 13/75: 100%|██████████| 20/20 [06:37<00:00, 19.89s

Error reading image coffee-dataset/raw_images/mountain_thunder_mixed/20240316_112917.jpg: image file is truncated (28 bytes not processed)


Processing batch 49/75: 100%|██████████| 20/20 [06:23<00:00, 19.16s/it]
  dl.write_deltalake(
Processing batch 50/75: 100%|██████████| 20/20 [07:32<00:00, 22.61s/it]
  dl.write_deltalake(
Processing batch 51/75: 100%|██████████| 20/20 [07:22<00:00, 22.11s/it]
  dl.write_deltalake(
Processing batch 52/75: 100%|██████████| 20/20 [06:17<00:00, 18.89s/it]
  dl.write_deltalake(
Processing batch 53/75: 100%|██████████| 20/20 [07:31<00:00, 22.57s/it]
  dl.write_deltalake(
Processing batch 54/75: 100%|██████████| 20/20 [07:35<00:00, 22.79s/it]
  dl.write_deltalake(
Processing batch 55/75: 100%|██████████| 20/20 [07:39<00:00, 22.99s/it]
  dl.write_deltalake(
Processing batch 56/75: 100%|██████████| 20/20 [07:02<00:00, 21.13s/it]
  dl.write_deltalake(
Processing batch 57/75:  15%|█▌        | 3/20 [00:29<02:19,  8.19s/it]

Error reading image coffee-dataset/raw_images/mountain_thunder_mixed/20240316_112914.jpg: cannot identify image file <File-like object S3FileSystem, coffee-dataset/raw_images/mountain_thunder_mixed/20240316_112914.jpg>


Processing batch 57/75: 100%|██████████| 20/20 [07:14<00:00, 21.72s/it]
  dl.write_deltalake(
Processing batch 58/75: 100%|██████████| 20/20 [07:09<00:00, 21.50s/it]
  dl.write_deltalake(
Processing batch 59/75: 100%|██████████| 20/20 [06:24<00:00, 19.24s/it]
  dl.write_deltalake(
Processing batch 60/75: 100%|██████████| 20/20 [06:49<00:00, 20.45s/it]
  dl.write_deltalake(
Processing batch 61/75: 100%|██████████| 20/20 [06:01<00:00, 18.07s/it]
  dl.write_deltalake(
Processing batch 62/75:  95%|█████████▌| 19/20 [04:55<00:10, 10.84s/it]

Error reading image coffee-dataset/raw_images/mountain_thunder_mixed/20240316_092935.jpg: cannot identify image file <File-like object S3FileSystem, coffee-dataset/raw_images/mountain_thunder_mixed/20240316_092935.jpg>


Processing batch 62/75: 100%|██████████| 20/20 [05:20<00:00, 16.01s/it]
  dl.write_deltalake(
Processing batch 63/75: 100%|██████████| 20/20 [05:19<00:00, 15.98s/it]
  dl.write_deltalake(
Processing batch 64/75: 100%|██████████| 20/20 [06:25<00:00, 19.29s/it]
  dl.write_deltalake(
Processing batch 65/75: 100%|██████████| 20/20 [06:09<00:00, 18.47s/it]
  dl.write_deltalake(
Processing batch 66/75: 100%|██████████| 20/20 [06:01<00:00, 18.07s/it]
  dl.write_deltalake(
Processing batch 67/75: 100%|██████████| 20/20 [06:19<00:00, 18.97s/it]
  dl.write_deltalake(
Processing batch 68/75: 100%|██████████| 20/20 [05:52<00:00, 17.62s/it]
  dl.write_deltalake(
Processing batch 69/75: 100%|██████████| 20/20 [07:03<00:00, 21.18s/it]
  dl.write_deltalake(
Processing batch 70/75: 100%|██████████| 20/20 [06:13<00:00, 18.69s/it]
  dl.write_deltalake(
Processing batch 71/75: 100%|██████████| 20/20 [06:07<00:00, 18.39s/it]
  dl.write_deltalake(
Processing batch 72/75: 100%|██████████| 20/20 [06:18<00:00,