In [1]:
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import json
from PIL import Image
import deltalake as dl
import boto3
import os
from tqdm import tqdm
import s3fs
from concurrent.futures import ThreadPoolExecutor, as_completed

tqdm.pandas()

session = boto3.Session(profile_name='default')
credentials = session.get_credentials()
credentials = credentials.get_frozen_credentials()

storage_options = {
    'AWS_REGION': 'us-west-1',
    'AWS_ACCESS_KEY_ID': credentials.access_key,
    'AWS_SECRET_ACCESS_KEY': credentials.secret_key,
    'AWS_S3_ALLOW_UNSAFE_RENAME': 'true'
}

s3 = s3fs.S3FileSystem(
    anon=False,
    use_ssl=False,
    key=storage_options['AWS_ACCESS_KEY_ID'],
    secret=storage_options['AWS_SECRET_ACCESS_KEY'],
    client_kwargs={
        'region_name': storage_options['AWS_REGION']
    }
)

In [2]:
from shapely.geometry import Polygon
import hashlib
import cv2

def is_valid_contour(contour):
    try:
        if contour is None:
            return False
            
        if len(contour) < 24:
            return False

        if not Polygon(contour).is_valid:
            return False

    except:
        return False
    
    return True


def compute_hash(row):
    image_path = row['image_path']
    contour = row['contour'].copy()
    x, y, w, h = cv2.boundingRect(contour)
    contour[:, 0] -= x
    contour[:, 1] -= y

    contour_hash = hashlib.md5(contour.flatten().astype('uint8')).hexdigest()
    image_path_hash = hashlib.md5(image_path.encode()).hexdigest()
    mask_hash = hashlib.md5(f'{contour_hash}{image_path_hash}'.encode()).hexdigest()
    return mask_hash

In [3]:
annotations = dl.DeltaTable(
    table_uri='s3a://coffee-dataset/lake/raw_annotations',
    storage_options=storage_options
).to_pandas()

print('Total Contour Annotations:', len(annotations))

try:
    patch_annos = dl.DeltaTable(
        table_uri='s3a://coffee-dataset/lake/clear_leaf_patch_annotations',
        storage_options=storage_options
    ).to_pandas()
except:
    patch_annos = pd.DataFrame(columns=['image_path', 'patch', 'defective', 'hash'])

print('Total Patch Annotations:', len(patch_annos))
print('Total Defective Patches:', len(patch_annos[patch_annos['defective'] == 1]))
print('Total Healthy Patches:', len(patch_annos[patch_annos['defective'] == 0]))

annotations = annotations[annotations['category_id'] == 'leaf']
annotations = annotations[annotations['area'] > annotations['area'].quantile(0.05)]
annotations['contour'] = annotations['segmentation'].apply(lambda x: np.array(x).reshape(-1, 2).astype(np.int32))
annotations = annotations[annotations['contour'].apply(lambda x: is_valid_contour(x))]
annotations['hash'] = annotations.apply(compute_hash, axis=1)
annotations = annotations[['image_path', 'hash', 'contour']]

annotations.reset_index(drop=True, inplace=True)
patch_annos.reset_index(drop=True, inplace=True)

patch_annos = patch_annos.merge(annotations, on='hash', how='left', suffixes=('_patch', ''))

patch_annos = patch_annos[['image_path', 'patch', 'defective', 'hash', 'contour']]

# drop nan values
patch_annos = patch_annos.dropna()

patch_annos

[90m[[0m2024-10-07T21:47:55Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind: "HTTP connect", duration: 1s }), connection: Unknown } }) }))
[90m[[0m2024-10-07T21:47:56Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind

Total Contour Annotations: 13858
Total Patch Annotations: 13346
Total Defective Patches: 5076
Total Healthy Patches: 8270


Unnamed: 0,image_path,patch,defective,hash,contour
0,coffee-dataset/raw_images/bay_view_dead_leaves...,"[230, 77, 294, 141]",1,ec2785912ccfc157520dc7e44985fbd6,"[[1939, 651], [1938, 652], [1935, 652], [1934,..."
1,coffee-dataset/raw_images/bay_view_dead_leaves...,"[179, 76, 243, 140]",1,ec2785912ccfc157520dc7e44985fbd6,"[[1939, 651], [1938, 652], [1935, 652], [1934,..."
2,coffee-dataset/raw_images/bay_view_dead_leaves...,"[115, 75, 179, 139]",1,ec2785912ccfc157520dc7e44985fbd6,"[[1939, 651], [1938, 652], [1935, 652], [1934,..."
3,coffee-dataset/raw_images/bay_view_dead_leaves...,"[62, 72, 126, 136]",1,ec2785912ccfc157520dc7e44985fbd6,"[[1939, 651], [1938, 652], [1935, 652], [1934,..."
4,coffee-dataset/raw_images/bay_view_dead_leaves...,"[99, 98, 163, 162]",1,ec2785912ccfc157520dc7e44985fbd6,"[[1939, 651], [1938, 652], [1935, 652], [1934,..."
...,...,...,...,...,...
13341,coffee-dataset/raw_images/milolii_luis_farm/20...,"[500, 382, 564, 446]",0,d36ae6fcc22a404233638f984e7b39b7,"[[1095, 1474], [1084, 1489], [1076, 1502], [10..."
13342,coffee-dataset/raw_images/milolii_luis_farm/20...,"[527, 304, 591, 368]",0,d36ae6fcc22a404233638f984e7b39b7,"[[1095, 1474], [1084, 1489], [1076, 1502], [10..."
13343,coffee-dataset/raw_images/milolii_luis_farm/20...,"[600, 249, 664, 313]",0,d36ae6fcc22a404233638f984e7b39b7,"[[1095, 1474], [1084, 1489], [1076, 1502], [10..."
13344,coffee-dataset/raw_images/milolii_luis_farm/20...,"[250, 102, 314, 166]",0,d36ae6fcc22a404233638f984e7b39b7,"[[1095, 1474], [1084, 1489], [1076, 1502], [10..."


In [4]:
os.makedirs('patches', exist_ok=True)
os.makedirs('patches/defective', exist_ok=True)
os.makedirs('patches/healthy', exist_ok=True)

In [5]:
from PIL import Image, ImageOps

t = 64

loader = tqdm(total=len(patch_annos), position=0, leave=True)

for hash, pdf in patch_annos.groupby('hash'):
    image_path = pdf.image_path.iloc[0]
    contour = pdf.contour.iloc[0].copy()

    with s3.open(image_path, 'rb') as f:
        image = Image.open(f)
        image = ImageOps.exif_transpose(image)
        image = np.array(image)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    patches = pdf['patch'].values
    defectives = pdf['defective'].values

    x, y, w, h = cv2.boundingRect(contour.astype(int))
    mask = np.zeros(image.shape[:2], np.uint8)
    mask = cv2.drawContours(mask, [contour.astype(int)], -1, (255), -1)
    mask = cv2.bitwise_and(image, image, mask=mask)
    mask = mask[y:y+h, x:x+w]

    contour[:, 0] -= x
    contour[:, 1] -= y
    
    polygon = Polygon(contour)

    tiles = []
    
    for i, (patch, defective) in enumerate(zip(patches, defectives)):        
        loader.update(1)

        x1, y1, x2, y2 = patch
        box = Polygon([(x1, y1), (x2, y1), (x2, y2), (x1, y2)])
        intersection = box.intersection(polygon)
        
        greater_than_half = intersection.area / box.area >= 0.5
        inside_polygon = polygon.contains(box)
        
        if greater_than_half or inside_polygon:
            tile = mask[y1:y2, x1:x2]

            # if any size is less than t//2, skip
            if tile.shape[0] < t//2 or tile.shape[1] < t//2:
                continue

            # if tile is all black, skip
            if np.all(tile == 0):
                continue

            pads = ((0, t - tile.shape[0]), (0, t - tile.shape[1]), (0, 0))
            tile = np.pad(tile, pads, 'constant', constant_values=(0, 0))

            if defective:
                cv2.imwrite(f'patches/defective/{hash}_{i}.png', tile)
            else:
                cv2.imwrite(f'patches/healthy/{hash}_{i}.png', tile)
        
loader.close()

100%|██████████| 13320/13320 [07:07<00:00, 31.12it/s]
