In [1]:
import os
import json
import pandas as pd
import deltalake as dl
import boto3
import s3fs

def get_coco_maps(coco_data):
    """Index a parsed COCO document for fast lookup.

    Args:
        coco_data: Mapping with the keys 'categories', 'images' and
            'annotations', each holding a list of dicts.

    Returns:
        A tuple ``(image_map, annotation_map, category_map)`` where
        ``image_map`` maps image id -> image dict, ``annotation_map`` maps
        image id -> list of annotation dicts (in document order), and
        ``category_map`` maps category id -> lower-cased category name.
    """
    category_map = {
        entry['id']: entry['name'].lower()
        for entry in coco_data['categories']
    }

    image_map = {entry['id']: entry for entry in coco_data['images']}

    # Group annotations under the image they belong to, preserving the order
    # in which image ids are first encountered.
    annotation_map = {}
    for entry in coco_data['annotations']:
        annotation_map.setdefault(entry['image_id'], []).append(entry)

    return image_map, annotation_map, category_map

# Resolve AWS credentials from the local 'default' profile and freeze them so
# the same key/secret/token triple is handed to both deltalake and s3fs.
session = boto3.Session(profile_name='default')
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when the profile yields no credentials;
    # fail here with a clear message instead of an AttributeError below.
    raise RuntimeError("No AWS credentials found for profile 'default'")
credentials = credentials.get_frozen_credentials()

# Options passed to deltalake for every table read/write in this notebook.
storage_options = {
    'AWS_REGION': 'us-west-1',
    'AWS_ACCESS_KEY_ID': credentials.access_key,
    'AWS_SECRET_ACCESS_KEY': credentials.secret_key,
    'AWS_S3_ALLOW_UNSAFE_RENAME': 'true'
}
# Temporary credentials (SSO, assumed roles, MFA) are only valid together with
# their session token; long-lived keys have no token, so add it only if set.
if credentials.token:
    storage_options['AWS_SESSION_TOKEN'] = credentials.token

# Filesystem handle used to list and open the COCO annotation files.
s3 = s3fs.S3FileSystem(
    anon=False,
    # Use HTTPS so requests and annotation data are not sent in clear text.
    use_ssl=True,
    key=storage_options['AWS_ACCESS_KEY_ID'],
    secret=storage_options['AWS_SECRET_ACCESS_KEY'],
    token=credentials.token,
    client_kwargs={
        'region_name': storage_options['AWS_REGION']
    }
)

In [3]:
# Every COCO annotation export stored under the dataset prefix.
coco_files = [
    path
    for path in s3.ls('s3://coffee-dataset/datasets/coco_annotations')
    if path.endswith('.json')
]

# One flat record per annotation, accumulated across all COCO files.
annotation_rows = []

for coco_path in coco_files:
    # Parse the document, then release the S3 handle before processing it.
    with s3.open(coco_path) as handle:
        coco_data = json.load(handle)

    image_map, annotation_map, category_map = get_coco_maps(coco_data)

    for image_id, image_annotations in annotation_map.items():
        image = image_map[image_id]
        # Image-level fields are repeated on every annotation row of that image.
        image_fields = {
            'file_name': image['file_name'],
            'width': image['width'],
            'height': image['height'],
        }

        annotation_rows.extend(
            {
                'coco_file': coco_path,
                **image_fields,
                # Despite the column name, this holds the category *name*.
                'category_id': category_map[annotation['category_id']],
                'bbox': annotation['bbox'],
                # NOTE(review): only the first element of `segmentation` is
                # kept; any further elements are not stored. Confirm intended.
                'segmentation': annotation['segmentation'][0],
                'area': annotation['area'],
                'iscrowd': annotation['iscrowd'],
                # Serialised to a JSON string; '{}' when the key is absent.
                'extras': json.dumps(annotation.get('extras', {}))
            }
            for annotation in image_annotations
        )

coco_df = pd.DataFrame(annotation_rows)

# Annotation count per source file, as a quick sanity check of the load.
print(coco_df.groupby('coco_file').size())

coco_df

coco_file
coffee-dataset/datasets/coco_annotations/batch_0.json              518
coffee-dataset/datasets/coco_annotations/batch_1.json               43
coffee-dataset/datasets/coco_annotations/batch_13.json             826
coffee-dataset/datasets/coco_annotations/batch_14.json             881
coffee-dataset/datasets/coco_annotations/batch_18.json            1016
coffee-dataset/datasets/coco_annotations/batch_23.json             521
coffee-dataset/datasets/coco_annotations/fredsam2_batch_1.json    3325
coffee-dataset/datasets/coco_annotations/original_fivver.json     6728
dtype: int64


Unnamed: 0,coco_file,file_name,width,height,category_id,bbox,segmentation,area,iscrowd,extras
0,coffee-dataset/datasets/coco_annotations/batch...,20240316_111616.jpg,4032,3024,leaf,"[1962.4, 2569.2, 463.5, 278.9]","[2425.9, 2577.6, 2414.6, 2569.2, 2402.7, 2575....",58136.0,0,{}
1,coffee-dataset/datasets/coco_annotations/batch...,20240316_111616.jpg,4032,3024,leaf,"[2060.0, 0.0, 246.0, 606.6]","[2306.0, 0.0, 2176.0, 0.0, 2172.1, 5.7, 2168.0...",58334.0,0,{}
2,coffee-dataset/datasets/coco_annotations/batch...,20240316_111616.jpg,4032,3024,leaf,"[1772.9, 2013.0, 224.8, 446.6]","[1865.0, 2013.0, 1852.0, 2024.0, 1811.7, 2047....",70120.0,0,{}
3,coffee-dataset/datasets/coco_annotations/batch...,20240316_111616.jpg,4032,3024,leaf,"[2279.3, 2596.0, 250.7, 305.7]","[2522.0, 2608.0, 2497.0, 2596.0, 2464.0, 2599....",38673.0,0,{}
4,coffee-dataset/datasets/coco_annotations/batch...,20240316_111616.jpg,4032,3024,leaf,"[2384.0, 2566.0, 74.4, 177.8]","[2452.0, 2566.0, 2445.1, 2567.4, 2434.0, 2572....",6073.0,0,{}
...,...,...,...,...,...,...,...,...,...,...
13853,coffee-dataset/datasets/coco_annotations/origi...,20231203_102229.jpg,4032,3024,leaf,"[840.79, 2274.52, 373.85, 749.48]","[950.14, 2274.52, 983.79, 2291.35, 1023.04, 23...",219797.0,0,{}
13854,coffee-dataset/datasets/coco_annotations/origi...,20231203_102229.jpg,4032,3024,leaf,"[3048.74, 1619.57, 610.39, 1208.76]","[3048.74, 1628.59, 3048.74, 1670.69, 3108.88, ...",277742.0,0,{}
13855,coffee-dataset/datasets/coco_annotations/origi...,20231203_102229.jpg,4032,3024,leaf,"[1529.83, 1484.58, 243.61, 814.37]","[1708.01, 1561.88, 1716.02, 1510.69, 1702.1, 1...",98352.0,0,{}
13856,coffee-dataset/datasets/coco_annotations/origi...,20231203_102229.jpg,4032,3024,leaf,"[1692.5, 873.86, 1029.12, 1013.61]","[2688.67, 897.12, 2611.15, 873.86, 2496.8, 875...",470830.0,0,{}


In [4]:
# Load the raw image catalogue from the Delta table and reduce it to a lookup
# of bare file name -> full image path.
raw_images_df = (
    dl.DeltaTable(
        table_uri='s3a://coffee-dataset/lake/raw_images_v2',
        storage_options=storage_options
    )
    .to_pandas()
    # image_name is the final path component of image_path.
    .assign(image_name=lambda frame: frame['image_path'].map(os.path.basename))
    .loc[:, ['image_name', 'image_path']]
)

raw_images_df

[90m[[0m2024-09-29T18:21:59Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind: "HTTP connect", duration: 1s }), connection: Unknown } }) }))
[90m[[0m2024-09-29T18:22:00Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind

Unnamed: 0,image_name,image_path
0,20240420_153124.jpg,coffee-dataset/raw_images/alteri_farms/2024042...
1,20240420_153126.jpg,coffee-dataset/raw_images/alteri_farms/2024042...
2,20240420_153128.jpg,coffee-dataset/raw_images/alteri_farms/2024042...
3,20240420_153236.jpg,coffee-dataset/raw_images/alteri_farms/2024042...
4,20240420_153258.jpg,coffee-dataset/raw_images/alteri_farms/2024042...
...,...,...
2707,1706385698108.png,coffee-dataset/raw_images/fivver_fred/17063856...
2708,1706390135175.png,coffee-dataset/raw_images/fivver_fred/17063901...
2709,1706390236960.png,coffee-dataset/raw_images/fivver_fred/17063902...
2710,1706390360927.png,coffee-dataset/raw_images/fivver_fred/17063903...


In [5]:
from hashlib import md5
from PIL import Image

# Merge the two DataFrames on the image file name. A left join keeps every
# annotation row, including those whose image is missing from raw_images_df.
merged_df = (
    pd.merge(coco_df, raw_images_df, left_on='file_name', right_on='image_name', how='left')
    # Unmatched annotations come out of the join with a null image_name; fall
    # back to the COCO file_name so dropping file_name below does not discard
    # the only identifier of the image for those rows.
    .assign(image_name=lambda frame: frame['image_name'].fillna(frame['file_name']))
    .drop(columns=['file_name'])
)

# Surface annotations without a raw image instead of writing them silently.
unmatched_count = int(merged_df['image_path'].isna().sum())
if unmatched_count:
    print(f'{unmatched_count} annotation rows have no matching raw image (image_path is null)')

merged_df

Unnamed: 0,coco_file,width,height,category_id,bbox,segmentation,area,iscrowd,extras,image_name,image_path
0,coffee-dataset/datasets/coco_annotations/batch...,4032,3024,leaf,"[1962.4, 2569.2, 463.5, 278.9]","[2425.9, 2577.6, 2414.6, 2569.2, 2402.7, 2575....",58136.0,0,{},20240316_111616.jpg,coffee-dataset/raw_images/mountain_thunder_sho...
1,coffee-dataset/datasets/coco_annotations/batch...,4032,3024,leaf,"[2060.0, 0.0, 246.0, 606.6]","[2306.0, 0.0, 2176.0, 0.0, 2172.1, 5.7, 2168.0...",58334.0,0,{},20240316_111616.jpg,coffee-dataset/raw_images/mountain_thunder_sho...
2,coffee-dataset/datasets/coco_annotations/batch...,4032,3024,leaf,"[1772.9, 2013.0, 224.8, 446.6]","[1865.0, 2013.0, 1852.0, 2024.0, 1811.7, 2047....",70120.0,0,{},20240316_111616.jpg,coffee-dataset/raw_images/mountain_thunder_sho...
3,coffee-dataset/datasets/coco_annotations/batch...,4032,3024,leaf,"[2279.3, 2596.0, 250.7, 305.7]","[2522.0, 2608.0, 2497.0, 2596.0, 2464.0, 2599....",38673.0,0,{},20240316_111616.jpg,coffee-dataset/raw_images/mountain_thunder_sho...
4,coffee-dataset/datasets/coco_annotations/batch...,4032,3024,leaf,"[2384.0, 2566.0, 74.4, 177.8]","[2452.0, 2566.0, 2445.1, 2567.4, 2434.0, 2572....",6073.0,0,{},20240316_111616.jpg,coffee-dataset/raw_images/mountain_thunder_sho...
...,...,...,...,...,...,...,...,...,...,...,...
13853,coffee-dataset/datasets/coco_annotations/origi...,4032,3024,leaf,"[840.79, 2274.52, 373.85, 749.48]","[950.14, 2274.52, 983.79, 2291.35, 1023.04, 23...",219797.0,0,{},20231203_102229.jpg,coffee-dataset/raw_images/mountain_thunder_mix...
13854,coffee-dataset/datasets/coco_annotations/origi...,4032,3024,leaf,"[3048.74, 1619.57, 610.39, 1208.76]","[3048.74, 1628.59, 3048.74, 1670.69, 3108.88, ...",277742.0,0,{},20231203_102229.jpg,coffee-dataset/raw_images/mountain_thunder_mix...
13855,coffee-dataset/datasets/coco_annotations/origi...,4032,3024,leaf,"[1529.83, 1484.58, 243.61, 814.37]","[1708.01, 1561.88, 1716.02, 1510.69, 1702.1, 1...",98352.0,0,{},20231203_102229.jpg,coffee-dataset/raw_images/mountain_thunder_mix...
13856,coffee-dataset/datasets/coco_annotations/origi...,4032,3024,leaf,"[1692.5, 873.86, 1029.12, 1013.61]","[2688.67, 897.12, 2611.15, 873.86, 2496.8, 875...",470830.0,0,{},20231203_102229.jpg,coffee-dataset/raw_images/mountain_thunder_mix...


In [6]:
dl_table_path = 's3a://coffee-dataset/lake/raw_annotations'

# Catalogue entries attached to the commit as custom metadata.
catalog_metadata = {
    'catalog_name': 'Raw Annotations Catalog',
    'catalog_description': 'All of the raw annotations that have been compiled from the coco datasets',
}

# Replace both the data and the schema of the table with the merged frame.
dl.write_deltalake(
    table_or_uri=dl_table_path,
    data=merged_df,
    mode='overwrite',
    schema_mode='overwrite',
    storage_options=storage_options,
    custom_metadata=catalog_metadata
)

# Re-open the table and read the catalogue entries back from the most recent
# commit to confirm they were recorded.
table = dl.DeltaTable(
    table_uri=dl_table_path,
    storage_options=storage_options
)
history = table.history(1)[0]
catalog_params = {
    key: history[key]
    for key in history
    if key.startswith('catalog_')
}
catalog_params

  dl.write_deltalake(


{'catalog_name': 'Raw Annotations Catalog',
 'catalog_description': 'All of the raw annotations that have been compiled from the coco datasets'}