In [31]:
import os
import pandas as pd
import deltalake as dl
import boto3
import s3fs

# Resolve AWS credentials from the local "default" profile and build the two
# handles the rest of the notebook uses: `storage_options` (passed to deltalake)
# and `s3` (an s3fs filesystem used to list the raw image folders).
session = boto3.Session(profile_name='default')
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when the profile resolves to no credentials;
    # fail here with a clear message instead of an AttributeError on the next line.
    raise RuntimeError("No AWS credentials found for profile 'default'")
credentials = credentials.get_frozen_credentials()

storage_options = {
    'AWS_REGION': 'us-west-1',
    'AWS_ACCESS_KEY_ID': credentials.access_key,
    'AWS_SECRET_ACCESS_KEY': credentials.secret_key,
    'AWS_S3_ALLOW_UNSAFE_RENAME': 'true'
}
if credentials.token:
    # Temporary credentials (SSO, assumed roles) are only valid together with
    # their session token, so forward it whenever the profile provides one.
    storage_options['AWS_SESSION_TOKEN'] = credentials.token

s3 = s3fs.S3FileSystem(
    anon=False,
    # Talk to S3 over HTTPS so object data and the session token are not sent
    # in plaintext.
    use_ssl=True,
    key=storage_options['AWS_ACCESS_KEY_ID'],
    secret=storage_options['AWS_SECRET_ACCESS_KEY'],
    token=credentials.token,
    client_kwargs={
        'region_name': storage_options['AWS_REGION']
    }
)

# Accumulates one metadata record (dict) per raw-image folder; the cells below
# append to it. Re-running this cell resets the list.
rows = []

In [32]:
# Catalog record for the `alteri_farms` raw-image folder.
folder = 'alteri_farms'

description = """
Dataset captured from Alteri Farms in Boquete, Panama. Captured in April, 2024 by Jack Mead and Devin Dennis.
Images are focused on leaves and berries.
"""

location = 'Alteri Farms, Boquete, Panama'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'April, 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [33]:
# Catalog record for the `finca_lerida_farms` raw-image folder.
folder = 'finca_lerida_farms'

description = """
Dataset captured from Finca Lerida Farms in Boquete, Panama. Captured in April, 2024 by Jack Mead.
Dataset contains images of leaves with several instances of leaf rust.
"""

location = 'Finca Lerida Farms, Boquete, Panama'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'April, 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 5E'

persons = 'Jack Mead'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [34]:
# Catalog record for the `kona_flowers_mixed` raw-image folder.
folder = 'kona_flowers_mixed'

description = """
Dataset captured from Kona Flowers in Kona, Hawaii. Captured in 2023 - 2024 by Jack Mead.
Dataset contains images mostly focused on flowers and leaves. Some are from Mountain Thunder Coffee Plantation.
Others are from ditch banks and wild coffee plants.
"""

location = 'Kona, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = '2023 - 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 5E'

persons = 'Jack Mead'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [35]:
# Catalog record for the `milolii_luis_farm` raw-image folder.
folder = 'milolii_luis_farm'

description = """
Dataset captured from a farm owned by Luis A in Milolii, Hawaii. Captured in early 2024 by Jack Mead and Devin Dennis.
Images contain mostly leaves of young coffee plants. Many instances of disease including black fungus.
"""

location = 'Milolii, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'Early 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [36]:
# Catalog record for the `mountain_thunder_mixed` raw-image folder.
folder = 'mountain_thunder_mixed'

description = """
Dataset captured from Mountain Thunder Coffee Plantation in Kona, Hawaii. Captured in 2023 - 2024 by Jack Mead and Devin Dennis.
Data is a mixture of the north and shop farms. Images contain many instances of leaves and berries with many instances of disease.
"""

location = 'Mountain Thunder Coffee Plantation, Kona, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = '2023 - 2024'

capture_distance = '0.5 - 3 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [37]:
# Catalog record for the `mountain_thunder_north_farm` raw-image folder.
folder = 'mountain_thunder_north_farm'

description = """
Dataset captured from Mountain Thunder Coffee Plantation in Kona, Hawaii. Captured in 2023 - 2024 by Jack Mead and Devin Dennis.
Data is from the northern production farm. Images contain many instances of leaves and berries with many instances of disease.
"""

location = 'Mountain Thunder Coffee Plantation, Kona, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = '2023 - 2024'

capture_distance = '0.5 - 3 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [38]:
# Catalog record for the `mountain_thunder_shop_farm` raw-image folder.
folder = 'mountain_thunder_shop_farm'

description = """
Dataset captured from Mountain Thunder Coffee Plantation in Kona, Hawaii. Captured in 2023 - 2024 by Jack Mead and Devin Dennis.
Data is from the shop organic farm. Images contain many instances of leaves and berries with many instances of disease.
"""

location = 'Mountain Thunder Coffee Plantation, Kona, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = '2023 - 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [39]:
# Catalog record for the `off_road_ditch_farm` raw-image folder.
folder = 'off_road_ditch_farm'

description = """
Dataset is captured from a small farm somewhere in Boquete, Panama near the slopes of Volcan Baru.
Captured in April, 2024 by Jack Mead and Devin Dennis. Images contain many instances of leaves
with many instances of disease including leaf rust.
"""

location = 'Boquete, Panama'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'April, 2024'

capture_distance = '0.5 - 1.5 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [40]:
# Catalog record for the `bay_view_farms_nursery` raw-image folder.
folder = 'bay_view_farms_nursery'

description = """
Dataset captured from Bay View Farms in Kona, Hawaii. Captured in June 22nd, 2024 by Jack Mead and Devin Dennis.
These images are from the nursery and contain many instances of phoma infections with many leaf rust instances.
"""

location = 'Bay View Farms, Captain Cook, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'June 22, 2024'

capture_distance = '0.5 - 1.0 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [41]:
# Catalog record for the `bay_view_farms_plot` raw-image folder.
folder = 'bay_view_farms_plot'

description = """
Dataset captured from Bay View Farms in Kona, Hawaii. Captured in July 11th, 2024 by Jack Mead and Devin Dennis.
These images are from the field while we were picking berries during a harvesting time so there are many ripe
and overripe berries along with diseased and dying leaves.
"""

location = 'Bay View Farms, Captain Cook, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'July 11th, 2024'

capture_distance = '0.5 - 1.0 meters'

cameras = 'Samsung Galaxy S20 Ultra and Samsung Galaxy S20 5E'

persons = 'Jack Mead and Devin Dennis'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [42]:
# Catalog record for the `bay_view_dead_leaves_backdrop` raw-image folder.
folder = 'bay_view_dead_leaves_backdrop'

description = """
Dataset captured from Bay View Farms dead leaves in Kona, Hawaii. Captured in July 11th, 2024 by Jack Mead.
These images are placed on a white background table and the leaves are mostly dead or dying.
"""

location = 'Jacks Apartment, Captain Cook, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = 'July 11th, 2024'

capture_distance = '0.5 - 1.0 meters'

cameras = 'Samsung Galaxy S20 5E'

persons = 'Jack Mead'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [43]:
# Catalog record for the `fivver_fred` raw-image folder.
folder = 'fivver_fred'

# The camera brand in the text matches the `cameras` field below ("Nikon").
description = """
This was the original images captured for the project in 2023 by Jack Mead. The images are taken from a
Nikon Coolpix camera and a Canon DSLR in raw format. This dataset contains many instances of leaves and
berries from the Mountain Thunder Coffee Plantation.
"""

location = 'Mountain Thunder Coffee Plantation, Kona, Hawaii'

# NOTE(review): `date` is assigned but is not one of the keys of the record
# appended below, so it never reaches `rows` — confirm whether it should be stored.
date = '2023'

capture_distance = '0.5 - 2.0 meters'

cameras = 'Nikon Coolpix and Canon DSLR'

persons = 'Jack Mead'

# Remove any record an earlier run of this cell added for this folder, so
# re-running the cell replaces the entry instead of appending a duplicate.
rows[:] = [row for row in rows if row.get('folder') != folder]

rows.append({
    'folder': folder,
    'description': description,
    'location': location,
    'capture_distance': capture_distance,
    'cameras': cameras,
    'persons': persons
})

In [44]:
# Build the catalog DataFrame: one row per record in `rows`, enriched with the
# number of image files found under each folder's S3 prefix and that folder's
# share of all images.
df = pd.DataFrame(rows)

extensions = ['jpg', 'jpeg', 'png']

# The descriptions are written as triple-quoted blocks, so trim the leading and
# trailing newlines before they are stored.
df['description'] = df['description'].str.strip()


def count_images(folder_name):
    """Count the entries listed under a raw-image folder whose extension is an image type.

    Args:
        folder_name: Name of the folder under ``s3://coffee-dataset/raw_images/``.

    Returns:
        Number of entries returned by ``s3.ls`` for that prefix whose final
        dot-separated segment (case-insensitive) is in ``extensions``.
    """
    listing = s3.ls(f's3://coffee-dataset/raw_images/{folder_name}')
    return sum(1 for key in listing if key.split('.')[-1].lower() in extensions)


df['total_images'] = df['folder'].apply(count_images)

# NOTE(review): `megabytes` is only ever set to this constant in the visible
# code — looks like a placeholder; confirm whether real sizes should be computed.
df['megabytes'] = 0.0

# Compute the grand total once and derive every percentage from it in a single
# vectorized expression (instead of re-summing the column for each row).
grand_total = df['total_images'].sum()
if grand_total:
    df['percent_of_total_images'] = (100 * df['total_images'] / grand_total).round(2)
else:
    # No images were found anywhere; report 0% rather than dividing by zero.
    df['percent_of_total_images'] = 0.0

df

Unnamed: 0,folder,description,location,capture_distance,cameras,persons,total_images,megabytes,percent_of_total_images
0,alteri_farms,"Dataset captured from Alteri Farms in Boquete,...","Alteri Farms, Boquete, Panama",0.5 - 1.5 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,192,0.0,7.08
1,finca_lerida_farms,Dataset captured from Finca Lerida Farms in Bo...,"Finca Lerida Farms, Boquete, Panama",0.5 - 1.5 meters,Samsung Galaxy S20 5E,Jack Mead,120,0.0,4.42
2,kona_flowers_mixed,"Dataset captured from Kona Flowers in Kona, Ha...","Kona, Hawaii",0.5 - 1.5 meters,Samsung Galaxy S20 5E,Jack Mead,62,0.0,2.29
3,milolii_luis_farm,Dataset captured from a farm owned by Luis A i...,"Milolii, Hawaii",0.5 - 1.5 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,362,0.0,13.35
4,mountain_thunder_mixed,Dataset captured from Mountain Thunder Coffee ...,"Mountain Thunder Coffee Plantation, Kona, Hawaii",0.5 - 3 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,1027,0.0,37.87
5,mountain_thunder_north_farm,Dataset captured from Mountain Thunder Coffee ...,"Mountain Thunder Coffee Plantation, Kona, Hawaii",0.5 - 3 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,389,0.0,14.34
6,mountain_thunder_shop_farm,Dataset captured from Mountain Thunder Coffee ...,"Mountain Thunder Coffee Plantation, Kona, Hawaii",0.5 - 1.5 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,162,0.0,5.97
7,off_road_ditch_farm,Dataset is captured from a small farm somewher...,"Boquete, Panama",0.5 - 1.5 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,137,0.0,5.05
8,bay_view_farms_nursery,"Dataset captured from Bay View Farms in Kona, ...","Bay View Farms, Captain Cook, Hawaii",0.5 - 1.0 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,152,0.0,5.6
9,bay_view_farms_plot,"Dataset captured from Bay View Farms in Kona, ...","Bay View Farms, Captain Cook, Hawaii",0.5 - 1.0 meters,Samsung Galaxy S20 Ultra and Samsung Galaxy S2...,Jack Mead and Devin Dennis,60,0.0,2.21


In [45]:
# Overwrite the Delta table at `dl_table_path` with the catalog DataFrame, then
# reopen the table and show the `catalog_*` entries from its latest history record.
dl_table_path = 's3a://coffee-dataset/lake/raw_image_locations'

# Metadata passed to the write as `custom_metadata`.
catalog_metadata = dict(
    catalog_name='Raw Images Locations Catalog',
    catalog_description='Catalogs the folders of the raw_images with extra metadata',
)

dl.write_deltalake(
    table_or_uri=dl_table_path,
    data=df,
    mode='overwrite',
    storage_options=storage_options,
    custom_metadata=catalog_metadata,
)

table = dl.DeltaTable(table_uri=dl_table_path, storage_options=storage_options)

# history(1) is limited to one entry; take it and keep only the keys that
# start with the `catalog_` prefix.
history = table.history(1)[0]
catalog_params = {}
for key, value in history.items():
    if key.startswith('catalog_'):
        catalog_params[key] = value
catalog_params

  dl.write_deltalake(


{'catalog_description': 'Catalogs the folders of the raw_images with extra metadata',
 'catalog_name': 'Raw Images Locations Catalog'}