In [None]:
# load the necessary modules
import json
import os
import unicodedata
from pathlib import Path

import darwin
import darwin.importer as importer
import pandas as pd
from darwin.client import Client
from darwin.exceptions import NotFound
from darwin.importer import get_importer

# Set up the connection with Darwin

In [None]:
# Set up the connection
# `Client` is already imported in the notebook's import cell, so it is not
# imported again here.

# The API key is read from a JSON file next to the project instead of being
# hard-coded in the notebook. JSON is UTF-8 by specification, so the encoding
# is pinned rather than left to the platform locale.
with open('../keys.json', encoding='utf-8') as file:
    data = json.load(file)

API_KEY = data['darwin_api_key']
client = Client.from_api_key(API_KEY)

# Sanity check of the credentials: list every dataset visible to this key.
datasets = client.list_remote_datasets()

# Print dataset names
for dataset in datasets:
    print(dataset.name)


In [None]:
def get_list_images_darwin_dataset(dataset_slug):
    """Return the filenames of all items in a remote Darwin dataset.

    The slug is normalised with ``clean_dataset_slug`` so that this lookup
    resolves the same remote dataset as the upload cell, which creates and
    fetches datasets through that helper (umlauts replaced, lower-cased,
    dots turned into dashes).

    Parameters
    ----------
    dataset_slug : str
        Local dataset name (e.g. the folder name); it does not need to be
        cleaned beforehand.

    Returns
    -------
    list
        The ``filename`` attribute of every item yielded by
        ``fetch_remote_files()``.
    """
    remote_dataset = client.get_remote_dataset(clean_dataset_slug(dataset_slug))
    return [item.filename for item in remote_dataset.fetch_remote_files()]

def clean_dataset_slug(text):
    """Turn a local dataset (folder) name into the slug used on Darwin.

    Steps, in order:
      1. Unicode-normalise to NFC so that decomposed umlauts (a base letter
         followed by U+0308, as some file systems return them) are handled
         exactly like their precomposed form.
      2. Replace German umlauts and sharp s with ASCII equivalents.
      3. Lower-case the result and replace every '.' with '-'.

    Parameters
    ----------
    text : str
        Name to clean.

    Returns
    -------
    str
        The cleaned, lower-cased slug.
    """
    replacements = {
        'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
        'Ä': 'A', 'Ö': 'O', 'Ü': 'U'
    }

    # Without this, "o" + combining diaeresis would slip past the table below.
    text = unicodedata.normalize('NFC', text)

    for german_char, replacement in replacements.items():
        text = text.replace(german_char, replacement)

    return text.lower().replace('.', '-')

# Example: clean a folder name that contains umlauts and upper-case letters.
original = "20241203_Löhre_Tänikon_S_30_F_70_H_12_O_krma_ID2"
result = clean_dataset_slug(original)
print(result)
# Output: 20241203_lohre_tanikon_s_30_f_70_h_12_o_krma_id2
# (umlauts are replaced AND the whole slug is lower-cased)


# Uploading the datasets

In [None]:
# Every sub-folder of `root` is mirrored to one Darwin dataset whose slug is
# the cleaned folder name. Only images that are missing remotely are pushed.
root = "/data/images/rumex/Temp"

# Extensions (compared case-insensitively) that count as uploadable images.
image_extensions = {'.png', '.jpg', '.jpeg'}

# Sorted so that re-runs process the folders in a reproducible order.
for dataset_slug in sorted(os.listdir(root)):
    dataset_dir = Path(root) / dataset_slug

    # Stray files next to the dataset folders are not datasets; listing them
    # as directories would raise.
    if not dataset_dir.is_dir():
        continue

    # pick only the images with the following format png PNG JPG jpg JPEG jpeg
    list_images_local = sorted(
        entry.name
        for entry in dataset_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_extensions
    )

    # Add images to a Path list
    images = [dataset_dir / name for name in list_images_local]

    remote_slug = clean_dataset_slug(dataset_slug)

    # Only the lookup is guarded, and only for "dataset not found". Network,
    # authentication or upload errors must surface instead of being reported
    # as a missing dataset and triggering a create.
    try:
        remote_dataset = client.get_remote_dataset(remote_slug)
    except NotFound:
        print(f"Dataset '{dataset_slug}' does not exist - Creating a new one")
        dataset = client.create_dataset(remote_slug)
        # Upload images to the dataset (skipped for folders without images;
        # NOTE(review): push() with an empty list is expected to fail - confirm)
        if images:
            handler = dataset.push(images)
    else:
        print(f"Dataset '{dataset_slug}' exists.")

        # The dataset exists, check if the images are already uploaded
        list_images_remote = [item.filename for item in remote_dataset.fetch_remote_files()]
        remote_names = set(list_images_remote)

        # Keeps the sorted local order, so the printed list is deterministic.
        list_images_on_local_but_not_remote = [
            name for name in list_images_local if name not in remote_names
        ]

        if not list_images_on_local_but_not_remote:
            print(f"No images uploaded in dataset '{dataset_slug}'")
        else:
            print(f"Uploading {len(list_images_on_local_but_not_remote)} images to dataset '{dataset_slug}'")
            print(list_images_on_local_but_not_remote)
            images_to_upload = [dataset_dir / name for name in list_images_on_local_but_not_remote]
            handler = remote_dataset.push(images_to_upload)
