In [None]:
# load the necessary modules
import json
import os
import unicodedata
from pathlib import Path

import darwin
import darwin.importer as importer
import pandas as pd
from darwin.client import Client
from darwin.exceptions import NotFound
from darwin.importer import get_importer

# Set up the connection with Darwin

In [None]:
# Create the Darwin client used by every cell below
from darwin.client import Client

# The API key is read from the JSON file at the relative path '../keys.json'
data = json.loads(Path('../keys.json').read_text())
API_KEY = data['darwin_api_key']
client = Client.from_api_key(API_KEY)

# Show the name of every remote dataset the client can list
datasets = client.list_remote_datasets()
for remote in datasets:
    print(remote.name)


In [None]:
def get_list_images_darwin_dataset(dataset_slug):
    """Return the filenames of all items stored in a remote Darwin dataset.

    Parameters
    ----------
    dataset_slug : str
        Dataset name or slug. It is normalised with ``clean_dataset_slug`` so
        the lookup follows the same slug rules (umlaut replacement,
        lower-casing, ``.`` -> ``-``) as the upload cells of this notebook.

    Returns
    -------
    list of str
        The ``filename`` attribute of every item yielded by the dataset's
        ``fetch_remote_files()``.

    Notes
    -----
    Uses the notebook-level ``client``; any exception raised by
    ``client.get_remote_dataset`` propagates to the caller.
    """
    remote_dataset = client.get_remote_dataset(clean_dataset_slug(dataset_slug))
    return [item.filename for item in remote_dataset.fetch_remote_files()]

def clean_dataset_slug(text):
    """Convert a dataset folder name into the slug form used in this notebook.

    German umlauts and the sharp s are replaced by ASCII stand-ins
    (ä/Ä -> a/A, ö/Ö -> o/O, ü/Ü -> u/U, ß -> ss), the result is lower-cased
    and every ``.`` is replaced by ``-``.

    The input is first normalised to Unicode NFC, so an umlaut written in
    decomposed form (base letter followed by the combining diaeresis U+0308)
    is handled exactly like its precomposed counterpart instead of leaking
    the combining mark into the slug.

    Parameters
    ----------
    text : str
        Original dataset / folder name.

    Returns
    -------
    str
        The cleaned, lower-case slug.
    """
    umlaut_table = str.maketrans({
        'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
        'Ä': 'A', 'Ö': 'O', 'Ü': 'U',
    })

    # Compose "o" + U+0308 into "ö" so the table lookup below can match it
    composed = unicodedata.normalize('NFC', text)

    return composed.translate(umlaut_table).lower().replace('.', '-')

# Example: umlauts are replaced and the whole name is lower-cased
original = "20241203_Löhre_Tänikon_S_30_F_70_H_12_O_krma_ID2"
result = clean_dataset_slug(original)
print(result)
# Output: 20241203_lohre_tanikon_s_30_f_70_h_12_o_krma_id2


# Uploading the datasets

In [None]:
# Upload every dataset folder found directly under `root`.
# For each folder: if the remote dataset already exists, push only the images
# whose filenames are not on the remote yet; otherwise create the dataset and
# push all local images.
root = "/data/images/rumex/Temp"
image_suffixes = {'.png', '.jpg', '.jpeg'}  # matched case-insensitively

for dataset_slug in os.listdir(root):
    dataset_dir = os.path.join(root, dataset_slug)

    # Entries that are not directories cannot be listed as datasets; skip them
    if not os.path.isdir(dataset_dir):
        continue

    # Keep only the image files (png / jpg / jpeg in any letter case)
    list_images_local = [
        name for name in os.listdir(dataset_dir)
        if Path(name).suffix.lower() in image_suffixes
    ]
    images = [Path(dataset_dir, name) for name in list_images_local]

    try:
        remote_dataset = client.get_remote_dataset(clean_dataset_slug(dataset_slug))
    except NotFound:
        # Only a missing dataset leads to creation. Any other failure
        # propagates instead of being misreported as "dataset does not exist".
        print(f"Dataset '{dataset_slug}' does not exist - Creating a new one")
        dataset = client.create_dataset(clean_dataset_slug(dataset_slug))
        # Upload images to the dataset
        handler = dataset.push(images)
        continue

    print(f"Dataset '{dataset_slug}' exists.")

    # The dataset exists: compare filenames to find what is still missing remotely
    remote_filenames = {item.filename for item in remote_dataset.fetch_remote_files()}
    # A list comprehension (rather than a set difference) keeps the local listing order
    list_images_on_local_but_not_remote = [
        name for name in list_images_local if name not in remote_filenames
    ]

    if not list_images_on_local_but_not_remote:
        print(f"No images uploaded in dataset '{dataset_slug}'")
    else:
        print(f"Uploading {len(list_images_on_local_but_not_remote)} images to dataset '{dataset_slug}'")
        print(list_images_on_local_but_not_remote)
        images_to_upload = [Path(dataset_dir, name) for name in list_images_on_local_but_not_remote]
        handler = remote_dataset.push(images_to_upload)


# Uploading tiled datasets (annotations only; the images have already been uploaded using the interface)

In [None]:
# Import the tiled ("splitted") annotation files of each listed dataset into
# the matching remote dataset named "<dataset_slug>splitted", in fixed-size batches.
root = (
    "/mnt/Data-Work-RE/26_Agricultural_Engineering-RE/263_DP/00_Darwin/digital-production"
)

# Loop-invariant settings: annotations sent per import call and the
# Darwin-format annotation parser
batch_size = 50
parser = get_importer('darwin')

for dataset_slug in ['haldennord10']:  # lightly, bildacher  (presumably other dataset names - confirm)

    images_dir = os.path.join(root, dataset_slug, 'images_splitted')
    print(images_dir)
    # Local tile images; collected here but not used by the annotation import below
    images = [
        Path(images_dir, name) for name in os.listdir(images_dir)
        if Path(name).suffix.lower() in {'.png', '.jpg', '.jpeg'}
    ]

    annotations_dir = os.path.join(root, dataset_slug, 'releases/1/annotations_splitted')
    # Sorted so the batch composition is reproducible between runs, which makes
    # it possible to resume from a known batch after a failure
    annotations = sorted(
        Path(annotations_dir, name) for name in os.listdir(annotations_dir)
        if Path(name).suffix.lower() == '.json'
    )

    # Raises if the remote dataset is missing, so the message below is only
    # printed for an existing dataset
    dataset = client.get_remote_dataset(clean_dataset_slug(dataset_slug+'splitted'))
    print(f"Dataset '{dataset_slug+'splitted'}' exists.")

    for i in range(0, len(annotations), batch_size):
        to_upload = annotations[i:i + batch_size]

        print(f"Uploading batch {i // batch_size + 1}: {len(to_upload)} items")
        # NOTE(review): append=False / overwrite=True presumably replace existing
        # annotations without asking for confirmation - confirm against the darwin-py docs
        importer.import_annotations(dataset, parser, to_upload, append=False, overwrite = True)