# Context

When I tried to run a training using Darwin's get_dataset tool, I got an error message saying that some annotations contain negative values.
After I contacted Darwin about this issue, they said this behavior is expected. The workaround is to trim the negative annotations.
They sent me a Python script called 00-trim_annotations_to_border_single_frame.py for this purpose.

Steps:
- Download the Darwin annotations.
- Trim them.
- Upload them to Darwin, overwriting the existing annotations.

In [None]:
import os
from tqdm import tqdm

# === Paths and Dataset Selection ===
# Local root folder; each dataset lives in its own sub-folder below it.
darwin_path = (
    "/mnt/Data-Work-RE/26_Agricultural_Engineering-RE/263_DP/01_Projekte/"
    "2020-Fenaco-Drohnen/code/Rumex-Paper-1/assets/digital-production"
)

# Names of the datasets handled by this notebook.
datasets = [
    "haldennord09",
    "haldensued10",
    "haldensued08",
    "haldennord10",
    "lightly",
]


In [None]:
def get_latest_darwin_version_on_local(path):
    """
    Return the name of the highest-numbered release folder inside ``path``.

    Only sub-directories whose names consist solely of decimal digits are
    considered. The name is returned exactly as it appears on disk
    (``'007'`` stays ``'007'``), so it can be joined back onto ``path``.

    Args:
        path: Directory to scan, e.g. ``<darwin_path>/<dataset>/releases``.

    Returns:
        The folder name (str) with the largest integer value, or ``None``
        when ``path`` cannot be read or contains no numeric folder.

    Example:
        dataset = 'haldensued10'
        releases_dir = os.path.join(darwin_path, dataset, 'releases')
        latest_version = get_latest_darwin_version_on_local(releases_dir)
    """
    # Keep the try body minimal: only the directory listing can fail here.
    try:
        entries = os.listdir(path)
    except FileNotFoundError:
        print(f"Directory not found: {path}")
        return None
    except OSError as e:
        print(f"Error reading from {path}: {e}")
        return None

    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
    # so a stray name such as '²' no longer aborts the whole lookup.
    numeric_folders = [
        name
        for name in entries
        if name.isdecimal() and os.path.isdir(os.path.join(path, name))
    ]

    # Compare by integer value but return the original name, preserving any
    # leading zeros so the result still matches the folder on disk.
    return max(numeric_folders, key=int, default=None)

# Example: look up the newest local release of a single dataset.
dataset = 'haldensued10'
# Named `releases_dir` rather than `dir` to avoid shadowing the builtin.
releases_dir = os.path.join(darwin_path, dataset, 'releases')
latest_version = get_latest_darwin_version_on_local(releases_dir)
latest_version

In [None]:
for d in tqdm(datasets):
    # Get the dataset version
    dir = os.path.join(darwin_path, d, 'releases')
    v = get_latest_darwin_version_on_local(dir)
    # Construct the input link
    input_path = os.path.join(darwin_path, d, 'releases', v, 'annotations')
    # Construct the output link
    output_path = os.path.join(darwin_path, d, 'releases', 'fixed', 'annotations')
    # run the python script
    !python3 ./trim_annotations_folder.py -i "{input_path}" -o "{output_path}" 
    

In [None]:
# Sanity check: how many files ended up in the 'fixed' annotations folder
# of the first dataset?
dataset = datasets[0]
print(dataset)
ann_path = os.path.join(
    darwin_path,
    dataset,
    'releases',
    'fixed',
    'annotations',
)
len(os.listdir(ann_path))

In [None]:
from darwin.client import Client
import darwin.importer as importer
from darwin.importer import get_importer

# SECURITY: never hard-code the API key in the notebook. The key that was
# previously stored here in plain text should be revoked and replaced.
# The key is now read from the DARWIN_API_KEY environment variable.
API_KEY = os.environ.get("DARWIN_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the DARWIN_API_KEY environment variable before running this cell."
    )
client = Client.from_api_key(API_KEY)
# Importer for annotation files in the Darwin JSON format.
parser = get_importer("darwin")


In [None]:
# Display the current value of `d` (the dataset name last assigned in an earlier cell).
d

In [None]:
# Select a single dataset (index 4 is 'lightly') and collect its trimmed
# annotation files for a batched upload.
d = datasets[4]
ann_path = os.path.join(darwin_path, d, 'releases', 'fixed', 'annotations')
dataset = client.get_remote_dataset(f"digital-production/{d}")
# Match the real '.json' extension (a bare 'json' suffix would also accept
# names like 'foojson') and sort so the batches are reproducible, since
# os.listdir returns entries in arbitrary order.
json_files = sorted(
    os.path.join(ann_path, f)
    for f in os.listdir(ann_path)
    if f.endswith('.json')
)
print(len(json_files))


In [None]:
# Import the collected annotation files in chunks of `batch_size` files,
# overwriting the annotations that already exist remotely.
batch_size = 50
for start in range(0, len(json_files), batch_size):
    chunk = json_files[start:start + batch_size]
    importer.import_annotations(
        dataset,
        parser,
        chunk,
        append=False,
        overwrite=True,
    )

In [None]:
# Upload the trimmed annotations: the whole 'fixed' annotations folder is
# passed to the importer in one call per dataset (from index 4 onwards).
for d in tqdm(datasets[4:]):
    ann_path = os.path.join(darwin_path, d, 'releases', 'fixed', 'annotations')
    dataset = client.get_remote_dataset(f"digital-production/{d}")
    importer.import_annotations(
        dataset,
        parser,
        [ann_path],
        append=False,
        overwrite=True,
    )


In [None]:
# Now export the new version again
!darwin dataset export digital-production/{dataset]

In [None]:
# Pull the version 2 of the dataset
!darwin dataset pull digital-production/{dataset]

In [None]:
# And split it again
!darwin dataset split digital-production/{dataset] --val-percentage 0.1 --test-percentage 0.2