In [1]:
import boto3
from botocore.exceptions import ClientError
import os
from dotenv import load_dotenv

load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")
minio_url = "http://" + os.getenv("S3_API_ENDPOINT")


minio_client = boto3.client(
    "s3",
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=minio_url
)

manifest_name = "dataset_labelled.json"

In [2]:
new_bucket = "labelling-zone"
try:
    minio_client.create_bucket(Bucket=new_bucket)
except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code in ['BucketAlreadyExists', 'BucketAlreadyOwnedByYou']:
        print(f"Bucket '{new_bucket}' already exists")
    else:
        print(f"Error creating bucket: {e}")

Bucket 'labelling-zone' already exists


## Copy data

We first will copy the data from one zone to the other so we can keep track of the changes being made to the data.


In [3]:
source_bucket = "exploitation-zone"

response = minio_client.list_objects_v2(Bucket=source_bucket)
if 'Contents' in response:
    for obj in response['Contents']:
        copy_source = {'Bucket': source_bucket, 'Key': obj['Key']}
        minio_client.copy_object(CopySource=copy_source, Bucket=new_bucket, Key=obj['Key'])
        print(f"Copied {obj['Key']} from {source_bucket} to {new_bucket}")
else:
    print(f"No objects found in bucket '{source_bucket}'.")

Copied images/ISIC_0024388.png from exploitation-zone to labelling-zone
Copied images/ISIC_0024508.png from exploitation-zone to labelling-zone
Copied images/ISIC_0024853.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025118.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025200.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025202.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025298.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025343.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025430.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025806.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025874.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025886.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025899.png from exploitation-zone to labelling-zone
Copied images/ISIC_0025960.png from exploitation-zone to labelli

In [3]:
import chromadb
import json
client = chromadb.HttpClient(host=os.getenv("CHROMADB_ENDPOINT"), port=os.getenv("CHROMADB_PORT"))

collection = client.get_collection("text_multimodal_collection")
collection_image = client.get_collection("image_multimodal_collection")
objects = collection.get(include=["metadatas", "documents"])
text_data = collection.get(
    include=["embeddings", "metadatas", "documents"]
)
text_embeddings = text_data["embeddings"]

dataset_pairs = []

results = collection_image.query(
    query_embeddings=text_embeddings,
    n_results=1,
    include=["metadatas", "documents", "distances"],
)
print(results)

for i, text_path in enumerate(text_data["ids"]):
    best_image_path = results["ids"][i][0]
    score = results["distances"][i][0]
    dataset_pairs.append({
        "image": best_image_path,
        "text": text_path,
        "score": score
    })

with open(manifest_name, "w") as f:
    json.dump(dataset_pairs, f)

minio_client.upload_file(manifest_name, new_bucket, manifest_name)

{'ids': [['images/ISIC_0027249.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0026077.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0027249.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0028103.png'], ['images/ISIC_0029475.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0027999.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0031380.png'], ['images/ISIC_0029220.png'], ['images/ISIC_0025874.png'], ['images/ISIC_0031442.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0032110.png'], ['images/ISIC_0033505.png'], ['images/ISIC_0031831.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0031981.png'], ['images/ISIC_0030197.png'], ['images/ISIC_0032415.png'], ['images/ISIC_0032727.png'], ['images/ISIC_0029694.png'], ['images/ISIC_0029929.png'], ['images/ISIC_0029263.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0034196.png'], ['images/ISIC_0032731.png'], ['ima