In [3]:
import boto3
import os
from dotenv import load_dotenv

load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")
minio_url = "http://" + os.getenv("S3_API_ENDPOINT")


minio_client = boto3.client(
    "s3",
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=minio_url
)

new_bucket = "training-preparation-zone"
try:
    minio_client.create_bucket(Bucket=new_bucket)
except (minio_client.exceptions.BucketAlreadyExists, minio_client.exceptions.BucketAlreadyOwnedByYou):
    print(f"Bucket '{new_bucket}' already exists")

Bucket 'training-preparation-zone' already exists


## Copy data

We first will copy the data from one zone to the other so we can keep track of the changes being made to the data.


In [9]:
source_bucket = "exploitation-zone"

response = minio_client.list_objects_v2(Bucket=source_bucket)

if 'Contents' in response:
    for obj in response['Contents']:
        copy_source = {'Bucket': source_bucket, 'Key': obj['Key']}
        minio_client.copy_object(CopySource=copy_source, Bucket=new_bucket, Key=obj['Key'])
        print(f"Copied {obj['Key']} from {source_bucket} to {new_bucket}")
else:
    print(f"No objects found in bucket '{source_bucket}'.")

Copied audios/answer_0.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_1.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_10.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_11.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_12.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_13.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_14.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_15.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_16.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_17.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_18.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_19.mp3 from exploitation-zone to training-preparation-zone
Copied audios/answer_2.mp3 from exploitati

In [5]:
import chromadb
import json
client = chromadb.HttpClient(host="localhost", port=8000)

collection = client.get_collection("text_multimodal_collection")
collection_image = client.get_collection("image_multimodal_collection")
objects = collection.get(include=["metadatas", "documents"])
text_data = collection.get(
    include=["embeddings", "metadatas", "documents"]
)
text_embeddings = text_data["embeddings"]

dataset_pairs = []

results = collection_image.query(
    query_embeddings=text_embeddings,
    n_results=1,
    include=["metadatas", "documents", "distances"],
)
print(results)

for i, text_path in enumerate(text_data["ids"]):
    best_image_path = results["ids"][i][0]
    score = results["distances"][i][0]
    dataset_pairs.append({
        "image": best_image_path,
        "text": text_path,
        "score": score
    })
local_filename = "dataset_train.json"
with open(local_filename, "w") as f:
    json.dump(dataset_pairs, f)

minio_client.upload_file(local_filename, new_bucket, local_filename)

{'ids': [['images/ISIC_0027249.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0026077.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0027249.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0028103.png'], ['images/ISIC_0029475.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0027999.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0031380.png'], ['images/ISIC_0029220.png'], ['images/ISIC_0025874.png'], ['images/ISIC_0031442.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0032110.png'], ['images/ISIC_0033505.png'], ['images/ISIC_0031831.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0031981.png'], ['images/ISIC_0030197.png'], ['images/ISIC_0032415.png'], ['images/ISIC_0032727.png'], ['images/ISIC_0029694.png'], ['images/ISIC_0029929.png'], ['images/ISIC_0029263.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0034196.png'], ['images/ISIC_0032731.png'], ['ima