In [10]:
import boto3
from botocore.exceptions import ClientError
import os
from dotenv import load_dotenv

# Load connection settings from the environment (python-dotenv reads .env).
load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")

# Fail early with a readable message: concatenating "http://" with None
# would otherwise raise an opaque TypeError.
s3_api_endpoint = os.getenv("S3_API_ENDPOINT")
if not s3_api_endpoint:
    raise RuntimeError(
        "S3_API_ENDPOINT is not set; define it in the environment or in the .env file"
    )

# Accept endpoints written with or without an explicit scheme.
if s3_api_endpoint.startswith(("http://", "https://")):
    minio_url = s3_api_endpoint
else:
    minio_url = "http://" + s3_api_endpoint


# S3-compatible client pointed at the MinIO endpoint.
minio_client = boto3.client(
    "s3",
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=minio_url
)

# Bucket and manifest file names.
minio_bucket = "training-preparation-zone"
manifest_name = "dataset_train.json"
local_file = "./dataset_train.json"

In [11]:
# Create the bucket that will receive the augmented images.
new_bucket = "augmentation-zone"
try:
    minio_client.create_bucket(Bucket=new_bucket)
except ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code in ['BucketAlreadyExists', 'BucketAlreadyOwnedByYou']:
        print(f"Bucket '{new_bucket}' already exists")
    else:
        # Report every other failure instead of silently ignoring it.
        print(f"Error creating bucket: {e}")

Bucket 'augmentation-zone' already exists


In [12]:
# Create the training-preparation bucket; an "already exists" answer is
# reported as such, any other client error is printed in full.
new_bucket = "training-preparation-zone"
try:
    minio_client.create_bucket(Bucket=new_bucket)
except ClientError as err:
    already_present = err.response['Error']['Code'] in (
        'BucketAlreadyExists',
        'BucketAlreadyOwnedByYou',
    )
    print(
        f"Bucket '{new_bucket}' already exists"
        if already_present
        else f"Error creating bucket: {err}"
    )

Bucket 'training-preparation-zone' already exists


## Copy data

We will first copy the data from one zone to the other so that we can keep track of the changes made to it.


In [13]:
source_bucket = "exploitation-zone"

# Walk the listing page by page: a single list_objects_v2 call returns at most
# 1000 keys, so an unpaginated copy would silently skip the rest of the bucket.
paginator = minio_client.get_paginator("list_objects_v2")
copied_count = 0
for page in paginator.paginate(Bucket=source_bucket):
    for obj in page.get('Contents', []):
        copy_source = {'Bucket': source_bucket, 'Key': obj['Key']}
        # Server-side copy into the bucket named by new_bucket, keeping the key.
        minio_client.copy_object(CopySource=copy_source, Bucket=new_bucket, Key=obj['Key'])
        print(f"Copied {obj['Key']} from {source_bucket} to {new_bucket}")
        copied_count += 1

if copied_count == 0:
    print(f"No objects found in bucket '{source_bucket}'.")

Copied images/ISIC_0024388.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0024508.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0024853.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025118.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025200.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025202.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025298.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025343.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025430.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025806.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025874.png from exploitation-zone to training-preparation-zone
Copied images/ISIC_0025886.png from exploitation-zone to training-preparation-zone
Copi

In [14]:
import chromadb
import json
client = chromadb.HttpClient(host="localhost", port=8000)

# Handles to the text and image embedding collections.
collection = client.get_collection("text_multimodal_collection")
collection_image = client.get_collection("image_multimodal_collection")
objects = collection.get(include=["metadatas", "documents"])
text_data = collection.get(include=["embeddings", "metadatas", "documents"])
text_embeddings = text_data["embeddings"]

# For every text embedding, retrieve the single closest image record.
query_fields = ["metadatas", "documents", "distances"]
results = collection_image.query(
    query_embeddings=text_embeddings,
    n_results=1,
    include=query_fields,
)
print(results)

# One (image, text, score) entry per text id, using the top match.
dataset_pairs = [
    {
        "image": results["ids"][i][0],
        "text": text_path,
        "score": results["distances"][i][0],
    }
    for i, text_path in enumerate(text_data["ids"])
]

# Write the manifest locally, then upload it under the same name.
local_filename = "dataset_train.json"
with open(local_filename, "w") as manifest_file:
    manifest_file.write(json.dumps(dataset_pairs))

minio_client.upload_file(local_filename, new_bucket, local_filename)

{'ids': [['images/ISIC_0027249.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0026077.png'], ['images/ISIC_0026803.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0027249.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0028103.png'], ['images/ISIC_0029475.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0027999.png'], ['images/ISIC_0026152.png'], ['images/ISIC_0031987.png'], ['images/ISIC_0031380.png'], ['images/ISIC_0029220.png'], ['images/ISIC_0025874.png'], ['images/ISIC_0031442.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0032110.png'], ['images/ISIC_0033505.png'], ['images/ISIC_0031831.png'], ['images/ISIC_0027058.png'], ['images/ISIC_0031981.png'], ['images/ISIC_0030197.png'], ['images/ISIC_0032415.png'], ['images/ISIC_0032727.png'], ['images/ISIC_0029694.png'], ['images/ISIC_0029929.png'], ['images/ISIC_0029263.png'], ['images/ISIC_0025960.png'], ['images/ISIC_0034196.png'], ['images/ISIC_0032731.png'], ['ima

## Transforming images while maintaining the same embedding 

Here, we transform the images (data augmentation) while keeping the embedding of the original image, so that both share the same semantic representation.


In [None]:
def apply_transformations(image, rng=None):
    """Apply a random subset of augmentations to an image.

    Four steps are considered in order (rotation, brightness, contrast,
    horizontal flip); each one is applied only when a fresh random draw
    exceeds 0.5.

    Args:
        image: Image object supporting ``rotate`` and accepted by
            ``ImageEnhance`` / ``ImageOps``.
        rng: Optional random source exposing ``random()``, ``randint(a, b)``
            and ``uniform(a, b)`` (for example ``random.Random(seed)``).
            Pass a seeded instance to make the augmentation reproducible.
            Defaults to the global ``random`` module.

    Returns:
        Tuple ``(image, transformations)`` where ``transformations`` is the
        list of labels of the steps that were applied, in application order.
    """
    if rng is None:
        rng = random
    transformations = []

    # Random rotation between -15 and 15 degrees; uncovered corners are filled white.
    if rng.random() > 0.5:
        angle = rng.randint(-15, 15)
        image = image.rotate(angle, expand=False, fillcolor='white')
        transformations.append(f"rotated_{angle}")

    # Brightness adjustment by a factor in [0.8, 1.2].
    if rng.random() > 0.5:
        enhancer = ImageEnhance.Brightness(image)
        factor = rng.uniform(0.8, 1.2)
        image = enhancer.enhance(factor)
        transformations.append(f"brightness_{factor:.2f}")

    # Contrast adjustment by a factor in [0.8, 1.2].
    if rng.random() > 0.5:
        enhancer = ImageEnhance.Contrast(image)
        factor = rng.uniform(0.8, 1.2)
        image = enhancer.enhance(factor)
        transformations.append(f"contrast_{factor:.2f}")

    # Horizontal flip.
    if rng.random() > 0.5:
        image = ImageOps.mirror(image)
        transformations.append("flipped")

    return image, transformations

In [None]:
import io
from PIL import Image, ImageEnhance, ImageOps
import random
import chromadb
import time
import boto3

# Connect to ChromaDB and open the image embedding collection.
chroma_client = chromadb.HttpClient(host="localhost", port=8000)
collection_image = chroma_client.get_collection("image_multimodal_collection")

# Images are read from the exploitation-zone bucket and the augmented copies
# are written to the augmentation-zone bucket.
source_bucket = "exploitation-zone"
destination_bucket = "augmentation-zone"
# Lazily loaded embedder: the embedding function is imported only when it is
# first needed, so the model is not loaded when it turns out not to be required.
_embed_image_cache = None
def get_embed_image():
    """Return the project's ``embed_image`` function, importing it on first use.

    The project root that contains the ``src`` package is taken from the
    ``ADSDB_PROJECT_ROOT`` environment variable; when the variable is unset the
    original hard-coded location is used, so existing setups keep working.
    The imported function is cached in ``_embed_image_cache``.

    Returns:
        The ``embed_image`` callable from ``src.embedder``.

    Raises:
        ImportError: if ``src.embedder`` cannot be imported (any other
            exception raised while importing it propagates unchanged).
    """
    global _embed_image_cache
    if _embed_image_cache is None:
        import os
        import sys
        project_root = os.environ.get(
            "ADSDB_PROJECT_ROOT",
            '/Users/carlesaguilera/Desktop/ADSDB2/ADSDB/Part2',
        )
        # A failed import leaves the cache empty, so this code can run again;
        # avoid piling up duplicate sys.path entries on every retry.
        if project_root not in sys.path:
            sys.path.append(project_root)
        from src.embedder import embed_image
        _embed_image_cache = embed_image
    return _embed_image_cache

# Augment every image under the "images/" prefix of the source bucket, upload
# the result to the destination bucket and link it to the ChromaDB record of
# the original image, which keeps a single embedding per original.
import traceback

IMAGE_PREFIX = "images/"
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')  # compared against the lower-cased key
DOWNLOAD_DELAY_SECONDS = 0.05  # small pause before each download (rate limiting)
UPLOAD_DELAY_SECONDS = 0.3     # pause after each upload (previously 0.2 s + 0.1 s)


def _original_record_metadata(associated_images, transformations_info=None):
    """Build the metadata dict stored on an original image's record.

    ``associated_images`` is the comma-separated list of image keys;
    ``transformations_info`` is included only when provided.
    """
    metadata = {
        "type": "original",
        "image_type": "original",
        "associated_images": associated_images,
    }
    if transformations_info is not None:
        metadata["transformations_info"] = transformations_info
    return metadata


def _resolve_original_embedding(original_id, original_image):
    """Return the embedding for ``original_id``, generating it when missing.

    The stored embedding is used when ChromaDB has one. Otherwise, or when the
    lookup itself fails, the embedder is loaded, the embedding is computed from
    ``original_image`` and stored. Returns ``None`` when no embedding could be
    obtained, in which case the caller skips the image.
    """
    try:
        result = collection_image.get(ids=[original_id], include=["embeddings"])
        embeddings = result['embeddings']
        if result['ids'] and embeddings is not None and len(embeddings) > 0:
            print(f"Embedding encontrado para {original_id}")
            return embeddings[0]
        print(f"⚠ Embedding NO encontrado para {original_id}")
    except Exception as e:
        print(f"⚠ Error obteniendo embedding para {original_id}: {e}")

    print(f"⚠ Intentando generar embedding (requiere espacio en disco ~1.7GB)...")
    # Load the embedder only now that it is actually needed.
    try:
        embed_image_fn = get_embed_image()
    except Exception as load_error:
        print(f"✗ Error cargando modelo CLIP: {load_error}")
        print(f"✗ No se puede generar embedding. Saltando esta imagen.")
        return None

    try:
        embedding = embed_image_fn(original_image)
        # upsert creates the record or overwrites an existing one, replacing
        # the former add() + bare-except + update() fallback.
        collection_image.upsert(
            ids=[original_id],
            embeddings=[embedding],
            metadatas=[_original_record_metadata(original_id)],
        )
    except Exception as gen_error:
        print(f"✗ Error generando embedding: {gen_error}")
        print(f"✗ Saltando esta imagen.")
        return None

    print(f"✓ Embedding generado y guardado para {original_id}")
    return embedding


def _attach_augmented_image(original_id, new_key, original_embedding, transformations):
    """Append ``new_key`` to the associated-image list of the original record.

    No new record is created for the augmented image; the original record's
    metadata is rewritten with the extended list. If that fails, a fresh
    record holding both keys is added instead. Errors are printed, not raised.
    """
    try:
        current_result = collection_image.get(ids=[original_id], include=["metadatas"])
        metadatas = current_result['metadatas']
        current_metadata = metadatas[0] if metadatas and metadatas[0] else {}

        # The stored value may be a comma-separated string or a list.
        stored = current_metadata.get('associated_images')
        if stored is None:
            associated_images = [original_id]
        elif isinstance(stored, str):
            associated_images = stored.split(',')
        elif isinstance(stored, list):
            associated_images = list(stored)
        else:
            associated_images = [stored]

        if new_key not in associated_images:
            associated_images.append(new_key)

        # NOTE(review): transformations_info holds only the latest augmented
        # image; earlier entries are overwritten on re-runs.
        transformations_info = f"{new_key}:{', '.join(transformations) if transformations else 'none'}"
        collection_image.update(
            ids=[original_id],
            embeddings=[original_embedding],  # keep the same embedding
            metadatas=[_original_record_metadata(','.join(associated_images), transformations_info)],
        )
        print(f"✓ Imagen transformada agregada al embedding de {original_id}")
    except Exception as e:
        print(f"⚠ Error actualizando embedding para {original_id}: {e}")
        # Fall back to creating the original record if it does not exist.
        try:
            collection_image.add(
                ids=[original_id],
                embeddings=[original_embedding],
                metadatas=[_original_record_metadata(f"{original_id},{new_key}")],
            )
            print(f"✓ Registro creado con ambas imágenes")
        except Exception as create_error:
            print(f"✗ Error creando registro: {create_error}")


# Paginate the listing: a single list_objects_v2 call returns at most 1000 keys.
image_objects = [
    obj
    for page in minio_client.get_paginator("list_objects_v2").paginate(
        Bucket=source_bucket, Prefix=IMAGE_PREFIX
    )
    for obj in page.get('Contents', [])
]

if image_objects:
    transformed_count = 0
    for obj in image_objects:
        original_id = obj['Key']
        if not original_id.lower().endswith(IMAGE_EXTENSIONS):
            continue
        try:
            time.sleep(DOWNLOAD_DELAY_SECONDS)

            # Download the original image.
            image_obj = minio_client.get_object(Bucket=source_bucket, Key=original_id)
            original_image = Image.open(io.BytesIO(image_obj['Body'].read()))

            original_embedding = _resolve_original_embedding(original_id, original_image)
            if original_embedding is None:
                continue

            # Augment a copy so the downloaded image stays untouched.
            transformed_image, transformations = apply_transformations(original_image.copy())

            # Derive the new key from the original: drop the listing prefix and
            # the (already validated) extension, whatever its letter case.
            base_name = original_id[len(IMAGE_PREFIX):].rsplit('.', 1)[0]
            transform_suffix = '_'.join(transformations) if transformations else 'augmented'
            new_key = f"images/{base_name}_{transform_suffix}.png"

            # Encode as PNG in memory and upload.
            buffer = io.BytesIO()
            transformed_image.save(buffer, format='PNG')
            minio_client.put_object(
                Bucket=destination_bucket,
                Key=new_key,
                Body=buffer.getvalue(),
                ContentType='image/png'
            )
            time.sleep(UPLOAD_DELAY_SECONDS)

            _attach_augmented_image(original_id, new_key, original_embedding, transformations)

            transformed_count += 1
            print(f"  Transformada: {original_id} -> {new_key}")
            print(f"  Imagen agregada a la lista de imágenes asociadas del embedding")
            print(f"  Transformaciones: {', '.join(transformations) if transformations else 'ninguna'}")

        except Exception as e:
            print(f"Error procesando {obj['Key']}: {e}")
            traceback.print_exc()
            continue

    print(f"\nProceso completado: {transformed_count} imágenes transformadas")
    print(f" En ChromaDB: cada embedding único está almacenado UNA SOLA VEZ")
    print(f" con todas sus imágenes asociadas (original + transformadas) en los metadatos")
else:
    print(f"No se encontraron imágenes en el bucket '{source_bucket}'.")



Embedding encontrado para images/ISIC_0024388.png
✓ Imagen transformada agregada al embedding de images/ISIC_0024388.png
  Transformada: images/ISIC_0024388.png -> images/ISIC_0024388_contrast_1.19_flipped.png
  Imagen agregada a la lista de imágenes asociadas del embedding
  Transformaciones: contrast_1.19, flipped
Embedding encontrado para images/ISIC_0024508.png
✓ Imagen transformada agregada al embedding de images/ISIC_0024508.png
  Transformada: images/ISIC_0024508.png -> images/ISIC_0024508_rotated_-13_brightness_1.03_flipped.png
  Imagen agregada a la lista de imágenes asociadas del embedding
  Transformaciones: rotated_-13, brightness_1.03, flipped
Embedding encontrado para images/ISIC_0024853.png
✓ Imagen transformada agregada al embedding de images/ISIC_0024853.png
  Transformada: images/ISIC_0024853.png -> images/ISIC_0024853_brightness_0.82_contrast_1.15_flipped.png
  Imagen agregada a la lista de imágenes asociadas del embedding
  Transformaciones: brightness_0.82, contras

## Validate that the transformed images share the same embedding

Now we print every embedding together with all of its associated images, so we can verify that the augmentation step completed successfully.

In [None]:
import numpy as np

data = collection_image.get(include=["metadatas", "documents", "embeddings"])

# Each record carries the list of its associated images in its metadata.
print(f"Total de embeddings en ChromaDB: {len(data['ids'])}")
print("=" * 80)

total_images_count = 0
records = zip(data["embeddings"], data["ids"], data["metadatas"])
for position, (vector, record_id, meta) in enumerate(records, start=1):
    print(f"\nEmbedding {position}:")
    print(f"  ID del registro: {record_id}")
    print(f"  Vector: {vector[:5]}... (primeros 5 valores)")

    has_links = bool(meta) and 'associated_images' in meta
    if not has_links:
        # No associated-image list: the record stands for its own id only.
        print(f"  Imagen: {record_id}")
        total_images_count += 1
    else:
        # The stored value may be a comma-separated string, a list or a scalar.
        raw_links = meta['associated_images']
        if isinstance(raw_links, str):
            linked = [name.strip() for name in raw_links.split(',')]
        elif isinstance(raw_links, list):
            linked = raw_links
        else:
            linked = [raw_links]

        print(f"  Total de imágenes asociadas: {len(linked)}")
        print("  Imágenes asociadas:")
        for name in linked:
            print(f"    - {name}")
        total_images_count += len(linked)

        # Show the transformation details when they were recorded.
        if 'transformations_info' in meta:
            print(f"  Info transformaciones: {meta['transformations_info']}")

    if meta:
        print(f"  Otros metadatos: {meta}")
    print("-" * 80)

print("\nResumen:")
print(f"  Total de embeddings únicos: {len(data['ids'])}")
print(f"  Total de imágenes (originales + transformadas): {total_images_count}")

Total de embeddings en ChromaDB: 259

Embedding 1:
  ID del registro: images/ISIC_0024388.png
  Vector: [ 0.00830536  0.01662186 -0.01564987  0.04551699  0.0083858 ]... (primeros 5 valores)
  Total de imágenes asociadas: 2
  Imágenes asociadas:
    - images/ISIC_0024388.png
    - images/ISIC_0024388_contrast_1.19_flipped.png
  Info transformaciones: images/ISIC_0024388_contrast_1.19_flipped.png:contrast_1.19, flipped
  Otros metadatos: {'associated_images': 'images/ISIC_0024388.png,images/ISIC_0024388_contrast_1.19_flipped.png', 'image_type': 'original', 'type': 'original', 'transformations_info': 'images/ISIC_0024388_contrast_1.19_flipped.png:contrast_1.19, flipped'}
--------------------------------------------------------------------------------

Embedding 2:
  ID del registro: images/ISIC_0024508.png
  Vector: [-0.01091168 -0.02055728 -0.04908438  0.04230993  0.01473207]... (primeros 5 valores)
  Total de imágenes asociadas: 2
  Imágenes asociadas:
    - images/ISIC_0024508.png
    

In [None]:
data2 = collection_image.get(include=["metadatas", "documents", "embeddings"])
# Print the stored information of every embedding record.

# Recount the images from this snapshot so the summary below matches the data
# printed here instead of relying on a value left behind by another cell.
total_images_count = 0
for idx, (embedding, img_id, metadata) in enumerate(zip(data2["embeddings"], data2["ids"], data2["metadatas"])):
    print(f"\nEmbedding {idx + 1}:")
    print(f"  ID del registro: {img_id}")
    print(f"  Vector: {embedding[:5]}... (primeros 5 valores)")

    # associated_images may be a comma-separated string or a list; a record
    # without it counts as a single image (its own id).
    linked = metadata.get('associated_images') if metadata else None
    if isinstance(linked, str):
        total_images_count += len(linked.split(','))
    elif isinstance(linked, list):
        total_images_count += len(linked)
    else:
        total_images_count += 1

    if metadata:
        print(f"  Otros metadatos: {metadata}")
    print("-" * 80)

print(f"\nResumen:")
print(f"  Total de embeddings únicos: {len(data2['ids'])}")
print(f"  Total de imágenes (originales + transformadas): {total_images_count}")



Embedding 1:
  ID del registro: images/ISIC_0024388.png
  Vector: [ 0.00830536  0.01662186 -0.01564987  0.04551699  0.0083858 ]... (primeros 5 valores)
  Otros metadatos: {'associated_images': 'images/ISIC_0024388.png,images/ISIC_0024388_contrast_1.19_flipped.png', 'transformations_info': 'images/ISIC_0024388_contrast_1.19_flipped.png:contrast_1.19, flipped', 'image_type': 'original', 'type': 'original'}
--------------------------------------------------------------------------------

Embedding 2:
  ID del registro: images/ISIC_0024508.png
  Vector: [-0.01091168 -0.02055728 -0.04908438  0.04230993  0.01473207]... (primeros 5 valores)
  Otros metadatos: {'image_type': 'original', 'associated_images': 'images/ISIC_0024508.png,images/ISIC_0024508_rotated_-13_brightness_1.03_flipped.png', 'type': 'original', 'transformations_info': 'images/ISIC_0024508_rotated_-13_brightness_1.03_flipped.png:rotated_-13, brightness_1.03, flipped'}
----------------------------------------------------------

In [None]:
import chromadb
import json
client = chromadb.HttpClient(host="localhost", port=8000)

collection = client.get_collection("text_multimodal_collection")
collection_image = client.get_collection("image_multimodal_collection")
objects = collection.get(include=["metadatas", "documents"])
text_data = collection.get(
    include=["embeddings", "metadatas", "documents"]
)
text_embeddings = text_data["embeddings"]

# Number of image matches requested per text for the augmented manifest.
N_AUGMENTED_RESULTS = 3

dataset_pairs = []
dataset_pairs_augmented = []

# Baseline manifest: the single closest image for each text embedding.
results = collection_image.query(
    query_embeddings=text_embeddings,
    n_results=1,
    include=["metadatas", "documents", "distances"],
)

# For the augmented manifest we request several results per text, to also
# catch some of the images that have been modified.
results_augmented = collection_image.query(
    query_embeddings=text_embeddings,
    n_results=N_AUGMENTED_RESULTS,
    include=["metadatas", "documents", "distances"],
)
print(results)
print(results_augmented)

for i, text_path in enumerate(text_data["ids"]):
    # Skip texts for which the query returned no match at all.
    if not results["ids"][i]:
        continue
    best_image_path = results["ids"][i][0]
    score = results["distances"][i][0]
    dataset_pairs.append({
        "image": best_image_path,
        "text": text_path,
        "score": score
    })

# Iterate over what was actually returned rather than a fixed range: a query
# may yield fewer than N_AUGMENTED_RESULTS matches (e.g. a small collection),
# and fixed indexing would then raise IndexError.
for i, text_path in enumerate(text_data["ids"]):
    matches = zip(results_augmented["ids"][i], results_augmented["distances"][i])
    for best_image_path, score in matches:
        dataset_pairs_augmented.append({
            "image": best_image_path,
            "text": text_path,
            "score": score
        })

# Write both manifests locally, then upload them under the same names.
local_filename = "dataset_train.json"
local_filename_augmented = "dataset_train_augmented.json"
with open(local_filename, "w") as f:
    json.dump(dataset_pairs, f)
with open(local_filename_augmented, "w") as f:
    json.dump(dataset_pairs_augmented, f)

minio_client.upload_file(local_filename, new_bucket, local_filename)
minio_client.upload_file(local_filename_augmented, new_bucket, local_filename_augmented)

{'ids': [['images/ISIC_0025899_rotated_-13_contrast_1.06.png'], ['images/ISIC_0026803_rotated_-2_contrast_1.19_flipped.png'], ['images/ISIC_0026803_rotated_-2_contrast_1.19_flipped.png'], ['images/ISIC_0029577_brightness_0.87_contrast_0.86_flipped.png'], ['images/ISIC_0031981.png'], ['images/ISIC_0026803_rotated_-2_contrast_1.19_flipped.png'], ['images/ISIC_0030492_contrast_0.96.png'], ['images/ISIC_0024853_rotated_15.png'], ['images/ISIC_0025960_rotated_3_contrast_1.07.png'], ['images/ISIC_0028103_rotated_2_contrast_0.89_flipped.png'], ['images/ISIC_0026283_rotated_-13_brightness_1.03_flipped.png'], ['images/ISIC_0025343_flipped.png'], ['images/ISIC_0027219_brightness_1.03_contrast_0.89_flipped.png'], ['images/ISIC_0026803_rotated_-2_contrast_1.19_flipped.png'], ['images/ISIC_0026283_rotated_-13_brightness_1.03_flipped.png'], ['images/ISIC_0032110_rotated_-8_brightness_1.19_contrast_1.02.png'], ['images/ISIC_0027999.png'], ['images/ISIC_0034129_contrast_0.92.png'], ['images/ISIC_00321