In [2]:
import boto3
import os
from dotenv import load_dotenv

# Read the MinIO/S3 connection settings from the environment (after loading .env).
load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")

# Fail with a readable message instead of the opaque
# "can only concatenate str (not NoneType) to str" TypeError when the endpoint is unset.
s3_api_endpoint = os.getenv("S3_API_ENDPOINT")
if not s3_api_endpoint:
    raise RuntimeError(
        "S3_API_ENDPOINT is not set; define it in the environment or in the .env file"
    )

# Prepend the scheme only when the endpoint does not already carry one, so both
# "host:9000" and "http://host:9000" produce a valid URL.
minio_url = s3_api_endpoint if "://" in s3_api_endpoint else "http://" + s3_api_endpoint


# S3 client pointed at the configured endpoint; later cells reuse `minio_client`.
minio_client = boto3.client(
    "s3",
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=minio_url,
)

# Make sure the destination bucket is present. Either "already exists" error from
# the service is treated as a non-fatal outcome and only reported.
new_bucket = "exploitation-zone"
bucket_exists_errors = (
    minio_client.exceptions.BucketAlreadyExists,
    minio_client.exceptions.BucketAlreadyOwnedByYou,
)
try:
    minio_client.create_bucket(Bucket=new_bucket)
except bucket_exists_errors:
    print(f"Bucket '{new_bucket}' already exists")

Bucket 'exploitation-zone' already exists


In [7]:
import chromadb
from transformers import ClapModel, ClapProcessor
import librosa
import io
import torch

# --- Object storage: source and destination buckets, plus the listing paginator ---
trusted_zone = "trusted-zone"
exploitation_zone = "exploitation-zone"
paginator = minio_client.get_paginator("list_objects_v2")

# --- Vector store: Chroma server reached over HTTP ---
collection_name = "exploitation_zone-audio"
client = chromadb.HttpClient(host="localhost", port=8000)

# --- Audio embedding model ---
# Sample rate used both when decoding audio and when calling the processor.
TARGET_SAMPLE_RATE = 48000
model_id = "laion/clap-htsat-unfused"
processor = ClapProcessor.from_pretrained(model_id)
model = ClapModel.from_pretrained(model_id)

# Start from an empty collection on every run: drop any previous copy first.
try:
    client.delete_collection(name=collection_name)
except Exception as e:
    # Best effort: the delete is expected to fail when the collection does not
    # exist yet. Report the reason instead of hiding it, then carry on.
    print(f"Collection '{collection_name}' was not deleted: {e}")

try:
    collection = client.get_or_create_collection(name=collection_name)
except Exception as e:
    print(f"Error accessing or creating collection: {e}")
    # Re-raise so execution stops here. `exit()` is an interactive helper
    # (IPython substitutes its own object for it) and does not reliably abort a
    # notebook cell, which would leave `collection` undefined for the code below.
    raise

# For every object under "audio/" in the trusted zone: compute an audio embedding,
# store it in the Chroma collection, then copy the object into the exploitation
# zone under the same key.
for page in paginator.paginate(Bucket=trusted_zone, Prefix="audio/"):
    for obj in page.get("Contents", []):
        key = obj.get("Key", "")
        # Folder placeholders (keys ending in "/") and zero-byte objects contain no
        # audio; decoding them would raise and abort the whole run, so skip them.
        if not key or key.endswith("/") or obj.get("Size") == 0:
            print(f"Skipping non-audio object: {key!r}")
            continue

        response = minio_client.get_object(Bucket=trusted_zone, Key=key)
        audio_bytes = response["Body"].read()

        # Decode from memory into a mono waveform resampled to TARGET_SAMPLE_RATE.
        audio_waveform, _ = librosa.load(
            io.BytesIO(audio_bytes),
            sr=TARGET_SAMPLE_RATE,
            mono=True,
        )

        # NOTE(review): the recorded cell output shows a warning raised on this call;
        # possibly the `audios` keyword is deprecated in the installed transformers
        # version. Confirm before renaming the argument.
        inputs = processor(
            audios=audio_waveform,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors="pt",
        )
        # Inference only, so gradient tracking is disabled.
        with torch.no_grad():
            audio_features = model.get_audio_features(**inputs)

        # Take the first row (presumably a batch of one, since a single waveform is
        # passed in) and convert it to a plain Python list.
        embedding = audio_features[0].numpy().tolist()

        # The object key doubles as the record id in the collection.
        collection.add(
            embeddings=[embedding],
            metadatas=[{"source": trusted_zone, "audio_path": key}],
            ids=[key],
        )

        minio_client.copy_object(
            Bucket=exploitation_zone,
            CopySource={"Bucket": trusted_zone, "Key": key},
            Key=key,
        )


# Read the collection back and show which top-level fields the response contains.
result = collection.get()
print("returned keys:", [*result.keys()])

  inputs = processor(


returned keys: ['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris', 'included']
