### Stockage dans ChromaDB

##### 1. Initialiser ChromaDB

In [1]:
from chromadb import PersistentClient

# Module-level ChromaDB client pointed at the '../data/chroma_db' directory;
# insert_embeddings() reads this global to list, delete and create collections.
client = PersistentClient(path='../data/chroma_db')

##### 2. Charger les données

- Métadonnées

In [18]:
import pandas as pd

# Load the six metadata tables (train/test x clean/mentions/undersampling).
# Paths are listed in the same order as the names they are bound to.
(
    train_metadata_clean,
    train_metadata_mentions,
    train_metadata_undersampling,
    test_metadata_clean,
    test_metadata_mentions,
    test_metadata_undersampling,
) = map(pd.read_csv, (
    '../data/metadata/train_metadata_clean.csv',
    '../data/metadata/train_metadata_mentions.csv',
    '../data/metadata/train_metadata_undersampling.csv',
    '../data/metadata/test_metadata_clean.csv',
    '../data/metadata/test_metadata_mentions.csv',
    '../data/metadata/test_metadata_undersampling.csv',
))


- Embeddings

In [19]:
import numpy as np

# Load the six pre-computed embedding arrays (train/test x clean/mentions/undersampling).
# Paths are listed in the same order as the names they are bound to.
(
    train_embeddings_clean,
    train_embeddings_mentions,
    train_embeddings_undersampling,
    test_embeddings_clean,
    test_embeddings_mentions,
    test_embeddings_undersampling,
) = map(np.load, (
    '../data/embedding/train_embeddings_clean.npy',
    '../data/embedding/train_embeddings_mentions.npy',
    '../data/embedding/train_embeddings_undersampling.npy',
    '../data/embedding/test_embeddings_clean.npy',
    '../data/embedding/test_embeddings_mentions.npy',
    '../data/embedding/test_embeddings_undersampling.npy',
))


##### 3. Créer les collections train et test dans ChromaDB

In [20]:
# Function to insert embeddings into a single collection

def _positional_slice(column, start, end):
    """Return rows ``start:end`` of ``column`` by position, whatever its index."""
    # pandas objects expose .iloc for purely positional access; plain
    # sequences / numpy arrays are already positional.
    rows = column.iloc if hasattr(column, "iloc") else column
    return rows[start:end]


def insert_embeddings(collection_name, description, embedding, metadata, batch_size=5000):
    """Create (or recreate) a ChromaDB collection and fill it with embeddings.

    Any existing collection named ``collection_name`` on the module-level
    ``client`` is deleted first, so calling this again replaces its content.
    All input checks run *before* that deletion, so an invalid call leaves a
    previously stored collection untouched.

    Parameters
    ----------
    collection_name : str
        Name of the collection to (re)create.
    description : str
        Stored in the collection metadata under the ``"description"`` key.
    embedding : array-like
        Must support ``len()``, slicing and ``.tolist()`` on a slice
        (e.g. a 2-D ``numpy.ndarray``). Row ``k`` is stored together with
        the ``k``-th row (by position) of ``metadata``.
    metadata : mapping of columns (e.g. ``pandas.DataFrame``)
        Must provide the columns ``'id'``, ``'label'`` and ``'label_name'``,
        each with exactly ``len(embedding)`` rows. Ids are stored as ``str``,
        labels as ``int`` and label names as ``str``.
    batch_size : int, default 5000
        Number of rows sent per ``collection.add`` call.

    Returns
    -------
    The collection object returned by ``client.create_collection``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or a metadata column does not have
        the same number of rows as ``embedding``.
    KeyError
        If a required metadata column is missing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n = len(embedding)

    # Fetch the columns once; this also surfaces a missing column up front.
    ids_col = metadata['id']
    label_col = metadata['label']
    label_name_col = metadata['label_name']
    for column_name, column in (('id', ids_col), ('label', label_col), ('label_name', label_name_col)):
        if len(column) != n:
            raise ValueError(
                f"metadata[{column_name!r}] has {len(column)} rows but embedding has {n}"
            )

    # Depending on the chromadb version, list_collections() yields collection
    # objects (with a .name attribute) or plain collection names.
    existing_names = {getattr(col, "name", col) for col in client.list_collections()}
    if collection_name in existing_names:
        client.delete_collection(collection_name)

    collection = client.create_collection(
        collection_name,
        metadata={"description": description}
    )

    print(f"--- {collection_name}:")

    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)

        # Slice every column by position so ids, labels and embeddings stay
        # aligned even when the DataFrame index is not 0..n-1.
        batch_ids = [str(value) for value in _positional_slice(ids_col, start, end)]
        batch_embeddings = embedding[start:end].tolist()
        batch_metadatas = [
            {"label": int(label), "label_name": str(label_name)}
            for label, label_name in zip(
                _positional_slice(label_col, start, end),
                _positional_slice(label_name_col, start, end),
            )
        ]

        collection.add(
            ids=batch_ids,
            embeddings=batch_embeddings,
            metadatas=batch_metadatas,
        )

        print(f"\t Inserted {end} / {n}")

    print('\n')

    return collection



# Function to insert both train and test embeddings

def insert_train_test_embeddings(name, train_embedding, train_metadata, test_embedding, test_metadata):
    """Store the train and test embeddings of one dataset variant.

    Two collections are written through ``insert_embeddings``, train first:
    ``train_collection_<name>`` and ``test_collection_<name>``.

    Returns
    -------
    list
        ``[train_collection, test_collection]``, i.e. the two values returned
        by ``insert_embeddings``, in that order.
    """
    # (collection name, description, embeddings, metadata) for each split.
    split_specs = (
        (
            f"train_collection_{name}",
            f"Embeddings pour données d'entraînement ({name})",
            train_embedding,
            train_metadata,
        ),
        (
            f"test_collection_{name}",
            f"Embeddings pour données de test ({name})",
            test_embedding,
            test_metadata,
        ),
    )

    collections = [insert_embeddings(*spec) for spec in split_specs]

    print("Les données ont été inserées dans chromaDB")

    return collections

- Data 1: data_clean

In [21]:
# Store the "clean" variant; keyword arguments make the train/test pairing explicit.
train_collection_clean, test_collection_clean = insert_train_test_embeddings(
    name="clean",
    train_embedding=train_embeddings_clean,
    train_metadata=train_metadata_clean,
    test_embedding=test_embeddings_clean,
    test_metadata=test_metadata_clean,
)

--- train_collection_clean:
	 Inserted 5000 / 12410
	 Inserted 10000 / 12410
	 Inserted 12410 / 12410


--- test_collection_clean:
	 Inserted 2191 / 2191


Les données ont été inserées dans chromaDB


- Data 2: data_with_mentions

In [22]:
# Store the "mentions" variant; keyword arguments make the train/test pairing explicit.
train_collection_mentions, test_collection_mentions = insert_train_test_embeddings(
    name="mentions",
    train_embedding=train_embeddings_mentions,
    train_metadata=train_metadata_mentions,
    test_embedding=test_embeddings_mentions,
    test_metadata=test_metadata_mentions,
)

--- train_collection_mentions:
	 Inserted 5000 / 12410
	 Inserted 10000 / 12410
	 Inserted 12410 / 12410


--- test_collection_mentions:
	 Inserted 2191 / 2191


Les données ont été inserées dans chromaDB


- Data 3: data_balanced_undersampling

In [23]:
# Store the "undersampling" variant; keyword arguments make the train/test pairing explicit.
train_collection_undersampling, test_collection_undersampling = insert_train_test_embeddings(
    name="undersampling",
    train_embedding=train_embeddings_undersampling,
    train_metadata=train_metadata_undersampling,
    test_embedding=test_embeddings_undersampling,
    test_metadata=test_metadata_undersampling,
)


--- train_collection_undersampling:
	 Inserted 5000 / 7254
	 Inserted 7254 / 7254


--- test_collection_undersampling:
	 Inserted 1281 / 1281


Les données ont été inserées dans chromaDB


##### 4. Vérifier la persistance des données

In [28]:
def show_total_rows(data_list):
    """Print the row count of the train and test collection of each entry.

    Each element of ``data_list`` is indexed positionally: ``[0]`` is the
    display name, ``[1]`` the train collection and ``[2]`` the test
    collection; both collections must expose a ``count()`` method.
    """
    for entry in data_list:
        variant = entry[0]
        train_total = entry[1].count()
        print(f"Le nombre total de collection d'entrainement - {variant:15s} ->", train_total)
        test_total = entry[2].count()
        print(f"Le nombre total de collection de test        - {variant:15s} ->", test_total)
        print('\n')

In [30]:
# One entry per dataset variant, in the positional layout show_total_rows
# reads: [display name, train collection, test collection].
data_list = [
    ["clean", train_collection_clean, test_collection_clean],
    ["mentions", train_collection_mentions, test_collection_mentions],
    ["undersampling", train_collection_undersampling, test_collection_undersampling],
]

# Print the count() reported by every train/test collection.
show_total_rows(data_list)


Le nombre total de collection d'entrainement - clean           -> 12410
Le nombre total de collection de test        - clean           -> 2191


Le nombre total de collection d'entrainement - mentions        -> 12410
Le nombre total de collection de test        - mentions        -> 2191


Le nombre total de collection d'entrainement - undersampling   -> 7254
Le nombre total de collection de test        - undersampling   -> 1281




In [31]:
# Sanity check: query the "clean" train collection with its own first
# embedding and request a single result. Only embeddings and metadatas are
# listed in `include`, so the other result fields (documents, distances, ...)
# come back as None.
results = train_collection_clean.query(
    query_embeddings=[train_embeddings_clean[0].tolist()],
    n_results=1,
    include=['embeddings', 'metadatas']
)

# Bare last expression so the notebook displays the raw result dict.
results

{'ids': [['0']],
 'embeddings': [array([[ 0.02839299, -0.02863677,  0.00565065, ..., -0.02072558,
           0.0079131 ,  0.0486407 ]], shape=(1, 1024))],
 'documents': None,
 'uris': None,
 'included': ['embeddings', 'metadatas'],
 'data': None,
 'metadatas': [[{'label': 0, 'label_name': 'negative'}]],
 'distances': None}