### Stockage de données

##### 1. Chargement de données

In [1]:
import pandas as pd

# Read the preprocessed train and test splits into DataFrames.
train_csv_path = '../data/processed/preprocessed_train_text.csv'
test_csv_path = '../data/processed/preprocessed_test_text.csv'

data_train = pd.read_csv(train_csv_path)
data_test = pd.read_csv(test_csv_path)


##### 2. Extraire et sauvegarder les métadonnées nécessaires (label, identifiant).

In [2]:
# Keep only the label columns and attach a sequential integer `id`
# (0 .. len-1) to every row of each split.
metadata_columns = ["label", "label_text"]

train_metadata = data_train[metadata_columns].assign(
    id=lambda frame: range(len(frame))
)
test_metadata = data_test[metadata_columns].assign(
    id=lambda frame: range(len(frame))
)

- Sauvegarder

In [3]:
import os

# Make sure the output directory exists, then write each metadata frame to
# its own CSV file without the DataFrame index.
os.makedirs('../data/metadata', exist_ok=True)

for metadata_frame, csv_path in (
    (train_metadata, "../data/metadata/train_metadata.csv"),
    (test_metadata, "../data/metadata/test_metadata.csv"),
):
    metadata_frame.to_csv(csv_path, index=False)

##### 2. Créer les collections ChromaDB

- Initialisation

In [4]:
import chromadb

# ChromaDB client whose data is stored on disk under ../data/chromaDB.
client = chromadb.PersistentClient(
    path='../data/chromaDB',
)

- Pour les données d’entraînement.

In [5]:
# Open the training collection, creating it on the first run.
# The client is persistent, so a plain create_collection() raises once the
# collection already exists on disk (e.g. after Restart & Run All);
# get_or_create_collection() keeps this cell re-runnable.
train_collection = client.get_or_create_collection(
    name="train_collection",
    metadata={"description": "Embeddings pour données d'entraînement"}
)

- Pour les données de test.

In [6]:
# Open the test collection, creating it on the first run.
# The client is persistent, so a plain create_collection() raises once the
# collection already exists on disk (e.g. after Restart & Run All);
# get_or_create_collection() keeps this cell re-runnable.
test_collection = client.get_or_create_collection(
    name="test_collection",
    metadata={"description": "Embeddings pour données de test"}
)

##### 3. Insérer les embeddings et métadonnées dans les collections.

- la colonne `ids` doit toujours être une liste de chaînes de caractères (str), pas des entiers.
- `ids` doit être unique pour chaque document.
- `embeddings` doit être une liste de listes, pas un array Numpy.
- Diviser les données en lots plus petits avant de les ajouter à la collection (une `ValueError` est levée si le nombre de lignes d'un lot dépasse 5461).

1. Charger les embeddings

In [7]:
import numpy as np

# Load the precomputed train/test embedding matrices from their .npy files.
train_embeddings_path = '../data/embeddings/train_embeddings_minilm.npy'
test_embeddings_path = '../data/embeddings/test_embeddings_minilm.npy'

train_embeddings = np.load(train_embeddings_path)
test_embeddings = np.load(test_embeddings_path)


2. Train

In [8]:
# Insert the training embeddings and their metadata into ChromaDB in batches.
# The batch size stays below the 5461-row limit above which the insert
# raises a ValueError.
batch_size = 5000
n = len(train_embeddings)

# Embeddings and metadata rows are paired by position, so a length mismatch
# would attach the wrong label to an embedding; fail loudly instead.
if len(train_metadata) != n:
    raise ValueError(
        f"train_metadata has {len(train_metadata)} rows "
        f"but train_embeddings has {n}"
    )

for i in range(0, n, batch_size):
    end = min(i + batch_size, n)

    # Slice every column positionally (.iloc) so that ids, labels and
    # embeddings stay aligned regardless of the DataFrame's index labels.
    batch_rows = train_metadata.iloc[i:end]

    # ids must be strings, and embeddings a list of lists (not a NumPy array).
    batch_ids = [str(row_id) for row_id in batch_rows['id']]
    batch_embeddings = train_embeddings[i:end].tolist()
    batch_metadatas = [
        {"label": int(label), "label_text": str(label_text)}
        for label, label_text in zip(batch_rows['label'], batch_rows['label_text'])
    ]

    train_collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
    )

    print(f"Inserted {end} / {n}")

Inserted 5000 / 120000
Inserted 10000 / 120000
Inserted 15000 / 120000
Inserted 20000 / 120000
Inserted 25000 / 120000
Inserted 30000 / 120000
Inserted 35000 / 120000
Inserted 40000 / 120000
Inserted 45000 / 120000
Inserted 50000 / 120000
Inserted 55000 / 120000
Inserted 60000 / 120000
Inserted 65000 / 120000
Inserted 70000 / 120000
Inserted 75000 / 120000
Inserted 80000 / 120000
Inserted 85000 / 120000
Inserted 90000 / 120000
Inserted 95000 / 120000
Inserted 100000 / 120000
Inserted 105000 / 120000
Inserted 110000 / 120000
Inserted 115000 / 120000
Inserted 120000 / 120000


3. Test

In [9]:
# Insert the test embeddings and their metadata into ChromaDB in batches.
# The batch size stays below the 5461-row limit above which the insert
# raises a ValueError.
batch_size = 5000
n = len(test_embeddings)

# Embeddings and metadata rows are paired by position, so a length mismatch
# would attach the wrong label to an embedding; fail loudly instead.
if len(test_metadata) != n:
    raise ValueError(
        f"test_metadata has {len(test_metadata)} rows "
        f"but test_embeddings has {n}"
    )

for i in range(0, n, batch_size):
    end = min(i + batch_size, n)

    # Slice every column positionally (.iloc) so that ids, labels and
    # embeddings stay aligned regardless of the DataFrame's index labels.
    batch_rows = test_metadata.iloc[i:end]

    # ids must be strings, and embeddings a list of lists (not a NumPy array).
    batch_ids = [str(row_id) for row_id in batch_rows['id']]
    batch_embeddings = test_embeddings[i:end].tolist()
    batch_metadatas = [
        {"label": int(label), "label_text": str(label_text)}
        for label, label_text in zip(batch_rows['label'], batch_rows['label_text'])
    ]

    test_collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas
    )

    print(f"Inserted {end} / {n}")


Inserted 5000 / 7600
Inserted 7600 / 7600


##### 4. Vérifier l’intégrité des données stockées

In [10]:
# Report how many records each collection holds, then verify that every
# embedding was actually stored (one record per embedding row).
train_count = train_collection.count()
test_count = test_collection.count()

print("Le nombre total de collection d'entrainement:", train_count)
print("Le nombre total de collection de test:", test_count)

assert train_count == len(train_embeddings), (
    f"train_collection holds {train_count} records, "
    f"expected {len(train_embeddings)}"
)
assert test_count == len(test_embeddings), (
    f"test_collection holds {test_count} records, "
    f"expected {len(test_embeddings)}"
)

Le nombre total de collection d'entrainement: 120000
Le nombre total de collection de test: 7600


In [11]:
# Sanity check: query the training collection with the first training
# embedding and ask for the single closest record, returning its stored
# embedding and metadata.
query_vector = train_embeddings[0].tolist()

results = train_collection.query(
    query_embeddings=[query_vector],
    n_results=1,
    include=['embeddings', 'metadatas'],
)

print(results.keys(), '\n')

results

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances']) 



{'ids': [['0']],
 'embeddings': [array([[ 0.07367805, -0.02631132,  0.03900679,  0.03266085,  0.06207192,
          -0.00634163, -0.07240014, -0.11341788,  0.01835962,  0.00309855,
          -0.04974312, -0.02516613,  0.02603205, -0.04401075,  0.00837482,
           0.05773751, -0.02215332, -0.03377735, -0.0066717 ,  0.02544165,
          -0.14476441, -0.07789535,  0.02016891,  0.03573161, -0.04943183,
           0.03341607, -0.05275111,  0.02774966, -0.04304387, -0.12120994,
          -0.00940505,  0.03281521, -0.04796458,  0.03048585, -0.02534978,
           0.04817534,  0.0327391 ,  0.01701158,  0.00372152,  0.00995936,
           0.01279518,  0.01307371,  0.05426204,  0.03380084,  0.05362077,
          -0.02931431,  0.02448952, -0.04502512,  0.01437801, -0.03599564,
           0.06709903, -0.02882682, -0.03396979,  0.02171521,  0.02825229,
           0.00121623,  0.0187257 , -0.03953204,  0.08476631,  0.0235977 ,
           0.09339388, -0.08286188, -0.05783248,  0.06071364, -0.0041