### Génération des embeddings

##### 1. Charger les données

In [20]:
import pandas as pd

# Read the cleaned CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
data = pd.read_csv('../data/processed/data_clean.csv')

##### 2. Séparer les données en ensembles d'entraînement et de test

In [32]:
from sklearn.model_selection import train_test_split

# Target is the sentiment column; the features are every other column.
y = data['airline_sentiment']
X = data.drop(columns=['airline_sentiment'])

# Hold out 20% of the rows. Stratifying on y keeps the class proportions the
# same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size of each split.
print(f'X_Train: {len(X_train)}')
print(f'X_Test : {len(X_test)}')
print(f'Y_Train: {len(y_train)}')
print(f'Y_Test : {len(y_test)}')

X_Train: 11680
X_Test : 2921
Y_Train: 11680
Y_Test : 2921


##### 3. Charger le modèle `paraphrase-multilingual-MiniLM-L12-v2` avec Sentence Transformers

In [33]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model by its published name. The output
# printed further down shows that it produces 384-dimensional vectors.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

##### 4. Générer les embeddings pour les données

- Train

In [34]:
# Raw training texts as a plain Python list, in the row order of X_train.
texts = X_train['text'].to_list()

# Encode the training texts: one row per text, returned as a NumPy array.
# normalize_embeddings=True requests normalised vectors from the library.
train_embeddings = model.encode(
    texts,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

Batches: 100%|██████████| 365/365 [01:40<00:00,  3.64it/s]


- Test

In [35]:
# Raw test texts as a plain Python list, in the row order of X_test.
texts = X_test['text'].to_list()

# Encode the test texts with the same settings used for the training split so
# both sets of vectors are directly comparable.
test_embeddings = model.encode(
    texts,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

Batches: 100%|██████████| 92/92 [00:45<00:00,  2.00it/s]


##### 5. Afficher les embeddings générés

In [36]:
# Shape of each embedding matrix: (number of texts, embedding dimension).
print(f'Train:   {train_embeddings.shape}')
print(f'Test:    {test_embeddings.shape} \n')

# NumPy abbreviates large arrays, so this prints only the corners of the matrix.
print(train_embeddings)

Train:   (11680, 384)
Test:    (2921, 384) 

[[ 0.03057806  0.01824954 -0.06659513 ... -0.09728357 -0.01112673
   0.04634745]
 [ 0.08125947  0.06308529  0.00967043 ... -0.01007925 -0.1580316
  -0.04903226]
 [ 0.10380986  0.00310333 -0.03927619 ...  0.02882032 -0.1018984
  -0.00109817]
 ...
 [ 0.03718933  0.00505602  0.03527729 ...  0.06018802 -0.06565111
   0.00525965]
 [-0.00319692  0.00877186 -0.00891564 ...  0.01496773 -0.07131661
   0.01495774]
 [ 0.01028042 -0.0467954  -0.05908336 ...  0.02119558 -0.12419796
   0.04274945]]


##### 6. Sauvegarder les embeddings générés `(.npy)`

In [37]:
import numpy as np

from pathlib import Path

# np.save does not create missing parent folders, so make sure the output
# directory exists first (e.g. on a fresh clone). Otherwise the cell fails
# with FileNotFoundError after the expensive encoding has already run.
Path('../data/embedding').mkdir(parents=True, exist_ok=True)

# Persist both embedding matrices in NumPy's binary .npy format.
np.save('../data/embedding/train_embeddings.npy', train_embeddings)
np.save('../data/embedding/test_embeddings.npy', test_embeddings)

print('Les embeddings ont été enregistrées')

Les embeddings ont été enregistrées


##### 7. Sauvegarder les labels et identifiants

In [None]:
from sklearn.preprocessing import LabelEncoder

from pathlib import Path

encoder = LabelEncoder()

# Build each metadata frame from the labels of ITS OWN split. Using the full
# `data` column here would give both frames every row of the dataset in the
# original file order, so row k of the metadata would not describe row k of
# the embeddings (the split is shuffled) and both CSV files would be identical.
# `.to_numpy()` drops the shuffled index, giving a fresh 0..n-1 index whose
# positions match the rows of the corresponding embedding matrix.
train_metadata = pd.DataFrame({'label_name': y_train.to_numpy()})
train_metadata['id'] = range(len(train_metadata))
# Fit the encoder on the training labels only...
train_metadata['label'] = encoder.fit_transform(train_metadata['label_name'])

test_metadata = pd.DataFrame({'label_name': y_test.to_numpy()})
test_metadata['id'] = range(len(test_metadata))
# ...and reuse the same name -> integer mapping for the test labels.
test_metadata['label'] = encoder.transform(test_metadata['label_name'])

# Guard against metadata and embeddings drifting apart again.
assert len(train_metadata) == len(train_embeddings), 'train metadata / embeddings row mismatch'
assert len(test_metadata) == len(test_embeddings), 'test metadata / embeddings row mismatch'

# to_csv does not create missing parent folders.
Path('../data/metadata').mkdir(parents=True, exist_ok=True)

train_metadata.to_csv('../data/metadata/train_metadata.csv', index=False)
test_metadata.to_csv('../data/metadata/test_metadata.csv', index=False)


In [43]:
# One row per distinct (integer label, label name) pair, kept in order of
# first appearance in the training metadata.
classes = train_metadata.loc[:, ['label', 'label_name']].drop_duplicates()

# Write the lookup table to disk without the DataFrame index.
classes.to_csv(
    '../data/processed/classes.csv',
    index=False,
)

##### 8. Initialiser ChromaDB

In [44]:
from chromadb import PersistentClient

# Open a ChromaDB client backed by the on-disk store at this relative path;
# the collections created below are stored there.
client = PersistentClient(path='../data/chroma_db')

##### 8. Créer une collection train dans ChromaDB

In [47]:
# Create the training collection, or reopen it if it already exists on disk.
train_collection = client.get_or_create_collection(
    'train_collection',
    metadata={"description": "Embeddings pour données d'entraînement"}
)

# Rows are written in chunks rather than in a single call.
batch_size = 5000
n = len(train_embeddings)

for i in range(0, n, batch_size):
    end = min(i + batch_size, n)

    # Take ONE positional slice of the metadata so that ids, labels and
    # vectors are guaranteed to come from the same rows, whatever the
    # DataFrame index looks like.
    batch_rows = train_metadata.iloc[i:end]

    batch_ids = batch_rows['id'].astype(str).tolist()
    batch_embeddings = train_embeddings[i:end].tolist()
    # Cast to built-in int/str: NumPy scalars are not plain Python values.
    batch_metadatas = [
        {"label": int(label), "label_name": str(label_name)}
        for label, label_name in zip(batch_rows['label'], batch_rows['label_name'])
    ]

    # upsert (not add) keeps this cell re-runnable against the persistent
    # store: ids that already exist are overwritten with the current vectors
    # and metadata instead of being skipped or rejected.
    train_collection.upsert(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
    )

    print(f"Inserted {end} / {n}")

Inserted 5000 / 11680
Inserted 10000 / 11680
Inserted 11680 / 11680


##### 9. Créer une collection test dans ChromaDB

In [49]:
# Create the test collection, or reopen it if it already exists on disk.
test_collection = client.get_or_create_collection(
    'test_collection',
    metadata={"description": "Embeddings pour données de test"}
)

# Rows are written in chunks rather than in a single call.
batch_size = 5000
n = len(test_embeddings)

for i in range(0, n, batch_size):
    end = min(i + batch_size, n)

    # Take ONE positional slice of the metadata so that ids, labels and
    # vectors are guaranteed to come from the same rows, whatever the
    # DataFrame index looks like.
    batch_rows = test_metadata.iloc[i:end]

    batch_ids = batch_rows['id'].astype(str).tolist()
    batch_embeddings = test_embeddings[i:end].tolist()
    # Cast to built-in int/str: NumPy scalars are not plain Python values.
    batch_metadatas = [
        {"label": int(label), "label_name": str(label_name)}
        for label, label_name in zip(batch_rows['label'], batch_rows['label_name'])
    ]

    # upsert (not add) keeps this cell re-runnable against the persistent
    # store: ids that already exist are overwritten with the current vectors
    # and metadata instead of being skipped or rejected.
    test_collection.upsert(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas
    )

    print(f"Inserted {end} / {n}")


Inserted 2921 / 2921
