## Importation et chargement des données avec Pandas

In [46]:
import pandas as pd
df = pd.read_csv(r'../data/clean_data.csv')

df.head()

Unnamed: 0,text_clean,airline_sentiment,word_count
0,virginamerica what dhepburn said,neutral,4
1,virginamerica plus you ve added commercials t...,positive,9
2,virginamerica i didn t today must mean i n...,neutral,12
3,virginamerica it s really aggressive to blast...,negative,17
4,virginamerica and it s a really big bad thing...,negative,10


## Séparation des données (Train / Test)

In [47]:
from sklearn.model_selection import train_test_split
X = df["text_clean"]
y = df["airline_sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

## Vérification de la version de Sentence Transformers installée

In [48]:
import sentence_transformers 
print(sentence_transformers.__version__)

3.0.0


## Génération des embeddings avec all-mpnet-base-v2

In [49]:
from sentence_transformers import SentenceTransformer

# Use all-mpnet-base-v2 model for high-quality embeddings
model = SentenceTransformer("all-mpnet-base-v2")
print(f"✓ Loaded model: all-mpnet-base-v2")

# Generate embeddings
train_embeddings = model.encode(X_train.tolist(), batch_size=32, show_progress_bar=True)
test_embeddings  = model.encode(X_test.tolist(),  batch_size=32, show_progress_bar=True)

print(f"✓ Train embeddings shape: {train_embeddings.shape}")
print(f"✓ Test embeddings shape: {test_embeddings.shape}")

# Create dataframes with embeddings
df_train = pd.DataFrame({"text": X_train.values, "label": y_train.values})
df_test  = pd.DataFrame({"text": X_test.values,  "label": y_test.values})

df_train["embeddings"] = [e.tolist() for e in train_embeddings]
df_test["embeddings"]  = [e.tolist() for e in test_embeddings]

df_train["id"] = range(len(df_train))
df_test["id"]  = range(len(df_test))



✓ Loaded model: all-mpnet-base-v2


Batches: 100%|██████████| 366/366 [05:43<00:00,  1.06it/s]
Batches:   0%|          | 0/92 [00:00<?, ?it/s]
Batches: 100%|██████████| 92/92 [01:41<00:00,  1.10s/it]



✓ Train embeddings shape: (11712, 768)
✓ Test embeddings shape: (2928, 768)


## Préparation des métadonnées pour les ensembles d'entraînement et de test


In [50]:
metadata_df_train = df_train[["id", "label", "embeddings"]]
print(metadata_df_train.head())

metadata_df_test = df_test[["id", "label", "embeddings"]]
print(metadata_df_test.head())


   id     label                                         embeddings
0   0   neutral  [0.0019280788255855441, 0.058254923671483994, ...
1   1   neutral  [-0.0006868790951557457, 0.019814640283584595,...
2   2  negative  [-0.020629476755857468, 0.03164245933294296, 0...
3   3  positive  [-0.010550018399953842, 0.0832424983382225, -0...
4   4  negative  [-0.01696091704070568, 0.04112563654780388, -0...
   id     label                                         embeddings
0   0   neutral  [-0.020865321159362793, 0.07746444642543793, 0...
1   1  positive  [0.006122920662164688, 0.016212385147809982, -...
2   2  negative  [0.018521815538406372, 0.076076440513134, -0.0...
3   3  negative  [0.03081049956381321, 0.03982585296034813, 0.0...
4   4  negative  [-0.004964960273355246, 0.08353422582149506, -...


## Sauvegarde des métadonnées et embeddings en fichiers CSV


In [51]:
import os

os.makedirs("../data", exist_ok=True)
metadata_df_train.to_csv("../data/train_metadata_embeddings.csv", index=False)
metadata_df_test.to_csv("../data/test_metadata_embeddings.csv", index=False)


## Initialisation du client ChromaDB

In [52]:
import chromadb
import os

os.makedirs("../data/chroma", exist_ok=True)

client = chromadb.PersistentClient(path="../data/chroma")
print("✓ ChromaDB client initialized")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


✓ ChromaDB client initialized


## Création des collections ChromaDB pour l'entraînement et le test


In [57]:
# Delete old collections if they exist (to handle dimension mismatch)
try:
    client.delete_collection(name="train_collection")
    client.delete_collection(name="test_collection")
    print("✓ Deleted old collections")
except:
    print("No old collections to delete")

# Create new collections
train_collection = client.get_or_create_collection(name="train_collection")
test_collection  = client.get_or_create_collection(name="test_collection")
print(f"✓ Created collections: {[c.name for c in client.list_collections()]}")

Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


✓ Deleted old collections
✓ Created collections: ['train_collection', 'test_collection']


## Préparation des données d'entraînement pour l'insertion dans ChromaDB

In [58]:
train_ids = list(metadata_df_train["id"])
train_ids = [str(i) for i in metadata_df_train["id"]]
train_embeddings = list(metadata_df_train["embeddings"])
train_labels = list(metadata_df_train["label"])

for i in range(3):
    print(train_embeddings[i])

[0.0019280788255855441, 0.058254923671483994, 0.020187897607684135, -0.05111808702349663, 0.019140342250466347, 0.039491549134254456, 0.0634191706776619, -0.030058294534683228, -0.015957681462168694, -0.044663261622190475, 0.04115566238760948, -0.019049905240535736, -0.03711240366101265, 0.05208006873726845, 0.0025803381577134132, 0.01811743527650833, 0.03135095536708832, -0.03657321259379387, 0.005608169361948967, -0.04876318946480751, -0.03640349581837654, 1.6065710951806977e-05, -0.02237877808511257, -0.043864358216524124, -0.01861646957695484, -0.03320343419909477, -0.07908227294683456, 0.05620989948511124, -0.0010872879065573215, 0.0022315536625683308, 0.025090528652071953, 0.007661588490009308, -0.008763068355619907, 0.028783665969967842, 1.6411543128924677e-06, -0.0608895868062973, -0.0658976212143898, -0.012933106161653996, 0.03632452338933945, 0.05502096191048622, -0.001750547206029296, 0.09273336082696915, -0.017990369349718094, -0.005118686705827713, 0.013742358423769474, 0.

## Préparation des données de test pour l'insertion dans ChromaDB

In [59]:
test_ids = list(metadata_df_test["id"])
test_ids = [str(i) for i in metadata_df_test["id"]]
test_embeddings = list(metadata_df_test["embeddings"])
test_labels = list(metadata_df_test["label"])


for i in range(3):
    print(train_embeddings[i])

[0.0019280788255855441, 0.058254923671483994, 0.020187897607684135, -0.05111808702349663, 0.019140342250466347, 0.039491549134254456, 0.0634191706776619, -0.030058294534683228, -0.015957681462168694, -0.044663261622190475, 0.04115566238760948, -0.019049905240535736, -0.03711240366101265, 0.05208006873726845, 0.0025803381577134132, 0.01811743527650833, 0.03135095536708832, -0.03657321259379387, 0.005608169361948967, -0.04876318946480751, -0.03640349581837654, 1.6065710951806977e-05, -0.02237877808511257, -0.043864358216524124, -0.01861646957695484, -0.03320343419909477, -0.07908227294683456, 0.05620989948511124, -0.0010872879065573215, 0.0022315536625683308, 0.025090528652071953, 0.007661588490009308, -0.008763068355619907, 0.028783665969967842, 1.6411543128924677e-06, -0.0608895868062973, -0.0658976212143898, -0.012933106161653996, 0.03632452338933945, 0.05502096191048622, -0.001750547206029296, 0.09273336082696915, -0.017990369349718094, -0.005118686705827713, 0.013742358423769474, 0.

## Insertion des embeddings dans les collections ChromaDB

In [60]:
batch_size = 40000

for i in range(0, len(train_ids), batch_size):
    train_collection.add(
        ids=train_ids[i:i+batch_size],
        embeddings=train_embeddings[i:i+batch_size],
        metadatas=[{"label": label} for label in train_labels[i:i+batch_size]]
    )

batch_size_test = 40000
for i in range(0, len(test_ids), batch_size_test):
    test_collection.add(
        ids=test_ids[i:i+batch_size_test],
        embeddings=test_embeddings[i:i+batch_size_test],
        metadatas=[{"label": label} for label in test_labels[i:i+batch_size_test]]
    )


Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


## Vérification du nombre d'éléments dans les collections ChromaDB

In [61]:
print(train_collection.count())
print(test_collection.count())

11712
2928
