## Importation et chargement des données avec Pandas

In [None]:
import pandas as pd
df = pd.read_csv(r'data/clean_data.csv')

df.head()

Unnamed: 0,text_clean,airline_sentiment,word_count
0,what said,neutral,4
1,plus youve added commercials to the experience...,positive,9
2,i didnt today must mean i need to take another...,neutral,12
3,its really aggressive to blast obnoxious enter...,negative,17
4,and its a really big bad thing about it,negative,10


## Séparation des données (Train / Test)

In [None]:
from sklearn.model_selection import train_test_split
X = df["text_clean"]
y = df["airline_sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

## Vérification de la version de Sentence Transformers installée

In [None]:
import sentence_transformers 
print(sentence_transformers.__version__)

  from tqdm.autonotebook import tqdm, trange


3.0.0


## Génération des embeddings avec all-mpnet-base-v2

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-large-en-v1.5")
print(f"✓ Loaded model: all-mpnet-base-v2")

train_embeddings = model.encode(X_train.tolist(), batch_size=32, show_progress_bar=True)
test_embeddings  = model.encode(X_test.tolist(),  batch_size=32, show_progress_bar=True)

print(f"✓ Train embeddings shape: {train_embeddings.shape}")
print(f"✓ Test embeddings shape: {test_embeddings.shape}")

df_train = pd.DataFrame({"text": X_train.values, "label": y_train.values})
df_test  = pd.DataFrame({"text": X_test.values,  "label": y_test.values})

df_train["embeddings"] = [e.tolist() for e in train_embeddings]
df_test["embeddings"]  = [e.tolist() for e in test_embeddings]

df_train["id"] = range(len(df_train))
df_test["id"]  = range(len(df_test))



✓ Loaded model: all-mpnet-base-v2


Batches: 100%|██████████| 362/362 [1:23:10<00:00, 13.79s/it]
Batches: 100%|██████████| 91/91 [08:01<00:00,  5.29s/it]


✓ Train embeddings shape: (11561, 1024)
✓ Test embeddings shape: (2891, 1024)


## Préparation des métadonnées pour les ensembles d'entraînement et de test


In [None]:
metadata_df_train = df_train[["id", "label", "embeddings"]]
print(metadata_df_train.head())

metadata_df_test = df_test[["id", "label", "embeddings"]]
print(metadata_df_test.head())


   id     label                                         embeddings
0   0  positive  [0.03735261410474777, -0.02885301783680916, 0....
1   1   neutral  [0.021572643890976906, 0.03109913133084774, -0...
2   2  positive  [0.01893659494817257, 0.03151579946279526, -0....
3   3  negative  [0.014490927569568157, 0.014819961972534657, -...
4   4  negative  [0.03833279386162758, 0.011557473801076412, -0...
   id     label                                         embeddings
0   0   neutral  [0.033828891813755035, 0.008136649616062641, -...
1   1  negative  [-0.0038370145484805107, -0.005647120997309685...
2   2  negative  [0.0013054902665317059, 0.021464243531227112, ...
3   3   neutral  [-0.025541039183735847, 0.018278608098626137, ...
4   4  negative  [-0.05670439451932907, -0.03457264602184296, -...


## Sauvegarde des métadonnées et embeddings en fichiers CSV


In [None]:
import os

os.makedirs("../data", exist_ok=True)
metadata_df_train.to_csv("../data/train_metadata_embeddings.csv", index=False)
metadata_df_test.to_csv("../data/test_metadata_embeddings.csv", index=False)


## Initialisation du client ChromaDB

In [None]:
import chromadb
import os

os.makedirs("../data/chroma", exist_ok=True)

client = chromadb.PersistentClient(path="../data/chroma")
print("✓ ChromaDB client initialized")

[0;93m2025-12-18 11:47:50.120046071 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


✓ ChromaDB client initialized


## Création des collections ChromaDB pour l'entraînement et le test


In [None]:
try:
    client.delete_collection(name="train_collection")
    client.delete_collection(name="test_collection")
    print("Deleted old collections")
except:
    print("No old collections to delete")

train_collection = client.get_or_create_collection(name="train_collection")
test_collection  = client.get_or_create_collection(name="test_collection")
print(f"✓ Created collections: {[c.name for c in client.list_collections()]}")

Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


No old collections to delete
✓ Created collections: ['test_collection', 'train_collection']


## Préparation des données d'entraînement pour l'insertion dans ChromaDB

In [None]:
train_ids = list(metadata_df_train["id"])
train_ids = [str(i) for i in metadata_df_train["id"]]
train_embeddings = list(metadata_df_train["embeddings"])
train_labels = list(metadata_df_train["label"])

for i in range(3):
    print(train_embeddings[i])

[0.03735261410474777, -0.02885301783680916, 0.0006253938190639019, 0.04160246625542641, -0.010300028137862682, 0.0034845415502786636, -0.023987047374248505, 0.06124403327703476, 0.020402144640684128, -0.008984778076410294, 0.03716837242245674, -0.06298233568668365, 0.016229456290602684, -0.006631845608353615, -0.023729374632239342, -0.004319880157709122, 0.015294726938009262, -0.05240899324417114, -0.0016299623530358076, -0.00159829156473279, 0.027028663083910942, -0.003119563916698098, -0.04021942988038063, 0.009208856150507927, -0.014006580226123333, 0.024627381935715675, 0.02828264608979225, 0.006398702971637249, 0.0713299959897995, 0.02459733746945858, 0.013298158533871174, 0.015790559351444244, 0.05150385573506355, -0.03517463058233261, 0.012188388966023922, -0.013647097162902355, 0.06157348304986954, -0.016054043546319008, -0.005424561910331249, -0.0015968105290085077, -0.02367219515144825, 0.0015850706258788705, 0.03815828636288643, -0.02104589343070984, -0.019855838268995285, -

## Préparation des données de test pour l'insertion dans ChromaDB

In [None]:
test_ids = list(metadata_df_test["id"])
test_ids = [str(i) for i in metadata_df_test["id"]]
test_embeddings = list(metadata_df_test["embeddings"])
test_labels = list(metadata_df_test["label"])


for i in range(3):
    print(train_embeddings[i])

[0.03735261410474777, -0.02885301783680916, 0.0006253938190639019, 0.04160246625542641, -0.010300028137862682, 0.0034845415502786636, -0.023987047374248505, 0.06124403327703476, 0.020402144640684128, -0.008984778076410294, 0.03716837242245674, -0.06298233568668365, 0.016229456290602684, -0.006631845608353615, -0.023729374632239342, -0.004319880157709122, 0.015294726938009262, -0.05240899324417114, -0.0016299623530358076, -0.00159829156473279, 0.027028663083910942, -0.003119563916698098, -0.04021942988038063, 0.009208856150507927, -0.014006580226123333, 0.024627381935715675, 0.02828264608979225, 0.006398702971637249, 0.0713299959897995, 0.02459733746945858, 0.013298158533871174, 0.015790559351444244, 0.05150385573506355, -0.03517463058233261, 0.012188388966023922, -0.013647097162902355, 0.06157348304986954, -0.016054043546319008, -0.005424561910331249, -0.0015968105290085077, -0.02367219515144825, 0.0015850706258788705, 0.03815828636288643, -0.02104589343070984, -0.019855838268995285, -

## Insertion des embeddings dans les collections ChromaDB

In [None]:
batch_size = 5000

for i in range(0, len(train_ids), batch_size):
    train_collection.add(
        ids=train_ids[i:i+batch_size],
        embeddings=train_embeddings[i:i+batch_size],
        metadatas=[{"label": label} for label in train_labels[i:i+batch_size]]
    )

batch_size_test = 5000
for i in range(0, len(test_ids), batch_size_test):
    test_collection.add(
        ids=test_ids[i:i+batch_size_test],
        embeddings=test_embeddings[i:i+batch_size_test],
        metadatas=[{"label": label} for label in test_labels[i:i+batch_size_test]]
    )


Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


## Vérification du nombre d'éléments dans les collections ChromaDB

In [13]:
print(train_collection.count())
print(test_collection.count())

11561
2891
