In [1]:
import pandas as pd
data=pd.read_csv('data_cleaned.csv')

In [2]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, test_size=0.20)

print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

train shape:  (173, 2)
test shape:  (44, 2)


In [3]:
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score
import torch
import random
import numpy as np

# Fixer les graines aléatoires pour la reproductibilité
seed = 40
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)

# Charger les données et diviser en ensembles d'entraînement et de test
train_df, test_df = train_test_split(data, test_size=0.2, random_state=0)
print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

# Charger le tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Charger le modèle
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3
)

# Définir la fonction pour convertir les étiquettes de sentiment en valeurs numériques
sentiment_map = {"positif": 1, "negatif": 0, "neutre": 2}
def convert_sentiment_to_num(sentiment):
    return sentiment_map[sentiment]

# Appliquer la fonction pour convertir les étiquettes de sentiment en valeurs numériques
train_df["Sentiment"] = train_df["Sentiment"].apply(convert_sentiment_to_num)
test_df["Sentiment"] = test_df["Sentiment"].apply(convert_sentiment_to_num)

# Prétraiter les données
train_encodings = tokenizer(train_df['Commentaire'].tolist(), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_df['Commentaire'].tolist(), truncation=True, padding=True, return_tensors='pt')

# Préparer les étiquettes
train_labels = torch.tensor(train_df['Sentiment'].tolist())
test_labels = torch.tensor(test_df['Sentiment'].tolist())

# Définir l'optimiseur
optimizer = torch.optim.Adam(model.parameters(), lr=2e-6)



# Faire des prédictions sur les données de test
test_input_ids = tokenizer(test_df['Commentaire'].tolist(), padding=True, truncation=True, return_tensors='pt')
test_output = model(**test_input_ids)[0]
test_predictions = torch.argmax(test_output, dim=1).tolist()

# Calculer l'accuracy
accuracy = accuracy_score(test_df['Sentiment'], test_predictions)
print(f'Accuracy: {accuracy:.2f}')

train shape:  (173, 2)
test shape:  (44, 2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.89


In [5]:
from sklearn.metrics import accuracy_score, classification_report
# Afficher le rapport de classification
report = classification_report(test_df['Sentiment'], test_predictions, target_names=['negatif', 'positif', 'neutre'])
print(report)

              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00         0
     positif       1.00      0.88      0.94        43
      neutre       0.20      1.00      0.33         1

    accuracy                           0.89        44
   macro avg       0.40      0.63      0.42        44
weighted avg       0.98      0.89      0.92        44



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
