In [None]:
from google.colab import drive
import pandas as pd
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset

from transformers import DistilBertTokenizerFast
from transformers import pipeline
from sklearn.metrics import classification_report
from transformers import TrainingArguments
import evaluate
import numpy as np
from transformers import Trainer

In [None]:
# Weights & Biases is not used in this notebook: the package is uninstalled and
# WANDB_DISABLED is set, presumably so the Trainer does not try to log to W&B.
# !pip install wandb
!pip uninstall -y wandb
os.environ["WANDB_DISABLED"] = "true"

# `evaluate` provides the accuracy metric loaded further down.
# NOTE(review): the first cell already executes `import evaluate`; on a runtime
# where the package is not preinstalled, this install has to run before it.
!pip install evaluate

In [None]:
# Mount Google Drive at /content/drive (Colab-specific helper).
drive.mount('/content/drive')
# Optionally switch to the folder that holds the data before loading it.
#os.chdir('path_to_your_folder')
# Display the working directory; the relative CSV path below is resolved against it.
os.getcwd()

## Importation des données

In [None]:
# Load the reviews from `reviews.csv` in the current working directory.
# Later cells rename a `label` column to `labels` and tokenize a `text` column,
# so the file is presumably expected to provide both — verify against the data.
df = pd.read_csv('reviews.csv')

In [None]:
# Rename the target column to `labels`. The earlier variant that also renamed
# `text` to `review` is kept below for reference but is disabled.
#df = df.rename(columns={'text': 'review', 'label': 'labels'})
df = df.rename(columns={ 'label': 'labels'})
# Quick sanity check: (rows, columns) followed by the first rows.
print(df.shape)
df.head()

In [None]:
# Label encoding: fit the encoder on the raw label values, then overwrite the
# column with the integer class ids it assigns.
le = preprocessing.LabelEncoder().fit(df['labels'].tolist())
df['labels'] = le.transform(df['labels'].tolist())
df.head()

In [None]:
# import re
# import nltk
# from nltk.corpus import stopwords

# nltk.download("stopwords")
# stop_words = set(stopwords.words("english"))

# def clean_text(text):
#     text = text.lower()
#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
#     # text = ' '.join(word for word in text.split() if word not in stop_words)
#     return text
# df["text"] = df["review"].apply(clean_text)

In [None]:
# Preview the first rows after label encoding.
df.head()

**Convert data to a Hugging Face dataset**

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# kernel restart reproduces the same train/test partition and the reported
# metrics stay comparable from one run to the next.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Wrap both pandas frames as Hugging Face datasets for tokenization/training.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [None]:
# Display the held-out dataset summary.
test_dataset

In [None]:
# Display the training dataset summary.
train_dataset

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are
    deliberately left unpadded here: the ``DataCollatorWithPadding`` given to
    the Trainer pads each batch to its own longest sequence. Padding every
    example to the maximum length at this stage would make that collator a
    no-op and waste memory and compute on padding tokens.

    Args:
        example: A batch from ``Dataset.map(..., batched=True)``; only its
            ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example['text'], truncation=True)

# Tokenize both splits in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Display the tokenized held-out dataset summary.
tokenized_test
# Uncomment to look at the index column carried over from the pandas frame.
#tokenized_test['__index_level_0__'][:3]

In [None]:
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# tokenized_test

In [None]:
from transformers import DataCollatorWithPadding
from transformers import DistilBertForSequenceClassification

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Size the classification head from the fitted label encoder instead of a
# hard-coded 2, so the head always matches the number of classes actually
# present in the `labels` column.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(le.classes_),
)


In [None]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics uses it.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item sequence that unpacks into the model scores
            and the reference class ids.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions
        taken over the last axis of the scores.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(references=references, predictions=predicted_ids)

# Fine-tuning hyper-parameters. Evaluation and checkpointing both happen once
# per epoch, which `load_best_model_at_end` relies on to restore the best
# checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The string "none" switches off every logging integration. The Python
    # value None does not: in transformers 4.x it resolves to "all installed
    # integrations", which is the opposite of what this notebook wants.
    report_to="none",
)


# Assemble the Trainer: model and hyper-parameters first, then the data it
# trains/evaluates on, then the batching and scoring helpers.
trainer = Trainer(
    args=training_args,
    model=model,
    # Data
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
    # Batching / preprocessing helpers
    processing_class=tokenizer,
    data_collator=data_collator,
    # Metric callback run at every evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# Launch fine-tuning with the arguments configured above.
trainer.train()

**Évaluation du modèle**

In [None]:
# 2nd distilled run with 11k and 10k: 88%

from sklearn.metrics import classification_report

# Predict on the held-out split and reduce the model scores to class ids.
preds = np.argmax(trainer.predict(tokenized_test).predictions, axis=1)
# Ground truth comes from the pandas frame the test dataset was built from.
GT = df_test['labels'].tolist()
print(classification_report(GT, preds))

In [None]:
# Persist the trained model to a local directory; the inference cell reloads it from there.
trainer.save_model("classification_model_88_percent")

In [None]:
# Reload the saved model and tokenizer into a text-classification pipeline.
pipe = pipeline(
    "text-classification",
    model="classification_model_88_percent",
    tokenizer="classification_model_88_percent",
)

# Sample review to classify (an alternative negative example is kept disabled).
user_review = "The product is perfect, I'll buy another one"
#user_review = "The product, beurkkk i don't like it"

pred = pipe(user_review)

# Class 1 is positive, class 0 is negative.
pred_label = pred[0]['label']
sentiment = 'positif' if pred_label == 'LABEL_1' else 'négatif'
print(f"La review est: {sentiment}")