In [99]:
import pandas as pd
from datasets import Dataset
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
import torch
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import random

seed_value = 1999

random.seed(seed_value)
torch.manual_seed(seed_value)
np.random.seed(seed_value)

In [100]:
# Percorsi dei file CSV
train_csv_path = "../../data/New dataset/BERT/train_tweets_Transformers_new.csv"
eval_csv_path = "../../data/New dataset/BERT/eval_tweets_Transformers_new.csv"
test_csv_path = "../../data/New dataset/BERT/test_tweets_Transformers_new.csv"

In [101]:
# Caricamento dei dataset
train_df = pd.read_csv(train_csv_path)
eval_df = pd.read_csv(eval_csv_path)
test_df = pd.read_csv(test_csv_path)


train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)
test_dataset = Dataset.from_pandas(test_df)

In [102]:
train_dataset

Dataset({
    features: ['tweet_text', 'cyberbullying_type'],
    num_rows: 25155
})

In [103]:
# Preprocessing
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")

In [104]:
from sklearn.preprocessing import LabelEncoder
possible_labels = train_df.cyberbullying_type.unique()

le = LabelEncoder()
label_dict = le.fit_transform(possible_labels)

In [105]:
def preprocess_function(examples):
    encoded_example=tokenizer(examples["tweet_text"], padding="max_length", truncation=True, max_length=150)
    encoded_example["label"] = label_dict[le.transform(examples["cyberbullying_type"])]
    return encoded_example

train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

Map: 100%|██████████| 25155/25155 [00:23<00:00, 1064.01 examples/s]
Map: 100%|██████████| 6325/6325 [00:05<00:00, 1154.83 examples/s]
Map: 100%|██████████| 7519/7519 [00:06<00:00, 1132.63 examples/s]


In [106]:
# Convertire i dataset in tensori PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [107]:
# Preparazione del modello
model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=5)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [108]:
# Funzione di valutazione personalizzata
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
    }

In [109]:
# Definizione dei parametri di addestramento
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'eval_strategy'

In [110]:
# Vettori per salvare le metriche
accuracy_per_epoch = []
f1_score_per_epoch = []

# Funzione di callback per salvare le metriche
class SaveMetricsCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        accuracy_per_epoch.append(metrics["eval_accuracy"])
        f1_score_per_epoch.append(metrics["eval_f1"])

In [111]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [112]:
# Ensure input tensors and labels are on the same device as the model
input_ids = train_dataset['input_ids'][0].unsqueeze(0).to(device)
attention_mask = train_dataset['attention_mask'][0].unsqueeze(0).to(device)
labels = train_dataset['label'][0].unsqueeze(0).to(device)

outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

In [113]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback()]
)

NameError: name 'training_args' is not defined

In [114]:
# Addestramento del modello
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model("../../data/Transformers/ELECTRA-finetuned")

In [None]:
# Valutazione finale sul dataset di test
test_results = trainer.evaluate(test_dataset)
print("Test results:", test_results)

# Stampa delle metriche
print("Accuracy per epoch:", accuracy_per_epoch)
print("F1-score per epoch:", f1_score_per_epoch)