In [4]:
import torch
from transformers import ElectraForSequenceClassification, ElectraTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd

import sys
sys.path.append('../')

import import_ipynb
from data_preparation import Preprocessing

In [5]:
# ELECTRA model definition.
# Seed PyTorch's global RNG first: the classification head is not part of the
# pretrained checkpoint and is randomly initialised on load, so without a fixed
# seed every kernel restart would begin training from different weights.
SEED = 42
torch.manual_seed(SEED)

model_name = 'google/electra-base-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
num_labels = 5  # Number of classes
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def normalize_tweet_BERT(tweet):
    """Return ``tweet`` after the notebook's fixed cleaning pipeline.

    Steps, in order: ``Preprocessing.remove_links_mentions``, lower-casing,
    then ``remove_hashtag``, ``remove_special_characters``, ``remove_spaces``,
    ``remove_textual_emojis`` and ``remove_not_ASCII`` from ``Preprocessing``.
    Each step receives the output of the previous one.
    """
    cleaned = Preprocessing.remove_links_mentions(tweet).lower()

    # Remaining helpers are applied one after another, order matters.
    remaining_steps = (
        Preprocessing.remove_hashtag,
        Preprocessing.remove_special_characters,
        Preprocessing.remove_spaces,
        Preprocessing.remove_textual_emojis,
        Preprocessing.remove_not_ASCII,
    )
    for step in remaining_steps:
        cleaned = step(cleaned)

    return cleaned

In [7]:
# Dataset class
class CustomDataset(Dataset):
    """Tweet-classification dataset backed by a CSV file.

    The CSV must provide a ``tweet_text`` column and a ``cyberbullying_type``
    column whose values are keys of ``LABEL_MAP``. Texts are normalised once,
    at construction time, with ``normalize_tweet_BERT``; tokenisation happens
    per item in ``__getitem__``.

    Args:
        data_path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        max_length: Token length every item is padded or truncated to.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up at item-access time, which keeps
            existing two-argument calls working unchanged.
    """

    # Built once for the class instead of on every __getitem__ call.
    LABEL_MAP = {
        "not_cyberbullying": 0,
        "age": 1,
        "gender": 2,
        "ethnicity": 3,
        "religion": 4
    }

    def __init__(self, data_path, max_length, tokenizer=None):
        self.data = pd.read_csv(data_path)
        self.data['tweet_text'] = self.data['tweet_text'].apply(normalize_tweet_BERT)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenise row ``idx`` (positional) and return model-ready tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            output without its leading batch dimension) and ``labels``
            (0-d ``torch.long`` tensor holding the class id).

        Raises:
            KeyError: if the row's ``cyberbullying_type`` is not in ``LABEL_MAP``.
        """
        row = self.data.iloc[idx]  # fetch the row once, then read both columns
        text = row['tweet_text']
        label_id = self.LABEL_MAP[row['cyberbullying_type']]

        # Fall back to the notebook-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also drop the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

# Load (and normalise) the whole CSV once; the split below only selects rows from it.
full_dataset = CustomDataset('../../data/updated_tweets.csv', max_length=64)

# Bookkeeping copy, so the extra 'data_type' column never reaches the dataset itself.
df = full_dataset.data.copy()

# Split into training and validation sets while preserving the class distribution.
# Row *positions* are split (not index labels) because CustomDataset reads rows with .iloc.
class_labels = df['cyberbullying_type'].values
X_train, X_val, y_train, y_val = train_test_split(list(range(len(df))),
                                                  class_labels,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=class_labels)

# Column recording which split each row was assigned to
df['data_type'] = 'not_set'
split_col = df.columns.get_loc('data_type')
df.iloc[X_train, split_col] = 'train'
df.iloc[X_val, split_col] = 'val'

# Build the loaders from the split, so the validation rows are actually held out of training.
train_dataset = Subset(full_dataset, X_train)
val_dataset = Subset(full_dataset, X_val)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Rows per class and split; kept as the last expression so the notebook displays it.
df.groupby(['cyberbullying_type', 'data_type']).size()

In [None]:
# Training function
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        loader: Sized iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    batch_keys = ('input_ids', 'attention_mask', 'labels')
    running_loss = 0

    for batch in loader:
        # Move exactly the tensors the model consumes onto the target device.
        model_inputs = {key: batch[key].to(device) for key in batch_keys}

        optimizer.zero_grad()

        batch_loss = model(**model_inputs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(loader)

In [None]:
# Training parameters: number of epochs, target device and optimizer
epochs = 3

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Created after the model has been moved to `device`.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

In [None]:
# Training: call train_epoch once per epoch and print the loss it returns.
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}')

In [None]:
# Save the model: only its state_dict is written, to a file in the working directory.
torch.save(obj=model.state_dict(), f='electra_cyberbullying_model.pth')