In [8]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    set_seed
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import argparse
from tqdm import tqdm

MODEL_NAME = "google/mobilebert-uncased"
DATASET_PATH = "/kaggle/input/dataset-sa/MultiEmotions-It.tsv"
TEXT_COLUMN = "comment"
LABEL_COLUMN = "EMOTIONS"
OUTPUT_DIR = "/kaggle/working/"
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 2e-5
MAX_LENGTH = 128
SEED = 42

# Impostazioni per la riproducibilità
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo in uso: {device}")

# Definizione della classe per il dataset personalizzato
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(file_path, text_col, label_col):
    """Carica il dataset da un file TSV."""
    df = pd.read_csv(file_path, sep='\t')
    
    # Verifica la presenza delle colonne necessarie
    if text_col not in df.columns or label_col not in df.columns:
        available_cols = ", ".join(df.columns)
        raise ValueError(f"Colonne richieste non trovate. Colonne disponibili: {available_cols}")
    
    # Se le etichette sono testuali, convertiamole in numeriche
    if not pd.api.types.is_numeric_dtype(df[label_col]):
        label_map = {label: idx for idx, label in enumerate(df[label_col].unique())}
        df['label_id'] = df[label_col].map(label_map)
        print(f"Mappatura etichette: {label_map}")
        return df[text_col].values, df['label_id'].values, label_map
    
    return df[text_col].values, df[label_col].values, None

def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    progress_bar = tqdm(data_loader, desc="Training")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({"loss": loss.item()})
    
    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    return avg_loss, accuracy, f1

def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    return avg_loss, accuracy, f1

def main():

    best_val_f1 = 0.0
    patience = 5
    patience_counter = 0

    # Crea la directory di output se non esiste
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Carica il dataset
    print(f"Caricamento del dataset da {DATASET_PATH}...")
    texts, labels, label_map = load_data(DATASET_PATH, TEXT_COLUMN, LABEL_COLUMN)
    
    # Divisione in training e validation set
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.1, random_state=SEED
    )
    
    print(f"Testi di training: {len(train_texts)}")
    print(f"Testi di validazione: {len(val_texts)}")
    
    # Numero di etichette uniche nel dataset
    num_labels = len(np.unique(labels))
    print(f"Numero di etichette: {num_labels}")
    
    # Carica il tokenizer e il modello
    print(f"Caricamento del modello {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels, classifier_dropout=0.2 
    )
    
    # Prepara i dataset
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
    val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
    
    # Prepara i dataloader
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True
    )
    
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    
    # Prepara l'ottimizzatore e lo scheduler
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # Sposta il modello sul dispositivo appropriato
    model.to(device)
    
    # Addestramento
    print("Inizio dell'addestramento...")
    best_val_f1 = 0.0
    
    for epoch in range(EPOCHS):
        print(f"\nEpoca {epoch+1}/{EPOCHS}")
        
        train_loss, train_acc, train_f1 = train_epoch(
            model, train_dataloader, optimizer, scheduler, device
        )
        
        print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")
        
        val_loss, val_acc, val_f1 = evaluate(model, val_dataloader, device)
        
        print(f"Val Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")

        # Salva il modello se abbiamo ottenuto un miglior F1 score
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patient_counter = 0
            
            # Salva il modello
            output_path = os.path.join(OUTPUT_DIR, 'best_model')
            model.save_pretrained(output_path)
            tokenizer.save_pretrained(output_path)
            
            # Salva la mappatura delle etichette se presente
            if label_map:
                label_map_file = os.path.join(output_path, 'label_map.txt')
                with open(label_map_file, 'w') as f:
                    for label, idx in label_map.items():
                        f.write(f"{label}\t{idx}\n")
            
            print(f"Modello salvato in {output_path}")

        else:
            patience_counter += 1 
            print(f"Early stopping patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping attivato: nessun miglioramento per troppe epoche consecutive.")
            break
    
    print("\nAddestramento completato!")
    print(f"Miglior F1 score di validazione: {best_val_f1:.4f}")


if __name__ == "__main__":
    main()

Dispositivo in uso: cuda
Caricamento del dataset da /kaggle/input/dataset-sa/MultiEmotions-It.tsv...
Mappatura etichette: {'LOVE - DELIGHT': 0, 'TRUST': 1, 'TRUST - SADNESS': 2, 'LOVE': 3, nan: 4, 'OPTIMISM': 5, 'TRUST - DELIGHT': 6, 'DISGUST': 7, 'DISAPPOINTMENT': 8, 'TRUST - OPTIMISM': 9, 'CONTEMPT': 10, 'ANGER': 11, 'LOVE - SENTIMENTALITY': 12, 'JOY': 13, 'DELIGHT': 14, 'LOVE - ANGER': 15, 'TRUST - DISAPPOINTMENT': 16, 'OUTRAGE': 17, 'LOVE - OPTIMISM': 18, 'TRUST - DISGUST': 19, 'TRUST - SURPRISE': 20, 'TRUST - OUTRAGE': 21, 'TRUST - ANTICIPATION': 22, 'LOVE - DISAPPOINTMENT': 23, 'TRUST - ANGER': 24, 'LOVE - ANTICIPATION': 25, 'ANGER - DISAPPOINTMENT': 26, 'SHAME': 27, 'LOVE - SURPRISE': 28, 'TRUST - SENTIMENTALITY': 29, 'SURPRISE': 30, 'SENTIMENTALITY': 31, 'SENTIMENTALITY - ANGER': 32, 'DELIGHT - SADNESS': 33, 'LOVE - SADNESS': 34, 'LOVE - TRUST': 35, 'CURIOSITY': 36, 'LOVE - CURIOSITY': 37, 'SADNESS': 38, 'FEAR': 39, 'ANTICIPATION': 40, 'REMORSE': 41, 'TRUST - JOY': 42, 'PRIDE':

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Inizio dell'addestramento...

Epoca 1/50


Training: 100%|██████████| 92/92 [00:30<00:00,  3.03it/s, loss=1.72]   


Train Loss: 1845263.1727, Accuracy: 0.0826, F1: 0.0959


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.63it/s]


Val Loss: 2.7892, Accuracy: 0.2438, F1: 0.1286
Modello salvato in /kaggle/working/best_model

Epoca 2/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.99it/s, loss=1.26]


Train Loss: 3.2038, Accuracy: 0.2130, F1: 0.1572


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.48it/s]


Val Loss: 2.6462, Accuracy: 0.2747, F1: 0.1639
Modello salvato in /kaggle/working/best_model

Epoca 3/50


Training: 100%|██████████| 92/92 [00:31<00:00,  2.96it/s, loss=1.59]


Train Loss: 2.7517, Accuracy: 0.2610, F1: 0.1941


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.45it/s]


Val Loss: 2.4395, Accuracy: 0.3333, F1: 0.2512
Modello salvato in /kaggle/working/best_model

Epoca 4/50


Training: 100%|██████████| 92/92 [00:31<00:00,  2.94it/s, loss=3.38]


Train Loss: 2.6106, Accuracy: 0.3196, F1: 0.2449


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.54it/s]


Val Loss: 2.3534, Accuracy: 0.3704, F1: 0.2997
Modello salvato in /kaggle/working/best_model

Epoca 5/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.98it/s, loss=1.1] 


Train Loss: 2.4062, Accuracy: 0.3642, F1: 0.2938


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.41it/s]


Val Loss: 2.3605, Accuracy: 0.3827, F1: 0.2980
Early stopping patience: 1/5

Epoca 6/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.97it/s, loss=1.04]


Train Loss: 3.7876, Accuracy: 0.3896, F1: 0.3232


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.47it/s]


Val Loss: 2.2480, Accuracy: 0.4105, F1: 0.3415
Modello salvato in /kaggle/working/best_model

Epoca 7/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.98it/s, loss=2.59]


Train Loss: 2.1936, Accuracy: 0.4102, F1: 0.3458


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.47it/s]


Val Loss: 2.2255, Accuracy: 0.4043, F1: 0.3334
Early stopping patience: 2/5

Epoca 8/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.97it/s, loss=1.54]


Train Loss: 2.0297, Accuracy: 0.4588, F1: 0.4012


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.48it/s]


Val Loss: 2.2328, Accuracy: 0.4290, F1: 0.3600
Modello salvato in /kaggle/working/best_model

Epoca 9/50


Training: 100%|██████████| 92/92 [00:31<00:00,  2.94it/s, loss=3.15]


Train Loss: 4.4411, Accuracy: 0.4757, F1: 0.4214


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.38it/s]


Val Loss: 2.2039, Accuracy: 0.4537, F1: 0.3961
Modello salvato in /kaggle/working/best_model

Epoca 10/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.98it/s, loss=2.43]


Train Loss: 1.8178, Accuracy: 0.5120, F1: 0.4614


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.54it/s]


Val Loss: 2.2711, Accuracy: 0.4414, F1: 0.3889
Early stopping patience: 3/5

Epoca 11/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.97it/s, loss=1.78]


Train Loss: 1.7271, Accuracy: 0.5261, F1: 0.4819


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.44it/s]


Val Loss: 2.2627, Accuracy: 0.4167, F1: 0.3704
Early stopping patience: 4/5

Epoca 12/50


Training: 100%|██████████| 92/92 [00:30<00:00,  2.98it/s, loss=1.94] 


Train Loss: 1.6245, Accuracy: 0.5631, F1: 0.5199


Validation: 100%|██████████| 11/11 [00:01<00:00, 10.47it/s]

Val Loss: 2.2933, Accuracy: 0.4352, F1: 0.3891
Early stopping patience: 5/5
Early stopping attivato: nessun miglioramento per troppe epoche consecutive.

Addestramento completato!
Miglior F1 score di validazione: 0.3961



