# Dataset

In [None]:
import os
import re
from transformers import AutoTokenizer
import numpy as np

In [None]:
# Layout of a whole chat file: a "Stage: ..." line, the chat-level polarity mean and
# variance lines (matched but not captured), an "Event: ..." line, one blank line,
# and then everything that follows as the `chat` group.
top_regex = re.compile(r"(?P<stage>Stage: .+)\nChat Polarity Mean: (?:-?|\+?)\d\.?\d?\d?\nChat Polarity Variance: \d\.?\d?\d?\n(?P<event>Event: .+)\n\n(?P<chat>(?:.+|\n+)+)")
# Layout of one message inside the chat body: "<timestamp> | <name>:", a single content
# line, a "Polarity: <number>" line, and a bracketed "Tag: ... / Spiegazione: ..." pair
# (`Spiegazione` is captured as the `explanation` group).
msgs_regex = re.compile(r"(?P<message>(?P<timestamp>\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d) \| (?P<name>.+):\n(?P<content>.+)\nPolarity: (?P<polarity>(?:-?|\+?)\d\.?\d?\d?)\n\[(?P<tag_explanation>Tag: (?P<tag>.+)\nSpiegazione: (?P<explanation>.+))\])")

In [None]:
# Pretrained tokenizers for the two candidate model families; loaded from the
# "bert-base-uncased" and "facebook/bart-base" checkpoints.
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_bart = AutoTokenizer.from_pretrained("facebook/bart-base")

In [None]:
# Walk every chat file of the dataset and collect, per chat:
#   * the total number of BERT / BART tokens over all message contents, and
#   * the mean message polarity.
# Summary statistics are printed afterwards.
path = "../rsc/gemini-2.5-flash-dataset_2025-07-07-10-45-16/chats"
dirs = os.listdir(path)
bert_lengths = []  # one entry per parsed chat: summed BERT token count
bart_lengths = []  # one entry per parsed chat: summed BART token count
polarities = []    # one entry per chat with at least one message: mean polarity
for directory in dirs:
    files = os.listdir(os.path.join(path, directory))
    for file in files:
        file_path = os.path.join(path, directory, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        match = top_regex.match(content)
        if not match:
            print(f"No match found in file: {file_path}")
            continue

        bert_count = 0
        bart_count = 0
        polarity_sum = 0.0
        total = 0
        for message in msgs_regex.finditer(match.group("chat")):
            # Separate name so the file text held in `content` is not shadowed.
            message_text = message.group("content")
            if message_text:
                bert_count += len(tokenizer_bert.encode(message_text, add_special_tokens=True))
                bart_count += len(tokenizer_bart.encode(message_text, add_special_tokens=True))
            polarity_sum += float(message.group("polarity"))
            total += 1

        bert_lengths.append(bert_count)
        bart_lengths.append(bart_count)
        if total:
            polarities.append(polarity_sum / total)
        else:
            # The header matched but no message did: there is no mean to record.
            print(f"No messages found in file: {file_path}")


bert_v = np.array(bert_lengths)
bart_v = np.array(bart_lengths)
polarities_v = np.array(polarities)

# np.max / np.min raise on empty arrays, so each report is guarded.
# The spread is computed with np.std, hence the "Std Dev" label.
if bert_v.size:
    print(f"BERT Max Token Count: {np.max(bert_v)}, Min Token Count: {np.min(bert_v)}")
    print(f"BERT Mean Token Count: {np.mean(bert_v):.2f}")
    print(f"BERT Std Dev Token Count: {np.std(bert_v):.2f}")
    print(f"BERT Over 512 Tokens: {len(bert_v[bert_v > 512])} / {bert_v.shape[0]}\n")

    print(f"BART Max Token Count: {np.max(bart_v)}, Min Token Count: {np.min(bart_v)}")
    print(f"BART Mean Token Count: {np.mean(bart_v):.2f}")
    print(f"BART Std Dev Token Count: {np.std(bart_v):.2f}")
    print(f"BART Over 1024 Tokens: {len(bart_v[bart_v > 1024])} / {bart_v.shape[0]}\n")
else:
    print("No chat file matched the expected layout; token statistics skipped.\n")

if polarities_v.size:
    print(f"Polarities Mean: {np.mean(polarities_v):.2f}")
    print(f"Polarities Std Dev: {np.std(polarities_v):.2f}")
    print(f"Polarities Min: {np.min(polarities_v):.2f}")
    print(f"Polarities Max: {np.max(polarities_v):.2f}")
    print(f"Polarities Over 0: {len(polarities_v[polarities_v > 0])} / {polarities_v.shape[0]}")
    print(f"Polarities Under 0: {len(polarities_v[polarities_v < 0])} / {polarities_v.shape[0]}")
    print(f"Polarities Around 0: {len(polarities_v[np.logical_and(polarities_v < 0.5, polarities_v > -0.5)])} / {polarities_v.shape[0]}")
else:
    print("No message polarities collected; polarity statistics skipped.")

# BART

In [1]:
import torch
import torch.nn as nn
from torch.nn import MSELoss
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BartPreTrainedModel,
    BartModel,
)
from dataclasses import dataclass
from typing import Optional, Tuple, Union, List, Dict, Any
from transformers.modeling_outputs import Seq2SeqLMOutput
import numpy as np
import math
import nltk
import evaluate # Hugging Face's library for evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 0. Setup Dependencies ---
# Ensure you have the necessary libraries installed:
# pip install "evaluate>=0.4.0" "rouge_score>=0.1.2" "nltk>=3.8.1"

# --- 1. Setup & Dummy Chat Dataset ---
# Each item in the list is a full conversation.
# The three lists of a conversation are parallel: messages[i] has polarity
# polarities[i] and explanation explanations[i].
raw_chats = [
    {
        "messages": ["Sei un incompetente!", "Davvero, non capisci niente."],
        "polarities": [-0.9, -1.0],
        "explanations": ["Il messaggio contiene un insulto.", "Il messaggio rafforza l'attacco personale."]
    },
    {
        "messages": ["Grazie mille per l'aiuto.", "Sei stato gentilissimo.", "Apprezzo molto il tuo tempo."],
        "polarities": [1.0, 0.9, 0.9],
        "explanations": ["Esprime gratitudine esplicita.", "Contiene un complimento diretto.", "Mostra apprezzamento per lo sforzo altrui."]
    },
    {
        "messages": ["Non sono d'accordo con questa decisione."],
        "polarities": [0.1],
        "explanations": ["Esprime disaccordo in modo neutrale e rispettoso."]
    },
    {
        "messages": ["Fai schifo.", "Spero che ti licenzino."],
        "polarities": [-1.0, -1.0],
        "explanations": ["Contiene un insulto grave.", "Contiene un augurio negativo e minaccioso."]
    },
    {
        "messages": ["Il prodotto è arrivato rotto.", "Il servizio clienti non risponde.", "Sono molto insoddisfatto."],
        "polarities": [-0.8, -0.7, -0.9],
        "explanations": ["Descrive un problema con il prodotto.", "Lamenta una mancanza di supporto.", "Esprime insoddisfazione generale."]
    }
]
# Convert to a Hugging Face Dataset
# (one row per conversation, with columns "messages", "polarities", "explanations")
dataset = Dataset.from_list(raw_chats)

In [21]:
# Define model and special token
MODEL_CHECKPOINT = "morenolq/bart-it"
# Marker appended after every message / explanation when a chat is flattened to text.
SEP_TOKEN = "<sep>"

# --- 2. Tokenizer Setup ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Register <sep> as the tokenizer's `sep_token` so it can be looked up by id below.
tokenizer.add_special_tokens({'sep_token': SEP_TOKEN})
sep_token_id = tokenizer.sep_token_id
print(f"Special token ID for '{SEP_TOKEN}': {sep_token_id}")

# --- 4. Preprocessing ---
def preprocess_function(chats):
    """Tokenize a batch of chats for the multi-task model.

    Args:
        chats: Either a list of chat dicts (row-oriented, like ``raw_chats``) or a
            mapping of column name -> list of values (column-oriented, which is what
            ``Dataset.map(..., batched=True)`` passes). Each chat provides
            ``messages``, ``explanations`` and ``polarities``.

    Returns:
        The tokenizer output for the inputs, extended with:
            * ``labels``: token ids of the ``<sep>``-joined explanations;
            * ``regression_labels``: the per-message polarity lists.
    """
    if isinstance(chats, (list, tuple)):
        # Row-oriented input: gather each column across the rows.
        messages = [chat["messages"] for chat in chats]
        explanations = [chat["explanations"] for chat in chats]
        polarities = [chat["polarities"] for chat in chats]
    else:
        # Column-oriented batch: iterating it would yield the column names, not rows.
        messages = chats["messages"]
        explanations = chats["explanations"]
        polarities = chats["polarities"]

    # Every message / explanation is followed by <sep>, including the last one.
    inputs = [SEP_TOKEN.join(chat_messages) + SEP_TOKEN for chat_messages in messages]
    targets = [SEP_TOKEN.join(chat_explanations) + SEP_TOKEN for chat_explanations in explanations]

    # NOTE: with truncation enabled, a chat longer than max_length loses trailing
    # <sep> tokens, so it can end up with fewer <sep> tokens than polarities.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(text_target=targets, max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["regression_labels"] = [list(chat_polarities) for chat_polarities in polarities]
    return model_inputs

preprocess_function(raw_chats)

Special token ID for '<sep>': 52000


{'input_ids': [[0, 13125, 329, 7635, 40980, 73, 5, 52000, 35375, 16, 384, 43722, 3900, 18, 52000, 2], [0, 4876, 4881, 300, 264, 11, 7560, 18, 52000, 13125, 781, 42167, 18, 52000, 8326, 14431, 765, 311, 1163, 876, 18, 52000, 2], [0, 1876, 458, 322, 11, 5151, 225, 71, 83, 82, 782, 4543, 18, 52000, 2], [0, 24501, 28141, 18, 52000, 19765, 312, 710, 527, 1099, 45297, 18, 52000, 2], [0, 714, 1882, 345, 5904, 20121, 18, 52000, 714, 1533, 2603, 384, 6452, 18, 52000, 3327, 765, 25849, 15643, 18, 52000, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[0, 714, 4789, 5988, 329, 7483, 2755, 18, 52000, 714, 4789, 6805, 264, 11, 10242, 2116, 18, 52000, 2], [0, 41, 14195, 272, 25644, 28711, 18, 52000, 2669, 5759, 329, 50757, 4611, 18, 520

In [None]:
# --- 5. Custom Data Collator ---
@dataclass
class DataCollatorForChat:
    """Batch tokenized chats, padding token inputs, text labels and polarity labels.

    ``tokenizer.pad`` is used only for the encoder-side inputs. The two label
    columns are ragged lists that it does not pad, so they are padded here with
    ``label_pad_token_id`` and attached to the batch afterwards.

    Attributes:
        tokenizer: Object exposing ``pad(features, padding=..., max_length=...,
            pad_to_multiple_of=..., return_tensors=...)``.
        padding, max_length, pad_to_multiple_of: Forwarded to ``tokenizer.pad``;
            ``pad_to_multiple_of`` is also applied to the width of ``labels``.
        label_pad_token_id: Fill value for padded ``labels`` and
            ``regression_labels`` positions.
    """
    tokenizer: Any
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into one padded batch.

        The incoming feature dicts are not modified. ``labels`` becomes a long
        tensor and ``regression_labels`` a float tensor, both right-padded with
        ``label_pad_token_id``; either key is skipped when the features lack it.
        """
        label_keys = ("labels", "regression_labels")
        # Hand the tokenizer copies without the label columns, so it only sees
        # what it knows how to pad and the caller's dicts stay intact.
        encoder_features = [
            {key: value for key, value in feature.items() if key not in label_keys}
            for feature in features
        ]

        batch = self.tokenizer.pad(
            encoder_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        if features and "labels" in features[0]:
            padded_labels = self._pad_rows(
                [feature["labels"] for feature in features],
                multiple_of=self.pad_to_multiple_of,
            )
            batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)

        if features and "regression_labels" in features[0]:
            padded_polarities = self._pad_rows(
                [feature["regression_labels"] for feature in features]
            )
            batch["regression_labels"] = torch.tensor(padded_polarities, dtype=torch.float)

        return batch

    def _pad_rows(self, rows, multiple_of: Optional[int] = None) -> List[list]:
        """Right-pad every row with ``label_pad_token_id`` to a common width."""
        width = max(len(row) for row in rows)
        if multiple_of:
            # Round the width up to the next multiple.
            width = -(-width // multiple_of) * multiple_of
        return [list(row) + [self.label_pad_token_id] * (width - len(row)) for row in rows]

# Collator instance built around the module-level `tokenizer`; all other fields keep their defaults.
data_collator = DataCollatorForChat(tokenizer=tokenizer)

In [None]:
# --- 3. Custom Model Definition ---
@dataclass
class ChatMultiTaskOutput(Seq2SeqLMOutput):
    """Seq2SeqLMOutput extended with the outputs of the polarity regression head."""
    # Loss of the regression head; None by default.
    regression_loss: Optional[torch.FloatTensor] = None
    # Predictions of the regression head; None by default.
    regression_logits: Optional[torch.FloatTensor] = None

class BartForChatRegressionAndGeneration(BartPreTrainedModel):
    """BART encoder-decoder with two heads sharing one encoder.

    * ``lm_head`` turns decoder states into vocabulary logits (text generation).
    * ``regression_head`` maps the encoder state of every ``<sep>`` token to a
      value in [-1, 1] (one polarity per message).

    The ``<sep>`` token id is read from ``config.sep_token_id`` at call time, so it
    may be set after the model has been constructed.

    NOTE(review): the generation hooks below (``get_encoder``,
    ``prepare_inputs_for_generation``) follow the conventions of the BART classes
    in transformers 4.x and do not use a key/value cache. Some releases also need
    the class to inherit ``GenerationMixin`` explicitly; confirm against the
    installed version.
    """

    def __init__(self, config):
        super().__init__(config)
        self.model = BartModel(config)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.regression_head = nn.Sequential(
            nn.Dropout(config.dropout),
            nn.Linear(config.d_model, 1)
        )
        # Fallback only: the config is consulted first on every forward pass,
        # because the id is typically assigned after `from_pretrained`.
        self.sep_token_id = getattr(config, "sep_token_id", None)
        self.post_init()  # Weights initialization and other post-initialization tasks

    # --- hooks used by the transformers base class and by `generate` ---
    def get_encoder(self):
        """Return the BART encoder module."""
        return self.model.encoder

    def get_decoder(self):
        """Return the BART decoder module."""
        return self.model.decoder

    def get_output_embeddings(self):
        """Expose ``lm_head`` so it is resized / tied with the input embeddings."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace ``lm_head`` (called when the vocabulary is resized)."""
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, decoder_input_ids, attention_mask=None, encoder_outputs=None, **kwargs):
        """Build the forward arguments for one decoding step (no cache is used).

        The full ``decoder_input_ids`` prefix is re-decoded at every step and the
        encoder output computed once by ``generate`` is reused.
        """
        return {
            "input_ids": None,
            "encoder_outputs": encoder_outputs,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
        }

    def _resolve_sep_token_id(self):
        """Return the ``<sep>`` id, preferring the (possibly updated) config."""
        sep_token_id = getattr(self.config, "sep_token_id", None)
        if sep_token_id is None:
            sep_token_id = self.sep_token_id
        if sep_token_id is None:
            raise ValueError("sep_token_id must be set in the model config.")
        return sep_token_id

    def _shift_right(self, labels):
        """Derive decoder inputs from ``labels``.

        The labels are shifted one position to the right, the decoder start token
        is put in front, and ignored positions (-100) are replaced by the pad id.
        """
        pad_token_id = self.config.pad_token_id
        start_token_id = self.config.decoder_start_token_id
        if pad_token_id is None or start_token_id is None:
            raise ValueError("pad_token_id and decoder_start_token_id must be set in the model config.")
        shifted = labels.new_full(labels.shape, pad_token_id)
        shifted[:, 1:] = labels[:, :-1]
        shifted[:, 0] = start_token_id
        return shifted.masked_fill(shifted == -100, pad_token_id)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        # indicates which tokens are padding (0) or not (1)
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        regression_labels: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Any] = None,
        **kwargs,
    ) -> Union[Tuple, ChatMultiTaskOutput]:
        """Run the encoder, the regression head and (when possible) the decoder.

        Args:
            input_ids: Encoder token ids. Required unless ``encoder_outputs`` is given.
            attention_mask: Encoder padding mask (1 = real token, 0 = padding).
            labels: Target token ids; positions equal to -100 or to the pad token
                id are ignored by the generation loss.
            regression_labels: Per-message polarities, padded with -100.
            decoder_input_ids: Decoder inputs; derived from ``labels`` when omitted.
            decoder_attention_mask: Decoder padding mask; derived from
                ``decoder_input_ids`` when omitted.
            encoder_outputs: Pre-computed encoder output whose first element is
                the last hidden state (as supplied by ``generate``).
            **kwargs: Accepted for compatibility with callers that pass extra
                arguments; they are not forwarded to the encoder or decoder.

        Returns:
            ChatMultiTaskOutput with ``loss`` (sum of the available losses),
            ``logits`` (None when the decoder was not run), ``regression_loss``,
            ``regression_logits`` and ``encoder_last_hidden_state``.

        Raises:
            ValueError: If neither ``input_ids`` nor ``encoder_outputs`` is given,
                or if a required token id is missing from the config.
        """
        if encoder_outputs is None:
            if input_ids is None:
                raise ValueError("Either input_ids or encoder_outputs must be provided.")
            encoder_outputs = self.model.encoder(
                input_ids=input_ids,
                # ignore padding tokens in the attention mask
                attention_mask=attention_mask,
            )
        encoder_last_hidden_state = encoder_outputs[0]

        # Regression head: one prediction per <sep> token of the encoder input.
        # Without input_ids (decoding steps of `generate`) nothing is predicted.
        regression_logits = encoder_last_hidden_state.new_empty(0)
        if input_ids is not None:
            sep_token_mask = (input_ids == self._resolve_sep_token_id())
            sep_hidden_states = encoder_last_hidden_state[sep_token_mask]
            if sep_hidden_states.shape[0] > 0:
                regression_logits = self.regression_head(sep_hidden_states).squeeze(-1)
                regression_logits = torch.tanh(regression_logits)

        # Regression loss over the non-padded polarity labels.
        regression_loss = None
        if regression_labels is not None:
            flat_labels = regression_labels.reshape(-1)
            valid_labels = flat_labels[flat_labels != -100].to(regression_logits.dtype)
            if valid_labels.numel() > 0 and regression_logits.shape[0] == valid_labels.shape[0]:
                regression_loss = MSELoss()(regression_logits, valid_labels)
            else:
                # Counts differ (e.g. <sep> tokens lost to truncation) or there is
                # nothing to compare: contribute no regression signal.
                regression_loss = encoder_last_hidden_state.new_zeros(())

        # Teacher forcing: build the decoder inputs from the labels when needed.
        if decoder_input_ids is None and labels is not None:
            decoder_input_ids = self._shift_right(labels)

        lm_logits = None
        if decoder_input_ids is not None:
            if decoder_attention_mask is None and self.config.pad_token_id is not None:
                # mask out padding tokens of the target text in self-attention
                decoder_attention_mask = (decoder_input_ids != self.config.pad_token_id).long()
            decoder_outputs = self.model.decoder(
                input_ids=decoder_input_ids,
                attention_mask=decoder_attention_mask,
                encoder_hidden_states=encoder_last_hidden_state,
                # used in the cross-attention layer to mask out embeddings of padding tokens
                encoder_attention_mask=attention_mask,
            )
            lm_logits = self.lm_head(decoder_outputs[0])

        generation_loss = None
        if labels is not None and lm_logits is not None:
            targets = labels
            if self.config.pad_token_id is not None:
                # Treat pad-token labels like -100 so both padding conventions are ignored.
                targets = labels.masked_fill(labels == self.config.pad_token_id, -100)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            generation_loss = loss_fct(lm_logits.reshape(-1, lm_logits.size(-1)), targets.reshape(-1))

        total_loss = None
        if generation_loss is not None:
            total_loss = generation_loss
            if regression_loss is not None:
                regression_weight = 1.0
                total_loss = generation_loss + (regression_weight * regression_loss)
        elif regression_loss is not None:
            total_loss = regression_loss

        return ChatMultiTaskOutput(
            loss=total_loss,
            logits=lm_logits,
            regression_loss=regression_loss,
            regression_logits=regression_logits,
            encoder_last_hidden_state=encoder_last_hidden_state,
        )


In [None]:
# Split dataset before tokenizing
# (20% of the chats go to the test split; the seed fixes the shuffle)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Tokenize both splits in batched mode and drop the original text columns, so only
# the columns returned by preprocess_function remain.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)


In [None]:
# --- 6. Metrics Computation ---
# Sentence-tokenizer data for nltk.sent_tokenize. Older NLTK releases read the
# 'punkt' resource and newer ones read 'punkt_tab', so both are fetched.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)
rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

def compute_metrics(eval_preds):
    """Compute ROUGE, BLEU and mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds`` may be
            a tuple whose first element holds the generated ids. Both arrays may
            contain -100 as padding.

    Returns:
        Dict with ``rouge1``, ``rouge2``, ``rougeL``, ``bleu`` and ``gen_len``,
        each rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id, so it is swapped for the pad token before
    # decoding; this applies to the generated ids as well as the references.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple text cleaning
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE expects a newline after each sentence
    decoded_preds_rouge = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
    decoded_labels_rouge = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_labels]

    # Compute ROUGE scores
    rouge_result = rouge_metric.compute(predictions=decoded_preds_rouge, references=decoded_labels_rouge)

    # Compute BLEU scores
    decoded_labels_bleu = [[label] for label in decoded_labels]  # BLEU expects a list of references
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_bleu)

    result = {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"]
    }

    # Add mean generated length (non-pad tokens per prediction; the -100 padding
    # has already been mapped to the pad token above).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# --- 7. Training ---
model = BartForChatRegressionAndGeneration.from_pretrained(MODEL_CHECKPOINT)
# The tokenizer gained the <sep> token, so the embedding matrix must grow with it.
model.resize_token_embeddings(len(tokenizer))
# The <sep> id is only known after the tokenizer setup. The model class also stores
# its own `sep_token_id` at construction time, so both copies are updated here.
model.config.sep_token_id = sep_token_id
model.sep_token_id = sep_token_id

# Training configuration: evaluate once per epoch, 30 epochs, batches of 2 for both
# training and evaluation, and mixed precision only when a CUDA device is available.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-it-chat-multitask",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True, # Crucial for generation metrics
    fp16=torch.cuda.is_available(),
    logging_steps=5,
)

class CustomChatTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that uses the loss computed by the model itself."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return its ``loss``.

        Args:
            model: The model being trained.
            inputs: Batch dict, unpacked into the model call as keyword arguments.
            return_outputs: When True, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer Trainer versions pass it;
                it is not used here.
        """
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Wire the model, the tokenized splits, the chat collator and the metric function together.
trainer = CustomChatTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Add metrics computation
)

print("Starting model training...")
trainer.train()
print("Training finished.")

# Save the model and the tokenizer into the same directory.
trainer.save_model("./bart-it-chat-multitask-final")
tokenizer.save_pretrained("./bart-it-chat-multitask-final")

# --- 8. Final Evaluation on Test Set ---
print("\n--- Evaluating on Test Set ---")
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Calculate and add perplexity
# NOTE(review): eval_loss is presumably the model's combined generation + regression
# loss, so exp(eval_loss) would not be a pure language-model perplexity — confirm.
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = round(perplexity, 4)

print("\n--- Final Evaluation Metrics ---")
for key, value in sorted(eval_results.items()):
    print(f"{key}: {value}")
print("---------------------------------")


# --- 9. Inference ---
print("\n--- Running Inference ---")
# ... (Inference code remains the same)