# Dataset

In [1]:
import os
import re
from transformers import AutoTokenizer
import numpy as np
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
top_regex = re.compile(r"(?P<stage>Stage: .+)\nChat Polarity Mean: (?:-?|\+?)\d\.?\d?\d?\nChat Polarity Variance: \d\.?\d?\d?\n(?P<event>Event: .+)\n\n(?P<chat>(?:.+|\n+)+)")
msgs_regex = re.compile(r"(?P<message>(?P<timestamp>\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d) \| (?P<name>.+):\n(?P<content>.+)\nPolarity: (?P<polarity>(?:-?|\+?)\d\.?\d?\d?)\n\[(?P<tag_explanation>(?P<tag>Tag: .+)\n?(?P<explanation>Spiegazione: .+))\])")

In [3]:
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_bart = AutoTokenizer.from_pretrained("facebook/bart-base")

In [4]:
path = "./rsc/gemini-2.5-flash-dataset_2025-07-07-10-45-16/chats"
dirs = os.listdir(path)
bert_lengths = []
bart_lengths = []
polarities = []
raw_chats = []
for directory in tqdm.tqdm(dirs, desc="Processing chats"):
    files = os.listdir(os.path.join(path, directory))
    for file in files:
        bert_count = 0
        bart_count = 0
        polarity_sum = 0
        with open(os.path.join(path, directory, file), "r", encoding="utf-8") as f:
            content = f.read()
            match = top_regex.match(content)
            if match:
                chat = match.group("chat")
                messages = list(msgs_regex.finditer(chat))
                # checks if there are matched messages
                if len(messages) > 0:
                    total = 0
                    example = {
                        "messages": [],
                        "polarities": [],
                        "explanations": []
                    }
                    for message in messages:
                        content = message.group("content")
                        if content:
                            example["messages"] += [content]
                            example["polarities"] += [float(message.group("polarity"))]
                            example["explanations"] += [
                                message.group("tag") + "\n" +
                                message.group("explanation")
                            ]
                            bert_tokens = tokenizer_bert.encode(content, add_special_tokens=True)
                            bart_tokens = tokenizer_bart.encode(content, add_special_tokens=True)
                            bert_count += len(bert_tokens)
                            bart_count += len(bart_tokens)
                        polarity_sum += float(message.group("polarity"))
                        total += 1
                    # try:
                    #     polarities.append(polarity_sum / total)
                    # except ZeroDivisionError:
                    #     print(f"ZeroDivisionError in file: {file}")
                    if bart_count <= 1024:
                        raw_chats.append(example)
                        bert_lengths.append(bert_count)
                        bart_lengths.append(bart_count)
                else:
                    print(f"No messages found in file: {os.path.join(path, directory, file)}")
            else:
                print(f"No match found in file: {os.path.join(path, directory, file)}")

bert_v = np.array(bert_lengths)
bart_v = np.array(bart_lengths)
polarities_v = np.array(polarities)

# print(f"BERT Max Token Count: {np.max(bert_v)}, Min Token Count: {np.min(bert_v)}")
# print(f"BERT Mean Token Count: {np.mean(bert_v):.2f}")
# print(f"BERT Variance Token Count: {np.std(bert_v):.2f}")
# print(f"BERT Over 512 Tokens: {len(bert_v[bert_v > 512])} / {bert_v.shape[0]}\n")

print(f"BART Max Token Count: {np.max(bart_v)}, Min Token Count: {np.min(bart_v)}")
print(f"BART Mean Token Count: {np.mean(bart_v):.2f}")
print(f"BART Variance Token Count: {np.std(bart_v):.2f}")
print(f"BART Over 1024 Tokens: {len(bart_v[bart_v > 1024])} / {bart_v.shape[0]}\n")

# print(f"Polarities Mean: {np.mean(polarities_v):.2f}")
# print(f"Polarities Variance: {np.std(polarities_v):.2f}")
# print(f"Polarities Min: {np.min(polarities_v):.2f}")
# print(f"Polarities Max: {np.max(polarities_v):.2f}")
# print(f"Polarities Over 0: {len(polarities_v[polarities_v > 0])} / {polarities_v.shape[0]}")
# print(f"Polarities Under 0: {len(polarities_v[polarities_v < 0])} / {polarities_v.shape[0]}")
# print(f"Polarities Around 0: {len(polarities_v[np.logical_and(polarities_v < 0.5, polarities_v > -0.5)])} / {polarities_v.shape[0]}")

Processing chats: 100%|██████████| 7/7 [00:00<00:00,  7.39it/s]

BART Max Token Count: 1006, Min Token Count: 45
BART Mean Token Count: 580.39
BART Variance Token Count: 183.09
BART Over 1024 Tokens: 0 / 244






# BART

In [5]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments
)
import numpy as np
import nltk
import math
import evaluate # Hugging Face's library for evaluation
from datetime import datetime
import os
from src.model import (
    BartForChatRegressionAndGeneration,
    ChatMultiTaskOutput,
    CustomChatTrainer,
    DataCollatorForChat
)

In [6]:
# Checks if torch can detect a GPU
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU detected.")

NVIDIA GeForce RTX 3060 Laptop GPU


In [7]:
# raw_chats = [
#     {
#         "messages": ["Sei un incompetente!", "Davvero, non capisci niente."],
#         "polarities": [-0.9, -1.0],
#         "explanations": ["Il messaggio contiene un insulto.", "Il messaggio rafforza l'attacco personale."]
#     },
#     {
#         "messages": ["Grazie mille per l'aiuto.", "Sei stato gentilissimo.", "Apprezzo molto il tuo tempo."],
#         "polarities": [1.0, 0.9, 0.9],
#         "explanations": ["Esprime gratitudine esplicita.", "Contiene un complimento diretto.", "Mostra apprezzamento per lo sforzo altrui."]
#     },
#     {
#         "messages": ["Non sono d'accordo con questa decisione."],
#         "polarities": [0.1],
#         "explanations": ["Esprime disaccordo in modo neutrale e rispettoso."]
#     },
#     {
#         "messages": ["Fai schifo.", "Spero che ti licenzino."],
#         "polarities": [-1.0, -1.0],
#         "explanations": ["Contiene un insulto grave.", "Contiene un augurio negativo e minaccioso."]
#     },
#     {
#         "messages": ["Il prodotto è arrivato rotto.", "Il servizio clienti non risponde.", "Sono molto insoddisfatto."],
#         "polarities": [-0.8, -0.7, -0.9],
#         "explanations": ["Descrive un problema con il prodotto.", "Lamenta una mancanza di supporto.", "Esprime insoddisfazione generale."]
#     }
# ]
# Convert to a Hugging Face Dataset
dataset = Dataset.from_list(raw_chats)
#remove empty lists from dataset['messages']
dataset = dataset.filter(lambda x: len(x['messages']) > 0)
print(dataset)
# print(dataset[0]['messages'])
# print(dataset[0]['polarities'])
# print(dataset[0]['explanations'])

Filter: 100%|██████████| 244/244 [00:00<00:00, 21475.40 examples/s]

Dataset({
    features: ['messages', 'polarities', 'explanations'],
    num_rows: 244
})





In [None]:
MODEL_CHECKPOINT = "morenolq/bart-it"
SEP_TOKEN = "<sep>"

# --- Tokenizer Setup ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
tokenizer.add_special_tokens({'sep_token': SEP_TOKEN})
print(f"Special token ID for '{SEP_TOKEN}': {tokenizer.sep_token_id}")

# --- Preprocessing ---
def preprocess(examples):
    '''
    Preprocess the dataset by joining messages and explanations with the special separator token.
    The inputs are the concatenated messages, and the targets are the concatenated explanations.
    '''
    inputs = [SEP_TOKEN.join(chat) + SEP_TOKEN for chat in examples['messages']]
    targets = [SEP_TOKEN.join(chat) + SEP_TOKEN for chat in examples['explanations']]
    tokenized_examples = tokenizer(
        inputs,
        max_length=1024,
        truncation=True,
        text_target=targets
    )
    tokenized_examples["regression_labels"] = examples["polarities"]
    return tokenized_examples

def print_dataset(dataset): # debug only
    print(dataset)
    print(dataset[0]['input_ids'])
    print(dataset[0]['labels'])
    print(dataset[0]['regression_labels'])

tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names
)

print_dataset(tokenized_dataset)

Special token ID for '<sep>': 52000


Map: 100%|██████████| 244/244 [00:00<00:00, 1746.28 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'regression_labels'],
    num_rows: 244
})
[0, 6664, 2209, 16, 9644, 312, 473, 3174, 2440, 368, 2565, 2467, 322, 77, 1005, 1268, 18, 4641, 458, 781, 225, 69, 329, 19613, 1777, 266, 4448, 458, 225, 77, 82, 10267, 18, 1474, 8776, 4441, 266, 17149, 18, 1519, 8368, 467, 8671, 18, 52000, 7354, 4266, 18, 12711, 16, 493, 301, 1798, 2467, 345, 7319, 1050, 16, 939, 6480, 329, 11, 38863, 11022, 18, 2554, 329, 357, 11, 22553, 16, 346, 6759, 300, 311, 1005, 18, 1474, 41669, 1366, 710, 515, 322, 77, 4240, 368, 4567, 11227, 16, 4058, 15013, 329, 1900, 1773, 16262, 35, 632, 2340, 1039, 944, 3793, 35, 52000, 4076, 1039, 329, 5239, 1777, 16, 20826, 18, 1474, 7182, 225, 71, 83, 82, 225, 77, 3803, 8997, 7832, 300, 368, 7067, 322, 77, 1005, 16, 384, 1049, 968, 10527, 18, 1875, 329, 11, 7683, 322, 77, 35628, 18, 1282, 345, 329, 2092, 16, 13236, 3905, 322, 77, 880, 946, 312, 4275, 18, 1519, 8368, 5512, 18, 52000, 15561, 1366, 17855, 16, 4266,




In [None]:
data_collator = DataCollatorForChat(tokenizer=tokenizer)
new_dataset = data_collator(tokenized_dataset.to_list())
new_dataset = Dataset.from_dict(new_dataset)
print_dataset(new_dataset)

In [None]:
# --- Metrics Computation ---
rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    print(f"Predictions: {preds}")
    print(f"Labels: {labels}")
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode generated summaries, replacing -100 padding with pad_token
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Decode reference summaries, replacing -100 padding with pad_token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple text cleaning
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE expects a newline after each sentence
    decoded_preds_rouge = ["\n".join(nltk.sent_tokenize(pred)) for pred in decoded_preds]
    decoded_labels_rouge = ["\n".join(nltk.sent_tokenize(label)) for label in decoded_labels]
    
    # Compute ROUGE scores
    rouge_result = rouge_metric.compute(predictions=decoded_preds_rouge, references=decoded_labels_rouge)
    
    # Compute BLEU scores
    decoded_labels_bleu = [[label] for label in decoded_labels] # BLEU expects a list of references
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_bleu)

    result = {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"]
    }

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUT_DIR = "./out/" + timestamp
# Ensure the output directory exists
os.makedirs(OUT_DIR, exist_ok=True)

train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_train_dataset = train_test_split['train']
tokenized_test_dataset = train_test_split['test']

model = BartForChatRegressionAndGeneration.from_pretrained(MODEL_CHECKPOINT)
model.resize_token_embeddings(len(tokenizer))
# Ensure the model's config has the correct sep_token_id
model.config.sep_token_id = tokenizer.sep_token_id

training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR + "/bart-it-chat-multitask",
    learning_rate=3e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True, # Crucial for generation metrics
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    remove_unused_columns=False
)

data_collator = DataCollatorForChat(tokenizer=tokenizer)

trainer = CustomChatTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Add metrics computation
)

print("Starting model training...")
trainer.train()
print("Training finished.")

trainer.save_model(OUT_DIR + "/bart-it-chat-multitask-final")
tokenizer.save_pretrained(OUT_DIR + "/bart-it-chat-multitask-final")

Some weights of BartForChatRegressionAndGeneration were not initialized from the model checkpoint at morenolq/bart-it and are newly initialized: ['regression_head.1.bias', 'regression_head.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Starting model training...


Step,Training Loss
50,3.8253
100,2.6798
150,2.3673


Training finished.


('./out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\tokenizer_config.json',
 './out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\special_tokens_map.json',
 './out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\vocab.json',
 './out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\merges.txt',
 './out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\added_tokens.json',
 './out/2025-07-11_20-10-02/bart-it-chat-multitask-final\\tokenizer.json')

In [12]:
# --- Final Evaluation on Test Set ---
print("\n--- Evaluating on Test Set ---")
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Calculate and add perplexity
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = round(perplexity, 4)

print("\n--- Final Evaluation Metrics ---")
for key, value in sorted(eval_results.items()):
    print(f"{key}: {value}")
print("---------------------------------")


--- Evaluating on Test Set ---


KeyError: 'eval_loss'

In [15]:
# Load the fine-tuned model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use the correct local path for the model and tokenizer
MODEL_LOCAL_PATH = OUT_DIR + "/bart-it-chat-multitask-final"

model = BartForChatRegressionAndGeneration.from_pretrained(
    MODEL_LOCAL_PATH
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_LOCAL_PATH
)


In [16]:
# --- 6. Inference ---
print("\n--- Running Inference ---")

def predict(text):
    """
    Function to perform inference on a single piece of text.
    """
    # Prepare the input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        # max_length=1024,
        truncation=True,
        # padding="max_length"
    ).to(device)
    # print(inputs)
    
    # Get the regression prediction
    # We don't need gradients for inference
    with torch.no_grad():
        outputs = model(**inputs)
        polarity_scores = outputs.regression_logits

    # Generate the explanation text
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # max_length=1024,
        num_beams=7,
        # early_stopping=True
    )
    print(f"Generated IDs: {generated_ids}")
    # Decode the generated explanation skipping also the '<sep>' token
    explanations = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return polarity_scores, explanations

# Test with some examples
chat = '''
2024-03-29 17:00:10 | Lucia:
Ciao amore sei bellissimo e fichissimo!<sep>
2024-03-29 17:05:10 | Pippo:
Non ho parole per descrivere quanto sei stupida. Troia<sep>
2024-03-29 17:08:10 | Lucia:
Come sei cattivo<sep>
'''


polarities, explanations = predict(chat)
print(f"Input Messages:\n{chat}")
print(f"Predicted Polarities: {polarities.cpu().numpy()}")
print(f"Generated Explanations: \n{explanations}\n")
# input('Press Enter to continue...')  # Pause for user input before next iteration

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



--- Running Inference ---
Generated IDs: tensor([[    2,     0, 25349,    30,   866,   297,   340,  1359,   203,  1818,
          4114,    17,  7100,    17,  6371,  2158,    30,   516,    30,  2258,
             2]], device='cuda:0')
Input Messages:

2024-03-29 17:00:10 | Lucia:
Ciao amore sei bellissimo e fichissimo!<sep>
2024-03-29 17:05:10 | Pippo:
Non ho parole per descrivere quanto sei stupida. Troia<sep>
2024-03-29 17:08:10 | Lucia:
Come sei cattivo<sep>

Predicted Polarities: [ 0.33225065 -0.07724506  0.09848431]
Generated Explanations: 
Tag: Reciprocità
2024-03-29 17:00:10

