In [19]:
import os
import re
from transformers import AutoTokenizer
import numpy as np

In [4]:
top_regex = re.compile(r"(?P<stage>Stage: .+)\nChat Polarity Mean: (?:-?|\+?)\d\.?\d?\d?\nChat Polarity Variance: \d\.?\d?\d?\n(?P<event>Event: .+)\n\n(?P<chat>(?:.+|\n+)+)")
msgs_regex = re.compile(r"(?P<message>(?P<timestamp>\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d) \| (?P<name>.+):\n(?P<content>.+)\nPolarity: (?P<polarity>(?:-?|\+?)\d\.?\d?\d?)\n\[(?P<tag_explanation>Tag: (?P<tag>.+)\nSpiegazione: (?P<explanation>.+))\])")

In [5]:
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_bart = AutoTokenizer.from_pretrained("facebook/bart-base")

In [37]:
path = "../rsc/gemini-2.5-flash-dataset_2025-07-07-10-45-16/chats"
dirs = os.listdir(path)
bert_lengths = []
bart_lengths = []
polarities = []
for directory in dirs:
    files = os.listdir(os.path.join(path, directory))
    for file in files:
        bert_count = 0
        bart_count = 0
        polarity_sum = 0
        with open(os.path.join(path, directory, file), "r", encoding="utf-8") as f:
            content = f.read()
            match = top_regex.match(content)
            if match:
                chat = match.group("chat")
                messages = msgs_regex.finditer(chat)
                total = 0
                for message in messages:
                    content = message.group("content")
                    if content:
                        bert_tokens = tokenizer_bert.encode(content, add_special_tokens=True)
                        bart_tokens = tokenizer_bart.encode(content, add_special_tokens=True)
                        bert_count += len(bert_tokens)
                        bart_count += len(bart_tokens)
                    polarity_sum += float(message.group("polarity"))
                    total += 1
                # try:
                #     polarities.append(polarity_sum / total)
                # except ZeroDivisionError:
                #     print(f"ZeroDivisionError in file: {file}")
                bert_lengths.append(bert_count)
                bart_lengths.append(bart_count)
            else:
                print(f"No match found in file: {os.path.join(path, directory, file)}")
            

bert_v = np.array(bert_lengths)
bart_v = np.array(bart_lengths)
polarities_v = np.array(polarities)

print(f"BERT Max Token Count: {np.max(bert_v)}, Min Token Count: {np.min(bert_v)}")
print(f"BERT Mean Token Count: {np.mean(bert_v):.2f}")
print(f"BERT Variance Token Count: {np.std(bert_v):.2f}")
print(f"BERT Over 512 Tokens: {len(bert_v[bert_v > 512])} / {bert_v.shape[0]}\n")

print(f"BART Max Token Count: {np.max(bart_v)}, Min Token Count: {np.min(bart_v)}")
print(f"BART Mean Token Count: {np.mean(bart_v):.2f}")
print(f"BART Variance Token Count: {np.std(bart_v):.2f}")
print(f"BART Over 1024 Tokens: {len(bart_v[bart_v > 1024])} / {bart_v.shape[0]}\n")

print(f"Polarities Mean: {np.mean(polarities_v):.2f}")
print(f"Polarities Variance: {np.std(polarities_v):.2f}")
print(f"Polarities Min: {np.min(polarities_v):.2f}")
print(f"Polarities Max: {np.max(polarities_v):.2f}")
print(f"Polarities Over 0: {len(polarities_v[polarities_v > 0])} / {polarities_v.shape[0]}")
print(f"Polarities Under 0: {len(polarities_v[polarities_v < 0])} / {polarities_v.shape[0]}")
print(f"Polarities Around 0: {len(polarities_v[np.logical_and(polarities_v < 0.5, polarities_v > -0.5)])} / {polarities_v.shape[0]}")

BERT Max Token Count: 1128, Min Token Count: 0
BERT Mean Token Count: 476.98
BERT Variance Token Count: 269.57
BERT Over 512 Tokens: 88 / 172

BART Max Token Count: 1156, Min Token Count: 0
BART Mean Token Count: 494.01
BART Variance Token Count: 278.74
BART Over 1024 Tokens: 2 / 172

Polarities Mean: nan
Polarities Variance: nan


ValueError: zero-size array to reduction operation minimum which has no identity