In [None]:
import math
from collections import Counter

import pandas as pd
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used below) is available
# and shows a progress bar; `dynamic_ncols=True` is forwarded to the bar.
tqdm.pandas(dynamic_ncols=True)

def text_entropy(text):
    """Return the character-level Shannon entropy of ``text`` in bits.

    The entropy is computed over the empirical distribution of the characters
    in ``text``: ``-sum(p(c) * log2(p(c)))`` for every distinct character ``c``.

    Args:
        text: The string to score. Anything that is not a non-empty ``str``
            (``None``, ``NaN`` from a missing CSV cell, numbers, ``""``) is
            treated as having no information content.

    Returns:
        A non-negative ``float``. ``0.0`` for empty/non-string input and for
        strings made of a single repeated character.
    """
    if not text or not isinstance(text, str):
        return 0.0

    total = len(text)
    entropy = -sum(
        (count / total) * math.log2(count / total)
        for count in Counter(text).values()
    )
    # With one distinct character the sum is 0.0 and the negation yields -0.0,
    # which would be written to the CSV as "-0.0". -0.0 is falsy, so `or`
    # normalises it to a plain 0.0 without touching any other value.
    return entropy or 0.0

# The results file is both the input and the output: the entropy column is
# added and the CSV is rewritten in place.
results_csv = "/content/llama3.2_3b_results.csv"

df = pd.read_csv(results_csv)
response_entropy = df["model_response"].progress_apply(text_entropy)
df["entropy"] = response_entropy
df.to_csv(results_csv, index=False)


100%|██████████| 100/100 [00:00<00:00, 7050.32it/s]


In [None]:
import os
import math
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint name passed to `from_pretrained` for both tokenizer and model.
MODEL_NAME = "gpt2"
# Upper bound on tokens per text; passed to the tokenizer as `max_length`
# together with `truncation=True` in `sequence_entropy`.
MAX_LENGTH = 256

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
# The model is only used for scoring in this notebook, never trained.
model.eval()

@torch.no_grad()
def sequence_entropy(text, use_device):
    """Return the mean next-token predictive entropy (in bits) over ``text``.

    ``text`` is tokenized with the module-level ``tokenizer`` (truncated to
    ``MAX_LENGTH`` tokens) and scored with the module-level ``model``. For
    every prefix of length 1 .. seq_len-1 the entropy of the model's
    next-token distribution is computed; the result is the average of those
    per-position entropies.

    Args:
        text: The string to score. Empty or non-string values yield ``0.0``.
        use_device: ``torch.device`` the token ids are moved to. The caller is
            responsible for having ``model`` on the same device.

    Returns:
        ``float`` mean entropy in bits, or ``0.0`` when the text is
        empty/non-string or tokenizes to at most one token.
    """
    if not text or not isinstance(text, str):
        return 0.0

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    input_ids = inputs["input_ids"].to(use_device)
    if input_ids.size(1) <= 1:
        return 0.0

    # A causal LM only attends to earlier tokens, so the logits at position
    # i-1 of one full forward pass are the prediction after the first i
    # tokens. One pass therefore replaces a separate model call per prefix.
    # The last position is dropped: the original loop never scored the
    # prediction that follows the complete sequence.
    logits = model(input_ids).logits[0, :-1, :]
    probs = torch.softmax(logits, dim=-1)
    # The small epsilon keeps log2 finite where a probability underflows to 0.
    per_position = -torch.sum(probs * torch.log2(probs + 1e-12), dim=-1)

    # Single device-to-host transfer; average in Python floats.
    values = per_position.cpu().tolist()
    return sum(values) / len(values)

input_path = "/content/result_1_texts1.csv"
# Results are written back into the input file, which is what makes the run
# resumable: rows that already have an `entropy_new` value are skipped.
output_path = input_path


def _entropy_with_cpu_fallback(text, row_label):
    """Score ``text`` on ``device``; on failure retry once on the CPU, else NaN."""
    try:
        return sequence_entropy(text, device)
    except Exception as err:
        tqdm.write(f"row {row_label}: scoring failed on {device}: {err!r}")

    if device.type != "cuda":
        # Already running on the CPU: a retry would repeat the same failure.
        return float("nan")

    torch.cuda.empty_cache()
    try:
        model.to("cpu")
        result = sequence_entropy(text, torch.device("cpu"))
    except Exception as err:
        tqdm.write(f"row {row_label}: CPU retry failed: {err!r}")
        result = float("nan")
    finally:
        # Always move the model back, even when the retry raised; otherwise
        # every following row would fail on the GPU path first.
        try:
            model.to(device)
        except Exception as err:
            tqdm.write(f"row {row_label}: could not move model back to {device}: {err!r}")
    return result


def _save_checkpoint(frame, path):
    """Write ``frame`` to ``path`` via a temp file and an atomic rename."""
    # `path` is also the input file, so an interrupted plain `to_csv` could
    # leave the only copy of the data truncated.
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


df = pd.read_csv(input_path)

if "entropy_new" not in df.columns:
    # NaN (not pd.NA) keeps the column float-typed, matching the values
    # assigned below and the dtype pandas infers when the CSV is re-read.
    df["entropy_new"] = float("nan")

rows_to_process = df.index[df["entropy_new"].isna()].tolist()

for idx in tqdm(rows_to_process, dynamic_ncols=True):
    text = df.at[idx, "text"]
    ent = _entropy_with_cpu_fallback(text, idx)
    df.at[idx, "entropy_new"] = ent
    # Checkpoint after every row so an interrupted run loses at most one row.
    _save_checkpoint(df, output_path)


100%|██████████| 150/150 [06:47<00:00,  2.72s/it]
