In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from datasets import load_dataset
from evaluate import load

# Load Model and Tokenizer
model_name = "google/flan-t5-xxl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    device_map="auto"
)

2025-03-29 10:16:12.403921: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-29 10:16:12.426032: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743243372.450069  114117 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743243372.457216  114117 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743243372.476557  114117 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Load Test Set
dataset = load_dataset("llm-blender/mix-instruct", split="test")

In [3]:
from tqdm import tqdm

def generate_output(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def batch_generate_output(prompts, bs=128):
    all_outputs = []
    for i in tqdm(range(0, len(prompts), bs)):
        batch_prompts = prompts[i:i + bs]
        inputs = tokenizer(batch_prompts, return_tensors='pt', padding=True).to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200)
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_outputs.extend(decoded_outputs)
    return all_outputs

prompts = dataset["input"]
references = dataset["output"]
predictions = batch_generate_output(prompts, bs=256)

with open('cache/misc/flan.pkl', 'wb') as f:
    import pickle
    pickle.dump(predictions, f)

100%|██████████| 20/20 [35:18<00:00, 105.94s/it]


In [7]:
import torch

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [4]:
from evaluate import load
# Load Metrics
rouge = load("rouge")
bleu = load("bleu")
bertscore = load("bertscore")
bleurt = load("bleurt")

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


I0000 00:00:1743245575.065127  114117 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 35277 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:ca:00.0, compute capability: 8.0
I0000 00:00:1743245575.067957  114117 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 23577 MB memory:  -> device: 1, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:17:00.0, compute capability: 8.0
I0000 00:00:1743245575.069723  114117 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 7289 MB memory:  -> device: 2, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:31:00.0, compute capability: 8.0


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [5]:
def evaluate_metrics(predictions, references):
    # Compute Metrics
    results = {}
    results.update(rouge.compute(predictions=predictions, references=references))
    results.update(bleu.compute(predictions=predictions, references=references))
    results.update(bertscore.compute(predictions=predictions, references=references, lang="en"))
    results.update(bleurt.compute(predictions=predictions, references=references))
    return results

with open('cache/misc/flan.pkl', 'rb') as f:
    import pickle
    pred = pickle.load(f)
# Calculate Metrics
results = evaluate_metrics(pred, references)

print("Evaluation Results:")
for metric, value in results.items():
    print(f"{metric}: {value}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Results:
rouge1: 0.09917449354126222
rouge2: 0.028109537821339067
rougeL: 0.08298002650824621
rougeLsum: 0.08357447956493415
bleu: 0.003966477362503056
precisions: [0.26550340897815705, 0.07468079614981706, 0.0351600895921623, 0.0205943024702934]
brevity_penalty: 0.06443709526710639
length_ratio: 0.26723207284876077
translation_length: 70549
reference_length: 263999
precision: [0.8841546773910522, 0.8232880234718323, 0.0, 0.8414329290390015, 0.79203200340271, 0.9037922024726868, 0.8255098462104797, 0.0, 0.8050273656845093, 0.9354555606842041, 0.929699182510376, 0.9067494869232178, 0.9114758372306824, 0.7947301864624023, 0.8337825536727905, 0.8249984383583069, 0.8454591035842896, 0.8689166307449341, 0.900529682636261, 0.8978637456893921, 0.9519354701042175, 0.8347032070159912, 0.8179648518562317, 0.891208827495575, 0.817611575126648, 0.8204136490821838, 0.8085076212882996, 0.8747634291648865, 0.7982109189033508, 0.85594242811203, 0.8785643577575684, 0.8143312931060791, 0.8183

In [6]:
dataset['input'][0]

"I've always wondered what the difference is between a skeptic and a denier."

In [15]:
predictions[12]

'a.'