In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint so the model and its tokenizer
# are always loaded from the same repository.
MODEL_ID = "billingsmoore/mlotsawa-ground-small"

# Prefer the first GPU when CUDA is available; otherwise fall back to CPU so the
# notebook still runs (slowly) instead of failing at load time.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, device_map=DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [2]:
from datasets import load_from_disk

# Load the previously saved, tokenized dataset from disk and keep only its 'test' split.
ds = load_from_disk('../Finetuning/Data/tokenized-finetuning-ds')['test']

In [3]:
import torch

def generate_and_decode(batch):
    """Generate text for one batch and return it decoded.

    Reads ``batch["input_ids"]`` and ``batch["attention_mask"]``, moves them to
    the device of the module-level ``model``, runs ``model.generate`` with
    4-beam, non-sampling decoding capped at 128 new tokens, and decodes the
    result with the module-level ``tokenizer`` (special tokens stripped).

    Returns:
        dict with a single key, ``"small_predictions"``, holding the decoded
        outputs for the batch.
    """
    target_device = model.device

    # NOTE(review): torch.tensor needs rows of equal length, so this presumably
    # relies on the dataset having been padded at tokenization time -- confirm.
    encoder_inputs = {
        field: torch.tensor(batch[field]).to(target_device)
        for field in ("input_ids", "attention_mask")
    }

    generation_settings = dict(max_new_tokens=128, num_beams=4, do_sample=False)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**encoder_inputs, **generation_settings)

    return {
        "small_predictions": tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )
    }


In [11]:
# Run generation over the whole split, 16 examples per call; the dict returned by
# generate_and_decode is attached to the dataset as the 'small_predictions' column.
pred_ds = ds.map(
    generate_and_decode,
    batch_size=16,
    batched=True,
)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [12]:
# Display the dataset summary to confirm the 'small_predictions' column was added.
pred_ds

Dataset({
    features: ['bo', 'en', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'small_predictions'],
    num_rows: 100000
})

In [13]:
# Persist the dataset (including the generated predictions) to the relative
# directory 'small-pred-ds'.
pred_ds.save_to_disk('small-pred-ds')

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]