In [1]:
from datasets import load_dataset

# Load the dataset identified by 'ape-ds'; later cells index it by 'train' and 'test'.
ds = load_dataset(path='ape-ds')

In [2]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

# Fine-tuned seq2seq checkpoint used to produce the draft translations.
checkpoint = "baseline-model/checkpoint-120970"

# Load the model weights directly onto the first CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")

# The tokenizer is loaded from its own location rather than from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained('my-tokenizer')

# The collator expects the model *object* (not the checkpoint path string) so it
# can use the model's `prepare_decoder_input_ids_from_labels` when building batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [5]:
import torch

def make_translations(batch, max_new_tokens=None):
    """Translate a batch of Tibetan strings and store the result for post-editing.

    Designed to be used with ``ds.map(..., batched=True)``. Reads the source
    strings from ``batch['bo']``, prefixes each with the translation prompt,
    runs ``model.generate`` and writes the decoded text to
    ``batch['for-post-edit']``.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        batch: Mapping of column name to list of values; must contain ``'bo'``.
        max_new_tokens: Optional cap on the number of generated tokens. When
            ``None`` (the default) no length argument is passed and the
            model's own generation settings apply.
            NOTE(review): the sample output saved in this notebook looks short
            and repetitive — presumably the default length limit; confirm and
            pass a value via ``fn_kwargs`` if needed.

    Returns:
        The same ``batch`` mapping with the ``'for-post-edit'`` column added.
    """
    sources = batch['bo']

    # Nothing to translate: avoid handing an empty list to the tokenizer.
    if not sources:
        batch['for-post-edit'] = []
        return batch

    # Tokenize the whole batch at once, on whatever device the model lives on.
    prompts = ["translate Tibetan to English: " + text for text in sources]
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    ).to(model.device)

    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # The batch is padded, so the attention mask must be passed along with the
    # ids; only these two fields are forwarded because other tokenizer outputs
    # (e.g. token_type_ids) are not accepted by every seq2seq model.
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            **generate_kwargs,
        )

    # Decode all translations at once.
    batch['for-post-edit'] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch

# Run the translation function over the dataset in batches.
# Tune batch_size to fit the available memory.
ds = ds.map(
    make_translations,
    batch_size=128,
    batched=True,
)


Map:   0%|          | 0/193544 [00:00<?, ? examples/s]

Map:   0%|          | 0/21506 [00:00<?, ? examples/s]

In [7]:
# Inspect the first row of the train split.
first_train_row = ds['train'][0]
first_train_row

{'bo': 'ཞི་ཁྲོ་སྤྲུལ་པའི་སྐུ་ལ་ཕྱག་འཚལ་ལོ༔',
 'en': 'Nirmāṇakāya peaceful and wrathful: to you I pay homage!',
 'topic': 'Confession, Termas, Tibetan Masters, Nyala Pema Dündul',
 'input_ids': [7526,
  440,
  13572,
  311,
  727,
  823,
  566,
  61,
  22,
  13972,
  23483,
  6763,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [9]:
# Drop the tokenized fields; the text columns are kept.
tokenization_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
ds = ds.remove_columns(tokenization_columns)

In [10]:
# Display the dataset summary (splits, columns and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['bo', 'en', 'topic', 'for-post-edit'],
        num_rows: 193544
    })
    test: Dataset({
        features: ['bo', 'en', 'topic', 'for-post-edit'],
        num_rows: 21506
    })
})

In [13]:
# Inspect the first row of the test split, including the generated 'for-post-edit' text.
first_test_row = ds['test'][0]
first_test_row

{'bo': 'བདག་ལུས་སྐྱེ་དགུའི་ལོངས་སྤྱོད་མཎྜལ་རྫས༔',
 'en': 'My body and the resources of all sentient beings are substances of this maṇḍala.',
 'topic': 'Ngöndro, Termas, Tibetan Masters, Do Khyentse Yeshe Dorje',
 'for-post-edit': 'The five certainties of the five certainties of the five certainties'}

In [12]:
# Persist the dataset, now carrying the 'for-post-edit' column, to disk.
# NOTE(review): 'ape-ds' is the same string passed to load_dataset at the top of
# this notebook — confirm that writing to that location is intended.
output_dir = 'ape-ds'
ds.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/193544 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21506 [00:00<?, ? examples/s]