In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os

# Importing the T5 modules from huggingface/transformers
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration

[2024-06-29 05:12:02,957] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
class PredictDataset(Dataset):
    """Inference-only dataset that tokenizes one sentence per item.

    Each item is a dict with two 1-D ``torch.long`` tensors of length
    ``source_len``: ``'source_ids'`` (token ids) and ``'source_mask'``
    (attention mask). There are no targets; this is meant for prediction.

    Args:
        sentences: Indexable collection of inputs; each element is passed
            through ``str()`` before tokenization.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'``.
        source_len: Length every encoded input is padded/truncated to.
    """

    def __init__(self, sentences, tokenizer, source_len):
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.ctext = sentences

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.ctext)

    def __getitem__(self, index):
        """Tokenize the sentence at ``index`` and return its ids and mask."""
        # Collapse every run of whitespace (including newlines) to one space.
        ctext = ' '.join(str(self.ctext[index]).split())

        # `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`; the padding strategy is the same.
        source = self.tokenizer(
            [ctext],
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the leading batch axis. A bare `.squeeze()` would also
        # remove the sequence axis when `source_len == 1`, yielding a 0-d
        # tensor instead of a length-1 sequence.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
        }

In [18]:
def predict(tokenizer, model, device, loader, max_length=400, num_beams=10, log_every=100):
    """Generate a decoded prediction for every example yielded by ``loader``.

    The model is switched to eval mode and generation runs under
    ``torch.no_grad()``.

    Args:
        tokenizer: Object with a ``decode(ids, skip_special_tokens=...,
            clean_up_tokenization_spaces=...)`` method.
        model: Object exposing ``eval()`` and ``generate(input_ids=...,
            attention_mask=..., max_length=..., num_beams=...)``.
        device: Device the input tensors are moved to before generation.
        loader: Iterable of batches; each batch is a mapping with
            ``'source_ids'`` and ``'source_mask'`` tensors.
        max_length: Forwarded to ``model.generate`` (default 400).
        num_beams: Forwarded to ``model.generate`` (default 10).
        log_every: Print a progress line every this many batches; a falsy
            value disables progress output (default 100).

    Returns:
        list[str]: Decoded predictions in the order the loader produced them.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for step, batch in enumerate(loader):
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
            )
            predictions.extend(
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            )

            # The guard also protects against a modulo-by-zero when
            # progress output is turned off with log_every=0.
            if log_every and step % log_every == 0:
                print(f'Completed {step}')

    return predictions

In [19]:
from torch import cuda

# Single source of truth for every RNG seeded in this notebook.
SEED = 32

torch.manual_seed(SEED)  # pytorch random seed
np.random.seed(SEED)  # numpy random seed
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [42]:
# Checkpoint directory; the tokenizer and the model weights are both read from it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ptpath = "/data/users/jalabi/YAD/checkpoints/NiLE/small/"

tokenizer = AutoTokenizer.from_pretrained(ptpath)
# Load the weights and move the model onto the selected device in one step.
model = T5ForConditionalGeneration.from_pretrained(ptpath).to(device)

In [52]:
# One-sentence smoke test; the input starts with the "<en2ig>:" tag.
# Every input is padded/truncated to 1024 tokens.
test_set = PredictDataset(["<en2ig>: Food is good"], tokenizer, 1024)

# Keep the input order (no shuffling) and load in the main process.
test_params = dict(batch_size=1, shuffle=False, num_workers=0)
test_loader = DataLoader(test_set, **test_params)

In [53]:
# Run generation over the test loader and collect the decoded outputs.
lang_predictions = predict(tokenizer=tokenizer, model=model, device=device, loader=test_loader)

Completed 0


In [54]:
# Display the decoded predictions (one string per input sentence).
lang_predictions

['Nri bụ ihe dị mma.']