In [None]:
# --- Checkpoint ---------------------------------------------------------------
MODEL = "./best_model"  # directory the tokenizer, config and model are loaded from

# --- Tokenisation / batching --------------------------------------------------
BATCH_SIZE = 8           # used for both the dataset map and the test DataLoader
MAX_INPUT_LENGTH = 512   # truncation limit for tokenised questions
MAX_OUTPUT_LENGTH = 128  # truncation limit for tokenised summaries and generation max_length

# --- Beam-search decoding -----------------------------------------------------
NUM_BEAMS = 15
NO_REPEAT_NGRAM_SIZE = 2
LENGTH_PENALTY = 1

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import torch
import evaluate
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
import os
from statistics import mean
from rouge import Rouge

# Switch off the tokenizers library's own parallelism through its environment flag.
# NOTE(review): presumably set to avoid the fork-related warning once batches are produced
# through a DataLoader — confirm. It is set here, before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Read the test split from disk and wrap it in the Hugging Face dataset containers.
path = "../Dataset"
test_frame = pd.read_csv(f"{path}/test.csv")

# `test` stays bound to the `Dataset`: the last cell attaches the predictions to it.
test = Dataset.from_dict(test_frame)
ds = DatasetDict({"test": test})

In [None]:
# Load the tokenizer from the same MODEL directory as the checkpoint; it is used to encode
# the dataset and to decode generated ids back into text.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def tokenize_data(data):
    """Tokenise a batch of examples for the seq2seq model.

    `data` must provide "question" and "summary" entries. Questions are truncated to
    MAX_INPUT_LENGTH tokens and summaries to MAX_OUTPUT_LENGTH tokens.

    Returns a dict holding the question encoding ("input_ids", "attention_mask") and the
    summary token ids under "labels".
    """
    questions = tokenizer(data["question"], truncation=True, max_length=MAX_INPUT_LENGTH)
    summaries = tokenizer(data["summary"], truncation=True, max_length=MAX_OUTPUT_LENGTH)

    model_inputs = {key: questions[key] for key in ("input_ids", "attention_mask")}
    model_inputs["labels"] = summaries["input_ids"]
    return model_inputs

# Tokenise every split in batches; the raw text columns are dropped once encoded.
raw_text_columns = ["summary", "question"]
tokenized_ds = ds.map(
    tokenize_data,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=raw_text_columns,
)

In [None]:
# Prefer the GPU when one is visible; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint configuration with the notebook's decoding settings applied on top.
config = AutoConfig.from_pretrained(
    MODEL,
    max_length=MAX_OUTPUT_LENGTH,
    length_penalty=LENGTH_PENALTY,
    no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
    num_beams=NUM_BEAMS,
)

# Pass the customised config to the model. Previously `config` was built but never used,
# so the settings above were silently discarded when the model was loaded.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL, config=config).to(device)

In [None]:
# Batch collator for the seq2seq test loader: pads every batch to its longest sequence
# (padding="longest") and returns PyTorch tensors (return_tensors="pt").
# NOTE(review): `model` is presumably passed so the collator can derive decoder inputs from
# the labels — confirm against the transformers documentation.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt"
)

In [None]:
# Metric back-ends used by `metrics_func`: ROUGE from the `rouge` package and BERTScore
# loaded through `evaluate`.
rouge = Rouge()
bert_score = evaluate.load("bertscore")

def tokenize_sentence(arg):
    """Return the tokenizer's token strings for `arg`.

    The text is encoded with the notebook-level `tokenizer` and the resulting ids are
    mapped back to their vocabulary tokens.
    """
    token_ids = tokenizer(arg).input_ids
    return tokenizer.convert_ids_to_tokens(token_ids)

def metrics_func(eval_arg):
    """Score one batch of generated summaries against its references.

    Args:
        eval_arg: a ``(preds, labels)`` pair of token-id arrays. Entries equal to -100 are
            replaced by the tokenizer's pad id before decoding.

    Returns:
        dict with the batch-averaged ROUGE F-scores under 'rouge-1', 'rouge-2' and
        'rouge-l' (0.0 each when no prediction/reference pair is scorable) and the mean
        BERTScore F1, rounded to 4 decimals, under 'bertscore'.
    """
    preds, labels = eval_arg

    # -100 is not a decodable id (presumably the collator's label padding value — confirm),
    # so map it to the pad token, which `skip_special_tokens=True` then strips.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    results = {}
    rouge_keys = ('rouge-1', 'rouge-2', 'rouge-l')

    # `ignore_empty=True` drops pairs with an empty side, but the rouge package raises when
    # that leaves nothing to score (e.g. every generation in the batch decoded to "").
    # Report zeros for such a batch instead of aborting the whole evaluation run.
    if any(pred and label for pred, label in zip(text_preds, text_labels)):
        rouge_scores = rouge.get_scores(text_preds, text_labels, avg = True, ignore_empty = True)
        for key in rouge_keys:
            results[key] = rouge_scores[key]['f']
    else:
        for key in rouge_keys:
            results[key] = 0.0

    bertscore_result = bert_score.compute(
        predictions=text_preds,
        references=text_labels,
        model_type="csebuetnlp/banglabert",
        num_layers=12,
        batch_size=4
    )
    # Average first, round once: rounding every F1 before averaging skews the mean.
    results['bertscore'] = round(mean(bertscore_result["f1"]), 4)

    return results

In [None]:
# Serve the tokenised test split as PyTorch tensors; the collator pads each batch.
test_split = tokenized_ds["test"].with_format("torch")
test_dataloader = DataLoader(test_split, batch_size=BATCH_SIZE, collate_fn=data_collator)

In [None]:
scores = []       # one metrics dict per batch
predictions = []  # decoded summaries, in dataset order

for batch in tqdm(test_dataloader):
    torch.cuda.empty_cache()
    with torch.no_grad():
        preds = model.generate(
            batch["input_ids"].to(device),
            # Batches are padded to the longest question, so pass the collator's mask
            # explicitly rather than letting pad positions be attended to or guessed.
            attention_mask=batch["attention_mask"].to(device),
            num_beams=NUM_BEAMS,
            num_return_sequences=1,
            no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
            # LENGTH_PENALTY was configured but never reached the generate call before.
            length_penalty=LENGTH_PENALTY,
            remove_invalid_values=True,
            max_length=MAX_OUTPUT_LENGTH,
        )
        preds = preds.cpu()
        labels = batch["labels"]
        scores.append(metrics_func([preds, labels]))
        predictions.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

# Mean of the per-batch scores: a smaller final batch weighs the same as a full one.
print(pd.DataFrame(scores).mean())

# Drop a stale column first so that re-running this cell does not fail on a duplicate name.
if 'finetuned_predictions' in test.column_names:
    test = test.remove_columns('finetuned_predictions')
test = test.add_column('finetuned_predictions', predictions)
test.to_pandas().to_json('output.json', orient='records')