In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release any cached-but-unused GPU memory.
# NOTE(review): presumably here to reclaim memory when the notebook is re-run in a
# live kernel; on a fresh kernel there should be nothing to free — confirm.
torch.cuda.empty_cache()
from project.models.baselines.classification_model import ClassificationModel
from project.models.common.nlp_configs import HFTransformerDataConfig
from project.data.nlp_data_module import NLPDataModule
from transformers import AutoTokenizer

In [None]:
# Input CSV schema: `sentence1` and `sentence2` columns (the argument and the key point,
# respectively) plus an integer `idx` column.
test_data_path = "test_dataset.csv"
model_checkpoint_path = "crypto-440-epoch=2-step=968.ckpt"

# Restore the classifier from its checkpoint.
model = ClassificationModel.load_from_checkpoint(checkpoint_path=model_checkpoint_path)

# The same CSV is supplied as both the train and the test file.
# NOTE(review): presumably the config/data module insists on a train file — confirm.
data_cfg = HFTransformerDataConfig(
    batch_size=32,
    num_workers=4,
    train_file=test_data_path,
    test_file=test_data_path,
)

# The tokenizer is looked up by the name/path recorded in the wrapped model's config.
pretrained_name = model.model.config._name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_name,
    use_fast=True,
)

# Build the data module, run its "test" setup stage and grab the resulting test split.
datamodule = NLPDataModule(cfg=data_cfg, tokenizer=tokenizer)
datamodule.setup("test")
test_dataset = datamodule.ds["test"]

In [None]:
# Score every test example with the loaded model, one batch at a time.
BATCH_SIZE = 32

# Inference only: move the model to the GPU and switch it to eval mode so that
# train-time layers such as dropout do not make the scores non-deterministic.
model.to('cuda')
model.eval()

# Maps each example's `idx` value to its predicted score. Keying by `idx` (rather
# than writing to a row *position*) keeps scores attached to the right example even
# when `idx` values are not exactly 0..n-1 in file order.
scores_by_idx = {}


def add_preds(examples):
    """Score one batch and record every example's score under its ``idx``.

    Args:
        examples: A batch as produced by ``test_dataset.map(..., batched=True)``;
            must contain an ``"idx"`` entry with one identifier per example.

    Returns:
        None. Results are accumulated in the module-level ``scores_by_idx`` dict,
        so ``Dataset.map`` leaves the dataset itself unchanged.
    """
    # No gradients are needed for prediction; skipping them saves GPU memory.
    with torch.no_grad():
        similarity, _, output = model._forward(examples, predict_with_gpu=True)

    for i, idx in enumerate(examples["idx"]):
        if output is not None:
            # NOTE(review): index 1 is presumably the "match" class — confirm.
            score = output[i][1].item()
        else:
            score = similarity[i].item()
        # int() accepts both 0-dim tensors and plain Python/numpy integers.
        scores_by_idx[int(idx)] = score


test_dataset.map(add_preds, batched=True, batch_size=BATCH_SIZE)

# Join the scores onto the frame by `idx` value. Rows that received no score keep
# the -1 sentinel; the column is float from the start, so no dtype upcast occurs.
test_ds_df = test_dataset.to_pandas()
test_ds_df['score'] = test_ds_df['idx'].map(scores_by_idx).fillna(-1.0)

In [None]:
# Quick sanity check: display the first two rows of the frame.
test_ds_df.head(n=2)

In [None]:
# Write the frame to CSV, omitting the pandas row index.
scored_output_path = "test_dataset_with_match_score.csv"
test_ds_df.to_csv(scored_output_path, index=False)