In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not fail on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sequence-classification checkpoint to evaluate. The tokenizer comes from a
# different checkpoint name (presumably the base model this classifier was
# fine-tuned from -- TODO confirm the vocabularies match).
model = BertForSequenceClassification.from_pretrained('boapps/kmdb_classification_model').to(device)
tokenizer = BertTokenizer.from_pretrained('SZTAKI-HLT/hubert-base-cc')

In [None]:
from datasets import concatenate_datasets
from datasets import load_dataset

def _with_title_and_description(row):
    """Build the extra 'td' column: the row's title and description joined by a newline."""
    combined_text = row['title'] + '\n' + row['description']
    return {'td': combined_text}


# Load the dataset, then add the combined 'td' text column to every row.
raw_dataset = load_dataset("boapps/kmdb_classification")
dataset = raw_dataset.map(_with_title_and_description)
def tokenize_function(examples):
    """Tokenize the combined title/description text stored under the "td" key.

    Every sequence is truncated to, and padded up to, 512 tokens.

    Args:
        examples: mapping with a "td" entry holding the text(s) to encode
            (a batch of rows when used with ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["td"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded

# Apply tokenize_function in batched mode, then work on the 'test' split only.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_test = tokenized_datasets['test']


def _first_rows_with_label(split, label, count):
    """Keep the rows of `split` whose 'label' equals `label` and return the first `count` of them."""
    matching = split.filter(lambda row: row['label'] == label)
    return matching.select(range(count))


# Fixed-size evaluation sample: 200 rows labelled 1 and 1895 rows labelled 0.
test_pos = _first_rows_with_label(tokenized_test, 1, 200)
test_neg = _first_rows_with_label(tokenized_test, 0, 1895)

# Merge both classes and shuffle with a fixed seed so the order is reproducible.
test_set = concatenate_datasets([test_pos, test_neg]).shuffle(seed=42)

In [None]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Confusion-matrix counters; label 1 is treated as the positive class.
tp = 0
tn = 0
fp = 0
fn = 0

with torch.no_grad():
    for a in tqdm(test_set):
        # Truncate to 512 tokens (the same limit tokenize_function uses) so an
        # over-long text cannot exceed the sequence length the model accepts.
        # Inputs are placed on whatever device the model lives on.
        inputs = tokenizer(
            a['td'], return_tensors="pt", truncation=True, max_length=512
        ).to(model.device)
        logits = model(**inputs).logits
        probabilities = F.softmax(logits[0], dim=-1)
        # Predict "negative" only when class 0 receives more than half of the
        # probability mass; anything else counts as a positive prediction.
        neg = probabilities[0].item() > 0.5
        label = a['label']
        if label == 0:
            if neg:
                tn += 1
            else:
                fp += 1
        elif label == 1:
            if neg:
                fn += 1
            else:
                tp += 1

# Ratios fall back to NaN instead of raising when a denominator is zero
# (e.g. no positive predictions at all).
print('precision', _safe_ratio(tp, tp + fp))
print('recall', _safe_ratio(tp, tp + fn))
print('accuracy', _safe_ratio(tp + tn, tp + tn + fp + fn))

tp, tn, fp, fn