In [1]:
import torch
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from transformers import DataCollatorWithPadding
from transformers import BertForSequenceClassification, AutoTokenizer

In [3]:
cedr = load_dataset('cedr')

No config specified, defaulting to: cedr/main
Reusing dataset cedr (/Users/arsenplus/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def binarize_labels(labels):
    return [int(len(labels)==0) ] + [int(i in labels) for i in range(5)]

In [5]:
MODEL_NAME = 'cointegrated/rubert-tiny2-cedr-emotion-detection'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=6, 
    problem_type='multi_label_classification'
    )

In [6]:
cedr_mapped = cedr.map(
    lambda x: tokenizer(
        x['text'],
        truncation=True), 
    batched=True
    ).map(
        lambda x: {'label': [float(y) for y in binarize_labels(x['labels'])]},
        batched=False,
        remove_columns=['text', 'labels', 'source']
        )

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7528 [00:00<?, ?ex/s]

  0%|          | 0/1882 [00:00<?, ?ex/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer)

In [8]:
def predict_with_model(model, dataloader):
    preds = []
    facts = []

    for batch in tqdm(dataloader):
        facts.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)
        
        with torch.no_grad():
            pr = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids
                )
        preds.append(torch.softmax(pr.logits, -1).cpu().numpy())

    facts = np.concatenate(facts)
    preds = np.concatenate(preds)

    return facts, preds


def get_classification_report(facts, preds, model):
    aucs = {label:roc_auc_score(facts[:, i], preds[:, i]) for i, label in model.config.id2label.items()}
    return aucs


def evaluate_model(model, dev_dataloader):
    facts, preds = predict_with_model(model, dev_dataloader)
    aucs = get_classification_report(facts, preds, model)
    aucs['overall'] = np.mean(list(aucs.values()))
    return aucs

In [9]:
batch_size = 64

In [10]:
test_dataloader = DataLoader(
    cedr_mapped['test'], 
    batch_size=batch_size,
    drop_last=False,
    shuffle=True,
    num_workers=0,
    collate_fn=data_collator
)

In [11]:
test_results = evaluate_model(model, test_dataloader)

  0%|          | 0/30 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
test_results

{'no_emotion': 0.9285743954656362,
 'joy': 0.9511928957992504,
 'sadness': 0.956353783198774,
 'surprise': 0.8908363111599781,
 'fear': 0.8954949670239243,
 'anger': 0.7511348890153671,
 'overall': 0.8955978736104884}