In [2]:
from datasets import load_dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from time import localtime, strftime, strptime
import os

In [3]:
# Number of t*/s* columns per row (t0..t{CL-1} and s0..s{CL-1}).
CL = 5

# Every t*/s* column is read as a plain string; key order is all t* first, then all s*.
f = dict(
    (f"{k}{i}", Value(dtype="string"))
    for k in "ts"
    for i in range(CL)
)

# Integer-coded target column with eight named classes.
label_feature = ClassLabel(
    num_classes=8,
    names=["neutral", "joy", "sadness", "fear", "anger", "surprise", "disgust", "non-neutral"],
)

# One CSV file per split, all located under ../D0/.
csv_paths = {split: f"../D0/{split}.csv" for split in ["train", "val", "test"]}

dataset = load_dataset(
    "csv",
    data_files=csv_paths,
    features=Features({**f, "labels": label_feature}),
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def preprocess_data(examples):
    """Tokenize the last text column (``t{CL-1}``) of a batch of rows.

    Sequences are truncated to at most 128 tokens. No padding is requested
    here, so the returned encodings keep their individual lengths.
    """
    last_text_column = f"t{CL-1}"
    return tokenizer(examples[last_text_column], max_length=128, truncation=True)


# Padding is delegated to the collator, which is built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/7681 [00:00<?, ? examples/s]

Map:   0%|          | 0/858 [00:00<?, ? examples/s]

Map:   0%|          | 0/1964 [00:00<?, ? examples/s]

In [7]:
# Load the sequence-classification checkpoint saved in the local ../M1/weighted/ directory.
model = AutoModelForSequenceClassification.from_pretrained(
    "../M1/weighted/",
)

In [8]:
def compute_metrics(eval_pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over its last axis.

    Returns:
        dict with plain Python floats under the keys:
            "WA"   - accuracy,
            "UWA"  - balanced accuracy,
            "miF1" - micro-averaged F1,
            "maF1" - macro-averaged F1,
            "wtF1" - support-weighted F1.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # float() accepts NumPy scalars and built-in floats alike. The previous
    # ``.tolist()`` call exists only on NumPy scalars, so it raised
    # AttributeError on scikit-learn versions that return a built-in float.
    metrics = {
        "WA": float(accuracy_score(labels, predictions)),
        "UWA": float(balanced_accuracy_score(labels, predictions)),
        "miF1": float(f1_score(labels, predictions, average="micro")),
        "maF1": float(f1_score(labels, predictions, average="macro")),
        "wtF1": float(f1_score(labels, predictions, average="weighted")),
    }

    return metrics

In [9]:
# Inverse-frequency class weights: weight[c] = 1 / (number of training rows labelled c).
# NOTE(review): a class id below the largest observed id that has zero training rows
# would get an infinite weight — confirm every class occurs in the training split.
class_counts = np.bincount(dataset["train"]["labels"])
loss_fn = nn.CrossEntropyLoss(
    weight=torch.as_tensor(1 / class_counts, dtype=torch.float32)
).to("cuda" if torch.cuda.is_available() else "cpu")  # no longer crashes on CPU-only hosts


class WeightedTrainer(Trainer):
    """Trainer that replaces the model's own loss with the class-weighted ``loss_fn``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy between the model's logits and the labels.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``"logits"``.
            inputs: Batch mapping. Its ``"labels"`` entry is popped (the mapping is
                mutated) so the model is called without labels.
            return_outputs: When true, return ``(loss, outputs)`` instead of just ``loss``.
            num_items_in_batch: Unused. Accepted so that Trainer versions which pass
                this keyword do not fail with a TypeError.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        logits = outputs["logits"]
        # Module.to() moves the weight buffer in place and returns the module, so the
        # class weights always live on the same device as the logits.
        loss = loss_fn.to(logits.device)(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [10]:
# Settings for this Trainer: scratch output directory, eval batch size 64,
# and no reporting integrations.
eval_settings = {
    "output_dir": "/tmp/tmp_trainer",
    "per_device_eval_batch_size": 64,
    "report_to": "none",
}
training_args = TrainingArguments(**eval_settings)

trainer = WeightedTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [12]:
# Evaluate on the tokenized test split; the returned metrics dict is the cell's output.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.381253957748413,
 'eval_WA': 0.5025458248472505,
 'eval_UWA': 0.5303076044043993,
 'eval_miF1': 0.5025458248472505,
 'eval_maF1': 0.3889112305202165,
 'eval_wtF1': 0.49749468578180783,
 'eval_runtime': 0.8976,
 'eval_samples_per_second': 2187.966,
 'eval_steps_per_second': 34.535}

In [11]:
# Run inference on every split of the dataset, keyed by split name.
predictions = {
    split_name: trainer.predict(tokenized_dataset[split_name])
    for split_name in dataset
}

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/121 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [13]:
# Build one DataFrame per split from the un-tokenized dataset and overwrite its
# "labels" column with the model's predicted class id (argmax over the logits).
preds = {}
for split_name in dataset:
    split_frame = pd.DataFrame(dataset[split_name])
    split_logits = predictions[split_name].predictions
    split_frame["labels"] = np.argmax(split_logits, axis=1)
    preds[split_name] = split_frame

In [14]:
# Display the train-split frame; its "labels" column now holds predicted class ids,
# not the labels loaded from the CSV.
preds["train"]

Unnamed: 0,t0,t1,t2,t3,t4,s0,s1,s2,s3,s4,labels
0,also I was the point person on my company's tr...,You must've had your hands full.,That I did. That I did.,So let's talk a little bit about your duties.,My duties? All right.,Chandler,The Interviewer,Chandler,The Interviewer,Chandler,0
1,You must've had your hands full.,That I did. That I did.,So let's talk a little bit about your duties.,My duties? All right.,"Now you'll be heading a whole division, so you...",The Interviewer,Chandler,The Interviewer,Chandler,The Interviewer,0
2,That I did. That I did.,So let's talk a little bit about your duties.,My duties? All right.,"Now you'll be heading a whole division, so you...",I see.,Chandler,The Interviewer,Chandler,The Interviewer,Chandler,0
3,So let's talk a little bit about your duties.,My duties? All right.,"Now you'll be heading a whole division, so you...",I see.,But there'll be perhaps 30 people under you so...,The Interviewer,Chandler,The Interviewer,Chandler,The Interviewer,0
4,My duties? All right.,"Now you'll be heading a whole division, so you...",I see.,But there'll be perhaps 30 people under you so...,Good to know.,Chandler,The Interviewer,Chandler,The Interviewer,Chandler,1
...,...,...,...,...,...,...,...,...,...,...,...
7676,"Hold it, hold it.",I gotta side with Chandler on this one.,"When I first moved to the city, I went out a c...",It made me nuts.,You or me?,Joey,Joey,Joey,Joey,Chandler,0
7677,I gotta side with Chandler on this one.,"When I first moved to the city, I went out a c...",It made me nuts.,You or me?,"I got it. Uh, Joey, women don't have Adam's ap...",Joey,Joey,Joey,Chandler,Ross,0
7678,"When I first moved to the city, I went out a c...",It made me nuts.,You or me?,"I got it. Uh, Joey, women don't have Adam's ap...","You guys are messing with me, right?",Joey,Joey,Chandler,Ross,Joey,4
7679,It made me nuts.,You or me?,"I got it. Uh, Joey, women don't have Adam's ap...","You guys are messing with me, right?",Yeah.,Joey,Chandler,Ross,Joey,All,0


In [16]:
# Write each split's frame to "<split>.csv" in the working directory, omitting the index.
for i, split_frame in preds.items():
    split_frame.to_csv(f"{i}.csv", index=False)