In [1]:
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel

class MyDataset(Dataset):
    '''
    Pre-tokenized dataset of speeches with speaker metadata and a class label.

    Every sample is tokenized once in the constructor. A sample is kept only if
    the tokenization of its `texts` entry is at most `max_length` tokens long;
    the stored input ids and attention mask come from its `texts_en` entry,
    padded and truncated to exactly `max_length` tokens.

    Each item is the tuple
    (id, speaker, sex, text, text_en, input_ids, attention_mask, label),
    where input_ids and attention_mask are 1-D tensors on `self.device` and
    label is a scalar long tensor that is returned as stored (not moved).
    '''
    def __init__(self, 
                ids: List[str], 
                speakers: List[str], 
                sexes: List[str], 
                texts: List[str], 
                texts_en: List[str], 
                labels: List[bool],
                device: torch.device = torch.device('cpu'),
                model_name: str = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                max_length: int = 512
        ):
        '''
        Tokenizes and stores the samples.

        Parameters:
            ids, speakers, sexes, texts, texts_en, labels:
                Parallel lists describing the samples; all must have the same length.
            device: torch.device
                The device that tensors returned by __getitem__ are moved to.
            model_name: str
                Name of the pretrained tokenizer passed to AutoTokenizer.from_pretrained.
            max_length: int
                Length filter applied to `texts` and the fixed sequence length
                that `texts_en` is padded / truncated to.
        '''
        assert len(ids) == len(speakers) == len(sexes) == len(texts) == len(texts_en) == len(labels), \
            'All input lists must have the same length.'
        self.ids = []
        self.speakers = []
        self.sexes = []
        self.texts = []
        self.texts_en = []
        self.embeddings = []
        self.attention_masks = []
        self.labels = []
        self.device = device
        # Remembered so that __getitem__ slices to the configured length rather than a constant.
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Padding needs a pad token; tokenizers without one reuse their EOS token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        for i in range(len(ids)):
            # Only the token count matters here, so skip padding and tensor conversion.
            # NOTE(review): the filter looks at `texts` while the stored ids come from
            # `texts_en` - presumably intentional to keep the same subset across languages; confirm.
            n_tokens = len(self.tokenizer(texts[i], add_special_tokens=True)['input_ids'])
            if n_tokens > max_length:
                continue

            # truncation=True guarantees exactly max_length tokens, so every stored sample
            # has the same shape and an over-long text still ends with its special token.
            inputs = self.tokenizer(texts_en[i], add_special_tokens=True, return_tensors='pt',
                                    padding='max_length', truncation=True, max_length=max_length)
            self.ids.append(ids[i])
            self.speakers.append(speakers[i])
            self.sexes.append(sexes[i])
            self.texts.append(texts[i])
            self.texts_en.append(texts_en[i])
            # Storage layout is kept as before (ids 1-D, mask with its leading batch axis)
            # so that previously pickled datasets stay readable by __getitem__.
            self.embeddings.append(inputs['input_ids'][0])
            self.attention_masks.append(inputs['attention_mask'])
            self.labels.append(torch.tensor((labels[i]), dtype=torch.long))
                
        print(f'Loaded {len(self.ids)}/{len(ids)} samples.')

    def __getitem__(self, index):
        '''
        Returns (id, speaker, sex, text, text_en, input_ids, attention_mask, label)
        for the sample at `index`.
        '''
        # Instances unpickled from files written before `max_length` was stored have no
        # such attribute; fall back to the 512 that used to be hard-coded here.
        limit = getattr(self, 'max_length', 512)
        input_ids = self.embeddings[index][:limit].to(self.device)
        # Masks are stored with a leading batch axis of size 1; drop it before slicing.
        attention_mask = self.attention_masks[index][0][:limit].to(self.device)
        return self.ids[index], self.speakers[index], self.sexes[index], self.texts[index], \
                self.texts_en[index], input_ids, attention_mask, self.labels[index]
            
    def __len__(self):
        '''
        Returns the number of samples that passed the length filter.
        '''
        return len(self.ids)

    def set_device(self, device: torch.device):
        '''
        Sets the device to the given device.
        '''
        self.device = device

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import types
import sys
from pathlib import Path

# Directory holding the serialized datasets; change this one line to run on another machine.
DATA_DIR = Path('D:/fer/9.sem/OPJ/data/torch/orientation')

# The .pt files contain whole pickled dataset objects rather than plain tensors, so the
# full unpickler is required: weights_only=False is passed explicitly instead of relying
# on torch.load's version-dependent default.
# SECURITY: unpickling can execute arbitrary code - only load files from a trusted source.
dataset_train = torch.load(DATA_DIR / 'train_dataset_all.pt', weights_only=False)
dataset_valid = torch.load(DATA_DIR / 'val_dataset_all.pt', weights_only=False)


  dataset_train = torch.load('D:/fer/9.sem/OPJ/data/torch/orientation/train_dataset_all.pt')
  dataset_valid = torch.load('D:/fer/9.sem/OPJ/data/torch/orientation/val_dataset_all.pt')


In [4]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix

def evaluate(dataset: Dataset, model: PreTrainedModel, device: torch.device = torch.device('cpu'), plot: bool = False,
             batch_size: int = 1):
    '''
    Evaluates the model on the given dataset.

    Prints the accuracy and the confusion matrix of the argmax predictions.
    The model is moved to `device` and left in eval mode.
    
    Parameters:
        dataset: Dataset
            The dataset to evaluate on. Each item must be a tuple whose last three
            elements are (input_ids, attention_mask, label).
        model: PreTrainedModel
            The model to evaluate.
        device: torch.device
            The device to use.
        plot: bool
            If True, additionally draws the confusion matrix as a heatmap.
        batch_size: int
            Number of samples per forward pass. Defaults to 1.
    '''
    model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    correct_labels = []
    model_predictions = []
    with torch.no_grad():
        for batch in loader:
            # Only the last three fields of each item are needed here.
            *_, embedding, attention_mask, label = batch
            embedding = embedding.to(device)
            attention_mask = attention_mask.to(device)
            # Drop a stray singleton axis only when the mask really is 3-D; an unconditional
            # squeeze(1) would remove the sequence axis of a length-1 sequence.
            if attention_mask.dim() == 3:
                attention_mask = attention_mask.squeeze(1)
            # Labels are not passed to the model: only the logits are used, so there is
            # no reason to compute a loss during evaluation.
            model_output = model(input_ids=embedding, attention_mask=attention_mask)
            logits = model_output.logits
            predictions = torch.argmax(logits, dim=1)
            correct_labels.extend(label.cpu().numpy())
            model_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(correct_labels, model_predictions)
    matrix = confusion_matrix(correct_labels, model_predictions)
    print(f'Accuracy: {accuracy}')
    print(f'Confusion matrix:\n{matrix}')

    if plot:
        fig, ax = plt.subplots()
        image = ax.imshow(matrix, cmap='Blues')
        fig.colorbar(image, ax=ax)
        ax.set_xticks(range(matrix.shape[1]))
        ax.set_yticks(range(matrix.shape[0]))
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_title(f'Confusion matrix (accuracy: {accuracy:.3f})')
        # Write the raw count into every cell so the figure stands on its own.
        for (row, col), count in np.ndenumerate(matrix):
            ax.text(col, row, str(count), ha='center', va='center')
        plt.show()


def train(dataset_train: Dataset, dataset_val: Dataset, model: PreTrainedModel, optimizer_type: type = torch.optim.Adam, 
        batch_size: int = 8, epochs: int = 5, device: torch.device = torch.device('cpu'), lr: float = 1e-4, 
        gamma: Union[float,None] = None, loss_fn = torch.nn.CrossEntropyLoss(), save_path: Union[str,None] = None):
    '''
    Trains the model on the given dataset.

    After every epoch the model is evaluated on the validation dataset and, if
    `gamma` is given, the learning rate is decayed. When at least one epoch ran,
    the whole model object is saved with torch.save once training has finished.

    Parameters:
        dataset_train: Dataset
            The training dataset. Each item must be a tuple whose last three
            elements are (input_ids, attention_mask, label).
        dataset_val: Dataset
            The validation dataset.
        model: PreTrainedModel
            The model to train.
        optimizer_type: type
            The optimizer type to use.
        batch_size: int
            The batch size.
        epochs: int
            The number of epochs.
        device: torch.device
            The device to use.
        lr: float
            The learning rate.
        gamma: Union[float,None]
            The gamma parameter for the scheduler.
        loss_fn:
            Callable applied as loss_fn(logits, labels) to obtain the training loss.
        save_path: Union[str,None]
            Where to save the trained model. Defaults to
            'roberta_base_en_<index of last epoch>.pt'.

    Returns:
        The trained model.

    Raises:
        ValueError: If the training dataset yields no batches.
    '''
    model.to(device)
    optimizer = optimizer_type(model.parameters(), lr=lr)

    scheduler = None
    if gamma is not None:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
    
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    n_batches = len(train_loader)
    if n_batches == 0:
        raise ValueError('dataset_train is empty; nothing to train on.')

    # Log roughly 20 times per epoch; never 0, which would make the modulo below fail.
    log_rate = max(1, n_batches // 20)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for ind, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Only the last three fields of each item are needed here.
            *_, embedding, attention_mask, label = batch
            embedding = embedding.to(device)
            attention_mask = attention_mask.to(device)
            # Drop a stray singleton axis only when the mask really is 3-D; an unconditional
            # squeeze(1) would remove the sequence axis of a length-1 sequence.
            if attention_mask.dim() == 3:
                attention_mask = attention_mask.squeeze(1)
            label = label.to(device)
            model_output = model(input_ids=embedding, labels=label, attention_mask=attention_mask)
            
            ##TODO: change this if you want to use a different loss function
            ## or the model that outputs logits
            #loss = model_output.loss
            logits = model_output.logits
            loss = loss_fn(logits, label)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

            if ind % log_rate == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {ind+1}/{n_batches}, Batch loss: {loss.item()}, Average epoch loss: {epoch_loss/(ind+1)}')
                
        print(f'Epoch {epoch+1}/{epochs}, Average epoch loss: {epoch_loss/n_batches}')

        evaluate(dataset_val, model, device=device)

        if scheduler is not None:
            scheduler.step()
            
    # Nothing was trained when epochs <= 0, so there is nothing worth saving
    # (and no epoch index to put into the default file name).
    if epochs > 0:
        if save_path is None:
            save_path = f'roberta_base_en_{epochs - 1}.pt'
        torch.save(model, save_path)
    return model

In [5]:
# Pretrained checkpoint shared by the tokenizer and the two-class classifier.
# NOTE(review): MyDataset defaults to a DistilBERT tokenizer - confirm that the loaded
# datasets were tokenized with a vocabulary compatible with this checkpoint.
ROBERTA_CHECKPOINT = "roberta-base"

tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = RobertaForSequenceClassification.from_pretrained(
    ROBERTA_CHECKPOINT,
    num_labels=2,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Baseline: score the model on the validation set before any fine-tuning.
baseline_device = torch.device('cuda:0')
evaluate(dataset=dataset_valid, model=model, device=baseline_device)

Accuracy: 0.422029702970297
Confusion matrix:
[[1705    0]
 [2335    0]]


In [7]:
# Hyperparameters of the fine-tuning run, gathered in one place.
fine_tune_config = dict(
    optimizer_type=torch.optim.Adam,
    batch_size=16,
    epochs=4,
    lr=1e-5,
    gamma=0.75,
    device=torch.device('cuda:0'),
)

# Fine-tune on the training split, evaluating on the validation split after each epoch.
model = train(
    dataset_train=dataset_train,
    dataset_val=dataset_valid,
    model=model,
    **fine_tune_config,
)

Epoch 1/4, Batch 1/2022, Batch loss: 0.7236325740814209, Average epoch loss: 0.7236325740814209
Epoch 1/4, Batch 102/2022, Batch loss: 0.7243245840072632, Average epoch loss: 0.680482370596306
Epoch 1/4, Batch 203/2022, Batch loss: 0.7947362065315247, Average epoch loss: 0.6612705074507614
Epoch 1/4, Batch 304/2022, Batch loss: 0.6777669191360474, Average epoch loss: 0.6493584207798305
Epoch 1/4, Batch 405/2022, Batch loss: 0.6903229355812073, Average epoch loss: 0.642678924990289
Epoch 1/4, Batch 506/2022, Batch loss: 0.7135332822799683, Average epoch loss: 0.6314634826814705
Epoch 1/4, Batch 607/2022, Batch loss: 0.6023502945899963, Average epoch loss: 0.6222605018383981
Epoch 1/4, Batch 708/2022, Batch loss: 0.5136598944664001, Average epoch loss: 0.6164784534166088
Epoch 1/4, Batch 809/2022, Batch loss: 0.5534195899963379, Average epoch loss: 0.6135780244734143
Epoch 1/4, Batch 910/2022, Batch loss: 0.5406137108802795, Average epoch loss: 0.6092020619374056
Epoch 1/4, Batch 1011/20

In [8]:
# Persist the fine-tuned model object as a whole (not just its state dict).
torch.save(obj=model, f='roberta_base_en.pt')