In [1]:
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel

class MyDataset(Dataset):
    '''
    In-memory dataset pairing per-sample metadata with the tokenised English text.

    Every kept sample stores its id, speaker, sex, original text and English
    text together with the token ids / attention mask of the *English* text and
    an integer label tensor.  Samples whose *original* text needs more than
    ``max_length`` tokens are dropped while the dataset is being built.
    '''

    def __init__(self, 
                ids: List[str], 
                speakers: List[str], 
                sexes: List[str], 
                texts: List[str], 
                texts_en: List[str], 
                labels: List[bool],
                device: torch.device = torch.device('cpu'),
                model_name: str = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                max_length: int = 512
        ):
        '''
        Tokenises the samples and keeps those that pass the length filter.

        Parameters:
            ids, speakers, sexes, texts, texts_en, labels:
                Parallel lists (one entry per sample); must have equal length.
            device: torch.device
                Device the tensors are moved to when an item is fetched.
            model_name: str
                Name of the pretrained tokenizer to load.
            max_length: int
                Token budget: used both for the length filter on the original
                text and as the padded/truncated length of the English text.
        '''
        assert len(ids) == len(speakers) == len(sexes) == len(texts) == len(texts_en) == len(labels)
        self.ids = []
        self.speakers = []
        self.sexes = []
        self.texts = []
        self.texts_en = []
        self.embeddings = []
        self.attention_masks = []
        self.labels = []
        self.device = device
        # Remembered so __getitem__ can honour it instead of a hard-coded 512.
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Some tokenizers ship without a pad token; padding below needs one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        for i in range(len(ids)):
            # Length filter on the original text: count tokens without padding
            # (no need to pad or build a tensor just to measure a length).
            n_tokens = len(self.tokenizer(texts[i], add_special_tokens=True)['input_ids'])
            if n_tokens > max_length:
                continue

            # truncation=True guarantees exactly `max_length` tokens even when
            # the English text is longer than the original one, so every item
            # has the same shape and keeps its closing special token.
            inputs = self.tokenizer(texts_en[i], add_special_tokens=True, return_tensors='pt',
                                    padding='max_length', truncation=True, max_length=max_length)
            self.ids.append(ids[i])
            self.speakers.append(speakers[i])
            self.sexes.append(sexes[i])
            self.texts.append(texts[i])
            self.texts_en.append(texts_en[i])
            self.embeddings.append(inputs['input_ids'][0])
            # Stored with its leading batch dimension (1, L); __getitem__ strips it.
            self.attention_masks.append(inputs['attention_mask'])
            self.labels.append(torch.tensor((labels[i]), dtype=torch.long))
                
        print(f'Loaded {len(self.ids)}/{len(ids)} samples.')

    def __getitem__(self, index):
        '''
        Returns the tuple
        (id, speaker, sex, text, text_en, input_ids, attention_mask, label)
        for the sample at ``index``; input_ids and attention_mask are moved to
        ``self.device``.
        '''
        # Instances pickled before `max_length` was stored lack the attribute;
        # fall back to the 512 limit they were sliced with previously.
        limit = getattr(self, 'max_length', 512)
        token_ids = self.embeddings[index][:limit].to(self.device)
        mask = self.attention_masks[index][0][:limit].to(self.device)
        return (self.ids[index], self.speakers[index], self.sexes[index], self.texts[index],
                self.texts_en[index], token_ids, mask, self.labels[index])
            
    def __len__(self):
        '''
        Number of samples that passed the length filter.
        '''
        return len(self.ids)

    def set_device(self, device: torch.device):
        '''
        Sets the device to the given device.
        '''
        self.device = device

In [None]:
import types

# SECURITY: these .pt files are unpickled as whole Python objects (presumably
# MyDataset instances -- confirm), which can execute arbitrary code; only load
# files you produced yourself.  `weights_only=False` is spelled out because
# PyTorch 2.6 switched the default to True, which rejects custom classes.
# Each split is loaded exactly once (the training file used to be read twice).
dataset_train = torch.load('/kaggle/input/orientation/train_dataset_all.pt', weights_only=False)
dataset_valid = torch.load('/kaggle/input/orientation/val_dataset_all.pt', weights_only=False)


In [3]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertForSequenceClassification
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix

def evaluate(dataset: Dataset, model: PreTrainedModel, device: torch.device = torch.device('cpu'), plot: bool = False,
             batch_size: int = 1):
    '''
    Evaluates the model on the given dataset and prints the accuracy and the
    confusion matrix. The model is moved to `device` and left in eval mode.
    
    Parameters:
        dataset: Dataset
            The dataset to evaluate on. Each item must end with
            (input_ids, attention_mask, label).
        model: PreTrainedModel
            The model to evaluate.
        device: torch.device
            The device to use.
        plot: bool
            If True, additionally draws the confusion matrix with matplotlib.
        batch_size: int
            Number of samples per forward pass (default 1).
    '''
    model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    correct_labels = []
    model_predictions = []
    with torch.no_grad():
        for batch in loader:
            # Leading fields are metadata strings that scoring does not need.
            *_, embedding, attention_mask, label = batch
            embedding = embedding.to(device)
            attention_mask = attention_mask.to(device).squeeze(1)
            # No `labels=` here: only the logits are used, so there is no point
            # in making the model compute a loss that is thrown away.
            logits = model(input_ids=embedding, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=1)
            correct_labels.extend(label.cpu().numpy())
            model_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(correct_labels, model_predictions)
    matrix = confusion_matrix(correct_labels, model_predictions)
    print(f'Accuracy: {accuracy}')
    print(f'Confusion matrix:\n{matrix}')

    if plot:
        # Rows are true classes, columns are predicted classes (sklearn layout).
        fig, ax = plt.subplots()
        image = ax.imshow(matrix, cmap='Blues')
        fig.colorbar(image, ax=ax)
        ax.set_title('Confusion matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_xticks(list(range(matrix.shape[1])))
        ax.set_yticks(list(range(matrix.shape[0])))
        for (row, col), count in np.ndenumerate(matrix):
            ax.text(col, row, str(count), ha='center', va='center')
        plt.show()


def train(dataset_train: Dataset, dataset_val: Dataset, model: PreTrainedModel, optimizer_type: type = torch.optim.Adam, 
        batch_size: int = 8, epochs: int = 5, device: torch.device = torch.device('cpu'), lr: float = 1e-4, 
        gamma: Union[float,None] = None, loss_fn = torch.nn.CrossEntropyLoss()):
    '''
    Trains the model on the given dataset, evaluating on the validation set
    after every epoch, and saves the whole model once training has finished.

    Parameters:
        dataset_train: Dataset
            The training dataset. Each item must end with
            (input_ids, attention_mask, label).
        dataset_val: Dataset
            The validation dataset.
        model: PreTrainedModel
            The model to train.
        optimizer_type: type
            The optimizer type to use.
        batch_size: int
            The batch size.
        epochs: int
            The number of epochs.
        device: torch.device
            The device to use.
        lr: float
            The learning rate.
        gamma: Union[float,None]
            The gamma parameter for the scheduler. If None, the learning rate
            stays constant.
        loss_fn:
            Callable taking (logits, labels) and returning the loss tensor.

    Returns:
        The trained model (the same object that was passed in).
    '''
    model.to(device)
    optimizer = optimizer_type(model.parameters(), lr=lr)

    scheduler = None
    if gamma is not None:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
    
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    n_batches = len(train_loader)

    # Log roughly 20 times per epoch. Clamp to 1: with fewer than 20 batches the
    # integer division yields 0 and `ind % 0` would raise ZeroDivisionError.
    log_rate = max(1, n_batches // 20)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for ind, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Leading fields are metadata strings that training does not need.
            *_, embedding, attention_mask, label = batch
            embedding = embedding.to(device)
            attention_mask = attention_mask.to(device).squeeze(1)
            label = label.to(device)

            # The loss comes from `loss_fn`, not from the model's built-in loss,
            # so `labels=` is not passed to the forward call.
            logits = model(input_ids=embedding, attention_mask=attention_mask).logits
            loss = loss_fn(logits, label)
            batch_loss = loss.item()
            epoch_loss += batch_loss
            loss.backward()
            optimizer.step()

            if ind % log_rate == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {ind+1}/{n_batches}, Batch loss: {batch_loss}, Average epoch loss: {epoch_loss/(ind+1)}')
                
        # max(1, ...) keeps an empty training set from dividing by zero.
        print(f'Epoch {epoch+1}/{epochs}, Average epoch loss: {epoch_loss/max(1, n_batches)}')

        evaluate(dataset_val, model, device=device)

        if scheduler is not None:
            scheduler.step()
            
    # The file name carries the index of the last epoch; with epochs <= 0
    # nothing was trained (and the loop variable would be unbound), so skip.
    if epochs > 0:
        torch.save(model, f'distilbert_cased_en_{epochs - 1}.pt')
    return model

In [4]:
# Tokenizer and 2-class classifier both come from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-cased",
)
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-cased",
    num_labels=2,
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Validation score of the model as it stands before the training cell below.
evaluate(
    dataset=dataset_valid,
    model=model,
    device=torch.device('cuda:0'),
)

In [5]:
# Fine-tune with Adam; gamma=0.75 makes the exponential scheduler multiply the
# learning rate by 0.75 after every epoch.
model = train(
    dataset_train=dataset_train,
    dataset_val=dataset_valid,
    model=model,
    optimizer_type=torch.optim.Adam,
    batch_size=16,
    epochs=4,
    device=torch.device('cuda:0'),
    lr=1e-5,
    gamma=0.75,
)

Epoch 1/4, Batch 1/1964, Batch loss: 0.693416178226471, Average epoch loss: 0.693416178226471
Epoch 1/4, Batch 99/1964, Batch loss: 0.7335326075553894, Average epoch loss: 0.6573833901472766
Epoch 1/4, Batch 197/1964, Batch loss: 0.5540610551834106, Average epoch loss: 0.6485447897221231
Epoch 1/4, Batch 295/1964, Batch loss: 0.6741949319839478, Average epoch loss: 0.6396701667268397
Epoch 1/4, Batch 393/1964, Batch loss: 0.6398229598999023, Average epoch loss: 0.6298951819804485
Epoch 1/4, Batch 491/1964, Batch loss: 0.6663534641265869, Average epoch loss: 0.6225247562174399
Epoch 1/4, Batch 589/1964, Batch loss: 0.718400776386261, Average epoch loss: 0.6190030744302455
Epoch 1/4, Batch 687/1964, Batch loss: 0.8486047983169556, Average epoch loss: 0.6136627678354061
Epoch 1/4, Batch 785/1964, Batch loss: 0.6214850544929504, Average epoch loss: 0.6117804672308029
Epoch 1/4, Batch 883/1964, Batch loss: 0.4210622012615204, Average epoch loss: 0.605746772422931
Epoch 1/4, Batch 981/1964, 

In [7]:
# Persist the whole model object (architecture + weights) as a single pickle.
torch.save(obj=model, f='distilbert_cased_en_novi.pt')

In [8]:
# Score the current model on the validation split.
evaluate(
    dataset=dataset_valid,
    model=model,
    device=torch.device('cuda:0'),
)

Accuracy: 0.7368689444161143
Confusion matrix:
[[1054  527]
 [ 505 1836]]


In [None]:
# NOTE(review): this reloads the *training* split, yet the following cell
# evaluates `dataset_test`, which no visible cell assigns -- presumably this
# cell was meant to load the test split; confirm the correct file path.
# SECURITY: torch.load unpickles arbitrary objects; only load trusted files.
# `weights_only=False` is explicit because PyTorch 2.6 changed the default to
# True, which refuses to unpickle custom classes.
dataset_train = torch.load('/kaggle/input/orientation/train_dataset_all.pt', weights_only=False)

In [None]:
# NOTE(review): `dataset_test` is not assigned in any cell visible in this
# notebook -- the test split has to be loaded before this cell can run.
evaluate(
    dataset=dataset_test,
    model=model,
    device=torch.device('cuda:0'),
)