In [4]:
import logging
import os
import re
from datetime import datetime
from pprint import pprint

import numpy as np
import pandas as pd
from tqdm import tqdm
import transformers
import matplotlib.pyplot as plt 

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, BCELoss, BCEWithLogitsLoss
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchmetrics.classification import BinaryF1Score, BinaryPrecision, BinaryRecall, BinaryAccuracy
#from torchsummary import summary

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold, StratifiedKFold


In [7]:
# Notebook-wide configuration.
# CUDA names the preferred GPU; DEVICE falls back to the CPU when CUDA is unavailable.
# NOTE(review): 'cuda:2' is only valid on a machine with at least three visible GPUs — confirm.
CUDA = 'cuda:2'
DEVICE = torch.device(CUDA if torch.cuda.is_available() else 'cpu') 
# Directory (relative to the notebook) that holds the annotation CSV files.
path_to_data = '../data/'
#model_version = 'sentence-transformers/all-distilroberta-v1'
#model_version_mini = 'sentence-transformers/all-MiniLM-L6-v2'
# Seed shared by the numpy / torch RNGs and the K-fold splitters.
SEED = 1234

In [8]:
# Seed numpy and torch (CPU, current GPU, and all GPUs) with the same value.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## load dataset

In [9]:
def _load_csv(file_name):
    """Read one CSV file from ``path_to_data`` into a DataFrame."""
    with open(path_to_data+file_name, 'r') as handle:
        return pd.read_csv(handle)


# Full annotation table plus its train and test partitions.
all_df = _load_csv('all_annotators.csv')
train_df = _load_csv('train_all_annotators.csv')
test_df = _load_csv('test_all_annotators.csv')

In [10]:
os.path.join( "data", 'GHC',
                     'hate' + "_multi.csv")

'data/GHC/hate_multi.csv'

## create pytorch Dataset class

In [21]:
# dataframe needs to have Columns like this: [Text, Annotator-1_label, ..., Annotator-N_label]
class MultiTaskDataset(Dataset):
    """Torch dataset yielding one tokenized text with its per-annotator labels.

    The frame is first reduced with ``filter_df_min_annotation``; every
    remaining column whose name consists only of digits is treated as the
    label column of one annotator.

    Args:
        annotations_df: frame with a ``text`` column plus digit-named label columns.
        min_number_of_annotations: threshold forwarded to ``filter_df_min_annotation``.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_length: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, annotations_df, min_number_of_annotations, tokenizer, max_length):
        super().__init__()
        annotations_df = filter_df_min_annotation(annotations_df, min_number_of_annotations)
        self.annotator_ids = [x for x in annotations_df.columns if re.fullmatch(r'[0-9]+', x)]
        self.num_annotators = len(self.annotator_ids)

        self.texts = annotations_df.text
        self.labels = annotations_df[self.annotator_ids]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and label vector at POSITION ``idx``.

        Returns:
            dict with long tensors ``ids``, ``masks``, ``token_type_ids`` and
            ``labels`` (one entry per annotator column).
        """
        # Positional lookup: samplers hand out positions in range(len(self)),
        # which match the index labels only for a default RangeIndex.
        text = self.texts.iloc[idx]
        labels = list(self.labels.iloc[idx, :])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            # `pad_to_max_length` is deprecated; request the same padding explicitly
            # and make the (previously implicit) truncation explicit as well.
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'masks': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.int64)
            }


def filter_df_min_annotation(df,min_annotations):
    """Drop annotator columns that carry fewer than ``min_annotations`` labels.

    Annotator columns are the ones whose name consists only of digits; a
    value of -1 in such a column means "not annotated". All other columns
    are kept and placed first in the result.
    """
    label_cols, other_cols = [], []
    for column in df.columns:
        bucket = label_cols if re.fullmatch(r'[0-9]+', column) else other_cols
        bucket.append(column)

    # Turn the -1 placeholder into NaN so that count() sees real labels only.
    label_counts = df[label_cols].replace(-1, float('nan')).count(axis=0)
    kept = [column for column, n_labels in label_counts.items() if n_labels >= min_annotations]
    return df[other_cols + kept]

In [22]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset = MultiTaskDataset(all_df, 1, tokenizer, max_length=100)

In [23]:
dataset.num_annotators

18

In [24]:
dataloader = DataLoader(dataset, batch_size=16)

In [25]:
for i, data_dict in enumerate(dataloader):
    pprint(data_dict)
    break

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([[  101, 24151,  4665,  ...,     0,     0,     0],
        [  101,  3738,  2024,  ...,     0,     0,     0],
        [  101,  3539,  2099,  ...,     0,     0,     0],
        ...,
        [  101,  1022,  2442,  ...,     0,     0,     0],
        [  101,  1022,  1011,  ...,     0,     0,     0],
        [  101,  1030, 19785,  ...,     0,     0,     0]]),
 'labels': tensor([[-1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,  0,  0, -1, -1, -1, -1, -1],
        [ 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1,  0, -1],
        [-1, -1,  0, -1, -1,  0, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,  1,  0, -1, -1, -1, -1, -1],
        [-1, -1, -1,  0, -1, -1,  0, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1, -1],
        [-1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,  0,  0, -1, -1, -1, -1, -1],
        [-1, -1,  0, -1, -1,  0, -1, -1, -1,



## create Model

In [26]:
class MultiTaskBERT(nn.Module):
    """BERT encoder with one 2-way linear classification head per annotator.

    The heads are registered as attributes ``fc0`` ... ``fc{num_annotators-1}``;
    each one reads the dropout-regularised second output of ``BertModel``.

    Args:
        num_annotators: number of heads to create.
        freeze_bert: if True, freeze the encoder except for its last
            ``train_last_bert_k`` layers.
        train_last_bert_k: number of trailing encoder layers left trainable
            when ``freeze_bert`` is set.
        bert_dim: input width of every head.
    """

    def __init__(self, num_annotators, freeze_bert=False, train_last_bert_k=0, bert_dim=768):
        super().__init__()
        self.freeze_bert = freeze_bert
        self.num_annotators = num_annotators
        self.train_last_bert_k = train_last_bert_k
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")

        # Only needed for freezing: a view on all children of the encoder except the last one.
        self.bert_modules = nn.ModuleList(self.bert_model.children())[:-1]
        self.bert_modules = nn.Sequential(*self.bert_modules)
        self.base_number_of_layers = self._get_layer_count(self.bert_modules)

        self.dropout_layer = nn.Dropout(p=0.1)

        if freeze_bert:
            self.freeze_feature_layers_until_last_k(train_last_bert_k)

        for i in range(self.num_annotators):
            setattr(self, f"fc{i}", nn.Linear(bert_dim, 2))

        # initialize all fc layers to xavier
        # Skip it for now. Is not in the original paper.
        #for m in self.modules():
        #    if isinstance(m, nn.Linear):
        #        torch.nn.init.xavier_normal_(m.weight, gain = 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder once and apply every annotator head.

        Returns:
            dict mapping ``"fc{i}"`` to the output of head ``i``.
        """
        _, bert_out = self.bert_model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        drop_out = self.dropout_layer(bert_out)

        clf_outputs = {}
        for i in range(self.num_annotators):
            clf_outputs[f"fc{i}"] = getattr(self, f"fc{i}")(drop_out)

        return clf_outputs

    def _set_freeze_(self, status, k=0): #BERT has 12 layers
        """Set ``requires_grad = status`` on all encoder parameters except those of the last ``k`` layers.

        Parameters whose name carries no ``layer.N.`` part count as layer 0.
        """
        last_layer_number = self.base_number_of_layers - k - 1
        for n, p in self.bert_modules.named_parameters():
            if self._get_layer_number(n) <= last_layer_number:
                p.requires_grad = status

    def freeze_feature_layers(self):
        """Freeze every parameter reachable through ``bert_modules``."""
        self._set_freeze_(False)

    def unfreeze_feature_layers(self):
        """Make every parameter reachable through ``bert_modules`` trainable."""
        self._set_freeze_(True)

    def unfreeze_all_layers(self):
        """Make the encoder parameters and all annotator heads trainable."""
        self._set_freeze_(True)
        for i in range(self.num_annotators):
            # Assigning `layer.requires_grad = True` would only create a plain attribute
            # on the Module; requires_grad_() actually flips the flag on its parameters.
            getattr(self, f"fc{i}").requires_grad_(True)

    def freeze_feature_layers_until_last_k(self, k):
        """Freeze the encoder but keep its last ``k`` layers trainable."""
        self.unfreeze_feature_layers()
        self._set_freeze_(False, k)

    def size(self):
        """Total number of parameter elements in the model."""
        return sum(p.numel() for p in self.parameters())

    def _get_layer_count(self, modules):
        """Return the highest layer number found in the parameter names, plus one."""
        numbers = -1
        for n, p in modules.named_parameters():
            numbers = max(numbers, self._get_layer_number(n))
        return numbers + 1

    def _get_layer_number(self, name):
        """Extract ``N`` from a ``...layer.N....`` parameter name; 0 when absent."""
        # Raw string avoids invalid-escape warnings; `\d+` never matches an empty number.
        m = re.search(r'layer\.(\d+)\.', name)
        return int(m.group(1)) if m else 0

In [27]:
class MultiTaskLossWrapper(nn.Module):
    """Per-sample loss: sum of weighted BCE-with-logits terms over the annotators.

    Args:
        annotator_weights: mapping annotator id -> per-class weight array;
            its iteration order must match the order of the model heads.
    """

    def __init__(self, annotator_weights):
        super().__init__()
        self.annotator_weights = annotator_weights.values()

    def forward(self, preds, true_vals):
        """Compute the loss of ONE sample.

        Args:
            preds: tensor of logits, one row of two values per annotator.
            true_vals: one label per annotator; -1 marks "not annotated".

        Returns:
            0-dim tensor; zero when no annotator labelled the sample.
        """
        # Start from a tensor (not the int 0) so callers can always torch.stack() the result.
        total = preds.new_zeros(())
        for pred, true_val, weight in zip(preds, true_vals, self.annotator_weights):
            if true_val == -1:
                continue
            target = F.one_hot(true_val.to(torch.int64), num_classes=2).to(device=pred.device, dtype=pred.dtype)
            # Class weights arrive as float64 numpy arrays; align dtype and device with the logits.
            class_weight = torch.as_tensor(weight, dtype=pred.dtype, device=pred.device)
            total = total + F.binary_cross_entropy_with_logits(
                input=pred,
                target=target,
                weight=class_weight)

        return total

In [28]:
def calc_annotator_class_weights(dataframe):
    """Compute sklearn "balanced" class weights separately for every annotator.

    Annotator columns are those whose name consists only of digits; entries
    equal to -1 (not annotated) are ignored.

    Returns:
        dict mapping the annotator column name to the weight array returned
        by ``compute_class_weight`` (one weight per class the annotator used).
    """
    annotator_columns = [col for col in dataframe.columns if re.fullmatch(r'[0-9]+', col)]

    def _balanced_weights(column):
        observed = [value for value in dataframe[str(column)].values if value != -1]
        return compute_class_weight(
            class_weight="balanced", classes=np.unique(observed), y=observed)

    return {column: _balanced_weights(column) for column in annotator_columns}

In [29]:
def filter_df_min_annotation(df,min_annotations):
    """Keep all non-annotator columns plus the annotators with enough labels.

    A column counts as an annotator when its name is purely numeric; -1 in
    such a column is the "no annotation" placeholder and is not counted.
    """
    annotator_cols = [name for name in df.columns if re.fullmatch(r'[0-9]+', name)]
    meta_cols = [name for name in df.columns if name not in annotator_cols]

    # Replace the placeholder by NaN so count() only sees actual annotations.
    labels_per_annotator = df[annotator_cols].replace(-1, float('nan')).count(axis=0)

    enough = []
    for name, n_labels in labels_per_annotator.items():
        if n_labels >= min_annotations:
            enough.append(name)
    return df[meta_cols + enough]

In [30]:
def train_epoch(model, dataloader, optimizer, loss_fn, train_only_annotated, print_interval=100):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: network handed to ``_model_and_process`` together with each batch.
        dataloader: iterable of batch dicts.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable applied per sample to (prediction rows, label vector).
        train_only_annotated: accepted for interface compatibility; not used here.
        print_interval: a progress line is printed and logged whenever the
            batch index is a multiple of this value.

    Returns:
        (losses, accs): per-batch mean loss as floats, and per-batch accuracy
        (over annotated entries only) as returned by ``BinaryAccuracy``.
    """
    losses = []
    accs = []

    bin_acc = BinaryAccuracy().to(DEVICE)

    model.train()

    # Gradients are enabled explicitly so the function also works when called
    # from inside a surrounding no_grad() context.
    with torch.set_grad_enabled(True):
        for i, data_dict in enumerate(tqdm(dataloader)):
            preds, labels = _model_and_process(data_dict, model)

            # The loss is defined per sample, so it is evaluated row by row and then averaged.
            single_losses = [loss_fn(pred_vec, label_vec) for pred_vec, label_vec in zip(preds, labels)]
            loss_batch = torch.stack(single_losses).mean()
            losses.append(loss_batch.item())

            optimizer.zero_grad()
            loss_batch.backward()
            optimizer.step()

            # Predicted class per (sample, annotator) = index of the larger logit.
            preds_bin_1dim = preds.argmax(dim=2)

            # Accuracy is computed only where an annotation exists (label != -1).
            mask = labels.not_equal(-1)
            masked_1d_labels = torch.masked_select(labels, mask)
            masked_1d_preds = torch.masked_select(preds_bin_1dim, mask)

            all_acc = bin_acc(masked_1d_preds, masked_1d_labels)
            accs.append(all_acc)

            if i % print_interval == 0:
                message = f'[Training] Itteration/Batch: {i:>3}: Loss: {loss_batch:.2f} | Accuracy: {all_acc:.2f}'
                print(message)
                logging.info(message)

    return (losses, accs)

## test and debug

### run cross validation

In [21]:
batch_size = 32
num_epochs  = 3
num_splits = 5
# NOTE(review): the original call passed the annotator count (18) in the
# `min_number_of_annotations` slot; 1 mirrors the dataset built earlier in
# this notebook — confirm the intended threshold.
min_number_of_annotations = 1

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
cv_df = filter_df_min_annotation(test_df, min_number_of_annotations)
dataset = MultiTaskDataset(cv_df, min_number_of_annotations, tokenizer, max_length=100)

# The model needs exactly one head per label column the dataset emits.
num_annotators = dataset.num_annotators

# The loss wrapper requires per-annotator class weights computed on the same columns.
loss_fn = MultiTaskLossWrapper(annotator_weights=calc_annotator_class_weights(cv_df)).to(DEVICE)


In [22]:
# Shuffled, seeded (unstratified) K-fold splitter used by the cross-validation loop.
splits = KFold(n_splits=num_splits, shuffle=True, random_state=SEED)


In [48]:
# NOTE(review): 'cuda:2' addresses the THIRD visible GPU (indices start at 0).
# Confirm this is the card you are supposed to use before running the training cells.
CUDA = 'cuda:2'

# Use the requested GPU only if it actually exists; otherwise fall back to the CPU.
_requested_gpu_index = torch.device(CUDA).index
if torch.cuda.is_available() and _requested_gpu_index < torch.cuda.device_count():
    DEVICE = torch.device(CUDA)
else:
    DEVICE = torch.device('cpu')
print(f'Using device: {DEVICE}')

In [None]:
# Per-epoch metric log, accumulated over all folds.
history = {
    key: []
    for key in (
        'train_loss', 'test_loss',
        'train_f1s', 'test_f1s',
        'train_accs', 'test_accs',
        'individual_train_f1s', 'individual_test_f1s',
    )
}


def _mean_as_float(values):
    """Average a list of per-batch values and return the result as a Python float."""
    return float(torch.tensor(values).mean())


for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(dataset)))):

    # Both loaders read the same dataset; the samplers restrict them to the fold's indices.
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(val_idx)
    train_loader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
    test_loader = DataLoader(dataset, sampler=test_sampler, batch_size=batch_size)

    # Every fold starts from a freshly loaded model and a new optimizer.
    model = MultiTaskBERT(num_annotators=num_annotators, freeze_bert=False)
    model.to(DEVICE)
    optimizer = optim.Adam(model.parameters(),lr= 1e-7)

    print('#'*30, f'\t Fold {fold+1} \t', '#'*30)

    # Arguments shared by the training and the evaluation pass.
    shared_epoch_args = dict(
        model=model,
        num_annotators=num_annotators,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_only_annotated=False,
    )

    for epoch in range(num_epochs):

        print('_'*30, f'\t Running Epoch {epoch+1} of {num_epochs} \t', '_'*30)

        (tr_prec, tr_rec, tr_f1s, tr_losses, tr_all_f1s, tr_all_accs) = train_val_epoch(
            dataloader=train_loader, is_training=True, **shared_epoch_args)
        (te_prec, te_rec, te_f1s, te_losses, te_all_f1s, te_all_accs) = train_val_epoch(
            dataloader=test_loader, is_training=False, **shared_epoch_args)

        train_loss = _mean_as_float(tr_losses)
        train_f1 = _mean_as_float(tr_all_f1s)
        train_acc = _mean_as_float(tr_all_accs)
        test_loss = _mean_as_float(te_losses)
        test_f1 = _mean_as_float(te_all_f1s)
        test_acc = _mean_as_float(te_all_accs)

        print(f"\nEpoch {epoch+1}: AVG Training Loss:{train_loss} AVG Test Loss:{test_loss} AVG Training F1 {train_f1} AVG Test F1 {test_f1}")

        epoch_record = (
            ('train_loss', train_loss),
            ('test_loss', test_loss),
            ('train_f1s', train_f1),
            ('test_f1s', test_f1),
            ('train_accs', train_acc),
            ('test_accs', test_acc),
            ('individual_train_f1s', tr_f1s),
            ('individual_test_f1s', te_f1s),
        )
        for key, value in epoch_record:
            history[key].append(value)

### check if only annotated FCs get updated in loss_fn.backward() in train epoch 

Yes: with `batch_size = 1`, only three FC heads receive non-zero gradients.

In [78]:
batch_size = 16
num_epochs  = 10
num_splits = 5
print_interval = 200/batch_size
learning_rate = 5e-5
only_one_fold = True
max_length = 64
min_number_of_annotations = 3000

# This debugging run is pinned to the CPU. DEVICE is set BEFORE anything is
# moved to it, so the loss wrapper, the model and the batches all agree.
CUDA = 'cuda'
DEVICE = 'cpu'#torch.device(CUDA if torch.cuda.is_available() else 'cpu')

dataframe = filter_df_min_annotation(all_df, min_number_of_annotations)

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset = MultiTaskDataset(dataframe, min_number_of_annotations, tokenizer, max_length=max_length)

# One model head per label column the dataset emits.
num_annotators = dataset.num_annotators

weights = calc_annotator_class_weights(dataframe)
loss_fn = MultiTaskLossWrapper(annotator_weights=weights).to(DEVICE)

dataloader = DataLoader(dataset, batch_size=batch_size)

model = MultiTaskBERT(num_annotators, freeze_bert=False)
model.to(DEVICE)
# Use the learning rate declared above instead of a second, hard-coded value.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

KeyError: "None of [Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',\n       '13', '17'],\n      dtype='object')] are in the [index]"

In [None]:
# Run exactly one optimisation step on the first batch so that the cells below
# can inspect `preds`, `labels` and the gradients of the FC heads.
with torch.set_grad_enabled(True):    
        
    model.train()

    for i, data_dict in enumerate(dataloader):
        # Move every tensor of the batch to the configured device.
        token_ids = data_dict['ids'].to(DEVICE) 
        token_type_ids = data_dict['token_type_ids'].to(DEVICE) 
        masks = data_dict['masks'].to(DEVICE) 
        labels = data_dict['labels'].to(DEVICE)

        output = model(
            ids=token_ids,
            mask=masks,
            token_type_ids=token_type_ids)   

        preds = list(output.values()) # values, because output is a dict whose keys are the FC layers of the model
        preds = torch.stack(preds)
        preds = preds.transpose(0,1) # so that the shape is (n_batch, n_annotators, n_classes=2)

        # Give the labels the same dtype as the predictions.
        labels = labels.type_as(preds)
        
        
        # The loss needs to be calculated for each sample, so it is not called on the whole batch.
        single_losses = []
        for pred_vec, label_vec in zip(preds,labels):
            loss=loss_fn(pred_vec,label_vec)
            single_losses.append(loss)

        loss_batch = torch.stack(single_losses).mean()
        
        optimizer.zero_grad()
        loss_batch.backward() 
        optimizer.step()
        
        # Only the first batch is needed for the inspection.
        break
                   

In [69]:
data_dict['labels'].shape

torch.Size([16, 15])

In [70]:
preds.shape

torch.Size([16, 16, 2])

In [71]:
labels.shape

torch.Size([16, 15])

In [59]:
# get names of FC layers with non-zero gradient.
affected = []
for name, param in model.named_parameters():
    if 'weight' in name and 'fc' in name:
        if param.grad.sum() != 0:
            print(name, param.grad.sum())
            affected.append(name)

fc0.weight tensor(-0.0064)
fc1.weight tensor(-1.2619)
fc2.weight tensor(-0.7301)
fc3.weight tensor(-1.6932)
fc5.weight tensor(-0.3889)
fc6.weight tensor(-0.6615)
fc7.weight tensor(-1.4467)
fc11.weight tensor(-0.2987)
fc12.weight tensor(-5.1947)
fc13.weight tensor(-4.6475)
fc16.weight tensor(-2.4237)


### loss function

In [31]:
batch_size = 16
num_epochs  = 10
num_splits = 5
print_interval = 200/batch_size
learning_rate = 5e-5
only_one_fold = True
max_length = 64
min_number_of_annotations = 1

# This debugging run is pinned to the CPU. DEVICE is set BEFORE anything is
# moved to it, so the loss wrapper, the model and the batches all agree.
CUDA = 'cuda'
DEVICE = 'cpu'#torch.device(CUDA if torch.cuda.is_available() else 'cpu')

dataframe = filter_df_min_annotation(all_df, min_number_of_annotations)

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset = MultiTaskDataset(dataframe, min_number_of_annotations, tokenizer, max_length=max_length)

# One model head per label column the dataset emits.
num_annotators = dataset.num_annotators

weights = calc_annotator_class_weights(dataframe)
loss_fn = MultiTaskLossWrapper(annotator_weights=weights).to(DEVICE)

dataloader = DataLoader(dataset, batch_size=batch_size)

model = MultiTaskBERT(num_annotators, freeze_bert=False)
model.to(DEVICE)
# Use the learning rate declared above instead of a second, hard-coded value.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Forward pass on the first batch only: produces `preds`, `labels` and
# `loss_batch` for the loss-function experiments in the following cells.
# (The optimizer step that used to follow the `break` was unreachable and
# referenced an undefined name, so it has been removed.)
with torch.set_grad_enabled(True):

    model.train()

    for i, data_dict in enumerate(dataloader):
        token_ids = data_dict['ids'].to(DEVICE)
        token_type_ids = data_dict['token_type_ids'].to(DEVICE)
        masks = data_dict['masks'].to(DEVICE)
        labels = data_dict['labels'].to(DEVICE)

        output = model(
            ids=token_ids,
            mask=masks,
            token_type_ids=token_type_ids)

        preds = list(output.values()) # values, because output is a dict whose keys are the FC layers of the model
        preds = torch.stack(preds)
        preds = preds.transpose(0,1) # so that the shape is (n_batch, n_annotators, n_classes=2)

        labels = labels.type_as(preds)

        # The loss needs to be calculated for each sample, so it is not called on the whole batch.
        single_losses = []
        for pred_vec, label_vec in zip(preds,labels):
            loss=loss_fn(pred_vec,label_vec)
            single_losses.append(loss)

        loss_batch = torch.stack(single_losses).mean()

        break


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [34]:
preds.shape

torch.Size([16, 18, 2])

In [35]:
labels.shape

torch.Size([16, 18])

In [131]:
p = preds[:,0,:]
p

tensor([[-0.6777, -0.0925],
        [-0.4624,  0.1261],
        [-0.6270,  0.2642],
        [-0.6609,  0.2400],
        [-0.5805,  0.1754],
        [-0.3018, -0.0274],
        [-0.5309,  0.0210],
        [-0.5920,  0.1980],
        [-0.5246, -0.0557],
        [-0.4550, -0.1275],
        [-0.6951,  0.1632],
        [-0.5830,  0.2511],
        [-0.5432,  0.2115],
        [-0.5651,  0.3057],
        [-0.1779, -0.0308],
        [-0.6336, -0.1782]], grad_fn=<SliceBackward0>)

In [114]:
labels[:,1]

tensor([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  0., -1., -1.,
        -1., -1.])

In [84]:
mask = labels[:,1] != -1
mask

tensor([False, False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False])

In [357]:
test_input_0 = torch.tensor([[3,-1]]*3,dtype=torch.float)
test_input_1 = torch.tensor([[-1,3]]*3,dtype=torch.float)
test_target_0 = torch.tensor([0]*3,dtype=torch.long)
test_target_1 = torch.tensor([1]*3,dtype=torch.long)
test_target_0_one_hot = torch.tensor([[1,0]]*3,dtype=torch.float)
test_target_1_one_hot = torch.tensor([[0,1]]*3,dtype=torch.float)

weight = torch.tensor([0.1,10],dtype=torch.float)
loss_fn = torch.nn.CrossEntropyLoss(weight=weight)

In [358]:
torch.tensor([0.1,10],dtype=torch.float).shape

torch.Size([2])

In [359]:
loss_fn(input=test_input_0, 
        target=test_target_1_one_hot)

tensor(40.1815)

In [360]:
loss_fn(input=test_input_1, 
        target=test_target_0_one_hot)

tensor(0.4018)

In [361]:
loss_fn(input=test_input_0, 
        target=test_target_1)

tensor(4.0181)

In [345]:
loss_fn(input=test_input_1, 
        target=test_target_0)

tensor(4.0181)

In [159]:
l = labels[:,0]
l

tensor([-1.,  0., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  0., -1.,  0.,
        -1., -1.])

In [160]:
l_f = torch.stack([x for x in l if x != -1])
l_f

tensor([0., 0., 0.])

In [162]:
lf2 = F.one_hot(l_f.to(torch.int64),num_classes=2).float()
lf2

tensor([[1., 0.],
        [1., 0.],
        [1., 0.]])

In [148]:
p_f = torch.stack(
    [x for x,y in zip(p,l) if y != -1])
p_f

tensor([[-0.4624,  0.1261],
        [-0.5830,  0.2511],
        [-0.5651,  0.3057]], grad_fn=<StackBackward0>)

In [157]:
mask = labels[:,1] != -1
loss_fn = torch.nn.CrossEntropyLoss(weight=weight)
loss_fn(input=p_f,
        target=lf2)

tensor(0.1148, grad_fn=<DivBackward1>)

In [213]:
loss_fn = nn.CrossEntropyLoss(
                weight=torch.tensor([0.9,1.5],dtype=torch.float))
            
annos_preds = preds[:,0,:]
annos_trues = labels[:,0]

annos_preds_filtered = torch.stack(
    [pred for pred,true_val in zip(annos_preds,annos_trues) if true_val != -1])
annos_lables_filtered = torch.stack(
    [true_val for true_val in annos_trues if true_val != -1])

annos_lables_filtered_one_hot = F.one_hot(annos_lables_filtered.to(torch.int64),num_classes=2).float()


In [214]:
loss_fn(input=annos_preds_filtered,
        target=annos_lables_filtered_one_hot)


tensor(1.0336, grad_fn=<DivBackward1>)

In [None]:
F.binary_cross_entropy_with_logits(
                        input=preds[:,1,:],
                        target=F.one_hot(labels[:,1].to(torch.int64),num_classes=2).float())

In [183]:
class MultiTaskLossWrapper_old(nn.Module):
    """The loss needs to be calculated for each sample because each sample has possibly different annotators and thus different pos_sample weights

    Args:
        annotator_weights: mapping annotator id -> weight array used as ``pos_weight``;
            its iteration order must match the annotator axis of the predictions.
        sum_not_mean: reduce the per-sample losses with a sum instead of a mean.
    """
    def __init__(self, annotator_weights, sum_not_mean=False):
        super().__init__()
        self.annotator_weights = annotator_weights.values()
        self.num_annotators = len(self.annotator_weights)
        self.sum_not_mean = sum_not_mean

    def forward(self, batch_preds, batch_true_vals):
        """Return the batch loss as a 0-dim tensor.

        ``batch_preds`` is indexed as [sample, annotator, class] and
        ``batch_true_vals`` as [sample, annotator]; a label of -1 means
        "not annotated" and contributes nothing.
        """
        sample_losses = []

        # for each sample in the batch
        for pred_vec, label_vec in zip(batch_preds, batch_true_vals):
            # Accumulate in a tensor (not the int 0) so torch.stack() below also
            # works for samples without any annotation.
            sample_loss = batch_preds.new_zeros(())

            # for each annotator
            for pred, true_val, weight in zip(pred_vec, label_vec, self.annotator_weights):
                if true_val == -1:
                    continue
                target = F.one_hot(true_val.to(torch.int64), num_classes=2).to(device=pred.device, dtype=pred.dtype)
                pos_weight = torch.as_tensor(weight, dtype=pred.dtype, device=pred.device)
                sample_loss = sample_loss + F.binary_cross_entropy_with_logits(
                    input=pred,
                    target=target,
                    pos_weight=pos_weight)

            sample_losses.append(sample_loss)

        stacked = torch.stack(sample_losses)
        return stacked.sum() if self.sum_not_mean else stacked.mean()
    
    
    
class MultiTaskLossWrapper(nn.Module):
    """Batch loss built from one class-weighted cross-entropy term per annotator.

    Each annotator's term only uses the samples that annotator labelled
    (label != -1), so every annotator can have its own class weights.

    Args:
        annotator_weights: mapping annotator id -> per-class weight array;
            its iteration order must match the annotator axis of the predictions.
        sum_not_mean: reduce the per-annotator losses with a sum instead of a mean.
    """
    def __init__(self, annotator_weights, sum_not_mean=False):
        super().__init__()
        # Materialise as a list: a dict view cannot be indexed.
        self.annotator_weights = list(annotator_weights.values())
        self.num_annotators = len(self.annotator_weights)
        self.sum_not_mean = sum_not_mean

    def forward(self, batch_preds, batch_true_vals):
        """Return the batch loss as a 0-dim tensor.

        ``batch_preds`` is indexed as [sample, annotator, class] and
        ``batch_true_vals`` as [sample, annotator]. Annotators without any
        label in the batch are skipped; if nobody labelled anything the
        result is zero.
        """
        batch_true_vals = batch_true_vals.to(batch_preds.device)
        anno_losses = []

        # for each annotator
        for anno, weight in enumerate(self.annotator_weights):
            annos_preds = batch_preds[:, anno, :]
            annos_trues = batch_true_vals[:, anno]

            annotated = annos_trues != -1
            if not annotated.any():
                continue

            preds_filtered = annos_preds[annotated]
            labels_filtered = annos_trues[annotated]
            # One-hot float targets, as in the experiments in the cells above.
            targets_one_hot = F.one_hot(labels_filtered.to(torch.int64), num_classes=2).to(preds_filtered.dtype)
            class_weight = torch.as_tensor(weight, dtype=preds_filtered.dtype, device=preds_filtered.device)

            anno_losses.append(
                F.cross_entropy(input=preds_filtered, target=targets_one_hot, weight=class_weight))

        if not anno_losses:
            return batch_preds.new_zeros(())

        stacked = torch.stack(anno_losses)
        return stacked.sum() if self.sum_not_mean else stacked.mean()

In [36]:
torch.topk(preds, 1, dim=2, largest=True, sorted=True, out=None)[1].squeeze(dim=2).shape

torch.Size([16, 18])

### class weight

In [44]:
# example class weight, for annotator 1 (annotator 0 is weirdly quite balanced)
annotator_id = '1'
x = [a for a in all_df[annotator_id] if a != -1]
print('Counts:','0:', x.count(0),'   1:', x.count(1))

weights = compute_class_weight(class_weight="balanced", classes=np.unique(x), y=x)
print('Weights: ', weights)

Counts: 0: 3855    1: 129
Weights:  [ 0.51673152 15.44186047]


In [45]:
pred_0 = torch.tensor([[3,-1],[2,-2]],dtype=torch.float)
pred_1 = torch.tensor([[-1,3], [-2,2]],dtype=torch.float)

true_0 = torch.tensor([[1,0]]*2,dtype=torch.float)
true_1 = torch.tensor([[0,1]]*2,dtype=torch.float)

weights_tensor = torch.tensor(weights,dtype=torch.float)

In [46]:
# class 0 is predicted and also true 
F.binary_cross_entropy_with_logits(input=pred_0, target=true_0, pos_weight=weights_tensor)

tensor(0.1327)

In [47]:
# class 1 is predicted and also true 
F.binary_cross_entropy_with_logits(input=pred_1, target=true_1, pos_weight=weights_tensor)

tensor(0.7876)

In [48]:
# class 0 is predicted but 1 is true
F.binary_cross_entropy_with_logits(input=pred_0, target=true_1, pos_weight=weights_tensor)

tensor(14.5746)

In [49]:
# class 1 is predicted but 0 is true
F.binary_cross_entropy_with_logits(input=pred_1, target=true_0, pos_weight=weights_tensor)

tensor(1.7383)

In [56]:
F.binary_cross_entropy_with_logits(
    input=torch.tensor([3,-1],dtype=torch.float), 
    target=torch.tensor([1],dtype=torch.float))

ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([2]))

In [262]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
loss_fn(input=test_input_0, 
    target=torch.tensor([1,1,0],dtype=torch.long))

tensor(2.6848)

In [261]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
loss_fn(input=torch.tensor([[3,-1]]*5,dtype=torch.float), 
    target=torch.tensor([1,-1,1,0,-1],dtype=torch.long))

tensor(2.6848)

In [296]:
loss_fn = nn.CrossEntropyLoss()
loss_fn(input=torch.tensor([[[3,-1]]*5]*3,dtype=torch.float), 
    target=torch.tensor([[1]*5]*3,dtype=torch.long))

RuntimeError: Expected target size [3, 2], got [3, 5]

In [291]:
torch.tensor([[[3,-1]]*5]*3,dtype=torch.float).transpose(2,0).shape

torch.Size([2, 5, 3])

In [292]:
torch.tensor([[1]*5]*3,dtype=torch.long).shape

torch.Size([3, 5])

In [249]:
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn(input=torch.tensor([[3,-1]]*5,dtype=torch.float), 
    target=torch.tensor([[0,1]]*5,dtype=torch.float))

tensor(4.0181)

In [191]:
4.0181*18

72.32579999999999

In [201]:
26.0133/4.0181

6.47403001418581

In [199]:
torch.tensor([[[3,-1]]*18]*4,dtype=torch.float).shape

torch.Size([4, 18, 2])

In [221]:
[[1]*18]*4

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [228]:
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn(input=torch.tensor([[[3,-1]]*18]*4,dtype=torch.float), 
    target=torch.tensor([[1]*18]*4,dtype=torch.long))

RuntimeError: Expected target size [4, 2], got [4, 18]

### number of annotations per annotator

In [258]:
all_df.iloc[:,1:20].replace(0,1).replace(-1,0).sum()

0      7708
1      3984
2      4941
3      4511
4      3364
5      3776
6      4060
7      8454
8      3305
9      3548
10     3407
11    12650
12     8009
13     8861
14      286
15     1384
16      542
17     3518
dtype: int64

### other

In [26]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

In [33]:
splits = skf.split(X=all_df.Text, y=all_df.majority_label)

In [None]:
# Display the split generator (the original line was an unfinished attribute access).
splits

### other

### other