In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [None]:
# The two segments are presented as a single input sequence
# to BERT with special tokens delimiting them:
# [CLS], x1, . . . , xN , [SEP], y1, . . . , yM, [EOS].

In [None]:
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm
import torch
 
from torch.nn import functional as F
from transformers import RobertaTokenizer, BertTokenizer
from transformers import BertForSequenceClassification 
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/test.csv")

In [None]:
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12120 entries, 0 to 12119
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12120 non-null  object
 1   premise     12120 non-null  object
 2   hypothesis  12120 non-null  object
 3   lang_abv    12120 non-null  object
 4   language    12120 non-null  object
 5   label       12120 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 568.2+ KB


Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [None]:
df_train.lang_abv.value_counts().sort_values(ascending = True).cumsum()

bg      342
tr      693
de     1044
es     1410
th     1781
el     2153
hi     2527
ru     2903
vi     3282
ur     3663
sw     4048
fr     4438
ar     4839
zh     5250
en    12120
Name: lang_abv, dtype: int64

In [None]:
df_test.lang_abv.value_counts().sort_values(ascending = True).cumsum()

vi     145
hi     295
bg     445
zh     596
de     748
fr     905
ar    1064
th    1228
tr    1395
ur    1563
el    1731
ru    1903
sw    2075
es    2250
en    5195
Name: lang_abv, dtype: int64

In [None]:
df_train = df_train[df_train.lang_abv == "en"].reset_index(drop = True)
df_test = df_test[df_test.lang_abv == "en"].reset_index(drop = True)

In [None]:
df_train.shape, df_test.shape

((6870, 6), (2945, 5))

In [None]:
df_train.label.value_counts()
# 0 for entailment
# 1 for neutral 
# 2 for contradiction 

0    2427
2    2277
1    2166
Name: label, dtype: int64

In [None]:
MODEL_MAX_LENGTH = 256

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", 
                                             model_max_length = 512,
                                             padding_side = "right",
                                             truncation_side = "right", 
                                             sep_token = "[SEP]", cls_token = "[CLS]", pad_token = "[PAD]")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
tokenizer.decode(tokenizer("Hello world", "hello to you too",
          return_attention_mask = True,
          add_special_tokens = True,
          padding = "max_length")["input_ids"], skip_special_tokens = False)

'[CLS] Hello world [SEP] [SEP] hello to you too [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                               model_max_length = MODEL_MAX_LENGTH,
                                              padding_side = "right",
                                              truncation_side = "right",)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
bert_tokenizer.decode(bert_tokenizer("HELLO world"*10, "hello to you too hjhkhjhk",
          return_attention_mask = True,
          add_special_tokens = True,
          padding = "max_length",
          )["input_ids"], skip_special_tokens= False)

NameError: ignored

In [None]:
bert_tokenizer.encode('[CLS] hello [UNK] [UNK] [SEP] [PAD] [PAD] [PAD]')
# CLS - 101
# UNK - 100
# PAD - 0

[101, 101, 7592, 100, 100, 102, 0, 0, 0, 102]

In [None]:
bert_tokenizer("Hello world", "hello to you too hkhkhkh",
          return_attention_mask = True,
          add_special_tokens = True,
          padding = "max_length",
          return_tensors = "tf")

{'input_ids': <tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  7592,  2088,   102,  7592,  2000,  2017,  2205, 22563,
         2232, 10023, 10023,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [None]:
df_train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
3,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,English,2
4,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,en,English,1


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df_train.index.values, 
                                                  df_train.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_train.label.values)

df_train['data_type'] = ""

df_train.loc[X_train, 'data_type'] = 'train'
df_train.loc[X_val, 'data_type'] = 'val'

In [None]:
def encode_one(x):
  encoded = bert_tokenizer(x['hypothesis'], x['premise'],
                           return_attention_mask = True,
                            add_special_tokens = True,
                            padding = "max_length",
                            return_tensors = "np") 
  return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [None]:
df_train[["input_ids","token_type_ids", "attention_mask"]] = df_train.apply(lambda x : encode_one(x), axis = 1, result_type = 'expand')

In [None]:
unk_token_count = df_train.loc[:, 'input_ids'].apply(lambda x : len([i for i in x[0] if i == 100]))

In [None]:
unk_token_count[unk_token_count > 0]

Series([], Name: input_ids, dtype: int64)

In [None]:
from torch.utils.data import TensorDataset

In [None]:
filter_dtype_train = df_train.data_type == "train"

input_ids_train = torch.from_numpy(np.stack(df_train.loc[filter_dtype_train,'input_ids'].values))
attention_masks_train = torch.from_numpy(np.stack(df_train.loc[filter_dtype_train,'attention_mask'].values))
labels_train = torch.tensor(df_train.loc[filter_dtype_train,'label'].values)

input_ids_val = torch.from_numpy(np.stack(df_train.loc[~filter_dtype_train,'input_ids'].values))
attention_masks_val = torch.from_numpy(np.stack(df_train.loc[~filter_dtype_train,'attention_mask'].values))
labels_val = torch.tensor(df_train.loc[~filter_dtype_train,'label'].values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
df_train[(df_train.input_ids.apply(lambda x : x.shape) != (1,MODEL_MAX_LENGTH))].input_ids.apply(lambda x : x.shape)

Series([], Name: input_ids, dtype: object)

In [None]:
BATCH_SIZE = 32

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataloader_train = DataLoader(dataset_train, 
                              sampler=SequentialSampler(dataset_train), 
                              batch_size=BATCH_SIZE)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=BATCH_SIZE)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3,)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:

optimizer = AdamW(model.parameters(), lr=1e-4)

# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)



In [None]:
torch.tensor([1,0]).unsqueeze(0)

tensor([[1, 0]])

In [None]:
# labels = torch.tensor([1,0]).unsqueeze(0)
# outputs = model(input_ids, attention_mask=attention_mask)
# loss = F.cross_entropy(labels, outputs.logitd)
# loss.backward()
# optimizer.step()
# scheduler.step()

In [None]:
epochs = 5
num_train_steps = len(dataloader_train) * epochs

In [None]:
# from transformers import get_linear_schedule_with_warmup
# scheduler = get_linear_schedule_with_warmup(optimizer, 0, num_train_steps)

In [None]:
for i in model.base_model.named_parameters():
  print(i[0])

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [None]:
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [None]:
for num, x in enumerate(model.named_parameters(), start = 1 ):
  if num > 12 : break
  print(f"{str(num):<3} {x[0]:-<100} {x[1].shape}")

1   bert.embeddings.word_embeddings.weight-------------------------------------------------------------- torch.Size([30522, 768])
2   bert.embeddings.position_embeddings.weight---------------------------------------------------------- torch.Size([512, 768])
3   bert.embeddings.token_type_embeddings.weight-------------------------------------------------------- torch.Size([2, 768])
4   bert.embeddings.LayerNorm.weight-------------------------------------------------------------------- torch.Size([768])
5   bert.embeddings.LayerNorm.bias---------------------------------------------------------------------- torch.Size([768])
6   bert.encoder.layer.0.attention.self.query.weight---------------------------------------------------- torch.Size([768, 768])
7   bert.encoder.layer.0.attention.self.query.bias------------------------------------------------------ torch.Size([768])
8   bert.encoder.layer.0.attention.self.key.weight------------------------------------------------------ torch.Size([76

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cuda')

In [None]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:


seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0].view(-1,MODEL_MAX_LENGTH),
                  'attention_mask': batch[1].view(-1,MODEL_MAX_LENGTH),
                  'labels':         batch[2]
                 }     
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0].view(-1,MODEL_MAX_LENGTH),
                  'attention_mask': batch[1].view(-1,MODEL_MAX_LENGTH),
                  'labels':         batch[2]
                 }      

        outputs = model(**inputs)
        
        # loss = F.cross_entropy(inputs['labels'], outputs['logits'].softmax(dim=1))
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    # torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/183 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.1760498874113371
Validation loss: 1.6819025350339485
F1 Score (Weighted): 0.6256478997706588


Epoch 2:   0%|          | 0/183 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.15336753159909017
Validation loss: 1.9490182616493918
F1 Score (Weighted): 0.6000420190366145


Epoch 3:   0%|          | 0/183 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.14972506174588668
Validation loss: 1.5570962085868374
F1 Score (Weighted): 0.6041652729635155


Epoch 4:   0%|          | 0/183 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.11015732511063743
Validation loss: 2.2073651045052842
F1 Score (Weighted): 0.5780701778532409


Epoch 5:   0%|          | 0/183 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.10907456393816757
Validation loss: 1.986913326111707
F1 Score (Weighted): 0.6182872566946245


In [None]:
print(outputs['logits'].argmax(dim=1))
print(inputs['labels'])

tensor([0, 2, 1, 1, 2, 1, 0, 0, 1, 2, 0, 2, 0, 2, 0], device='cuda:0')
tensor([0, 2, 1, 1, 2, 1, 0, 0, 1, 2, 0, 2, 0, 2, 0], device='cuda:0')


In [None]:
print(inputs['input_ids'].shape)
print(inputs['attention_mask'].shape)
print(inputs['labels'].shape)


torch.Size([64, 512])
torch.Size([64, 512])
torch.Size([64])


In [None]:
inputs['input_ids'][0]
inputs['attention_mask'][0]
inputs['labels']


tensor([0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 0, 0, 2, 2, 1, 2, 1, 2, 2, 0,
        0, 0, 0, 2, 1, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 0, 2, 0, 0, 0, 2, 1, 1, 1,
        2, 2, 1, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 0, 1, 2], device='cuda:0')

In [None]:
model.eval()
    
loss_val_total = 0
predictions, true_vals = [], []

for batch in dataloader_validation:
    
    batch = tuple(b.to(device) for b in batch)
    
    inputs = {'input_ids':      batch[0].view(-1,MODEL_MAX_LENGTH),
              'attention_mask': batch[1].view(-1,MODEL_MAX_LENGTH),
              'labels':         batch[2]
              }     
    with torch.no_grad():        
        outputs = model(**inputs)
        
    loss = outputs[0]
    logits = outputs[1]
    loss_val_total += loss.item()

    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    predictions.append(logits)
    true_vals.append(label_ids)

loss_val_avg = loss_val_total/len(dataloader_validation) 

predictions = np.concatenate(predictions, axis=0)
true_vals = np.concatenate(true_vals, axis=0)

In [None]:
sum(predictions.argmax(axis=1) == true_vals) / len(true_vals) * 100

61.88166828322017

In [None]:
f1_score_func(predictions, true_vals)

0.6182872566946245

In [None]:
predictions.shape, len(true_vals)

((1031, 3), 1031)

In [None]:
# References :

# https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# https://huggingface.co/docs/transformers/v4.26.1/en/model_doc/bert#transformers.BertForSequenceClassification
# https://affine.medium.com/natural-language-inferencing-nli-task-demonstration-using-kaggle-dataset-34cbce0f0852