In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
import random
import os

from tqdm.notebook import tqdm
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
# from transformers import BertForSequenceClassification ## we will write a custom module to replace this
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
BATCH_SIZE = 64
MODEL_MAX_LENGTH = 128

In [None]:
def loss_fn(output, target):
    return nn.CrossEntropyLoss()(output, target)

In [None]:
class MyDataSet(Dataset):
  def __init__(self, **kwargs):
    super().__init__()
    pass
    
  def __len__(self):
    pass 

  def __getitem__(self):
    pass

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                               model_max_length = MODEL_MAX_LENGTH,
                                               padding_side = "right",
                                               truncation_side = "right",)

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/train.csv") 
df_test = pd.read_csv("/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/test.csv")

In [None]:
df_train = df_train[df_train.lang_abv == "en"].reset_index(drop = True) # known labels dataset which we will use to train the classifier
df_test = df_test[df_test.lang_abv == "en"].reset_index(drop = True) # final dataset with unknown labels

In [None]:
df_train.label.value_counts()

In [None]:
# we will devide the training set into train and validation set for training in a 85:15 ratio
X_train, X_val, y_train, y_val = train_test_split(df_train.index.values, 
                                                  df_train.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_train.label.values) 

df_train['data_type'] = ""

df_train.loc[X_train, 'data_type'] = 'train'
df_train.loc[X_val, 'data_type'] = 'val'

In [None]:
# this is just a lambda function to tokenize hypothesis premise pairs
def encode_one(x):
  encoded = bert_tokenizer(x['hypothesis'], x['premise'],
                           return_attention_mask = True,
                            add_special_tokens = True,
                            padding = "max_length",
                            truncation = 'longest_first',
                            return_tensors = "np") 
  return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [None]:
df_train[["input_ids","token_type_ids", "attention_mask"]] = df_train.apply(lambda x : encode_one(x), axis = 1, result_type = 'expand')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
df_test[["input_ids","token_type_ids", "attention_mask"]] = df_test.apply(lambda x : encode_one(x), axis = 1, result_type = 'expand')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
filter_dtype_train = df_train.data_type == "train"

input_ids_train = torch.from_numpy(np.stack(df_train.loc[filter_dtype_train,'input_ids'].values))
attention_masks_train = torch.from_numpy(np.stack(df_train.loc[filter_dtype_train,'attention_mask'].values))
token_type_ids_train = torch.from_numpy(np.stack(df_train.loc[filter_dtype_train,'token_type_ids'].values))
labels_train = torch.tensor(df_train.loc[filter_dtype_train,'label'].values)

input_ids_val = torch.from_numpy(np.stack(df_train.loc[~filter_dtype_train,'input_ids'].values))
attention_masks_val = torch.from_numpy(np.stack(df_train.loc[~filter_dtype_train,'attention_mask'].values))
token_type_ids_val = torch.from_numpy(np.stack(df_train.loc[~filter_dtype_train,'token_type_ids'].values))
labels_val = torch.tensor(df_train.loc[~filter_dtype_train,'label'].values)

# making tensor datasets for train and val sets
dataset_train = TensorDataset(input_ids_train, attention_masks_train, token_type_ids_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, token_type_ids_val, labels_val)

In [None]:
input_ids_test = torch.from_numpy(np.stack(df_test.loc[:,'input_ids'].values))
attention_masks_test = torch.from_numpy(np.stack(df_test.loc[:,'attention_mask'].values))
token_type_ids_test = torch.from_numpy(np.stack(df_test.loc[:,'token_type_ids'].values))

# building a tensor dataset for test too, but we do not have labels here
dataset_test = TensorDataset(input_ids_test, attention_masks_test, token_type_ids_test)

In [None]:
# making dataloaders
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=BATCH_SIZE)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=BATCH_SIZE)

NameError: ignored

In [None]:
dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=BATCH_SIZE)

In [None]:
df_train.label.value_counts()

0    2427
2    2277
1    2166
Name: label, dtype: int64

In [None]:
class BERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased") # loading a prebuilt model for finetuning
        self.dp1 = nn.Dropout(p=0.1) # adding dropout for regularization
        self.ffnn1 = nn.Linear(768, 3,) # adding a classifier head 
        
        
    def forward(self,input_ids,attention_mask,token_type_id_mask, labels):
        _,o2 = self.bert_model(input_ids = input_ids,
                              attention_mask = attention_mask,
                              token_type_ids = token_type_id_mask, 
                              return_dict=False) # o2 is the [CLF] token that is typically used to attach a classifier head on

        out = self.dp1(o2)
        out = torch.sigmoid(self.ffnn1(out))
        return out # outputs would be class probabilities 
    
model=BERT()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device # gpu is recommended

device(type='cuda')

In [None]:
# If you need to keep the weights fixed for base model, use this.
# model.to(torch.device('cpu')) 
# for param in model.bert_model.parameters():
#     param.requires_grad = False
# model.to(torch.device('gpu'))

In [None]:
for p,p_x in model.named_parameters():
    if p_x.requires_grad:
      print(p)

bert_model.embeddings.word_embeddings.weight
bert_model.embeddings.position_embeddings.weight
bert_model.embeddings.token_type_embeddings.weight
bert_model.embeddings.LayerNorm.weight
bert_model.embeddings.LayerNorm.bias
bert_model.encoder.layer.0.attention.self.query.weight
bert_model.encoder.layer.0.attention.self.query.bias
bert_model.encoder.layer.0.attention.self.key.weight
bert_model.encoder.layer.0.attention.self.key.bias
bert_model.encoder.layer.0.attention.self.value.weight
bert_model.encoder.layer.0.attention.self.value.bias
bert_model.encoder.layer.0.attention.output.dense.weight
bert_model.encoder.layer.0.attention.output.dense.bias
bert_model.encoder.layer.0.attention.output.LayerNorm.weight
bert_model.encoder.layer.0.attention.output.LayerNorm.bias
bert_model.encoder.layer.0.intermediate.dense.weight
bert_model.encoder.layer.0.intermediate.dense.bias
bert_model.encoder.layer.0.output.dense.weight
bert_model.encoder.layer.0.output.dense.bias
bert_model.encoder.layer.0.outp

In [None]:
SAVE_DIR = "/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/models" 
MODEL_NAME = 'BERT_model_v3.0'

In [None]:
model.to(device)

BERT(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
epochs = 10
num_train_steps = len(dataloader_train) * epochs

In [None]:
# setting an initial learning rate to 1e-4 which we will decrease linearly
optimizer= AdamW(model.parameters(), lr=1e-4)

# we will use a scheduler but without warmup
scheduler = get_linear_schedule_with_warmup(optimizer, 0, num_train_steps)

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':          batch[0].view(-1,MODEL_MAX_LENGTH),
                  'attention_mask':     batch[1].view(-1,MODEL_MAX_LENGTH),
                  'token_type_id_mask': batch[2].view(-1,MODEL_MAX_LENGTH),
                  'labels':             batch[3]
                 }     
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = loss_fn(outputs.softmax(dim=1), inputs['labels'])
        loss_val_total += loss.item()
        logits = outputs.softmax(dim=1)

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals



In [None]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [None]:
SAVE_DIR_DL = "/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/dataloaders"
torch.save(dataloader_train, f'{SAVE_DIR_DL}/dataloader_train.pth')
torch.save(dataloader_validation, f'{SAVE_DIR_DL}/dataloader_validation.pth')

In [None]:
SAVE_DIR_DL = "/content/drive/MyDrive/Personal Projects/NLP POCs/contradictory-my-dear-watson/dataloaders"
dataloader_train = torch.load(f'{SAVE_DIR_DL}/dataloader_train.pth')
dataloader_validation = torch.load(f'{SAVE_DIR_DL}/dataloader_validation.pth')

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':          batch[0].view(-1,MODEL_MAX_LENGTH),
                  'attention_mask':     batch[1].view(-1,MODEL_MAX_LENGTH),
                  'token_type_id_mask': batch[2].view(-1,MODEL_MAX_LENGTH),
                  'labels':             batch[3]
                 }      
        outputs = model(**inputs)
        
        loss = loss_fn(outputs.softmax(dim=1), inputs['labels'])
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'{SAVE_DIR}/{MODEL_NAME}_{epoch}.pth')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.055793534802354
Validation loss: 1.0245713381206287
F1 Score (Weighted): 0.6115242893471489


Epoch 2:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.0141724432292192
Validation loss: 0.9931571343365837
F1 Score (Weighted): 0.6484447688217391


Epoch 3:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.982638541771018
Validation loss: 0.9888197990024791
F1 Score (Weighted): 0.6645133473109169


Epoch 4:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.955651295573815
Validation loss: 0.9777881853720721
F1 Score (Weighted): 0.7059949872841965


Epoch 5:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.939907431602478
Validation loss: 0.9851321122225594
F1 Score (Weighted): 0.6726459420573593


Epoch 6:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.9285015820161157
Validation loss: 0.9788610549534068
F1 Score (Weighted): 0.7053254462459793


Epoch 7:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.9191051196792851
Validation loss: 0.9802051537177142
F1 Score (Weighted): 0.6923449520587294


Epoch 8:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.9134934894416643
Validation loss: 0.9771027775371776
F1 Score (Weighted): 0.6965984043895844


Epoch 9:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.9071338034194448
Validation loss: 0.9749289610806633
F1 Score (Weighted): 0.7004890570535387


Epoch 10:   0%|          | 0/92 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.9048838213734005
Validation loss: 0.9772941154592177
F1 Score (Weighted): 0.7056077893214007


In [None]:
def predict(dataloader_test):
    model.eval()
    predictions = []
    for batch in dataloader_test:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':          batch[0].view(-1,MODEL_MAX_LENGTH),
                  'attention_mask':     batch[1].view(-1,MODEL_MAX_LENGTH),
                  'token_type_id_mask': batch[2].view(-1,MODEL_MAX_LENGTH),
                  'labels': None 
                 }     

        with torch.no_grad():        
            outputs = model(**inputs)
        
        logits = outputs.softmax(dim=1)

        logits = logits.detach().cpu().numpy()
        predictions.append(logits)

    predictions = np.concatenate(predictions, axis=0)    
    return predictions

In [None]:
predictions_test = predict(dataloader_test)

In [None]:
# we have all three classes in the predictions. Who can say if they are good :P
np.unique(predictions.argmax(axis=1), return_counts=True)

(array([0, 1, 2]), array([369, 373, 289]))