In [3]:
!pip install datasets transformers nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.7


In [2]:
import os
import os.path as osp
import nltk
import random
# nltk.download('stopwords')
# nltk.download('punkt')
# from nltk.corpus import stopwords
# english_stopwords = stopwords.words("english")
import numpy as np
import re
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
import pandas as pd
import string
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import datasets
from datasets import load_dataset
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel
import multiprocessing
import time
from torch.utils.data import DataLoader, Dataset 
import sys
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [3]:
test_file = "test_data_iob.csv"
val_file = "val_data_iob.csv"
train_file = "train_data_iob.csv"

In [4]:
pretrained_model = "bert-base-uncased"
batch_size = 128
max_para_length = 128

# Check if cuda is available and set device
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

# Make sure you choose suitable num_worker, otherwise it will result in errors
num_workers = 8 if cuda else 0

print("Cuda = ", str(cuda), " with num_workers = ", str(num_workers),  " system version = ", sys.version)

Cuda =  True  with num_workers =  8  system version =  3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) 
[GCC 10.3.0]


In [5]:
class ContextEmbeddingDataset(Dataset):
    def __init__(self, csv_file, context, pretrained_model):
      df = pd.read_csv(csv_file)

      self.context = context
      self.tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower=True)    

      # Tokenize the paragraphs
      self.df = df["para"].apply(self.preprocess)
      self.y = df['label']
  
     
    def preprocess(self, examples):
      return self.tokenizer(examples, truncation=True, 
                     padding="max_length", max_length=max_para_length,
                     return_token_type_ids=False)['input_ids']

    def __len__(self):
      return len(self.y) - (2*self.context)
    
    def __getitem__(self,index):
      return torch.LongTensor(list(self.df[index:(index + 2*self.context+1)])), self.y[index+self.context]
      
      # self.embed_model.eval()
      # Generate BERT embeddings for the tokens in each para
      # with torch.no_grad():
      #   x = torch.LongTensor(list(self.df[index:(index + 2*self.context+1)])).to(device)
      #   print(x.shape)    
      #   outputs = self.embed_model(x)
      #   print(outputs.shape) # (3, tokens(128), input_dim(3072))
       
      # return outputs.cpu(), self.y[index+self.context]

In [5]:
val_data = ContextEmbeddingDataset(val_file, context = 1, pretrained_model = pretrained_model)
train_data = ContextEmbeddingDataset(train_file, context = 1, pretrained_model = pretrained_model)
test_data = ContextEmbeddingDataset(test_file, context = 1, pretrained_model = pretrained_model)

train_args = dict(shuffle=True, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=True, batch_size=batch_size, drop_last=False)
train_loader = DataLoader(train_data, **train_args)

val_args = dict(shuffle=False, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=False, batch_size=batch_size, drop_last=False)
val_loader = DataLoader(val_data, **val_args)


test_args = dict(shuffle=False, batch_size=batch_size, num_workers=8, pin_memory=True, drop_last=False) if cuda else dict(shuffle=False, batch_size=batch_size, drop_last=False)
test_loader = DataLoader(test_data, **test_args)

## Fixed Bert word Embeddings, BiLSTM encoder, Triplet Decoder

In [6]:
class BertEmbedding(nn.Module):
  def __init__(self, pretrained_model):
    super().__init__()
    self.model = BertModel.from_pretrained(pretrained_model, output_hidden_states = True)
    # for param in self.model.bert.parameters():
    #   param.requires_grad = False
    # print(sum(p.numel() for p in self.model.parameters()))

  def forward(self, x):
    # print("Input to BertEmbedding: ", x.shape)
    outputs = self.model(x)
    hidden_states = outputs[2]
    embedding = torch.cat((hidden_states[-1],hidden_states[-2],hidden_states[-3],hidden_states[-4]), dim = 2)
    # print("Output from BertEmbedding: ", embedding.shape)
    return embedding

class ParaEncoderForContext(nn.Module):
  def __init__(self, bilayers = 1, input_dim = 3072, hidden_size = 512):
    super().__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_size
    self.lstm = nn.LSTM(
            input_size=input_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=True, bidirectional=True)
    
    for name, param in self.lstm.named_parameters():
      if 'bias' in name:
        nn.init.constant(param, 0.0)
      elif 'weight' in name:
        nn.init.orthogonal(param)
     

  def forward(self, x): # (B*T(T=1+2*context), tokens, input_dim)
    # print("Input to Encoder: ",x.shape)
    outputs, _ = self.lstm(x) # (B*T, tokens, 2*hidden_dim)
    # print("After LSTM: ", outputs.shape)
    first = outputs[:, 0, self.hidden_dim:]
    second = outputs[:, -1, :self.hidden_dim]
    para_embed = torch.cat((second,first), dim = 1) #(B*T, 2*hidden_dim)

    # print("Output from Encoder", para_embed.shape)
    return para_embed #(B*T, 2*hidden_dim)



class ParaDecoderTriplet(nn.Module):
  def __init__(self, input_size, output_size = 1):
    super().__init__()
    self.linear = nn.Linear(input_size, 3, bias= True)
    # self.layers = nn.Sequential(nn.Linear(input_size, output_size, bias = True), 
    #                             nn.BatchNorm1d(output_size), 
    #                             nn.ReLU(inplace = True), 
    #                             nn.Linear(output_size, 1, bias = True))
    
    # for mod in self.modules():
    #   if isinstance(mod, nn.BatchNorm1d):
    #     nn.init.constant_(mod.weight.data, 1)
    #     if(mod.bias is not None):
    #       nn.init.constant_(mod.bias.data, 0)

  def forward(self, x): # #(B, T, 2*hidden_dim)
    # print("Input to decoder: ", x.shape) 
    s0,s1,s2 = x.shape
    x = x.reshape(s0,-1) #concat main and context para embeddings
    # print("Input to linear layer in decoder:", xv.shape) 
    return self.linear(x) #(B,1)

class EncoderDecoderTriplet(nn.Module):
  def __init__(self, embed_model, decoder_output_size = 1, encoder_bilayers = 1, encoder_input_dim = 3072, encoder_hidden_size = 512, context = 1):
    super().__init__()
    self.para_encoder = ParaEncoderForContext(bilayers = encoder_bilayers, input_dim = encoder_input_dim, hidden_size = encoder_hidden_size)
    self.para_decoder = ParaDecoderTriplet(input_size = encoder_hidden_size*2*(1+2*context))
    self.embed_model = embed_model
    #freeze bert embedding layer
    for param in self.embed_model.parameters():
      param.requires_grad = False

  def forward(self, x): # (B, 2*context+1, tokens_per_para)
    # print("Input to model: ", x.shape)
    s0, s1, s2 = x.shape
    xv = x.view(s0*s1, s2)
    embeds = self.embed_model(xv)
    para_vec = self.para_encoder(embeds)
    pvv = para_vec.view(s0, s1, -1)
    # print("Input to decoder: ", pvv.shape)
    return self.para_decoder(pvv)

## Train and Validate Functions

In [7]:
def train(para_model, data_loader):
  para_model.train()

  avg_loss = []
  all_predictions = []
  all_targets = []
  start = time.time()

  for i, (x, y) in enumerate(tqdm(data_loader, desc="Epoch", leave=False)):
    optimizer.zero_grad()
    y  = y.to(device) 
    x = x.to(device)
 
    output = para_model(x)

    # print("Output from model: ", output.shape)  

    loss = criterion(output, y.long())
    avg_loss.extend([loss.item()]*len(y))

    # output = nn.Sigmoid()(output)

    loss.backward()
    optimizer.step()
    scheduler.step()
    
    all_predictions.extend(torch.argmax(output, axis=1).cpu().tolist())
    all_targets.extend(y.cpu().tolist())
    
    
  end = time.time()
  avg_loss = np.mean(avg_loss)
  print('learning_rate: {}'.format(scheduler.get_last_lr()))
  print('Training loss: {:.2f}, Time: {}'.format(avg_loss, end-start))
  
  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  scores = precision_recall_fscore_support(all_targets, all_predictions, 
                                            average="weighted", zero_division=0.)
  
  test_scores={
      "eval_accuracy": (all_predictions == all_targets).sum() / len(all_predictions),
      "eval_precision": scores[0],
      "eval_recall": scores[1],
      "eval_f-1": scores[2]
  }
  print(test_scores)
  return test_scores["eval_f-1"]


In [8]:
def validate(para_model, data_loader):
  para_model.eval()
  
  avg_loss = []
  all_predictions = []
  all_targets = []
  start = time.time()

  for i, (x, y) in enumerate(tqdm(data_loader, desc="Epoch", leave=False)):
    # optimizer.zero_grad()

    y = y.to(device)
    x = x.to(device)

    with torch.no_grad():
      output = para_model(x)

      loss = criterion(output, y.long())
      avg_loss.extend([loss.item()]*len(y))

      # output = nn.Sigmoid()(output)

      all_predictions.extend(torch.argmax(output, axis=1).cpu().tolist())
      all_targets.extend(y.cpu().tolist())

    
  end = time.time()
  avg_loss = np.mean(avg_loss)
  print('learning_rate: {}'.format(scheduler.get_last_lr()))
  print('Validation loss: {:.2f}, Time: {}'.format(avg_loss, end-start))
  
  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  scores = precision_recall_fscore_support(all_targets, all_predictions, 
                                            average="weighted", zero_division=0.)
  
  test_scores={
      "eval_accuracy": (all_predictions == all_targets).sum() / len(all_predictions),
      "eval_precision": scores[0],
      "eval_recall": scores[1],
      "eval_f-1": scores[2]
  }
  print(test_scores)
  return test_scores["eval_f-1"], all_predictions


In [9]:
def save(model, acc, best=""):
    if not os.path.exists('./bert_base_triplet_iob/'):
        os.mkdir('./bert_base_triplet_iob/')

    torch.save(model.state_dict(), './bert_base_triplet_iob/'+'/{}model_params_{}.pth'.format(best, acc))
    

## Main

In [13]:
model = EncoderDecoderTriplet(embed_model = BertEmbedding(pretrained_model))
model.load_state_dict(torch.load('./bert_base_triplet_iob/model_model_params_0.9371448069961689.pth'))
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
non_trainable_total_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print("Total params: ", total_params)
print("Trainable params: ", trainable_total_params)
print("Non Trainable params: ", non_trainable_total_params)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total params:  124179715
Trainable params:  14697475
Non Trainable params:  109482240


In [15]:
epochs = 20
lamda = 1e-4  #L2 regularization
learning_rate = 1e-2

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lamda)
# optimizer.load_state_dict(torch.load('./bert_base_triplet/optimizer_model_params_0.9409211846833226.pth'))    

# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[i for i in range(4,20,4)], gamma=0.75)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))

In [None]:
best_val_f1 = 0
for epoch in range(epochs):
  print('Epoch #{}'.format(epoch+1))
  
  train_f1 = train(model, train_loader)
  val_f1 = validate(model, val_loader)
  
  if val_f1 > best_val_f1:
    best_val_f1 = val_f1
    save(model, best_val_f1, best = "model_")
    save(optimizer, best_val_f1, best = "optimizer_")


Epoch #1


                                                                                                                        

learning_rate: [0.009938441702975667]
Training loss: 0.48, Time: 1319.851054430008
{'eval_accuracy': 0.8764125521557719, 'eval_precision': 0.871023677712392, 'eval_recall': 0.8764125521557719, 'eval_f-1': 0.8723446046759972}


                                                                                                                        

learning_rate: [0.009938441702975667]
Validation loss: 0.24, Time: 165.62895560264587
{'eval_accuracy': 0.9174068672941801, 'eval_precision': 0.9149725318609351, 'eval_recall': 0.9174068672941801, 'eval_f-1': 0.9154566351734379}
Epoch #2


                                                                                                                        

learning_rate: [0.009755282581475743]
Training loss: 0.21, Time: 1335.586627960205
{'eval_accuracy': 0.9297635605006954, 'eval_precision': 0.9285303747448357, 'eval_recall': 0.9297635605006954, 'eval_f-1': 0.9289806803573732}


                                                                                                                        

learning_rate: [0.009755282581475743]
Validation loss: 0.26, Time: 164.5749387741089
{'eval_accuracy': 0.898316319766671, 'eval_precision': 0.8923926770373725, 'eval_recall': 0.898316319766671, 'eval_f-1': 0.8901410609586241}
Epoch #3


                                                                                                                        

learning_rate: [0.00945503262094182]
Training loss: 0.20, Time: 1331.5338985919952
{'eval_accuracy': 0.9322192280945758, 'eval_precision': 0.9312951850084437, 'eval_recall': 0.9322192280945758, 'eval_f-1': 0.9316622638995351}


                                                                                                                        

learning_rate: [0.00945503262094182]
Validation loss: 0.26, Time: 164.30304384231567
{'eval_accuracy': 0.9038843961288612, 'eval_precision': 0.9031313727096011, 'eval_recall': 0.9038843961288612, 'eval_f-1': 0.8958143780934341}
Epoch #4


                                                                                                                        

learning_rate: [0.00904508497187471]
Training loss: 0.19, Time: 1331.091432094574
{'eval_accuracy': 0.9346314325452016, 'eval_precision': 0.9339133019921703, 'eval_recall': 0.9346314325452016, 'eval_f-1': 0.9342087307112202}


                                                                                                                        

learning_rate: [0.00904508497187471]
Validation loss: 0.27, Time: 164.64631533622742
{'eval_accuracy': 0.9060055680763622, 'eval_precision': 0.904293363281582, 'eval_recall': 0.9060055680763622, 'eval_f-1': 0.9027097073673209}
Epoch #5


                                                                                                                        

learning_rate: [0.008535533905932705]
Training loss: 0.19, Time: 1332.0355229377747
{'eval_accuracy': 0.9345445062586927, 'eval_precision': 0.9338690707887813, 'eval_recall': 0.9345445062586927, 'eval_f-1': 0.9341601591783465}


                                                                                                                        

learning_rate: [0.008535533905932705]
Validation loss: 0.31, Time: 164.4893605709076
{'eval_accuracy': 0.8961951478191701, 'eval_precision': 0.8961671247932443, 'eval_recall': 0.8961951478191701, 'eval_f-1': 0.8949735061632252}
Epoch #6


                                                                                                                        

learning_rate: [0.007938926261462356]
Training loss: 0.18, Time: 1332.1142826080322
{'eval_accuracy': 0.940585883171071, 'eval_precision': 0.9401593110068588, 'eval_recall': 0.940585883171071, 'eval_f-1': 0.94035244641687}


                                                                                                                        

learning_rate: [0.007938926261462356]
Validation loss: 0.32, Time: 164.96645665168762
{'eval_accuracy': 0.8992443324937027, 'eval_precision': 0.9048566978732059, 'eval_recall': 0.8992443324937027, 'eval_f-1': 0.8962437147670431}
Epoch #7


                                                                                                                        

learning_rate: [0.007269952498697718]
Training loss: 0.16, Time: 1334.862622976303
{'eval_accuracy': 0.9452581710709318, 'eval_precision': 0.9447831487201181, 'eval_recall': 0.9452581710709318, 'eval_f-1': 0.9449860957678784}


                                                                                                                        

learning_rate: [0.007269952498697718]
Validation loss: 0.57, Time: 164.9325771331787
{'eval_accuracy': 0.7898714039506828, 'eval_precision': 0.8735025363626111, 'eval_recall': 0.7898714039506828, 'eval_f-1': 0.8078944110132805}
Epoch #8


                                                                                                                        

learning_rate: [0.00654508497187472]
Training loss: 0.17, Time: 1333.7403135299683
{'eval_accuracy': 0.9416072670375522, 'eval_precision': 0.941115024192774, 'eval_recall': 0.9416072670375522, 'eval_f-1': 0.9413314273367553}


                                                                                                                        

learning_rate: [0.00654508497187472]
Validation loss: 0.30, Time: 164.5705440044403
{'eval_accuracy': 0.9091873259976136, 'eval_precision': 0.908100810709509, 'eval_recall': 0.9091873259976136, 'eval_f-1': 0.9039477121163302}
Epoch #9


                                                                                                                        

learning_rate: [0.005782172325201153]
Training loss: 0.16, Time: 1337.7969307899475
{'eval_accuracy': 0.9476051808066759, 'eval_precision': 0.9469883771838434, 'eval_recall': 0.9476051808066759, 'eval_f-1': 0.94721725238031}


                                                                                                                        

learning_rate: [0.005782172325201153]
Validation loss: 0.25, Time: 165.86021900177002
{'eval_accuracy': 0.910380485218083, 'eval_precision': 0.9075902343529393, 'eval_recall': 0.910380485218083, 'eval_f-1': 0.9068169059617027}
Epoch #10


                                                                                                                        

learning_rate: [0.005000000000000004]
Training loss: 0.16, Time: 1336.0554132461548
{'eval_accuracy': 0.9471488178025035, 'eval_precision': 0.9467154068307854, 'eval_recall': 0.9471488178025035, 'eval_f-1': 0.9468830979475127}


                                                                                                                        

learning_rate: [0.005000000000000004]
Validation loss: 0.24, Time: 163.56647968292236
{'eval_accuracy': 0.920986344955588, 'eval_precision': 0.9191783721204383, 'eval_recall': 0.920986344955588, 'eval_f-1': 0.9194661237518974}
Epoch #11


                                                                                                                        

learning_rate: [0.004217827674798844]
Training loss: 0.15, Time: 1329.5806546211243
{'eval_accuracy': 0.9503433588317107, 'eval_precision': 0.9500118575081681, 'eval_recall': 0.9503433588317107, 'eval_f-1': 0.9501470661168175}


                                                                                                                        

learning_rate: [0.004217827674798844]
Validation loss: 0.31, Time: 164.97776460647583
{'eval_accuracy': 0.9028238101551107, 'eval_precision': 0.9093853421907178, 'eval_recall': 0.9028238101551107, 'eval_f-1': 0.9045185988585854}
Epoch #12


                                                                                                                        

learning_rate: [0.003454915028125261]
Training loss: 0.14, Time: 1337.2282836437225
{'eval_accuracy': 0.9551243045897079, 'eval_precision': 0.9549364510025935, 'eval_recall': 0.9551243045897079, 'eval_f-1': 0.9550228780692358}


                                                                                                                        

learning_rate: [0.003454915028125261]
Validation loss: 0.25, Time: 163.65571856498718
{'eval_accuracy': 0.9220469309293384, 'eval_precision': 0.9195543469624103, 'eval_recall': 0.9220469309293384, 'eval_f-1': 0.9194313744948152}
Epoch #13


                                                                                                                        

learning_rate: [0.0027300475013022673]
Training loss: 0.12, Time: 1330.278065443039
{'eval_accuracy': 0.9601225660639777, 'eval_precision': 0.9599678997325811, 'eval_recall': 0.9601225660639777, 'eval_f-1': 0.960039220964892}


                                                                                                                        

learning_rate: [0.0027300475013022673]
Validation loss: 0.27, Time: 163.9424226284027
{'eval_accuracy': 0.9144902558663661, 'eval_precision': 0.9120846559830211, 'eval_recall': 0.9144902558663661, 'eval_f-1': 0.9106212413233612}
Epoch #14


                                                                                                                        

learning_rate: [0.0020610737385376356]
Training loss: 0.11, Time: 1323.32079911232
{'eval_accuracy': 0.9647948539638387, 'eval_precision': 0.9647266714127979, 'eval_recall': 0.9647948539638387, 'eval_f-1': 0.9647589569429914}


                                                                                                                        

learning_rate: [0.0020610737385376356]
Validation loss: 0.25, Time: 162.84117889404297
{'eval_accuracy': 0.9205886252154315, 'eval_precision': 0.9183992221678198, 'eval_recall': 0.9205886252154315, 'eval_f-1': 0.9171260919746081}
Epoch #15


                                                                                                                        

learning_rate: [0.0014644660940672616]
Training loss: 0.10, Time: 1323.4212214946747
{'eval_accuracy': 0.9679676634214186, 'eval_precision': 0.9679521048251739, 'eval_recall': 0.9679676634214186, 'eval_f-1': 0.9679583892746595}


                                                                                                                        

learning_rate: [0.0014644660940672616]
Validation loss: 0.25, Time: 163.042733669281
{'eval_accuracy': 0.9203234787219939, 'eval_precision': 0.917989381674372, 'eval_recall': 0.9203234787219939, 'eval_f-1': 0.9170920287325617}
Epoch #16


                                                                                                                        

learning_rate: [0.0009549150281252627]
Training loss: 0.09, Time: 1328.6055009365082
{'eval_accuracy': 0.9705537204450626, 'eval_precision': 0.9704725321868611, 'eval_recall': 0.9705537204450626, 'eval_f-1': 0.970509566361135}


                                                                                                                        

learning_rate: [0.0009549150281252627]
Validation loss: 0.23, Time: 164.04758501052856
{'eval_accuracy': 0.9300013257324672, 'eval_precision': 0.9283889652123568, 'eval_recall': 0.9300013257324672, 'eval_f-1': 0.9287925747876695}
Epoch #17


                                                                                                                        

learning_rate: [0.0005449673790581601]
Training loss: 0.08, Time: 1326.1111781597137
{'eval_accuracy': 0.974182892906815, 'eval_precision': 0.9741812866092091, 'eval_recall': 0.974182892906815, 'eval_f-1': 0.9741796642151118}


                                                                                                                        

learning_rate: [0.0005449673790581601]
Validation loss: 0.23, Time: 163.45793652534485
{'eval_accuracy': 0.9306641919660612, 'eval_precision': 0.9289908793954363, 'eval_recall': 0.9306641919660612, 'eval_f-1': 0.9292222483203666}
Epoch #18


                                                                                                                        

learning_rate: [0.0002447174185242328]
Training loss: 0.07, Time: 1326.5301849842072
{'eval_accuracy': 0.9765299026425591, 'eval_precision': 0.9765506983375265, 'eval_recall': 0.9765299026425591, 'eval_f-1': 0.9765391935239514}


                                                                                                                        

learning_rate: [0.0002447174185242328]
Validation loss: 0.21, Time: 163.87330222129822
{'eval_accuracy': 0.9383534402757524, 'eval_precision': 0.9370081576788303, 'eval_recall': 0.9383534402757524, 'eval_f-1': 0.9371448069961689}
Epoch #19


Epoch:  38%|███████████████████████████▊                                              | 135/360 [08:21<13:55,  3.71s/it]

In [11]:
# Test on Test Set

In [None]:
_, predictions = validate(model, test_loader)

Epoch:  94%|███████████████████████████████████████████████████████████████    | 95/101 [03:58<00:15,  2.65s/it]

In [None]:
temp = predictions
print(len(predictions))
predictions = np.concatenate([[0], predictions, [0]])
print(len(predictions))

In [None]:
test_df = pd.read_csv(test_file)
test_df['predictions'] = predictions
test_df.to_csv("bert_embed_triplet_iob_pred.csv")

In [None]:
error = test_df[test_df['label'] != test_df['predictions']]
print((len(test_df)- len(error)) / len(test_df))

In [31]:
print(len(error))
error.to_csv("errors_bert_embed_triplet.csv")

778


In [32]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [33]:
error 
# 1. errors are on table rows which contain chemical name independently (without a lot of other content) 
# just like chemical name headings for reaction starts
# 2. Reaction headings that do not contain chemical names: EXAMPLE 3. Selective deprotection of position 6.\n
# 3. Errors on B tags (label 2) 3A: Sometimes paras that are like this:
# Example 15: N-(5-(4-(5-bromo-3-methyl-2-oxo-2,3-dihydro-1H-benzo[d]imidazole -1-yl)pyrimidin-2-ylamino)-2-((2-(dimethylamino)ethyl)(methyl)amino)-4-methoxyphenyl) acrylamide hydrochloride\n
# are tagged as outer (0) or Beginning (2) in the gold standard. Model probably needs more context
# 3B : Headings (tag B (2)) such as 1H NMR of GLP-111: DMSO-d6, δ 1.56-1.57 (br, m, 9H, 3CH2), 1.61-1.63 (br, m, 2H, CH2), 2.01 (brs, 4H, CH2), 2.36 (br, 1H, NH—CH2), 3.22 (brs, 2H, CH2), 5.77 (s, 1H, OH), 6.44-6.46 (dd, 1H, Arom-H), 6.92-6.93 (dd, 1H, Arom-H) 7.06-7.09 (t, 1H, Arom-H), 7.21-7.22 (t, 1H, Arom-H), 9.40 (br, s, 1H, NH), 9.74 (br, s, 1H, NH).\n
# that contain properties of chemicals are classified as 0 because in most cases, the paras that just contain the properites
# are not tagged as reaction paras
# 4. Long paras containing a lot of chemical names are tagged as 2 even tho they are 0
# 5. Example 4m\n	 : tagged as 2 even tho they are 0
# 6. Tables inside reactions are not recognized as reactions (tagged as 0 instead of 1)

Unnamed: 0,para,label,document,predictions
70,ZD001 & 2-fluoro-5-(pyridin-2-ethynyl)-N-(4-fluor ophenyl)benzamide & <img> id-imgb0019.tif </img>\n,0,EP3284738A1.txt,2
71,ZD002 & (2-chloro-5-(pyridin-2-ethynyl)phenyl)(7-oxa-2-aza-spiro[3.5]nonan-2-yl)methanone & <img> id-imgb0020.tif </img>\n,0,EP3284738A1.txt,2
73,ZD004 & 2-fluoro-N-(4-fluorophenyl)-5-((2-methylt hiazol-4-yl)ethynyl)benzamide & <img> id-imgb0022.tif </img>\n,0,EP3284738A1.txt,2
74,"ZD005 & (3-((2-methylthiazol-4-yl)ethynyl)phenyl) (3-trifluoromethyl-5,6-dihydro-[1,2,4]tria zolo[4,3-a]pyrazin-7(8H)-yl)methanone & <img> id-imgb0023.tif </img>\n",0,EP3284738A1.txt,2
75,ZD006 & 5-((1H-indazol-5-yl)ethynyl)-2-fluoro-N-( 4-fluorophenyl)benzamide & <img> id-imgb0024.tif </img>\n,0,EP3284738A1.txt,2
76,ZD007 & (3-((2-methylthiazol-4-yl)phenyl)(7-oxa-2 -aza-spiro[3.5]nonan-2-yl)methanone & <img> id-imgb0025.tif </img>\n,0,EP3284738A1.txt,2
77,ZD008 & (3-((2-methylthiazol-4-yl)phenyl)(8-oxa-2 -aza-spiro[4.5]dec-2-yl)methanone & <img> id-imgb0026.tif </img>\n,0,EP3284738A1.txt,2
78,ZD009 & N-(4-fluorophenyl)-3-((2-methylthiazol-4-yl)ethynyl)benzamide & <img> id-imgb0027.tif </img>\n,0,EP3284738A1.txt,2
79,ZD010 & N-(4-cyanophenyl)-3-((2-methylthiazol-4-yl)ethynyl)benzamide & <img> id-imgb0028.tif </img>\n,0,EP3284738A1.txt,2
83,ZD014 & 3-((2-methylthiazol-4-ylethynyl)-N-pheny lbenzamide & <img> id-imgb0032.tif </img>\n,0,EP3284738A1.txt,2
