# Training code for MeasuredEntity and HasQuantity Extraction


In [1]:
!pip install transformers
!pip install nltk
!pip install cookiecutter
!pip install stanza
!pip install scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip install pytorch-crf

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 8.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 51.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 55.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=a177ba460da

In [2]:
import os
import io
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertForPreTraining
from sklearn.model_selection import train_test_split
import stanza
import spacy
import re
import en_core_sci_sm
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel
from transformers import BertForTokenClassification, AdamW
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, AdamW
from torchcrf import CRF

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




### Mounting the drive containing the training and dev data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_path = "/content/drive/MyDrive/final_data/train/"
val_path = "/content/drive/MyDrive/final_data/dev/"

In [7]:
files=os.listdir(train_path + "text")
files_val = os.listdir(val_path + "text")

#### Function to split the scientific articles into sentences

In [8]:
nlp = en_core_sci_sm.load()
def sen_split(text):
  doc = nlp(text)
  sen = [s.text for s in doc.sents]
  return sen

### Training data preparation

In [9]:
ans = []
itr = 0
for fil in files:
  pathtxt=os.path.join(train_path+"text",fil)
  pathtsv=os.path.join(train_path+"tsv",fil[:-3]+"tsv")
  if(not(os.path.exists(pathtsv))):
    continue
  filetsv=pd.read_csv(pathtsv,sep = '\t')
  tsv_lis = filetsv.values
  with open(pathtxt, 'r') as f:
    text_in=f.read()
  sen_lis = sen_split(text=text_in)
  len_p = 0
  for sen in sen_lis:
    for i in range(len(tsv_lis)):
      if tsv_lis[i][2] == "Quantity" and tsv_lis[i][3]>=len_p and tsv_lis[i][4]<=len_p+len(sen):
        val = []
        val.append(sen)
        val.append((tsv_lis[i][3] - len_p, tsv_lis[i][4] - len_p))
        c = 0
        for j in range(len(tsv_lis)):
          if tsv_lis[j][2] == "MeasuredEntity" and tsv_lis[j][3]>=len_p and tsv_lis[j][4]<=len_p+len(sen) and tsv_lis[i][1] == tsv_lis[j][1]:
            c += 1
            val.append((tsv_lis[j][3] - len_p, tsv_lis[j][4] - len_p))
            ans.append(val)
        if c == 0:
          ans.append(val)
    len_p += len(sen)+1
  print(itr)
  print(fil)
  itr += 1

0
S0167577X14001256-389.txt
1
S030881461301604X-1001.txt
2
S0006322312001096-1136.txt
3
S095741741101342X-2624.txt
4
S0927024813002961-1334.txt
5
S0019103512001388-3081.txt
6
S0960896612001022-1223.txt
7
S0012821X12004384-1265.txt
8
S2213671113001306-885.txt
9
S0378383912000130-3745.txt
10
S0038071713001971-1427.txt
11
S0168945213001805-4775.txt
12
S0032063312003054-2467.txt
13
S0921818113002245-882.txt
14
S0167880913001229-1304.txt
15
S095741741101342X-726.txt
16
S0032063312003054-2501.txt
17
S2213671113001306-1398.txt
18
S0016236113008041-961.txt
19
S2211124712002884-1060.txt
20
S0012821X12004384-1249.txt
21
S0019103512003995-1283.txt
22
S2213671113000738-684.txt
23
S0022399913003358-943.txt
24
S0019103512004009-3976.txt
25
S0927024813002420-1032.txt
26
S2213671113000738-738.txt
27
S0378383912000130-3827.txt
28
S0960148113004989-3258.txt
29
S175058361300203X-1280.txt
30
S0012821X12004384-1148.txt
31
S0016236113008041-872.txt
32
S0378383912000130-1096.txt
33
S0378112713005288-1720.txt

In [10]:
ans_val = []
for fil in files_val:
  pathtxt=os.path.join(val_path+"text",fil)
  pathtsv=os.path.join(val_path+"tsv",fil[:-3]+"tsv")
  if(not(os.path.exists(pathtsv))):
    continue
  filetsv=pd.read_csv(pathtsv,sep = '\t')
  tsv_lis = filetsv.values
  with open(pathtxt, 'r') as f:
    text_in=f.read()
  sen_lis = sen_split(text=text_in)
  len_p = 0
  for sen in sen_lis:
    for i in range(len(tsv_lis)):
      if tsv_lis[i][2] == "Quantity" and tsv_lis[i][3]>=len_p and tsv_lis[i][4]<=len_p+len(sen):
        val = []
        val.append(sen)
        val.append((tsv_lis[i][3] - len_p, tsv_lis[i][4] - len_p))
        c = 0
        for j in range(len(tsv_lis)):
          if tsv_lis[j][2] == "MeasuredEntity" and tsv_lis[j][3]>=len_p and tsv_lis[j][4]<=len_p+len(sen) and tsv_lis[i][1] == tsv_lis[j][1]:
            c += 1
            val.append((tsv_lis[j][3] - len_p, tsv_lis[j][4] - len_p))
            ans_val.append(val)
        if c == 0:
          ans_val.append(val)
    len_p += len(sen)+1
  print(fil)

S0016236113008041-3186.txt
S0019103512004009-3825.txt
S0019103512004009-5019.txt
S0019103512003533-4685.txt
S0016236113008041-967.txt
S0012821X12004384-1594.txt
S0019103512004009-4007.txt
S0019103513005058-3094.txt
S0021979713004438-1969.txt
S0012821X13002185-1231.txt
S0012821X13007309-1989.txt
S0012821X13007309-1605.txt
S0016236113008041-890.txt
S0019103511004994-1511.txt
S0022459611006116-1160.txt
S0016236113008041-3171.txt
S0022459611006116-1195.txt
S0019103511004994-1382.txt
S0019103512003533-3299.txt
S0012821X13002185-1200.txt


In [13]:
stanza.download('en')
nlp2 = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 44.8MB/s]                    
2021-04-04 11:49:12 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:13<00:00, 5.60MB/s]
2021-04-04 11:50:30 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-04-04 11:50:30 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-04-04 11:50:30 INFO: Use device: gpu
2021-04-04 11:50:30 INFO: Loading: tokenize
2021-04-04 11:50:41 INFO: Done loading processors!


In [14]:
def length(text):
  l = 0
  for i in range(len(text)):
    if text[i] != '#':
      l += 1
  return l

### Enclosing the Quantity span in the $ symbol

In [15]:
pretrain = []
i = 0
for val in ans:
  label_q = []
  sen_tok = []
  sen = nlp2(val[0])
  label_en = []
  if len(val) == 3:
    for w in sen.sentences[0].words[:]:
      rng = re.findall(r'\d+', w.misc) 
      bert_tok = tokenizer.tokenize(w.text)
      sen_tok.extend(bert_tok)
      w_len = int(rng[0]);
      for t in bert_tok:
        if val[1][0] <= w_len and val[1][1] >= w_len + length(t):
          label_q.append(1)
        else:
          label_q.append(0)
        if val[2][0] <= w_len and val[2][1] >= w_len + length(t):
          label_en.append(1)
        else:
          label_en.append(0)
        w_len += length(t)
    for i in range(len(label_q)):
      if label_q[i] == 1 and i == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      elif label_q[i] == 1 and label_q[i-1] == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      if label_q[i] == 1 and i == len(label_q) - 1:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
      elif label_q[i] == 1 and label_q[i+1] == 0:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
    pretrain.append([sen_tok, label_en])
  else:
    for w in sen.sentences[0].words[:]:
      rng = re.findall(r'\d+', w.misc) 
      bert_tok = tokenizer.tokenize(w.text)
      sen_tok.extend(bert_tok)
      w_len = int(rng[0]);
      for t in bert_tok:
        if val[1][0] <= w_len and val[1][1] >= w_len + length(t):
          label_q.append(1)
        else:
          label_q.append(0)
        label_en.append(0)
        w_len += length(t)
    for i in range(len(label_q)):
      if label_q[i] == 1 and i == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      elif label_q[i] == 1 and label_q[i-1] == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      if label_q[i] == 1 and i == len(label_q) - 1:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
      elif label_q[i] == 1 and label_q[i+1] == 0:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
    pretrain.append([sen_tok, label_en])

In [16]:
validation = []
i = 0
for val in ans_val:
  label_q = []
  sen_tok = []
  sen = nlp2(val[0])
  label_en = []
  if len(val) == 3:
    for w in sen.sentences[0].words[:]:
      rng = re.findall(r'\d+', w.misc) 
      bert_tok = tokenizer.tokenize(w.text)
      sen_tok.extend(bert_tok)
      w_len = int(rng[0]);
      for t in bert_tok:
        if val[1][0] <= w_len and val[1][1] >= w_len + length(t):
          label_q.append(1)
        else:
          label_q.append(0)
        if val[2][0] <= w_len and val[2][1] >= w_len + length(t):
          label_en.append(1)
        else:
          label_en.append(0)
        w_len += length(t)
    for i in range(len(label_q)):
      if label_q[i] == 1 and i == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      elif label_q[i] == 1 and label_q[i-1] == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      if label_q[i] == 1 and i == len(label_q) - 1:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
      elif label_q[i] == 1 and label_q[i+1] == 0:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
    validation.append([sen_tok, label_en])
  else:
    for w in sen.sentences[0].words[:]:
      rng = re.findall(r'\d+', w.misc) 
      bert_tok = tokenizer.tokenize(w.text)
      sen_tok.extend(bert_tok)
      w_len = int(rng[0]);
      for t in bert_tok:
        if val[1][0] <= w_len and val[1][1] >= w_len + length(t):
          label_q.append(1)
        else:
          label_q.append(0)
        label_en.append(0)
        w_len += length(t)
    for i in range(len(label_q)):
      if label_q[i] == 1 and i == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      elif label_q[i] == 1 and label_q[i-1] == 0:
        label_en.insert(i, 0)
        sen_tok.insert(i,"$")
      if label_q[i] == 1 and i == len(label_q) - 1:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
      elif label_q[i] == 1 and label_q[i+1] == 0:
        label_en.insert(i+2, 0)
        sen_tok.insert(i+2,"$")
    validation.append([sen_tok, label_en])

In [17]:
train = []
for val in pretrain:
  lab = np.zeros(256)
  tok_arr = np.zeros(256)
  att_mask = np.zeros(256)
  sen_tok = []
  tok_arr[0] = 102
  att_mask[0] = 1
  for tok in val[0]:
    sen_tok.append(tokenizer.convert_tokens_to_ids(tok))
  for i in range(len(val[1])):
    lab[i+1] = val[1][i]
    tok_arr[i+1] = sen_tok[i]
    att_mask[i+1] = 1
  train.append([tok_arr,att_mask, lab])

In [18]:
val_data = []
for val in validation:
  lab = np.zeros(256)
  tok_arr = np.zeros(256)
  att_mask = np.zeros(256)
  sen_tok = []
  tok_arr[0] = 102
  att_mask[0] = 1
  for tok in val[0]:
    sen_tok.append(tokenizer.convert_tokens_to_ids(tok))
  for i in range(len(val[1])):
    lab[i+1] = val[1][i]
    tok_arr[i+1] = sen_tok[i]
    att_mask[i+1] = 1
  val_data.append([tok_arr,att_mask, lab])

In [19]:
x_train_id = np.zeros((0,256))
x_train_mask = np.zeros((0,256))
y_train = np.zeros((0,256))
for val in train:
  x_train_id = np.vstack((x_train_id, val[0]))
  x_train_mask = np.vstack((x_train_mask, val[1]))
  y_train = np.vstack((y_train, val[2]))

In [20]:
for i in range(len(y_train)):
  for j in range(1,len(y_train[0])):
    if y_train[i][j-1] == 0 and y_train[i][j] == 1:
      y_train[i][j] = 2

In [21]:
x_val_id = np.zeros((0,256))
x_val_mask = np.zeros((0,256))
y_val = np.zeros((0,256))
for val in val_data:
  x_val_id = np.vstack((x_val_id, val[0]))
  x_val_mask = np.vstack((x_val_mask, val[1]))
  y_val = np.vstack((y_val, val[2]))

In [22]:
for i in range(len(y_val)):
  for j in range(1,len(y_val[0])):
    if y_val[i][j-1] == 0 and y_val[i][j] == 1:
      y_val[i][j] = 2

In [23]:
train_data = TensorDataset(torch.from_numpy(x_train_id), torch.from_numpy(x_train_mask), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(x_val_id), torch.from_numpy(x_val_mask), torch.from_numpy(y_val))

In [24]:
batch_size = 24
train_loader = DataLoader(train_data, shuffle=True, batch_size = batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size = batch_size)

In [25]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




### Overall Model Architechture of SciBERT + CRF Model

In [26]:
class BERT_Arch(nn.Module):

    def __init__(self, bert, embed_dim, hidden_dim, drop_prob, n_layers, out_dim):
      
      super(BERT_Arch, self).__init__()
      self.bert = bert 
      self.dropout = nn.Dropout(drop_prob)
      self.fc1 = nn.Linear(2*embed_dim,out_dim)
      self.w1 = nn.Linear(embed_dim, embed_dim)
      self.w2 = nn.Linear(embed_dim, embed_dim)
      #self.bilstm = nn.LSTM(embed_dim, hidden_dim,  bidirectional=True, batch_first=True)
      self.softmax = nn.LogSoftmax(dim = 2)
      self.crf = CRF(3, batch_first=True)  
      self.tanh = nn.Tanh()

    #define the forward pass
    def forward(self, sent_id, mask_val, labels=None):
      x = self.bert(sent_id, attention_mask=mask_val)
      x = x.last_hidden_state
      x = self.tanh(x)
      cls = x[:,0,:]
      cls = cls.unsqueeze(1).repeat(1, 256, 1)
      cls = self.w1(cls)
      x = self.w2(x)
      x = torch.cat([x,cls], dim = 2)
      #x,_ = self.bilstm(x)
      x = self.dropout(x)
      x = self.fc1(x)
      mask_val = mask_val.type(torch.uint8)
      logit = self.softmax(x)
      if labels is not None:
          loss = -self.crf(logit, labels, mask=mask_val, reduction='mean')
          return loss
      else:
          prediction = self.crf.decode(x, mask=mask_val)
          return prediction

In [27]:
bert_model = BERT_Arch(model, 768, 64, 0.1, 1,3)
bert_model = bert_model.to(device)

In [28]:
print(bert_model)

BERT_Arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [29]:
optimizer = torch.optim.Adam(bert_model.parameters(), lr=1e-5)

### Score metric very much similar to F1-Overlap score of the Organizers

In [30]:
def score(val_loader):
  p=0
  n=0
  pos=0
  neg=0
  prec_num = 0 
  prec_den = 0
  rec_num = 0
  rec_den = 0 
  p1 = 0
  n1=0
  for seq, mask, y in val_loader:
      bert_model.zero_grad()
      bert_model.eval()
      y_pred = bert_model(seq.long().to(device), mask.long().to(device))
      np_out = np.zeros((len(y_pred),256, 1))
      np_act = (y.cpu().data.numpy() >= 1).astype(int).reshape((len(y), len(y[0]),1))
      
      for i in range(np_out.shape[0]):
        prev_2 = 0
        for j in range(np_out.shape[1]):
          for k in range(np_out.shape[2]):
            if j >= len(y_pred[i]):
              np_out[i][j][k] = 0
            elif y_pred[i][j] == 2:
              np_out[i][j][k]=1
              prev_2 = 1
            elif y_pred[i][j] == 1 and prev_2 == 1:
              np_out[i][j][k]=1
            else:
              prev_2 = 0
              np_out[i][j][k]=0

      for i in range(np_out.shape[0]):
        for j in range(np_out.shape[1]):
          if np.max(np_out[i,j,:])==1 or np.max(np_act[i,j,:])==1:
            if np.max(np_out[i,j,:])==1 and np.max(np_act[i,j,:])==1:
              p1 = p1 + 1
            else:
              n1 = n1 + 1

      for i in range(np_out.shape[0]):
        for j in range(np_out.shape[1]):
          for k in range(np_out.shape[2]):
            if np_out[i,j,k] == np_act[i,j,k]:
              p = p +1
            else:
              n=n+1
            if (np_out[i,j,k])==1 or (np_act[i,j,k])==1:
              if (np_out[i,j,k])==1 and (np_act[i,j,k])==1:
                pos = pos + 1
              else:
                neg = neg + 1
            if np_out[i,j,k]==1:
              if np_out[i,j,k]==np_act[i,j,k]:
                prec_num = prec_num + 1
              else:
                prec_den = prec_den + 1
            if np_act[i,j,k]==1:
              if np_out[i,j,k]==np_act[i,j,k]:
                rec_num = rec_num + 1
              else:
                rec_den = rec_den + 1
  if prec_num+prec_den == 0:
    return 0
  precision = prec_num/(prec_num+prec_den)
  if rec_num+rec_den == 0:
    return 0
  recall = rec_num/(rec_num+rec_den)
  if precision == 0 and recall == 0:
    return 0
  F1 = 2*precision*recall/(precision+recall)
  return F1

### Loading a pretrained model 

In [41]:
bert_model = torch.load("/content/drive/My Drive/scibert_base_measured_entity_crf_0.56.pt")

### Uncomment the below code if you want to train the Custom Model

In [40]:
# epochs = 10
# for e in range(epochs):
  
#   bert_model.train()
#   i=0
#   train_loss=0
#   for seq, mask, y in train_loader:
#     bert_model.zero_grad()
#     loss = bert_model(seq.long().to(device), mask.long().to(device), y.long().to(device))
#     train_loss += loss.item()*batch_size
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)
#     optimizer.step()
#     if(i%5==0):
#       print("Epoch-{}/{} Iterations-{} loss-{}".format(e+1,epochs,i+1,loss.item()))
#     i+=1
  
  
#   bert_model.eval()
#   f1_score = score(val_loader)
#   val_loss=0
#   for seq, mask, y in val_loader:
#     bert_model.zero_grad()
#     loss = bert_model(seq.long().to(device), mask.long().to(device), y.long().to(device))
#     val_loss += loss.item()*batch_size
#     i+=1
  
#   print("Epoch-{}/{} train_loss-{} Val_loss-{} F1 Score-{}".format(e+1,epochs,train_loss/len(train_loader),val_loss/len(val_loader), f1_score))

### The F1-Score obtained on the dev dataset using the score metric defined above

In [42]:
p=0
n=0
pos=0
neg=0
prec_num = 0 
prec_den = 0
rec_num = 0
rec_den = 0 
p1 = 0
n1=0
for seq, mask, y in val_loader:
    bert_model.zero_grad()
    bert_model.eval()
    y_pred = bert_model(seq.long().to(device), mask.long().to(device))
    np_out = np.zeros((len(y_pred),256, 1))
    np_act = (y.cpu().data.numpy() >= 1).astype(int).reshape((len(y), len(y[0]),1))
    
    for i in range(np_out.shape[0]):
      prev_2 = 0
      for j in range(np_out.shape[1]):
        for k in range(np_out.shape[2]):
          if j >= len(y_pred[i]):
            np_out[i][j][k] = 0
          elif y_pred[i][j] == 2:
            np_out[i][j][k]=1
            prev_2 = 1
          elif y_pred[i][j] == 1 and prev_2 == 1:
            np_out[i][j][k]=1
          else:
            prev_2 = 0
            np_out[i][j][k]=0

    for i in range(np_out.shape[0]):
      for j in range(np_out.shape[1]):
        if np.max(np_out[i,j,:])==1 or np.max(np_act[i,j,:])==1:
          if np.max(np_out[i,j,:])==1 and np.max(np_act[i,j,:])==1:
            p1 = p1 + 1
          else:
            n1 = n1 + 1

    for i in range(np_out.shape[0]):
      for j in range(np_out.shape[1]):
        for k in range(np_out.shape[2]):
          if np_out[i,j,k] == np_act[i,j,k]:
            p = p +1
          else:
            n=n+1
          if (np_out[i,j,k])==1 or (np_act[i,j,k])==1:
            if (np_out[i,j,k])==1 and (np_act[i,j,k])==1:
              pos = pos + 1
            else:
              neg = neg + 1
          if np_out[i,j,k]==1:
            if np_out[i,j,k]==np_act[i,j,k]:
              prec_num = prec_num + 1
            else:
              prec_den = prec_den + 1
          if np_act[i,j,k]==1:
            if np_out[i,j,k]==np_act[i,j,k]:
              rec_num = rec_num + 1
            else:
              rec_den = rec_den + 1

precision = prec_num/(prec_num+prec_den)
recall = rec_num/(rec_num+rec_den)
F1 = 2*precision*recall/(precision+recall)

print("Entity recognition modified accuracy:-" + str(p1/(p1+n1)))
print("--------NER RESULTS--------")
print("Accuracy:-" + str(p/(n+p)))
print("Modified Accuracy:-" + str(pos/(pos+neg)))
print("Precision:-" + str(precision))
print("Recall:-" + str(recall))
print("F1 score:-"+str(F1))

Entity recognition modified accuracy:-0.3830409356725146
--------NER RESULTS--------
Accuracy:-0.9910411005434783
Modified Accuracy:-0.3830409356725146
Precision:-0.5954545454545455
Recall:-0.5177865612648221
F1 score:-0.5539112050739958


### Testing the model on custom sentences

In [43]:
t = "The concentration and strength of H2SO4 is $ 2g/ml $ and 98%"

In [44]:
tok_lis = tokenizer.tokenize(t)
tok_arr = np.zeros(256)
att_mask = np.zeros(256)
att_mask[0] = 1
tok_arr[0] = 102
sen_tok = []
for tok in tok_lis:
  sen_tok.append(tokenizer.convert_tokens_to_ids(tok))
for i in range(len(sen_tok)):
  tok_arr[i+1] = sen_tok[i]
  att_mask[i+1] = 1
tok_arr = torch.from_numpy(tok_arr)
att_mask = torch.from_numpy(att_mask)

In [45]:
bert_model.zero_grad()
bert_model.eval()
y_pred = bert_model(tok_arr.reshape((1,256)).long().to(device), att_mask.reshape((1,256)).long().to(device))

In [46]:
tokenized_sen = tokenizer.tokenize(t)

In [47]:
i = 1
for word in tokenized_sen:
  if y_pred[0][i] != 0:
    print(word + ": "+"MeasuredEntity")
  else:
    print(word + ": ")
  i+=1

the: 
concentration: 
and: 
strength: 
of: 
h: MeasuredEntity
##2: MeasuredEntity
##so: MeasuredEntity
##4: MeasuredEntity
is: 
$: 
2: 
##g: 
/: 
ml: 
$: 
and: 
98: 
%: 
