In [2]:
# Install the transformers version this notebook pins; all pip output is discarded.
!pip install transformers==4.1.1 &> /dev/null
import importlib
import sys
# Reload the already-imported pkg_resources module -- presumably so the packages
# installed above are picked up without restarting the kernel (TODO confirm).
# NOTE(review): raises KeyError if pkg_resources has not been imported yet.
importlib.reload(sys.modules['pkg_resources'])

# Looks up the installed `tokenizers` version; the value is not displayed because
# this is not the last expression of the cell.
sys.modules['pkg_resources'].get_distribution('tokenizers').version

import transformers
# Last expression of the cell: displays the transformers version in use.
transformers.__version__

In [3]:
!pip install mendelai-brat-parser &> /dev/null
from brat_parser import get_entities_relations_attributes_groups

import torch 
import spacy
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

#!pip uninstall nltk &> /dev/null
#!pip install -U nltk &> /dev/null
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('punkt')

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm, trange

# Sequence length (in tokenizer tokens) that every example is padded/truncated to.
MAX_LEN = 75
# Mini-batch size handed to the training and validation DataLoaders.
batch_size = 32

from google.colab import drive
# Mount Google Drive; the data-extraction loop below reads from /content/drive/MyDrive.
drive.mount('/content/drive')

# Task 1: NER

## Data Extraction

# Build a token-level table (one row per NLTK word token) carrying a BIO tag
# derived from the brat .ann annotations, then save it as CSV.
#
# Changes compared with the first version of this cell:
#   * rows are collected in a list and the DataFrame is built once
#     (DataFrame.append in a loop is quadratic and was removed in pandas 2.0);
#   * the character offset search resumes *after* the previous token and never
#     moves backwards when a token cannot be found verbatim in the raw text
#     (word_tokenize rewrites some tokens, e.g. double quotes);
#   * files are visited in sorted order so Doc_Id / Sent_Id / Id are reproducible;
#   * the counter no longer shadows the builtin `id`.
columns = ["Id","Doc_Id","Sent_Id","Word","Tag"]
rows = []

sent_id = 0
token_id = 0
doc_id = 0
for path in sorted(glob("/content/drive/MyDrive/CMED/Data/trainingdata_v3/train/*")):

  # Annotation files are read through their matching text file below.
  if path.endswith(".ann"):
    continue

  doc_id += 1
  if doc_id % 25 == 0:
    print(doc_id,end = ' ')

  with open(path) as txt:
    raw = txt.read()

  entities,_,_,_ = get_entities_relations_attributes_groups(path.replace(".txt",".ann"))
  entl = []
  for ent in entities.values():
    # A discontinuous entity (several spans) is collapsed to the single span
    # running from its first start offset to its last end offset.
    entl.append([ent.span[0][0],ent.span[-1][-1],ent.text,ent.type])
  # Descending order so that entl[-1] / entl.pop() is always the earliest entity.
  entl = sorted(entl)[::-1]

  cursor = 0  # first character of `raw` not yet consumed by a token
  for sent in sent_tokenize(raw):
    sent_id += 1
    for word in word_tokenize(sent):
      token_id += 1

      found = raw.find(word,cursor)
      if found != -1:
        span = found
        cursor = found + len(word)
      else:
        # Token text was altered by the tokenizer: keep the current position
        # instead of shifting the offset by -1.
        span = cursor

      tag = 'O'
      while len(entl)>0 and entl[-1][0] <= span:
        if entl[-1][1] <= span:
          # This entity ends before the current token: discard it and look at the next one.
          entl.pop()
          continue
        # Token lies inside the entity: B- when it starts exactly at the entity start, I- otherwise.
        tag = ("B-" if span == entl[-1][0] else "I-") + entl[-1][3]
        break

      rows.append((token_id,doc_id,sent_id,word,tag))

train = pd.DataFrame(rows,columns = columns)
train.to_csv("TRAIN_CMED_NER_BIO2.csv")

## Data Preprocessing

In [4]:
# Load the cached token table, discard rows whose word was parsed as missing,
# drop the CSV's old index column and index the frame by token Id.
train = pd.read_csv("../input/topsecret/TRAIN_CMED_NER_BIO2.csv")
train_ner = (
    train
    .dropna(subset=["Word"])
    .drop(columns="Unnamed: 0")
    .set_index("Id")
)
# Keep only the first character of each tag ("B-xxx" -> "B", "I-xxx" -> "I", "O" -> "O").
train_ner["Tag"] = train_ner["Tag"].map(lambda tag: tag[0])
del train
train_ner.head()

In [5]:
# Re-assemble the token table into sentences: one list of (word, tag) pairs per Sent_Id.
grp_obj = train_ner.groupby("Sent_Id").apply(lambda x:[(word,label) for word,label in zip(x["Word"],x["Tag"])])

sentences = [[word for word,_ in sent] for sent in grp_obj]
labels = [[tag for _,tag in sent] for sent in grp_obj]

# Word-level originals, kept so each model section can re-tokenize with its own tokenizer.
saved_sentences = sentences
saved_labels = labels

# Sorted so the tag -> index mapping is identical on every run; iterating a set
# of strings directly gives a run-dependent order, which would make saved
# checkpoints and confusion-matrix axes inconsistent between sessions.
tag_values = sorted(set(train_ner["Tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [6]:
# Tokenizer for the cased BERT checkpoint; do_lower_case=False leaves the casing of the input untouched.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False) 

def get_tokenized_sentences_and_labels(word_sentences = None,word_labels = None,subword_tokenizer = None):
  """Split every word into tokenizer pieces and repeat its tag once per piece.

  Args:
    word_sentences: list of sentences, each a list of words. Defaults to the
      module-level ``sentences``.
    word_labels: list of tag lists aligned with ``word_sentences``. Defaults
      to the module-level ``labels``.
    subword_tokenizer: object exposing ``tokenize(word) -> list``. Defaults to
      the module-level ``tokenizer``.

  Returns:
    ``(sent, lab)``: two lists of lists of equal shape, holding the pieces of
    each sentence and the tag of the word each piece came from.
  """
  if word_sentences is None:
    word_sentences = sentences
  if word_labels is None:
    word_labels = labels
  if subword_tokenizer is None:
    subword_tokenizer = tokenizer

  sent = []
  lab = []

  for words,tags in zip(word_sentences,word_labels):
    pieces = []
    piece_tags = []
    for word,tag in zip(words,tags):
      temp = subword_tokenizer.tokenize(word)
      pieces.extend(temp)
      # Repeat the tag as a list element; multiplying the tag *string* and
      # extending with it only worked for one-character tags.
      piece_tags.extend([tag] * len(temp))
    sent.append(pieces)
    lab.append(piece_tags)
  return sent,lab
  
# Replace the word-level sentences/labels with their tokenizer-piece versions
# (the word-level originals stay available as saved_sentences / saved_labels).
sentences,labels = get_tokenized_sentences_and_labels()

In [7]:
def _as_object_array(rows):
  """Return a 1-D object array whose elements are the given lists.

  Filled element by element so numpy never tries to turn equal-length rows
  into a 2-D array.
  """
  out = np.empty(len(rows),dtype = object)
  for pos,row in enumerate(rows):
    out[pos] = row
  return out


def normalize_longer_sequences():
  """Replace every sequence longer than MAX_LEN by overlapping windows.

  Reads and rebinds the module-level ``sentences`` and ``labels``. Sequences of
  at most MAX_LEN tokens are kept (in their original order); each longer one
  is replaced by windows of MAX_LEN tokens taken every MAX_LEN // 2 tokens,
  appended after the kept sequences. The last window always reaches the end
  of the sequence, so no token is dropped.

  Fixes over the previous version: the window width followed a literal 75
  instead of MAX_LEN, the window count silently discarded the tail of most
  long sequences, and concatenating equal-length windows with the object array
  of kept sequences raised a dimension error.
  """
  global sentences,labels

  stride = max(1,MAX_LEN // 2)

  kept_sent,kept_lab = [],[]
  window_sent,window_lab = [],[]

  for tokens,tags in zip(sentences,labels):
    tokens,tags = list(tokens),list(tags)
    if len(tokens) <= MAX_LEN:
      kept_sent.append(tokens)
      kept_lab.append(tags)
      continue

    start = 0
    while True:
      window_sent.append(tokens[start:start + MAX_LEN])
      window_lab.append(tags[start:start + MAX_LEN])
      if start + MAX_LEN >= len(tokens):
        break
      start += stride

  sentences = _as_object_array(kept_sent + window_sent)
  labels = _as_object_array(kept_lab + window_lab)

# Rebinds the global sentences/labels (see the function's `global` statement).
normalize_longer_sequences()

# Sanity check: every token sequence must still line up with its label sequence.
for i,j in zip(sentences,labels):
  assert(len(i) == len(j))

In [8]:
# Token pieces -> vocabulary ids, right-padded / right-truncated to MAX_LEN with id 0.
token_id_rows = [tokenizer.convert_tokens_to_ids(pieces) for pieces in sentences]
input_ids = pad_sequences(
    token_id_rows,
    maxlen=MAX_LEN,
    dtype="long",
    value=0.0,
    padding="post",
    truncating="post",
)

# Tags -> class indices, padded with the index of the "PAD" tag.
tag_id_rows = [[tag2idx.get(tag) for tag in sentence_tags] for sentence_tags in labels]
tags = pad_sequences(
    tag_id_rows,
    maxlen=MAX_LEN,
    dtype="long",
    value=tag2idx["PAD"],
    padding="post",
    truncating="post",
)

# 1.0 wherever the input id is non-zero, 0.0 on the padded positions.
attention_masks = [[float(token_id != 0.0) for token_id in row] for row in input_ids]

In [9]:
# 85/15 train/validation split with a fixed seed; every part is converted to a
# torch tensor as it is unpacked (same order as train_test_split returns them).
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = (
    torch.tensor(part)
    for part in train_test_split(input_ids, tags, attention_masks, random_state=0, test_size=0.15)
)

In [10]:
# Training batches: (input ids, attention mask, tag ids), visited in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches: same tuple layout, visited in a fixed sequential order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=batch_size, sampler=valid_sampler)

## Model

### BERT

#### Utility procedures for training BERT

In [11]:
from sklearn.metrics import confusion_matrix

def flat_accuracy(preds, labels, ignore_index=None):
    """Token-level accuracy of arg-max predictions.

    Args:
        preds: array of per-class scores; the class is taken as the arg-max
            over axis 2.
        labels: array of integer class ids with the shape of ``preds`` minus
            its last axis.
        ignore_index: optional class id to leave out of the score (e.g. the
            padding class). ``None`` (default) scores every position, which is
            the original behaviour.

    Returns:
        Fraction of scored positions whose prediction equals the label, or
        0.0 when there is nothing to score (instead of dividing by zero).
    """
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = np.asarray(labels).flatten()
    if ignore_index is not None:
        keep = labels_flat != ignore_index
        pred_flat, labels_flat = pred_flat[keep], labels_flat[keep]
    if labels_flat.size == 0:
        return 0.0
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def plot_confusion_matrix(cnf_matrix,title = None):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Rows are labelled "True Class" and columns "Predicted Class"; tick labels
    come from the module-level ``tag_values``. The number of ticks follows the
    size of ``cnf_matrix`` instead of a hard-coded 4.

    Args:
        cnf_matrix: square 2-D array of counts.
        title: optional figure title.
    """
    n_classes = len(cnf_matrix)
    tick_positions = [k + 0.5 for k in range(n_classes)]  # centre of each heat-map cell
    tick_labels = [tag_values[k] for k in range(n_classes)]

    plt.figure(figsize = (3.3,3.3))
    sns.heatmap(cnf_matrix,annot = True,cbar = False)
    plt.xticks(tick_positions)
    plt.yticks(tick_positions)
    plt.xlabel("Predicted Class")
    plt.ylabel("True Class")
    ax = plt.gca()
    ax.set_xticklabels(tick_labels)
    ax.set_yticklabels(tick_labels)
    # Put the predicted-class axis on top, as in a printed confusion table.
    ax.xaxis.set_ticks_position("top")
    ax.xaxis.set_label_position("top")
    if title:
        plt.title(title)

In [12]:
class Token_Classifier_With_Pretrained_Bert_Base(torch.nn.Module):
  """Token classifier: pretrained encoder -> dropout -> linear layer.

  Args:
    base: encoder module called as ``base(input_ids, attention_mask=...,
      token_type_ids=...)`` whose first output is the per-token hidden states.
    classes: number of output classes (default 4).
  """
  def __init__(self,base,classes = 4):
    super().__init__()
    self.n_classes = classes
    self.base = base
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    # Use the encoder's own hidden size when it exposes a config; 768 otherwise.
    hidden_size = getattr(getattr(base,"config",None),"hidden_size",768)
    self.classifier = torch.nn.Linear(in_features=hidden_size, out_features=classes, bias=True)
    self.criterion = torch.nn.CrossEntropyLoss()

  def forward(self,bert_input,attention_mask,token_type_ids = None,labels = None):
    """Return logits, or ``(loss, logits)`` when ``labels`` is given.

    The loss is an unweighted cross entropy over every position, flattened to
    ``(-1, n_classes)`` (previously a literal 4, which broke any other class
    count). ``token_type_ids`` is now forwarded to the encoder instead of
    being replaced by ``None``.
    """
    out = self.base(bert_input,attention_mask = attention_mask,token_type_ids = token_type_ids)[0]
    out = self.dropout(out)
    out = self.classifier(out)
    if labels is None:
      return out
    return (self.criterion(out.reshape(-1,self.n_classes),labels.reshape(-1)),out)

In [13]:
def finetune_procedure_bert_for_ner(model,epochs,dataloader_train,dataloader_valid,optimizer,criterion = None,scheduler = None,max_grad_norm = 1.0,early_stopping = False,patience = 5,stopping_criteria = None,PATH = None):
    """Fine-tune a token classifier and record losses and confusion matrices.

    Args:
        model: called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``; must return an indexable result
            with the loss at position 0 and the logits at position 1.
        epochs: maximum number of epochs.
        dataloader_train, dataloader_valid: yield ``(input_ids, mask, labels)``
            tensor batches.
        optimizer: optimizer stepped once per training batch.
        criterion: optional loss applied to the flattened logits/labels; when
            ``None`` the loss returned by the model is used.
        scheduler: optional LR scheduler stepped once per training batch.
        max_grad_norm: gradient-norm clipping threshold.
        early_stopping: when true, ``stopping_criteria`` and ``PATH`` are
            required; the model's state dict is saved to ``PATH`` whenever the
            validation score improves, and training stops after more than
            ``patience`` consecutive epochs without improvement.
        stopping_criteria: callable mapping the validation confusion matrix
            to a score (higher is better).
        PATH: destination of the best state dict.

    Returns:
        dict with the per-epoch average train/valid loss lists and the
        train/valid confusion matrices of the best epoch (early stopping) or
        of the last epoch. With early stopping the matrices stay ``None`` if
        the score never exceeds 0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    if early_stopping:
        assert(stopping_criteria is not None)
        assert(PATH is not None)
        curp = 0
        best_score = 0

    # Number of classes: custom classifiers expose n_classes, Hugging Face
    # heads expose num_labels; fall back to the notebook's 4 tags.
    n_classes = getattr(model,"n_classes",None) or getattr(model,"num_labels",None) or 4
    # Passing the full label list keeps every confusion matrix n_classes x n_classes
    # with a fixed row/column order. Without it sklearn only reports the classes
    # present in the batch, and padding such a matrix at the end puts counts in
    # the wrong cells whenever a low-index class is missing.
    class_ids = np.arange(n_classes)

    history = {
        "average_train_loss_per_batch" : [],
        "average_valid_loss_per_batch" : [],
        "train_confusion_matrix" : None,
        "valid_confusion_matrix" : None,
    }

    for epoch in range(1,1+epochs):
        print("Epoch number {}:".format(epoch))

        # ========================================
        #              Training
        # ========================================
        model.train()
        iterator_dataloader_train = iter(dataloader_train)
        train_loss = 0
        cnf_matrix = np.zeros((n_classes,n_classes))

        for _ in trange(len(dataloader_train)):

            batch = next(iterator_dataloader_train)
            train_input,train_mask,train_label = (t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(train_input, token_type_ids=None, attention_mask=train_mask, labels=train_label)

            if criterion is not None:
                loss = criterion(outputs[1].reshape(-1,n_classes),train_label.reshape(-1))
            else:
                loss = outputs[0]

            loss.backward()
            train_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(parameters = model.parameters(),max_norm = max_grad_norm)

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            cnf_matrix += confusion_matrix(
                train_label.detach().to("cpu").numpy().ravel(),
                outputs[1].detach().to("cpu").numpy().argmax(axis = -1).ravel(),
                labels = class_ids,
            )

        history["average_train_loss_per_batch"].append( train_loss / len(dataloader_train) )
        print("Average_Train_Loss_per_Batch:",history["average_train_loss_per_batch"][-1])
        plot_confusion_matrix(cnf_matrix,"Training_Epoch_{}".format(epoch))
        save_train_cnf_matrix = cnf_matrix

        # ========================================
        #              Validation
        # ========================================
        model.eval()
        iterator_dataloader_valid = iter(dataloader_valid)
        valid_loss = 0
        valid_accuracy = 0
        cnf_matrix = np.zeros((n_classes,n_classes))

        for _ in trange(len(dataloader_valid)):
            batch = next(iterator_dataloader_valid)
            valid_input,valid_mask,valid_label = (t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(valid_input, token_type_ids=None,
                                attention_mask=valid_mask, labels=valid_label)

            logits = outputs[1].detach().cpu().numpy()
            label_ids = valid_label.to('cpu').numpy()

            if criterion is not None:
                valid_loss += criterion(outputs[1].reshape(-1,n_classes),valid_label.reshape(-1)).item()
            else:
                valid_loss += outputs[0].item()

            valid_accuracy += flat_accuracy(logits, label_ids)
            cnf_matrix += confusion_matrix(
                label_ids.ravel(),
                logits.argmax(axis = -1).ravel(),
                labels = class_ids,
            )

        history["average_valid_loss_per_batch"].append( valid_loss / len(dataloader_valid) )
        print("Average_Valid_Loss_per_Batch: ",history["average_valid_loss_per_batch"][-1])
        print("Validation Accuracy: {}".format(valid_accuracy/len(dataloader_valid)))
        plot_confusion_matrix(cnf_matrix,"Validation_Epoch_{}".format(epoch))

        if early_stopping:
            current_score = stopping_criteria(cnf_matrix)
            # A NaN score compares false here and therefore counts as "no improvement".
            if best_score < current_score:
                torch.save(model.state_dict(), PATH)
                best_score = current_score
                curp = 0
                history["valid_confusion_matrix"] = cnf_matrix
                history["train_confusion_matrix"] = save_train_cnf_matrix
            else:
                curp += 1
                if curp > patience:
                    break
            print("Micro Average F1 score : {} , Current_steps_with_decreasing_score : {}".format(current_score,curp))
        else :
            history["valid_confusion_matrix"] = cnf_matrix
            history["train_confusion_matrix"] = save_train_cnf_matrix

    return history

In [14]:
def print_training_results(history):
    """Plot the loss curves and print per-class precision / recall / F1.

    ``history`` is the dict returned by ``finetune_procedure_bert_for_ner``.
    The confusion matrices are built with true labels as rows and predictions
    as columns, so precision divides the diagonal by the *column* sums and
    recall by the *row* sums. The previous version had the two columns swapped
    (F1 is symmetric and was unaffected).
    """
    def per_class_table(cnf_matrix):
        # One row per tag, in tag_values order.
        correct = cnf_matrix.diagonal()
        table = pd.DataFrame(
            {
                "Precision": correct / cnf_matrix.sum(axis = 0),
                "Recall": correct / cnf_matrix.sum(axis = 1),
            },
            index = tag_values,
        )
        table["F1-score"] = ( 2 * table["Precision"] * table["Recall"] ) / (table["Precision"] + table["Recall"])
        return table

    fig,ax = plt.subplots(1,2,figsize = (12,3))
    ax[0].plot(history["average_train_loss_per_batch"])
    ax[1].plot(history["average_valid_loss_per_batch"])
    ax[0].set_title("average_train_loss_per_batch")
    ax[1].set_title("average_valid_loss_per_batch")

    print("Metrics on training data after training:")
    print(per_class_table(history["train_confusion_matrix"]))
    print()
    print("Metrics on validation results after training:")
    print(per_class_table(history["valid_confusion_matrix"]))

#### Training BERT Models

##### bert-base-cased

In [15]:
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [16]:
# Cased BERT with a token-classification head; one output label per entry of
# tag2idx (the "PAD" tag included). Attention maps and hidden states are not returned.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

In [17]:
# FULL_FINETUNING = True trains every parameter; False trains only the classifier layer.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. Hugging Face
    # models name these parameters "...bias" / "...LayerNorm.weight"; the old
    # 'gamma' / 'beta' patterns matched nothing.
    no_decay = ['bias', 'LayerNorm.weight']
    # The per-group key must be 'weight_decay': AdamW silently ignores the
    # unknown key 'weight_decay_rate', so no decay was applied before.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [18]:
# Training length and gradient-clipping threshold for this model.
epochs = 100
max_grad_norm = 1.0

# Linear decay of the learning rate over every optimizer step, without warm-up.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [19]:
# Inverse-frequency class weights, normalised to sum to 1, in tag_values order.
tag_counts = np.array([np.sum(tags == tag2idx[tag]) for tag in tag_values], dtype=np.float32)
inverse_frequency = 1 / tag_counts
class_weight = inverse_frequency / inverse_frequency.sum()

# Weighted cross entropy, with the weights placed on the training device.
loss_device = "cuda" if torch.cuda.is_available() else "cpu"
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weight).to(loss_device))

In [20]:
# Checkpoint directory; exist_ok makes the cell re-runnable without hiding
# real failures (permissions, read-only disk) behind a bare `except`.
os.makedirs("BERT_BASE_CASED", exist_ok=True)

In [21]:
def es_criteria(cnf_matrix):
  """Macro-averaged F1 (mean of the per-class F1 scores) of a confusion matrix.

  Assumes rows are true classes and columns predicted classes. A class with no
  predictions or no true examples gets precision / recall / F1 of 0 instead of
  NaN; a NaN score would make every early-stopping comparison false, so no
  checkpoint would ever be saved.
  """
  cnf_matrix = np.asarray(cnf_matrix,dtype = float)
  correct = cnf_matrix.diagonal()
  predicted = cnf_matrix.sum(axis = 0)
  actual = cnf_matrix.sum(axis = 1)
  precision = np.divide(correct,predicted,out = np.zeros_like(correct),where = predicted > 0)
  recall = np.divide(correct,actual,out = np.zeros_like(correct),where = actual > 0)
  total = precision + recall
  f1 = np.divide(2 * precision * recall,total,out = np.zeros_like(total),where = total > 0)
  return f1.mean()

#history = finetune_procedure_bert_for_ner(model,epochs,train_dataloader,valid_dataloader,optimizer,criterion,scheduler,max_grad_norm,early_stopping = True,patience = 15,stopping_criteria = es_criteria,PATH = "BERT_BASE_CASED/state_dict")

In [22]:
#print_training_results(history)

##### Bio_ClinicalBERT

In [23]:
from transformers import AutoTokenizer, AutoModel

# Rebinds the global `tokenizer` (used by get_tokenized_sentences_and_labels) to the
# Bio_ClinicalBERT one, loads the matching encoder without a task head, and wraps it
# in the custom 4-class token classifier defined above.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
base_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = Token_Classifier_With_Pretrained_Bert_Base(base_model,4)

In [24]:
# Start again from the word-level sentences/labels so they are re-tokenized with
# the tokenizer currently bound to the global `tokenizer`.
sentences = saved_sentences
labels = saved_labels

sentences,labels = get_tokenized_sentences_and_labels()
# Rebinds the global sentences/labels.
normalize_longer_sequences()

# Sanity check: token and label sequences must stay aligned.
for i,j in zip(sentences,labels):
  assert(len(i) == len(j))

# Token pieces -> vocabulary ids, right-padded / right-truncated to MAX_LEN with id 0.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in sentences],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

# Tags -> class indices, padded with the index of the "PAD" tag.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

# 1.0 for every non-zero input id, 0.0 for the padded positions.
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# 85/15 train/validation split with a fixed seed.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state= 0, test_size=0.15)

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are visited sequentially.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [25]:
# FULL_FINETUNING = True trains every parameter; False trains only the classifier layer.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. Hugging Face
    # models name these parameters "...bias" / "...LayerNorm.weight"; the old
    # 'gamma' / 'beta' patterns matched nothing.
    no_decay = ['bias', 'LayerNorm.weight']
    # The per-group key must be 'weight_decay': AdamW silently ignores the
    # unknown key 'weight_decay_rate', so no decay was applied before.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [26]:
# Training length and gradient-clipping threshold for this model.
epochs = 10
max_grad_norm = 1.0

# Linear decay of the learning rate over every optimizer step, without warm-up.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [27]:
# Inverse-frequency class weights, normalised to sum to 1, in tag_values order.
tag_counts = np.array([np.sum(tags == tag2idx[tag]) for tag in tag_values], dtype=np.float32)
inverse_frequency = 1 / tag_counts
class_weight = inverse_frequency / inverse_frequency.sum()

# Weighted cross entropy, with the weights placed on the training device.
loss_device = "cuda" if torch.cuda.is_available() else "cpu"
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weight).to(loss_device))

In [28]:
# Checkpoint directory; exist_ok makes the cell re-runnable without hiding
# real failures (permissions, read-only disk) behind a bare `except`.
os.makedirs("BIO_CLINICAL_BERT", exist_ok=True)

In [29]:
def es_criteria(cnf_matrix):
  """Macro-averaged F1 (mean of the per-class F1 scores) of a confusion matrix.

  Assumes rows are true classes and columns predicted classes. A class with no
  predictions or no true examples gets precision / recall / F1 of 0 instead of
  NaN; a NaN score would make every early-stopping comparison false, so no
  checkpoint would ever be saved.
  """
  cnf_matrix = np.asarray(cnf_matrix,dtype = float)
  correct = cnf_matrix.diagonal()
  predicted = cnf_matrix.sum(axis = 0)
  actual = cnf_matrix.sum(axis = 1)
  precision = np.divide(correct,predicted,out = np.zeros_like(correct),where = predicted > 0)
  recall = np.divide(correct,actual,out = np.zeros_like(correct),where = actual > 0)
  total = precision + recall
  f1 = np.divide(2 * precision * recall,total,out = np.zeros_like(total),where = total > 0)
  return f1.mean()

#history = finetune_procedure_bert_for_ner(model,epochs,train_dataloader,valid_dataloader,optimizer,criterion,scheduler,max_grad_norm,early_stopping = True,patience = 15,stopping_criteria = es_criteria,PATH = "BIO_CLINICAL_BERT/state_dict")

In [30]:
#print_training_results(history)

##### BioELECTRA

In [31]:
class Token_Classifier_From_BioElectra(torch.nn.Module):
  """Token classifier built from the parts of an ELECTRA pre-training model.

  Pipeline: ``base.electra`` encoder -> ``base.discriminator_predictions.dense``
  -> dropout -> new linear classifier.

  Args:
    base: object exposing ``electra`` and ``discriminator_predictions.dense``.
    classes: number of output classes (default 4).
  """
  def __init__(self,base,classes = 4):
    super().__init__()
    self.n_classes = classes
    self.base = base.electra
    self.intermediate = base.discriminator_predictions.dense
    self.dropout = torch.nn.Dropout(p=0.1, inplace=False)
    # Size the classifier from the reused dense layer; 768 if it does not say.
    hidden_size = getattr(self.intermediate,"out_features",768)
    self.classifier = torch.nn.Linear(in_features=hidden_size, out_features=classes, bias=True)
    self.criterion = torch.nn.CrossEntropyLoss()

  def forward(self,bert_input,attention_mask,token_type_ids = None,labels = None):
    """Return logits, or ``(loss, logits)`` when ``labels`` is given.

    The loss is an unweighted cross entropy over every position, flattened to
    ``(-1, n_classes)`` (previously a literal 4). ``token_type_ids`` is now
    forwarded to the encoder instead of being replaced by ``None``.
    """
    out = self.base(bert_input,attention_mask = attention_mask,token_type_ids = token_type_ids)[0]
    # NOTE(review): no activation is applied between the reused dense layer and
    # the classifier -- confirm this is intended.
    out = self.intermediate(out)
    out = self.dropout(out)
    out = self.classifier(out)
    if labels is None:
      return out
    return (self.criterion(out.reshape(-1,self.n_classes),labels.reshape(-1)),out)

In [32]:
from transformers import ElectraForPreTraining, ElectraTokenizerFast

# Load the BioELECTRA discriminator checkpoint and its tokenizer (this rebinds the
# global `tokenizer` used by get_tokenized_sentences_and_labels), then wrap the
# checkpoint in the custom 4-class token classifier defined above.
base_model = ElectraForPreTraining.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed")
tokenizer = ElectraTokenizerFast.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed")
model = Token_Classifier_From_BioElectra(base_model,4)

In [33]:
# Start again from the word-level sentences/labels so they are re-tokenized with
# the tokenizer currently bound to the global `tokenizer`.
sentences = saved_sentences
labels = saved_labels

sentences,labels = get_tokenized_sentences_and_labels()
# Rebinds the global sentences/labels.
normalize_longer_sequences()

# Sanity check: token and label sequences must stay aligned.
for i,j in zip(sentences,labels):
  assert(len(i) == len(j))

# Token pieces -> vocabulary ids, right-padded / right-truncated to MAX_LEN with id 0.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in sentences],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

# Tags -> class indices, padded with the index of the "PAD" tag.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

# 1.0 for every non-zero input id, 0.0 for the padded positions.
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# 85/15 train/validation split with a fixed seed.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state= 0, test_size=0.15)

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are visited sequentially.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [34]:
# FULL_FINETUNING = True trains every parameter; False trains only the classifier layer.
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. Hugging Face
    # models name these parameters "...bias" / "...LayerNorm.weight"; the old
    # 'gamma' / 'beta' patterns matched nothing.
    no_decay = ['bias', 'LayerNorm.weight']
    # The per-group key must be 'weight_decay': AdamW silently ignores the
    # unknown key 'weight_decay_rate', so no decay was applied before.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [35]:
# Training length and gradient-clipping threshold for this model.
epochs = 100
max_grad_norm = 1.0

# Linear decay of the learning rate over every optimizer step, without warm-up.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [36]:
# Inverse-frequency class weights, normalised to sum to 1, in tag_values order.
tag_counts = np.array([np.sum(tags == tag2idx[tag]) for tag in tag_values], dtype=np.float32)
inverse_frequency = 1 / tag_counts
class_weight = inverse_frequency / inverse_frequency.sum()

# Weighted cross entropy, with the weights placed on the training device.
loss_device = "cuda" if torch.cuda.is_available() else "cpu"
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weight).to(loss_device))

In [37]:
# Checkpoint directory; exist_ok makes the cell re-runnable without hiding
# real failures (permissions, read-only disk) behind a bare `except`.
os.makedirs("BIO_ELECTRA", exist_ok=True)

In [None]:
def es_criteria(cnf_matrix):
  """Macro-averaged F1 (mean of the per-class F1 scores) of a confusion matrix.

  Assumes rows are true classes and columns predicted classes. A class with no
  predictions or no true examples gets precision / recall / F1 of 0 instead of
  NaN; a NaN score would make every early-stopping comparison false, so no
  checkpoint would ever be saved.
  """
  cnf_matrix = np.asarray(cnf_matrix,dtype = float)
  correct = cnf_matrix.diagonal()
  predicted = cnf_matrix.sum(axis = 0)
  actual = cnf_matrix.sum(axis = 1)
  precision = np.divide(correct,predicted,out = np.zeros_like(correct),where = predicted > 0)
  recall = np.divide(correct,actual,out = np.zeros_like(correct),where = actual > 0)
  total = precision + recall
  f1 = np.divide(2 * precision * recall,total,out = np.zeros_like(total),where = total > 0)
  return f1.mean()

# Fine-tune BioELECTRA with the weighted loss; the best-scoring state dict is saved to
# BIO_ELECTRA/state_dict. patience (2000) is larger than epochs (100), so the
# early-stopping break cannot trigger: this run always trains for all epochs and
# early_stopping only serves to checkpoint the best epoch.
history = finetune_procedure_bert_for_ner(model,epochs,train_dataloader,valid_dataloader,optimizer,criterion,scheduler,max_grad_norm,early_stopping = True,patience = 2000,stopping_criteria = es_criteria,PATH = "BIO_ELECTRA/state_dict")

In [None]:
# Loss curves plus per-class metrics for the confusion matrices stored in `history`.
print_training_results(history)