In [None]:
import urllib.request
from pathlib import Path

def download_file(url, output_file):
  """Download ``url`` to ``output_file``, creating missing parent directories first."""
  destination = Path(output_file)
  destination.parent.mkdir(parents=True, exist_ok=True)
  urllib.request.urlretrieve(url, output_file)

# Fetch the train / test / dev splits of BC5CDR from the BioFLAIR repository into /content/data.
for split in ('train', 'test', 'dev'):
  download_file(f'https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/{split}.txt', f'/content/data/{split}.txt')

In [None]:
import pandas as pd
def read_conll(filename):
    """Read a CoNLL-style token file into a token-level DataFrame.

    The file is parsed as tab-separated text with four columns
    (``words``, ``pos``, ``chunk``, ``labels``). Blank lines are kept while
    parsing because they mark sentence boundaries: ``sentence_id`` is the
    running count of blank lines seen so far. ``-DOCSTART-`` header rows and
    the blank separator rows themselves are removed from the result.

    Args:
        filename: path of the file to read.

    Returns:
        pandas.DataFrame: one row per token with columns
        ``words``, ``pos``, ``chunk``, ``labels`` and ``sentence_id``.
    """
    raw = pd.read_csv(filename,
                      sep='\t', header=None, keep_default_na=False,
                      names=['words', 'pos', 'chunk', 'labels'],
                      quoting=3, skip_blank_lines=False)
    # Match the header marker itself so both space- and tab-separated
    # "-DOCSTART-" lines are dropped.
    is_header = raw['words'].astype(str).str.startswith('-DOCSTART-')
    # Work on an explicit copy: adding a column to a filtered slice triggers
    # pandas' SettingWithCopyWarning.
    tokens = raw.loc[~is_header].copy()
    tokens['sentence_id'] = (tokens['words'] == '').cumsum()
    return tokens[tokens['words'] != ''].copy()

In [None]:
# Parse the three downloaded splits into token-level DataFrames (one row per token).
train_df = read_conll('/content/data/train.txt')
test_df = read_conll('/content/data/test.txt')
dev_df = read_conll('/content/data/dev.txt')
#train_df.head(100)
#test_df.head(10)
# Preview the first 100 token rows of the test split.
test_df.head(100)

Unnamed: 0,words,pos,chunk,labels,sentence_id
2,Famotidine,PROPN,O,I-Entity,1
3,-,PUNCT,O,O,1
4,associated,VERB,O,O,1
5,delirium,NOUN,O,I-Entity,1
6,.,PUNCT,O,O,1
...,...,...,...,...,...
101,elderly,ADJ,O,O,5
102,population,NOUN,O,O,5
103,seen,VERB,O,O,5
104,.,PUNCT,O,O,5


In [None]:
len(train_df)

95370

In [None]:
num_samples = train_df.shape[0]

print(f"Number of samples in train_df: {num_samples}")

Number of samples in train_df: 95370


In [None]:

# Randomly sample a subset of the dataset.
# Every row is a single token, so sampling rows directly would remove words from
# the middle of sentences. Whole sentences are drawn instead (by sentence_id),
# keeping roughly the requested number of token rows.
TRAIN_TARGET_ROWS = 50000  # Specify the desired (approximate) number of token rows
# Cap the fraction at 1.0 so a frame smaller than the target is kept whole
# instead of raising.
_train_keep_frac = min(1.0, TRAIN_TARGET_ROWS / max(len(train_df), 1))
_train_kept_ids = (
    train_df['sentence_id']
    .drop_duplicates()
    .sample(frac=_train_keep_frac, random_state=42)  # fixed seed => reproducible subset
)
train_df = train_df[train_df['sentence_id'].isin(_train_kept_ids)].copy()

# Save the reduced dataset to a new file
train_df.to_csv("reduced_dataset.csv", index=False)

In [None]:
# Sub-sample the test split by whole sentences (rows are single tokens, so
# sampling rows directly would remove words from the middle of sentences).
TEST_TARGET_ROWS = 10000  # Specify the desired (approximate) number of token rows
# Cap the fraction at 1.0 so a frame smaller than the target is kept whole.
_test_keep_frac = min(1.0, TEST_TARGET_ROWS / max(len(test_df), 1))
_test_kept_ids = (
    test_df['sentence_id']
    .drop_duplicates()
    .sample(frac=_test_keep_frac, random_state=42)  # fixed seed => reproducible subset
)
test_df = test_df[test_df['sentence_id'].isin(_test_kept_ids)].copy()

# Save the reduced dataset to a new file
test_df.to_csv("reduced_dataset_test.csv", index=False)

In [None]:
# Sub-sample the dev split by whole sentences (rows are single tokens, so
# sampling rows directly would remove words from the middle of sentences).
DEV_TARGET_ROWS = 50000  # Specify the desired (approximate) number of token rows
# Cap the fraction at 1.0 so a frame smaller than the target is kept whole.
_dev_keep_frac = min(1.0, DEV_TARGET_ROWS / max(len(dev_df), 1))
_dev_kept_ids = (
    dev_df['sentence_id']
    .drop_duplicates()
    .sample(frac=_dev_keep_frac, random_state=42)  # fixed seed => reproducible subset
)
dev_df = dev_df[dev_df['sentence_id'].isin(_dev_kept_ids)].copy()

# Save the reduced dataset to a new file
dev_df.to_csv("reduced_dataset_dev.csv", index=False)

neeeeeeeeee


In [None]:
!pip install --upgrade transformers
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1


In [None]:
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertModel

class CustomBertForTokenClassification(BertForTokenClassification):
    """BERT token-classification model whose encoder, dropout and classifier head are assigned explicitly.

    NOTE(review): ``super().__init__`` presumably already creates ``bert``,
    ``dropout`` and ``classifier``; if so, the assignments below replace them
    with freshly constructed modules -- confirm that this is intended.
    """
    def __init__(self, config):
        """Build the model from ``config`` (reads ``hidden_dropout_prob``, ``hidden_size`` and ``num_labels``)."""
        super(CustomBertForTokenClassification, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Define additional layers if needed
        # Linear head mapping each hidden state to one score per label.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)


In [None]:
!pip install pytorch-crf


Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
print("Number of tags: {}".format(len(train_df.labels.unique())))
frequencies = train_df.labels.value_counts()
frequencies

Number of tags: 3


O           82026
I-Entity    10931
B-Entity     2413
Name: labels, dtype: int64

In [None]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m797.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=73cd9b7fee16061786341dfa2081599191ffd9638060f0ac6c53cc6a1f3b521c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# Count tokens per entity type by merging the B-/I- variants of each tag.
tags = {}
for tag, count in frequencies.items():
    if tag == "O":
        # "O" marks tokens outside any entity; it is not an entity type.
        continue
    # Drop the "B-"/"I-" prefix but keep the full type name
    # (a fixed three-character slice would shorten "Entity" to "Ent").
    entity_type = tag.split("-", 1)[-1]
    tags[entity_type] = tags.get(entity_type, 0) + count

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('Ent', 13344)]


In [None]:
# Map each distinct training label to an integer id, in order of first appearance in train_df.
labels_to_ids = {k: v for v, k in enumerate(train_df.labels.unique())}
# Inverse map: here `v` is the enumerate index (the id) and `k` is the label string.
ids_to_labels = {v: k for v, k in enumerate(train_df.labels.unique())}
labels_to_ids

{'I-Entity': 0, 'O': 1, 'B-Entity': 2}

In [None]:
# Keep the label <-> id mapping that was derived from the training split.
# Rebuilding it from test_df would depend on the order in which labels happen
# to appear in the test data and could silently assign different ids than the
# ones used for training. Only verify that every test label is covered.
_unknown_test_labels = set(test_df.labels.unique()) - set(labels_to_ids)
assert not _unknown_test_labels, f"test_df contains labels missing from labels_to_ids: {_unknown_test_labels}"
labels_to_ids

{'I-Entity': 0, 'O': 1, 'B-Entity': 2}

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value.
# `fillna(method='ffill')` is deprecated in recent pandas releases; `ffill()` is the supported equivalent.
train_df = train_df.ffill()
train_df.head()

Unnamed: 0,words,pos,chunk,labels,sentence_id
2,Naloxone,PROPN,O,I-Entity,1
3,reverses,VERB,O,O,1
4,the,DET,O,O,1
5,antihypertensive,ADJ,O,O,1
6,effect,NOUN,O,O,1


In [None]:
len(train_df)

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value.
# `fillna(method='ffill')` is deprecated in recent pandas releases; `ffill()` is the supported equivalent.
test_df = test_df.ffill()
test_df.head()

Unnamed: 0,words,pos,chunk,labels,sentence_id
2,Famotidine,PROPN,O,I-Entity,1
3,-,PUNCT,O,O,1
4,associated,VERB,O,O,1
5,delirium,NOUN,O,I-Entity,1
6,.,PUNCT,O,O,1


In [None]:
# Give every token row the full text of its sentence (words joined by single spaces).
train_df['sentence'] = (
    train_df[['sentence_id', 'words', 'labels']]
    .groupby(['sentence_id'])['words']
    .transform(lambda tokens: ' '.join(tokens))
)
# Likewise store the sentence's labels as one comma-separated string, in token order.
train_df['word_labels'] = (
    train_df[['sentence_id', 'words', 'labels']]
    .groupby(['sentence_id'])['labels']
    .transform(lambda tag_seq: ','.join(tag_seq))
)
train_df.head()

Unnamed: 0,words,pos,chunk,labels,sentence_id,sentence,word_labels
2,Naloxone,PROPN,O,I-Entity,1,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"
3,reverses,VERB,O,O,1,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"
4,the,DET,O,O,1,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"
5,antihypertensive,ADJ,O,O,1,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"
6,effect,NOUN,O,O,1,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"


In [None]:
# Give every token row the full text of its sentence (words joined by single spaces).
test_df['sentence'] = (
    test_df[['sentence_id', 'words', 'labels']]
    .groupby(['sentence_id'])['words']
    .transform(lambda tokens: ' '.join(tokens))
)
# Likewise store the sentence's labels as one comma-separated string, in token order.
test_df['word_labels'] = (
    test_df[['sentence_id', 'words', 'labels']]
    .groupby(['sentence_id'])['labels']
    .transform(lambda tag_seq: ','.join(tag_seq))
)
test_df.head()

Unnamed: 0,words,pos,chunk,labels,sentence_id,sentence,word_labels
2,Famotidine,PROPN,O,I-Entity,1,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"
3,-,PUNCT,O,O,1,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"
4,associated,VERB,O,O,1,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"
5,delirium,NOUN,O,I-Entity,1,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"
6,.,PUNCT,O,O,1,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"


In [None]:
train_df = train_df.drop(columns=['chunk'])

In [None]:
test_df = test_df.drop(columns=['chunk'])

In [None]:
train_df.head()

In [None]:
# Collapse the token-level frame to one row per distinct (sentence, word_labels) pair and renumber rows from 0.
train_df= train_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
train_df.head()

Unnamed: 0,sentence,word_labels
0,Naloxone reverses the antihypertensive effect ...,"I-Entity,O,O,O,O,O,I-Entity,O"
1,"In unanesthetized , spontaneously hypertensive...","O,O,O,O,I-Entity,O,O,O,O,O,O,O,O,O,O,O,O,I-Ent..."
2,The hypotensive effect of 100 mg / kg alpha - ...,"O,I-Entity,O,O,O,O,O,O,B-Entity,I-Entity,I-Ent..."
3,Naloxone alone did not affect either blood pre...,"I-Entity,O,O,O,O,O,O,O,O,O,O,O"
4,In brain membranes from spontaneously hyperten...,"O,O,O,O,O,I-Entity,O,I-Entity,O,O,O,O,O,O"


In [None]:
# Collapse the token-level frame to one row per distinct (sentence, word_labels) pair and renumber rows from 0.
test_df= test_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
test_df.head()

Unnamed: 0,sentence,word_labels
0,Famotidine - associated delirium .,"I-Entity,O,O,I-Entity,O"
1,Famotidine is a histamine H2-receptor antagoni...,"I-Entity,O,O,O,O,O,O,O,O,O,O,O,O,O,I-Entity,O,..."
2,Although all of the currently available H2-rec...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,I-Entity,O,O,O,O,O..."
3,The authors report on six cases of famotidine ...,"O,O,O,O,O,O,O,I-Entity,O,O,I-Entity,O,O,O,O,O,..."
4,The pharmacokinetics of famotidine are reviewe...,"O,O,O,I-Entity,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [None]:
len(train_df)

3942

In [None]:
len(test_df)

3125

In [None]:
train_df.iloc[5].sentence

'M , did not influence stereoselective binding of [ 3H]-naloxone ( 8 nM ) , and naloxone , 10(-8 ) to 10(-4 )'

In [None]:
train_df.iloc[5].word_labels

'O,O,O,O,O,O,O,O,B-Entity,I-Entity,O,O,O,O,O,O,I-Entity,O,O,O,O,O,O'

In [None]:
!pip install --upgrade transformers
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification



**Preparing the dataset and dataloader**

In [None]:
# Hyper-parameters and tokenizer shared by the cells below.
MAX_LEN = 128  # sequence length each example is padded or truncated to
TRAIN_BATCH_SIZE = 4  # batch size of the training DataLoader
VALID_BATCH_SIZE = 2  # batch size of the test DataLoader
EPOCHS = 10
LEARNING_RATE = 1e-05  # passed to the optimizer
MAX_GRAD_NORM = 10
# Tokenizer of the lower-cased BERT base checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word while keeping labels aligned.

    Each whitespace-separated word is tokenized on its own, and the word's
    label is repeated once for every piece the tokenizer produces, so the
    two returned lists always have the same length.

    Args:
        sentence: whitespace-separated words.
        text_labels: comma-separated labels, one per word.
        tokenizer: object exposing ``tokenize(word) -> list of pieces``.

    Returns:
        tuple: ``(pieces, piece_labels)``.
    """
    pieces = []
    piece_labels = []

    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    for word, tag in zip(words, word_tags):
        # Split this word into its sub-word pieces.
        word_pieces = tokenizer.tokenize(word)
        pieces += word_pieces
        # Every piece inherits the label of the word it came from.
        piece_labels += [tag] * len(word_pieces)

    return pieces, piece_labels

Note that this is a design decision. You could also decide to only label the first wordpiece of each word and let the model only learn this (this is what was done in the original BERT paper, see Github discussion here). Another design decision could be to give the first wordpiece of each word the original word label, and then use the label “X” for all subsequent subwords of that word.

All of them lead to good performance.

Next, we define a regular PyTorch dataset class (which transforms examples of a dataframe to PyTorch tensors). Here, each sentence gets tokenized, the special tokens that BERT expects are added, the tokens are padded or truncated based on the max length of the model, the attention mask is created and the labels are created based on the dictionary which we defined above.



In [None]:
class dataset(Dataset):
    """PyTorch dataset that turns (sentence, word_labels) rows into fixed-length tensors.

    Every item is a dict with ``ids`` (token ids), ``mask`` (attention mask)
    and ``targets`` (label ids), each a ``torch.long`` tensor of length
    ``max_len``. Relies on the notebook-level ``tokenize_and_preserve_labels``
    helper and the ``labels_to_ids`` mapping.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe: frame with ``sentence`` and ``word_labels`` columns.
            tokenizer: tokenizer used for word-piece splitting and id lookup.
            max_len: length every example is padded or truncated to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the tensors for the example at position ``index``."""
        # step 1: tokenize (and adapt corresponding labels)
        # Positional lookup, so the frame does not need a 0..n-1 index.
        sentence = self.data.sentence.iloc[index]
        word_labels = self.data.word_labels.iloc[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # "O" for [CLS] goes in front and "O" for [SEP] at the very end.
        # (list.insert(-1, ...) would place it *before* the last label and
        # shift the final word's label onto [SEP].)
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate (note: this cuts off the trailing [SEP] as well)
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad tokens with [PAD] and labels with "O" up to maxlen
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # step 4: obtain the attention mask (1 for real tokens, 0 for padding)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [labels_to_ids[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Number of examples (rows of the wrapped frame)."""
        return self.len

In [None]:
# The triple-quoted string below is disabled code for an 80/20 random split; it is not executed.
'''train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)'''
# Use the separately loaded train / test frames directly.
train_dataset=train_df
test_dataset=test_df
# Note: "FULL Dataset" prints the shape of train_df, i.e. the same frame as train_dataset.
print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both frames in the PyTorch dataset class, padding/truncating to MAX_LEN.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3942, 2)
TRAIN Dataset: (3942, 2)
TEST Dataset: (4138, 2)


Let's have a look at the first training example:

In [None]:
training_set[0]

{'ids': tensor([  101,  6583,  4135, 22500,  2063,  7901,  2015,  1996,  3424, 10536,
          4842, 25808,  3512,  3466,  1997, 18856, 10698, 10672,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print('{0:10}  {1}'.format(token, label))

Now, let's define the corresponding PyTorch dataloaders:

In [None]:
# DataLoader settings for training: shuffled batches, loaded in the main process (num_workers=0).
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for evaluation; shuffling is enabled here as well.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
training_loader

<torch.utils.data.dataloader.DataLoader at 0x7caa106a2d10>

**DEFINING THE MODEL**

Here we define the model, BertForTokenClassification, and load it with the pretrained weights of "bert-base-uncased". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head of top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell below.

Then, we move the model to the GPU.

In [None]:
len(labels_to_ids)

3

In [None]:
labels_to_ids

{'I-Entity': 0, 'O': 1, 'B-Entity': 2}

In [None]:
from transformers import BertForSequenceClassification

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


**As deep learning can be accelerated a lot using a GPU instead of a CPU, make sure you run this notebook in a GPU runtime (which Google Colab provides for free! - check "Runtime" - "Change runtime type" - and set the hardware accelerator to "GPU").**

**We can set the default device to GPU using the following code (if it prints "cuda", it means the GPU has been recognized)**

In [None]:
import torch
from torch import nn

# Now you can use nn.Dropout, nn.Linear, etc.


In [None]:
# Initialize custom model from the pretrained "bert-base-uncased" checkpoint and move it to `device`.
# NOTE(review): `model` is assigned again in the next cell, so this instance is replaced before training — confirm which model is intended.
model = CustomBertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,  # Number of different labels (I-Entity, B-Entity, O)
)
model.to(device)


In [None]:
# Load pretrained BERT with a token-classification head that has one output per label,
# then move it to `device`. This rebinds `model`, replacing the custom model created in the previous cell.
model =BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

**SANITY CHECK**

Before training the model, let's perform a sanity check, which I learned thanks to Andrej Karpathy's wonderful cs231n course at Stanford (see also his blog post about debugging neural networks). The initial loss of your model should be close to -ln(1/number of classes). Here there are 3 classes (I-Entity, O, B-Entity), so that is -ln(1/3) ≈ 1.10.

Why? Because we are using cross entropy loss. The cross entropy loss is defined as -ln(probability score of the model for the correct class). In the beginning, the weights of the classification head are random, so the probability distribution over the classes for a given token will be roughly uniform, meaning that the probability for the correct class will be near 1/3. The loss for a given token will thus be about -ln(1/3). As PyTorch's CrossEntropyLoss (which is used by BertForTokenClassification) uses mean reduction by default, it will compute the mean loss over the tokens in the sequence (in other words, over all MAX_LEN = 128 positions). The mean of 128 times -ln(1/3) is, you guessed it, -ln(1/3). In practice the initial value will only be in this neighbourhood, because a randomly initialized head is not exactly uniform.

Let's verify this:

In [None]:
# Run the first training example through the model as a batch of size 1 to inspect the initial loss.
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
# With labels supplied, the first element of the output is the loss.
initial_loss = outputs[0]
initial_loss

tensor(1.5889, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
mask

In [None]:
targets

In [None]:
ids

In [None]:
outputs[0]

tensor(1.3506, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 3])

Next, we define the optimizer. Here, we are just going to use Adam with the learning rate set in the configuration cell above (`LEARNING_RATE = 1e-05`). One can also decide to use more advanced ones such as AdamW (Adam with weight decay fix) together with a learning rate scheduler, but we are not going to do that here.


In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one training epoch over ``training_loader`` and print loss / accuracy.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``. Accuracy is computed per batch on the
    positions where the attention mask is 1 (this includes [CLS] and [SEP])
    and then averaged over batches.

    Args:
        epoch: index of the current epoch; not used in the computation.

    Returns:
        list: a one-element list ``[mean_batch_loss]`` for this epoch.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()  # Zero the gradients before each forward pass

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=targets).to_tuple()
        tr_loss += loss.item()
        nb_tr_steps += 1

        # compute training accuracy on non-padded positions only
        active_positions = mask.view(-1) == 1  # shape (batch_size * seq_len,)
        batch_predictions = torch.argmax(tr_logits.view(-1, model.num_labels), axis=1)
        active_targets = torch.masked_select(targets.view(-1), active_positions)
        active_predictions = torch.masked_select(batch_predictions, active_positions)
        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        loss.backward()
        # Clip the global gradient norm before the update; MAX_GRAD_NORM is the
        # notebook-level limit.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

    # Guard against an empty loader so the averages do not divide by zero.
    steps = max(nb_tr_steps, 1)
    epoch_loss = tr_loss / steps
    tr_accuracy = tr_accuracy / steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

    # Kept as a one-element list for compatibility with existing callers.
    return [epoch_loss]


In [None]:
import matplotlib.pyplot as plt

# Define the number of epochs
EPOCHS = 10

# One mean training-loss value per epoch
epoch_losses = []

# Train the model for each epoch
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    # train() returns a one-element list; extend() keeps epoch_losses a flat
    # list of floats instead of a list of lists.
    epoch_losses.extend(train(epoch))

# Plotting the epoch vs loss graph
plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Epoch vs Loss')
plt.legend()
plt.show()


**Evaluating the Model**

Now that we've trained our model, we can evaluate its performance on the held-out test set (the separate BC5CDR test split downloaded at the top of the notebook). Note that here, no gradient updates are performed, the model just outputs its logits.

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without gradient updates.

    Prints a running loss every 100 batches plus the final mean loss and mean
    per-batch accuracy. Accuracy and the returned sequences only cover
    positions where the attention mask is 1 (this includes [CLS] and [SEP]).
    Uses the notebook-level ``device``, ``ids_to_labels`` and ``accuracy_score``.

    Args:
        model: token-classification model whose output offers ``to_tuple()``
            returning ``(loss, logits)`` and which exposes ``num_labels``.
        testing_loader: iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        tuple: ``(labels, predictions)``, two flat lists of label strings of
        equal length, concatenated over all batches.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=targets).to_tuple()

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compare predictions with targets on non-padded positions only
            active_positions = mask.view(-1) == 1  # shape (batch_size * seq_len,)
            batch_predictions = torch.argmax(eval_logits.view(-1, model.num_labels), axis=1)
            # Move each batch to the CPU once and store plain ints, instead of
            # keeping one 0-d tensor per token and calling .item() on each later.
            active_targets = torch.masked_select(targets.view(-1), active_positions).cpu()
            active_predictions = torch.masked_select(batch_predictions, active_positions).cpu()

            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.numpy(), active_predictions.numpy())

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    # Guard against an empty loader so the averages do not divide by zero.
    steps = max(nb_eval_steps, 1)
    eval_loss = eval_loss / steps
    eval_accuracy = eval_accuracy / steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

In [None]:

labels

In [None]:
predictions

**Inference**

The fun part is when we can quickly test the model on new, unseen sentences. Here, we use the prediction of the first word piece of every word. Note that the function we used to train our model (`tokenize_and_preserve_labels`) propagated the label to all subsequent word pieces (so you could for example also perform a majority vote on the predicted labels of all word pieces of a word).

In other words, the code below does not take into account when predictions of different word pieces that belong to the same word do not match.

In [None]:
sentence = "After a single oral dose of 4 mg / kg indomethacin ( IDM ) to sodium and volume depleted rats plasma renin activity ( PRA ) and systolic blood pressure fell significantly within four hours."

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass: evaluation mode (no dropout) and no gradient tracking for inference
model.eval()
with torch.no_grad():
    outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# Keep only the prediction of the first word piece of every word.
# Continuation pieces start with "##" (no leading space), so that is the prefix to test for.
word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith("##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

NameError: name 'tokenizer' is not defined

In [None]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

      Entity       0.77      0.83      0.80     12104

   micro avg       0.77      0.83      0.80     12104
   macro avg       0.77      0.83      0.80     12104
weighted avg       0.77      0.83      0.80     12104



In [None]:
!pip install seqeval

In [None]:
# NOTE(review): both paths are under /content/drive, but the cell that mounts Google Drive
# appears later in this notebook — confirm Drive is mounted before running this cell.
# save vocabulary of the tokenizer
tokenizer.save_vocabulary('/content/drive/MyDrive/project_model/tokenizer')
# save the model weights and its configuration file
model.save_pretrained('/content/drive/MyDrive/project_model/weights')
print('All files saved')

**model Loading**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer from the folder the vocabulary was saved to
tokenizer1 = BertTokenizer.from_pretrained('/content/drive/MyDrive/project_model/tokenizer')

# Load the model from the saved weights/config folder and move it to `device`
model1 = BertForTokenClassification.from_pretrained('/content/drive/MyDrive/project_model/weights')
model1.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
!pip install torch
import torch
# Label id -> label name mapping used to decode the predictions of the reloaded model.
# NOTE(review): these ids must match the mapping that was in effect when the saved model was trained — confirm.

ids_to_labels = {
    0: 'I-Entity',
    1: 'O',
    2: 'B-Entity'
}




In [None]:
sentence = "Mary suffers from fever & cancer."
MAX_LEN=128
inputs = tokenizer1(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass: evaluation mode (no dropout) and no gradient tracking for inference
model1.eval()
with torch.no_grad():
    outputs = model1(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model1.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer1.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# Keep only the prediction of the first word piece of every word.
# Continuation pieces start with "##" (no leading space), so that is the prefix to test for.
word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith("##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

mary suffers from fever & cancer .
['O', 'O', 'O', 'I-Entity', 'O', 'I-Entity', 'O']


In [None]:
!pip install  gradio

Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m5.2/7.8 MB[0m [31m154.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m7.5/7.8 MB[0m [31m98.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.8/7.8 MB[0m [31m97.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl (37 kB)
Collecting uvicorn>=0.1

In [None]:
import gradio as gr

# Define the function to perform predictions
def predict_entities(sentence):
    """Tag ``sentence`` with the reloaded model and return word-level predictions.

    Uses the notebook-level ``tokenizer1``, ``model1``, ``device`` and
    ``ids_to_labels``. The prediction of the first word piece of each word is
    taken as the prediction for that word; continuation pieces and the
    special tokens are skipped.

    Args:
        sentence: raw input text.

    Returns:
        tuple: ``(text, labels)`` where ``text`` is the tokenized sentence
        with word pieces merged back together and ``labels`` is the list of
        predicted label strings, one per word.
    """
    MAX_LEN = 128
    special_tokens = ['[CLS]', '[SEP]', '[PAD]']
    inputs = tokenizer1(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

    # Move to GPU
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Forward pass: evaluation mode (no dropout) and no gradient tracking for inference
    model1.eval()
    with torch.no_grad():
        outputs = model1(ids, mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model1.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer1.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))  # list of tuples. Each tuple = (wordpiece, prediction)

    # Continuation pieces start with "##" (no leading space), so that is the
    # prefix to test for when keeping only first-piece predictions.
    word_level_predictions = [
        prediction
        for token, prediction in wp_preds
        if not token.startswith("##") and token not in special_tokens
    ]

    # Join tokens, if they are not special ones
    str_rep = " ".join([t[0] for t in wp_preds if t[0] not in special_tokens]).replace(" ##", "")
    return str_rep, word_level_predictions

# Build the Gradio interface: one free-text input and one text output, backed by predict_entities.
# (Despite its name, `input_text` holds the whole Interface object.)
input_text = gr.Interface(fn=predict_entities, inputs="text", outputs="text", title="Entity Recognition Model", description="Enter a sentence:")

# Launch the interface
input_text.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a946ce6b534fc8395d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


