In [None]:
import pandas as pd
def read_conll(filename):
    df = pd.read_csv(filename,
                    sep = ' ', header = None, keep_default_na = False,
                    names = ['words','labels'],
                    quoting = 3, skip_blank_lines = False)
    df = df[~df['words'].astype(str).str.startswith('-DOCSTART- ')] # Remove the -DOCSTART- header
    df['sentence_id'] = (df.words == '').cumsum()
    return df[df.words != '']

In [None]:
test_df= read_conll('/content/drive/MyDrive/dataseti2b2/train.txt')
train_df = read_conll('/content/drive/MyDrive/dataseti2b2/test.txt')
dev_df = read_conll('/content/drive/MyDrive/dataseti2b2/dev.txt')
#train_df.head(100)
#test_df.head(100)
train_df.head(100)

Unnamed: 0,words,labels,sentence_id
0,359487751,O,0
2,PUOMC,O,1
4,9075321,O,2
6,07676,O,3
8,021768,O,4
...,...,...,...
125,ALLERGIES,O,30
126,:,O,30
128,No,O,31
129,known,B-problem,31


In [None]:
len(train_df)

267249

In [None]:
len(test_df)

599

In [None]:
len(dev_df)

20454

In [None]:
num_samples = train_df.shape[0]

print(f"Number of samples in train_df: {num_samples}")

Number of samples in train_df: 267249


In [None]:

# Randomly sample a subset of the dataset
train_df = train_df.sample(n=50000)  # Specify the desired number of samples

# Save the reduced dataset to a new file
train_df.to_csv("reduced_dataset.csv", index=False)

In [None]:
test_df = test_df.sample(n=10000)  # Specify the desired number of samples

# Save the reduced dataset to a new file
test_df.to_csv("reduced_dataset_test.csv", index=False)

In [None]:
dev_df = dev_df.sample(n=10000)  # Specify the desired number of samples

# Save the reduced dataset to a new file
dev_df.to_csv("reduced_dataset_dev.csv", index=False)

In [None]:
!pip install --upgrade transformers
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
import torch
import torch.nn as nn
from transformers import BertForTokenClassification, BertModel

class CustomBertForTokenClassification(BertForTokenClassification):
    def __init__(self, config):
        super(CustomBertForTokenClassification, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Define additional layers if needed
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)


In [None]:
!pip install pytorch-crf


Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
print("Number of tags: {}".format(len(train_df.labels.unique())))
frequencies = train_df.labels.value_counts()
frequencies

Number of tags: 7


O              37997
I-problem       3255
B-problem       2277
B-treatment     1800
B-test          1736
I-test          1477
I-treatment     1458
Name: labels, dtype: int64

In [None]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('pro', 5532), ('tre', 3258), ('tes', 3213)]


In [None]:
labels_to_ids = {k: v for v, k in enumerate(train_df.labels.unique())}
ids_to_labels = {v: k for v, k in enumerate(train_df.labels.unique())}
labels_to_ids

{'O': 0,
 'B-problem': 1,
 'B-test': 2,
 'I-treatment': 3,
 'B-treatment': 4,
 'I-test': 5,
 'I-problem': 6}

In [None]:
labels_to_ids = {k: v for v, k in enumerate(test_df.labels.unique())}
ids_to_labels = {v: k for v, k in enumerate(test_df.labels.unique())}
labels_to_ids

{'B-problem': 0,
 'I-problem': 1,
 'O': 2,
 'B-test': 3,
 'I-test': 4,
 'B-treatment': 5,
 'I-treatment': 6}

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
train_df = train_df.fillna(method='ffill')
train_df.head()

Unnamed: 0,words,labels,sentence_id
73827,BY,O,7200
221724,Pneumonia,B-problem,20813
182533,started,O,17312
97369,affecting,O,9412
61737,pedis,O,6047


In [None]:
len(train_df)

50000

In [None]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
test_df = test_df.fillna(method='ffill')
test_df.head()

Unnamed: 0,words,labels,sentence_id
0,Right,B-problem,0
1,capsular,I-problem,0
2,putamenal,I-problem,0
3,hemorrhage,I-problem,0
4,.,O,0


In [None]:
dev_df = dev_df.fillna(method='ffill')
dev_df.head()

Unnamed: 0,words,labels,sentence_id
0,950452368,O,0
2,CTMC,O,1
4,25445040,O,2
6,808386,O,3
8,5/9/1998,O,4


In [None]:
train_df['sentence'] = train_df[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
train_df['word_labels'] = train_df[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
train_df.head()

Unnamed: 0,words,labels,sentence_id,sentence,word_labels
73827,BY,O,7200,BY ENTERED,"O,O"
221724,Pneumonia,B-problem,20813,Pneumonia,B-problem
182533,started,O,17312,started with,"O,O"
97369,affecting,O,9412,"affecting , , / Conditions","O,O,O,O,O"
61737,pedis,O,6047,pedis Dorsalis,"O,O"


In [None]:
test_df['sentence'] = test_df[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
test_df['word_labels'] = test_df[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
test_df.head()

Unnamed: 0,words,labels,sentence_id,sentence,word_labels
0,Right,B-problem,0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"
1,capsular,I-problem,0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"
2,putamenal,I-problem,0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"
3,hemorrhage,I-problem,0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"
4,.,O,0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"


In [None]:
dev_df['sentence'] = dev_df[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
dev_df['word_labels'] = dev_df[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
dev_df.head()

Unnamed: 0,words,labels,sentence_id,sentence,word_labels
0,950452368,O,0,950452368,O
2,CTMC,O,1,CTMC,O
4,25445040,O,2,25445040,O
6,808386,O,3,808386,O
8,5/9/1998,O,4,5/9/1998 12:00:00 AM,"O,O,O"


In [None]:
train_df= train_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
train_df.head()

Unnamed: 0,sentence,word_labels
0,BY ENTERED,"O,O"
1,Pneumonia,B-problem
2,started with,"O,O"
3,"affecting , , / Conditions","O,O,O,O,O"
4,pedis Dorsalis,"O,O"


In [None]:
test_df= test_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
test_df.head()

Unnamed: 0,sentence,word_labels
0,Right capsular putamenal hemorrhage .,"B-problem,I-problem,I-problem,I-problem,O"
1,hypertension .,"B-problem,O"
2,Mrs. Dua is a 34 year old white female with an...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-problem,O,O,O,O,O,O"
3,She was in her usual state of health until aro...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,"Her speech became slurred , this was followed ...","O,O,O,B-problem,O,O,O,O,O,O,O,O,O,B-problem,I-..."


In [None]:
dev_df= dev_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
dev_df.head()

Unnamed: 0,sentence,word_labels
0,950452368,O
1,CTMC,O
2,25445040,O
3,808386,O
4,5/9/1998 12:00:00 AM,"O,O,O"


In [None]:
len(train_df)

13770

In [None]:
len(test_df)

50

In [None]:
train_df.iloc[4].sentence

'pedis Dorsalis'

In [None]:
train_df.iloc[4].word_labels

'O,O'

In [None]:
!pip install --upgrade transformers
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 8
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [labels_to_ids[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [None]:
'''train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)'''
train_dataset=train_df
test_dataset=test_df
print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (13770, 2)
TRAIN Dataset: (13770, 2)
TEST Dataset: (50, 2)


In [None]:
testing_set=dataset(dev_df, tokenizer, MAX_LEN)

In [None]:
training_set[0]

{'ids': tensor([ 101, 1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print('{0:10}  {1}'.format(token, label))

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
training_loader

<torch.utils.data.dataloader.DataLoader at 0x7eef61b5f5e0>

**DEFINING THE MODEL**

Here we define the model, BertForTokenClassification, and load it with the pretrained weights of "bert-base-uncased". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head of top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell below.

Then, we move the model to the GP


In [None]:
len(labels_to_ids)

7

In [None]:
labels_to_ids

{'B-problem': 0,
 'I-problem': 1,
 'O': 2,
 'B-test': 3,
 'I-test': 4,
 'B-treatment': 5,
 'I-treatment': 6}

In [None]:
from transformers import BertForSequenceClassification

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# Initialize custom model
model = CustomBertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=7,  # Number of different labels (I-Entity, B-Entity, O)
)
model.to(device)


**SANITY CHECK**

In [None]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.0332, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
mask

In [None]:
targets

In [None]:
ids

In [None]:
outputs[0]

tensor(1.3506, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 7])

Next, we define the optimizer. Here, we are just going to use Adam with a default learning rate. One can also decide to use more advanced ones such as AdamW (Adam with weight decay fix), which is included in the Transformers repository, and a learning rate scheduler, but we are not going to do that here.


In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    epoch_loss_graph = []  # Initialize list to store loss values for each epoch
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()  # Zero the gradients before each forward pass

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=targets).to_tuple()
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # compute training accuracy
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1  # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # backward pass
        loss.backward()
        optimizer.step()

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

    epoch_loss_graph.append(epoch_loss)  # Append the epoch loss to the list
    return epoch_loss_graph


In [None]:
import matplotlib.pyplot as plt

# Define the number of epochs
EPOCHS = 8

# List to store epoch losses
epoch_losses = []

# Train the model for each epoch
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    epoch_loss = train(epoch)
    epoch_losses.append(epoch_loss)

# Plotting the epoch vs loss graph
plt.plot(range(1, EPOCHS + 1), epoch_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Epoch vs Loss')
plt.legend()
plt.show()


In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=targets).to_tuple()

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 8.815978071652353e-05
Validation loss per 100 evaluation steps: 0.07253792790008069
Validation loss per 100 evaluation steps: 0.07701232211993375
Validation loss per 100 evaluation steps: 0.07626576262661074
Validation loss per 100 evaluation steps: 0.0750642349777317
Validation loss per 100 evaluation steps: 0.07269233771000677
Validation loss per 100 evaluation steps: 0.07305106206416298
Validation loss per 100 evaluation steps: 0.07228134580717871
Validation loss per 100 evaluation steps: 0.07309829963681258
Validation loss per 100 evaluation steps: 0.07220359290245752
Validation Loss: 0.07233608026158375
Validation Accuracy: 0.8621415648508541


In [None]:

labels

In [None]:
predictions

In [None]:
sentence = "Mary suffers from fever & cancer. docter give paracetamol"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

mary suffers from fever & cancer . docter give paracetamol
['O', 'O', 'I-problem', 'B-problem', 'I-problem', 'I-problem', 'O', 'O', 'O', 'O', 'B-treatment', 'B-treatment', 'B-treatment', 'O']


In [None]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

     problem       0.36      0.61      0.45      1763
        test       0.42      0.61      0.50       799
   treatment       0.55      0.70      0.62      1440

   micro avg       0.43      0.64      0.51      4002
   macro avg       0.44      0.64      0.52      4002
weighted avg       0.44      0.64      0.52      4002



In [None]:
!pip install seqeval

In [None]:
!pip install seqeval

In [None]:
# save vocabulary of the tokenizer
tokenizer.save_vocabulary('/content/drive/MyDrive/project_model_i2b2/tokenizer')
# save the model weights and its configuration file
model.save_pretrained('/content/drive/MyDrive/project_model_i2b2/weights')
print('All files saved')

All files saved


**model loading**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [None]:
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer
tokenizer1 = BertTokenizer.from_pretrained('/content/drive/MyDrive/project_model_i2b2/tokenizer')

# Load the model
model1 = BertForTokenClassification.from_pretrained('/content/drive/MyDrive/project_model_i2b2/weights')
model1.to(device)

In [None]:
!pip install torch
import torch

ids_to_labels = {
    0:'B-problem',
  1:'I-problem',
  2:'O',
 3: 'B-test',
  4:'I-test',
 5:'B-treatment' ,
 6: 'I-treatment'
}


In [None]:
sentence = "Mary suffers from fever & cancer."
MAX_LEN=128
inputs = tokenizer1(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model1(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model1.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer1.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

mary suffers from fever & cancer .
['O', 'O', 'O', 'B-problem', 'I-problem', 'I-problem', 'O']


In [None]:
!pip install  gradio

Collecting websockets<12.0,>=10.0 (from gradio-client==0.10.1->gradio)
  Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx>=0.24.1->gradio)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.24.1->gradio)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting colorama<0.5.0,>=0.4.3 (from typer[all]<1.0,>=0.9->gradio)
  Downloading 

In [None]:
import gradio as gr

# Define the function to perform predictions
def predict_entities(sentence):
    MAX_LEN = 128
    inputs = tokenizer1(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

    # Move to GPU
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Forward pass
    outputs = model1(ids, mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model1.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer1.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))  # list of tuples. Each tuple = (wordpiece, prediction)

    word_level_predictions = []
    for pair in wp_preds:
        if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
            # Skip prediction
            continue
        else:
            word_level_predictions.append(pair[1])

    # Join tokens, if they are not special ones
    str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
    return str_rep, word_level_predictions

# Define the input component
input_text = gr.Interface(fn=predict_entities, inputs="text", outputs="text", title="Entity Recognition Model", description="Enter a sentence:")

# Launch the interface
input_text.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://297493039307044f3f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


