<a href="https://colab.research.google.com/github/bshort95/final_project/blob/main/model/eli_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torchmetrics
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from transformers import BertTokenizerFast
import torch
from torchmetrics import F1Score
from torch.utils.data import DataLoader
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)


Mounted at /content/gdrive/


In [3]:
# big data set

# df = pd.read_csv('https://raw.githubusercontent.com/bshort95/final_project/main/data.csv')

# condensed data set
df = pd.read_csv('https://github.com/bshort95/final_project/raw/main/datav3.csv')

In [4]:
labels = [i.split() for i in df['mask'].values.tolist()]

unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]
 
print(unique_labels)

# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

label_all_tokens = True

{'S_perks', 'E_degree', 'E_Misc_Info', 'E_Soft_Skill', 'S_Years_Experince', 'S_Company_Name', 'S_resp', 'E_location', 'E_Important_Dates', 'S_Hard_Skill', 'E_resp', 'S_Important_Dates', 'E_Years_Experince', 'E_perks', 'S_degree', 'S_Soft_Skill', 'S_location', 'E_Company_Name', 'O', 'S_Misc_Info', 'E_Hard_Skill'}


In [5]:
def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [6]:
import numpy as np
df = df.rename(columns={'mask': 'labels'})
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])



In [7]:
from torch import nn
from transformers import BertForTokenClassification

class BertModel1(torch.nn.Module):

    def __init__(self):

        super(BertModel1, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [8]:



def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):
              temp = {}

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              labels = set(predictions.tolist() + label_clean.tolist())
              
              f1 = F1Score(task = "multiclass", num_classes=(len(unique_labels))).to(device)
              f1_score = f1(predictions.to(device), label_clean.to(device))

              total_acc_train += f1_score
              total_loss_train += loss.item()
                      

            model.zero_grad()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):
              temp = {}

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
                         
              f1 = F1Score(task = "multiclass", num_classes=(len(unique_labels))).to(device)
              f1_score = f1(predictions.to(device), label_clean.to(device))

              
              total_acc_val += f1_score
              total_loss_val += loss.item()
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | micro-f1: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | micro-f1: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 1e-6
EPOCHS = 75
BATCH_SIZE = 2

model = BertModel1()
train_loop(model, df_train, df_val)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  2.335 | micro-f1:  0.553 | Val_Loss:  1.345 | micro-f1:  0.962


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 2 | Loss:  1.117 | micro-f1:  0.811 | Val_Loss:  0.895 | micro-f1:  0.765


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 3 | Loss:  1.015 | micro-f1:  0.811 | Val_Loss:  0.846 | micro-f1:  0.923


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 4 | Loss:  0.962 | micro-f1:  0.811 | Val_Loss:  0.814 | micro-f1:  0.555


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 5 | Loss:  0.934 | micro-f1:  0.811 | Val_Loss:  0.783 | micro-f1:  0.721


100%|██████████| 144/144 [00:10<00:00, 13.52it/s]


Epochs: 6 | Loss:  0.905 | micro-f1:  0.811 | Val_Loss:  0.765 | micro-f1:  0.897


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 7 | Loss:  0.890 | micro-f1:  0.811 | Val_Loss:  0.750 | micro-f1:  0.790


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 8 | Loss:  0.878 | micro-f1:  0.811 | Val_Loss:  0.736 | micro-f1:  0.710


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 9 | Loss:  0.863 | micro-f1:  0.811 | Val_Loss:  0.726 | micro-f1:  0.741


100%|██████████| 144/144 [00:10<00:00, 13.41it/s]


Epochs: 10 | Loss:  0.853 | micro-f1:  0.812 | Val_Loss:  0.721 | micro-f1:  0.585


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 11 | Loss:  0.841 | micro-f1:  0.812 | Val_Loss:  0.709 | micro-f1:  0.948


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 12 | Loss:  0.820 | micro-f1:  0.812 | Val_Loss:  0.703 | micro-f1:  0.949


100%|██████████| 144/144 [00:10<00:00, 13.46it/s]


Epochs: 13 | Loss:  0.812 | micro-f1:  0.812 | Val_Loss:  0.683 | micro-f1:  0.792


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 14 | Loss:  0.801 | micro-f1:  0.812 | Val_Loss:  0.680 | micro-f1:  0.801


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 15 | Loss:  0.790 | micro-f1:  0.812 | Val_Loss:  0.663 | micro-f1:  0.618


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 16 | Loss:  0.776 | micro-f1:  0.812 | Val_Loss:  0.657 | micro-f1:  0.928


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 17 | Loss:  0.766 | micro-f1:  0.812 | Val_Loss:  0.655 | micro-f1:  0.872


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 18 | Loss:  0.751 | micro-f1:  0.812 | Val_Loss:  0.646 | micro-f1:  0.958


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 19 | Loss:  0.745 | micro-f1:  0.813 | Val_Loss:  0.631 | micro-f1:  0.802


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 20 | Loss:  0.734 | micro-f1:  0.813 | Val_Loss:  0.627 | micro-f1:  0.845


100%|██████████| 144/144 [00:10<00:00, 13.24it/s]


Epochs: 21 | Loss:  0.729 | micro-f1:  0.813 | Val_Loss:  0.621 | micro-f1:  0.938


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 22 | Loss:  0.718 | micro-f1:  0.814 | Val_Loss:  0.613 | micro-f1:  0.761


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 23 | Loss:  0.711 | micro-f1:  0.814 | Val_Loss:  0.605 | micro-f1:  0.770


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 24 | Loss:  0.706 | micro-f1:  0.814 | Val_Loss:  0.605 | micro-f1:  0.850


100%|██████████| 144/144 [00:10<00:00, 13.54it/s]


Epochs: 25 | Loss:  0.696 | micro-f1:  0.816 | Val_Loss:  0.601 | micro-f1:  0.893


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 26 | Loss:  0.691 | micro-f1:  0.815 | Val_Loss:  0.595 | micro-f1:  0.773


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 27 | Loss:  0.684 | micro-f1:  0.815 | Val_Loss:  0.599 | micro-f1:  0.814


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 28 | Loss:  0.680 | micro-f1:  0.816 | Val_Loss:  0.577 | micro-f1:  0.889


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 29 | Loss:  0.669 | micro-f1:  0.817 | Val_Loss:  0.579 | micro-f1:  0.762


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 30 | Loss:  0.666 | micro-f1:  0.817 | Val_Loss:  0.579 | micro-f1:  0.768


100%|██████████| 144/144 [00:10<00:00, 13.55it/s]


Epochs: 31 | Loss:  0.657 | micro-f1:  0.818 | Val_Loss:  0.591 | micro-f1:  0.765


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 32 | Loss:  0.655 | micro-f1:  0.818 | Val_Loss:  0.579 | micro-f1:  0.941


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 33 | Loss:  0.649 | micro-f1:  0.819 | Val_Loss:  0.563 | micro-f1:  0.810


100%|██████████| 144/144 [00:10<00:00, 13.43it/s]


Epochs: 34 | Loss:  0.647 | micro-f1:  0.820 | Val_Loss:  0.560 | micro-f1:  0.724


100%|██████████| 144/144 [00:10<00:00, 13.52it/s]


Epochs: 35 | Loss:  0.640 | micro-f1:  0.820 | Val_Loss:  0.557 | micro-f1:  0.686


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 36 | Loss:  0.635 | micro-f1:  0.819 | Val_Loss:  0.559 | micro-f1:  0.887


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 37 | Loss:  0.631 | micro-f1:  0.819 | Val_Loss:  0.558 | micro-f1:  0.817


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 38 | Loss:  0.629 | micro-f1:  0.819 | Val_Loss:  0.547 | micro-f1:  0.916


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 39 | Loss:  0.623 | micro-f1:  0.821 | Val_Loss:  0.552 | micro-f1:  0.815


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 40 | Loss:  0.616 | micro-f1:  0.821 | Val_Loss:  0.558 | micro-f1:  0.695


100%|██████████| 144/144 [00:10<00:00, 13.46it/s]


Epochs: 41 | Loss:  0.615 | micro-f1:  0.822 | Val_Loss:  0.553 | micro-f1:  0.530


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 42 | Loss:  0.613 | micro-f1:  0.821 | Val_Loss:  0.547 | micro-f1:  0.742


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 43 | Loss:  0.609 | micro-f1:  0.822 | Val_Loss:  0.542 | micro-f1:  0.842


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 44 | Loss:  0.604 | micro-f1:  0.821 | Val_Loss:  0.539 | micro-f1:  0.904


100%|██████████| 144/144 [00:10<00:00, 13.43it/s]


Epochs: 45 | Loss:  0.598 | micro-f1:  0.823 | Val_Loss:  0.543 | micro-f1:  0.906


100%|██████████| 144/144 [00:10<00:00, 13.46it/s]


Epochs: 46 | Loss:  0.598 | micro-f1:  0.823 | Val_Loss:  0.545 | micro-f1:  0.879


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 47 | Loss:  0.597 | micro-f1:  0.823 | Val_Loss:  0.540 | micro-f1:  0.806


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 48 | Loss:  0.593 | micro-f1:  0.822 | Val_Loss:  0.540 | micro-f1:  0.893


100%|██████████| 144/144 [00:10<00:00, 13.44it/s]


Epochs: 49 | Loss:  0.589 | micro-f1:  0.824 | Val_Loss:  0.531 | micro-f1:  0.912


100%|██████████| 144/144 [00:10<00:00, 13.41it/s]


Epochs: 50 | Loss:  0.587 | micro-f1:  0.824 | Val_Loss:  0.540 | micro-f1:  0.774


100%|██████████| 144/144 [00:10<00:00, 13.42it/s]


Epochs: 51 | Loss:  0.585 | micro-f1:  0.823 | Val_Loss:  0.533 | micro-f1:  0.771


100%|██████████| 144/144 [00:10<00:00, 13.44it/s]


Epochs: 52 | Loss:  0.580 | micro-f1:  0.824 | Val_Loss:  0.534 | micro-f1:  0.689


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 53 | Loss:  0.579 | micro-f1:  0.824 | Val_Loss:  0.533 | micro-f1:  0.811


100%|██████████| 144/144 [00:10<00:00, 13.37it/s]


Epochs: 54 | Loss:  0.573 | micro-f1:  0.824 | Val_Loss:  0.538 | micro-f1:  0.759


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 55 | Loss:  0.573 | micro-f1:  0.824 | Val_Loss:  0.535 | micro-f1:  0.712


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 56 | Loss:  0.572 | micro-f1:  0.825 | Val_Loss:  0.528 | micro-f1:  0.778


100%|██████████| 144/144 [00:10<00:00, 13.53it/s]


Epochs: 57 | Loss:  0.567 | micro-f1:  0.827 | Val_Loss:  0.533 | micro-f1:  0.793


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 58 | Loss:  0.564 | micro-f1:  0.825 | Val_Loss:  0.529 | micro-f1:  0.541


100%|██████████| 144/144 [00:10<00:00, 13.48it/s]


Epochs: 59 | Loss:  0.562 | micro-f1:  0.826 | Val_Loss:  0.534 | micro-f1:  0.670


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 60 | Loss:  0.561 | micro-f1:  0.825 | Val_Loss:  0.525 | micro-f1:  0.802


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 61 | Loss:  0.556 | micro-f1:  0.827 | Val_Loss:  0.530 | micro-f1:  0.690


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 62 | Loss:  0.554 | micro-f1:  0.828 | Val_Loss:  0.528 | micro-f1:  0.856


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 63 | Loss:  0.555 | micro-f1:  0.828 | Val_Loss:  0.531 | micro-f1:  0.599


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 64 | Loss:  0.549 | micro-f1:  0.827 | Val_Loss:  0.528 | micro-f1:  0.791


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 65 | Loss:  0.550 | micro-f1:  0.827 | Val_Loss:  0.521 | micro-f1:  0.883


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 66 | Loss:  0.549 | micro-f1:  0.827 | Val_Loss:  0.527 | micro-f1:  0.795


100%|██████████| 144/144 [00:10<00:00, 13.46it/s]


Epochs: 67 | Loss:  0.544 | micro-f1:  0.829 | Val_Loss:  0.522 | micro-f1:  0.554


100%|██████████| 144/144 [00:10<00:00, 13.45it/s]


Epochs: 68 | Loss:  0.541 | micro-f1:  0.829 | Val_Loss:  0.520 | micro-f1:  0.803


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 69 | Loss:  0.542 | micro-f1:  0.827 | Val_Loss:  0.528 | micro-f1:  0.841


100%|██████████| 144/144 [00:10<00:00, 13.52it/s]


Epochs: 70 | Loss:  0.539 | micro-f1:  0.827 | Val_Loss:  0.523 | micro-f1:  0.740


100%|██████████| 144/144 [00:10<00:00, 13.49it/s]


Epochs: 71 | Loss:  0.537 | micro-f1:  0.829 | Val_Loss:  0.517 | micro-f1:  0.906


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 72 | Loss:  0.534 | micro-f1:  0.830 | Val_Loss:  0.513 | micro-f1:  0.905


100%|██████████| 144/144 [00:10<00:00, 13.47it/s]


Epochs: 73 | Loss:  0.537 | micro-f1:  0.830 | Val_Loss:  0.508 | micro-f1:  0.928


100%|██████████| 144/144 [00:10<00:00, 13.50it/s]


Epochs: 74 | Loss:  0.532 | micro-f1:  0.828 | Val_Loss:  0.514 | micro-f1:  0.744


100%|██████████| 144/144 [00:10<00:00, 13.51it/s]


Epochs: 75 | Loss:  0.531 | micro-f1:  0.829 | Val_Loss:  0.516 | micro-f1:  0.882


In [9]:
import pickle
torch.save(model,"/content/gdrive/MyDrive/models/srproj/model.pt")


In [10]:
pickled_model = torch.load("/content/gdrive/MyDrive/models/srproj/model.pt")

In [17]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):
              temp = {}

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              labels = set(predictions.tolist() + label_clean.tolist())
              
              for j in range(len(labels)):
                temp[list(labels)[j]] = j

              temp_pred = []
              temp_label = []
              for pred in predictions.tolist():
                temp_pred.append(temp[pred])
              for pred in label_clean.tolist():
                temp_label.append(temp[pred])
            

              temp_pred = torch.tensor(temp_pred)
              temp_label = torch.tensor(temp_label)
              
              # if len(labels) == 1:
              #   f1 = F1Score(task = "binary", num_classes=(len(labels)), multiclass=False)
              # else:
              #   f1 = F1Score(task = "multiclass", num_classes=(len(labels)))
              f1 = F1Score(task = "multiclass", num_classes=(len(unique_labels))).to(device)
              f1_score = f1(predictions.to(device), label_clean.to(device))

              # acc = (predictions == label_clean).float().mean()
              total_acc_test += f1_score

        # val_accuracy = total_acc_val / len(df_val)
        # val_loss = total_loss_val / len(df_val)

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(pickled_model, df_test)

Test Accuracy:  0.795


In [12]:

def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
            
evaluate_one_text(pickled_model, 'Thank you for applying to a job at eClerx US. Please note that if hired, eClerx LLC will require you provide your Covid-19 vaccination status on your first day of employment. Vaccination information allows the company to plan health and safety protocols and')

Thank you for applying to a job at eClerx US. Please note that if hired, eClerx LLC will require you provide your Covid-19 vaccination status on your first day of employment. Vaccination information allows the company to plan health and safety protocols and
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
