In [28]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [29]:
# Path handed to both `from_pretrained` calls below; the tokenizer and the
# token-classification model are restored from this same location.
directory = './models/Transformers'

# Restore the model and its matching tokenizer
model = AutoModelForTokenClassification.from_pretrained(directory)
tokenizer = AutoTokenizer.from_pretrained(directory)

# Put the module in evaluation mode. Being the last expression of the cell,
# this call's return value (the model) is also echoed as the cell output.
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [30]:
# One raw, untokenised sentence used as the demo input for the tagger.
sample_text = "In a Deep Learning distributed computing environment, nodes execute algorithms concurrently, leveraging data structures such as hash tables and binary trees, while protocols like TCP/IP ensure reliable data transmission, and security measures such as encryption and blockchain technology safeguard sensitive information against cyber threats."

# Encode the sentence as PyTorch tensors. The text is passed as a plain string
# (not a pre-split word list), and truncation is requested for over-long input.
inputs = tokenizer(
    sample_text,
    truncation=True,
    is_split_into_words=False,
    return_tensors="pt",
)

In [31]:
# Class index -> tag name lookup. Each name's position in the list below is
# its class index, so the list order must not be changed.
# Tags carry a B-/I-/E-/S- prefix followed by an entity type, plus the plain
# 'O' tag.
# NOTE(review): the ordering looks arbitrary; presumably it mirrors the label
# ids the checkpoint was trained with -- confirm against the training code or
# the checkpoint's config before trusting decoded tags.
id2tag = dict(enumerate([
    'E-DATASET',            # 0
    'I-RESEARCH_PROBLEM',   # 1
    'B-METHOD',             # 2
    'I-TOOL',               # 3
    'B-TOOL',               # 4
    'E-TOOL',               # 5
    'E-RESOURCE',           # 6
    'E-SOLUTION',           # 7
    'S-RESEARCH_PROBLEM',   # 8
    'I-DATASET',            # 9
    'B-RESEARCH_PROBLEM',   # 10
    'B-LANGUAGE',           # 11
    'B-RESOURCE',           # 12
    'S-RESOURCE',           # 13
    'E-METHOD',             # 14
    'B-DATASET',            # 15
    'I-METHOD',             # 16
    'I-LANGUAGE',           # 17
    'I-SOLUTION',           # 18
    'S-SOLUTION',           # 19
    'S-LANGUAGE',           # 20
    'S-METHOD',             # 21
    'S-DATASET',            # 22
    'S-TOOL',               # 23
    'E-LANGUAGE',           # 24
    'E-RESEARCH_PROBLEM',   # 25
    'B-SOLUTION',           # 26
    'O',                    # 27
    'I-RESOURCE',           # 28
]))

In [37]:
def _merge_wordpieces(tokens, tags, skip_tokens=()):
    """Merge WordPiece continuation pieces ('##xyz') back into whole words.

    Args:
        tokens: Token strings, one per model position.
        tags: Tag names aligned one-to-one with ``tokens``.
        skip_tokens: Token strings to drop entirely (e.g. the tokenizer's
            special markers). A skipped token also ends the word before it.

    Returns:
        A list of ``(word, tag)`` tuples. Each word carries the tag of its
        first piece; tags of continuation pieces are ignored.
    """
    words = []
    word, word_tag = "", None

    for token, tag in zip(tokens, tags):
        if token in skip_tokens:
            # Special markers are not part of the text: flush and move on.
            if word:
                words.append((word, word_tag))
            word, word_tag = "", None
            continue

        is_continuation = token.startswith("##")
        piece = token[2:] if is_continuation else token

        if is_continuation and word:
            word += piece
            continue

        # Start of a new word. A stray continuation piece with nothing to
        # attach to is treated as a new word instead of leaving the tag unset.
        if word:
            words.append((word, word_tag))
        word, word_tag = piece, tag

    # Append the last word
    if word:
        words.append((word, word_tag))
    return words


# Forward pass without gradient tracking; keep the highest-scoring class
# index along the last dimension of the logits.
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Retrieve tokens and their corresponding tags.
# .tolist() yields plain Python ints and, unlike .numpy(), does not require
# the tensor to live on the CPU.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
tags = [id2tag[i] for i in predictions[0].tolist()]

# The tokenizer's CLS/SEP/PAD markers are scaffolding, not input text, so they
# must not be reported as entities. The unknown-word marker is deliberately
# kept because it stands in for a real word of the input.
special_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}
full_tokens = _merge_wordpieces(tokens, tags, skip_tokens=special_tokens)

# Report only the words whose tag is not the outside tag 'O'
print("Token\tPredicted Tag")
for token, tag in full_tokens:
    if tag != 'O':
        print(f"{token}\t{tag}")

Token	Predicted Tag
distributed	E-RESEARCH_PROBLEM
computing	E-RESEARCH_PROBLEM
data	E-RESEARCH_PROBLEM
hash	E-RESEARCH_PROBLEM
trees	E-RESEARCH_PROBLEM
data	E-RESEARCH_PROBLEM
transmission	E-RESEARCH_PROBLEM
encryption	E-RESEARCH_PROBLEM
blockchain	B-RESEARCH_PROBLEM
information	E-RESEARCH_PROBLEM
cyber	B-RESEARCH_PROBLEM
.	I-RESEARCH_PROBLEM
[SEP]	I-RESEARCH_PROBLEM
