In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value
import json
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForTokenClassification
from seqeval.scheme import IOB2

In [2]:
# Tag name -> integer class id used to translate model predictions back to tags.
# NOTE(review): presumably these ids must match the ones the checkpoint was
# fine-tuned with -- verify against model.config.id2label and adjust as necessary.
label_map = {
    'O': 0,
    'B-object': 1,
    'I-quality': 2,
    'B-quality': 3,
    'I-object': 4,
    'B-value': 5,
    'I-value': 6,
    'B-property': 7,
    'I-property': 8,
    'B-OP': 9,
    'I-OP': 10,
}

# The tokenizer comes from the stock cased BERT vocabulary; the classification
# weights come from the local fine-tuning checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("checkpoint-2910")



In [3]:
def tokenize_input(text, tokenizer, max_length=512, padding="max_length"):
    """Encode ``text`` with ``tokenizer`` and return the result as PyTorch tensors.

    Args:
        text: The input to encode; passed to ``tokenizer`` unchanged.
        tokenizer: A callable tokenizer (e.g. a Hugging Face tokenizer).
        max_length: Upper bound on the encoded length; longer inputs are
            truncated. Defaults to 512, the value that used to be hard-coded.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``"max_length"`` pads every input up to ``max_length`` (the
            original behaviour). Pass ``False`` or ``"longest"`` to avoid
            running a full-length sequence through the model for short inputs.

    Returns:
        Whatever ``tokenizer`` returns for these arguments (requested with
        ``return_tensors="pt"``).
    """
    return tokenizer(
        text,
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_tensors="pt",
    )

def predict_tags(text, tokenizer, model, label_map):
    """Predict one tag per word of ``text`` with a token-classification model.

    The text is encoded with ``tokenize_input``, run through ``model`` without
    gradient tracking, and the arg-max class of every token is computed.
    Sub-word pieces that belong to the same word (same ``word_ids()`` entry)
    are merged back into a single word, which receives the tag predicted for
    its first piece. Special tokens and padding (``word_id`` of ``None``) are
    skipped.

    Args:
        text: Input sentence to tag.
        tokenizer: Tokenizer whose encoding exposes ``word_ids()`` and which
            provides ``decode``.
        model: Token-classification model exposing ``device`` and returning an
            object with a ``logits`` attribute.
        label_map: Mapping from tag name to integer class id.

    Returns:
        A list of ``(word, tag)`` tuples in sentence order.

    Raises:
        ValueError: If the model predicts a class id that is not present in
            ``label_map``.
    """
    encoded_input = tokenize_input(text, tokenizer)

    input_ids = encoded_input['input_ids'].to(model.device)
    attention_mask = encoded_input['attention_mask'].to(model.device)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(output.logits, dim=-1)

    # Convert to plain Python lists once instead of calling .item() per token.
    token_ids = input_ids[0].tolist()
    label_ids = predictions[0].tolist()

    # Invert the label map a single time. setdefault keeps the first tag seen
    # for a duplicated id, matching the old list.index() lookup.
    id_to_label = {}
    for label, label_id in label_map.items():
        id_to_label.setdefault(label_id, label)

    # Collect the pieces of each word and remember the prediction of its first
    # piece; dict insertion order preserves sentence order.
    pieces_by_word = {}
    first_label_by_word = {}
    for position, word_id in enumerate(encoded_input.word_ids(batch_index=0)):
        if word_id is None or word_id == -1:  # special tokens / padding
            continue
        if word_id not in pieces_by_word:
            pieces_by_word[word_id] = []
            first_label_by_word[word_id] = label_ids[position]
        pieces_by_word[word_id].append(token_ids[position])

    word_predictions = []
    for word_id, piece_ids in pieces_by_word.items():
        label_id = first_label_by_word[word_id]
        if label_id not in id_to_label:
            raise ValueError(
                f"Predicted class id {label_id} is not present in label_map"
            )
        # Decoding all pieces together re-joins sub-word fragments into the word.
        decoded_word = tokenizer.decode(
            piece_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        word_predictions.append((decoded_word, id_to_label[label_id]))

    return word_predictions


In [4]:
# Sample sentence used to exercise the tagger.
input_text = "The height of door knob should be less than 1 meters from the ground."

# Run the tagger on the sample and show the resulting (token, tag) pairs.
predicted_tags = predict_tags(
    text=input_text,
    tokenizer=tokenizer,
    model=model,
    label_map=label_map,
)
print(predicted_tags)

[('The', 'O'), ('height', 'B-property'), ('of', 'O'), ('door', 'B-object'), ('knob', 'I-object'), ('should', 'O'), ('be', 'O'), ('less', 'B-OP'), ('than', 'I-OP'), ('1', 'B-value'), ('meters', 'I-value'), ('from', 'I-quality'), ('the', 'I-quality'), ('ground', 'I-quality'), ('.', 'O')]
