In [1]:
from transformers import BertPreTrainedModel, BertModel
from transformers import BertTokenizerFast
from src import config
from data_process import dataset
from torch.utils.data import DataLoader
import torch
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import recall_score

  from .autonotebook import tqdm as notebook_tqdm


## Data process

In [2]:
tokenizer = BertTokenizerFast.from_pretrained("clw8998/Product-Name-NER-model")
processed_train_data = [
        {
            'context': '【享夢城堡】超柔暖暖毯被150x195cm-角落小夥伴 壽司貓-膚橘',
            'question': '品牌',
            'answer': [
                'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
            ]
        }
]

In [3]:
# tokeniz輸入字串，然後改成格式: [BOS] 商品名稱 [SEP] 屬性 [SEP]
train_dataset = dataset.BERTDataset_preprocess(processed_train_data, [], tokenizer)
train_dataset[0]

({'input_ids': [[101, 523, 775, 1918, 1814, 1836, 524, 6631, 3382, 3265, 3265, 3691, 6158, 8269, 8206, 8818, 8157, 8341, 118, 6235, 5862, 2207, 1919, 845, 1904, 1385, 6506, 118, 5604, 3580, 102, 1501, 4277, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [4]:
train_loader = DataLoader(dataset = train_dataset,
                            batch_size = 1,
                            shuffle = True,
                            num_workers = 2,
                            collate_fn = dataset.BERTDataset_preprocess.collate_fn)

In [5]:
for batch_data, index, batch_gt, offset_mapping, p_names in train_loader:
    # 提取所需的資料
    input_ids = batch_data[0]
    attention_mask = batch_data[2]
    token_type_ids = batch_data[1]
    index = index
    batch_gt = batch_gt
    offset_mapping = offset_mapping
    p_names = p_names

    print("Input IDs:\n", input_ids) # 輸入到模型的資料，token ids
    print("Attention Mask:\n", attention_mask) # 輸入到模型的資料，atteention，0 代表該token不被關注
    print("Token Type IDs:\n", token_type_ids) # 輸入到模型的資料，token type，0 代標商品名稱的token，1 代標問題的token
    print("Index:\n", index) # 用不到
    print("Batch GT (Ground Truths):\n", batch_gt) # 正確答案，2代表實體的B，1代表實體的I，0代表O，-100代表忽略
    print("Offset Mapping:\n", offset_mapping) # 每個token對應到輸入字串的區間
    print("P Names:\n", p_names) # 原始輸入之商品名稱
    break

# [BOS] '【享夢城堡】超柔暖暖毯被150x195cm-角落小夥伴 壽司貓-膚橘' [SEP] '品牌' [SEP]

Input IDs:
 tensor([[ 101,  523,  775, 1918, 1814, 1836,  524, 6631, 3382, 3265, 3265, 3691,
         6158, 8269, 8206, 8818, 8157, 8341,  118, 6235, 5862, 2207, 1919,  845,
         1904, 1385, 6506,  118, 5604, 3580,  102, 1501, 4277,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 

## Model Architecture

In [6]:
class Contextual_BERT(BertPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.QA = Question_Answering()

        # Initialize weights and apply final processing
        self.post_init()
    
    def forward(
        self,
        input_ids,
        gt = None,
        attention_mask = None,
        token_type_ids = None,
        position_ids = None,
        head_mask = None,
        inputs_embeds = None,
        output_attentions = None,
        output_hidden_states = None,
        return_dict = None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        hidden_states = outputs[0]    # 0 代表最後一層的輸出，長度與輸入相同(bs, max_length, 768), 1 代表第0個token的向量，(bs, 768)
        return self.QA(hidden_states)
            
class Question_Answering(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.BIO_classifier = torch.nn.Linear(config.hidden, 3)
        self.dropout = torch.nn.Dropout(0.2)
        

    def forward(self, batch_hidden_states):
        
        batch_hidden_states = self.dropout(batch_hidden_states)
        batch_logits = self.BIO_classifier(batch_hidden_states).squeeze() # dim = (bs, max_query_len, 3)
        return batch_logits

In [7]:
model = Contextual_BERT.from_pretrained("clw8998/Product-Name-NER-model")

In [8]:
model

Contextual_BERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [9]:
batch_logits = model(input_ids = batch_data[0],
                    attention_mask = batch_data[2],
                    token_type_ids = batch_data[1])
batch_logits

tensor([[ 8.1059, -4.0888, -4.5058],
        [ 8.0974, -3.8082, -4.7128],
        [-2.9853, -2.1178,  5.8071],
        [-3.4841,  6.4687, -3.3589],
        [-3.6310,  6.3099, -2.9481],
        [-3.3073,  6.4135, -3.4099],
        [ 8.0723, -3.8192, -4.7608],
        [ 7.8756, -4.2779, -4.0285],
        [ 7.9492, -3.8682, -4.7294],
        [ 7.9941, -4.2765, -4.3242],
        [ 8.0210, -3.8359, -4.8442],
        [ 7.4887, -3.4424, -4.8452],
        [ 7.9901, -3.5014, -5.2639],
        [ 7.9308, -3.9771, -4.4201],
        [ 8.0414, -3.7620, -4.8761],
        [ 8.0760, -3.8390, -4.7921],
        [ 8.0908, -3.8313, -4.8225],
        [ 8.1038, -3.9345, -4.7146],
        [ 8.1054, -3.8269, -4.7686],
        [ 7.4536, -4.3248, -3.5178],
        [ 7.4464, -3.6202, -4.4964],
        [ 7.5143, -3.7000, -4.4047],
        [ 7.4665, -3.6719, -4.3938],
        [ 7.5233, -3.6240, -4.4983],
        [ 7.4242, -4.1915, -3.6446],
        [ 7.4969, -3.4840, -4.6253],
        [ 7.2133, -3.4239, -4.4558],
 

In [10]:
if batch_logits.ndim < 3:
    batch_logits = torch.unsqueeze(batch_logits, 0) # add batch_size dim
batch_logits.shape

torch.Size([1, 128, 3])

## Train

### Loss Funtion

In [11]:
class loss_fn(torch.nn.Module):

    def __init__(self):
        super(loss_fn, self).__init__()
        self.ce_loss = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction = 'mean')

    def forward(self, logits, targets, q):
  
        logits = torch.flatten(logits, start_dim= 0, end_dim = 1)
        targets = torch.flatten(targets, start_dim= 0, end_dim = 1)

        if q == 0:
            loss = self.ce_loss(logits, targets)
        
        return loss
    
loss_fn = loss_fn()

In [12]:
batch_loss = loss_fn(batch_logits, batch_gt, 0)
batch_loss

tensor(4.1853e-05, grad_fn=<NllLossBackward0>)

### Optimizer

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.99), weight_decay=5e-5)
optimizer.zero_grad()

In [14]:
batch_loss.backward()

In [15]:
optimizer.step()
optimizer.zero_grad()

## Evaluate

In [16]:
tokenizer = BertTokenizerFast.from_pretrained("clw8998/Product-Name-NER-model")
evaluate_data = [
        {
            'context': '【享夢城堡】超柔暖暖毯被150x195cm-角落小夥伴 壽司貓-膚橘',
            'question': '品牌',
            'answer': [
                'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
                'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'
            ]
        }
]

In [17]:
# tokeniz輸入字串，然後改成格式: [BOS] 商品名稱 [SEP] 屬性 [SEP]
evaluate_dataset = dataset.BERTDataset_preprocess(evaluate_data, [], tokenizer)
print(evaluate_dataset[0])

({'input_ids': [[101, 523, 775, 1918, 1814, 1836, 524, 6631, 3382, 3265, 3265, 3691, 6158, 8269, 8206, 8818, 8157, 8341, 118, 6235, 5862, 2207, 1919, 845, 1904, 1385, 6506, 118, 5604, 3580, 102, 1501, 4277, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [18]:
evaluate_loader = DataLoader(dataset = evaluate_dataset,
                            batch_size = 1,
                            shuffle = False,
                            num_workers = 2,
                            collate_fn = dataset.BERTDataset_preprocess.collate_fn)

In [19]:
for batch_data, index, batch_gt, offset_mapping, p_names in evaluate_loader:
    # 提取所需的資料
    input_ids = batch_data[0]
    attention_mask = batch_data[2]
    token_type_ids = batch_data[1]
    index = index
    batch_gt = batch_gt
    offset_mapping = offset_mapping
    p_names = p_names

    print("Input IDs:\n", input_ids) # 輸入到模型的資料，token ids
    print("Attention Mask:\n", attention_mask) # 輸入到模型的資料，atteention，0 代表該token不被關注
    print("Token Type IDs:\n", token_type_ids) # 輸入到模型的資料，token type，0 代標商品名稱的token，1 代標問題的token
    print("Index:\n", index) # 用不到
    print("Batch GT (Ground Truths):\n", batch_gt) # 正確答案，2代表實體的B，1代表實體的I，0代表O，-100代表忽略
    print("Offset Mapping:\n", offset_mapping) # 每個token對應到輸入字串的區間
    print("P Names:\n", p_names) # 原始輸入之商品名稱
    break

Input IDs:
 tensor([[ 101,  523,  775, 1918, 1814, 1836,  524, 6631, 3382, 3265, 3265, 3691,
         6158, 8269, 8206, 8818, 8157, 8341,  118, 6235, 5862, 2207, 1919,  845,
         1904, 1385, 6506,  118, 5604, 3580,  102, 1501, 4277,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 

In [20]:
batch_logits = model(input_ids = batch_data[0].to(config.device),
                                attention_mask = batch_data[2].to(config.device),
                                token_type_ids = batch_data[1].to(config.device))
            
if batch_logits.ndim < 3:
    batch_logits = batch_logits.unsqueeze(0) # add batch size dim

batch_pre = torch.argmax(batch_logits, dim = -1) # 取得模型預測的BIO標籤
batch_pre

tensor([[0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0]])

In [21]:
# 將模型輸出轉換成 seqeval 模組需要的輸入格式

total_gt = {}
total_pre = {}

for input_ids, token_type_ids, model_pre, gt in zip(batch_data[0], batch_data[1], batch_pre, batch_gt):       
    question_start = 0
    for id in token_type_ids:
        if id == 1:
            break
        question_start += 1
    question = tokenizer.convert_ids_to_tokens(input_ids[question_start : question_start + token_type_ids.sum() - 1])
    question = "".join(question)

    # seqeval
    BIO_gt = []
    for i in range(1, question_start - 1):
        if gt[i] == 2:
            BIO_gt.append('B')
        elif gt[i] == 1:
            BIO_gt.append('I')
        elif gt[i] == 0:
            BIO_gt.append('O')
    
    BIO_pre = []
    for i in range(1, question_start - 1):
        if model_pre[i] == 2:
            BIO_pre.append('B')
        elif model_pre[i] == 1:
            BIO_pre.append('I')
        elif model_pre[i] == 0:
            BIO_pre.append('O')

    if len(BIO_gt) != len(BIO_pre):
        pass
        print('pred length dont equal gt length ! maybe input too long!')

    else:
        if question not in total_gt.keys():
            total_gt[question] = []
        total_gt[question].append(BIO_gt)
        if question not in total_pre.keys():
            total_pre[question] = []
        total_pre[question].append(BIO_pre)

In [22]:
print(f"正確答案:\n{total_gt}")
print(f"模型預測:\n{total_pre}")

正確答案:
{'品牌': [['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]}
模型預測:
{'品牌': [['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]}


In [23]:
# 使用 seqeval 計算出 precision, recall, f1_score
# 將所有屬性的 precision, recall, f1_score 加總取平均，即得到最後的平均 precision, recall, f1_score
total_f1 = 0
total_p = 0
total_r = 0
results = []

for key in total_gt.keys():
    precision = precision_score(total_gt[key], total_pre[key])
    recall = recall_score(total_gt[key], total_pre[key])
    f1 = f1_score(total_gt[key], total_pre[key])

    total_p += precision
    total_r += recall
    total_f1 += f1
    results.append([round(precision,4), round(recall,4), round(f1,4)])
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')


Precision: 1.0000, Recall: 1.0000, F1: 1.0000


## Inference

In [24]:
tokenizer = BertTokenizerFast.from_pretrained("clw8998/Product-Name-NER-model")
inference_data = [
        {
            'context': '【a‵bella浪漫晶飾】方形密碼-深海藍水晶手鍊',
            'question': '品牌',
            'answer': []
        }
]

In [25]:
# tokeniz輸入字串，然後改成格式: [BOS] 商品名稱 [SEP] 屬性 [SEP]
inference_dataset = dataset.BERTDataset_preprocess(inference_data, [], tokenizer)
print(inference_dataset[0])

({'input_ids': [[101, 523, 143, 100, 13192, 3857, 4035, 3253, 7617, 524, 3175, 2501, 2166, 4826, 118, 3918, 3862, 5965, 3717, 3253, 2797, 7101, 102, 1501, 4277, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0

In [26]:
inference_loader = DataLoader(dataset = inference_dataset,
                            batch_size = 1,
                            shuffle = False,
                            num_workers = 2,
                            collate_fn = dataset.BERTDataset_preprocess.collate_fn)

In [27]:
for batch_data, index, batch_gt, offset_mapping, p_names in inference_loader:
    # 提取所需的資料
    input_ids = batch_data[0]
    attention_mask = batch_data[2]
    token_type_ids = batch_data[1]
    index = index
    batch_gt = batch_gt
    offset_mapping = offset_mapping
    p_names = p_names

    print("Input IDs:\n", input_ids) # 輸入到模型的資料，token ids
    print("Attention Mask:\n", attention_mask) # 輸入到模型的資料，atteention，0 代表該token不被關注
    print("Token Type IDs:\n", token_type_ids) # 輸入到模型的資料，token type，0 代標商品名稱的token，1 代標問題的token
    print("Index:\n", index) # 用不到
    print("Batch GT (Ground Truths):\n", batch_gt) # 正確答案，2代表實體的B，1代表實體的I，0代表O，-100代表忽略
    print("Offset Mapping:\n", offset_mapping) # 每個token對應到輸入字串的區間
    print("P Names:\n", p_names) # 原始輸入之商品名稱
    break

Input IDs:
 tensor([[  101,   523,   143,   100, 13192,  3857,  4035,  3253,  7617,   524,
          3175,  2501,  2166,  4826,   118,  3918,  3862,  5965,  3717,  3253,
          2797,  7101,   102,  1501,  4277,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,    

In [28]:
batch_logits = model(input_ids = batch_data[0].to(config.device),
                                attention_mask = batch_data[2].to(config.device),
                                token_type_ids = batch_data[1].to(config.device))
            
if batch_logits.ndim < 3:
    batch_logits = batch_logits.unsqueeze(0) # add batch size dim

batch_pre = torch.argmax(batch_logits, dim = -1) # 取得模型預測的BIO標籤
batch_pre

tensor([[0, 0, 2, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [29]:
result_text = []
result_pos = []
result_confidence = []

# 對 batch 中的每個樣本進行處理
for input_ids, token_type_ids, model_pre, offset, logits, p_name in zip(batch_data[0], batch_data[1], batch_pre, offset_mapping, batch_logits, p_names):

    # 取出問題(屬性)
    question_start = int(token_type_ids.argmax(-1)) # 找到問題的開始位置
    question = tokenizer.convert_ids_to_tokens(input_ids[question_start : question_start + token_type_ids.sum() - 1]) # 根據 token_type_ids 提取問題
    question = "".join(question) # 將 token 轉換為字符串

    # 取出商品名稱
    offset = offset[1:question_start - 1] # 移除CLS和問題的offset
    # 產生一個存放 token 的 list
    context = [' '] * (offset[-1][1]) # 根據最後一個token的index產生一個空白字符的列表
    for idx, text in enumerate(tokenizer.convert_ids_to_tokens(input_ids[1:question_start - 1])):
        if text == '[UNK]': # 如果是 UNK token 就從商品名稱中取值
            context[offset[idx][0] : offset[idx][1]] = list(p_name[offset[idx][0] : offset[idx][1]])
        else:
            context[offset[idx][0] : offset[idx][1]] = list(text.replace('##', '')) # '##'代表與前一個 token 相連
    context = "".join(context) # 將 context 轉換為字符串

    # 初始化結果列表
    result_text.append([context, {}])
    result_pos.append([context, {}])
    result_confidence.append([context, {}])

    # 取得每個字元的 confidence
    tmp_confidence = []
    last_idx = 0
    # 計算 softmax，並取出最大值作為 confidence
    for idx, confidence in enumerate(torch.nn.functional.softmax(logits[1:question_start - 1], dim=-1).max(-1)[0].detach().cpu().numpy().tolist()):
        # 若一個token超過一個字元，只有最後一個字元會被賦予 confifence，其餘都是 0.0
        # ex: "bella" 是單一一個 token 且信心值為 0.9，則 "bella" 每個字元信心值為: [0.0, 0.0, 0.0, 0.0, 0.9]
        # 計算時，會忽略補充的
        if last_idx < offset[idx][0]:
            tmp_confidence += [0.0] * (offset[idx][0] - last_idx) # 填充 0.0 表示非 token 的部分
        tmp_confidence = tmp_confidence + ([0.0] * (offset[idx][1] - offset[idx][0] - 1)) + [confidence]
        last_idx = offset[idx][1]
    
    # 將 confidence 加入結果
    result_confidence[-1][1][question] = tmp_confidence

    model_pre = model_pre[1:] # 移除CLS token的預測結果

    # 取得實體與區間
    i = 0
    start = 0
    attribute_values = []
    attribute_positions = []
    while i <= question_start - 2:
        if model_pre[i] == 2: # 遇到標記 2 表示一個屬性的開始
            if i > start:
                attribute_values.append(context[offset[start][0]:offset[i - 1][1]]) # 添加屬性的值
                attribute_positions.append([offset[start][0], offset[i - 1][1] - 1]) # 添加屬性的位置
                start = i
            elif i == start:
                pass
            i += 1
        elif model_pre[i] == 1: # 遇到標記 1 表示屬性中間
            i += 1
            if i == start:
                start = i
        elif model_pre[i] == 0: # 遇到標記 0 表示屬性結束
            if i > start:
                attribute_values.append(context[offset[start][0]:offset[i - 1][1]]) # 添加屬性的值
                attribute_positions.append([offset[start][0], offset[i - 1][1] - 1]) # 添加屬性的位置
            i += 1
            start = i
    
    # 將實體與區間結果
    result_text[-1][1][question] = attribute_values
    result_pos[-1][1][question] = attribute_positions


In [30]:
result_text, result_pos, result_confidence

([['【a‵bella浪漫晶飾】方形密碼-深海藍水晶手鍊', {'品牌': ['a‵bella', '浪漫晶飾']}]],
 [['【a‵bella浪漫晶飾】方形密碼-深海藍水晶手鍊', {'品牌': [[1, 7], [8, 11]]}]],
 [['【a‵bella浪漫晶飾】方形密碼-深海藍水晶手鍊',
   {'品牌': [0.9999871253967285,
     0.9990952014923096,
     0.9973575472831726,
     0.0,
     0.0,
     0.0,
     0.0,
     0.9998403787612915,
     0.9588338136672974,
     0.9993359446525574,
     0.9996674060821533,
     0.9995212554931641,
     0.9999836683273315,
     0.9999616146087646,
     0.9999741315841675,
     0.9998793601989746,
     0.9999148845672607,
     0.9999858140945435,
     0.999988317489624,
     0.9999903440475464,
     0.9999902248382568,
     0.9999865293502808,
     0.9999830722808838,
     0.999980092048645,
     0.999985933303833]}]])