## **1. Install and import libraries**


In [1]:
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

import os
import json
import torch
import random
import evaluate
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn import metrics
from datasets import Dataset
from transformers import Trainer
from datasets import load_dataset
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from transformers import AutoTokenizer, TrainingArguments
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf


from IPython.display import clear_output
# Wipe whatever this cell has printed so far (shell commands and imports above).
clear_output()

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

## **2. Download dataset**


In [3]:
# BIO tag inventory: 'O' first, then a B-/I- pair for every entity type.
# The position of a tag in this list is its integer class id.
_entity_types = ['BODY', 'SYMP', 'INST', 'EXAM', 'CHEM', 'DISE', 'DRUG', 'SUPP', 'TREAT', 'TIME']
_tag_names = ['O'] + [
    f'{prefix}-{entity}'
    for entity in _entity_types
    for prefix in ('B', 'I')
]

# Tag name -> class id.
label2id = {tag: idx for idx, tag in enumerate(_tag_names)}

# Class id -> tag name (exact inverse of label2id).
id2label = {idx: tag for tag, idx in label2id.items()}

In [None]:
# Evaluation file: tab-separated, no header row, read as (text, tag) columns.
eval_path = '../data/test.txt'
df_evaldata = pd.read_csv(
    eval_path,
    delimiter='\t',
    names=["text", "pre_label"],
    header=None,
    dtype=str,              # keep digit-only tokens as text rather than numbers
    keep_default_na=False,  # tokens such as "NA" or "null" are data, not missing values
    quoting=3,              # csv.QUOTE_NONE: a literal '"' token must not open a quoted field
)
# Integer class id for every tag; tags missing from label2id map to NaN.
df_evaldata['label'] = df_evaldata['pre_label'].map(label2id)

datasettest = Dataset.from_pandas(df_evaldata[['text', 'label']])
datasettest

In [5]:
# Modelled on the config class that Mamba itself uses; reference:
# https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/config_mamba.py
@dataclass
class MambaConfig:
    """Plain container for the hyper-parameters handed to the Mamba model.

    Field order and defaults are part of the interface: instances are built
    both positionally and from keyword dictionaries.
    """

    d_model: int = 2560
    n_layer: int = 64
    vocab_size: int = 50277
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    # An earlier revision of this notebook used 8 here.
    pad_vocab_size_multiple: int = 16
    tie_embeddings: bool = True

    def to_dict(self):
        """Return every field as a regular dictionary."""
        return asdict(self)

    def to_json_string(self):
        """Return the same field dictionary serialised as a JSON string."""
        return json.dumps(self.to_dict())

In [6]:
class MambaTextClassification(MambaLMHeadModel):
    """Mamba backbone with a linear classification head instead of the LM head.

    The backbone output is averaged over dim 1 and projected to
    ``len(label2id)`` logits by ``MambaClassificationHead``.
    """

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
    ) -> None:
        """Build the backbone via the parent class, then swap the LM head for a classifier.

        Args:
            config: model hyper-parameters; ``config.d_model`` sizes the head input.
            initializer_cfg: forwarded unchanged to the parent constructor.
            device: device for the parent model and the classification head.
            dtype: dtype for the parent model and the classification head.
        """
        super().__init__(config, initializer_cfg, device, dtype)

        # Classifier with input size d_model and one output per entry of label2id.
        # device/dtype are forwarded so the head matches the backbone when the
        # model is created on an explicit device or in reduced precision.
        self.classification_head = MambaClassificationHead(
            d_model=config.d_model,
            num_classes=len(label2id),
            device=device,
            dtype=dtype,
        )

        # The language-modelling head is not used for classification.
        del self.lm_head

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: accepted for API compatibility but not used; the
                mean below therefore covers every position of ``input_ids``.
            labels: optional target class ids.

        Returns:
            A namedtuple ``ClassificationOutput`` with field ``logits`` when
            ``labels`` is None, otherwise with fields ``loss`` and ``logits``.
        """
        # Run the ids through the backbone to obtain the hidden states.
        hidden_states = self.backbone(input_ids)

        # Average over dim 1 to get a single [CLS]-like feature per input
        # (assumes hidden_states is (batch, seq, d_model) — TODO confirm).
        mean_hidden_states = hidden_states.mean(dim=1)

        # Project the pooled feature to class logits.
        logits = self.classification_head(mean_hidden_states)

        if labels is None:
            ClassificationOutput = namedtuple("ClassificationOutput", ["logits"])
            return ClassificationOutput(logits=logits)

        ClassificationOutput = namedtuple("ClassificationOutput", ["loss", "logits"])

        # Cross-entropy between the logits and the target class ids.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)

        return ClassificationOutput(loss=loss, logits=logits)

    def predict(self, text, tokenizer, id2label=None):
        """Classify a single text.

        Args:
            text: input passed to ``tokenizer``.
            tokenizer: callable returning a mapping with an ``'input_ids'`` entry.
            id2label: optional mapping from class id to label name.

        Returns:
            ``id2label[class_id]`` when ``id2label`` is given, otherwise the
            class id itself (a NumPy integer).
        """
        # Use the device the weights actually live on instead of relying on a
        # notebook-level `device` global that may not match the model.
        model_device = next(self.parameters()).device
        input_ids = torch.tensor(
            tokenizer(text)['input_ids'], dtype=torch.long, device=model_device
        )[None]  # add the batch dimension

        with torch.no_grad():
            logits = self.forward(input_ids).logits[0]
            # Upcast first: NumPy cannot represent bfloat16 tensors.
            label = np.argmax(logits.float().cpu().numpy())

        if id2label is not None:
            return id2label[label]
        return label

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        """Create a model from a pretrained checkpoint name.

        Args:
            pretrained_model_name: identifier passed to ``load_config_hf`` and
                ``load_state_dict_hf``.
            device: device for the model and the loaded state dict.
            dtype: dtype for the model and the loaded state dict.
            **kwargs: extra arguments forwarded to the constructor.

        Returns:
            The model with the checkpoint weights loaded non-strictly.
        """
        # Load the configuration stored with the pretrained model.
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)

        # Instantiate the model on the requested device and dtype.
        model = cls(config, device=device, dtype=dtype, **kwargs)

        # Load the pretrained weights; strict=False tolerates keys that exist on
        # only one side (e.g. the removed lm_head or a fresh classification head).
        model_state_dict = load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
        model.load_state_dict(model_state_dict, strict=False)

        # Report the parameters that were not found in the checkpoint.
        print("Newly initialized embedding:", set(model.state_dict().keys()) - set(model_state_dict.keys()))
        return model

In [7]:
# Classification head placed on top of the Mamba backbone.
class MambaClassificationHead(nn.Module):
    """Single linear layer mapping a ``d_model``-sized feature to class logits."""

    def __init__(self, d_model, num_classes, **kwargs):
        """Create the projection.

        Args:
            d_model: size of the incoming feature vector.
            num_classes: number of output classes.
            **kwargs: passed through to ``nn.Linear``.
        """
        super().__init__()
        # The attribute name is part of the checkpoint key layout; keep it.
        projection = nn.Linear(d_model, num_classes, **kwargs)
        self.classification_head = projection

    def forward(self, hidden_states):
        """Return the logits for ``hidden_states``."""
        logits = self.classification_head(hidden_states)
        return logits

### NER_tokenizer_llama-traditional-chinese-120M_Data_HealthNER_Epcoh10

In [None]:
# Hub repository providing both the fine-tuned weights and the tokenizer.
model_id = "UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_Data_HealthNER_Epcoh10"

model = MambaTextClassification.from_pretrained(model_id)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

### NER_tokenizer_llama-traditional-chinese-120M_Data_HealthNER

In [None]:
# model = MambaTextClassification.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_Data_HealthNER")
# model.to(device)

# tokenizer = AutoTokenizer.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_Data_HealthNER")
# tokenizer.pad_token_id = tokenizer.eos_token_id

### NER_tokenizer_llama-traditional-chinese-120M_DataTest_ROCLING22_CHNER_truth

In [None]:
# model = MambaTextClassification.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_DataTest_ROCLING22_CHNER_truth")
# model.to(device)

# tokenizer = AutoTokenizer.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_DataTest_ROCLING22_CHNER_truth")
# tokenizer.pad_token_id = tokenizer.eos_token_id

### mamba_text_classification_NER_RunEpoch0.7

In [None]:
# model = MambaTextClassification.from_pretrained("UJForSchool/mamba_text_classification_NER_RunEpoch0.7")
# model.to(device)

# tokenizer = AutoTokenizer.from_pretrained("UJForSchool/mamba_text_classification_NER_RunEpoch0.7")
# tokenizer.pad_token_id = tokenizer.eos_token_id

### NER_tokenizer_llama-traditional-chinese-120M_Full_train

In [None]:
# model = MambaTextClassification.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_Full_train")
# model.to(device)

# tokenizer = AutoTokenizer.from_pretrained("UJForSchool/NER_tokenizer_llama-traditional-chinese-120M_Full_train")
# tokenizer.pad_token_id = tokenizer.eos_token_id

### mamba_text_classification_NER_RunEpoch0.7

In [None]:
# model = MambaTextClassification.from_pretrained("UJForSchool/mamba_text_classification_NER_RunEpoch0.7")
# model.to(device)

# tokenizer = AutoTokenizer.from_pretrained("UJForSchool/mamba_text_classification_NER_RunEpoch0.7")
# tokenizer.pad_token_id = tokenizer.eos_token_id

In [9]:
# Take the first evaluation row and classify its text.
first_row = datasettest[0]
text_0, label_0 = first_row['text'], first_row['label']
response = model.predict(text_0, tokenizer, id2label)

In [None]:
# Show the sample text next to its ground-truth tag (GT) and the predicted tag.
print(f'Classify: {text_0}\nGT: {id2label[label_0]}\nPredict: {response}')

In [None]:
# Tag names in class-id order, taken from the mapping itself so the report can
# never drift from label2id (dicts preserve insertion order).
labels = list(label2id)

text = datasettest['text']

# classification_report expects (y_true, y_pred): the tags read from the
# evaluation file are the ground truth, the model outputs are the predictions.
y_true = df_evaldata['pre_label']
y_pred = [model.predict(sample, tokenizer, id2label) for sample in text]

print(metrics.classification_report(y_true, y_pred, labels=labels, digits=3))