## **1. Install and import libraries**


In [1]:
!pip install datasets evaluate accelerate
!pip install causal-conv1d>=1.1.0
!pip install mamba-ssm
!pip install pytorch-crf


!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

from IPython.display import clear_output
clear_output()

In [16]:
import os
import random
import json
import torch
import torch.nn as nn
from torchcrf import CRF
from collections import namedtuple
from dataclasses import dataclass, field, asdict
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer, TrainingArguments

Log in to huggingface_hub to push the trained model.

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

## **2. Dataset**


In [4]:
import pandas as pd
from datasets import Dataset

In [None]:
train_path = '/content/data/train.txt'

# The file is read as tab-separated with no header row, so the two columns
# are named explicitly.
df_traindata = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    names=["text", "label"],
)

# Hold out 20% of the rows (fixed seed, so the split is reproducible) and
# expose the held-out part under the key "validation" instead of "test".
dataset = Dataset.from_pandas(df_traindata).train_test_split(train_size=0.8, seed=42)
dataset["validation"] = dataset.pop("test")

dataset

In [6]:
# eval_path = '/content/data/test.txt'
# df_evaldata = pd.read_csv(eval_path, delimiter='\t', names=["text", "label"], header=None)
# datasettest = Dataset.from_pandas(df_evaldata)
# datasettest

In [7]:
# dataset["train"][0]

In [13]:
# Label inventory: three special symbols, the "O" tag, then a B-/I- pair for
# each of the ten entity types. A tag's position in the tuple is its integer id.
VOCAB = (
    '<PAD>', '[CLS]', '[SEP]', 'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME',
)

# Lookup tables in both directions: id -> tag, and tag -> id.
idx2tag = dict(enumerate(VOCAB))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

## **3. Build a Custom Mamba + CRF Model for Token Classification (NER)**


In [9]:
# Modeled on the upstream Mamba config class: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/config_mamba.py
@dataclass
class MambaConfig:
    """Configuration values handed to the Mamba model constructor.

    Every attribute is a real dataclass field, so each one can be passed as a
    keyword argument (for example when unpacking a checkpoint's config dict)
    and each one appears in ``to_dict()`` / ``to_json_string()``.
    """

    d_model: int = 640 # 2560
    d_intermediate: int = 0
    n_layer: int = 8 # 64
    vocab_size: int = 50277 # 50277
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    # pad_vocab_size_multiple: int = 8
    pad_vocab_size_multiple: int = 16
    attn_layer_idx: list = field(default_factory=list)
    attn_cfg: dict = field(default_factory=dict)
    # Previously written without a type annotation, which made it a plain class
    # attribute rather than a field: MambaConfig(tie_embeddings=...) raised
    # TypeError and the value was missing from asdict(). It is declared last so
    # the positional order of the pre-existing fields is unchanged.
    tie_embeddings: bool = True

    def to_json_string(self):
        """Return all fields serialized as a JSON object string."""
        return json.dumps(asdict(self))

    def to_dict(self):
        """Return all fields as a plain ``dict``."""
        return asdict(self)

In [10]:
# Head module used for classification.
class MambaClassificationHead(nn.Module):
    """Linear projection from ``d_model``-sized features to ``num_classes`` scores."""

    def __init__(self, d_model, num_classes, **kwargs):
        super().__init__()
        # A single linear layer does the classification; any extra keyword
        # arguments are forwarded to nn.Linear unchanged.
        self.classification_head = nn.Linear(d_model, num_classes, **kwargs)

    def forward(self, hidden_states):
        """Apply the linear layer to ``hidden_states`` and return the result."""
        logits = self.classification_head(hidden_states)
        return logits

In [14]:
class MambaTextClassification(MambaLMHeadModel):
    """Mamba backbone with a per-position classification head and a CRF layer.

    The ``lm_head`` created by ``MambaLMHeadModel`` is deleted. The backbone
    output is projected to ``len(tag2idx)`` scores per position, and those
    scores are either scored against gold tags or decoded by a ``CRF``.
    """

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__(config, initializer_cfg, device, dtype)

        # One emission score per tag for every position, and a CRF over those
        # scores. The number of classes is the size of the tag vocabulary.
        self.classification_head = MambaClassificationHead(d_model=config.d_model, num_classes=len(tag2idx))
        self.crf = CRF(len(tag2idx), batch_first=True)

        # The language-model head is not used by this model.
        del self.lm_head

    def forward(self, input_ids, tags, mask, is_test=False):
        """Compute the CRF training loss, or decode tags when ``is_test`` is true.

        Args:
            input_ids: Token ids passed to the backbone.
            tags: Gold tag ids handed to the CRF; only read when ``is_test``
                is false.
            mask: Mask handed to the CRF together with the emissions.
            is_test: When false, return the loss; when true, return the
                result of ``self.crf.decode``.

        Returns:
            The negated value returned by the CRF with ``reduction='mean'``
            when ``is_test`` is false; otherwise the output of
            ``self.crf.decode``.
        """
        # Send input_ids through the backbone to obtain the hidden states.
        hidden_states = self.backbone(input_ids)

        # Project every position's hidden state to per-tag emission scores.
        logits_emissions = self.classification_head(hidden_states)

        if is_test:
            # Testing: return the decoded tag sequences.
            return self.crf.decode(logits_emissions, mask)

        # Training: return the loss.
        return -self.crf(logits_emissions, tags, mask, reduction='mean')

    def predict(self, text, tokenizer, id2label=None):
        """Tag a single string and return one label per token.

        The previous implementation called ``forward`` without its required
        ``tags``/``mask`` arguments and then read a ``.logits`` attribute that
        ``forward`` never returns, so it always raised. This version decodes
        through the CRF instead.

        Args:
            text: The string to tag.
            tokenizer: Callable whose result exposes the token ids under
                ``['input_ids']``.
            id2label: Optional mapping from tag id to label. When given, the
                decoded ids are translated through it.

        Returns:
            A list with one entry per token: labels from ``id2label`` when it
            is provided, otherwise the raw tag ids. An empty list when the
            tokenizer produces no tokens.
        """
        token_ids = tokenizer(text)['input_ids']
        if not token_ids:
            # Nothing to decode; a zero-length sequence is not passed to the CRF.
            return []

        # Build the inputs on whatever device the model currently lives on,
        # rather than relying on a notebook-level global.
        model_device = next(self.parameters()).device
        input_ids = torch.tensor(token_ids, device=model_device)[None]
        # Single unpadded sequence: every position is valid. An explicit
        # boolean mask is passed rather than relying on the CRF's default.
        mask = torch.ones_like(input_ids, dtype=torch.bool)

        with torch.no_grad():
            tag_ids = self.forward(input_ids, None, mask, is_test=True)[0]

        if id2label is not None:
            return [id2label[tag_id] for tag_id in tag_ids]
        return tag_ids

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        """Build the model from a pretrained checkpoint's config and weights.

        Weights are loaded with ``strict=False``; the model's state-dict keys
        that are absent from the checkpoint are printed.
        """
        # Load the configuration of the pretrained model.
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)

        # Instantiate the model from the config on the requested device and dtype.
        model = cls(config, device=device, dtype=dtype, **kwargs)

        # Load the pretrained weights; keys that do not match are tolerated.
        model_state_dict = load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype)
        model.load_state_dict(model_state_dict, strict=False)

        # Report the parameters that were not present in the checkpoint.
        print("Newly initialized embedding:", set(model.state_dict().keys()) - set(model_state_dict.keys()))
        return model

In [17]:
# Build the model from the pretrained "state-spaces/mamba-130m" checkpoint and
# move it to the selected device (Module.to returns the module itself).
model = MambaTextClassification.from_pretrained("state-spaces/mamba-130m").to(device)

# Load the tokenizer of the gpt-neox-20b model for use with the Mamba model.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

pytorch_model.bin:   0%|          | 0.00/517M [00:00<?, ?B/s]

Newly initialized embedding: {'crf.start_transitions', 'classification_head.classification_head.weight', 'crf.transitions', 'classification_head.classification_head.bias', 'crf.end_transitions'}


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Parameters whose ``requires_grad`` flag is false are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def format_parameters(num_params):
    """Format ``num_params`` with commas as thousands separators (1234567 -> "1,234,567")."""
    return format(num_params, ",")

In [None]:
# Show the model's repr as the cell output to inspect its module structure.
model

In [None]:
# Count the model's trainable parameters and print the total with thousands separators.
Mamba_CRF_params = count_parameters(model)
print(f"Mamba + CRF parameters: {format_parameters(Mamba_CRF_params)}")