# Model classification

![](./figs/03_attention_mask.jpg)

![](./figs/03_model_example.jpg)

## model head

![](./figs/03_model_head.jpg)

![](./figs/03_model_head_examples.jpg)

# Model loading 

In [62]:
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
# Hub checkpoint id; the config, tokenizer and weights below are all loaded from it.
checkpoint = "hfl/rbt3"
config = AutoConfig.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# AutoModel gives the bare encoder (no task head); the config loaded above is passed explicitly.
model = AutoModel.from_pretrained(checkpoint, config=config)
# Record the library version the outputs in this notebook were produced with.
print(transformers.__version__)

4.35.2


In [2]:
# Display the configuration that was loaded for the checkpoint.
config

BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

# model call

In [14]:
# Sample sentence used for all forward passes in this section.
sen = "今天天气不错!"
# return_tensors="pt" yields PyTorch tensors with a leading batch dimension of 1 (see output).
inputs = tokenizer(sen, return_tensors="pt")
inputs

{'input_ids': tensor([[ 101,  791, 1921, 1921, 3698,  679, 7231,  106,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## without head

In [36]:
# Reload the bare encoder with default settings and run one forward pass on the sample.
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)

In [37]:
# Fields returned by the headless model with default settings.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [38]:
# Attention weights are not returned unless this flag is enabled.
config.output_attentions

False

In [54]:
# Override the config at load time so the forward pass also returns attention weights.
model = AutoModel.from_pretrained(checkpoint, output_attentions=True)
outputs = model(**inputs)

In [44]:
# The output now carries an additional 'attentions' field.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'attentions'])

In [40]:
# First entry of the attentions sequence; shape is (batch_size, num_heads, seq_len, seq_len).
outputs['attentions'][0].shape

torch.Size([1, 12, 9, 9])

In [41]:
# Matches the second dimension (12) of the attention tensor shown above.
config.num_attention_heads

12

In [42]:
# Per-token encodings produced by the encoder.
# Shape is (batch_size, seq_len, hidden_size).
outputs.last_hidden_state.shape

torch.Size([1, 9, 768])

In [43]:
# Sequence length of the tokenized sample, special tokens included;
# it equals the seq_len dimension of the tensors above.
len(inputs['input_ids'][0])

9

## with head

In [45]:
from transformers import AutoModelForSequenceClassification
# Same checkpoint, now wrapped with a sequence-classification head. The head's weights are not
# in the checkpoint, which is what the "newly initialized" warning below reports.
cls_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# NOTE: this rebinds `outputs`, which previously held the headless encoder's result.
outputs = cls_model(**inputs)

In [50]:
# No num_labels was passed when loading, so the head has the default number of labels.
cls_model.num_labels

2

`from transformers import BertForSequenceClassification`

```
# Excerpt of the library class: a BERT encoder followed by dropout and one linear
# layer that maps hidden_size features to num_labels logits.
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # Number of output classes; taken from the config.
        self.num_labels = config.num_labels
        self.config = config

        # Encoder backbone.
        self.bert = BertModel(config)
        # Use the dedicated classifier dropout when set, otherwise fall back to the
        # encoder's hidden dropout probability.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # The classification head itself: hidden_size -> num_labels.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()
```

In [51]:
# Override the label count: the head is created with 10 outputs, so the logits
# below have shape [1, 10]. No labels are passed, hence loss=None in the output.
cls_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=10)
cls_model(**inputs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=None, logits=tensor([[-0.1566, -0.1988, -0.5851,  0.1531,  0.9520, -0.3869, -0.1103,  0.2260,
         -0.4038, -0.5205]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

```
# Excerpt of BertForSequenceClassification.forward: encode, pool, classify and,
# when labels are given, compute a loss whose type depends on config.problem_type.
def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the config default when the caller does not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the encoder; every argument except `labels` is forwarded unchanged.
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second element of the encoder output (the pooled output).
        # [batch, 768] 768 = hidden_size
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        # [batch, num_labels]
        logits = self.classifier(pooled_output)

        # The loss is only computed when labels are supplied.
        loss = None
        if labels is not None:
            # Infer the problem type from num_labels and the label dtype the first
            # time it is needed, and store it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        # Tuple output: logits followed by the remaining encoder outputs, with the
        # loss prepended only when it was computed.
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
```

In [58]:
# Reproduce the classification head by hand: run the headless encoder, take its
# pooled output and feed it through the classifier layer of `cls_model`.
#
# The encoder is called again here on purpose. `outputs` was last assigned from
# `cls_model(**inputs)`, so indexing it with [1] would not give the pooled output
# when the notebook is run top to bottom.
encoder_outputs = model(**inputs)
# pooler_output is the second field of the headless output; shape [batch, hidden_size].
pooled_output = encoder_outputs.pooler_output
# logits shape: [batch, num_labels]
cls_model.classifier(pooled_output).shape

torch.Size([1, 10])

# Fine-tuning: sentiment analysis
## step 1: dataset

In [1]:
# !head -n 4 data/ChnSentiCorp_htl_all.csv
# warning: "The current process just got forked...." -- see
#  https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning/72926996#72926996
import pandas as pd
# Read the raw CSV and drop incomplete rows in one chained expression.
data = pd.read_csv("data/ChnSentiCorp_htl_all.csv").dropna()
data.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [2]:
from torch.utils.data import Dataset
class ChnSentiCorp(Dataset):
    """Map-style dataset of (review text, label) pairs read from a CSV file.

    The CSV is expected to provide `review` and `label` columns; rows containing
    any missing value are dropped when the file is loaded.

    Args:
        path: Location of the CSV file. Defaults to the path used throughout
            this notebook, so `ChnSentiCorp()` keeps working unchanged.
    """

    def __init__(self, path="data/ChnSentiCorp_htl_all.csv") -> None:
        super().__init__()
        self.data = pd.read_csv(path)
        self.data = self.data.dropna()

    def __len__(self):
        """Return the number of rows left after dropping incomplete ones."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the `(text, label)` pair stored at positional index `idx`."""
        # Positional lookup: dropna() leaves gaps in the index labels, so use iloc.
        row = self.data.iloc[idx]
        label = row.label
        text = row.review
        return text, label

In [3]:
from torch.utils.data import  random_split
import torch
# Seed the global RNG before the random split below.
torch.manual_seed(42)
dataset = ChnSentiCorp()
# Fractional lengths: 80% of the samples for training, 20% for validation.
trainset, valset = random_split(dataset, lengths=[0.8, 0.2])
# Peek at a few (text, label) samples from the training split.
for i in range(5):
    print(trainset[i])

('房间很好，符合五星级的标准。酒店离沟口及边边街都比较近，是去九寨沟旅游的最佳选择。补充点评2008年7月21日：地震对该酒店没有任何影响。宾馆反馈2008年7月24日：谢谢您的点评,欢迎下次下榻我酒店!我们将为您提供更周到的服务及优质的产品.', 1)
('性价比还不错。。。算是当地最好的酒店了如果是第一次去的话，可能会发现不太好找', 1)
('打开评价页面就很不一样，很少有酒店能这么认真的作评价回复，这一点就很不错了。我们是A9上海至湖州通车那天来湖州逛逛的，定了太湖豪华房，应该算是不错，建筑比较吸引人，不过那天风大雨大就一直在房间里，可能观念的问题吧，就五星标准来说，房间足够大，浴室也做的很漂亮，但却少许多东西，感觉空落落的，床上用品也只有简单的两个枕头、一条薄被，看上去就好冷啊~应该会比较适合夏季度假吧：）由于临时有事，没有过夜就提前退房了，前台mm还很遗憾的说，下次一定要来住哦，听了蛮窝心的，我想应该会在暖和些的日子再来的：）也希望再次入住前酒店能做更好的改进，哈哈补充点评2008年1月17日：关键没有国际卫视，这一点作为五星标准比较令人不能接受：S', 1)
('5月14日入住了这家酒店，服务确实很好，前台会亲自带领客人到房间，并介绍房间设施，得知我自带电脑，立刻把网线从房间内电脑上拆下，放在桌旁待用。晚上六点后会送果盘和小礼物，那天是香蕉和小西红柿，两个毛绒公仔。房间相较其他济南同价位的酒店显得新，很干净，位置也很好！推荐！', 1)
('位置是非常好，就在省政府对面，离步行街也很近，但房间内的条件太一般，达不到四星。我住的是398元的，还没包早餐，最多也就值200元，不过酒店能到机场接客确实很不错。如果住一晚也值了，如果住三晚以上，还是选择别的实惠一点的酒店吧。现在我才发现，通过携程网订房价格很没优势的，都是提高了价格后再折扣，折后价格比不上其他的普通酒店，以后还是要自己直接跟别的酒店联系，省了中间环节，价格会低很多，并且条件还会更好。', 1)


## step 2: dataloader

In [4]:
from torch.utils.data import DataLoader
# Loaders with the default collation; the batch shown below still contains raw strings.
# They are re-created further down once a tokenizing collate_fn is defined.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=False)


In [5]:
# Fetch the first batch produced by the training loader.
next(iter(trainloader))

[('酒店很不错，性价比很好的。虽然在洛阳繁华的路段，但晚上休息非常安静，睡的很饱。同时去龙门也比较方便，有公交车，乘坐出租车去龙门24元左右就可以到达的。',
  '听当地公安机关说其士酒店服务人员不是特别厚道，据说已经出现过案件。建议换，信阳的阳光宾馆就不错，四星，建议重新考察。',
  '洲际酒店，携程的价格确实有竞争力。应该是沈阳最便宜的五星级酒店。1）地理位置：位于商业街太原街附近，交通方便，但门口老是堵车，建议打车的话在酒店对面下，然后过马路。2）房间：很小，但可以看到街景的不错。3)健身中心居然不给拍照，其他同类酒店没此规定4）礼宾部门口服务不好，可能因为沈阳冬天太冷，人都不出来迎接客人。5）最糟糕的一点是：叫礼宾部打辆出租车去机场，上车前还问酒店的人是否打表收费，确认后上车，结果到了机场竟开口喊价，50元的车费司机喊100！！当时因赶飞机的原因没有追究，事后投诉洲际，到现在仍未解决。相当失望！',
  '环境很好，算得上度假胜地，带露台、电脑的房间很让人愉悦。服务真的很一般很一般，办入住手续的时候居然可以连钥匙牌都给错。我们订的是一间普通套房和一间景观标间，在标间的订单中还特别注明要带景观的（这好像是废话？结果事实证明一点都不废），总台给的房间是一间带露台的套房和一间不带露台的套房。所谓套房也就是多一个麻将室，因我们之前查到酒店自己的网站上写着“景观房有露天阳台”，我们以为朝北的客房没有露台所以做成了麻将室，也就没在意。然后晚上准备休息的时候，大堂副理过来敲门，说给错钥匙了，给的两间都是套房，要换一间标房。换给我们的是朝北不带露台的房间，还说这个区域的客房统称景观房，不是一定带露台的。我们请携程交涉，才换到一间朝南的。这里的服务人员态度都好奇怪，餐厅没有人领位，清扫工自顾自抢在客人前面出门，早上8点多服务员敲门问可不可以整理房间（双休日唉，请勿打扰灯亮着没看见？），见到客人微笑问好是不要指望的。选这个宾馆一个原因是冲着锦江的牌子，相信管理、服务会比较到位吧，结果很失望，硬件硬，软件软。',
  '帮同事订过很多次，酒店位置很好，很方便。房间也不错。',
  '本次入住提供的房间较为满意，服务耐心，特别是饭菜比较可口，牛排是比较专业水准的，有圣诞气氛，不错。',
  '差劲，并没有特殊的，我订了2天的房，到第三天续房要换房，结果忘了拿抽屉里的DV机，第

### Batched collate function to improve the efficiency of tokenization

In [10]:
from transformers import AutoTokenizer
# Tokenizer for the same checkpoint; collate_fn below reads it as a global.
checkpoint = "hfl/rbt3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collate_fn(batch):
    """Tokenize a list of (text, label) samples as one batch.

    All texts are tokenized in a single call, truncated and padded to exactly
    128 tokens, and the labels are attached under the 'labels' key so the result
    can be passed straight to the model with `model(**batch)`.
    """
    texts, targets = zip(*batch)
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    encoded['labels'] = torch.tensor(targets)
    return encoded

In [11]:
# Re-create the loaders with collate_fn so every batch arrives tokenized, as a dict of tensors.
# Only the training loader shuffles.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valloader = DataLoader(valset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [12]:
# Fetch the first tokenized batch to inspect its keys and tensors.
next(iter(trainloader))

{'input_ids': tensor([[ 101,  868,  711,  ...,    0,    0,    0],
        [ 101, 1310, 4495,  ...,    0,    0,    0],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        ...,
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101, 3173, 1290,  ...,    0,    0,    0],
        [ 101, 2769, 3221,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 1, 1, 1])}

## step 3: model and optimizer

In [14]:

from transformers import AutoModelForSequenceClassification
checkpoint = "hfl/rbt3"
# No num_labels is passed, so the classification head keeps the default label count.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Move the model to the GPU when one is available; the training and evaluation
# loops move each batch with the same check.
if torch.cuda.is_available():
    model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from torch.optim import Adam
# Adam over all model parameters (encoder and classification head) with lr 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## step 4: trainer

In [16]:
def evaluate():
    """Compute the accuracy of the global `model` on the validation split.

    Iterates over the global `valloader`, moves each batch to the GPU when CUDA
    is available, and compares the argmax of the logits with the batch's
    'labels' entry.

    Returns:
        Number of correct predictions divided by `len(valset)` (a 0-dim tensor
        when the loader yields at least one batch).

    Note:
        The model is switched to eval mode and left there. A caller that keeps
        training afterwards must call `model.train()` again.
    """
    model.eval()
    correct = 0
    # inference_mode already disables gradient tracking, so the forward pass
    # needs no additional torch.no_grad() wrapper.
    with torch.inference_mode():
        for batch in valloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            correct += torch.sum(preds == batch['labels'])
        acc = correct / len(valset)
        return acc


def train(num_epoch=1, log_step=100):
    """Fine-tune the global `model` and report validation accuracy once per epoch.

    Uses the global `trainloader`, `optimizer` and `evaluate`. Each batch is
    moved to the GPU when CUDA is available.

    Args:
        num_epoch: Number of full passes over `trainloader`.
        log_step: Print the training loss every `log_step` optimizer steps.
    """
    global_step = 0
    for ep in range(num_epoch):
        # evaluate() leaves the model in eval mode, so switch back to training
        # mode (dropout active) at the start of every epoch.
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f"global_step: {global_step:6d}, loss: {loss.item():.4f}")
        # Evaluate once per epoch, after the last batch. Evaluating inside the
        # batch loop would scan the whole validation set on every step and
        # leave the model in eval mode for the remaining steps of the epoch.
        acc = evaluate()
        print(f"epoch: {ep}, acc: {acc:.4f}")
            


In [17]:
# Train for three epochs, matching the recorded log below (epochs 0, 1 and 2).
train(num_epoch=3)

global_step:    100, loss: 0.3555
epoch: 0, acc: 0.8957
global_step:    200, loss: 0.1541
global_step:    300, loss: 0.1175
epoch: 1, acc: 0.8957
global_step:    400, loss: 0.1589
global_step:    500, loss: 0.2019
epoch: 2, acc: 0.8925


## Step 5: Inference

In [20]:
sentence = "我觉得这家店的菜很好吃"
# Map class indices to readable label names.
id2label = {0: 'negative', 1: 'positive'}
# Put the model in eval mode explicitly so dropout is disabled for prediction,
# whatever mode the previous cell left it in.
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sentence, return_tensors="pt")
    # Keep the inputs on the same device as the model.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    # Index of the highest-scoring class for the single input sentence.
    preds = torch.argmax(logits, dim=-1)
    print('input:', sentence, '\n', '模型结果:', id2label[preds.item()])

input: 我觉得这家店的菜很好吃 
 模型结果: positive


In [22]:
from transformers import pipeline
# Attach the label names to the model config so the pipeline reports them
# instead of generic label ids.
model.config.id2label = id2label
# Use the first GPU when CUDA is available and fall back to the CPU (-1) otherwise,
# consistent with the torch.cuda.is_available() checks used for the model and batches.
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
pipe(sentence)

[{'label': 'positive', 'score': 0.9922996163368225}]