In [1]:
# 选择根目录
%cd /home/yxlu/zpwang/InsultingLanguageDetection

/home/yxlu/zpwang/InsultingLanguageDetection


In [2]:
import torch
import torch.nn as nn

from transformers import AutoModel
from torch.nn import functional as F



class ClassificationHead(nn.Module):
    """
    自定义用于分类的模块，输入特征、输出分类
    """

    def __init__(self, hidden_size, classifier_dropout=0.1, num_labels=3):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)  # 全连接层，维度不变
        self.dropout = nn.Dropout(classifier_dropout)  # Dropout，随机失活，防止过拟合的常用模块
        self.out_proj = nn.Linear(hidden_size, num_labels)  # 全连接层，第二维 hidden_size -> num_labels

    def forward(self, features):
        """
        输入特征，自定义前馈方法并返回分类
        运算时注意维度变化 (batch_size, sequence_length, hidden_size) -> (batch_size, num_labels)
        """
        features  # batch_size, sequence_length, hidden_size
        x = features[:, 0, :]  # batch_size, hidden_size  # 取第一个 token <s> 作为句子的编码
        x = self.dropout(x)  # batch_size, hidden_size
        x = self.dense(x)  # batch_size, hidden_size
        x = torch.tanh(x)  # batch_size, hidden_size
        x = self.dropout(x)  # batch_size, hidden_size
        x = self.out_proj(x)  # batch_size, num_labels
        return x  # batch_size, num_labels


class CustomModel(nn.Module):
    """
    自定义模型结构，处理模型输入得到所需输出
    此处以基本的 Encoder-Decoder 架构为例
        句子 -> tokenizer -> 预训练语言模型 -> 自定义分类头 -> 分类结果
    """
    def __init__(self, 
                 encoder_name='bert-base-uncased',
                 ):
        super().__init__()
        self.encoder_name = encoder_name
        
        self.encoder = AutoModel.from_pretrained(encoder_name)
        self.decoder = ClassificationHead(hidden_size=768)  # hidden_size 为对应预训练语言模型的输出维度，bert base为768
        # self.model = AutoModelForSequenceClassification.from_pretrained(config.encoder_name, num_labels=2)
    
    def forward(self, batch_x):
        """
        自定义前馈过程，输入tokenizer分词后的语句，输出各个分类的概率
        """
        feature = self.encoder(**batch_x)
        feature = feature['last_hidden_state']
        feature  # batch_size, sequence_length, hidden_size
        
        logits = self.decoder(feature)  # batch_size, num_labels
        probs = F.softmax(logits, dim=-1)  # batch_size, num_labels
        return probs
    
    def predict(self, batch_x):
        """
        自定义预测过程，输入tokenizer分词后的语句，输出预测结果（属于哪个分类）
        """
        output = self(batch_x)  
        preds = torch.argmax(output, dim=-1)  # 在 probs 的基础上取概率最大的分类
        return preds


encoder_name = 'bert-base-uncased'
model = CustomModel(encoder_name)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from transformers import AutoTokenizer

sentences = [
    'Hello world',
    'I like it',
    'I hate it',
]

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=encoder_name)
sentences_tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
print('分词后的句子', sentences_tokens['input_ids'])

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.02MB/s]


分词后的句子 tensor([[ 101, 7592, 2088,  102,    0],
        [ 101, 1045, 2066, 2009,  102],
        [ 101, 1045, 5223, 2009,  102]])


In [11]:
x = sentences_tokens
probs = model(x)
preds = model.predict(x)
print(f'预测概率\n{probs}\n预测分类\n{preds}')

'''
由于有dropout，输出结果存在随机性，概率与分类可能对不上
'''

预测概率
tensor([[0.2819, 0.4500, 0.2681],
        [0.3406, 0.4056, 0.2537],
        [0.3876, 0.3266, 0.2857]], grad_fn=<SoftmaxBackward0>)
预测分类
tensor([1, 0, 1])


写于 2023.8.11