In [4]:
import numpy as np
import random
import torch
import matplotlib.pyplot as plt
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

In [5]:
SEED = 123
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-2
EPSILON = 1e-8

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f813410f5d0>

In [6]:
# 读取文件，返回文件内容
def readfile(filename):
    with open(filename, encoding="utf-8") as f:
        # 按行进行读取
        content = f.readlines()
        return content

In [7]:
# 正负情感语料
pos_text, neg_text = readfile('./pos.txt'), readfile('./neg.txt')

In [8]:
# 所有语料
sentences = pos_text + neg_text
print(len(pos_text)) # 5000个正样本
print(len(neg_text)) # 5000个负样本
print(len(sentences)) # 一共1万样本

5000
5000
10000


In [9]:
# 设定标签，positive为1，negative为0
pos_targets = np.ones((len(pos_text)))
neg_targets = np.zeros((len(neg_text)))
# 情感label 拼接到一起，shape = (10000, 1)
targets = np.concatenate((pos_targets, neg_targets), axis=0).reshape(-1, 1)   
targets.shape


(10000, 1)

In [10]:
# 转换为tensor
total_targets = torch.tensor(targets)
total_targets.shape


torch.Size([10000, 1])

In [11]:
# 从预训练模型中加载bert-base-chinese
# [UNK] 特征  [CLS]起始 [SEP]结束
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir="/root/bert/transformer_file/")
tokenizer


Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [12]:
print(pos_text[2])
# 进行分词
print(tokenizer.tokenize(pos_text[2]))
# bert编码，会增加起始[CLS] 和 结束[SEP]标记
print(tokenizer.encode(pos_text[2]))
# 将bert编码转换为 字
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(pos_text[2])))


不错，下次还考虑入住。交通也方便，在餐厅吃的也不错。

['不', '错', '，', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', '，', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。']
[101, 679, 7231, 8024, 678, 3613, 6820, 5440, 5991, 1057, 857, 511, 769, 6858, 738, 3175, 912, 8024, 1762, 7623, 1324, 1391, 4638, 738, 679, 7231, 511, 102]
['[CLS]', '不', '错', '，', '下', '次', '还', '考', '虑', '入', '住', '。', '交', '通', '也', '方', '便', '，', '在', '餐', '厅', '吃', '的', '也', '不', '错', '。', '[SEP]']


In [13]:
# 在的编码为1762，开始[CLS]编码为101，结束[SEP]编码为102
tokenizer.encode('在')

[101, 1762, 102]

In [14]:
#将每一句转成数字（大于126做截断，小于126做PADDING，加上首尾两个标识，长度总共等于128）
def convert_text_to_token(tokenizer, sentence, limit_size=126):
    tokens = tokenizer.encode(sentence[:limit_size])  #直接截断
    #补齐（pad的索引号就是0）
    if len(tokens) < limit_size + 2:                  
        tokens.extend([0] * (limit_size + 2 - len(tokens)))
    return tokens


In [15]:
# 对每个句子进行编码
input_ids = [convert_text_to_token(tokenizer, x) for x in sentences]
# 放到tensor中
input_tokens = torch.tensor(input_ids)
print(input_tokens.shape) #torch.Size([10000, 128])


torch.Size([10000, 128])


In [16]:
input_tokens[1]


tensor([  101,  1765,  4415,   855,  5390,   679,  7231,  8024,  7317,   704,
         1357,  7474,   511,  2791,  7313,  3683,  6772,  2397,  1112,  8024,
         2357,  2229,  1394,  4415,   511,   852,  3221,  7392,  7509,  3126,
         3362,  1922,  2345,   749,  8024,  3300,   857,  5042,  3211,  2145,
         3404,  4638,  2697,  6230,   511,   707,  3717,  4638,  2791,  7313,
         7599,  3250,   679,  7231,  8024,  2523,  5653,  6844,   511,  3517,
         6887,  4638,  1765,  3691,  3683,  6772,  5552,  8024,  6656,  1071,
          800,  6163,   934,   679,  1469,  6455,   511,  6133,  1041,  4157,
         6397,  8182,  2399,   128,  3299,  8149,  3189,  8038,  3315,   782,
         3221,   128,  3299,  8123,  3189,  1057,   857,  6421,  6983,  2421,
         1920,  2414,  2791,  8024,   817,  3419,   711, 11929,  1039,   511,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [17]:
# 建立mask
def attention_masks(input_ids):
    atten_masks = []
    for seq in input_ids:
        # 如果有编码（>0）即为1, pad为0
        seq_mask = [float(x>0) for x in seq]
        atten_masks.append(seq_mask)
    return atten_masks

In [18]:
# 生成attention_masks
atten_masks = attention_masks(input_ids)
# 将atten_masks放到tensor中
attention_tokens = torch.tensor(atten_masks)
print(attention_tokens)
print(attention_tokens.size())


tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.]])
torch.Size([10000, 128])


In [19]:
print('input_tokens:\n', input_tokens) # shape=[10000, 128]
print('total_targets:\n', total_targets) # shape=[10000, 1]
print('attention_tokens:\n', attention_tokens) # shape=[10000, 128]
print('input_tokens:\n', input_tokens) # shape=[10000, 128]
print(input_tokens.shape)


input_tokens:
 tensor([[ 101, 6983, 2421,  ..., 3119, 7178,  102],
        [ 101, 1765, 4415,  ...,    0,    0,    0],
        [ 101,  679, 7231,  ...,    0,    0,    0],
        ...,
        [ 101, 2769, 2697,  ...,    0,    0,    0],
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101, 5439,  782,  ...,  102,    0,    0]])
total_targets:
 tensor([[1.],
        [1.],
        [1.],
        ...,
        [0.],
        [0.],
        [0.]], dtype=torch.float64)
attention_tokens:
 tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 1., 0., 0.]])
input_tokens:
 tensor([[ 101, 6983, 2421,  ..., 3119, 7178,  102],
        [ 101, 1765, 4415,  ...,    0,    0,    0],
        [ 101,  679, 7231,  ...,    0,    0,    0],
        ...,
        [ 101, 2769, 2697,  ...,    0,    0,    0],
        [ 101, 

In [20]:
from sklearn.model_selection import train_test_split
# 使用random_state固定切分方式，切分 train_inputs, train_labels, train_masks,
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets, random_state=2021, test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens, random_state=666, test_size=0.2)
print(train_inputs.shape, test_inputs.shape)    #torch.Size([8000, 128]) torch.Size([2000, 128])
print(train_masks.shape, test_masks.shape)      #torch.Size([8000, 128])和train_inputs形状一样

print(train_inputs[0])
print(train_masks[0])

torch.Size([8000, 128]) torch.Size([2000, 128])
torch.Size([8000, 128]) torch.Size([2000, 128])
tensor([ 101, 6983, 2421, 1922, 5439,  749, 8024, 6392, 3177, 6963, 3191,  749,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,

In [21]:
# 使用TensorDataset对tensor进行打包
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# 无放回地随机采样样本元素
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)



In [22]:
# 查看dataloader内容
for i, (train, mask, label) in enumerate(train_dataloader):
    #torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])
    print(train)
    print(mask)
    print(label)
    print(train.shape, mask.shape, label.shape)       
    break
print('len(train_dataloader)=', len(train_dataloader)) #500


tensor([[ 101, 1765, 1770,  ...,    0,    0,    0],
        [ 101, 3302, 1218,  ...,    0,    0,    0],
        [ 101, 1728,  711,  ...,    0,    0,    0],
        ...,
        [ 101,  855, 5390,  ...,    0,    0,    0],
        [ 101, 2345, 8013,  ...,  100,  102,    0],
        [ 101, 2792, 3300,  ...,    0,    0,    0]])
tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])
tensor([[1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.]], dtype=torch.float64)
torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])
len(train_dataloader)= 500


In [23]:

# 加载预训练模型， num_labels表示2个分类，好评和差评
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels = 2)
# 使用GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
# 定义优化器 AdamW， eps默认就为1e-8（增加分母的数值，用来提高数值稳定性）
#optimizer = AdamW(model.parameters(), lr = LEARNING_RATE, eps = EPSILON)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': WEIGHT_DECAY},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr = LEARNING_RATE, eps = EPSILON)
"""
from torch import optim
# 定义优化器
#optimizer = optim.Adam(model.parameters(), lr=1e-3)
optimizer = optim.Adam(model.parameters())
"""

'\nfrom torch import optim\n# 定义优化器\n#optimizer = optim.Adam(model.parameters(), lr=1e-3)\noptimizer = optim.Adam(model.parameters())\n'

In [25]:
epochs = 2
# training steps 的数量: [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# 设计 learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)


# # 模型训练、评估

In [26]:
# 二分类结果评估
def binary_acc(preds, labels):      #preds.shape=(16, 2) labels.shape=torch.Size([16, 1])
    # eq里面的两个参数的shape=torch.Size([16]) 
    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()         
    if 0:
        print('binary acc ********')
        print('preds = ', preds)
        print('labels = ', labels)
        print('correct = ', correct)
    acc = correct.sum().item() / len(correct)
    return acc



In [27]:
import time
import datetime
# 时间格式化
def format_time(elapsed):    
    elapsed_rounded = int(round((elapsed)))    
    return str(datetime.timedelta(seconds=elapsed_rounded))   #返回 hh:mm:ss 形式的时间



In [28]:
def train(model, optimizer):
    # 记录当前时刻
    t0 = time.time()
    # 统计m每个batch的loss 和 acc
    avg_loss, avg_acc = [],[]
    
    # 开启训练模式
    model.train()
    for step, batch in enumerate(train_dataloader):
        # 每隔40个batch 输出一下所用时间.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # 从batch中取数据，并放到GPU中
        b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
        # 前向传播，得到output
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # 得到loss和预测结果logits
        loss, logits = output[0], output[1]
        # 记录每次的loss和acc
        avg_loss.append(loss.item())
        # 评估acc
        acc = binary_acc(logits, b_labels)
        avg_acc.append(acc)
        # 清空上一轮梯度
        optimizer.zero_grad()
        # 反向传播
        loss.backward()
        # 大于1的梯度将其设为1.0, 以防梯度爆炸
        clip_grad_norm_(model.parameters(), 1.0)
        # 更新模型参数
        optimizer.step()
        #更新learning rate
        scheduler.step()
    # 统计平均loss和acc
    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc


In [29]:
# 模型评估
def evaluate(model):
    avg_acc = []
    #表示进入测试模式
    model.eval()         

    with torch.no_grad():
        for batch in test_dataloader:
            # 从batch中取数据，并放到GPU中
            b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
            # 前向传播，得到output
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            # 统计当前batch的acc
            acc = binary_acc(output[0], b_labels)
            avg_acc.append(acc)
    # 统计平均acc
    avg_acc = np.array(avg_acc).mean()
    return avg_acc


In [30]:
# 训练 & 评估
for epoch in range(epochs): 
    # 模型训练
    train_loss, train_acc = train(model, optimizer)
    print('epoch={},训练准确率={}，损失={}'.format(epoch, train_acc, train_loss))
    # 模型评估
    test_acc = evaluate(model)
    print("epoch={},测试准确率={}".format(epoch, test_acc))


  Batch    40  of    500.    Elapsed: 0:00:13.
  Batch    80  of    500.    Elapsed: 0:00:26.
  Batch   120  of    500.    Elapsed: 0:00:40.
  Batch   160  of    500.    Elapsed: 0:00:53.
  Batch   200  of    500.    Elapsed: 0:01:07.
  Batch   240  of    500.    Elapsed: 0:01:20.
  Batch   280  of    500.    Elapsed: 0:01:34.
  Batch   320  of    500.    Elapsed: 0:01:47.
  Batch   360  of    500.    Elapsed: 0:02:01.
  Batch   400  of    500.    Elapsed: 0:02:15.
  Batch   440  of    500.    Elapsed: 0:02:29.
  Batch   480  of    500.    Elapsed: 0:02:43.
epoch=0,训练准确率=0.855125，损失=0.33603662432730197
epoch=0,测试准确率=0.8985
  Batch    40  of    500.    Elapsed: 0:00:14.
  Batch    80  of    500.    Elapsed: 0:00:28.
  Batch   120  of    500.    Elapsed: 0:00:42.
  Batch   160  of    500.    Elapsed: 0:00:56.
  Batch   200  of    500.    Elapsed: 0:01:10.
  Batch   240  of    500.    Elapsed: 0:01:24.
  Batch   280  of    500.    Elapsed: 0:01:38.
  Batch   320  of    500.    Elapsed: 0:

In [31]:
def predict(sen):
    # 将sen 转换为id
    input_id = convert_text_to_token(tokenizer, sen)
    print(input_id)
    # 放到tensor中
    input_token =  torch.tensor(input_id).long().to(device)            #torch.Size([128])
    # 统计有id的部分，即为 1(mask)，并且转换为float类型
    atten_mask = [float(i>0) for i in input_id]
    # 将mask放到tensor中
    attention_token = torch.tensor(atten_mask).long().to(device)       #torch.Size([128])
    # 转换格式 size= [1,128]， torch.Size([128])->torch.Size([1, 128])否则会报错
    attention_mask = attention_token.view(1, -1)

    output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_mask)
    return torch.max(output[0], dim=1)[1]

In [32]:
label = predict('酒店位置难找，环境不太好，隔音差，下次不会再来的。')
print('好评' if label==1 else '差评')

[101, 6983, 2421, 855, 5390, 7410, 2823, 8024, 4384, 1862, 679, 1922, 1962, 8024, 7392, 7509, 2345, 8024, 678, 3613, 679, 833, 1086, 3341, 4638, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
差评


In [33]:
label = predict('酒店还可以，接待人员很热情，卫生合格，空间也比较大，不足的地方就是没有窗户')
print('好评' if label==1 else '差评')

[101, 6983, 2421, 6820, 1377, 809, 8024, 2970, 2521, 782, 1447, 2523, 4178, 2658, 8024, 1310, 4495, 1394, 3419, 8024, 4958, 7313, 738, 3683, 6772, 1920, 8024, 679, 6639, 4638, 1765, 3175, 2218, 3221, 3766, 3300, 4970, 2787, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
好评


In [34]:
label = predict('"服务各方面没有不周到的地方, 各方面没有没想到的细节"')
print('好评' if label==1 else '差评')

[101, 107, 3302, 1218, 1392, 3175, 7481, 3766, 3300, 679, 1453, 1168, 4638, 1765, 3175, 117, 1392, 3175, 7481, 3766, 3300, 3766, 2682, 1168, 4638, 5301, 5688, 107, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
好评


In [35]:
sen = '酒店位置难找，环境不太好，隔音差，下次不会再来的。'
input_id = convert_text_to_token(tokenizer, sen)
print(input_id)

[101, 6983, 2421, 855, 5390, 7410, 2823, 8024, 4384, 1862, 679, 1922, 1962, 8024, 7392, 7509, 2345, 8024, 678, 3613, 679, 833, 1086, 3341, 4638, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [36]:
input_token =  torch.tensor(input_id).long().to(device)            #torch.Size([128])
print(input_token)

tensor([ 101, 6983, 2421,  855, 5390, 7410, 2823, 8024, 4384, 1862,  679, 1922,
        1962, 8024, 7392, 7509, 2345, 8024,  678, 3613,  679,  833, 1086, 3341,
        4638,  511,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0')


In [37]:
# 统计有id的部分，即为 1(mask)，并且转换为float类型
atten_mask = [float(i>0) for i in input_id]
print('atten_mask=\n', atten_mask)

atten_mask=
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [38]:
# 将mask放到tensor中
attention_token = torch.tensor(atten_mask).long().to(device)       #torch.Size([128])
# 转换格式 size= [1,128]
attention_mask = attention_token.view(1, -1)
print(attention_mask.size())

torch.Size([1, 128])


In [39]:
output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_mask)     #torch.Size([128])->torch.Size([1, 128])否则会报错
print(output)
print(output[0])

print('result=', torch.max(output[0], dim=1)[1])

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.6870, -2.6489]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
tensor([[ 2.6870, -2.6489]], device='cuda:0', grad_fn=<AddmmBackward>)
result= tensor([0], device='cuda:0')


In [40]:
label = predict('总体不错，但是一楼商店外包后，商品质量不能保证')
print('好评' if label==1 else '差评')

[101, 2600, 860, 679, 7231, 8024, 852, 3221, 671, 3517, 1555, 2421, 1912, 1259, 1400, 8024, 1555, 1501, 6574, 7030, 679, 5543, 924, 6395, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
好评


In [None]:
label = predict('风光秀丽')
print('好评' if label==1 else '差评')