# BERT

## 任务：分类

In [34]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from datetime import timedelta
from numpy import ndarray
from typing import Union, List, Dict
from sklearn.preprocessing import MinMaxScaler
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm import tqdm  # 打印进度条
import math
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List
from pandas.tseries import offsets
from pandas.tseries.frequencies import to_offset
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import codecs
import random
import warnings

warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ['SimHei']  # 显示中文标签
plt.rcParams['axes.unicode_minus'] = False

In [27]:
def generator(data_list, train_ratio, valid_ratio, tokenizer, batch_size, max_length: int = 128):
    """
    读取数据，并对数据进行划分，生成加载器

    参数说明
    ----------
    data_list : {list[DataFrame]}
        输入数据，包含数据和标签
    train_ratio : {float}
        用于训练的数据集占比:将数据按照一定比例进行切分，取值范围为(0,1)
    valid_ratio : {float}
        用于验证的数据集占比:将数据按照一定比例进行切分，取值范围为(0,1)
    tokenizer : {}
        分词器
    batch_size : {int} 
        输入数据的批次大小，正整数
    max_length : {int} 
        最大文本截取长度，正整数

    返回值
    -------
    data_loader : {torch.utils.data.dataloader.DataLoader}
        数据加载器
    """
    # 获取数据
    texts = data_list[0]  # 特征
    labels = data_list[1]  # 目标
    
    # 划分数据
    train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=train_ratio, random_state=42)

    # 定义数据集的Dataset类
    class CustomDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_length=max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
    
        def __len__(self):
            return len(self.labels)
    
        def __getitem__(self, idx):
            text = self.texts[idx]
            label = self.labels[idx]
            
            # 对文本进行token化，并转换为输入id和attention mask
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors="pt"
            )
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.long)
            }

    # 创建Dataset和DataLoader
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=max_length)
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_dataset, val_dataset, train_loader, valid_loader

In [28]:
# 加载语料，酒店评论
data = pd.read_csv('../../../../../data/03.nlp/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv')
data

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"
...,...,...
7761,0,尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...
7762,0,盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...
7763,0,看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...
7764,0,我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...


In [29]:
# 构造参数字典
params1 = {
    "data_list": [data['review'].values, data['label'].values],
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "tokenizer": BertTokenizer.from_pretrained('bert-base-chinese'),
    "batch_size": 3,
    "max_length": 128,
}

# 函数传参
train_dataset, val_dataset, train_loader, valid_loader = generator(**params1)

In [30]:
sample = train_dataset[0]  # 获取第一个样本
input_ids = sample['input_ids']  # 提取 input_ids
input_ids

tensor([ 101, 6983, 2421, 1762, 7188, 6662, 3178, 8024, 3241,  677, 4125,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [31]:
sample['labels']

tensor(0)

## 任务：命名实体识别

In [113]:
def load_iob2(file_path):
    '''加载 IOB2 格式的数据'''
    token_seqs = []
    label_seqs = []
    tokens = []
    labels = []
    with codecs.open(file_path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            items = line.strip().split()
            if len(items) == 2:
                token, label = items
                tokens.append(token)
                labels.append(label)
            elif len(items) == 0:
                if tokens:
                    token_seqs.append(tokens)
                    label_seqs.append(labels)
                    tokens = []
                    labels = []
            else:
                print(f'格式错误。行号：{index} 内容：{line.strip()}')
                continue
                
    if tokens:  # 文件末尾处理
        token_seqs.append(tokens)
        label_seqs.append(labels)
        
    return token_seqs, label_seqs


def show_iob2(token_seqs, label_seqs, num=5, shuffle=True):
    '''显示 IOB2 格式数据'''
    length = len(token_seqs)
    indexes = random.sample(range(length), min(num, length)) if shuffle else range(min(num, length))
    
    for i in indexes:
        tokens, labels = token_seqs[i], label_seqs[i]
        print(' '.join(f'{token}/{label}' for token, label in zip(tokens, labels)))
        print()


In [133]:
def generator(data_list, train_ratio, valid_ratio, tokenizer, batch_size, label2id, max_length=128):
    """
    读取数据，并对数据进行划分，生成加载器
    """
    texts = data_list[0]  # 特征
    labels = data_list[1]  # 标签（每个token的标签）

    # 划分训练集和临时验证集+测试集
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        texts, labels, test_size=(1 - train_ratio), random_state=42
    )

    # 划分验证集和测试集
    val_size = valid_ratio / (1 - train_ratio)
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts, temp_labels, test_size=1 - val_size, random_state=42
    )

    class CustomDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_length, label2id):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
            self.label2id = label2id

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            text = self.texts[idx]
            label = self.labels[idx]

            # 对文本进行token化
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                is_split_into_words=True,  # 支持拆分后的token输入
                return_tensors="pt"
            )

            # 标签编码：将标签映射为数字
            label_ids = [self.label2id.get(l, 0) for l in label]  # 使用get避免出现未知标签

            # 处理特殊标记：忽略[CLS], [SEP]等特殊token，填充标签时使用-100
            input_ids = encoding['input_ids'].squeeze(0)
            attention_mask = encoding['attention_mask'].squeeze(0)

            # 忽略特殊标记的位置（[CLS]和[SEP]），将其标签设为-100
            label_ids = label_ids + [0] * (self.max_length - len(label_ids))  # 填充标签
            label_ids = label_ids[:self.max_length]  # 确保长度不超过max_length

            # 获取特殊标记的索引：[CLS]的索引为0，[SEP]的索引为102
            special_tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
            cls_idx = special_tokens.index('[CLS]') if '[CLS]' in special_tokens else -1
            sep_idx = special_tokens.index('[SEP]') if '[SEP]' in special_tokens else -1
            pad_ids = [idx for idx, token in enumerate(special_tokens) if token == '[PAD]']

            # 将特殊标记和PAD的标签设置为-100（忽略这些位置）
            if cls_idx != -1:
                label_ids[cls_idx] = -100
            if sep_idx != -1:
                label_ids[sep_idx] = -100
            for pad_idx in pad_ids:
                label_ids[pad_idx] = -100

            return {
                'input_ids': input_ids,  # 取消batch维度
                'attention_mask': attention_mask,  # 取消batch维度
                'labels': torch.tensor(label_ids, dtype=torch.long)  # 标签
            }

    # 创建Dataset和DataLoader
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length, label2id)
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length, label2id)
    test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_length, label2id)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader

In [134]:
token_seqs, label_seqs = load_iob2('../../../../../data/03.nlp/ChineseNlpCorpus/datasets/dh_msra/dh_msra/dh_msra.txt')

In [135]:
token_seqs[30]

['由',
 '于',
 '这',
 '一',
 '时',
 '期',
 '战',
 '争',
 '频',
 '繁',
 '，',
 '条',
 '件',
 '艰',
 '苦',
 '，',
 '又',
 '遭',
 '国',
 '民',
 '党',
 '毁',
 '禁',
 '，',
 '传',
 '世',
 '量',
 '稀',
 '少',
 '，',
 '购',
 '藏',
 '不',
 '易',
 '。']

In [136]:
label_seqs[30]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [137]:
# 标签映射，假设标签为字符串，需要映射到整数
label2id = {
    'O': 0,       # 不是实体的标签
    'B-PER': 1,   # 人名
    'I-PER': 2,   # 人名的内部
    'B-LOC': 3,   # 地点
    'I-LOC': 4,   # 地点的内部
    'B-ORG': 5,   # 组织
    'I-ORG': 6    # 组织的内部
}

In [138]:
# 构造参数字典
params1 = {
    "data_list": [token_seqs, label_seqs],
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "tokenizer": BertTokenizer.from_pretrained('bert-base-chinese'),
    "batch_size": 3,
    "max_length": 128,
    "label2id": label2id
}

# 函数传参
train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader = generator(**params1)

In [139]:
sample = train_dataset[30]  # 获取第一个样本
input_ids = sample['input_ids']  # 提取 input_ids
input_ids

tensor([ 101, 1343, 2399, 1374, 2458, 4638,  704, 1744, 1066,  772, 1054, 5018,
        1282,  758, 3613, 1059, 1744,  807, 6134, 1920,  833, 1469, 1184,  679,
         719, 1374, 2458, 4638,  736, 2237, 1059, 1744,  782, 1920,  671, 3613,
         833, 6379, 8024, 4802, 2137,  749, 2769,  812, 1744, 2157, 6659,  686,
        5279, 1355, 2245, 4638, 2131,  836, 4680, 3403,  511,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [140]:
sample['labels']

tensor([-100,    0,    0,    0,    0,    5,    6,    6,    6,    6,    6,    6,
           6,    6,    6,    6,    6,    6,    6,    6,    0,    0,    0,    0,
           0,    0,    0,    5,    6,    6,    6,    6,    6,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100])

In [119]:
sample['attention_mask']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

## 任务：问答系统

In [130]:
data = pd.read_csv('../../../../../data/03.nlp/ChineseNlpCorpus/datasets/baoxianzhidao/baoxianzhidao_filter.csv')
data

Unnamed: 0,title,question,reply,is_best
0,最近在安邦长青树中看到什么豁免，这个是什么意思？,,您好，这个是重疾险中给予投保者的一项权利，安*长青树保障责任规定，投保者可以享受多次赔付，豁...,1
1,和老婆利用假期去澳*探亲，但是第一次去不大熟悉，有没有相关保险呢？,,您好，HUTS保险中的乐游全球（探亲版）-慧择旅游保险澳新计划是澳*新西兰探亲专属保障，承保...,0
2,HUTS中有没有适合帆船比赛的保险，我男朋友这周就要开始了,,您好，水上运动比赛，尤其是带有奖金的比赛一般承保的公司比较少。不过，HUTS保险中的众行天下...,1
3,计划端午节和男朋友自驾去九*山，买保险三天要多少钱？,,您好，端午出行的人比较多，而且自驾存在一定风险，所以有保险意识还是很好的。考虑到价格以及保障...,1
4,计划端午节和男朋友自驾去九*山，买保险三天要多少钱？,,不到10块钱………………,0
...,...,...,...,...
8357,如何为一家三口买保险？,近段时间一直想给自己的小家买份保险，但是保险公司多，保险品种更多，看得眼花。所以想请各位专家...,你好！每年的保费不要超过年收入的20%,0
8358,如何为一家三口买保险？,近段时间一直想给自己的小家买份保险，但是保险公司多，保险品种更多，看得眼花。所以想请各位专家...,可以退保费的意外险下载注册平安app，填邀请码自已投保里面有N个一百万身价,0
8359,如何为一家三口买保险？,近段时间一直想给自己的小家买份保险，但是保险公司多，保险品种更多，看得眼花。所以想请各位专家...,你好平安守护星是一款分红型产品也可以做为教育金为主是一款少儿产品如有意向可以私聊我具体了解,0
8360,23岁买什么保险好啊？,我今年刚刚23岁，大学毕业刚开始工作，想给自己买份保险，不知道有什么保险好啊？,根据您提供的信息，建议您购买一份综合意外保险。保障普通意外、意外医疗、交通意外、住院津贴等等...,1
