In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,AutoConfig, AutoModel, BitsAndBytesConfig
from transformers.generation.utils import GenerationConfig
import torch.nn as nn


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/anaconda3/envs/ytw_llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.6/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /root/anaconda3/envs/ytw_llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116.so...


In [2]:
import os 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  #（保证程序cuda序号与实际cuda序号对应）
os.environ['CUDA_VISIBLE_DEVICES'] = "1"  #（代表仅使用第0，1号GPU）
torch.cuda.set_device(1) # 只有这句可以其效果

In [3]:
## 模型加载
#使用QLoRA引入的 NF4量化数据类型以节约显存
model_name_or_path ='/work/ytw/LLM/Baichuan2-13B-Chat' #远程 'baichuan-inc/Baichuan-13B-Chat'

bnb_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )

tokenizer = AutoTokenizer.from_pretrained(
   model_name_or_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                quantization_config=bnb_config,
                trust_remote_code=True) 

model.generation_config = GenerationConfig.from_pretrained(model_name_or_path)

Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.24s/it]


In [4]:
# 测试用例
from IPython.display import clear_output 
messages = []
messages.append({"role": "user",
                 "content": "世界上第二高的山峰是哪座?"})
response = model.chat(tokenizer,messages=messages,stream=True)
for res in response:
    print(res)
    clear_output(wait=True)

世界上第二高的山峰是乔戈里峰（K2），海拔8,611米（28,251英尺）。它位于巴基斯坦和中国边境的喀喇昆仑山脉。


In [5]:
prefix = '''命名实体识别：抽取文本中的 人名，地点，组织 这三类命名实体，并按照json格式返回结果。

下面是一些范例：

小明对小红说:"你听说过安利吗？" -> {"人名": ["小明","小红"], "组织": ["安利"]}
现在，每年有几十万中国人到美国访问，几千名中国留学生到美国就学。 -> {"地点": ["中国", "美国"]}
中国是联合国安理会常任理事国之一。 -> {"地点": ["中国"], "组织": ["联合国"]}

请对下述文本进行实体抽取，返回json格式。

'''

In [6]:
## 构造简单的few-shot prompt
def get_prompt(text):
    return prefix+text+' -> '

def get_message(prompt,response):
    return [{"role": "user", "content": f'{prompt} -> '},
            {"role": "assistant", "content": response}]


In [7]:
messages  = [{"role": "user", "content": get_prompt("一些摩洛哥球迷已按捺不住，在看台上欢呼雀跃")}]
response = model.chat(tokenizer, messages)
print(response)

{"人名": [], "地点": ["摩洛哥"], "组织": []}


In [8]:
messages = messages+[{"role": "assistant", "content": "{'地点': ['摩洛哥']}"}]
messages.extend(get_message("这次轮到北京国安队，不知会不会再步后尘？","{'组织': ['北京国安队']}"))
messages.extend(get_message("革命党人孙中山在澳门成立同盟会分会","{'人名': ['孙中山'], '地名': ['澳门'], '组织': ['同盟会']}"))
messages.extend(get_message("我曾在安徽芜湖市和上海浦东打工。","{'地点': ['安徽芜湖市', '上海浦东']}"))
print(messages)

[{'role': 'user', 'content': '命名实体识别：抽取文本中的 人名，地点，组织 这三类命名实体，并按照json格式返回结果。\n\n下面是一些范例：\n\n小明对小红说:"你听说过安利吗？" -> {"人名": ["小明","小红"], "组织": ["安利"]}\n现在，每年有几十万中国人到美国访问，几千名中国留学生到美国就学。 -> {"地点": ["中国", "美国"]}\n中国是联合国安理会常任理事国之一。 -> {"地点": ["中国"], "组织": ["联合国"]}\n\n请对下述文本进行实体抽取，返回json格式。\n\n一些摩洛哥球迷已按捺不住，在看台上欢呼雀跃 -> '}, {'role': 'assistant', 'content': "{'地点': ['摩洛哥']}"}, {'role': 'user', 'content': '这次轮到北京国安队，不知会不会再步后尘？ -> '}, {'role': 'assistant', 'content': "{'组织': ['北京国安队']}"}, {'role': 'user', 'content': '革命党人孙中山在澳门成立同盟会分会 -> '}, {'role': 'assistant', 'content': "{'人名': ['孙中山'], '地名': ['澳门'], '组织': ['同盟会']}"}, {'role': 'user', 'content': '我曾在安徽芜湖市和上海浦东打工。 -> '}, {'role': 'assistant', 'content': "{'地点': ['安徽芜湖市', '上海浦东']}"}]


In [9]:
def predict(text,temperature=0.01):
    model.generation_config.temperature=temperature
    response = model.chat(tokenizer, 
                          messages = messages+[{'role':'user','content':f'{text} -> '}])
    return response


In [10]:
predict('杜甫是李白的粉丝。') 

"{'人名': ['李白', '杜甫']}"

In [11]:
# 利用作者给出的简单ner数据集进行测试
from sklearn.model_selection import train_test_split
import pandas as pd 
from tqdm import tqdm

df = pd.read_pickle('dfner_13k.pkl')
dfdata,dftest = train_test_split(df,test_size=300,random_state=42)
dftrain,dfval = train_test_split(dfdata,test_size=200,random_state=42)

preds = ['' for x in dftest['target']]
for i in tqdm(range(len(preds))):
    preds[i] = predict(dftest['text'].iloc[i])
    

def toset(s):
    try:
        dic = eval(str(s))
        res = []
        for k,v in dic.items():
            for x in v:
                if x:
                    res.append((k,x))
        return set(res)
    except Exception as err:
        print(err)
        return set()

dftest['pred'] = [toset(x) for x in preds]
dftest['gt'] = [toset(x) for x in dftest['target']]
dftest['tp_cnt'] = [len(pred&gt) for pred,gt in zip(dftest['pred'],dftest['gt'])]
dftest['pred_cnt'] = [len(x) for x in dftest['pred']]
dftest['gt_cnt'] = [len(x) for x in dftest['gt']]

precision = sum(dftest['tp_cnt'])/sum(dftest['pred_cnt'])
print('precision = '+str(precision))

recall = sum(dftest['tp_cnt'])/sum(dftest['gt_cnt'])
print('recall = '+str(recall))

f1 = 2*precision*recall/(precision+recall)
print('f1_score = '+str(f1))

100%|██████████| 300/300 [09:51<00:00,  1.97s/it]

unhashable type: 'dict'
precision = 0.49033816425120774
recall = 0.5858585858585859
f1_score = 0.5338593030900723





我们仿照百川模型的 model._build_chat_input 方法来进行token编码，同时把需要学习的内容添加label.

In [12]:
import torch 

#将messages编码成 token, 同时返回labels
#注意baichuan-13b通过插入tokenizer.user_token_id和tokenizer.assistant_token_id 来区分用户和机器人会话内容

# reference@ model._build_chat_input?
def build_chat_input(messages, model=model,
                     tokenizer=tokenizer, 
                     max_new_tokens: int=0):
    max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens
    max_input_tokens = model.config.model_max_length - max_new_tokens
    max_input_tokens = max(model.config.model_max_length // 2, max_input_tokens)
    
    total_input, round_input, total_label, round_label = [], [], [], []
    
    for i, message in enumerate(messages[::-1]):
        content_tokens = tokenizer.encode(message['content'])
        if message['role'] == 'user':
            round_input = [model.generation_config.user_token_id] + content_tokens + round_input
            round_label = [-100]+[-100 for _ in content_tokens]+ round_label
            
            if total_input and len(total_input) + len(round_input) > max_input_tokens:
                break
            else:
                total_input = round_input + total_input
                total_label = round_label + total_label
                if len(total_input) >= max_input_tokens:
                    break
                else:
                    round_input = []
                    round_label = []
                    
        elif message['role'] == 'assistant':
            round_input = [
                model.generation_config.assistant_token_id
            ] + content_tokens + [
                model.generation_config.eos_token_id
            ] + round_input
            
            if i==0: #仅对最后一轮的target进行学习
                round_label = [
                    -100
                ] + content_tokens + [
                    model.generation_config.eos_token_id
                ]+ round_label
            else:
                round_label = [
                    -100
                ] + [-100 for _ in content_tokens] + [
                    -100
                ]+ round_label
                
        else:
            raise ValueError(f"message role not supported yet: {message['role']}")
            
    total_input = total_input[-max_input_tokens:]  # truncate left
    total_label = total_label[-max_input_tokens:]
    
    total_input.append(model.generation_config.assistant_token_id)
    total_label.append(-100)
    
    return total_input,total_label


In [None]:
# 做数据集
from torch.utils.data import Dataset,DataLoader 
from copy import deepcopy
class MyDataset(Dataset):
    def __init__(self,df,
                 messages
                ):
        self.df = df 
        self.messages = messages
        
    def __len__(self):
        return len(self.df)
        
    def get_samples(self,index):
        samples = []
        d = dict(self.df.iloc[index])
        samples.append(d)
        return samples
    
    def get_messages(self,index):
        samples = self.get_samples(index)
        messages = deepcopy(self.messages)
        for i,d in enumerate(samples):

            messages.append({'role':'user','content':d['text']+' -> '})
            messages.append({'role':'assistant','content':str(d['target'])})
        return messages
        
    def __getitem__(self,index):
        messages = self.get_messages(index)
        input_ids, labels = build_chat_input(messages)
        return {'input_ids':input_ids,'labels':labels}

    def show_sample(self,index):
        samples = self.get_samples(index)
        print(samples)
    
    

ds_train = MyDataset(dftrain,messages)
ds_val = MyDataset(dfval,messages)


In [None]:
# 手动释放显存
pid = os.getpid()
!kill -9 $pid