### ChatGLM3-6B

前置依赖

In [None]:
# Quote "torch>=2.0": the install command is handed to the system shell, where an
# unquoted ">" is an output redirection (it would create a file named "=2.0" and
# drop the version constraint). The explicit %pip magic also works in a multi-line cell.
%pip install protobuf transformers==4.30.2 cpm_kernels "torch>=2.0" gradio mdtex2html sentencepiece accelerate

运行模型

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and model from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    "model/chatglm3-6b",
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(
    "model/chatglm3-6b",
    trust_remote_code=True,
)
# Convert to half precision, move to the CUDA device, then switch to eval mode.
model = model.half().cuda()
model = model.eval()
# History list handed to stream_chat and replaced by the cell that calls it.
history_ = []

In [None]:
# Stream the reply for one prompt; each yielded pair is unpacked as (response, history).
for response, history in model.stream_chat(tokenizer, "你的任务是什么?", history=history_):
    # Keep the latest history so it can be passed to a following call.
    history_ = history
    print(response)

### CCGPTModel

In [None]:
import sys
from CC.predictor.chatglm import GPTPredict

# Build the project's GPT predictor on top of the local checkpoint.
# NOTE(review): model_name says "ChatGLM2-6B" while the weights path is chatglm3-6b —
# presumably the name only selects a loader; confirm in CC.predictor.chatglm.
predictor = GPTPredict(
    model_name="ChatGLM2-6B",
    model_from_pretrained="model/chatglm3-6b",
)

In [None]:
# Single call with an empty history; the 'data' entry of the result is printed.
res = predictor("你好?", history=[])
print(res['data'])

In [None]:
# Consume the streaming predictor and rewrite the output line on every yielded item.
for res in predictor.predict_stream("你的任务是什么?", history=[]):
    # The leading '\r' moves the cursor back to the line start before each write.
    sys.stdout.write('\r' + res['data'])
    sys.stdout.flush()

In [None]:
# Read the whole prompt from ./a.txt and send it as one request with an empty history.
with open('./a.txt', encoding='utf-8') as f:
    ask_content = f.read()
res = predictor(ask_content, history=[])
print(res['data'])

### ChatGLM_LoRA

In [None]:
from CC.trainer.chatglm_lora import Trainer
from transformers import AutoTokenizer, AutoConfig

# Tokenizer and config both come from the local ChatGLM3-6B checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "model/chatglm3-6b",
    trust_remote_code=True,
)
config = AutoConfig.from_pretrained(
    "model/chatglm3-6b",
    trust_remote_code=True,
)
# Trainer for the 'FDEX2' task, using the 'ChatGLM_Chat' loader.
trainer = Trainer(
    tokenizer=tokenizer,
    config=config,
    from_pretrained='./model/chatglm3-6b',
    loader_name='ChatGLM_Chat',
    data_path='FDEX2',
    max_length=3600,
    batch_size=1,
    task_name='FDEX2',
)

In [None]:
# Iterate whatever trainer(num_epochs=5) yields until it is exhausted.
# Presumably each iteration advances training — confirm in CC.trainer.chatglm_lora.
for i in trainer(num_epochs=5):
    # Keep the most recently yielded value for inspection after the loop.
    a = i

##### Chat预测

In [None]:
from CC.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved FDRAG run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/FDRAG/ChatGLM_44136',
)

In [None]:
# Start with an empty history; the chat cell below replaces it after each call.
history = []

In [None]:
# Ask one question with the retrieved passage embedded between <rag>...</rag> tags.
result = pred.chat('<rag>检索增强知识: \n1.《政府采购代理机构管理暂行办法》(财库[2018]2号)\n第十三条 代理机构受采购人委托办理采购事宜，应当与采购人签订委托代理协议，明确采购代理范围、权限、期限、档案保存、代理费用收取方式及标准、协议解除及终止、违约责任等具体事项，约定双方权利义务。</rag>\n请根据以上检索增强知识回答以下问题\n采购人委托采购代理机构代理采购项目，发布招标公告后，有权更换采购代理机构吗?', max_length=3000, history=history)
# result[1] is stored as the new history; result[0] is printed.
history = result[1]
print(result[0])

##### 预测文本

In [None]:
from CC.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved FDLaw run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/FDLaw/ChatGLM_5755',
)

In [None]:
# Plain-text prediction for one product title.
# NOTE(review): "Instrcution" is kept verbatim — presumably it matches the prompt
# format used for fine-tuning; confirm before correcting the spelling.
result = pred('Instrcution: 请识别该商品的要素: 理光（Ricoh） M2700/M2701/2702多功能黑白激光复合机 a3复合机打印机一体机办公 M 2702(网络+双面+输稿器+7寸触屏) 官方标配\n Answer:', max_length=512)
print(result)

In [None]:
# Read the whole prompt from ./a.txt and run a plain-text prediction on it.
with open('./a.txt', encoding='utf-8') as f:
    ask_content = f.read()
result = pred(ask_content, max_length=512)
print(result)

### ChatGLM_LoRA RAG 推理

In [None]:
# Create or load the chromadb client
import chromadb
from chromadb.utils import embedding_functions

DB_SAVE_DIR = './chroma_data'
DB_NAME = 'FDQA'
N_RESULTS = 1

# Persistent client backed by DB_SAVE_DIR.
client = chromadb.PersistentClient(DB_SAVE_DIR)
# Sentence-transformer embedding function used for the collection.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="DMetaSoul/sbert-chinese-general-v2",
)
# Open the collection, creating it with the cosine space setting if it is missing.
collection = client.get_or_create_collection(
    DB_NAME,
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"},
)

In [None]:
from CC.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved FDRAG run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/FDRAG/ChatGLM_44136',
)

In [None]:
# Start with an empty history; the RAG cell below replaces it after each call.
history = []

In [None]:
user_question = '采购人委托采购代理机构代理采购项目，发布招标公告后，有权更换采购代理机构吗?'
# Look up the N_RESULTS nearest entries for the question in the collection.
res = collection.query(
    query_texts=[user_question],
    n_results=N_RESULTS
)
# The result holds one inner list per query text, so with a single query the
# outer list is never empty; "nothing retrieved" shows up as an empty inner
# list. Check the inner list (and the 'clue' key) so the fallback text is
# actually used instead of raising IndexError/KeyError.
metadatas = res.get('metadatas') or [[]]
hits = metadatas[0] or []
if hits and hits[0] and 'clue' in hits[0]:
    clue = hits[0]['clue']
else:
    clue = '没有相关知识'
# Wrap the retrieved clue in <rag> tags ahead of the question.
rag_user_question = f'<rag>检索增强知识: \n{clue}</rag>\n请根据以上检索增强知识回答以下问题\n{user_question}'
result = pred.chat(rag_user_question, history=history)
# result[1] is stored as the new history; result[0] is printed.
history = result[1]
print(result[0])

### Qianwen_LoRA

运行前请参阅[Qianwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)安装相关依赖.

In [None]:
from CC.trainer.qianwen_lora import Trainer
from transformers import AutoTokenizer, AutoConfig

# Tokenizer and config both come from the local Qwen-14B-Chat-Int4 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "model/Qwen-14B-Chat-Int4",
    trust_remote_code=True,
)
config = AutoConfig.from_pretrained(
    "model/Qwen-14B-Chat-Int4",
    trust_remote_code=True,
)
# Set the disable_exllama flag on the config before it is given to the trainer.
config.disable_exllama = True
# Trainer for the 'FD_Qianwen' task, using the 'Qianwen_Chat' loader.
trainer = Trainer(
    tokenizer=tokenizer,
    config=config,
    from_pretrained='./model/Qwen-14B-Chat-Int4',
    loader_name='Qianwen_Chat',
    data_path='FD',
    max_length=512,
    batch_size=1,
    task_name='FD_Qianwen',
)

In [None]:
# Iterate whatever trainer(num_epochs=5) yields until it is exhausted.
# Presumably each iteration advances training — confirm in CC.trainer.qianwen_lora.
for i in trainer(num_epochs=5):
    # Keep the most recently yielded value for inspection after the loop.
    a = i

使用Accelerator分布式训练加速

In [None]:
# Run run_qianwen_lora.py through the `accelerate launch` command line via a shell escape.
! accelerate launch run_qianwen_lora.py

#### Chat预测

In [None]:
from CC.predictor.qianwen_lora import Predictor

# Qwen predictor built from the Int4 checkpoint plus the saved run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/Qwen-14B-Chat-Int4',
    resume_path='./save_model/FDQALaw_Qianwen/Qwen_39000',
)

In [None]:
# One chat call without an explicit history; result[0] is printed.
result = pred.chat('hello,我想问下中华人民共和国民法典中第三条是什么?')
print(result[0])

##### 预测文本

In [None]:
from CC.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved BossCondition run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/BossCondition/ChatGLM_25108',
)

In [None]:
# Plain-text prediction for one product title.
# NOTE(review): "Instrcution" is kept verbatim — presumably it matches the prompt
# format used for fine-tuning; confirm before correcting the spelling.
result = pred('Instrcution: 请识别该商品的要素: 理光（Ricoh） M2700/M2701/2702多功能黑白激光复合机 a3复合机打印机一体机办公 M 2702(网络+双面+输稿器+7寸触屏) 官方标配\n Answer:', max_length=512)
print(result)

In [None]:
import json
from tqdm import tqdm

# Load the retrieved items, one tab-separated record per line.
with open('./data/Boss/BertPred/复印机_retrieved.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# Drop the empty string left behind by a trailing newline.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

result = []
# Named `progress` rather than `iter` so the builtin iter() is not shadowed
# for every cell that runs afterwards.
progress = tqdm(ori_list)
for item in progress:
    item = item.split('\t')
    # The third column is the text placed in the prompt.
    # NOTE(review): "Instrcution" is kept verbatim — presumably it matches the
    # fine-tuning prompt format; confirm before correcting the spelling.
    res = pred(f'Instrcution: 请识别该商品的要素: {item[2]}\n Answer:', max_length=512)
    res_item = {
        'pred': res
    }
    # Show only the text after the first 'Answer:' marker in the progress bar.
    # When the marker is absent find() returns -1, which previously produced
    # the meaningless slice res[6:]; show the whole output in that case.
    answer_index = res.find('Answer:')
    shown = res if answer_index == -1 else res[answer_index + 7:]
    progress.set_postfix(pred=json.dumps(shown, ensure_ascii=False))
    result.append(res_item)

# Write one JSON object per line.
with open('./data_record/BertPred_ChatGLMLoRA/复印机.json', 'w', encoding='utf-8') as f:
    for item in result:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

##### 预测商品蕴涵关系

In [None]:
from CC.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved BossRTE run at resume_path.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/BossRTE/ChatGLM_22264',
)

In [None]:
# One same-product judgement for a Source/Target pair of product descriptions.
result = pred('Instruction: 请判断以下两个商品是否为同款商品\nContext: Source: 联想(Lenovo）启天 M415-B114 台式计算机 I3-7100/8G/1T/无光驱/15L机箱/21.5寸显示器 5288\nTarget: 戴尔（DELL） I3-6100 戴尔（DELL）成就3667-R1308商用台式电脑整机（i3-6100 4G 1T WIFI 蓝牙 三年上门 硬盘保留 Win10）19.5英寸 3455\nAnswer: ', max_length=512)
print(result)

STS数据集

In [None]:
import json
from tqdm import tqdm

# Load the dev set: one JSON object per line.
with open('./data/Boss/RTE/dev.json', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# Drop the empty string left behind by a trailing newline.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# Named `progress` rather than `iter` so the builtin iter() is not shadowed
# for every cell that runs afterwards.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print works even when the file has no rows.
p = r = f1 = 0
for item in progress:
    item = json.loads(item)
    res = pred(item['context'], max_length=512)
    # The label is whatever follows the '预测结果: ' marker; an output without
    # the marker is counted as a positive (1) prediction.
    res = res.split('预测结果: ')
    if len(res) < 2:
        res = 1
    else:
        res = int(res[1])
    gold = int(item['target'].split('预测结果: ')[1])
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    # Guard every denominator: before the first positive prediction (or first
    # gold positive) the sums are zero and plain division raised
    # ZeroDivisionError. Same guards as the other evaluation cells.
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

全样本环境预测

In [None]:
import json
from tqdm import tqdm

# Load the retrieved items, one tab-separated record per line.
with open('./data/Boss/BertPred/复印机_retrieved.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# Drop the empty string left behind by a trailing newline.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# `progress` / `group_id` replace the former `iter` / `id` names, which
# shadowed the builtins for every cell that runs afterwards.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print works even when every row is skipped.
p = r = f1 = 0
# The first column is a string, so it never equals None and the first row
# always opens a new group.
current_id = None
current_index = 0
for idx, item in enumerate(progress):
    item = item.split('\t')
    group_id = item[0]
    # The first row of each id group is the source that later rows are compared to.
    if group_id != current_id:
        current_id = group_id
        current_index = idx
    ori_item = ori_list[current_index]
    ori_item = ori_item.split('\t')
    # Skip rows whose third column equals the source's (this includes the source row itself).
    if ori_item[2] == item[2]:
        continue
    # Rows with fewer than four columns get 1 appended so index 3 exists.
    if len(item) < 4:
        item.append(1)
    if len(ori_item) < 4:
        ori_item.append(1)
    res = pred(f"Instruction: 请判断以下两个商品是否为同款商品\nContext: Source: {ori_item[2]} {ori_item[3]}\nTarget: {item[2]} {item[3]}\nAnswer: ", max_length=512)
    # The label is whatever follows the '预测结果: ' marker; an output without
    # the marker is counted as a positive (1) prediction.
    res = res.split('预测结果: ')
    if len(res) < 2:
        res = 1
    else:
        res = int(res[1])
    # The gold label is read from the sixth column, defaulting to 1 when absent.
    gold = int(item[5]) if len(item) > 5 else 1
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

In [None]:
# Recompute precision/recall/F1 after adding fixed offsets to the counted
# true positives and false positives from the evaluation cell above.
# NOTE(review): 988 and 52 are unexplained constants — presumably counts of
# samples handled outside the loop; confirm where they come from.
tp_ = tp + 988 / 2
fp_ = fp + 52 / 2
p = tp_ / (tp_ + fp_)
r = tp_ / (tp_ + fn)
f1 = 2 * p * r / (p + r)
print(f'F1: {f1}, P: {p}, R: {r}')

### 使用ChatGLM3-6B对商品进行要素抽取

In [None]:
import sys
from CC.predictor.chatglm import GPTPredict

# Build the project's GPT predictor on top of the local checkpoint.
# NOTE(review): model_name says "ChatGLM2-6B" while the weights path is chatglm3-6b —
# presumably the name only selects a loader; confirm in CC.predictor.chatglm.
predictor = GPTPredict(
    model_name="ChatGLM2-6B",
    model_from_pretrained="model/chatglm3-6b",
)

In [None]:
import json
from tqdm import tqdm

# Read the training set, one JSON object per line. The encoding is given
# explicitly, as in every other read in this file: without it the platform's
# locale codec is used, which can fail or mis-decode the Chinese text.
with open('./data/Boss/train.json', encoding='utf-8') as f:
    ori_data = f.read().split('\n')

# Drop the empty string left behind by a trailing newline.
if ori_data[-1] == '':
    ori_data.pop()

result = []
for item in tqdm(ori_data):
    data = json.loads(item)
    item_id = data['item_id']
    context = data['context']
    # Remove the '\n Answer: ' suffix text from the context to form the question.
    question = context.replace('\n Answer: ', '')
    res = predictor.generate(question, max_length=1024)
    result.append({'item_id': item_id, 'question': question, 'answer': res})

# Write one JSON object per line.
with open('./data/Boss/train_result.json', encoding='utf-8', mode='w') as f:
    for item in result:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

### 使用ChatGLM3-6B预测商品蕴涵关系

In [None]:
import sys
from CC.predictor.chatglm import GPTPredict

# Build the project's GPT predictor on top of the local checkpoint.
# NOTE(review): model_name says "ChatGLM2-6B" while the weights path is chatglm3-6b —
# presumably the name only selects a loader; confirm in CC.predictor.chatglm.
predictor = GPTPredict(
    model_name="ChatGLM2-6B",
    model_from_pretrained="model/chatglm3-6b",
)

In [None]:
import json
from tqdm import tqdm

# Load the items, one tab-separated record per line.
with open('./data/Boss/BertPred/台式计算机.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# Drop the empty string left behind by a trailing newline.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# `progress` / `group_id` replace the former `iter` / `id` names, which
# shadowed the builtins for every cell that runs afterwards.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print works even when every row is skipped.
p = r = f1 = 0
# The first column is a string, so it never equals None and the first row
# always opens a new group.
current_id = None
current_index = 0
for idx, item in enumerate(progress):
    item = item.split('\t')
    group_id = item[0]
    # The first row of each id group is the source that later rows are compared to.
    if group_id != current_id:
        current_id = group_id
        current_index = idx
    ori_item = ori_list[current_index]
    ori_item = ori_item.split('\t')
    # Skip rows whose third column equals the source's (this includes the source row itself).
    if ori_item[2] == item[2]:
        continue
    # NOTE(review): index 3 is read without a length check here — presumably
    # every row of this file has at least four columns; confirm.
    res = predictor.generate(f"请判断以下两个商品是否为同款商品，直接回答“同款”或“非同款”即可。\n文本1： {ori_item[2]} {ori_item[3]}\n文本2： {item[2]} {item[3]}\n回答：", max_length=1024)
    # Any answer containing '非同款' is a negative; everything else is a positive.
    if '非同款' in res:
        res = 0
    else:
        res = 1
    # The gold label is read from the sixth column, defaulting to 1 when absent.
    gold = int(item[5]) if len(item) > 5 else 1
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

In [None]:
# Recompute precision/recall/F1 after adding fixed offsets to the counted
# true positives and false positives from the evaluation cell above.
# NOTE(review): 988 and 52 are unexplained constants — presumably counts of
# samples handled outside the loop; confirm where they come from.
tp_ = tp + 988 / 2
fp_ = fp + 52 / 2
p = tp_ / (tp_ + fp_)
r = tp_ / (tp_ + fn)
f1 = 2 * p * r / (p + r)
print(f'F1: {f1}, P: {p}, R: {r}')