In [1]:
import logging
import json
import os
import random
import re

import numpy as np
# from Utils import utils
import pandas as pd
from openai import OpenAI
from rouge_chinese import Rouge
import jieba

# Project root: three directory levels above the current working directory.
project_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
# Data directory and the prompt-template directory beneath it.
data_path = os.path.join(project_path, 'FT4LLM/Data')
prompt_path = os.path.join(data_path, 'prompt')

from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
# Only let jieba log errors.
jieba.setLogLevel(logging.ERROR)

# Number of seed examples sampled into each few-shot prompt.
num_prompt_instructions = 5

# OpenAI-compatible client pointed at an endpoint on localhost.
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
)

In [2]:

# Load the bank of previously generated instructions, or start an empty one.
if os.path.exists(data_path + "/machine_instructions_zh.json"):
    machine_data_df = pd.read_json(data_path + "/machine_instructions_zh.json")
    print(f"Loaded {len(machine_data_df)} GPT-written seed instructions")
else:
    print("Create new bank for machine_instructions")
    machine_data_df = pd.DataFrame()

# Read the seed tasks, one JSON object per line.
seed_tasks = []
with open(data_path + "/seed_tasks_seizure_zh.jsonl", "r", encoding='utf-8') as file:
    for line in file:
        # Tolerate a trailing comma after each object.
        stripped_line = line.strip().rstrip(',')
        if not stripped_line:
            # Blank lines (e.g. a trailing newline at end of file) would make
            # json.loads raise, so skip them.
            continue
        seed_tasks.append(json.loads(stripped_line))

# Flatten each seed task to its instruction plus the first instance's input/output.
seed_instruction_data = [
    {
        "instruction": t["instruction"],
        "input": t["instances"][0]["input"],
        "output": t["instances"][0]["output"],
    }
    for t in seed_tasks
]
print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")




In [3]:
# Preview the first rows of the bank instead of rendering the whole frame.
machine_data_df.head()

In [8]:



def encode_prompt(prompt_instructions, file_name):
    """Build a few-shot prompt from a template file and example tasks.

    Args:
        prompt_instructions: iterable of dicts, each with "instruction",
            "input" and "output" keys.
        file_name: template file name; it is appended verbatim to
            ``prompt_path``, so it must start with a path separator.

    Returns:
        The template text followed by one "###"-delimited block per example
        and a final "###" line.
    """
    # Context manager so the template handle is closed instead of leaked.
    with open(prompt_path + file_name, 'r', encoding='utf-8') as template_file:
        prompt = template_file.read() + "\n"

    for task_dict in prompt_instructions:
        instruction = task_dict["instruction"]
        input_text = task_dict["input"]
        output = task_dict["output"]
        # Collapse whitespace runs and drop a trailing ASCII or full-width colon.
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":：")
        # Empty (or missing) input is rendered as the "<无>" placeholder.
        input_text = input_text or "<无>"
        prompt += "###\n"
        prompt += f"指令: {instruction}\n"
        prompt += f"输入:{input_text}\n"
        prompt += f"输出:{output}\n"
    prompt += "###\n"
    return prompt


def encode_prompt_knowledge():
    """Return the knowledge prompt template with the knowledge text appended.

    Reads ``knowledge.txt`` from ``data_path`` and
    ``prompt_for_knowledge_first.txt`` from ``prompt_path`` (both as UTF-8)
    and concatenates them, template first.
    """
    def _read_utf8(path):
        # Read a whole text file as UTF-8.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    knowledge_text = _read_utf8(data_path + '/knowledge.txt')
    template_text = _read_utf8(prompt_path + '/prompt_for_knowledge_first.txt')
    return template_text + knowledge_text


def find_word_in_string(w, s):
    """Search ``s`` for the word ``w``, ignoring case.

    ASCII-only words are matched as whole words (``\\b`` on both sides).
    Words containing non-ASCII characters are matched as plain substrings:
    ``\\b`` is Unicode-aware and CJK characters count as word characters, so a
    Chinese word embedded in running Chinese text has no boundary around it
    and would never match with ``\\b``.

    Note that a single-character entry such as "图" therefore matches any text
    containing that character.

    Args:
        w: word to look for; it is inserted into the pattern unescaped.
        s: text to search.

    Returns:
        The ``re.Match`` for the first occurrence (group 1 is the matched
        word), or ``None`` when there is no match.
    """
    if w.isascii():
        pattern = r"\b({0})\b".format(w)
    else:
        pattern = r"({0})".format(w)
    return re.compile(pattern, flags=re.IGNORECASE).search(s)


def process_response(raw_instructions, statistic_report):
    """Filter raw task chunks and split them into instruction/input/output.

    A chunk is dropped when:
      * it has 3 or fewer, or more than 200, whitespace-separated tokens;
      * it contains any blacklisted word (per ``find_word_in_string``);
      * it lacks an "指令", "输入" or "输出" field, in that order;
      * its input starts with "<" but is not the "<无>" placeholder;
      * its input contains "比如" or "这".

    Args:
        raw_instructions: iterable of task text chunks.
        statistic_report: dict of counters, updated in place.
            'Generate in beginning' is incremented for every chunk seen and
            'Keep in blocklist and formatting' for every chunk kept.

    Returns:
        Dict with parallel lists under "instruction", "input_word" and
        "output". A "<无>" input is stored as the empty string.
    """
    # Loop-invariant, so built once rather than once per chunk.
    blacklist = [
        "图片",
        "图像",
        "图表",
        "图",
        "文件",
        "地图",
        "绘制",
        "绘图",
        "转到",
        "视频",
        "音频",
        "音乐",
        "流程图",
        "代码",
        "程序"
    ]
    instruction_re = re.compile(r'指令(.+)')
    input_re = re.compile(r'输入(.+)')
    output_re = re.compile(r'输出(.+)')

    def _clean_field(match):
        # Drop the leading ASCII/full-width colon and the whitespace that
        # follows it, so "指令: x" yields "x" and "输入: <无>" yields "<无>".
        return match.group(1).strip().lstrip(":：").strip()

    instructions = []
    input_texts = []
    outputs = []
    for inst in raw_instructions:
        inst = inst.strip()
        statistic_report['Generate in beginning'] += 1
        token_count = len(inst.split())
        if token_count <= 3 or token_count > 200:
            continue
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue

        # Search the fields in order, each one after the previous match, so
        # that "输入"/"输出" appearing inside the instruction text itself
        # (e.g. "请输出…") is not mistaken for the field label.
        instruction_match = instruction_re.search(inst)
        input_match = (
            input_re.search(inst, instruction_match.end()) if instruction_match else None
        )
        output_match = (
            output_re.search(inst, input_match.end()) if input_match else None
        )
        if not (instruction_match and input_match and output_match):
            continue

        instruction = _clean_field(instruction_match)
        input_text = _clean_field(input_match)
        output_text = _clean_field(output_match)

        # Any other "<...>" input is treated as an unusable placeholder.
        if input_text.startswith("<") and input_text != "<无>":
            continue
        if "比如" in input_text or "这" in input_text:
            continue
        input_text = "" if input_text.lower() == "<无>" else input_text
        statistic_report['Keep in blocklist and formatting'] += 1
        instructions.append(instruction)
        input_texts.append(input_text)
        outputs.append(output_text)

    return {"instruction": instructions, "input_word": input_texts, "output": outputs}


def tokenize_chinese(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    tokens = jieba.lcut(text)
    return ' '.join(tokens)
def get_bank_token():
    """Tokenise every known instruction: seed instructions first, then the machine bank.

    Reads the module-level ``seed_instruction_data`` and ``machine_data_df``;
    the machine bank is only consulted when it has at least one row.

    Returns:
        List of space-joined token strings, one per instruction.
    """
    bank_instructions = [entry["instruction"] for entry in seed_instruction_data]
    if machine_data_df.shape[0] != 0:
        bank_instructions = bank_instructions + machine_data_df['instruction'].to_list()
    return [tokenize_chinese(text) for text in bank_instructions]

def remove_instruction_scoreler_base(all_instruction_tokens, new_instructions, statistic_report):
    """Drop new instructions that are too similar to already known ones.

    Each row's "instruction" is tokenised and scored with ROUGE-L F against
    every known instruction; the row is dropped when the best score exceeds
    0.8. Accepted rows join the comparison set, so near-duplicates inside the
    same batch are filtered as well. The caller's list is not modified.

    Args:
        all_instruction_tokens: list of space-joined token strings for the
            instructions already in the bank; may be empty.
        new_instructions: DataFrame with an "instruction" column.
        statistic_report: dict of counters, updated in place;
            "Keep in similar check" is incremented per kept row.

    Returns:
        DataFrame of the kept rows (empty when nothing is kept).
    """
    instructions_keeped = []
    row_count = new_instructions.shape[0]
    if row_count == 0:
        # Nothing to score: skip creating the scorer and the worker pool.
        return pd.DataFrame(instructions_keeped)

    rouge = Rouge()
    # Work on a copy so accepted rows can be added without mutating the caller's list.
    known_tokens = list(all_instruction_tokens)

    # One pool for the whole batch instead of spawning a new one per row.
    with Pool(8) as p:
        for row_index in range(row_count):
            # Positional access, so a non-default index cannot break the lookup.
            row = new_instructions.iloc[row_index]
            new_instruction_tokens = tokenize_chinese(row["instruction"])

            rouge_scores = p.map(
                partial(rouge.get_scores, new_instruction_tokens),
                known_tokens,
            )
            # default=0.0 keeps an empty bank from raising in max().
            best_score = max(
                (score[0]['rouge-l']['f'] for score in rouge_scores),
                default=0.0,
            )
            if best_score > 0.8:
                continue

            instructions_keeped.append(row)
            known_tokens.append(new_instruction_tokens)
            statistic_report["Keep in similar check"] += 1

    return pd.DataFrame(instructions_keeped)


def extract_tasks(text):
    """Split a raw model reply into one text chunk per task.

    A line starting with "任务", "###", "***", "**" or "##" opens a new chunk;
    every other non-blank line is appended to the current chunk. Blank lines
    are dropped. A notice is printed when no chunk is found.

    Args:
        text: the raw reply text.

    Returns:
        List of chunk strings, each with its lines joined by newlines and
        outer whitespace stripped.
    """
    header_prefixes = ("任务", "###", "***", "**", "##")
    tasks = []
    pending = []  # lines of the chunk currently being assembled

    for line in text.strip().split("\n"):
        opens_new_task = line.startswith(header_prefixes)
        if opens_new_task and pending:
            tasks.append("\n".join(pending).strip())
            pending = []
        if opens_new_task or line.strip():
            pending.append(line)

    if pending:
        tasks.append("\n".join(pending).strip())

    if not tasks:
        print("tenpelate can not fit well")

    return tasks

In [9]:
def getGenerateData_woutKnowledge(client, all_instruction_tokens,
                                  model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
                                  temperatures=None):
    """Generate new instruction tasks from few-shot prompts built on the seed tasks.

    For every temperature, runs ``len(seed_instruction_data) //
    num_prompt_instructions`` rounds. Each round samples seed examples, asks
    the chat model for new tasks, parses and filters the reply, and keeps the
    tasks that pass the similarity check. Kept instructions join the
    comparison bank for later rounds, so the same task is not accepted twice.

    Args:
        client: object exposing ``chat.completions.create`` (OpenAI-style).
        all_instruction_tokens: tokenised instructions already in the bank;
            not modified.
        model: model identifier sent with every request.
        temperatures: iterable of sampling temperatures; defaults to 20
            evenly spaced values from 0.02 to 0.6.

    Returns:
        DataFrame with "instruction", "input_word" and "output" columns;
        empty (with those columns) when nothing survives the filters.
    """
    columns = ["instruction", "input_word", "output"]
    if temperatures is None:
        temperatures = np.linspace(0.02, 0.6, 20)

    statistic_report = {"Keep in blocklist and formatting": 0, "Generate in beginning": 0, "Keep in similar check": 0}
    # Local copy: grows with every accepted instruction without touching the caller's list.
    known_tokens = list(all_instruction_tokens)
    rounds_per_temperature = len(seed_instruction_data) // num_prompt_instructions
    valid_instruction = []

    for t in temperatures:
        for _ in tqdm(range(rounds_per_temperature)):
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions, '/prompt_Zh.txt')
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                # Plain float so the request body does not carry a numpy scalar.
                temperature=float(t),
            )
            # Guard against a reply without text content.
            response_text = completion.choices[0].message.content or ""

            # Split the reply into task chunks, then parse and filter them.
            raw_tasks = extract_tasks(response_text)
            raw_task_ = process_response(raw_tasks, statistic_report)
            new_task_formated_df = pd.DataFrame({
                "instruction": raw_task_['instruction'],
                "input_word": raw_task_['input_word'],
                "output": raw_task_['output']
            })

            similar_check_keep = remove_instruction_scoreler_base(known_tokens, new_task_formated_df,
                                                                  statistic_report)

            if similar_check_keep.shape[0] != 0:
                valid_instruction.append(similar_check_keep)
                # Later rounds must also be compared against what was just accepted.
                known_tokens.extend(
                    tokenize_chinese(text) for text in similar_check_keep["instruction"]
                )

    print(statistic_report)
    if not valid_instruction:
        # pd.concat raises on an empty list; return an empty, well-formed frame instead.
        return pd.DataFrame(columns=columns)
    new_formated_data = pd.concat(valid_instruction)
    return new_formated_data


In [10]:
# Tokenise the current bank, generate new tasks, then append them to the bank.
instruction_tokens = get_bank_token()
new_formated_data_woutKnowledge = getGenerateData_woutKnowledge(
    client,
    all_instruction_tokens=instruction_tokens,
)
machine_data_df = pd.concat(
    [machine_data_df, new_formated_data_woutKnowledge],
    ignore_index=True,
)


Knowledge 

In [15]:
# Persist the bank. force_ascii=False writes the Chinese text as UTF-8
# instead of \uXXXX escapes, so the file stays human-readable.
machine_data_df.to_json(data_path + "/machine_instructions_zh.json", force_ascii=False)