In [1]:
import json
import os
import random
import re

# from Utils import utils
import pandas as pd
from openai import OpenAI
from rouge_score import rouge_scorer

# Project root: three directory levels above the current working directory.
project_path = os.path.abspath('../../../')
# Data folder of the project and, inside it, the folder with the prompt templates.
data_path, prompt_path = (
    os.path.join(project_path, 'FT4LLM/Data'),
    os.path.join(project_path, 'FT4LLM/Data', 'prompt'),
)

from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import  numpy as np

# Number of seed examples sampled into each generation prompt.
num_prompt_instructions = 5
# OpenAI-compatible client pointed at a server on localhost:1234.
# NOTE(review): the api_key value looks like a fixed placeholder for a local server - confirm.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
# ROUGE-L scorer (no stemming); its tokenizer is reused to tokenize instructions for similarity checks.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

In [2]:

# Load the bank of previously generated (machine-written) instructions, if one was saved.
machine_bank_file = data_path + "/machine_instructions.json"
if os.path.exists(machine_bank_file):
    machine_data_df = pd.read_json(machine_bank_file)
    print(f"Loaded {len(machine_data_df)} GPT-written seed instructions")
else:
    print("Create new bank for machine_instructions")
    # Create the expected columns up front so that code reading
    # machine_data_df['instruction'] / ['input_word'] also works on a first run.
    machine_data_df = pd.DataFrame(columns=["instruction", "input_word", "output"])

# Human-written seed tasks: one JSON object per line (a trailing comma is tolerated).
# The file is opened in a context manager so the handle is closed, and read as UTF-8.
with open(data_path + "/seed_tasks_seizure.jsonl", "r", encoding="utf-8") as seed_file:
    seed_tasks = [
        json.loads(line.strip().rstrip(','))
        for line in seed_file
        if line.strip()  # skip blank lines, which json.loads would reject
    ]

# Keep only the instruction text and the first instance of every seed task.
seed_instruction_data = [
    {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
    for t in seed_tasks
]
print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")



In [3]:
def get_bank_token():
    """Tokenize every known instruction (seed and machine-generated).

    Reads the notebook-level ``seed_instruction_data``, ``machine_data_df``
    and ``scorer`` objects.

    Returns:
        list: one token list per instruction, seed instructions first,
        followed by the instructions stored in ``machine_data_df``.
    """
    all_instructions = [d["instruction"] for d in seed_instruction_data]
    # A freshly created bank may have no 'instruction' column yet; treat it
    # as an empty bank instead of failing with KeyError.
    if 'instruction' in machine_data_df.columns:
        all_instructions += machine_data_df['instruction'].to_list()
    return [scorer._tokenizer.tokenize(inst) for inst in all_instructions]


In [4]:
def encode_prompt(prompt_instructions, file_name):
    """Encode multiple prompt instructions into a single string.

    Args:
        prompt_instructions: iterable of dicts with "instruction", "input"
            and "output" keys.
        file_name: template file name, appended verbatim to ``prompt_path``
            (callers pass it with a leading "/").

    Returns:
        str: the template text, then one "***"-delimited block per example,
        then a closing "***" line.
    """
    # Read the template inside a context manager so the handle is closed,
    # and as UTF-8 like encode_prompt_knowledge does for its files.
    with open(prompt_path + file_name, "r", encoding="utf-8") as template_file:
        prompt = template_file.read() + "\n"

    for task_dict in prompt_instructions:
        # Collapse whitespace runs and drop a trailing colon from the instruction.
        instruction = re.sub(r"\s+", " ", task_dict["instruction"]).strip().rstrip(":")
        # An empty (or missing) input is shown to the model as the <noinput> marker.
        # A local name is used so the built-in input() is not shadowed.
        input_text = task_dict["input"] or "<noinput>"
        output_text = task_dict["output"]
        prompt += "***\n"
        prompt += f"Instruction: {instruction}\n"
        prompt += f"Input:{input_text}\n"
        prompt += f"Output:{output_text}\n"
    prompt += "***\n"
    return prompt


def encode_prompt_knowledge():
    """Return the "knowledge first" prompt template followed by the knowledge text."""
    knowledge_file = data_path + '/knowledge.txt'
    template_file = prompt_path + '/prompt_for_knowledge_first.txt'

    # The knowledge text is read first, then the template; both as UTF-8.
    texts = []
    for path in (knowledge_file, template_file):
        with open(path, 'r', encoding='utf-8') as handle:
            texts.append(handle.read())

    knowledge_text, template_text = texts
    return template_text + knowledge_text


def find_word_in_string(w, s):
    """Search ``s`` for ``w`` as a whole word (or phrase), ignoring case.

    Args:
        w: literal word or phrase to look for; regex metacharacters in it
            are escaped so they match themselves.
        s: text to search.

    Returns:
        re.Match or None: the first match (group 1 is the matched text),
        or None when ``w`` does not occur between word boundaries.
    """
    pattern = r"\b({0})\b".format(re.escape(w))
    return re.compile(pattern, flags=re.IGNORECASE).search(s)


# Words/phrases that cause a generated task to be discarded when they occur
# anywhere in it as a whole word. Defined once instead of on every loop pass.
_INSTRUCTION_BLACKLIST = (
    "image",
    "images",
    "graph",
    "graphs",
    "picture",
    "pictures",
    "file",
    "files",
    "map",
    "maps",
    "draw",
    "plot",
    "go to",
    "video",
    "audio",
    "music",
    "flowchart",
    "diagram",
    'code',
    'program',
)


def process_response(raw_instructions, statistic_report):
    """Parse and filter the raw task chunks of one model response.

    Each chunk is expected to contain "Instruction:", "Input:" and "Output:"
    fields. Every field is captured only up to the end of its own line.

    Args:
        raw_instructions: iterable of text chunks (the response split on "***").
        statistic_report: dict of counters, updated in place:
            'Generate in beginning' counts chunks that do not start with
            "Here is"; 'Keep in blocklist and formatting' counts chunks that
            pass every filter.

    Returns:
        dict: {"instruction": [...], "input_word": [...], "output": [...]}
        with one entry per kept task; a "<noinput>" input becomes "".
    """
    instructions = []
    input_texts = []
    outputs = []
    for inst in raw_instructions:
        inst = inst.strip()
        # Chunks starting with "Here is" are skipped before anything is counted.
        if inst.startswith("Here is"):
            continue
        statistic_report['Generate in beginning'] += 1

        # Drop chunks that are too short or too long to be a usable task.
        word_count = len(inst.split())
        if word_count <= 3 or word_count > 150:
            continue

        if any(find_word_in_string(word, inst) for word in _INSTRUCTION_BLACKLIST):
            print("filter instruction bacause of blacklist")
            continue

        instruction_match = re.search(r'Instruction:(.+)', inst)
        input_match = re.search(r'Input:(.+)', inst)
        output_match = re.search(r'Output:(.+)', inst)
        if not (instruction_match and input_match and output_match):
            continue

        instruction = instruction_match.group(1).strip()
        input_text = input_match.group(1).strip()
        output_text = output_match.group(1).strip()

        # Reject "<...>" inputs other than the explicit <noinput> marker.
        if input_text.startswith("<") and input_text != "<noinput>":
            print("filter instruction bacause of input")
            continue
        if "e.g." in input_text or "this" in input_text:
            print("filter instruction bacause of input")
            continue
        input_text = "" if input_text.lower() == "<noinput>" else input_text

        # "Instruction:" followed only by whitespace leaves an empty string;
        # skip it rather than indexing instruction[0] below.
        if not instruction:
            continue
        if not instruction[0].isascii():
            print("filter instruction bacause of languadge")
            continue

        statistic_report['Keep in blocklist and formatting'] += 1
        instructions.append(instruction)
        input_texts.append(input_text)
        outputs.append(output_text)

    return {"instruction": instructions, "input_word": input_texts, "output": outputs}


def remove_instruction_scoreler_base(all_instruction_tokens, new_instructions, statistic_report,
                                     threshold=0.8, num_workers=8):
    """Drop new instructions that are too similar to an already known one.

    Similarity is the ROUGE-L F-measure between the tokenized new instruction
    and every token list in ``all_instruction_tokens``.

    Args:
        all_instruction_tokens: list of token lists to compare against.
        new_instructions: DataFrame with an "instruction" column.
        statistic_report: dict of counters; "Keep in similar check" is
            incremented in place for every kept row.
        threshold: a row is dropped when its highest score exceeds this value.
        num_workers: number of worker processes used for scoring.

    Returns:
        pd.DataFrame: the kept rows (an empty frame when nothing is kept).

    NOTE(review): kept instructions are not added to ``all_instruction_tokens``,
    so near-duplicates inside the same batch are not detected - confirm intent.
    """
    instructions_keeped = []
    n_rows = new_instructions.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid starting worker processes at all.
        return pd.DataFrame(instructions_keeped)

    if len(all_instruction_tokens) == 0:
        # Nothing to compare against: every row is kept (max() over an
        # empty score list would raise ValueError).
        for position in range(n_rows):
            instructions_keeped.append(new_instructions.iloc[position])
            statistic_report["Keep in similar check"] += 1
        return pd.DataFrame(instructions_keeped)

    # One pool for the whole batch instead of a new pool per row.
    with Pool(num_workers) as pool:
        for position in range(n_rows):
            row = new_instructions.iloc[position]
            new_instruction_tokens = scorer._tokenizer.tokenize(row["instruction"])
            rouge_scores = pool.map(
                partial(rouge_scorer._score_lcs, new_instruction_tokens),
                all_instruction_tokens,
            )
            if max(score.fmeasure for score in rouge_scores) > threshold:
                continue
            instructions_keeped.append(row)
            statistic_report["Keep in similar check"] += 1
    return pd.DataFrame(instructions_keeped)

In [5]:



def getGenerateData_woutKnowledge(client, all_instruction_tokens,
                                  model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"):
    """Generate new instructions from seed examples only (no knowledge text).

    For each of 10 temperatures between 0.12 and 0.6, sends
    ``len(seed_instruction_data) // num_prompt_instructions`` requests, each
    built from a fresh random sample of seed examples, then filters the
    responses with ``process_response`` and ``remove_instruction_scoreler_base``.

    Args:
        client: object exposing ``chat.completions.create(...)``.
        all_instruction_tokens: token lists of known instructions, used for
            the similarity filter.
        model: model identifier sent with every request.

    Returns:
        pd.DataFrame: kept tasks with "instruction", "input_word" and
        "output" columns; an empty frame with those columns when nothing
        was kept.
    """
    statistic_report = {"Keep in blocklist and formatting": 0, "Generate in beginning": 0, "Keep in similar check": 0}
    valid_instruction = []
    requests_per_temperature = int(len(seed_instruction_data) / num_prompt_instructions)
    for t in np.linspace(0.12, 0.6, 10):
        for _ in tqdm(range(requests_per_temperature)):
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions, '/prompt.txt')
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=t,
            )
            # The message content may be None; re.split needs a string.
            response_text = completion.choices[0].message.content or ""
            raw_tasks = re.split(r"\*\*\*", response_text)
            raw_task_ = process_response(raw_tasks, statistic_report)
            new_task_formated_df = pd.DataFrame({
                "instruction": raw_task_['instruction'],
                "input_word": raw_task_['input_word'],
                "output": raw_task_['output']
            })
            similar_check_keep = remove_instruction_scoreler_base(all_instruction_tokens, new_task_formated_df,
                                                                  statistic_report)
            if similar_check_keep.shape[0] != 0:
                valid_instruction.append(similar_check_keep)

    print(statistic_report)
    if not valid_instruction:
        # pd.concat raises ValueError on an empty list; return an empty result instead.
        return pd.DataFrame(columns=["instruction", "input_word", "output"])
    return pd.concat(valid_instruction)



In [6]:
# Tokenize the current instruction bank, then generate new instructions from seed examples only.
instruction_tokens=get_bank_token()
new_formated_data_woutKnowledge=getGenerateData_woutKnowledge(client,all_instruction_tokens=instruction_tokens)
# NOTE(review): the merge below is disabled, so in the visible cells this result is never added to machine_data_df.
# machine_data_df=pd.concat([machine_data_df, new_formated_data_woutKnowledge], ignore_index=True)
# 

In [11]:
# Display the current machine-generated instruction bank.
machine_data_df

Knowledge-grounded instruction generation

In [17]:
def getGenerateData_wKnowledge(client, all_instruction_tokens,
                               model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"):
    """Generate new instructions with the knowledge text supplied as context.

    Every request is a three-message chat: the knowledge prompt, a fixed 'Ok'
    assistant turn, and a prompt built from randomly sampled seed examples.
    For each of 8 temperatures between 0.10 and 0.45,
    ``len(seed_instruction_data) // num_prompt_instructions`` requests are sent
    and their responses filtered with ``process_response`` and
    ``remove_instruction_scoreler_base``.

    Args:
        client: object exposing ``chat.completions.create(...)``.
        all_instruction_tokens: token lists of known instructions, used for
            the similarity filter.
        model: model identifier sent with every request.

    Returns:
        pd.DataFrame: kept tasks with "instruction", "input_word" and
        "output" columns; an empty frame with those columns when nothing
        was kept.
    """
    statistic_report = {"Keep in blocklist and formatting": 0, "Generate in beginning": 0, "Keep in similar check": 0}
    valid_instruction = []
    # The knowledge prompt does not change between requests, so build it once.
    prompt_knowledge = encode_prompt_knowledge()
    requests_per_temperature = int(len(seed_instruction_data) / num_prompt_instructions)
    for t in np.linspace(0.10, 0.45, 8):
        for _ in tqdm(range(requests_per_temperature)):
            # Sample fresh seed examples for every request (as the
            # no-knowledge generator does); sampling once before the loops
            # would send the identical prompt every time.
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt_later = encode_prompt(prompt_instructions, '/prompt_for_knowledge_later.txt')
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt_knowledge},
                    {"role": "assistant", "content": 'Ok'},
                    {"role": "user", "content": prompt_later},
                ],
                temperature=t,
            )
            # The message content may be None; re.split needs a string.
            response_text = completion.choices[0].message.content or ""
            raw_tasks = re.split(r"\*\*\*", response_text)
            raw_task_ = process_response(raw_tasks, statistic_report)
            new_task_formated_df = pd.DataFrame({
                "instruction": raw_task_['instruction'],
                "input_word": raw_task_['input_word'],
                "output": raw_task_['output']
            })
            similar_check_keep = remove_instruction_scoreler_base(all_instruction_tokens, new_task_formated_df,
                                                                  statistic_report)
            if similar_check_keep.shape[0] != 0:
                valid_instruction.append(similar_check_keep)
        # Running totals after each temperature.
        print(statistic_report)
    if not valid_instruction:
        # pd.concat raises ValueError on an empty list; return an empty result instead.
        return pd.DataFrame(columns=["instruction", "input_word", "output"])
    return pd.concat(valid_instruction)

In [18]:
# Re-tokenize the instruction bank, then generate new instructions with the knowledge text as context.
instruction_tokens=get_bank_token()
new_formated_data_wKnowledge=getGenerateData_wKnowledge(client,all_instruction_tokens=instruction_tokens)


In [17]:

# Append the knowledge-based instructions to the bank and renumber the index.
# NOTE(review): this rebinds machine_data_df, so re-running the cell appends the same rows again.
machine_data_df=pd.concat([machine_data_df, new_formated_data_wKnowledge], ignore_index=True)



In [16]:
# Rows without an input (null or empty string) are all kept; rows with an
# input are de-duplicated on that input, keeping the first occurrence.
has_no_input = machine_data_df['input_word'].isnull() | (machine_data_df['input_word'] == '')
empty_input_rows = machine_data_df[has_no_input]
# De-duplicate only the rows that have an input. Running drop_duplicates on
# the whole frame would also keep the first no-input row, which is already
# in empty_input_rows and would therefore be saved twice.
not_duplicate_rows = machine_data_df[~has_no_input].drop_duplicates(subset=['input_word'])
machine_data_df = pd.concat([empty_input_rows, not_duplicate_rows], ignore_index=True)
# Persist the bank; this is the file read back when the notebook starts.
machine_data_df.to_json(data_path + "/machine_instructions.json")