In [2]:
import json
import os
from preference_datasets import get_batch_iterator
import transformers
# Cache directory handed to the tokenizer and the dataset iterator below via
# their `cache_dir` arguments.
cache_dir = '/ebs/.cache/ubuntu/'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the tokenizer and use the EOS token id as the padding token id.
tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b', cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id

# One example per batch, unshuffled, single epoch over the 'combined' split.
iterator_options = dict(
    split='combined',
    batch_size=1,
    sft_mode=True,
    seed=0,
    n_epochs=1,
    cache_dir=cache_dir,
    shuffle=False,
    max_prompt_length=256,
    max_length=1024,
    data_fraction=1.0,
    num_turns=1,
)
prompt_iterator = get_batch_iterator(['sharegpt'], tokenizer=tokenizer, **iterator_options)

# Fixed-width wrappers stripped from each prompt to recover the bare instruction.
human_prefix = 'Human: '
assistant_suffix = '\n\nAssistant: '

chatgpt_instruction_truncoutput_pair = []
single_turn_instructions = []
for batch in prompt_iterator:
    raw_prompt = batch['prompt'][0]
    # Keep only prompts that do not open with an assistant turn.
    if not raw_prompt.startswith('Assistant:'):
        instruction = raw_prompt[len(human_prefix):-len(assistant_suffix)]
        # Drop the first `n_prompt_tokens` ids of the chosen sequence and decode
        # the remainder (presumably the reference response -- confirm against
        # get_batch_iterator).
        n_prompt_tokens = batch['prompt_input_ids'][0].shape[0]
        completion_ids = batch['chosen_input_ids'][0][n_prompt_tokens:]
        output = tokenizer.decode(completion_ids, skip_special_tokens=True)
        chatgpt_instruction_truncoutput_pair.append({'instruction': instruction, 'output': output})
        single_turn_instructions.append(instruction)

# Both lists are appended to in lockstep, so entry i of `single_turn_instructions`
# equals chatgpt_instruction_truncoutput_pair[i]['instruction'].

Loading the ShareGPT dataset...
done


Processing shareGPT:  92%|█████████▏| 86709/94145 [00:01<00:00, 63681.37it/s]

filtered out [{'from': 'gpt', 'value': 'To assign a class (pos or neg) to the sentence "A good, good plot and great characters, but poor acting," we will use two naive Bayes models: a multinomial naive Bayes model and a binarized naive Bayes model. Both models will use add-1 smoothing.\n\nFirst, we need to compute the likelihoods for each word in each class. For the multinomial model, we will use the frequency of each word in each class. For the binarized model, we will use the presence or absence of each word in each class.\n\nMultinomial model:\n\n| Word | Pos freq | Neg freq |\n| --- | --- | --- |\n| good | 6 | 1 |\n| poor | 0 | 8 |\n| great | 5 | 2 |\n\nBinarized model:\n\n| Word | Pos freq | Neg freq |\n| --- | --- | --- |\n| good | 1 | 1 |\n| poor | 0 | 1 |\n| great | 1 | 1 |\n\nNext, we need to compute the prior probabilities for each class. In this case, we have two positive and three negative documents, so the priors are:\n\nP(pos) = 2/5 = 0.4\nP(neg) = 3/5 = 0.6\n\nNow we can

Processing shareGPT: 100%|██████████| 94145/94145 [00:01<00:00, 55683.37it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (3159 > 2048). Running this sequence through the model will result in indexing errors


Created a dataset with 71894 prompts from ShareGPT
Finished generating 1 epochs on combined split


In [7]:
def process_llama_samples_from_dir(sample_folder):
    """Collect (instruction, output) pairs from every JSON file in a folder.

    Each file is parsed as a JSON object mapping a prompt string to an
    indexable collection of outputs; only element ``[0]`` of each is kept.
    Files are visited in sorted filename order so the result is reproducible
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        sample_folder: Path of a directory whose entries are all JSON files.

    Returns:
        A tuple ``(pairs, instructions)``: ``pairs`` is a list of
        ``{'instruction': ..., 'output': ...}`` dicts and ``instructions`` is
        the list of trimmed instruction strings, in the same order.
    """
    human_prefix_len = len('Human: ')
    assistant_suffix_len = len('\n\nAssistant: ')

    sft_instruction_truncoutput_pair = []
    sft_instructions = []
    for sample_file in sorted(os.listdir(sample_folder)):
        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(sample_folder, sample_file), 'r') as f:
            sft_outputs = json.load(f)
        for instruction, sft_output in sft_outputs.items():
            # Blind fixed-width slice: some prompts will be processed
            # incorrectly, but only the single-turn ones matter.
            instruction_trimmed = instruction[human_prefix_len:-assistant_suffix_len]
            sft_instruction_truncoutput_pair.append({'instruction': instruction_trimmed,
                                                     'output': sft_output[0]})
            sft_instructions.append(instruction_trimmed)
    return sft_instruction_truncoutput_pair, sft_instructions

def match_instruction_outputs(instruction_set_1, instruct_out_1, instruction_set_2, instruct_out_2,
                              allowed_instructions=None):
    """Pair up the outputs of two sources on instructions they have in common.

    For every instruction in ``instruction_set_1`` (in order, duplicates
    included) that also appears in ``instruction_set_2`` and in
    ``allowed_instructions``, emit one record holding both outputs. When an
    instruction occurs several times in ``instruction_set_2``, its first
    occurrence is used.

    Args:
        instruction_set_1: Instructions of the first source (hashable, e.g. str).
        instruct_out_1: Dicts with an ``'output'`` key, index-aligned with
            ``instruction_set_1``.
        instruction_set_2: Instructions of the second source.
        instruct_out_2: Dicts with an ``'output'`` key, index-aligned with
            ``instruction_set_2``.
        allowed_instructions: Optional iterable restricting which instructions
            may be matched. Defaults to the notebook-level
            ``single_turn_instructions`` list.

    Returns:
        A list of ``{'instruction', 'output_1', 'output_2'}`` dicts.
    """
    if allowed_instructions is None:
        # Backward-compatible default: fall back to the notebook global.
        allowed_instructions = single_turn_instructions
    allowed = set(allowed_instructions)

    # First-occurrence index per instruction; replaces the per-item
    # `in` + `.index()` list scans, which made matching quadratic.
    first_index_in_set_2 = {}
    for idx_2, instruction in enumerate(instruction_set_2):
        first_index_in_set_2.setdefault(instruction, idx_2)

    matched_instruction_outputs = []
    for idx, instruction in enumerate(instruction_set_1):
        idx_2 = first_index_in_set_2.get(instruction)
        if idx_2 is None or instruction not in allowed:
            continue
        matched_instruction_outputs.append({'instruction': instruction,
                                            'output_1': instruct_out_1[idx]['output'],
                                            'output_2': instruct_out_2[idx_2]['output']})
    return matched_instruction_outputs

In [8]:
# Comparison configuration: which two generators to compare and how many
# instruction pairs to write out at most.
base_dir = '/ebs/.cache/ubuntu/sharegpt2turn_llama7b_sft_maxlen1024_2023-09-11_21-52-36_584206/step-10000/'
model1_name = 'temp2.5'
model2_name = 'chatgpt'
max_comparisons = 200

# Samples of the first model are always read from disk.
model1_samples_dir = os.path.join(base_dir, f'sharegpt2turn_noeos_maxlen1024_{model1_name}')
sft_instruction_truncoutput_pair, sft_instructions = process_llama_samples_from_dir(model1_samples_dir)

if model2_name == 'chatgpt':
    # Compare against the ShareGPT reference outputs collected earlier in the notebook.
    matched_instruction_outputs = match_instruction_outputs(sft_instructions, sft_instruction_truncoutput_pair, single_turn_instructions, chatgpt_instruction_truncoutput_pair)
else:
    # Compare against a second set of sampled outputs stored next to the first.
    model2_samples_dir = os.path.join(base_dir, f'sharegpt2turn_noeos_maxlen1024_{model2_name}')
    sft_instruction_truncoutput_pair_2, sft_instructions_2 = process_llama_samples_from_dir(model2_samples_dir)
    matched_instruction_outputs = match_instruction_outputs(sft_instructions, sft_instruction_truncoutput_pair, sft_instructions_2, sft_instruction_truncoutput_pair_2)

comparison_folder = os.path.join(base_dir, 'comparisons', f'{model1_name}_vs_{model2_name}')
os.makedirs(comparison_folder, exist_ok=True)

# Keep exactly `max_comparisons` pairs. A post-append `idx > max_comparisons`
# check would let max_comparisons + 2 entries through, so slice up front.
selected_matches = matched_instruction_outputs[:max_comparisons]

model1_outputs = [{'instruction': match['instruction'],
                   'output': match['output_1'],
                   'generator': model1_name}
                  for match in selected_matches]
model2_outputs = [{'instruction': match['instruction'],
                   'output': match['output_2'],
                   'generator': model2_name}
                  for match in selected_matches]

# One JSON file per generator; entry i of both files shares the same instruction.
with open(os.path.join(comparison_folder, model1_name + '.json'), 'w') as f:
    json.dump(model1_outputs, f, indent=4)

with open(os.path.join(comparison_folder, model2_name + '.json'), 'w') as f:
    json.dump(model2_outputs, f, indent=4)