In [4]:
from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
# Hub checkpoint used for both the tokenizer and the causal LM.
MODEL_ID = "anudaw/full_finetuned-code-tinyllama"

# NOTE(review): trust_remote_code=True allows the Hub repository to execute its
# own Python code while loading; keep it only if this checkpoint requires it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Pick the device the same way the configuration cell does, so loading does not
# crash on a machine without CUDA (an unconditional .cuda() would raise there).
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

  return self.fget.__get__(instance, owner)()


In [6]:
# Settings shared by the generation cells below.
# `max_length` is the value handed to model.generate(max_length=...).
max_length = 768
# Tokenized inputs are moved to this device: CUDA when present, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
from tqdm import tqdm
import numpy as np
import math

def generate_results(model, tokenizer, entries, output_file, num_samples=5, temperature=0.2):
  """Sample completions for every prompt and stream them to a JSON Lines file.

  Args:
    model: Object whose ``generate`` method accepts the tokenizer output as
      keyword arguments together with ``max_length``, ``do_sample``,
      ``temperature`` and ``num_return_sequences`` (presumably a Hugging Face
      causal LM -- confirm against the caller).
    tokenizer: Callable that encodes a list of prompts (its result must offer
      ``.to(device)``) and provides ``decode``.
    entries: pandas DataFrame with a ``prompt`` column. May be empty, in which
      case nothing is generated or written.
    output_file: Writable text handle. One JSON object per prompt is written,
      each followed by a newline, and the handle is flushed after every batch.
    num_samples: Number of sequences requested per prompt.
    temperature: Sampling temperature forwarded to ``model.generate``.

  Returns:
    A list with one ``{'prompt': ..., 'samples': [...]}`` dict per prompt, in
    the order of ``entries``; the same dicts that were written to the file.

  Note:
    Reads the notebook-level globals ``device`` and ``max_length``.
  """
  # Local import: in this notebook `json` is otherwise only imported by a
  # different cell, so the function would depend on cell execution order.
  import json

  results = []

  num_entries = len(entries)
  # NOTE(review): batches larger than 1 rely on the tokenizer's padding setup
  # being suitable for generation -- verify before raising this.
  batch_size = 1

  # Iterate over start offsets instead of np.array_split: this handles an empty
  # frame (zero batches) and avoids splitting a DataFrame through numpy.
  for start in tqdm(range(0, num_entries, batch_size)):
    batch = entries.iloc[start:start + batch_size]
    prompts = batch['prompt'].tolist()
    input_ids = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
      output_ids = model.generate(
        **input_ids,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        num_return_sequences=num_samples,
      )

    # The outputs are consumed as consecutive groups of `num_samples`
    # sequences, one group per prompt of the batch.
    for index, prompt in enumerate(prompts):
      group = output_ids[index * num_samples:(index + 1) * num_samples]
      samples = [tokenizer.decode(output, skip_special_tokens=True) for output in group]
      record = { 'prompt': prompt, 'samples': samples }
      results.append(record)
      json.dump(record, output_file)
      output_file.write('\n')

    # Persist progress so an interrupted long run keeps the finished prompts.
    output_file.flush()

  return results

In [1]:
import pandas as pd
from pathlib import Path
import json

# Benchmark selection: the two JSON Lines files for the chosen dataset are
# looked up inside `dataset_folder`.
dataset_name = 'HumanEval'
dataset_folder = Path('../../CodeT/CodeT/data/dataset')

# One file holds the code-generation inputs, the other the test-case-generation inputs.
codegen_file = dataset_folder / f'{dataset_name}_for_code_generation.jsonl'
testcase_file = dataset_folder / f'{dataset_name}_for_test_case_generation.jsonl'

# Each file is read as JSON Lines (one record per line) into a DataFrame.
codegen_json = pd.read_json(codegen_file, lines=True)
testcase_json = pd.read_json(testcase_file, lines=True)

In [10]:
# Sample 20 generations for every row of `testcase_json` and stream them to a JSONL file.
with open(f'{dataset_name}-testcase-20-temp0.2.jsonl', 'w') as writer:
    results = generate_results(
        model,
        tokenizer,
        testcase_json,
        writer,
        num_samples=20,
    )

  0%|          | 0/164 [00:00<?, ?it/s]

100%|██████████| 164/164 [58:06<00:00, 21.26s/it] 


In [9]:
# Sample 20 generations for every row of `codegen_json` and stream them to a JSONL file.
with open(f'{dataset_name}-codegen-20-temp0.2.jsonl', 'w') as writer:
    results = generate_results(
        model,
        tokenizer,
        codegen_json,
        writer,
        num_samples=20,
    )

  0%|          | 0/164 [00:00<?, ?it/s]

100%|██████████| 164/164 [50:09<00:00, 18.35s/it] 


In [4]:
# Read back the previously saved generations (JSON Lines, one record per prompt).
codegen_20_json = pd.read_json(Path('./HumanEval-codegen-60-temp0.5.jsonl'), lines=True)
testcase_20_json = pd.read_json(Path('./HumanEval-testcase-60-temp0.5.jsonl'), lines=True)

# Rewrite the code-generation samples with the first len(prompt) characters
# removed from each one.
# NOTE(review): this assumes every sample starts with its prompt verbatim; the
# slice drops that many characters regardless -- TODO confirm.
with open('HumanEval-codegen-60-temp0.5-suffix.jsonl', mode='w') as writer:
    for _, record in codegen_20_json.iterrows():
        prompt = record['prompt']
        prefix_length = len(prompt)
        new_samples = [sample[prefix_length:] for sample in record['samples']]
        json.dump({ 'prompt': prompt, 'samples': new_samples }, writer)
        writer.write('\n')