读取 {model_id}.jsonl 文件，提取每条记录的 pred 字段并保存到 codefile.jsonl

In [1]:
import os
import json
from pathlib import Path

# set the model_id
model_id = 'starcoder2-3b_cangjie_it_2200_lr1e-05_ebs32'
folder_path = 'starcoder2-3b_cangjie_it_2200_lr1e-05_ebs32'

current_dir = os.getcwd()
proj_dir = Path(current_dir).parent
input_file_path = os.path.join(current_dir, f'{model_id}.jsonl') # translation result jsonl file
output_file_path = os.path.join(current_dir, 'codefile.jsonl')
target_folder_path = os.path.join(current_dir, folder_path)

# Remove the output file if it exists, so a failed run never leaves a stale
# codefile.jsonl behind for the later steps to pick up.
if os.path.exists(output_file_path):
    os.remove(output_file_path)

# Copy the 'pred' field of every record into the output file, one JSON string
# per line. Line N of the output must correspond to record N of the input,
# because later steps use the line index as the numbered folder name.
with open(input_file_path, 'r', encoding='utf-8') as f_in:
    with open(output_file_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # Blank lines are not records; json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            # A missing/empty prediction is written as an empty string rather
            # than skipped: skipping would shift every following line and pair
            # the remaining predictions with the wrong folders.
            pred = data.get('pred') or ''
            f_out.write(json.dumps(pred) + '\n')


复制测试代码

In [2]:
import shutil

def copy_folders(source_path, target_path, count=164):
    """Copy the numbered sub-folders ``0`` .. ``count - 1`` to another directory.

    Args:
        source_path: Directory that contains the numbered folders.
        target_path: Directory under which the copies are created.
        count: How many consecutive folder names to try, starting at 0.
            Defaults to 164, the value that was previously hard-coded.

    Returns:
        None. A missing source folder or a failed copy (for example because
        the destination already exists) is reported with ``print`` and the
        loop moves on to the next folder.
    """
    for i in range(count):
        folder_name = str(i)
        source_folder_path = os.path.join(source_path, folder_name)
        target_folder_path = os.path.join(target_path, folder_name)

        # Nothing to copy for this index; report it and keep going.
        if not os.path.exists(source_folder_path):
            print(f"源文件夹 {folder_name} 不存在")
            continue

        try:
            shutil.copytree(source_folder_path, target_folder_path)
        except OSError as e:
            # shutil.Error is a subclass of OSError, so this also covers
            # the aggregated errors copytree raises.
            print(f"复制文件夹 {folder_name} 到 {target_path} 失败: {e}")

# Resolve both directories relative to the project root: the numbered test
# folders are read from 'humaneval-x/test' and copied into 'bash-test'.
source_path, target_path = (
    os.path.join(proj_dir, sub_dir)
    for sub_dir in ('humaneval-x/test', 'bash-test')
)

# Perform the copy.
copy_folders(source_path, target_path)

将 codefile.jsonl 每一行的内容插入到对应行号文件夹下 generated_code.cj 文件的最前面

In [3]:
import json
import os

# Every line of codefile.jsonl is one JSON-encoded string. Parse it with the
# json module instead of stripping the quotes and decoding 'unicode_escape'
# by hand: the manual route turns surrogate-pair escapes into lone surrogates
# that cannot be written back to disk.
# A blank line still occupies its index so line numbers keep matching folders.
with open('codefile.jsonl', 'r', encoding='utf-8') as file:
    lines = [json.loads(line) if line.strip() else '' for line in file]

for line_number, decoded_string in enumerate(lines):
    folder_name = str(line_number)  # folder is named after the line index
    file_name = "generated_code.cj"  # file that receives the generated code

    # Create the folder when it does not exist yet.
    os.makedirs(folder_name, exist_ok=True)
    file_path = os.path.join(folder_name, file_name)

    # Read whatever the file already holds. Opening with 'r+' would raise
    # FileNotFoundError for a file that is not there (always the case when
    # the folder had to be created just above), so fall back to empty content.
    existing_content = ''
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            existing_content = f.read()

    # Write the generated code first, followed by the previous content.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(decoded_string.rstrip('\r\n') + '\n' + existing_content)


在代码最前方写入头文件

In [4]:
# Import lines that are prepended to every generated_code.cj file.
header = """from std import random.*
from std import collection.*
from std import math.*
from std import sort.SortExtension
"""

for i in range(164):
    folder_name = str(i)  # folder is named after the task index
    file_name = "generated_code.cj"  # file that receives the header
    file_path = os.path.join(folder_name, file_name)

    # Read the current content; a missing file is reported and skipped so one
    # absent folder does not leave all later folders without the header.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"{file_path} not found, skipping")
        continue

    # Rewrite the file with the header placed in front of the old content.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(header + content)

读取 codefile.jsonl 每一行的内容，生成 cj 文件，若成功编译，将输出保存到 {序号}/output.json 中；否则保存在 {序号}/error.json 中。
其中 {序号} 即代码在 codefile.jsonl 中的行号（从 0 开始）

In [5]:
import subprocess

# Invoke ./compile.sh once per numbered folder (0..163). Each call is limited
# to 15 seconds; a folder that exceeds the limit is reported and the loop
# simply proceeds with the next one.
for folder_name in map(str, range(164)):
    try:
        subprocess.run(
            ["./compile.sh", "generated_code.cj", "correct", folder_name],
            timeout=15,
        )
    except subprocess.TimeoutExpired:
        print(f"Timeout occurred for folder {folder_name}")

遍历文件夹，若存在 error.json，说明编译失败；反之则确认 output.json 文件是否存在、内容是否包含 false（注意：下方代码只检查 output.json，并未实际读取 error.json）

In [6]:
# Indices of the folders whose run is considered successful.
successful_line_numbers = []

for line_number in range(164):
    result_path = os.path.join(str(line_number), "output.json")

    # Without an output.json there is nothing to count for this index.
    if not os.path.exists(result_path):
        continue

    # A run counts as successful when the output text does not contain the
    # substring 'false' anywhere.
    with open(result_path, 'r') as result_file:
        if 'false' not in result_file.read():
            successful_line_numbers.append(line_number)

print(successful_line_numbers)
print("Number of successful line numbers:", len(successful_line_numbers))

[0, 1, 3, 5, 6, 7, 8, 9, 12, 13, 15, 16, 18, 21, 23, 24, 26, 29, 31, 32, 38, 40, 41, 43, 48, 49, 52, 53, 55, 56, 59, 60, 61, 62, 63, 68, 73, 76, 78, 80, 81, 82, 85, 97, 100, 102, 106, 110, 121, 127, 135, 138, 139, 150, 152, 157, 159, 163]
Number of successful line numbers: 58


生成完毕后，将同目录下名为0-163的文件夹剪切到对应模型ID文件夹下

In [7]:
import shutil

# Start from a clean destination folder. The same absolute path
# (target_folder_path) is used for deleting and for re-creating it, and the
# delete is attempted only when the folder exists, so a first run does not
# print a misleading failure message.
if os.path.isdir(target_folder_path):
    try:
        shutil.rmtree(target_folder_path)
    except OSError as e:
        print(f"删除文件夹 {folder_path} 失败: {e}")

# exist_ok=True: if the delete above failed, keep going with the existing
# folder instead of aborting with FileExistsError; conflicting moves are then
# reported one by one below.
os.makedirs(target_folder_path, exist_ok=True)

for i in range(164):  # 164 numbered folders in total
    folder_to_move = str(i)
    folder_to_move_path = os.path.join(current_dir, folder_to_move)
    # Move the folder into the destination only when it exists.
    if os.path.isdir(folder_to_move_path):
        try:
            shutil.move(folder_to_move_path, target_folder_path)
        except OSError as e:
            # shutil.Error (e.g. destination already exists) derives from OSError.
            print(f"剪切文件夹 {folder_to_move} 到 {target_folder_path} 失败: {e}")
