In [14]:
import json

# Read the raw text dump (path is relative to the notebook's working directory).
file_path = 'data/pr_filter2_page1.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

# The file holds several JSON values written back to back, without an enclosing
# array. Decode them one at a time with raw_decode instead of patching the text
# with str.replace: this accepts any whitespace between values (LF, CRLF, blank
# lines) as well as an optional comma, and never rewrites the file's content.
try:
    decoder = json.JSONDecoder()
    records = []
    position = 0
    content_length = len(file_content)
    while True:
        # raw_decode does not skip leading whitespace, so step over separators first.
        while position < content_length and file_content[position] in " \t\r\n,":
            position += 1
        if position >= content_length:
            break
        record, position = decoder.raw_decode(file_content, position)
        records.append(record)
except json.JSONDecodeError as e:
    # Same best-effort behaviour as before: report the problem, leave `data` unassigned.
    print(f"加载失败，错误信息：{e}")
else:
    # Only publish `data` once the whole file decoded cleanly.
    data = records
    if data:
        print("数据加载成功：", data[0])
    else:
        # An empty file used to raise IndexError on data[0]; report it instead.
        print("数据加载成功，但文件中没有任何记录")


数据加载成功： {'项目名称': 'opentrons', '项目star': '431', '项目网址': 'https://github.com//Opentrons/opentrons/pull/13222', 'pr的文本描述': "# Overview\r\n\r\nCloses RLAB-357\r\n\r\n# Test Plan\r\n\r\nJust unit- and integration tests should suffice for this since it's only updating definitions\r\n\r\n# Changelog\r\n\r\n- added `gripForce` and `gripHeightFromLabwareBottom` optional fields to labware schema and the supported, tested labware's definitions\r\n- updated labwareDefinition model & type dict signature\r\n- updated definition/schema verification tests\r\n\r\n# Review requests\r\n\r\n- make sure the grip force & height added are correct for the specific labware (tested values are in the sheet linked in the jira ticket above)\r\n- \r\n\r\n# Risk assessment\r\n\r\n<!--\r\nCarefully go over your pull request and look at the other parts of the codebase it may affect. Look for the possibility, even if you think it's small, that your change may affect some other part of the system - for instance, changin

In [6]:
# Show which fields a record carries, using the first record as the sample.
first_record = data[0]
first_record.keys()

dict_keys(['项目名称', '项目star', '项目网址', 'pr的文本描述', '增加的代码', '删减的代码', '最后的完整代码'])

In [15]:
# Regroup the flat list of PR records by project:
#   {project name: {'项目star': <star count>, '项目prs': [<PR record>, ...]}}
restructured_data = {}
for item in data:
    project_name = item['项目名称']
    # The first record seen for a project creates its entry and fixes its star count.
    if project_name not in restructured_data:
        restructured_data[project_name] = {
            '项目star': item['项目star'],
            '项目prs': []
        }
    # Copy only the per-PR fields; the project-level fields live on the parent entry.
    pr_info = {
        '项目网址': item['项目网址'],
        'pr的文本描述': item['pr的文本描述'],
        '增加的代码': item['增加的代码'],
        '删减的代码': item['删减的代码'],
        '最后的完整代码': item['最后的完整代码']
    }
    restructured_data[project_name]['项目prs'].append(pr_info)

# Save the regrouped data as a JSON file.
# ensure_ascii=False writes the non-ASCII keys and text verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default encoding.
with open('data/restructed_pr_filter2_page1.json', 'w', encoding='utf-8') as json_file:
    json.dump(restructured_data, json_file, indent=4, ensure_ascii=False)

print("Restructured JSON content saved to restructed_pr_filter2_page1.json")

Restructured JSON content saved to restructed_pr_filter2_page1.json


In [6]:
# Extract a small sample of the data to test whether the LLM can judge if a PR has a test plan.
import json

# The data uses non-ASCII keys, so read it as UTF-8 rather than the platform default encoding.
with open('data/restructed_pr_filter2_page1.json', 'r', encoding='utf-8') as file:
    restructured_data_json = json.load(file)

# Take only the first project (in file order) and keep its first two PRs.
# next(iter(...)) raises StopIteration if the file contains no projects.
project_name, project_info = next(iter(restructured_data_json.items()))
part_of_data = {
    project_name: {
        '项目star': project_info['项目star'],
        '项目prs': project_info['项目prs'][:2],
    }
}

# ensure_ascii=False writes non-ASCII text verbatim, hence the explicit UTF-8 encoding.
with open('data/part_of_restructured_pr_filter2_page1.json', 'w', encoding='utf-8') as json_file:
    json.dump(part_of_data, json_file, indent=4, ensure_ascii=False)

part_of_data

{'opentrons': {'项目star': '431',
  '项目prs': [{'项目网址': 'https://github.com//Opentrons/opentrons/pull/13222',
    'pr的文本描述': "# Overview\r\n\r\nCloses RLAB-357\r\n\r\n# Test Plan\r\n\r\nJust unit- and integration tests should suffice for this since it's only updating definitions\r\n\r\n# Changelog\r\n\r\n- added `gripForce` and `gripHeightFromLabwareBottom` optional fields to labware schema and the supported, tested labware's definitions\r\n- updated labwareDefinition model & type dict signature\r\n- updated definition/schema verification tests\r\n\r\n# Review requests\r\n\r\n- make sure the grip force & height added are correct for the specific labware (tested values are in the sheet linked in the jira ticket above)\r\n- \r\n\r\n# Risk assessment\r\n\r\n<!--\r\nCarefully go over your pull request and look at the other parts of the codebase it may affect. Look for the possibility, even if you think it's small, that your change may affect some other part of the system - for instance, chang

In [4]:
import json
from  data_process.llm_process_3 import llm_restructure_pr_body


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence.

    Removes leading/trailing whitespace, an opening "```" (with an optional
    "json" language tag) and a closing "```". Text without a fence is returned
    with only the surrounding whitespace removed.

    ``str.strip('```json')`` is not used because it strips a *set of
    characters* rather than a prefix, and it stops at any surrounding
    whitespace, which leaves the closing fence in place when the reply ends
    with a newline.
    """
    body = text.strip()
    if body.startswith("```"):
        body = body[3:]
        if body[:4].lower() == "json":
            body = body[4:]
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


dataset_dir = "data/part_of_restructured_pr_filter2_page1.json"
# Use a separate name for the handle so `dataset_dir` keeps holding the path.
# The data uses non-ASCII keys, so read it as UTF-8 rather than the platform default.
with open(dataset_dir, "r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

for project_name, project_info in data.items():

    for pr_info in project_info['项目prs']:
        # NOTE(review): assumes llm_restructure_pr_body returns a str holding a JSON
        # object, possibly wrapped in a ```json fence — confirm against the helper.
        llm_pr_body = llm_restructure_pr_body(pr_info['pr的文本描述'])

        restructured_pr_body = _strip_code_fence(llm_pr_body)
        print(restructured_pr_body)

        json_content = json.loads(restructured_pr_body)

        # Store the test plan on the PR record. "None" is the sentinel for "no test
        # plan"; it is also used when the reply omits the "Test plan" key, so one
        # incomplete reply does not abort the whole run.
        pr_info["测试计划"] = json_content.get("Test plan", "None")

# ensure_ascii=False writes non-ASCII text verbatim, hence the explicit UTF-8 encoding.
with open('data/part_of_llm_restructed_pr_filter2_page1.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)



{
    "Description of changes": "Added `gripForce` and `gripHeightFromLabwareBottom` optional fields to the labware schema and updated the definitions of supported, tested labware. Updated the labwareDefinition model and type dict signature, and updated definition/schema verification tests.",
    "Test plan": "Unit- and integration tests should suffice for this change since it only involves updating definitions.",
    "Others": "Reviewers are requested to ensure that the grip force and height values added are correct for the specific labware, with tested values available in the sheet linked in the Jira ticket. A risk assessment is also requested to identify any potential impacts on other parts of the system."
}


{
    "Description of changes": "Added placeholder endpoints and models for the Estop State Query and Estop Acknowledge-Disengage endpoints. The state query endpoint returns the overall estop status for the robot and the physical status of each estop mount point. The acknowle