# 第一步，导入用户标注数据，下载对应的axtree，同时格式化并根据每个step中的path去retrieve出node

In [33]:
import json
import os
import pandas as pd

# Folder that holds the exported user annotation files.
user_annotate_dir = "./query_results3"

# Snapshot the directory listing and echo it so the available files can be inspected.
all_user_annotate_files = os.listdir(user_annotate_dir)
print(all_user_annotate_files)

['Kyle.json', 'Im.json', 'Cyberpunk.json', 'dcynsd.json', 'Siing.json', 'Kyle..json', 'sk.json', 'cceatmore_new.json', 'cceatmore.json', 'Jingqiu.json', 'HATBTBS.json']


In [34]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules registered
# with %aimport (here: utils) before each cell runs, so edits to utils take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport utils

# NOTE(review): the star import hides where names come from; download_json, format_node
# and find_node_by_path used in later cells are presumably provided by utils - confirm.
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Create the folders that store the axtrees. There are three kinds:
# raw_axtree, formatted_axtree and retrieved_axtree.

user_name = "Cyberpunk"

raw_axtree_dir = "./raw_axtree"
formatted_axtree_dir = "./formatted_axtree"
retrieved_axtree_dir = "./retrieved_axtree"

# exist_ok=True makes re-running the cell a no-op and avoids the check-then-create
# race of a separate os.path.exists() test.
for _axtree_dir in (raw_axtree_dir, formatted_axtree_dir, retrieved_axtree_dir):
    os.makedirs(_axtree_dir, exist_ok=True)


In [36]:
# Read the selected user's annotation file.
# user_annotate_data is a list; each element is a dict holding one complete task annotation.

user_annotate_file = os.path.join(user_annotate_dir, user_name + ".json")
# Pin UTF-8 (the JSON interchange encoding) so non-ASCII annotation text is decoded
# the same way regardless of the platform's locale default.
with open(user_annotate_file, "r", encoding="utf-8") as f:
    user_annotate_data = json.load(f)

In [38]:
# Walk through user_annotate_data. For every step that carries an axTree URL:
#   1. download the raw axtree into raw_axtree_dir,
#   2. write its formatted text into formatted_axtree_dir,
#   3. retrieve the node addressed by the step's path and write its formatted text
#      into retrieved_axtree_dir,
#   4. attach that node's data-imean-axt-id to the step as "axtId".
# The enriched tasks are finally saved next to the input file as <user_name>_new.json.
import copy

axtid_cnt = 0  # processed steps whose retrieved node has a non-empty axt id
total_cnt = 0  # steps that carried an axTree and were processed
not_found_cnt = 0  # processed steps whose path matched no node

new_user_annotate_data = []
for task_num, task in enumerate(user_annotate_data):
    print(f"Processing task {task_num} of {len(user_annotate_data)}")
    new_task = copy.deepcopy(task)
    steps_ls = json.loads(task["steps"])  # "steps" holds a JSON-encoded string
    new_steps_ls = []
    for step_num, step in enumerate(steps_ls):
        new_step = copy.deepcopy(step)
        if step.get("axTree") is not None:
            axTree_url = step["axTree"]
            restore_raw_dir = f"{raw_axtree_dir}/{user_name}/{task_num}"
            restore_raw_path = f"{restore_raw_dir}/{step_num}.json"
            os.makedirs(restore_raw_dir, exist_ok=True)
            # Download the axTree and store it at restore_raw_path.
            download_json(axTree_url, restore_raw_path)

            # Format the raw axtree and store the text version.
            restore_formatted_dir = f"{formatted_axtree_dir}/{user_name}/{task_num}"
            restore_formatted_path = f"{restore_formatted_dir}/{step_num}.txt"
            os.makedirs(restore_formatted_dir, exist_ok=True)
            # A context manager closes the handle promptly instead of leaving one
            # open per processed step.
            with open(restore_raw_path, "r") as f:
                raw_axtree = json.load(f)
            formatted_nodes = format_node(raw_axtree)
            # One node per line, every line newline-terminated.
            formatted_axtree = "".join(node + '\n' for node in formatted_nodes)
            with open(restore_formatted_path, "w") as f:
                f.write(formatted_axtree)

            # Locate the node that the step's path points at; the path is prefixed
            # with "html" before the lookup.
            path = ["html"] + step["path"].split('>')
            raw_retrieved_axtree = find_node_by_path(raw_axtree, path)
            if raw_retrieved_axtree is None:
                print(f"No node found in path {path}, {user_name}, {task_num}, {step_num}")
                not_found_cnt += 1
                # Fall back to an empty node so the step still gets an (empty) axtId.
                raw_retrieved_axtree = {}

            # Attach the id attribute to new_step.
            axtid = raw_retrieved_axtree.get("attributes", {}).get("data-imean-axt-id", "")
            if "axtId" in new_step and axtid != new_step["axtId"]:
                print(f"ID mismatch: existing={new_step['axtId']}, found={axtid}")
                # Open question carried over from the original: decide which id to use
                # or how to handle a mismatch. Until then, fail loudly. An explicit
                # raise (rather than `assert`) still fires under `python -O`.
                raise AssertionError(
                    f"axtId mismatch for {user_name}, task {task_num}, step {step_num}"
                )
            new_step["axtId"] = axtid
            if len(axtid) > 0:
                axtid_cnt += 1
            total_cnt += 1

            # Format the retrieved subtree and store it.
            retrieved_axtree = "".join(
                node + '\n' for node in format_node(raw_retrieved_axtree)
            )
            restore_retrieved_dir = f"{retrieved_axtree_dir}/{user_name}/{task_num}"
            restore_retrieved_path = f"{restore_retrieved_dir}/{step_num}.txt"
            os.makedirs(restore_retrieved_dir, exist_ok=True)
            with open(restore_retrieved_path, "w") as f:
                f.write(retrieved_axtree)
        new_steps_ls.append(new_step)
    new_task["steps"] = json.dumps(new_steps_ls)
    new_user_annotate_data.append(new_task)

new_user_annotate_file = os.path.join(user_annotate_dir, user_name + "_new.json")
print(new_user_annotate_file)

# ensure_ascii=False emits raw non-ASCII characters, so write the JSON as UTF-8
# explicitly instead of relying on the platform's locale default.
with open(new_user_annotate_file, "w", encoding="utf-8") as f:
    json.dump(new_user_annotate_data, f, ensure_ascii=False, indent=2)

print(f"axtid_cnt: {axtid_cnt}, node_not_found_cnt: {not_found_cnt}, total_cnt: {total_cnt}")

Processing task 0 of 50


Processing task 1 of 50
No node found in path ['html', 'body', 'div', 'div', 'section', 'div', 'div', 'div', 'div', 'div', 'section', 'div', 'ul', 'li', 'a', 'div'], Cyberpunk, 1, 4
Processing task 2 of 50
Processing task 3 of 50
Processing task 4 of 50
Processing task 5 of 50
Processing task 6 of 50
Processing task 7 of 50
Processing task 8 of 50
Processing task 9 of 50
Processing task 10 of 50
Processing task 11 of 50
Processing task 12 of 50
Processing task 13 of 50
Processing task 14 of 50
No node found in path ['html', 'body', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'section', 'div', 'div', 'div', 'div', 'div', 'ul', 'li', 'section', 'div', 'h2', 'a'], Cyberpunk, 14, 5
Processing task 15 of 50
Processing task 16 of 50
Processing task 17 of 50
Processing task 18 of 50
Processing task 19 of 50
Processing task 20 of 50
Processing task 21 of 50
No node found in path ['html', 'body', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'div', 'di

# 第二步，组sft数据，按user存储

In [9]:
# System prompt stored in the "prompt_system" field of every SFT sample.
# NOTE(review): rule 4 below asks the model to answer with "In summary, the next action
# I will perform is" followed by a backticked action such as `click [1234]`, whereas
# prompt_output_template ends with "Therefore, the next action I will perform is:" and a
# fenced JSON object - confirm which output format the model is meant to learn.
prompt_system = '''
# CONTEXT

You are an autonomous intelligent agent tasked with navigating a web browser to accomplish various web-based tasks. Your success depends on effectively utilizing the specific actions available to you. Below is the information and guidance you will have during the task:

## TASK INFORMATION

1. **User's Objective**: The goal you are tasked to achieve.  
2. **Current Web Page's Accessibility Tree**: A simplified representation of the webpage, providing key information about its elements.  
3. **Current Web Page's URL**: The URL of the page you are currently viewing.  
4. **Previous Action List**: A record of all the actions you have performed so far, useful for tracking your progress.  

## AVAILABLE ACTIONS

### 1. **Page Operation Actions**
- `click [id]`: Click on a webpage element identified by its `id`.  
- `type [id][content]`: Type `content` into the field with the specified `id`.  
- `copy [id]`: Copy the content of an element identified by its `id`.  
- `paste [id]`: Paste previously copied content into a field identified by its `id`.  
- `cache [id]`: Cache the information or value from the element with the specified `id` for later use.  
- `hover [id]`: Hover over an element identified by its `id`.  
- `press_enter`: Simulate pressing the "Enter" key.  
- `double_click [id]`: Perform a double click on the element identified by its `id`.  
- `right_click [id]`: Perform a right-click on the element identified by its `id`.  
- `select [id]`: Select text within an element identified by its `id`.  

### 2. **Navigation Actions**
- `back`: Return to the previously viewed page.  

### 3. **Completion Action**
- `stop [answer]`: Use this action when you believe the task is complete. Provide the result inside the brackets:  
  - For text-based answers, write the answer directly.  
  - If the task is impossible to complete, use `"N/A"` as the answer.  

## RULES

1. Only issue actions that are valid based on the current observation.  
2. Perform one action at a time.  
3. Follow the reasoning examples and proceed step by step before issuing the next action.  
4. Format actions correctly. Use the following structure:  
   - Start with: *"In summary, the next action I will perform is"*  
   - Followed by the action in backticks, e.g., `click [1234]`.  
5. Use the `stop` action once you achieve the objective. Do not generate any further output after issuing the `stop` action.  

By adhering to these instructions and leveraging the available actions effectively, you will successfully complete the assigned web-based tasks.
'''

In [10]:
# Model-input template, filled via str.format with the placeholders
# full_axtree, url, objective and action_list.
prompt_input_template = '''
# OBSERVATION

{full_axtree}

# URL

{url}

# OBJECTIVE

{objective}

# PREVIOUS ACTIONS

{action_list}
'''

In [11]:
# Target-output template, filled via str.format: the retrieved axtree excerpt
# (retrieved_axtree) followed by the action (action) inside a fenced JSON block.
prompt_output_template = '''
First, Let's find the most relevant part of axtree that I need:

{retrieved_axtree}

Therefore, the next action I will perform is:

```json
{action}
```
'''

In [40]:
# Template for one entry of the previous-action list; placeholders are
# i (action index), action_type and action_value.
action_template = '''
## Action {i}
- action_type: {action_type}
- action_value: {action_value}
'''

In [23]:
# Root folder for the SFT samples of the currently selected user.
sft_data_dir = f"./sft_data/{user_name}"

组sft数据主要是选择output中的step，然后把前面的step拼成一个action list。
组出input，output，system，url这几个字段

In [41]:
import json

def get_prompt_input(formatted_axtree, href, objective, step_num, steps):
    """Render the model-input prompt for the step at index ``step_num``.

    Every step that precedes ``step_num`` in ``steps`` is rendered through
    ``action_template`` from its "type" and "value" entries, and the rendered
    pieces are concatenated in order to form the previous-action list. The
    result fills ``prompt_input_template`` together with the stripped
    ``formatted_axtree``, ``href`` (as the URL) and ``objective``.
    """
    rendered_actions = []
    for idx, earlier_step in enumerate(steps):
        if idx >= step_num:
            # Only steps strictly before the current one count as history.
            break
        kind = earlier_step["type"]
        payload = earlier_step["value"]
        rendered_actions.append(
            action_template.format(i=idx, action_type=kind, action_value=payload)
        )

    return prompt_input_template.format(
        full_axtree=formatted_axtree.strip(),
        url=href,
        objective=objective,
        action_list="".join(rendered_actions),
    )

In [42]:
# Build one SFT sample per usable annotated step and write it to
# {sft_data_dir}/{task_num}/{step_num}.json as a single JSON line.
for task_num, task in enumerate(new_user_annotate_data):
    steps = json.loads(task["steps"])  # "steps" holds a JSON-encoded string
    objective = task['title']
    for step_num, step in enumerate(steps):
        # Step 1 only produces axtree files for steps that carry an axTree value, so
        # skip a missing key, None and the empty string alike instead of trying to
        # open files that were never written.
        if not step.get("axTree"):
            continue

        formatted_axtree_path = f"{formatted_axtree_dir}/{user_name}/{task_num}/{step_num}.txt"
        retrieved_axtree_path = f"{retrieved_axtree_dir}/{user_name}/{task_num}/{step_num}.txt"
        with open(formatted_axtree_path, 'r') as f:
            formatted_axtree = f.read()
        with open(retrieved_axtree_path, 'r') as f:
            retrieved_axtree = f.read()

        # Skip steps whose retrieved or formatted axtree is empty; this filters out the
        # steps for which no node was found (no axtId). The check runs before the
        # output file is opened so that skipped steps leave no empty file behind.
        if not retrieved_axtree or not formatted_axtree:
            continue

        # get_prompt_input only reads the step list, so it is passed without copying.
        prompt_input = get_prompt_input(formatted_axtree, step["href"], objective, step_num, steps)
        action = {
            "action_type": step["type"],
            "action_id": step["axtId"],
            "action_value": step["value"]
        }
        action = json.dumps(action, ensure_ascii=False, indent=2)
        prompt_output = prompt_output_template.format(retrieved_axtree=retrieved_axtree.strip(), action=action)
        data_dict = {
            step_num: {
                "prompt_system": prompt_system,
                "prompt_input": prompt_input,
                "prompt_output": prompt_output,
                "url": step["href"],
                "user_name": user_name,
                "task_num": task_num,
                "step_num": step_num
            }
        }

        restore_sft_dir = f'{sft_data_dir}/{task_num}'
        output_file = f'{restore_sft_dir}/{step_num}.json'
        os.makedirs(restore_sft_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so write UTF-8 explicitly.
        with open(output_file, 'w', encoding="utf-8") as output:
            output.write(json.dumps(data_dict, ensure_ascii=False)+'\n')