In [None]:
import random
import json
import os
import yaml
from tqdm.notebook import tqdm
import pandas as pd
def process_datasets(config_folder: str, output_folder: str):
    """
    遍历给定文件夹内的 YAML 配置文件，处理相应的 JSON 文件，并生成 JSONL 格式的输出。

    Args:
        config_folder (str): 包含 YAML 配置文件的文件夹路径。
        output_folder (str): 用于保存新生成的 JSONL 文件的文件夹路径。
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")

    for root, _, files in os.walk(config_folder):
        for file_name in files:
            if file_name.endswith('.yaml') and 'config_' in file_name:
                yaml_path = os.path.join(root, file_name)
                
                print(f"\nProcessing YAML file: {yaml_path}")
                try:
                    with open(yaml_path, 'r', encoding='utf-8') as f:
                        yaml_data = yaml.safe_load(f)
                except Exception as e:
                    print(f"Error reading YAML file {yaml_path}: {e}")
                    continue

                if 'train_file_path' in yaml_data:
                    json_path = yaml_data['train_file_path']
                    dataset_name = yaml_data.get('dataset', 'unknown_dataset')
                    task_name = yaml_data.get('task', 'unknown_task')
                    
                    full_dataset_name = f"{task_name}_{dataset_name}"
                    
                    if not os.path.exists(json_path):
                        print(f"Warning: JSON file not found at {json_path}. Skipping.")
                        continue
                        
                    print(f"  -> Reading JSON file: {json_path}")
                    try:
                        df = pd.read_json(json_path)
                    except Exception as e:
                        print(f"Error reading JSON file {json_path}: {e}")
                        continue
                    
                    # 检查所需的列是否存在
                    if 'instruction' not in df.columns or 'output' not in df.columns:
                        print(f"Warning: JSON file {json_path} is missing 'instruction' or 'output' column. Skipping.")
                        continue
                        
                    # 创建新的 JSONL 文件名
                    full_output_file = os.path.join(output_folder, f"{full_dataset_name}_full.jsonl")
                    subset_output_file = os.path.join(output_folder, f"{full_dataset_name}_subset.jsonl")
                    
                    # 生成完整数据集
                    full_data_list = []
                    for index, row in tqdm(df.iterrows(), total=len(df), desc="  -> Formatting data"):
                        new_entry = {
                            "dataset": full_dataset_name,
                            "id": f"{full_dataset_name}_{index}",
                            "messages": [
                                {
                                    "role": "user",
                                    "content": row['instruction']
                                },
                                {
                                    "role": "assistant",
                                    "content": row['output']
                                }
                            ]
                        }
                        full_data_list.append(new_entry)
                    
                    # 保存完整数据集
                    with open(full_output_file, 'w', encoding='utf-8') as f:
                        for item in full_data_list:
                            f.write(json.dumps(item, ensure_ascii=False) + '\n')
                    print(f"  -> Successfully created full JSONL file: {full_output_file}")
                    
                    # 随机抽取1%的数据样本
                    sample_size = max(1, int(len(full_data_list) * 0.3))
                    subset_data_list = random.sample(full_data_list, sample_size)
                    
                    # 保存子集
                    with open(subset_output_file, 'w', encoding='utf-8') as f:
                        for item in subset_data_list:
                            f.write(json.dumps(item, ensure_ascii=False) + '\n')
                    print(f"  -> Successfully created 1% subset JSONL file: {subset_output_file}")
def merge_jsonl_files(input_folder: str, output_folder: str):
    """
    将指定文件夹内的所有 full.jsonl 和 subset.jsonl 文件合并。

    Args:
        input_folder (str): 包含要合并的 JSONL 文件的文件夹路径。
        output_folder (str): 用于保存合并后的文件的文件夹路径。
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")

    # 合并 full.jsonl 文件
    full_output_path = os.path.join(output_folder, 'full.jsonl')
    full_file_list = [f for f in os.listdir(input_folder) if f.endswith('_full.jsonl')]
    
    if not full_file_list:
        print("No files ending with '_full.jsonl' were found. Skipping full file merge.")
    else:
        print(f"Found {len(full_file_list)} full files to merge.")
        with open(full_output_path, 'w', encoding='utf-8') as outfile:
            for file_name in full_file_list:
                file_path = os.path.join(input_folder, file_name)
                print(f"  -> Merging: {file_name}")
                with open(file_path, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        # 逐行读取并写入到新的文件中
                        outfile.write(line)
        print(f"Successfully merged all full files into: {full_output_path}")
        
    # ---

    # 合并 subset.jsonl 文件
    subset_output_path = os.path.join(output_folder, 'subset.jsonl')
    subset_file_list = [f for f in os.listdir(input_folder) if f.endswith('_subset.jsonl')]

    if not subset_file_list:
        print("No files ending with '_subset.jsonl' were found. Skipping subset file merge.")
    else:
        print(f"Found {len(subset_file_list)} subset files to merge.")
        with open(subset_output_path, 'w', encoding='utf-8') as outfile:
            for file_name in subset_file_list:
                file_path = os.path.join(input_folder, file_name)
                print(f"  -> Merging: {file_name}")
                with open(file_path, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        # 逐行读取并写入到新的文件中
                        outfile.write(line)
        print(f"Successfully merged all subset files into: {subset_output_path}")

import os
import yaml
import pandas as pd
import json
import random
from tqdm import tqdm

def process_datasets(config_folder: str, output_folder: str):
    """
    Traverses YAML configuration files in a given folder, processes the corresponding
    JSON test files, and generates a 1% random subset in JSONL format,
    following the 'task-dataset-test-subset.jsonl' naming convention.

    Args:
        config_folder (str): The path to the folder containing the YAML configuration files.
        output_folder (str): The path to the folder where the new JSONL files will be saved.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")

    for root, _, files in os.walk(config_folder):
        for file_name in files:
            if file_name.endswith('.yaml') and 'config_' in file_name:
                yaml_path = os.path.join(root, file_name)

                print(f"\nProcessing YAML file: {yaml_path}")
                try:
                    with open(yaml_path, 'r', encoding='utf-8') as f:
                        yaml_data = yaml.safe_load(f)
                except Exception as e:
                    print(f"Error reading YAML file {yaml_path}: {e}")
                    continue

                if 'test_file_path' in yaml_data:
                    json_path = yaml_data['test_file_path']
                    dataset_name = yaml_data.get('dataset', 'unknown_dataset')
                    task_name = yaml_data.get('task', 'unknown_task')

                    # Create a unified dataset name from task and dataset
                    unified_dataset_name = f"{task_name}_{dataset_name}"
                    
                    if not os.path.exists(json_path):
                        print(f"Warning: JSON file not found at {json_path}. Skipping.")
                        continue

                    print(f"  -> Reading JSON file: {json_path}")
                    try:
                        df = pd.read_json(json_path)
                    except Exception as e:
                        print(f"Error reading JSON file {json_path}: {e}")
                        continue

                    # Check for required columns
                    if 'instruction' not in df.columns or 'output' not in df.columns:
                        print(f"Warning: JSON file {json_path} is missing 'instruction' or 'output' column. Skipping.")
                        continue

                    # Generate the output file name according to the specified standard
                    output_file_name = f"{task_name}-{dataset_name}-test-subset.jsonl"
                    output_path = os.path.join(output_folder, output_file_name)

                    # Generate the full data list for sampling
                    full_data_list = []
                    for index, row in tqdm(df.iterrows(), total=len(df), desc="  -> Formatting data"):
                        new_entry = {
                            "dataset": unified_dataset_name,
                            "id": f"{unified_dataset_name}_{index}",
                            "messages": [
                                {
                                    "role": "user",
                                    "content": row['instruction']
                                },
                                {
                                    "role": "assistant",
                                    "content": row['output']
                                }
                            ]
                        }
                        full_data_list.append(new_entry)

                    # Randomly sample 1% of the data
                    if not full_data_list:
                        print(f"Warning: No data found in {json_path}. Skipping subset creation.")
                        continue

                    sample_size = max(1, int(len(full_data_list) * 0.01))
                    subset_data_list = random.sample(full_data_list, sample_size)

                    # Save the subset to the new file
                    with open(output_path, 'w', encoding='utf-8') as f:
                        for item in subset_data_list:
                            f.write(json.dumps(item, ensure_ascii=False) + '\n')
                    print(f"  -> Successfully created 1% subset JSONL file: {output_path}")

In [2]:
process_datasets('script/','../LESS/LESS_data/')

NameError: name 'process_datasets' is not defined

In [None]:
import pandas as pd

In [6]:
process_datasets_test('script/','../LESS/LESS_data/')


Processing YAML file: script/config_CTA_WebTable.yaml
  -> Reading JSON file: train/CTA/WebTable/WebTable-test.json


  -> Formatting data:   0%|          | 0/17709 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/CTA_WebTable_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/CTA_WebTable_test_subset.jsonl

Processing YAML file: script/config_RE_RE.yaml
  -> Reading JSON file: train/RE/RE/RE-test.json


  -> Formatting data:   0%|          | 0/2072 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/RE_RE_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/RE_RE_test_subset.jsonl

Processing YAML file: script/config_ER_wdc.yaml
  -> Reading JSON file: train/ER/wdc/wdc-test.json


  -> Formatting data:   0%|          | 0/4398 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_wdc_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_wdc_test_subset.jsonl

Processing YAML file: script/config_ER_semi-text-w.yaml
  -> Reading JSON file: train/ER/semi-text-w/semi-text-w-test.json


  -> Formatting data:   0%|          | 0/1846 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_semi-text-w_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_semi-text-w_test_subset.jsonl

Processing YAML file: script/config_ER_semi-text-c.yaml
  -> Reading JSON file: train/ER/semi-text-c/semi-text-c-test.json


  -> Formatting data:   0%|          | 0/4179 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_semi-text-c_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_semi-text-c_test_subset.jsonl

Processing YAML file: script/config_ER_abt-buy.yaml
  -> Reading JSON file: train/ER/abt-buy/abt-buy-test.json


  -> Formatting data:   0%|          | 0/1916 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_abt-buy_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_abt-buy_test_subset.jsonl

Processing YAML file: script/config_ER_amazon-google.yaml
  -> Reading JSON file: train/ER/amazon-google/amazon-google-test.json


  -> Formatting data:   0%|          | 0/2289 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_amazon-google_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_amazon-google_test_subset.jsonl

Processing YAML file: script/config_ER_walmart-amazon.yaml
  -> Reading JSON file: train/ER/walmart-amazon/walmart-amazon-test.json


  -> Formatting data:   0%|          | 0/2049 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/ER_walmart-amazon_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/ER_walmart-amazon_test_subset.jsonl

Processing YAML file: script/config_DC_hospital.yaml
  -> Reading JSON file: train/DC/hospital/hospital-test.json


  -> Formatting data:   0%|          | 0/508 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/DC_hospital_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/DC_hospital_test_subset.jsonl

Processing YAML file: script/config_DC_rayyan.yaml
  -> Reading JSON file: train/DC/rayyan/rayyan-test.json


  -> Formatting data:   0%|          | 0/1117 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/DC_rayyan_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/DC_rayyan_test_subset.jsonl

Processing YAML file: script/config_DC_beer.yaml
  -> Reading JSON file: train/DC/beer/beer-test.json


  -> Formatting data:   0%|          | 0/3364 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/DC_beer_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/DC_beer_test_subset.jsonl

Processing YAML file: script/config_DI_walmart.yaml
  -> Reading JSON file: train/DI/walmart/walmart-test.json


  -> Formatting data:   0%|          | 0/104 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/DI_walmart_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/DI_walmart_test_subset.jsonl

Processing YAML file: script/config_DI_amazon.yaml
  -> Reading JSON file: train/DI/amazon/amazon-test.json


  -> Formatting data:   0%|          | 0/816 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/DI_amazon_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/DI_amazon_test_subset.jsonl

Processing YAML file: script/config_SM_CMS.yaml
  -> Reading JSON file: train/SM/CMS/CMS-test.json


  -> Formatting data:   0%|          | 0/5127 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/SM_CMS_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/SM_CMS_test_subset.jsonl

Processing YAML file: script/config_AVE_oa_mine.yaml
  -> Reading JSON file: train/AVE/oa_mine/oa_mine-test.json


  -> Formatting data:   0%|          | 0/2451 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/AVE_oa_mine_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/AVE_oa_mine_test_subset.jsonl

Processing YAML file: script/config_CTA_SimTab.yaml
  -> Reading JSON file: /data/home/wangys/MELD/dataset/CTA/SimTab_test_few.json


  -> Formatting data:   0%|          | 0/7610 [00:00<?, ?it/s]

  -> Successfully created full JSONL file: ../LESS/LESS_data/CTA_SimTab_test_full.jsonl
  -> Successfully created 1% subset JSONL file: ../LESS/LESS_data/CTA_SimTab_test_subset.jsonl


In [None]:
merge_jsonl_files('LESS_data','LESS_data')

In [3]:
pd.read_json('LESS_data/full.jsonl',lines=True)

Unnamed: 0,dataset,id,messages
0,CTA_WebTable,CTA_WebTable_0,"[{'role': 'user', 'content': 'You are an exper..."
1,CTA_WebTable,CTA_WebTable_1,"[{'role': 'user', 'content': 'You are an exper..."
2,CTA_WebTable,CTA_WebTable_2,"[{'role': 'user', 'content': 'You are an exper..."
3,CTA_WebTable,CTA_WebTable_3,"[{'role': 'user', 'content': 'You are an exper..."
4,CTA_WebTable,CTA_WebTable_4,"[{'role': 'user', 'content': 'You are an exper..."
...,...,...,...
99777,CTA_SimTab,CTA_SimTab_6364,"[{'role': 'user', 'content': 'You are an exper..."
99778,CTA_SimTab,CTA_SimTab_6365,"[{'role': 'user', 'content': 'You are an exper..."
99779,CTA_SimTab,CTA_SimTab_6366,"[{'role': 'user', 'content': 'You are an exper..."
99780,CTA_SimTab,CTA_SimTab_6367,"[{'role': 'user', 'content': 'You are an exper..."
