In [None]:
from glob import glob
import pandas as pd
# Path to the source SFT dataset. Despite the name, this points at a single
# .jsonl file (it is passed directly to open() when loading), not a directory.
DATA_DIR = "/.cache/modelscope/datasets/deepctrl/deepctrl-sft-data/sft_data_zh.jsonl"
import json
import random

In [None]:
def convert_to_llama_factory_format(item):
    """
    Convert one SFT record into the llama-factory record layout.

    Source layout: the task description is stored under "input" and the
    generated text under "output".
    Result layout: the task description is moved to "instruction", "input"
    becomes an empty string, and "output" is kept with surrounding
    whitespace stripped.

    Args:
        item: Mapping with at least the keys "input" and "output"; the value
            under "output" must support ``.strip()``.

    Returns:
        A new dict with the keys "instruction", "input" and "output", in that
        order.

    Raises:
        KeyError: If "input" or "output" is missing from ``item``.
    """
    task_description = item["input"]
    generated_text = item["output"].strip()

    record = {}
    record["instruction"] = task_description
    record["input"] = ""
    record["output"] = generated_text
    return record

# Accumulators filled by the loading loop below. token_num_list is
# index-aligned with all_data: entry i is the length recorded for all_data[i].
all_data = []
token_num_list = []

with open(DATA_DIR, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (for example a trailing empty line), which
        # json.loads would reject with a JSONDecodeError.
        if not line.strip():
            continue

        # Decode one JSON object per line.
        json_obj = json.loads(line)

        # Skip records whose required fields are missing, None or empty.
        if not json_obj.get("input") or not json_obj.get("output"):
            continue

        # Convert to the llama-factory format.
        converted_data = convert_to_llama_factory_format(json_obj)
        all_data.append(converted_data)

        # Record the sample's text length. len(converted_data) would be the
        # number of dict keys (always 3), which makes any length statistics or
        # length-based filtering meaningless. No tokenizer is loaded in this
        # notebook, so the character count of instruction + output is used as
        # a proxy for the token count.
        # NOTE(review): swap in the target model's tokenizer if exact token
        # counts are required.
        token_num_list.append(
            len(converted_data["instruction"]) + len(converted_data["output"])
        )

print(f"总共处理了 {len(all_data)} 条数据")
print(len(token_num_list))

# Show one converted record as a sanity check.
if all_data:
    print("转换后的数据示例:")
    print(all_data[0])

In [None]:
import numpy as np
# Percentile levels reported for the length distribution, and the exclusive
# upper bound on the per-sample length stored in token_num_list.
LENGTH_PERCENTILES = [50, 60, 70, 80, 90, 95, 99]
MAX_TOKEN_NUM = 768

# Length distribution before filtering. It is computed once; the earlier
# duplicate call discarded its result.
print(np.percentile(token_num_list, LENGTH_PERCENTILES))

# Keep only samples whose recorded length is strictly below the limit.
# all_data and token_num_list are index-aligned, so they are walked together.
filtered_long_data = []
filtered_token_num = []
for sample, token_num in zip(all_data, token_num_list):
    if token_num < MAX_TOKEN_NUM:
        filtered_long_data.append(sample)
        filtered_token_num.append(token_num)

# Length distribution and sample count after filtering.
print(np.percentile(filtered_token_num, LENGTH_PERCENTILES))
print(len(filtered_token_num))

In [None]:
# Upper bound on the number of samples exported, and a fixed seed so that
# re-running the notebook selects the same subset.
SAMPLE_SIZE = 2_000_000
SAMPLE_SEED = 42

# A dedicated Random instance makes the draw reproducible without altering
# the module-level random state. min() guards against asking for more items
# than are available, which random.sample rejects with ValueError.
few_data = random.Random(SAMPLE_SEED).sample(
    filtered_long_data, min(SAMPLE_SIZE, len(filtered_long_data))
)
print(f"采样后数据量: {len(few_data)}")

In [None]:
import os
# Destination of the exported dataset, written as one JSON object per line.
file_path = "/.cache/sft_data/llamafactory_input/deepctl_200W.jsonl"

directory = os.path.dirname(file_path)

# Note whether the directory was already there (only for the message below),
# then create it with exist_ok=True so the call cannot fail if the directory
# appears between the check and the creation.
directory_existed = os.path.exists(directory)
os.makedirs(directory, exist_ok=True)
if directory_existed:
    print(f"目录{directory} 已存在")
else:
    print(f"目录{directory} 不存在，已创建")

with open(file_path, 'w', encoding="utf-8") as f:
    for item in few_data:
        # ensure_ascii=False keeps non-ASCII text readable in the output file
        # instead of emitting \uXXXX escapes.
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"数据已保存到: {file_path}")