In [1]:
from datasets import Dataset, Features, Value, Image, Sequence, load_dataset
import json, os


# Column schema for the raw records as loaded from JSON.
# At this stage the image columns hold path strings, not decoded image data.
features = Features(
    {
        "Question ID": Value(dtype="int64"),  # unique identifier
        "Share Context": Value(dtype="string"),  # long shared context text
        "Share Image": Sequence(Value(dtype="string")),  # list of image paths
        "Question Text": Value(dtype="string"),  # question text
        "Image": Value(dtype="string"),  # per-question image path (may be empty)
        "Options": Value(dtype="string"),
        "Answer": Value(dtype="string"),  # correct answer
        "Explanation": Value(dtype="string"),  # explanation of the answer
        "QA Type": Value(dtype="string"),  # question/answer type
        "Level of Difficulty": Value(dtype="string"),  # difficulty level
        "shared_description": Value(dtype="string"),  # shared description
        "description": Value(dtype="string"),  # description field (may be empty)
        "Datasplit": Value(dtype="string"),  # data split (train, val, test)
        "Index": Value(dtype="int64"),  # index
    }
)

def trans(data):
    """Pivot row-oriented records into a column-oriented dict of lists.

    Args:
        data: Iterable of mappings; every record must contain each of the
            column names listed below.

    Returns:
        dict mapping each column name to the list of that column's values,
        in the same order as the records in ``data``.

    Raises:
        KeyError: If a record is missing one of the columns.
    """
    # Order matters: the resulting dict keeps the columns in this order.
    column_names = (
        "Question ID",
        "Share Context",
        "Share Image",
        "Question Text",
        "Image",
        "Options",
        "Answer",
        "Explanation",
        "QA Type",
        "Level of Difficulty",
        "shared_description",
        "description",
        "Datasplit",
        "Index",
    )
    return {
        name: [record[name] for record in data]
        for name in column_names
    }

def _read_image_file(image_path):
    """Read one image file into a ``{"path": ..., "bytes": ...}`` dict.

    Returns None when ``image_path`` is None or does not point at an existing
    regular file (missing path, empty string, or a directory).
    """
    # isfile (rather than exists) keeps directories from reaching open();
    # the explicit None check avoids a TypeError from os.path on None entries.
    if image_path is None or not os.path.isfile(image_path):
        return None
    with open(image_path, "rb") as f:
        return {"path": image_path, "bytes": f.read()}


# Define the encoding function
def embed_images(example):
    """Replace the image path columns of one example with embedded file contents.

    Args:
        example: Mapping with an "Image" entry (a path string or None) and a
            "Share Image" entry (a list of path strings, or None/empty).

    Returns:
        dict with two keys:
            "Image": ``{"path", "bytes"}`` for a readable file, otherwise None.
            "Share Image": list with one ``{"path", "bytes"}`` dict (or None for
                an unreadable/None entry) per input path, or None when the
                input list is None or empty.
    """
    # Handle the standalone "Image" column.
    result = {"Image": _read_image_file(example["Image"])}

    # Handle the "Share Image" column; invalid paths keep their slot as None
    # so positions still line up with the original list.
    share_images = example["Share Image"]
    if share_images:
        result["Share Image"] = [_read_image_file(path) for path in share_images]
    else:
        # An empty or missing list is stored as None.
        result["Share Image"] = None

    return result


In [2]:
# Read the JSON file and parse its text into Python objects (e.g. dict or list)
with open("data/FinQA_data.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [3]:
# Pivot the raw records into columns under a separate name: `data` keeps holding
# the original record list, so this cell can be re-run without reloading the JSON
# (rebinding `data` to the pivoted dict made a second run fail inside trans()).
data_columns = trans(data)
dataset = Dataset.from_dict(data_columns, features=features)
# Replace the path strings with {"path", "bytes"} dicts read from disk.
dataset = dataset.map(embed_images)
# Declare the embedded columns as image features.
dataset = dataset.cast_column("Share Image", Sequence(Image()))
dataset = dataset.cast_column("Image", Image())

Map:   0%|          | 0/3201 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3201 [00:00<?, ? examples/s]

In [4]:
# Export the dataset to a Parquet file; the call's return value is the cell output.
dataset.to_parquet(path_or_buf="train.parquet")

Creating parquet from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

362292455

In [17]:
# Reload the exported Parquet file; the rows are placed under the "train" split.
dataset_train = load_dataset(path="parquet", data_files="train.parquet")

Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
# Index the row first and the column second, so only this row's images are
# decoded rather than the whole "Share Image" column.
print(dataset_train["train"][0]["Share Image"])

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=620x251 at 0x1C0FE5A1D60>, <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=386x543 at 0x1C0FE802150>, <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=602x370 at 0x1C0FE801280>, <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=484x242 at 0x1C0FE8026C0>]
