In [1]:
from datasets import load_dataset

# One seed drives every randomized step so the three splits are reproducible.
# (train_test_split shuffles by default; without an explicit seed each run
# would produce different train/val/test membership.)
SEED = 42

# 1) Load the full dataset (the Hub repo currently exposes a single "train" split).
# NOTE(review): the recorded run of this cell failed with DatasetGenerationError
# while parsing a non-JSON file inside no_explain.zip, so this loader path may
# not work for this repository as-is.
dataset = load_dataset("OutFlankShu/MATE_DATASET", split="train")

# 2) Shuffle with a fixed seed.
dataset = dataset.shuffle(seed=SEED)

# 3) First cut: 80% train / 20% held out.
train_valid = dataset.train_test_split(test_size=0.2, seed=SEED)

# 4) Second cut: halve the held-out 20% into validation and test.
valid_test = train_valid["test"].train_test_split(test_size=0.5, seed=SEED)

train_ds = train_valid["train"]   # 80%
val_ds   = valid_test["train"]    # 10%
test_ds  = valid_test["test"]     # 10%


README.md: 0.00B [00:00, ?B/s]

no_explain.zip:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

strategy.zip:   0%|          | 0.00/73.6M [00:00<?, ?B/s]

strategy_explanation.zip:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

tactic.zip:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

testset.zip:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Failed to load JSON from file 'zip://no_explain/calculatelines.py::/root/.cache/huggingface/hub/datasets--OutFlankShu--MATE_DATASET/snapshots/2f9f2154d32377a0f2eee375fc0047b674348134/no_explain.zip' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Invalid value. in row 0


DatasetGenerationError: An error occurred while generating the dataset

In [3]:
from datasets import load_dataset
from pathlib import Path

def create_splits_from_hf(
    hf_name: str,
    split: str = "train",              # name of the split to fetch from the Hub
    output_dir: str = "/root/AdaSTaR/data",
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42,
):
    """
    Load a Hugging Face dataset and write fresh train/val/test jsonl files.

    The dataset is shuffled with ``seed`` and cut into three contiguous
    slices. Train and val sizes are ``int(n * ratio)``; test receives the
    remainder. No pre-existing jsonl files are required; ``output_dir`` is
    created if missing.

    Args:
        hf_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        output_dir: Directory that receives train.jsonl / val.jsonl / test.jsonl.
        train_ratio, val_ratio, test_ratio: Fractions that must sum to 1.
        seed: Seed passed to ``Dataset.shuffle``.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within 1e-6).
    """
    ratio_total = train_ratio + val_ratio + test_ratio
    assert abs(ratio_total - 1.0) < 1e-6, "비율 합이 1이어야 해요!"

    # Load straight from the Hub, then shuffle once.
    shuffled = load_dataset(hf_name, split=split).shuffle(seed=seed)

    # Slice sizes: train and val are truncated, test absorbs the remainder.
    total = len(shuffled)
    train_count = int(total * train_ratio)
    val_count = int(total * val_ratio)
    test_count = total - train_count - val_count
    val_end = train_count + val_count

    bounds = {
        "train": (0, train_count),
        "val": (train_count, val_end),
        "test": (val_end, total),
    }
    subsets = {
        name: shuffled.select(range(start, stop))
        for name, (start, stop) in bounds.items()
    }

    # Make sure the destination directory exists before writing.
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # One JSON-lines file per slice, written in train / val / test order.
    paths = {name: out_root / f"{name}.jsonl" for name in subsets}
    for name, subset_ds in subsets.items():
        subset_ds.to_json(str(paths[name]), lines=True)

    print(f"총 샘플 수: {total}")
    print(f"train: {train_count} → {paths['train']}")
    print(f"val:   {val_count}   → {paths['val']}")
    print(f"test:  {test_count}  → {paths['test']}")


if __name__ == "__main__":
    # Example run: swap in another benchmark's Hub name to split a different dataset.
    create_splits_from_hf(
        "OutFlankShu/MATE_DATASET",           # Hub dataset name; replace with the real benchmark
        split="train",                        # split name on the Hub side
        train_ratio=0.8,
        val_ratio=0.1,
        test_ratio=0.1,
        output_dir="/root/AdaSTaR/data",      # destination inside the AdaSTaR checkout
    )


Generating train split: 0 examples [00:00, ? examples/s]

Failed to load JSON from file 'zip://no_explain/calculatelines.py::/root/.cache/huggingface/hub/datasets--OutFlankShu--MATE_DATASET/snapshots/2f9f2154d32377a0f2eee375fc0047b674348134/no_explain.zip' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Invalid value. in row 0


DatasetGenerationError: An error occurred while generating the dataset

In [8]:
def create_splits_from_list(
    data,
    output_dir: str,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42,
):
    """
    Shuffle ``data`` and write train/val/test JSON-lines files to ``output_dir``.

    The shuffle is done on a copy with a private ``random.Random(seed)``, so
    the caller's list keeps its order and the global ``random`` state is left
    untouched. The resulting order is the same one ``random.seed(seed)``
    followed by ``random.shuffle`` would produce.

    Args:
        data: Iterable of JSON-serializable rows.
        output_dir: Directory (created if missing) that receives
            train.jsonl, val.jsonl and test.jsonl.
        train_ratio, val_ratio, test_ratio: Fractions that must sum to 1.
            Train and val sizes are ``int(n * ratio)``; test gets the rest.
        seed: Seed for the shuffle.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within 1e-6).
    """
    import json
    import random
    from pathlib import Path

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "비율 합이 1이어야 해요!"

    # Work on a copy: shuffling the argument in place would silently reorder
    # the caller's list, and seeding the module-level RNG would affect
    # unrelated code that uses `random`.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    n = len(shuffled)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    n_test = n - n_train - n_val

    train_data = shuffled[:n_train]
    val_data = shuffled[n_train:n_train + n_val]
    test_data = shuffled[n_train + n_val:]

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    def write_jsonl(path, rows):
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(path, "w", encoding="utf-8") as f:
            for row in rows:
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

    write_jsonl(output_path / "train.jsonl", train_data)
    write_jsonl(output_path / "val.jsonl", val_data)
    write_jsonl(output_path / "test.jsonl", test_data)

    print(f"총 샘플 수: {n}")
    print(f"train: {n_train}")
    print(f"val:   {n_val}")
    print(f"test:  {n_test}")
    print(f"Saved to: {output_path.resolve()}")


if __name__ == "__main__":
    # Subset to split; other accepted names include "strategy", "tactic", "no_explain".
    subset = "both"
    mate_data = load_mate_subset(subset=subset)

    # Each subset is written to its own directory under the AdaSTaR data folder.
    out_dir = f"/root/AdaSTaR/data/mate_{subset}"

    # 80/10/10 split with a fixed seed.
    create_splits_from_list(
        mate_data,
        out_dir,
        seed=42,
        train_ratio=0.8,
        val_ratio=0.1,
        test_ratio=0.1,
    )


Downloading both.zip from OutFlankShu/MATE_DATASET ...
Downloaded to: /root/.cache/huggingface/hub/datasets--OutFlankShu--MATE_DATASET/snapshots/2f9f2154d32377a0f2eee375fc0047b674348134/both.zip
Files inside zip:
  - both/
  - both/explain_dataset07_both.jsonl
  - __MACOSX/both/._explain_dataset07_both.jsonl
  - both/.DS_Store
  - __MACOSX/both/._.DS_Store
  - both/explain_dataset13_both.jsonl
  - __MACOSX/both/._explain_dataset13_both.jsonl
  - both/explain_dataset02_both.jsonl
  - __MACOSX/both/._explain_dataset02_both.jsonl
  - both/explain_dataset05_both.jsonl
  - __MACOSX/both/._explain_dataset05_both.jsonl
  - both/explain_dataset14_both.jsonl
  - __MACOSX/both/._explain_dataset14_both.jsonl
  - both/explain_dataset08_both.jsonl
  - __MACOSX/both/._explain_dataset08_both.jsonl
  - both/explain_dataset11_both.jsonl
  - __MACOSX/both/._explain_dataset11_both.jsonl
  - both/explain_dataset00_both.jsonl
  - __MACOSX/both/._explain_dataset00_both.jsonl
  - both/explain_dataset12_both.

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 37: invalid continuation byte

In [14]:
import json
import random
import zipfile
import pickle
from pathlib import Path

from huggingface_hub import hf_hub_download


# Hugging Face Hub dataset repository the archives are downloaded from.
REPO_ID = "OutFlankShu/MATE_DATASET"

# Subset name -> archive file name in the repository; every archive is
# simply "<subset>.zip". Insertion order is kept because it shows up in the
# "subset must be one of ..." error message.
ZIP_MAP = {
    subset_name: f"{subset_name}.zip"
    for subset_name in (
        "no_explain",
        "strategy",
        "tactic",
        "both",
        "strategy_explanation",
        "testset",
    )
}


def is_mac_metadata(path: str) -> bool:
    """Return True if a zip member path looks like macOS metadata.

    Recognized forms:
      * anything under (or equal to) the ``__MACOSX`` folder,
      * paths ending in ``.DS_Store``,
      * ``._<name>`` entries at any depth, including the archive root.

    Args:
        path: Member name as returned by ``ZipFile.namelist()`` ("/"-separated).
    """
    if path == "__MACOSX" or path.startswith("__MACOSX/"):
        return True
    if path.endswith(".DS_Store"):
        return True
    # Check every component, not just those after a "/": a "._file" sitting at
    # the archive root has no leading slash and would otherwise be treated as
    # real data.
    return any(part.startswith("._") for part in path.split("/"))


def _flatten_pickled_payload(obj):
    """Flatten an unpickled list/dict payload into a flat list of examples."""
    if isinstance(obj, list):
        return list(obj)
    if isinstance(obj, dict):
        flat = []
        for value in obj.values():
            if isinstance(value, list):
                flat.extend(value)
            else:
                flat.append(value)
        return flat
    print(f"Warning: unexpected pickle type: {type(obj)}")
    return []


def _parse_json_lines(raw_lines, member):
    """Decode and parse byte lines of zip member ``member`` as JSON, one value per line."""
    records = []
    for lineno, raw_line in enumerate(raw_lines, start=1):
        try:
            # utf-8-sig behaves like utf-8 but also drops a leading BOM,
            # which json.loads would otherwise reject.
            line = raw_line.decode("utf-8-sig").strip()
        except UnicodeDecodeError:
            # Best effort: drop only the line containing invalid bytes.
            print(f"  -> Skipping undecodable line in {member}")
            continue

        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Report where the bad record is, then let the original error propagate.
            print(f"  -> Invalid JSON in {member} at line {lineno}")
            raise
    return records


def load_mate_subset(subset: str = "both"):
    """
    Download one MATE_DATASET archive and parse its data files into a list.

    The archive is fetched with ``hf_hub_download`` and read in place. Members
    are handled by extension: ``.pkl`` files are unpickled and flattened,
    ``.json``/``.jsonl`` files are parsed one JSON value per line, macOS
    metadata entries and everything else are skipped. Progress is printed.

    Args:
        subset: One of the keys of ``ZIP_MAP`` ("no_explain", "strategy",
            "tactic", "both", "strategy_explanation", "testset").

    Returns:
        A list with one entry per parsed record (presumably dicts; the schema
        is whatever the archive contains).

    Raises:
        ValueError: If ``subset`` is not a key of ``ZIP_MAP``.
        json.JSONDecodeError: If a decodable, non-empty line is not valid JSON.
    """
    if subset not in ZIP_MAP:
        raise ValueError(f"subset must be one of {list(ZIP_MAP.keys())}, got {subset}")

    zip_name = ZIP_MAP[subset]
    print(f"Downloading {zip_name} from {REPO_ID} ...")

    zip_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=zip_name,
        repo_type="dataset",
    )
    print(f"Downloaded to: {zip_path}")

    data = []

    with zipfile.ZipFile(zip_path, "r") as z:
        members = z.namelist()
        print("Files inside zip:")
        for m in members:
            print("  -", m)

        for m in members:
            # macOS metadata entries are binary resource forks, not data.
            if is_mac_metadata(m):
                print(f"Skipping Mac metadata file: {m}")
                continue

            if m.endswith(".pkl"):
                print(f"Loading pickle: {m}")
                # SECURITY: pickle.load can execute arbitrary code. The archive
                # comes from a remote repository, so only keep this branch if
                # the repository is trusted.
                with z.open(m, "r") as f:
                    data.extend(_flatten_pickled_payload(pickle.load(f)))

            elif m.endswith((".jsonl", ".json")):
                print(f"Loading json/jsonl: {m}")
                with z.open(m, "r") as f:
                    data.extend(_parse_json_lines(f, m))

            else:
                # Directory entries and any other file types end up here.
                print(f"Skipping non-data file: {m}")

    print(f"Loaded {len(data)} examples from subset '{subset}'")
    return data


def create_splits_from_list(
    data,
    output_dir: str,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42,
):
    """
    Shuffle ``data`` and write train/val/test JSON-lines files to ``output_dir``.

    The shuffle is done on a copy with a private ``random.Random(seed)``, so
    the caller's list keeps its order and the global ``random`` state is left
    untouched. The resulting order is the same one ``random.seed(seed)``
    followed by ``random.shuffle`` would produce.

    Args:
        data: Iterable of JSON-serializable rows.
        output_dir: Directory (created if missing) that receives
            train.jsonl, val.jsonl and test.jsonl.
        train_ratio, val_ratio, test_ratio: Fractions that must sum to 1.
            Train and val sizes are ``int(n * ratio)``; test gets the rest,
            so a ratio of 0 yields an empty file.
        seed: Seed for the shuffle.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within 1e-6).
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "비율 합이 1이어야 해요!"

    # Work on a copy: shuffling the argument in place would silently reorder
    # the caller's list, and seeding the module-level RNG would affect
    # unrelated code that uses `random`.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    n = len(shuffled)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    n_test = n - n_train - n_val

    train_data = shuffled[:n_train]
    val_data = shuffled[n_train:n_train + n_val]
    test_data = shuffled[n_train + n_val:]

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    def write_jsonl(path, rows):
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(path, "w", encoding="utf-8") as f:
            for row in rows:
                f.write(json.dumps(row, ensure_ascii=False) + "\n")

    write_jsonl(output_path / "train.jsonl", train_data)
    write_jsonl(output_path / "val.jsonl", val_data)
    write_jsonl(output_path / "test.jsonl", test_data)

    print(f"총 샘플 수: {n}")
    print(f"train: {n_train}")
    print(f"val:   {n_val}")
    print(f"test:  {n_test}")
    print(f"Saved to: {output_path.resolve()}")


if __name__ == "__main__":
    # Subset to export; other accepted names include "strategy", "tactic", "both".
    subset = "no_explain"
    mate_data = load_mate_subset(subset=subset)

    # Each subset is written to its own directory under the AdaSTaR data folder.
    out_dir = f"/root/AdaSTaR/data/mate_{subset}"

    # Everything goes to train here; with zero val/test ratios the val.jsonl
    # and test.jsonl files are still written, but empty.
    create_splits_from_list(
        mate_data,
        out_dir,
        seed=42,
        train_ratio=1,
        val_ratio=0.0,
        test_ratio=0.0,
    )


Downloading no_explain.zip from OutFlankShu/MATE_DATASET ...
Downloaded to: /root/.cache/huggingface/hub/datasets--OutFlankShu--MATE_DATASET/snapshots/2f9f2154d32377a0f2eee375fc0047b674348134/no_explain.zip
Files inside zip:
  - no_explain/
  - __MACOSX/._no_explain
  - no_explain/explain_dataset13_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset13_noexplain.jsonl
  - no_explain/explain_dataset05_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset05_noexplain.jsonl
  - no_explain/.DS_Store
  - __MACOSX/no_explain/._.DS_Store
  - no_explain/explain_dataset02_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset02_noexplain.jsonl
  - no_explain/explain_dataset14_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset14_noexplain.jsonl
  - no_explain/explain_dataset03_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset03_noexplain.jsonl
  - no_explain/explain_dataset12_noexplain.jsonl
  - __MACOSX/no_explain/._explain_dataset12_noexplain.jsonl
  - no_explain/exp