<a href="https://colab.research.google.com/github/bidulgiya999/medgemma_test/blob/main/csv%ED%8C%8C%EC%9D%BC_%EC%B6%94%EC%B6%9C%EB%B2%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#### 이게 신민재씨가 작성한 프롬프트

C:\Users\user\Desktop\dataset ├─ img\01,02,03\... (이미지들) └─ label\01,02,03\... (각 이미지에 대응하는 json) 위와 같은 구조의 디렉토리에서 다음 작업을 수행하려고 해. 1. label\01, label\02, label\03 디렉토리 안에 각각 약 1100개의 디렉토리가 존재함. 2. 약 1100개의 디렉토리 안에 여러 json파일이 존재함. 3. 0001_01_F_00.json, 0001_01_F_01.json, 0001_01_F_02.json, ... ,0001_01_F_08.json 의 파일에서만 데이터를 추출할 것. 4. 0001_01_F_00.json 에는 {"info": {"filename": "0001_01_F.jpg", "id": "0001", "gender": "F", "age": 55, "date": "2023-07-27", "skin_type": 3, "sensitive": 0}, "images": {"device": 0, "width": 2136, "height": 3216, "angle": 0, "facepart": 0, "bbox": [0, 0, 2136, 3216]}, "annotations": {"acne": null}, "equipment": {"pigmentation_count": 147}} 내용이 있으며, "skin_type": 3, "sensitive": 0만을 추출하여 CSV파일에 저장할 것. 5. 나머지 파일에 대해서는 {"info": {"filename": "0001_01_F.jpg", "id": "0001", "gender": "F", "age": 55, "date": "2023-07-27", "skin_type": 3, "sensitive": 0}, "images": {"device": 0, "width": 2136, "height": 3216, "angle": 0, "facepart": 1, "bbox": [469, 661, 1638, 1197]}, "annotations": {"forehead_pigmentation": 1, "forehead_wrinkle": 3}, "equipment": {"forehead_moisture": 53.0, "forehead_elasticity_R0": 0.167, "forehead_elasticity_R1": 0.058, "forehead_elasticity_R2": 0.653, "forehead_elasticity_R3": 0.208, "forehead_elasticity_R4": 0.085, "forehead_elasticity_R5": 0.765, "forehead_elasticity_R6": 0.965, "forehead_elasticity_R7": 0.389, "forehead_elasticity_R8": 0.109, "forehead_elasticity_R9": 0.041, "forehead_elasticity_Q0": 33.4, "forehead_elasticity_Q1": 0.589, "forehead_elasticity_Q2": 0.478, "forehead_elasticity_Q3": 0.111}} 와 같음. 이 중 "annotations": {"forehead_pigmentation": 1, "forehead_wrinkle": 3}값만을 추출하여 CSV 파일에 저장할 것. 6. 1100개의 디렉토리안에 JSON파일을 모두 순회하면 label\02, label\03 순으로 모두 시행하고, CSV파일은 01, 02, 03 총 3개 작성할 것.

In [None]:
import os
import json
import csv
from pathlib import Path
from typing import Dict, Any, Optional

# ====== Settings ======
# Example Windows path: C:\Users\user\Desktop\dataset
DATASET_ROOT = Path(r"C:\Users\user\Desktop\dataset")

# Each of 01, 02, 03 under label is processed, producing three CSV files
LABEL_ROOT = DATASET_ROOT / "label"
LABEL_GROUPS = ["01", "02", "03"]  # groups to process


def safe_get(d: Dict[str, Any], path: str, default=None):
    """
    Look up a nested value using a dotted path such as 'a.b.c'.

    Returns ``default`` as soon as the current node is not a dict or does
    not contain the next key; otherwise returns the value reached after the
    last path component (which may itself be None).
    """
    node = d
    pending = path.split(".")
    while pending:
        step = pending.pop(0)
        if isinstance(node, dict) and step in node:
            node = node[step]
        else:
            return default
    return node


def want_this_file(name: str) -> bool:
    """
    Return True only for file names shaped like *_00.json ... *_08.json.

    Example: 0001_01_F_00.json, 0001_01_F_01.json, ..., 0001_01_F_08.json

    The ".json" extension is matched case-insensitively. The text after the
    last underscore must be exactly one of the ASCII strings "00".."08".
    Comparing against the literal suffixes (instead of isdigit() + int())
    means a name whose suffix consists of non-ASCII digit characters
    (e.g. superscripts, for which int() raises ValueError) is rejected with
    False rather than crashing the caller.
    """
    if not name.lower().endswith(".json"):
        return False
    stem = name[:-5]  # drop ".json"
    # Split on the LAST underscore; no underscore at all means no suffix.
    _, sep, suffix = stem.rpartition("_")
    if not sep:
        return False
    return suffix in ("00", "01", "02", "03", "04", "05", "06", "07", "08")


def extract_row(json_data: Dict[str, Any], label_group: str, subdir: Path, file_path: Path) -> Dict[str, Any]:
    """
    Build one CSV row (a dict) from a parsed label JSON.

    - suffix "00": fills skin_type / sensitive from info.*
    - any other suffix: fills forehead_pigmentation / forehead_wrinkle
      from annotations.*
    Common metadata on every row: label_group, subdir_name, json_file,
    json_suffix, image_id (info.id), gender, age, image_filename
    (info.filename). Value columns that do not apply, or whose key is
    absent, stay as an empty string.

    NOTE(review): only forehead_* keys are read for every non-00 suffix; if
    files 02..08 carry differently named annotation keys, those rows come
    out blank — confirm against the data.
    """
    # Text after the last underscore of the file stem, e.g. "00".."08"
    suffix = file_path.stem.rsplit("_", 1)[-1]

    row: Dict[str, Any] = {
        "label_group": label_group,      # "01"/"02"/"03"
        "subdir_name": subdir.name,      # name of the sub-directory holding the file
        "json_file": file_path.name,     # JSON file name
        "json_suffix": suffix,
        "image_id": safe_get(json_data, "info.id"),
        "gender": safe_get(json_data, "info.gender"),
        "age": safe_get(json_data, "info.age"),
        "image_filename": safe_get(json_data, "info.filename"),
        # Filled below depending on the suffix; blank otherwise
        "skin_type": "",
        "sensitive": "",
        "forehead_pigmentation": "",
        "forehead_wrinkle": "",
    }

    if suffix == "00":
        sources = {
            "skin_type": "info.skin_type",
            "sensitive": "info.sensitive",
        }
    else:
        sources = {
            "forehead_pigmentation": "annotations.forehead_pigmentation",
            "forehead_wrinkle": "annotations.forehead_wrinkle",
        }
    for column, dotted_path in sources.items():
        row[column] = safe_get(json_data, dotted_path, "")

    return row


def process_one_label_group(label_group: str, label_root: Path) -> Path:
    """
    Export the wanted JSON files of label/<label_group> to a single CSV.

    Every sub-directory of the group directory is visited in sorted order;
    inside each, the "*.json" files accepted by want_this_file() are read
    in sorted order and turned into one CSV row each via extract_row().
    Files that fail to parse or process are reported with a [WARN] line and
    skipped.

    Returns the path of the CSV that was written
    (<group_dir>/<label_group>.csv).
    Raises FileNotFoundError when the group directory does not exist.
    """
    group_dir = label_root / label_group
    if not group_dir.exists():
        raise FileNotFoundError(f"[ERROR] 디렉토리가 존재하지 않습니다: {group_dir}")

    out_csv = group_dir / f"{label_group}.csv"
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    # CSV header: metadata columns first, then the extracted value columns
    meta_columns = [
        "label_group", "subdir_name", "json_file", "json_suffix",
        "image_id", "gender", "age", "image_filename",
    ]
    value_columns = [
        "skin_type", "sensitive",
        "forehead_pigmentation", "forehead_wrinkle",
    ]
    fieldnames = meta_columns + value_columns

    count_files = 0
    count_rows = 0

    with open(out_csv, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        # Sub-directories of the group, in sorted order
        subdirs = [entry for entry in group_dir.iterdir() if entry.is_dir()]
        subdirs.sort()
        for subdir in subdirs:
            # Only the JSON files whose name matches the wanted pattern
            wanted = sorted(p for p in subdir.glob("*.json") if want_this_file(p.name))
            for json_path in wanted:
                count_files += 1
                try:
                    with open(json_path, "r", encoding="utf-8") as jf:
                        data = json.load(jf)
                    writer.writerow(extract_row(data, label_group, subdir, json_path))
                except json.JSONDecodeError:
                    print(f"[WARN] JSON 파싱 실패: {json_path}")
                except Exception as err:
                    print(f"[WARN] 처리 중 예외 발생: {json_path} -> {err}")
                else:
                    count_rows += 1

    print(f"[INFO] 그룹 {label_group}: 처리 파일 {count_files}개, 기록 행 {count_rows}개 → {out_csv}")
    return out_csv


def main():
    """
    Print the configured roots, then export every group in LABEL_GROUPS.

    A failure in one group is reported with an [ERROR] line and does not
    stop the remaining groups.
    """
    print(f"[INFO] 데이터 루트: {DATASET_ROOT}")
    print(f"[INFO] 라벨 루트:  {LABEL_ROOT}")

    for group_name in LABEL_GROUPS:
        try:
            process_one_label_group(group_name, LABEL_ROOT)
        except Exception as err:
            print(f"[ERROR] 그룹 {group_name} 처리 실패: {err}")


# Run the export only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


In [3]:
### 아래가 내 프롬프트

  File "/tmp/ipython-input-128936180.py", line 2
    현재 생성된 파일을 보면 label_group,subdir_name,json_file,json_suffix,image_id,gender,age,image_filename,skin_type,sensitive,forehead_pigmentation,forehead_wrinkle 01,0001,0001_01_F_00.json,00,0001,F,55,0001_01_F.jpg,3,0,, 01,0001,0001_01_F_01.json,01,0001,F,55,0001_01_F.jpg,,,1,3 01,0001,0001_01_F_02.json,02,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_03.json,03,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_04.json,04,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_05.json,05,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_06.json,06,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_07.json,07,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_08.json,08,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_Fb_00.json,00,0001,F,55,0001_01_Fb.jpg,3,0,, 01,0001,0001_01_Fb_01.json,01,0001,F,55,0001_01_Fb.jpg,,,1,3 01,0001,0001_01_Fb_02.json,02,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_03.json,03,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_04.json,04,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_05.json,05,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_06.json,06,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_07.json,07,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Fb_08.json,08,0001,F,55,0001_01_Fb.jpg,,,, 01,0001,0001_01_Ft_00.json,00,0001,F,55,0001_01_Ft.jpg,3,0,, 01,0001,0001_01_Ft_01.json,01,0001,F,55,0001_01_Ft.jpg,,,1,3 01,0001,0001_01_Ft_02.json,02,0001,F,55,0001_01_Ft.jpg,,,, 01,0001,0001_01_Ft_03.json,03,0001,F,55,0001_01_Ft.jpg,,,, 01,0001,0001_01_Ft_04.json,04,0001,F,55,...
       이렇게 나오는데 정보손실이 있는거 같아                                                                                                                                                         


In [None]:
import os
import json
import csv
from pathlib import Path

# ====== Settings ======
# Dataset root folder, its "label" sub-folder, and the group folders under it.
DATASET_ROOT = Path(r"C:\Users\user\Desktop\dataset")
LABEL_ROOT = DATASET_ROOT / "label"
LABEL_GROUPS = ["01", "02", "03"]

def safe_get(d, path, default=None):
    """
    Follow the dotted ``path`` (e.g. "info.id") through nested dicts.

    Returns the value found at the end of the path, or ``default`` if any
    step hits a non-dict or a missing key.
    """
    value = d
    for name in path.split("."):
        if isinstance(value, dict) and name in value:
            value = value[name]
        else:
            break
    else:
        # Every path component resolved.
        return value
    return default

def parse_name_and_suffix(json_path: Path):
    """
    Split a file stem at its last underscore into (base, suffix).

    0001_01_F_00.json -> base='0001_01_F', suffix='00'

    Returns (None, None) when the stem has no underscore or the part after
    the last underscore is not exactly two digit characters.
    """
    base, sep, suffix = json_path.stem.rpartition("_")
    if sep and len(suffix) == 2 and suffix.isdigit():
        return base, suffix
    return None, None

def want_this_file(name: str) -> bool:
    """
    Tell whether ``name`` is one of the wanted *_00.json .. *_08.json files.

    The extension check ignores case. The suffix after the last underscore
    must be the ASCII character "0" followed by one of "0".."8". Checking
    the characters directly avoids the ValueError that isdigit() + int()
    raises for non-ASCII digit characters such as superscripts; such names
    are simply not wanted.
    """
    if not name.lower().endswith(".json"):
        return False
    stem = name[:-5]  # strip the 5-character ".json" extension
    parts = stem.rsplit("_", 1)
    if len(parts) != 2:
        return False
    suf = parts[1]
    # len check first so the indexing below is always safe
    return len(suf) == 2 and suf[0] == "0" and suf[1] in "012345678"

def process_group(label_group: str):
    """
    Aggregate the wanted JSON files of one label group into a wide CSV.

    One output row is kept per (sub-directory name, base file name). The
    "00" file contributes skin_type / sensitive; files "01".."08" contribute
    forehead_pigmentation_<suffix> / forehead_wrinkle_<suffix>. The result
    is written to <LABEL_ROOT>/<label_group>/<label_group>_wide.csv.

    Prints a [WARN] line and returns without writing anything when the
    group directory does not exist. JSON files that cannot be read are
    reported and skipped.
    """
    group_dir = LABEL_ROOT / label_group
    if not group_dir.exists():
        print(f"[WARN] group not found: {group_dir}")
        return

    part_suffixes = [f"{i:02d}" for i in range(1, 9)]  # "01".."08"
    # (row column, dotted JSON path) for the metadata shared by all files
    meta_sources = (
        ("image_id", "info.id"),
        ("gender", "info.gender"),
        ("age", "info.age"),
        ("image_filename", "info.filename"),
    )

    # Accumulator: key=(subdir_name, base_name) -> one wide row
    rows = {}

    subdirs = sorted(p for p in group_dir.iterdir() if p.is_dir())
    for subdir in subdirs:
        candidates = sorted(p for p in subdir.glob("*.json") if want_this_file(p.name))
        for jp in candidates:
            base, suf = parse_name_and_suffix(jp)
            if base is None:
                continue
            key = (subdir.name, base)

            # The row is registered before the file is read, so an
            # unreadable JSON still leaves a (blank) row behind.
            record = rows.get(key)
            if record is None:
                record = {
                    "label_group": label_group,
                    "subdir_name": subdir.name,
                    "base_name": base,          # e.g. 0001_01_F
                    "image_id": "",
                    "gender": "",
                    "age": "",
                    "image_filename": "",
                    "skin_type": "",
                    "sensitive": "",
                }
                # Blank per-suffix columns
                for s in part_suffixes:
                    record[f"forehead_pigmentation_{s}"] = ""
                    record[f"forehead_wrinkle_{s}"] = ""
                rows[key] = record

            try:
                with open(jp, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except Exception as e:
                print(f"[WARN] JSON read error: {jp} -> {e}")
                continue

            # Shared metadata: only filled while the stored value is falsy
            for column, dotted_path in meta_sources:
                if not record[column]:
                    record[column] = safe_get(data, dotted_path, "")

            if suf == "00":
                # _00 -> skin_type / sensitive (keeps the old value if the key is absent)
                for column in ("skin_type", "sensitive"):
                    record[column] = safe_get(data, f"info.{column}", record[column])
            else:
                # _01.._08 -> annotations.forehead_pigmentation / forehead_wrinkle;
                # an absent key leaves the existing cell untouched
                for field in ("forehead_pigmentation", "forehead_wrinkle"):
                    value = safe_get(data, f"annotations.{field}", "")
                    if value != "":
                        record[f"{field}_{suf}"] = value

    # CSV header
    fieldnames = [
        "label_group", "subdir_name", "base_name",
        "image_id", "gender", "age", "image_filename",
        "skin_type", "sensitive",
    ]
    fieldnames += [f"forehead_pigmentation_{s}" for s in part_suffixes]
    fieldnames += [f"forehead_wrinkle_{s}" for s in part_suffixes]

    out_csv = group_dir / f"{label_group}_wide.csv"
    with open(out_csv, "w", newline="", encoding="utf-8-sig") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows[key] for key in sorted(rows))

    print(f"[INFO] 그룹 {label_group}: {len(rows)}행 출력 → {out_csv}")

def main():
    """Run process_group() for every entry of LABEL_GROUPS, in order."""
    for group_name in LABEL_GROUPS:
        process_group(group_name)


# Run only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


label_group,subdir_name,json_file,json_suffix,image_id,gender,age,image_filename,skin_type,sensitive,forehead_pigmentation,forehead_wrinkle 여기 나와있는데로 정보를 얻고싶어 01,0001,0001_01_F_00.json,00,0001,F,55,0001_01_F.jpg,3,0,, 01,0001,0001_01_F_01.json,01,0001,F,55,0001_01_F.jpg,,,1,3 01,0001,0001_01_F_02.json,02,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_03.json,03,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_04.json,04,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_05.json,05,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_06.json,06,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_07.json,07,0001,F,55,0001_01_F.jpg,,,, 01,0001,0001_01_F_08.json,08,0001,F,55,0001_01_F.jpg,,,, 근데 이런식으로 생략된것도 있는거 같아

In [None]:

def iter_json_files(subdir: Path):
    """
    Yield each entry of ``subdir`` whose name ends in ".json", any letter case.

    A single glob with per-letter character classes is used instead of one
    pass for "*.json" and another for "*.JSON": on a case-insensitive
    filesystem both of those patterns match the same files, so every path
    was yielded twice. The single pattern yields each path once and also
    accepts mixed-case extensions such as ".Json".

    Args:
        subdir: directory to scan (non-recursive).

    Yields:
        Path objects, in the order the glob produces them (callers that
        need a stable order should sort).
    """
    yield from subdir.glob("*.[jJ][sS][oO][nN]")

def want_this_file(name: str) -> bool:
    """
    Return True for *_00.json ~ *_08.json (extension case-insensitive).

    The part after the last underscore is compared against the literal
    ASCII suffixes "00".."08". The previous isdigit() + int() check raised
    ValueError for suffixes made of non-ASCII digit characters (e.g.
    superscripts); those names now just return False.
    """
    lower = name.lower()
    if not lower.endswith(".json"):
        return False
    stem = lower[:-5]  # drop ".json"
    parts = stem.rsplit("_", 1)
    if len(parts) != 2:
        return False
    wanted_suffixes = ("00", "01", "02", "03", "04", "05", "06", "07", "08")
    return parts[1] in wanted_suffixes


In [None]:
# Collect the JSON files of `subdir` (either extension case) whose names pass want_this_file().
# NOTE(review): `subdir` is not defined in this cell; presumably this line is meant to replace
# the matching list comprehension inside the per-sub-directory loop — confirm.
json_files = [p for p in iter_json_files(subdir) if want_this_file(p.name)]


In [None]:
import os, json, csv
from pathlib import Path

# Root folder of the dataset (Windows path).
DATASET_ROOT = Path(r"C:\Users\user\Desktop\dataset")
# Folder holding the per-group label directories.
LABEL_ROOT = DATASET_ROOT / "label"
# Group sub-folder names that main() iterates over.
LABEL_GROUPS = ["01", "02", "03"]

def safe_get(d, path, default=None, _exist_flag=False):
    """
    Resolve a dotted ``path`` through nested dicts.

    Returns the value at the end of the path, or ``default`` when a step
    hits a non-dict or a missing key. With ``_exist_flag=True`` the result
    is a ``(value, found)`` tuple, which lets callers tell a missing key
    (found is False) apart from a key that is present with a null value.
    """
    node, found = d, True
    for part in path.split("."):
        if isinstance(node, dict) and part in node:
            node = node[part]
            continue
        node, found = default, False
        break
    if _exist_flag:
        return node, found
    return node

def iter_json_files(subdir: Path):
    """
    Yield every entry of ``subdir`` whose name ends in ".json" in any case.

    The original two passes ("*.json" then "*.JSON") returned each file
    twice on a case-insensitive filesystem, because both patterns match the
    same names there; downstream that means duplicated CSV rows. One glob
    with character classes visits the directory once, so each path is
    yielded exactly once, and mixed-case extensions are accepted too.

    Args:
        subdir: directory to scan (non-recursive).

    Yields:
        Matching Path objects in glob order; sort if order matters.
    """
    for candidate in subdir.glob("*.[jJ][sS][oO][nN]"):
        yield candidate

def want_this_file(name: str) -> bool:
    """
    Decide whether ``name`` is a wanted *_00.json .. *_08.json file.

    Matching is case-insensitive for the extension. The suffix after the
    last underscore must be ASCII "0" followed by an ASCII digit 0-8.
    Non-ASCII digit characters, which pass str.isdigit() but make int()
    raise ValueError, are rejected with False instead of raising.
    """
    lower = name.lower()
    if not lower.endswith(".json"):
        return False
    # Everything before ".json", split at the last underscore.
    _, sep, suf = lower[:-5].rpartition("_")
    if not sep or len(suf) != 2:
        return False
    return suf[0] == "0" and suf[1] in "012345678"

def process_one_label_group(label_group: str, label_root: Path) -> None:
    """
    Export one label group to <group>.csv and log data gaps to <group>_audit.csv.

    Every sub-directory of ``label_root / label_group`` is visited in sorted
    order. Each JSON file accepted by want_this_file() produces one row in
    the main CSV:
      - suffix "00": skin_type / sensitive, read from ``info``
      - other suffixes: forehead_pigmentation / forehead_wrinkle, read from
        ``annotations``
    A wanted key that is absent is audited as "missing_key"; a key that is
    present but null is audited as "null_value". In both cases the CSV cell
    is left empty. A file that cannot be read or parsed is audited as
    "json_read_error" and gets no row in the main CSV.

    Changes from the previous version:
      - each JSON file is opened in a ``with`` block, so its handle is
        closed right away instead of being left to the garbage collector;
      - candidate paths are de-duplicated before sorting, so a path yielded
        more than once by iter_json_files() is written only once;
      - the four copy-pasted audit stanzas are driven by one table.

    Args:
        label_group: name of the group folder, also used for the CSV names.
        label_root: folder that contains the group folders.

    Raises:
        OSError: if either CSV cannot be created (for example when the
            group directory does not exist).
    """
    group_dir = label_root / label_group
    out_csv = group_dir / f"{label_group}.csv"
    audit_csv = group_dir / f"{label_group}_audit.csv"

    fieldnames = [
        "label_group", "subdir_name", "json_file", "json_suffix",
        "image_id", "gender", "age", "image_filename",
        "skin_type", "sensitive", "forehead_pigmentation", "forehead_wrinkle",
    ]

    audit_fields = [
        "label_group", "subdir_name", "json_file", "json_suffix",
        "reason", "detail_key",
    ]

    # (CSV column, dotted JSON path) pairs extracted from each kind of file.
    fields_for_00 = (
        ("skin_type", "info.skin_type"),
        ("sensitive", "info.sensitive"),
    )
    fields_for_parts = (
        ("forehead_pigmentation", "annotations.forehead_pigmentation"),
        ("forehead_wrinkle", "annotations.forehead_wrinkle"),
    )

    with open(out_csv, "w", newline="", encoding="utf-8-sig") as f, \
         open(audit_csv, "w", newline="", encoding="utf-8-sig") as af:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        aw = csv.DictWriter(af, fieldnames=audit_fields)
        aw.writeheader()

        for subdir in sorted(p for p in group_dir.iterdir() if p.is_dir()):
            # The set drops repeated paths before sorting.
            wanted = sorted({p for p in iter_json_files(subdir) if want_this_file(p.name)})
            for jp in wanted:
                suffix = jp.stem.rsplit("_", 1)[-1]
                # Columns that identify the file in both the main and the audit CSV.
                ident = {
                    "label_group": label_group,
                    "subdir_name": subdir.name,
                    "json_file": jp.name,
                    "json_suffix": suffix,
                }
                row = {
                    **ident,
                    "image_id": "", "gender": "", "age": "", "image_filename": "",
                    "skin_type": "", "sensitive": "",
                    "forehead_pigmentation": "", "forehead_wrinkle": "",
                }

                try:
                    with open(jp, "r", encoding="utf-8") as jf:
                        data = json.load(jf)
                except Exception as e:
                    # Best-effort batch job: record the failure and move on.
                    aw.writerow({**ident, "reason": "json_read_error", "detail_key": str(e)})
                    continue

                row["image_id"] = safe_get(data, "info.id", "")
                row["gender"] = safe_get(data, "info.gender", "")
                row["age"] = safe_get(data, "info.age", "")
                row["image_filename"] = safe_get(data, "info.filename", "")

                targets = fields_for_00 if suffix == "00" else fields_for_parts
                for column, key_path in targets:
                    val, ok = safe_get(data, key_path, "", _exist_flag=True)
                    row[column] = "" if val is None else val
                    if not ok:
                        aw.writerow({**ident, "reason": "missing_key", "detail_key": key_path})
                    elif val is None:
                        aw.writerow({**ident, "reason": "null_value", "detail_key": key_path})

                w.writerow(row)

def main():
    """Export every group listed in LABEL_GROUPS, one after another."""
    for group_name in LABEL_GROUPS:
        process_one_label_group(group_name, LABEL_ROOT)


# Run only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()
