In [1]:
from google.colab import drive

# Mount Google Drive so the dataset paths under /content/drive are reachable
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# -*- coding: utf-8 -*-
"""
SNOMED CT RF2 (Snapshot_INT_YYYYMMDD) -> (concept, description) pairs -> Qwen3-8B SFT JSONL
- Auto-discovers the release files (files with the same release date are preferred)
- Uses active rows only (active==1)
- Picks the Preferred term first, based on the language refset
- Uses the TextDefinition as the description when present, otherwise a short default sentence
- Stores the synonyms alongside in meta (optional)
"""

from pathlib import Path
import pandas as pd
import re, json

# ==================== User settings ====================
BASE = Path("/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot")
OUT_JSONL = "/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/snomed_concept_description_qwen.jsonl"

# Preferred language refset ID (US English=900000000000509007, GB English=900000000000508004)
# To use a Korean refset, replace this with the corresponding refsetId.
REFSET_LANG_ID = 900000000000509007  # US English
# ====================================================

def pick_first(paths):
    """Return the first item produced by *paths*, or ``None`` if there is none.

    Only the first element is pulled from the iterable, so a lazy generator
    such as ``Path.rglob(...)`` is not walked to exhaustion just to pick one hit.
    """
    return next(iter(paths), None)

def find_release_files(base: Path):
    """Locate the RF2 snapshot files this script needs underneath *base*.

    The Concept snapshot is found first and its file name supplies the release
    date (YYYYMMDD). For the Description, TextDefinition and Language refset
    files, a file carrying that same date is preferred; a file with any other
    date is used only when no same-date file exists.

    Args:
        base: Directory that is searched recursively.

    Returns:
        Tuple ``(concept_fp, desc_fp, textdef_fp, lang_fp)``. ``textdef_fp`` is
        ``None`` when no TextDefinition file is found.

    Raises:
        FileNotFoundError: No Concept, Description or Language refset file exists.
        RuntimeError: The Concept file name does not end in ``_<8 digits>.txt``.
    """
    # Concept
    concept_fp = pick_first(base.rglob("Terminology/sct2_Concept_Snapshot_INT_*.txt")) \
              or pick_first(base.rglob("**/sct2_Concept_Snapshot_INT_*.txt"))
    if not concept_fp:
        raise FileNotFoundError("Concept Snapshot 파일을 찾지 못했습니다.")
    print("[Concept]", concept_fp)

    m = re.search(r"_(\d{8})\.txt$", concept_fp.name)
    if not m:
        raise RuntimeError("Concept 파일명에서 날짜(YYYYMMDD)를 추출하지 못했습니다.")
    date_str = m.group(1)
    print("[DATE]", date_str)

    # Description (same release date first)
    desc_fp = pick_first(base.rglob(f"Terminology/sct2_Description_Snapshot-en_INT_{date_str}.txt")) \
           or pick_first(base.rglob(f"**/sct2_Description_Snapshot-en_INT_{date_str}.txt")) \
           or pick_first(base.rglob("**/sct2_Description_Snapshot-en_INT_*.txt"))
    if not desc_fp:
        raise FileNotFoundError("Description Snapshot 파일을 찾지 못했습니다.")
    print("[Description]", desc_fp)

    # TextDefinition (optional)
    textdef_fp = pick_first(base.rglob(f"Terminology/sct2_TextDefinition_Snapshot-en_INT_{date_str}.txt")) \
              or pick_first(base.rglob(f"**/sct2_TextDefinition_Snapshot-en_INT_{date_str}.txt")) \
              or pick_first(base.rglob("**/sct2_TextDefinition_Snapshot-en_INT_*.txt"))
    print("[TextDefinition]", textdef_fp if textdef_fp else "없음")

    # Language refset. The patterns are ordered from most to least specific and
    # the search stops at the first pattern that matches anything, so a file of
    # the same release date always wins over a file of another date. (Pooling
    # the hits of every pattern and sorting by name would pick the
    # alphabetically first file, i.e. the oldest date.)
    lang_patterns = [
        f"Refset/Language/der2_cRefset_Language*{date_str}.txt",
        f"**/der2_cRefset_Language*{date_str}.txt",
        "Refset/Language/der2_cRefset_Language*.txt",
        "**/der2_cRefset_Language*.txt",
    ]
    for pat in lang_patterns:
        # Sort so the choice inside one priority tier is deterministic.
        hits = sorted(set(base.rglob(pat)), key=lambda p: (p.name, str(p)))
        if hits:
            # Several language files may exist. Rows are filtered by the chosen
            # refset id after loading, so any one file of the tier is taken here.
            lang_fp = hits[0]
            break
    else:
        raise FileNotFoundError("Language refset 파일을 찾지 못했습니다.")
    print("[Language(any)]", lang_fp.parent)

    return concept_fp, desc_fp, textdef_fp, lang_fp

def strip_semantic_tag(t: str) -> str:
    """Drop a trailing parenthesised semantic tag from a term.

    Example: "Neoplasm of jejunum (disorder)" -> "Neoplasm of jejunum".
    A term that does not end in " (<tag>)" is returned unchanged.
    """
    tagged = re.match(r"^(.*)\s\([^)]+\)$", t)
    if tagged is None:
        return t
    return tagged.group(1)

def main():
    """Build the concept -> description SFT dataset and write it as JSONL.

    Pipeline:
      1. Locate the release files with ``find_release_files(BASE)``.
      2. Load them and keep active rows only (``active == 1``).
      3. Pick one label per concept: the synonym with the best acceptability in
         the language refset selected by ``REFSET_LANG_ID``, otherwise the FSN
         with its trailing semantic tag removed.
      4. Left-join the text definitions and a de-duplicated synonym list.
      5. Write one chat-format record per row to ``OUT_JSONL``; a row without a
         definition gets a short templated sentence as the assistant turn.
    """
    import csv  # function-scope import: only the RF2 reader below needs it

    concept_fp, desc_fp, textdef_fp, lang_fp = find_release_files(BASE)

    # Columns shared by every RF2 table.
    dtype_common = {"id": "string", "effectiveTime": "string", "active": "int8", "moduleId": "string"}
    # Description and TextDefinition tables are read with the same column types.
    # The description id stays a string (join key); conceptId is numeric.
    desc_dtypes = {
        "conceptId": "int64",
        "languageCode": "string",
        "typeId": "string",
        "term": "string",
        "caseSignificanceId": "string",
    }

    def read_active(fp, extra_dtypes):
        """Read one RF2 table and return only its active rows."""
        table = pd.read_csv(
            fp,
            sep="\t",
            dtype={**dtype_common, **extra_dtypes},
            # RF2 is plain tab-delimited text without quoting, so a '"' inside
            # a term is data; default CSV quoting could swallow following fields.
            quoting=csv.QUOTE_NONE,
            # Keep terms that spell a pandas NA marker (e.g. "None", "NA",
            # "null") as text instead of turning them into missing values.
            keep_default_na=False,
            na_filter=False,
        )
        return table[table["active"] == 1]

    print("Loading Concepts...")
    concepts = (
        read_active(concept_fp, {"definitionStatusId": "string"})[["id"]]
        .rename(columns={"id": "conceptId"})
        .astype({"conceptId": "int64"})
    )
    descs = read_active(desc_fp, desc_dtypes)
    # Language refset: every id column is kept as a string.
    lang_all = read_active(
        lang_fp,
        {"refsetId": "string", "referencedComponentId": "string", "acceptabilityId": "string"},
    )
    if textdef_fp:
        textdefs = read_active(textdef_fp, desc_dtypes)
    else:
        textdefs = pd.DataFrame(columns=["conceptId", "term", "typeId"])

    # === RF2 constants, all compared as strings ===
    TYPEID_FSN = "900000000000003001"
    TYPEID_SYNONYM = "900000000000013009"
    TYPEID_TEXT_DEF = "900000000000550004"

    ACC_PREFERRED = "900000000000548007"
    ACC_ACCEPTABLE = "900000000000549004"

    # Use the module-level setting (int or str) rather than a hard-coded local
    # copy, so editing REFSET_LANG_ID in the settings section takes effect.
    refset_id = str(REFSET_LANG_ID)

    # Keep only the rows of the selected language refset.
    lang = lang_all.loc[
        lang_all["refsetId"] == refset_id,
        ["referencedComponentId", "acceptabilityId"],
    ].rename(columns={"referencedComponentId": "descriptionId"})

    if lang.empty:
        print(f"[WARN] 선택한 REFSET_LANG_ID={refset_id} 에 해당하는 레코드가 없습니다.")
        print("       der2_cRefset_Language*.txt 내 refsetId 분포를 확인해 올바른 refsetId로 교체하세요.")
        # Processing can continue, but acceptability can no longer rank synonyms.

    # === Label selection ===
    # Synonyms ranked by acceptability: Preferred (2) > Acceptable (1) > not in refset (0).
    syn = descs.loc[descs["typeId"] == TYPEID_SYNONYM, ["id", "conceptId", "term"]] \
               .rename(columns={"id": "descriptionId"})
    syn = syn.merge(lang, on="descriptionId", how="left")  # no refset row -> NaN
    syn["acc_score"] = syn["acceptabilityId"].map({ACC_PREFERRED: 2, ACC_ACCEPTABLE: 1}).fillna(0).astype(int)
    syn_ranked = syn.sort_values(["conceptId", "acc_score"], ascending=[True, False])
    preferred_syn = syn_ranked.groupby("conceptId", as_index=False).first()[["conceptId", "term"]]
    preferred_syn = preferred_syn.rename(columns={"term": "label"})

    # FSN fallback. One row per concept, so the joins below cannot duplicate a
    # concept merely because it carries more than one active FSN row.
    fsn = descs.loc[descs["typeId"] == TYPEID_FSN, ["conceptId", "term"]]
    fsn = fsn.assign(fsn_clean=fsn["term"].map(strip_semantic_tag))[["conceptId", "fsn_clean"]]
    fsn = fsn.drop_duplicates("conceptId")

    label_df = concepts.merge(preferred_syn, on="conceptId", how="left") \
                       .merge(fsn, on="conceptId", how="left")
    # label = best-ranked synonym, else the cleaned FSN
    label_df["label"] = label_df["label"].fillna(label_df["fsn_clean"])
    label_df = label_df.drop(columns=["fsn_clean"])

    # === Definitions (attached when available) ===
    if not textdefs.empty:
        defs = textdefs.loc[textdefs["typeId"] == TYPEID_TEXT_DEF, ["conceptId", "term"]] \
                       .rename(columns={"term": "definition"})
    else:
        # Typed empty frame so the join key dtype matches label_df.
        defs = pd.DataFrame({
            "conceptId": pd.Series(dtype="int64"),
            "definition": pd.Series(dtype="string"),
        })

    pairs = label_df.merge(defs, on="conceptId", how="left")

    # === Synonym list per concept ===
    syn_valid = syn.assign(term=syn["term"].astype("string").str.strip())       # trim whitespace
    syn_valid = syn_valid[syn_valid["term"].notna() & (syn_valid["term"] != "")]  # drop NA / empty

    # dict.fromkeys removes duplicates while keeping first-seen order.
    syn_list = (
        syn_valid.groupby("conceptId")["term"]
        .agg(lambda col: list(dict.fromkeys(col.tolist())))
        .reset_index(name="synonyms")
    )

    pairs = pairs.merge(syn_list, on="conceptId", how="left")

    # === Post-processing ===
    pairs["label"] = pairs["label"].fillna("").astype(str).str.strip()
    pairs["definition"] = pairs["definition"].fillna("").astype(str).str.strip()
    # Concepts without any synonym come out of the left join as NaN -> use [].
    pairs["synonyms"] = pairs["synonyms"].apply(lambda x: x if isinstance(x, list) else [])

    # Drop rows that ended up without a label (anomalies).
    pairs = pairs[pairs["label"] != ""].copy()

    print("Total active concepts kept:", len(pairs))

    # === Save as JSONL for Qwen3-8B ===
    # messages: user=label, assistant=definition (or the fallback sentence)
    # meta: conceptId, synonyms
    n_written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        # Iterating the columns in lockstep avoids building a Series per row.
        rows = zip(pairs["conceptId"], pairs["label"], pairs["definition"], pairs["synonyms"])
        for concept_id, concept, definition, synonyms in rows:
            desc = definition if definition else f"{concept}에 대한 의학 개념입니다."
            rec = {
                "messages": [
                    {"role": "user", "content": concept},
                    {"role": "assistant", "content": desc}
                ],
                "meta": {
                    "conceptId": int(concept_id),
                    "synonyms": synonyms
                }
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_written += 1

    print(f"Saved JSONL: {OUT_JSONL} (rows={n_written})")

# Run the export when this cell/module is executed as the entry point.
if __name__ == "__main__":
    main()


[Concept] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_Concept_Snapshot_INT_20251001.txt
[DATE] 20251001
[Description] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_Description_Snapshot-en_INT_20251001.txt
[TextDefinition] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_TextDefinition_Snapshot-en_INT_20251001.txt
[Language(any)] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Refset/Language
Loading Concepts...
Total active concepts kept: 380837
Saved JSONL: /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/snomed_concept_description_qwen.jsonl (rows=

In [None]:
# JSONL file to inspect (same path as OUT_JSONL of the export script above)
jsonl_path = "/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/snomed_concept_description_qwen.jsonl"

In [None]:
import pandas as pd
# Each line of the file is one JSON record, hence lines=True.
df = pd.read_json(jsonl_path, lines=True)

# Print the first 10 rows as plain text, without the index column
print(df.head(10).to_string(index=False))


                                                                                                                                                                           messages                                                                                                                                                                          meta
                                                          [{'role': 'user', 'content': 'Quilonia ethiopica'}, {'role': 'assistant', 'content': 'Quilonia ethiopica에 대한 의학 개념입니다.'}]                                                                                                                     {'conceptId': 101009, 'synonyms': ['Quilonia ethiopica']}
                                                        [{'role': 'user', 'content': 'Hemoglobin Okaloosa'}, {'role': 'assistant', 'content': 'Hemoglobin Okaloosa에 대한 의학 개념입니다.'}]                                                                     {'conceptId': 102002, 'synonyms': ['Hemoglob

# term-definition 쌍으로 된 JSONL 생성 코드 (conceptId/동의어 제외)

In [2]:
# -*- coding: utf-8 -*-
"""
SNOMED CT RF2 (Snapshot_INT_YYYYMMDD) -> (term, definition) JSONL for SFT
- conceptId / synonyms are left out
- Uses active rows only (active==1)
- Picks the Preferred term first, based on the language refset (US English by default)
- Uses the TextDefinition as the definition when present
- Otherwise applies the missing-definition policy (MISSING_DEFINITION_POLICY): 'drop' or 'fallback'
"""

from pathlib import Path
import pandas as pd
import re, json

# ==================== User settings ====================
BASE = Path("/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot")
OUT_JSONL = "/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/snomed_term_definition_only.jsonl"

# Language refset ID (US English=900000000000509007, GB English=900000000000508004)
REFSET_LANG_ID = "900000000000509007"  # keep this as a string

# Policy for concepts without a definition: 'drop' or 'fallback'
MISSING_DEFINITION_POLICY = "drop"   # switch to 'fallback' to fill in the template instead
FALLBACK_TEMPLATE = "{term}: a SNOMED CT clinical concept. No detailed text definition was found in the release."
# ====================================================

def pick_first(paths):
    """Return the first item produced by *paths*, or ``None`` if there is none.

    Only the first element is pulled from the iterable, so a lazy generator
    such as ``Path.rglob(...)`` is not walked to exhaustion just to pick one hit.
    """
    return next(iter(paths), None)

def find_release_files(base: Path):
    """Locate the RF2 snapshot files this script needs underneath *base*.

    The Concept snapshot is found first and its file name supplies the release
    date (YYYYMMDD). For the Description, TextDefinition and Language refset
    files, a file carrying that same date is preferred; a file with any other
    date is used only when no same-date file exists.

    Args:
        base: Directory that is searched recursively.

    Returns:
        Tuple ``(concept_fp, desc_fp, textdef_fp, lang_fp)``. ``textdef_fp`` is
        ``None`` when no TextDefinition file is found.

    Raises:
        FileNotFoundError: No Concept, Description or Language refset file exists.
        RuntimeError: The Concept file name does not end in ``_<8 digits>.txt``.
    """
    # Concept
    concept_fp = pick_first(base.rglob("Terminology/sct2_Concept_Snapshot_INT_*.txt")) \
              or pick_first(base.rglob("**/sct2_Concept_Snapshot_INT_*.txt"))
    if not concept_fp:
        raise FileNotFoundError("Concept Snapshot 파일을 찾지 못했습니다.")
    print("[Concept]", concept_fp)

    m = re.search(r"_(\d{8})\.txt$", concept_fp.name)
    if not m:
        raise RuntimeError("Concept 파일명에서 날짜(YYYYMMDD)를 추출하지 못했습니다.")
    date_str = m.group(1)
    print("[DATE]", date_str)

    # Description (same release date first)
    desc_fp = pick_first(base.rglob(f"Terminology/sct2_Description_Snapshot-en_INT_{date_str}.txt")) \
           or pick_first(base.rglob(f"**/sct2_Description_Snapshot-en_INT_{date_str}.txt")) \
           or pick_first(base.rglob("**/sct2_Description_Snapshot-en_INT_*.txt"))
    if not desc_fp:
        raise FileNotFoundError("Description Snapshot 파일을 찾지 못했습니다.")
    print("[Description]", desc_fp)

    # TextDefinition (optional)
    textdef_fp = pick_first(base.rglob(f"Terminology/sct2_TextDefinition_Snapshot-en_INT_{date_str}.txt")) \
              or pick_first(base.rglob(f"**/sct2_TextDefinition_Snapshot-en_INT_{date_str}.txt")) \
              or pick_first(base.rglob("**/sct2_TextDefinition_Snapshot-en_INT_*.txt"))
    print("[TextDefinition]", textdef_fp if textdef_fp else "없음")

    # Language refset. The patterns are ordered from most to least specific and
    # the search stops at the first pattern that matches anything, so a file of
    # the same release date always wins over a file of another date. (Pooling
    # the hits of every pattern and sorting by name would pick the
    # alphabetically first file, i.e. the oldest date.)
    lang_patterns = [
        f"Refset/Language/der2_cRefset_Language*{date_str}.txt",
        f"**/der2_cRefset_Language*{date_str}.txt",
        "Refset/Language/der2_cRefset_Language*.txt",
        "**/der2_cRefset_Language*.txt",
    ]
    for pat in lang_patterns:
        # Sort so the choice inside one priority tier is deterministic.
        hits = sorted(set(base.rglob(pat)), key=lambda p: (p.name, str(p)))
        if hits:
            lang_fp = hits[0]
            break
    else:
        raise FileNotFoundError("Language refset 파일을 찾지 못했습니다.")
    print("[Language(any)]", lang_fp.parent)

    return concept_fp, desc_fp, textdef_fp, lang_fp

def strip_semantic_tag(t: str) -> str:
    """Remove a trailing " (<semantic tag>)" suffix from a term, if present.

    Example: "Neoplasm of jejunum (disorder)" -> "Neoplasm of jejunum".
    Terms without such a suffix come back unchanged.
    """
    found = re.match(r"^(.*)\s\([^)]+\)$", t)
    return t if found is None else found.group(1)

def main():
    """Build the (term, definition) SFT dataset and write it as JSONL.

    Pipeline:
      1. Validate ``MISSING_DEFINITION_POLICY`` before any file is read.
      2. Locate the release files with ``find_release_files(BASE)``.
      3. Load them and keep active rows only (``active == 1``).
      4. Pick one term per concept: the synonym with the best acceptability in
         the language refset selected by ``REFSET_LANG_ID``, otherwise the FSN
         with its trailing semantic tag removed.
      5. Left-join the text definitions, then drop or template-fill the rows
         without a definition according to the policy.
      6. Write one chat-format record (user=term, assistant=definition) per
         row to ``OUT_JSONL``; conceptId and synonyms are not written.

    Raises:
        ValueError: ``MISSING_DEFINITION_POLICY`` is neither 'drop' nor 'fallback'.
    """
    import csv  # function-scope import: only the RF2 reader below needs it

    # Fail fast on a bad setting instead of after loading every release file.
    if MISSING_DEFINITION_POLICY not in ("drop", "fallback"):
        raise ValueError("MISSING_DEFINITION_POLICY must be 'drop' or 'fallback'.")

    concept_fp, desc_fp, textdef_fp, lang_fp = find_release_files(BASE)

    # Columns shared by every RF2 table.
    dtype_common = {"id": "string", "effectiveTime": "string", "active": "int8", "moduleId": "string"}
    # Description and TextDefinition tables are read with the same column types.
    desc_dtypes = {
        "conceptId": "int64",
        "languageCode": "string",
        "typeId": "string",
        "term": "string",
        "caseSignificanceId": "string",
    }

    def read_active(fp, extra_dtypes):
        """Read one RF2 table and return only its active rows."""
        table = pd.read_csv(
            fp,
            sep="\t",
            dtype={**dtype_common, **extra_dtypes},
            # RF2 is plain tab-delimited text without quoting, so a '"' inside
            # a term is data; default CSV quoting could swallow following fields.
            quoting=csv.QUOTE_NONE,
            # Keep terms that spell a pandas NA marker (e.g. "None", "NA",
            # "null") as text instead of turning them into missing values.
            keep_default_na=False,
            na_filter=False,
        )
        return table[table["active"] == 1]

    # === Concepts ===
    concepts = (
        read_active(concept_fp, {"definitionStatusId": "string"})[["id"]]
        .rename(columns={"id": "conceptId"})
        .astype({"conceptId": "int64"})
    )

    # === Descriptions ===
    descs = read_active(desc_fp, desc_dtypes)

    # === Language refset ===
    lang_all = read_active(
        lang_fp,
        {"refsetId": "string", "referencedComponentId": "string", "acceptabilityId": "string"},
    )
    # str() keeps the comparison valid even if the setting is given as an int.
    refset_id = str(REFSET_LANG_ID)
    lang = lang_all.loc[
        lang_all["refsetId"] == refset_id,
        ["referencedComponentId", "acceptabilityId"],
    ].rename(columns={"referencedComponentId": "descriptionId"})
    if lang.empty:
        # Not fatal: without refset rows every synonym simply ranks the same.
        print(f"[WARN] 선택한 REFSET_LANG_ID={refset_id} 에 해당하는 레코드가 없습니다.")
        print("       der2_cRefset_Language*.txt 내 refsetId 분포를 확인해 올바른 refsetId로 교체하세요.")

    # === Text definitions (optional) ===
    if textdef_fp:
        textdefs = read_active(textdef_fp, desc_dtypes)
    else:
        textdefs = pd.DataFrame(columns=["conceptId", "term", "typeId"])

    # === RF2 constants ===
    TYPEID_FSN = "900000000000003001"
    TYPEID_SYNONYM = "900000000000013009"
    TYPEID_TEXT_DEF = "900000000000550004"
    ACC_PREFERRED = "900000000000548007"
    ACC_ACCEPTABLE = "900000000000549004"

    # ---- Term selection: best-ranked synonym, else the cleaned FSN ----
    # Ranking: Preferred (2) > Acceptable (1) > not in the refset (0).
    syn = descs.loc[descs["typeId"] == TYPEID_SYNONYM, ["id", "conceptId", "term"]] \
               .rename(columns={"id": "descriptionId"})
    syn = syn.merge(lang, on="descriptionId", how="left")
    syn["acc_score"] = syn["acceptabilityId"].map({ACC_PREFERRED: 2, ACC_ACCEPTABLE: 1}).fillna(0).astype(int)
    syn_ranked = syn.sort_values(["conceptId", "acc_score"], ascending=[True, False])
    preferred_syn = syn_ranked.groupby("conceptId", as_index=False).first()[["conceptId", "term"]]
    preferred_syn = preferred_syn.rename(columns={"term": "label"})

    # One FSN row per concept, so the joins below cannot duplicate a concept
    # merely because it carries more than one active FSN row.
    fsn = descs.loc[descs["typeId"] == TYPEID_FSN, ["conceptId", "term"]]
    fsn = fsn.assign(fsn_clean=fsn["term"].map(strip_semantic_tag))[["conceptId", "fsn_clean"]]
    fsn = fsn.drop_duplicates("conceptId")

    label_df = concepts.merge(preferred_syn, on="conceptId", how="left") \
                       .merge(fsn, on="conceptId", how="left")
    # Final term
    label_df["term"] = label_df["label"].fillna(label_df["fsn_clean"]).astype("string").str.strip()
    label_df = label_df.drop(columns=["label", "fsn_clean"])

    # ---- Attach definitions ----
    if not textdefs.empty:
        defs = textdefs.loc[textdefs["typeId"] == TYPEID_TEXT_DEF, ["conceptId", "term"]] \
                       .rename(columns={"term": "definition"})
    else:
        # Typed empty frame so the join key dtype matches label_df.
        defs = pd.DataFrame({
            "conceptId": pd.Series(dtype="int64"),
            "definition": pd.Series(dtype="string"),
        })

    pairs = label_df.merge(defs, on="conceptId", how="left")
    pairs["definition"] = pairs["definition"].fillna("").astype("string").str.strip()
    pairs["term"] = pairs["term"].fillna("").astype("string").str.strip()

    # Drop rows whose term is empty.
    pairs = pairs[pairs["term"] != ""].copy()

    # --- Missing-definition policy (value validated at the top) ---
    if MISSING_DEFINITION_POLICY == "drop":
        before = len(pairs)
        pairs = pairs[pairs["definition"] != ""].copy()
        print(f"[Policy=drop] 정의문 없는 {before - len(pairs)}행 제거, 남은 행: {len(pairs)}")
    else:  # "fallback"
        mask = pairs["definition"] == ""
        n_missing = int(mask.sum())
        pairs.loc[mask, "definition"] = pairs.loc[mask, "term"].apply(
            lambda t: FALLBACK_TEMPLATE.format(term=t)
        )
        print(f"[Policy=fallback] 정의문 없는 {n_missing}행을 템플릿으로 대체, 총 행: {len(pairs)}")

    # ==== Save JSONL (conceptId / synonyms are not written) ====
    n_written = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        # Iterating the two columns in lockstep avoids building a Series per row.
        for term, definition in zip(pairs["term"], pairs["definition"]):
            rec = {
                "messages": [
                    {"role": "user", "content": term},
                    {"role": "assistant", "content": definition}
                ]
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_written += 1

    print(f"Saved JSONL: {OUT_JSONL} (rows={n_written})")

# Run the export when this cell/module is executed as the entry point.
if __name__ == "__main__":
    main()


[Concept] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_Concept_Snapshot_INT_20251001.txt
[DATE] 20251001
[Description] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_Description_Snapshot-en_INT_20251001.txt
[TextDefinition] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Terminology/sct2_TextDefinition_Snapshot-en_INT_20251001.txt
[Language(any)] /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/SNOMED_CT_datasets/SNOMED_International/SnomedCT_InternationalRF2/Snapshot/Refset/Language
[Policy=drop] 정의문 없는 368376행 제거, 남은 행: 12461
Saved JSONL: /content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/snomed_term_definition_only.jsonl (rows=12461)


In [5]:
# JSONL file to inspect (same path as OUT_JSONL of the term/definition export script above)
jsonl_path = "/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Datasets/snomed_term_definition_only.jsonl"

In [6]:
import pandas as pd
# Each line of the file is one JSON record, hence lines=True.
df = pd.read_json(jsonl_path, lines=True)

# Print the first 10 rows as plain text, without the index column
print(df.head(10).to_string(index=False))


                                                                                                                                                                                                                                                                                messages
                                                                                                                                                                       [{'role': 'user', 'content': 'Thermal injury'}, {'role': 'assistant', 'content': 'Injury due to increased heat'}]
                                                                     [{'role': 'user', 'content': 'Vaginopexy by colposuspension'}, {'role': 'assistant', 'content': 'Vaginopexy according to Williams and Richardson is an abdominal colposuspension by strips from external oblique'}]
                                                       [{'role': 'user', 'content': 'Epilepsia partialis continua'}, {'role': 'assistant', 'content': 'A prol

# 용어 정의 제대로 하는지 검증

In [None]:
# Inference: Ask the fine-tuned model to define a term
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# ==== Paths and precision settings ====
BASE_MODEL   = "/content/drive/MyDrive/DILAB/qwen3-8b"  # base model (or a Hugging Face path)
ADAPTER_DIR  = "/content/drive/MyDrive/DILAB/OK/DI_LAB/MARS_Datathon/Models/qwen3_8b_snomed_lora"  # LoRA adapter
USE_4BIT     = True
# bf16 is used only when CUDA is available and device 0 reports compute capability major >= 8.
USE_BF16     = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

# ==== Load the model and tokenizer ====
# NOTE(review): the 4-bit path presumably requires the bitsandbytes package to be
# installed; the recorded run of this cell ended with PackageNotFoundError for it.
bnb = None
if USE_4BIT:
    bnb = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16 if USE_BF16 else torch.float16
    )

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb,
    trust_remote_code=True,
    device_map="auto",
    dtype=torch.bfloat16 if USE_BF16 else torch.float16,  # torch_dtype is deprecated -> dtype
)
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer defines no pad token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Attach the LoRA adapter (skip this step when loading an already merged model)
model = PeftModel.from_pretrained(model, ADAPTER_DIR)
model.eval()

def define_term(term: str, max_new_tokens: int = 160, deterministic: bool = True) -> str:
    """Ask the fine-tuned model for a definition of *term*.

    The prompt mirrors the training format: a single user turn holding the
    term, followed by the generation prompt for the assistant turn.

    Args:
        term: Term to define.
        max_new_tokens: Upper bound on the number of generated tokens.
        deterministic: When True, decode greedily; when False, sample with
            temperature 0.2 and top_p 0.9.

    Returns:
        The generated assistant text only, stripped of surrounding whitespace.
    """
    messages = [
        {"role": "user", "content": term}
    ]
    # NOTE(review): if the chat template exposes a thinking switch, confirm it is
    # set the same way here as it was when the adapter was trained.
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tok([prompt], return_tensors="pt").to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": not deterministic,   # definitions are usually best decoded deterministically
        "eos_token_id": tok.eos_token_id,
    }
    if not deterministic:
        # Sampling parameters only apply (and are only passed) when sampling.
        gen_kwargs["temperature"] = 0.2
        gen_kwargs["top_p"] = 0.9

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Cut the prompt off in token space. The prompt string contains chat-template
    # special tokens that decoding with skip_special_tokens=True removes, so
    # slicing the decoded text by len(prompt) would drop the start of the answer.
    prompt_len = inputs["input_ids"].shape[1]
    answer_ids = out[0][prompt_len:]
    return tok.decode(answer_ids, skip_special_tokens=True).strip()

# === Usage example ===
print(define_term("Asthma"))
print(define_term("Myocardial infarction"))


PackageNotFoundError: No package metadata was found for bitsandbytes