# Llama 학습을 위해 지시문 줄인 버전

## 파인튜닝 1번째 버전

In [None]:
# ====== 0. 필요 라이브러리 다운로드 ======
print("모든 필수 라이브러리를 최신 버전으로 설치")
!pip install -U transformers accelerate peft trl datasets pandas rich
!pip install -U bitsandbytes    # bitbytes는 4bit양자화를 위한 필수 패키지

# ====== 1. 라이브러리 임포트 & 환경 설정======
print("\n라이브러리 불러오기")
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import BitsAndBytesConfig   # BitsAndBytesConfig: 4bit로딩 설정
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model  # perft: LoRA어댑터 설정 및 결합
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from trl import SFTTrainer, SFTConfig   # TRL: SFTTrainer로 학습 루프/로깅을 간단화
from tqdm.auto import tqdm
tqdm.pandas()
progress_callbacks = []  # 빈 리스트로 대체

# ====== 2.환경 변수 설정 ======
os.environ["WANDB_DISABLED"] = "true"  # W&B 비활성화
BASE = "/content/drive/MyDrive/DILAB/MARS/mimic-iv-note_2.2/files/note/input_output"

TRAIN_INPUT   = f"{BASE}/discharge_train_input.parquet"
TRAIN_OUTPUT  = f"{BASE}/discharge_train_output.parquet"
TEST_INPUT    = f"{BASE}/discharge_test_input.parquet"
TEST_OUTPUT   = f"{BASE}/discharge_test_output.parquet"

# ✅ Llama3-8B-Instruct 모델 경로/ID (허깅페이스 ID 또는 로컬 경로로 변경 가능)
MODEL_ID   = "/content/drive/MyDrive/DILAB/llama3-8b-instruct"
OUTPUT_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/logs/original_1"
ADAPTER_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_1"

# ⚠️ Llama3-8B-Instruct 컨텍스트 한계는 8k
MAX_SEQ_LENGTH     = 8192   # 8k컨텍스트 제한 명시
TRAIN_LIMIT        = 10_000
VALID_LIMIT        = 2_000
SHUFFLE_SEED       = 42
PACKING            = False
EPOCH              = 1
KEEP_MARGIN        = 32     # 토큰 길이 계산 오차/특수토큰을 위한 여유
MIN_ANSWER_TOKENS  = 128    # 어시스턴트 본문 최소 길이 보장. 손실 0 방지 핵심 파라미터

KEYS = ["subject_id", "hadm_id", "note_id"]

# ====== 3. 구글 드라이브 마운트 =====
from google.colab import drive
drive.mount('/content/drive')

# ====== 4. Train/Test 파일에서 랜덤으로 TRAIN_LIMIT만큼 데이터 행 개수 꺼내오기 ======
train_input_df = pd.read_parquet(TRAIN_INPUT)
train_output_df = pd.read_parquet(TRAIN_OUTPUT)
print(f"✅ TRAIN INPUT rows: {len(train_input_df):,}")
print(f"✅ TRAIN OUTPUT rows: {len(train_output_df):,}")

# 입력/정답 테이블을 **KEYS(subject_id/hadm_id/note_id)**로 inner join
# inner join - 두 개 이상의 테이블에서 공통된 값(특정 칼럼)을 기준으로 서로 연결하는 SQL 연산 방식
train_merged_df = train_input_df.merge(train_output_df, on=KEYS, how="inner")
print(f"✅ TRAIN MERGED rows: {len(train_merged_df):,}")

both_empty = (train_merged_df["bhc_text"].fillna("").str.len() == 0) & \
             (train_merged_df["di_text"].fillna("").str.len() == 0)
train_merged_df = train_merged_df[~both_empty].reset_index(drop=True)

if TRAIN_LIMIT is not None and TRAIN_LIMIT < len(train_merged_df):
    sample_df = train_merged_df.sample(n=TRAIN_LIMIT, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    sample_df = train_merged_df.copy()
print(f"🎯 최종 선택된 샘플 수: {len(sample_df):,}")

# ====== 5. 프롬프트 포맷팅: message 구조 만들기 ======
EXAMPLES = r"""
Brief Hospital Course
- Admitted with progressive abdominal distension and confusion in the setting of cirrhosis.
- Diagnostic paracentesis negative for SBP; CXR unremarkable; UA/cultures without growth.
- Initiated diuretics: Furosemide 40 mg PO daily; Spironolactone 50 mg PO daily (dose chosen given K+ 4.5).
- Symptomatic improvement with diuresis; electrolytes stable on regimen.
- Outpatient follow-up arranged for liver clinic and endoscopic screening as noted in input.

Discharge Instructions
- Take Furosemide 40 mg PO daily and Spironolactone 50 mg PO daily exactly as prescribed.
- Follow a low-sodium diet as instructed in the input.
- Attend the scheduled outpatient visits listed in the input.
- Return for care if you develop abdominal pain, fever, or confusion as specified in the input.
"""

INSTRUCTION = (
    """
You are an expert physician-writer who crafts hospital discharge summaries.

    """
    f"{EXAMPLES}\n"
)

# 원본 메시지 구조 (system, user, assistant)를 그대로 보관
triples = []
print("🪄 메시지 트리플 생성 중...")
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Formatting", ncols=100):
    system_msg = {"role": "system", "content": INSTRUCTION}
    user_msg   = {"role": "user", "content": str(row["text"]).strip()}

    assistant_text = ""
    if str(row["bhc_text"]).strip():
        assistant_text += f"Brief Hospital Course\n{str(row['bhc_text']).strip()}\n\n"
    if str(row["di_text"]).strip():
        assistant_text += f"Discharge Instructions\n{str(row['di_text']).strip()}"
    assistant_msg = {"role": "assistant", "content": assistant_text.strip()}

    triples.append((system_msg, user_msg, assistant_msg))
print(f"✅ 메시지 트리플 준비 완료! 총 {len(triples):,}개 샘플")

# ====== 6. 모델/토크나이저 로드 ======
# nf4 4bit 로딩 + k-bit 학습 준비
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 모델을 4비트로 로딩(메모리 절약 & 파인튜닝 효율)
    bnb_4bit_quant_type="nf4",             # 4비트 타입은 nf5(최신 연구, 더 우수한 압축/정확도)
    bnb_4bit_compute_dtype=torch.bfloat16, # 연산시 bfloat16 상요(속도와 안정성 증가, low-memory 환경 적합)
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,                         # 로드할 모델
    dtype=torch.bfloat16,             # 기본 데이터타입 지정
    quantization_config=bnb_config,   # 위에서 정의한 4비트 양자화 config 적용
    device_map="auto",                # Colab에서 자동으로 GPU/CPU 등에 모델 분산
    offload_buffers=True,             # 일부 높은 램 점유 버퍼는 CPU로 offload -> 메모리 절약
)
model.config.use_cache = False                  # 캐시기능 off -> 추론시 어텐션 캐시(메모리 확보)는 off -> 파인튜닝시 필요 없는 부분 미리 제거해서 GPU 메모리 관리에 도움
model = prepare_model_for_kbit_training(model)  # 양자화(k-bit) 파인튜닝에 필요한 특수 레이어/옵션을 추가 적용(PEFT/LoRA, QLoRA 전처리 등과 연계)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)  #
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token # pad 토큰 없으면 eos(종료) 토큰을 대신 사용
tokenizer.padding_side     = "right"          # (문장/배치 패딩 기준) pad는 오른쪽에 붙임
tokenizer.truncation_side  = "left"           # 문장이 너무 길면 왼쪽부터 자름 -> 최신 맥락이 항상 남게 됨
tokenizer.model_max_length = MAX_SEQ_LENGTH   # 최대 허용 입력 길의 정의

# ====== 7. ★ 파트별 직렬화: (system+user)와 (assistant) 분리 ======
def serialize_system_user(sys_msg, user_msg):
    return tokenizer.apply_chat_template(
        [sys_msg, user_msg], tokenize=False, add_generation_prompt=False
    )

def serialize_assistant(asst_msg):
    return tokenizer.apply_chat_template(
        [asst_msg], tokenize=False, add_generation_prompt=False
    )

# 어시스턴트 헤더 길이를 “정확한 토큰 길이”로 알기 위해, 빈 assistant를 직렬화하여 토큰화
assistant_header_text = serialize_assistant({"role":"assistant","content":""})
ASSISTANT_HEADER_IDS  = tokenizer.encode(assistant_header_text, add_special_tokens=False)

# ====== 8. 길이 안전 전처리: assistant 100% 보존 + pre(left-trim) ======
def pack_one(sys_msg, user_msg, asst_msg):
    pre_text  = serialize_system_user(sys_msg, user_msg)
    asst_text = serialize_assistant(asst_msg)

    pre_ids   = tokenizer.encode(pre_text,   add_special_tokens=False)
    asst_ids  = tokenizer.encode(asst_text,  add_special_tokens=False)

    # asst_ids = [ASSISTANT_HEADER_IDS + assistant_content_ids]
    header_len = len(ASSISTANT_HEADER_IDS)
    content_ids = asst_ids[header_len:] if len(asst_ids) >= header_len else []

    # 1) 어시스턴트 본문 최소 토큰 보장
    total_allowed   = MAX_SEQ_LENGTH - KEEP_MARGIN
    min_needed_asst = header_len + max(1, MIN_ANSWER_TOKENS)
    if min_needed_asst > total_allowed:
        min_needed_asst = header_len + 1

    # 2) pre 예산 계산
    budget_for_pre = total_allowed - min_needed_asst
    budget_for_pre = max(0, budget_for_pre)

    # 3) pre 왼쪽 절단
    if len(pre_ids) > budget_for_pre:
        pre_ids = pre_ids[-budget_for_pre:]

    # 4) 어시스턴트 본문 담을 수 있는 만큼 확보 (가급적 최근/결론부 보존: 뒤에서부터 유지)
    remaining       = total_allowed - len(pre_ids)
    allow_content   = max(1, remaining - header_len)
    allow_content   = min(allow_content, len(content_ids))
    kept_content    = content_ids[-allow_content:] if allow_content > 0 else []

    final_asst_ids  = ASSISTANT_HEADER_IDS + kept_content
    input_ids       = pre_ids + final_asst_ids
    attention_mask  = [1] * len(input_ids)

    # 5) 라벨: pre(-100) + header(-100) + 본문(정답)
    labels = [-100] * len(pre_ids) + [-100] * header_len + kept_content[:]
    if len(labels) < len(input_ids):
        labels += [-100] * (len(input_ids) - len(labels))
    elif len(labels) > len(input_ids):
        labels = labels[:len(input_ids)]

    # 최종 safety clip
    input_ids      = input_ids[:MAX_SEQ_LENGTH]
    attention_mask = attention_mask[:MAX_SEQ_LENGTH]
    labels         = labels[:MAX_SEQ_LENGTH]
    return input_ids, attention_mask, labels

# ====== 9. Dataset 생성 ======
def build_dataset(triples, batch_size=512):
    all_input_ids, all_attn, all_labels = [], [], []
    for i in tqdm(range(0, len(triples), batch_size), desc="Packing", ncols=100):
        chunk = triples[i:i+batch_size]
        for sys_msg, user_msg, asst_msg in chunk:
            ids, attn, labs = pack_one(sys_msg, user_msg, asst_msg)
            all_input_ids.append(ids)
            all_attn.append(attn)
            all_labels.append(labs)
    return Dataset.from_dict({"input_ids": all_input_ids, "attention_mask": all_attn, "labels": all_labels})

tokenized_ds = build_dataset(triples)

# ====== 9-1. 라벨 유효 토큰 통계 출력 ======
def quick_label_stats(ds, n=200):
    import random
    n = min(n, len(ds))
    idxs = random.sample(range(len(ds)), n)
    cnts = []
    for i in idxs:
        y = ds[i]["labels"]
        cnts.append(sum(1 for t in y if t != -100))
    return {"checked": n, "min": int(np.min(cnts)), "median": int(np.median(cnts)), "max": int(np.max(cnts))}
print("🔎 Label token stats (non -100):", quick_label_stats(tokenized_ds, n=200))

# ====== 10. LoRA 설정 & 모델에 적용 ======
peft_config = LoraConfig(
    r=64, lora_alpha=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ====== 11. SFT 하이퍼파라미터 ======
sft_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    num_train_epochs=EPOCH,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=3,
    logging_steps=10,
    output_dir=OUTPUT_DIR,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_steps=-1,
    report_to="tensorboard",
    packing=False,
    max_length=MAX_SEQ_LENGTH,
)

# ====== 12. 커스텀 콜레이터 (패딩) ======
class PadToSameLengthCollator:
    def __init__(self, tokenizer, max_length=None, pad_to_multiple_of=8):
        self.tok = tokenizer
        self.max_length = max_length or tokenizer.model_max_length
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        input_ids_list = [f["input_ids"] for f in features]
        attn_list     = [f.get("attention_mask", [1]*len(f["input_ids"])) for f in features]
        labels_list   = [f["labels"] for f in features]

        max_len = min(max(len(x) for x in input_ids_list), self.max_length)
        if self.pad_to_multiple_of is not None and max_len % self.pad_to_multiple_of != 0:
            max_len = ((max_len // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
            max_len = min(max_len, self.max_length)

        def pad_seq(seq, pad_id, tgt_len):
            if len(seq) >= tgt_len:
                return seq[:tgt_len]
            return seq + [pad_id] * (tgt_len - len(seq))

        batch_input_ids, batch_attention, batch_labels = [], [], []
        for ids, attn, lab in zip(input_ids_list, attn_list, labels_list):
            ids_p  = pad_seq(ids,  self.tok.pad_token_id, tgt_len=max_len)
            attn_p = pad_seq(attn, 0, tgt_len=max_len)
            lab_p  = pad_seq(lab, -100, tgt_len=max_len)

            batch_input_ids.append(ids_p)
            batch_attention.append(attn_p)
            batch_labels.append(lab_p)

        import torch
        return {
            "input_ids":      torch.tensor(batch_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention, dtype=torch.long),
            "labels":         torch.tensor(batch_labels, dtype=torch.long),
        }

data_collator = PadToSameLengthCollator(tokenizer, max_length=MAX_SEQ_LENGTH, pad_to_multiple_of=8)

# ====== 13. SFTTrainer 설정 & 학습 ======
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
)

print("tokenizer model_max_length:", tokenizer.model_max_length)
print("trainer max_length:", str(trainer.args.max_length))
print("🪄 학습 시작")
trainer.train()

# ====== 14. 어댑터 저장하기 ======
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print(f"✅ LoRA 어댑터 저장 완료: {ADAPTER_DIR}")


모든 필수 라이브러리를 최신 버전으로 설치

라이브러리 불러오기
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ TRAIN INPUT rows: 263,204
✅ TRAIN OUTPUT rows: 263,204
✅ TRAIN MERGED rows: 263,204
🎯 최종 선택된 샘플 수: 10,000
🪄 메시지 트리플 생성 중...


Formatting:   0%|                                                         | 0/10000 [00:00<?, ?it/s]

✅ 메시지 트리플 준비 완료! 총 10,000개 샘플


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Packing:   0%|                                                               | 0/20 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8642 > 8192). Running this sequence through the model will result in indexing errors


🔎 Label token stats (non -100): {'checked': 200, 'min': 40, 'median': 807, 'max': 2884}
trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}.


tokenizer model_max_length: 8192
trainer max_length: 8192
🪄 학습 시작


Step,Training Loss
10,1.9931
20,2.0884
30,1.8784
40,1.823
50,1.8576
60,1.8328
70,1.6604
80,1.726
90,1.7596
100,1.7075


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

✅ LoRA 어댑터 저장 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_1


## 파인튜닝 2번째 버전(지시문 수정)

In [None]:
import os, shutil
from google.colab import drive

# 1) 혹시 기존에 남아있는 일반 디렉터리/심볼릭 링크 제거
if os.path.exists('/content/drive'):
    # 안전하게 폴더 전체 삭제 (로컬 세션의 빈 폴더/파일만 지움)
    shutil.rmtree('/content/drive', ignore_errors=True)

# 2) 이제 깨끗해졌으니 마운트
drive.mount('/content/drive', force_remount=True)

# 3) 확인
import os
print("Mounted:", drive.is_mounted('/content/drive'))
print("List:", os.listdir('/content/drive'))


Mounted at /content/drive


AttributeError: module 'google.colab.drive' has no attribute 'is_mounted'

In [None]:
import os
from pathlib import Path

mountpoint = "/content/drive"

# 드라이브 마운트 여부 확인 (FUSE 마운트 + MyDrive 존재 확인)
is_mounted = os.path.ismount(mountpoint) and os.path.exists(os.path.join(mountpoint, "MyDrive"))
print("Mounted:", is_mounted)

# 상위 항목 몇 개만 확인
root = Path(mountpoint)
print("List:", [p.name for p in root.iterdir()][:10])


Mounted: True
List: ['MyDrive', '.shortcut-targets-by-id', '.Trash-0', '.Encrypted']


In [None]:
# ====== 0. 필요 라이브러리 다운로드 ======
print("모든 필수 라이브러리를 최신 버전으로 설치")
!pip install -U transformers accelerate peft trl datasets pandas rich
!pip install -U bitsandbytes  # 4bit 양자화용

# ====== 1. 라이브러리 임포트 & 환경 설정======
print("\n라이브러리 불러오기")
import os
import random
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer, SFTConfig
from tqdm.auto import tqdm
tqdm.pandas()
progress_callbacks = []  # 빈 리스트로 대체

# 재현성 고정
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ====== 2.환경 변수 설정 ======
os.environ["WANDB_DISABLED"] = "true"  # W&B 비활성화
BASE = "/content/drive/MyDrive/DILAB/MARS/mimic-iv-note_2.2/files/note/input_output"

TRAIN_INPUT   = f"{BASE}/discharge_train_input.parquet"
TRAIN_OUTPUT  = f"{BASE}/discharge_train_output.parquet"
TEST_INPUT    = f"{BASE}/discharge_test_input.parquet"
TEST_OUTPUT   = f"{BASE}/discharge_test_output.parquet"

# ✅ Llama3-8B-Instruct 모델 경로/ID
MODEL_ID   = "/content/drive/MyDrive/DILAB/llama3-8b-instruct"
OUTPUT_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/logs/original_2"
ADAPTER_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_2"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ADAPTER_DIR, exist_ok=True)

# ⚠️ Llama3-8B-Instruct 컨텍스트 한계는 8k
MAX_SEQ_LENGTH     = 8192
TRAIN_LIMIT        = 10_000
VALID_LIMIT        = 2_000
SHUFFLE_SEED       = 42
PACKING            = False
EPOCH              = 3
KEEP_MARGIN        = 32      # 안전 여유
MIN_ANSWER_TOKENS  = 64      # 어시스턴트 본문 최소 길이(완화)

KEYS = ["subject_id", "hadm_id", "note_id"]

# ====== 4. Train/Test 파일 로드 및 병합 ======
train_input_df = pd.read_parquet(TRAIN_INPUT)
train_output_df = pd.read_parquet(TRAIN_OUTPUT)
print(f"✅ TRAIN INPUT rows: {len(train_input_df):,}")
print(f"✅ TRAIN OUTPUT rows: {len(train_output_df):,}")

# 입력/정답 테이블을 KEYS로 inner join
train_merged_df = train_input_df.merge(train_output_df, on=KEYS, how="inner")
print(f"✅ TRAIN MERGED rows: {len(train_merged_df):,}")

# BHC와 DI가 모두 비어있는 행 제거
both_empty = (train_merged_df["bhc_text"].fillna("").str.len() == 0) & \
             (train_merged_df["di_text"].fillna("").str.len() == 0)
train_merged_df = train_merged_df[~both_empty].reset_index(drop=True)

# 샘플링
if TRAIN_LIMIT is not None and TRAIN_LIMIT < len(train_merged_df):
    sample_df = train_merged_df.sample(n=TRAIN_LIMIT, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    sample_df = train_merged_df.copy()
print(f"🎯 최종 선택된 샘플 수: {len(sample_df):,}")

# ====== 5. 프롬프트 포맷팅: message 구조 만들기 ======
EXAMPLES = r"""
Brief Hospital Course
- Admitted with progressive abdominal distension and confusion in the setting of cirrhosis.
- Diagnostic paracentesis negative for SBP; CXR unremarkable; UA/cultures without growth.
- Initiated diuretics: Furosemide 40 mg PO daily; Spironolactone 50 mg PO daily (dose chosen given K+ 4.5).
- Symptomatic improvement with diuresis; electrolytes stable on regimen.
- Outpatient follow-up arranged for liver clinic and endoscopic screening as noted in input.

Discharge Instructions
- Take Furosemide 40 mg PO daily and Spironolactone 50 mg PO daily exactly as prescribed.
- Follow a low-sodium diet as instructed in the input.
- Attend the scheduled outpatient visits listed in the input.
- Return for care if you develop abdominal pain, fever, or confusion as specified in the input.
"""

INSTRUCTION = (
    "You are an expert physician-writer who crafts hospital discharge summaries.\n"
    "[Guide] Adhere to the headings 'Brief Hospital Course' and 'Discharge Instructions'.\n"
    "Use only facts explicitly present in the input; if unknown, write 'not specified'.\n"
    "Do not invent medications, doses, dates, devices, or anticoagulation.\n"
    "Keep timelines consistent (POD, diet advancement, Foley removal, PT/OT, brace).\n"
    "Avoid duplication; do not repeat the 'Discharge Instructions' section.\n\n"
    f"{EXAMPLES}\n"
)

# 원본 메시지 구조 (system, user, assistant)
triples = []
print("🪄 메시지 트리플 생성 중...")
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Formatting", ncols=100):
    system_msg = {"role": "system", "content": INSTRUCTION}
    user_msg   = {"role": "user", "content": str(row["text"]).strip()}

    assistant_text = ""
    if str(row["bhc_text"]).strip():
        assistant_text += f"Brief Hospital Course\n{str(row['bhc_text']).strip()}\n\n"
    if str(row["di_text"]).strip():
        assistant_text += f"Discharge Instructions\n{str(row['di_text']).strip()}"
    assistant_msg = {"role": "assistant", "content": assistant_text.strip()}

    triples.append((system_msg, user_msg, assistant_msg))
print(f"✅ 메시지 트리플 준비 완료! 총 {len(triples):,}개 샘플")

# ====== 6. 모델/토크나이저 로드 ======
# nf4 4bit 로딩 + k-bit 학습 준비
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side     = "right"
tokenizer.truncation_side  = "left"
tokenizer.model_max_length = MAX_SEQ_LENGTH

# ====== 7. 직렬화 함수 ======
def has_chat_template(tok):
    try:
        tmpl = tok.chat_template
        return tmpl is not None and len(str(tmpl)) > 0
    except Exception:
        return False

USE_CHAT_TEMPLATE = has_chat_template(tokenizer)

def serialize_system_user(sys_msg, user_msg):
    if USE_CHAT_TEMPLATE:
        return tokenizer.apply_chat_template(
            [sys_msg, user_msg], tokenize=False, add_generation_prompt=False
        )
    return f"<|system|>\n{sys_msg['content']}\n<|user|>\n{user_msg['content']}\n"

def serialize_assistant(asst_msg):
    if USE_CHAT_TEMPLATE:
        return tokenizer.apply_chat_template(
            [asst_msg], tokenize=False, add_generation_prompt=False
        )
    return f"<|assistant|>\n{asst_msg['content']}"

# 어시스턴트 헤더 길이
assistant_header_text = serialize_assistant({"role": "assistant", "content": ""})
ASSISTANT_HEADER_IDS  = tokenizer.encode(assistant_header_text, add_special_tokens=False)

# ====== 8. 길이 안전 전처리: assistant 100% 보존 + pre(left-trim) ======
def pack_one(sys_msg, user_msg, asst_msg):
    pre_text  = serialize_system_user(sys_msg, user_msg)
    asst_text = serialize_assistant(asst_msg)

    pre_ids   = tokenizer.encode(pre_text,   add_special_tokens=False)
    asst_ids  = tokenizer.encode(asst_text,  add_special_tokens=False)

    header_len  = len(ASSISTANT_HEADER_IDS)
    content_ids = asst_ids[header_len:] if len(asst_ids) >= header_len else []

    # 0) 감독 토큰이 전혀 없으면 학습 무의미 → 스킵 신호
    if len(content_ids) == 0:
        return None

    # 1) 어시스턴트 본문 최소 토큰 보장
    total_allowed   = MAX_SEQ_LENGTH - KEEP_MARGIN
    min_needed_asst = header_len + max(1, MIN_ANSWER_TOKENS)
    if min_needed_asst > total_allowed:
        min_needed_asst = header_len + 1  # 최저 1토큰이라도 보장

    # 2) pre 예산 계산
    budget_for_pre = max(0, total_allowed - min_needed_asst)

    # 3) pre 왼쪽 절단
    if len(pre_ids) > budget_for_pre:
        pre_ids = pre_ids[-budget_for_pre:]

    # 4) 어시스턴트 본문(가능하면 뒤쪽 유지)
    remaining       = total_allowed - len(pre_ids)
    allow_content   = max(1, remaining - header_len)
    allow_content   = min(allow_content, len(content_ids))
    kept_content    = content_ids[-allow_content:] if allow_content > 0 else []

    final_asst_ids  = ASSISTANT_HEADER_IDS + kept_content
    input_ids       = pre_ids + final_asst_ids
    attention_mask  = [1] * len(input_ids)

    # 5) 라벨: pre(-100) + header(-100) + 본문(정답)
    labels = [-100] * len(pre_ids) + [-100] * header_len + kept_content[:]
    if len(labels) < len(input_ids):
        labels += [-100] * (len(input_ids) - len(labels))
    elif len(labels) > len(input_ids):
        labels = labels[:len(input_ids)]

    # Safety clip
    input_ids      = input_ids[:MAX_SEQ_LENGTH]
    attention_mask = attention_mask[:MAX_SEQ_LENGTH]
    labels         = labels[:MAX_SEQ_LENGTH]
    return input_ids, attention_mask, labels

# ====== 9. Dataset 생성 ======
def build_dataset(triples, batch_size=512):
    all_input_ids, all_attn, all_labels = [], [], []
    skipped = 0
    for i in tqdm(range(0, len(triples), batch_size), desc="Packing", ncols=100):
        chunk = triples[i:i+batch_size]
        for sys_msg, user_msg, asst_msg in chunk:
            packed = pack_one(sys_msg, user_msg, asst_msg)
            if packed is None:
                skipped += 1
                continue
            ids, attn, labs = packed
            all_input_ids.append(ids)
            all_attn.append(attn)
            all_labels.append(labs)
    print(f"🚧 Skipped samples (no supervised tokens): {skipped}")
    return Dataset.from_dict({"input_ids": all_input_ids, "attention_mask": all_attn, "labels": all_labels})

tokenized_ds = build_dataset(triples)

# ====== 9-1. 라벨 유효 토큰 통계 출력 ======
def quick_label_stats(ds, n=200):
    n = min(n, len(ds))
    idxs = random.sample(range(len(ds)), n)
    cnts = []
    for i in idxs:
        y = ds[i]["labels"]
        cnts.append(sum(1 for t in y if t != -100))
    return {"checked": n, "min": int(np.min(cnts)), "median": int(np.median(cnts)), "max": int(np.max(cnts))}
print("🔎 Label token stats (non -100):", quick_label_stats(tokenized_ds, n=200))

# ====== 10. LoRA 설정 & 모델에 적용 ======
peft_config = LoraConfig(
    r=64, lora_alpha=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ====== 11. SFT 하이퍼파라미터 ======
sft_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    num_train_epochs=EPOCH,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=3,
    logging_steps=10,
    output_dir=OUTPUT_DIR,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_steps=-1,
    report_to="tensorboard",
    packing=False,  # 커스텀 패킹 사용
)

# ====== 12. 커스텀 콜레이터 (패딩) ======
class PadToSameLengthCollator:
    def __init__(self, tokenizer, max_length=None, pad_to_multiple_of=8):
        self.tok = tokenizer
        self.max_length = max_length or tokenizer.model_max_length
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        input_ids_list = [f["input_ids"] for f in features]
        attn_list     = [f.get("attention_mask", [1]*len(f["input_ids"])) for f in features]
        labels_list   = [f["labels"] for f in features]

        max_len = min(max(len(x) for x in input_ids_list), self.max_length)
        if self.pad_to_multiple_of is not None and max_len % self.pad_to_multiple_of != 0:
            max_len = ((max_len // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
            max_len = min(max_len, self.max_length)

        def pad_seq(seq, pad_id, tgt_len):
            if len(seq) >= tgt_len:
                return seq[:tgt_len]
            return seq + [pad_id] * (tgt_len - len(seq))

        batch_input_ids, batch_attention, batch_labels = [], [], []
        for ids, attn, lab in zip(input_ids_list, attn_list, labels_list):
            ids_p  = pad_seq(ids,  self.tok.pad_token_id, tgt_len=max_len)
            attn_p = pad_seq(attn, 0, tgt_len=max_len)
            lab_p  = pad_seq(lab, -100, tgt_len=max_len)

            batch_input_ids.append(ids_p)
            batch_attention.append(attn_p)
            batch_labels.append(lab_p)

        return {
            "input_ids":      torch.tensor(batch_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention, dtype=torch.long),
            "labels":         torch.tensor(batch_labels, dtype=torch.long),
        }

data_collator = PadToSameLengthCollator(tokenizer, max_length=MAX_SEQ_LENGTH, pad_to_multiple_of=8)

# ====== 13. SFTTrainer 설정 & 학습 ======
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
    max_seq_length=MAX_SEQ_LENGTH,
)

print("tokenizer model_max_length:", tokenizer.model_max_length)
print("🪄 학습 시작")
trainer.train()

# ====== 14. 어댑터 저장하기 ======
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print(f"✅ LoRA 어댑터 저장 완료: {ADAPTER_DIR}")


모든 필수 라이브러리를 최신 버전으로 설치

라이브러리 불러오기
✅ TRAIN INPUT rows: 263,204
✅ TRAIN OUTPUT rows: 263,204
✅ TRAIN MERGED rows: 263,204
🎯 최종 선택된 샘플 수: 10,000
🪄 메시지 트리플 생성 중...


Formatting:   0%|                                                         | 0/10000 [00:00<?, ?it/s]

✅ 메시지 트리플 준비 완료! 총 10,000개 샘플


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Packing:   0%|                                                               | 0/20 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8713 > 8192). Running this sequence through the model will result in indexing errors


🚧 Skipped samples (no supervised tokens): 0
🔎 Label token stats (non -100): {'checked': 200, 'min': 40, 'median': 807, 'max': 2884}
trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'max_seq_length'

### SFT가 아닌 Trainer 사용 코드

In [6]:
# ====== 0. 필요 라이브러리 다운로드 ======
print("모든 필수 라이브러리를 최신 버전으로 설치")
!pip install -U transformers accelerate peft trl datasets pandas rich
!pip install -U bitsandbytes  # 4bit 양자화용

# ====== 1. 라이브러리 임포트 & 환경 설정======
print("\n라이브러리 불러오기")
import os
import random
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, Trainer # ✅ SFTTrainer 대신 Trainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTConfig # ✅ SFTConfig는 TrainingArguments를 상속하므로 그대로 사용
from tqdm.auto import tqdm

# ====== 2.환경 변수 설정 ======
os.environ["WANDB_DISABLED"] = "true"  # W&B 비활성화
BASE = "/content/drive/MyDrive/DILAB/MARS/mimic-iv-note_2.2/files/note/input_output"

TRAIN_INPUT   = f"{BASE}/discharge_train_input.parquet"
TRAIN_OUTPUT  = f"{BASE}/discharge_train_output.parquet"
TEST_INPUT    = f"{BASE}/discharge_test_input.parquet"
TEST_OUTPUT   = f"{BASE}/discharge_test_output.parquet"

# ✅ Llama3-8B-Instruct 모델 경로/ID
MODEL_ID   = "/content/drive/MyDrive/DILAB/llama3-8b-instruct"
OUTPUT_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/logs/original_2"
ADAPTER_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_2"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(ADAPTER_DIR, exist_ok=True)

# ⚠️ Llama3-8B-Instruct 컨텍스트 한계는 8k
MAX_SEQ_LENGTH     = 8192
TRAIN_LIMIT        = 10_000
VALID_LIMIT        = 2_000
SHUFFLE_SEED       = 42
PACKING            = False
EPOCH              = 3
KEEP_MARGIN        = 32      # 안전 여유
MIN_ANSWER_TOKENS  = 64      # 어시스턴트 본문 최소 길이(완화)

KEYS = ["subject_id", "hadm_id", "note_id"]

# ====== 4. Train/Test 파일 로드 및 병합 ======
train_input_df = pd.read_parquet(TRAIN_INPUT)
train_output_df = pd.read_parquet(TRAIN_OUTPUT)
print(f"✅ TRAIN INPUT rows: {len(train_input_df):,}")
print(f"✅ TRAIN OUTPUT rows: {len(train_output_df):,}")

# 입력/정답 테이블을 KEYS로 inner join
train_merged_df = train_input_df.merge(train_output_df, on=KEYS, how="inner")
print(f"✅ TRAIN MERGED rows: {len(train_merged_df):,}")

# BHC와 DI가 모두 비어있는 행 제거
both_empty = (train_merged_df["bhc_text"].fillna("").str.len() == 0) & \
             (train_merged_df["di_text"].fillna("").str.len() == 0)
train_merged_df = train_merged_df[~both_empty].reset_index(drop=True)

# 샘플링
if TRAIN_LIMIT is not None and TRAIN_LIMIT < len(train_merged_df):
    sample_df = train_merged_df.sample(n=TRAIN_LIMIT, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    sample_df = train_merged_df.copy()
print(f"🎯 최종 선택된 샘플 수: {len(sample_df):,}")

# ====== 5. 프롬프트 포맷팅: message 구조 만들기 ======
EXAMPLES = r"""
Brief Hospital Course
- Admitted with progressive abdominal distension and confusion in the setting of cirrhosis.
- Diagnostic paracentesis negative for SBP; CXR unremarkable; UA/cultures without growth.
- Initiated diuretics: Furosemide 40 mg PO daily; Spironolactone 50 mg PO daily (dose chosen given K+ 4.5).
- Symptomatic improvement with diuresis; electrolytes stable on regimen.
- Outpatient follow-up arranged for liver clinic and endoscopic screening as noted in input.

Discharge Instructions
- Take Furosemide 40 mg PO daily and Spironolactone 50 mg PO daily exactly as prescribed.
- Follow a low-sodium diet as instructed in the input.
- Attend the scheduled outpatient visits listed in the input.
- Return for care if you develop abdominal pain, fever, or confusion as specified in the input.
"""

INSTRUCTION = (
    "You are an expert physician-writer who crafts hospital discharge summaries.\n"
    "[Guide] Adhere to the headings 'Brief Hospital Course' and 'Discharge Instructions'.\n"
    "Use only facts explicitly present in the input; if unknown, write 'not specified'.\n"
    "Do not invent medications, doses, dates, devices, or anticoagulation.\n"
    "Keep timelines consistent (POD, diet advancement, Foley removal, PT/OT, brace).\n"
    "Avoid duplication; do not repeat the 'Discharge Instructions' section.\n\n"
    f"{EXAMPLES}\n"
)

# 원본 메시지 구조 (system, user, assistant)
triples = []
print("🪄 메시지 트리플 생성 중...")
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Formatting", ncols=100):
    system_msg = {"role": "system", "content": INSTRUCTION}
    user_msg   = {"role": "user", "content": str(row["text"]).strip()}

    assistant_text = ""
    if str(row["bhc_text"]).strip():
        assistant_text += f"Brief Hospital Course\n{str(row['bhc_text']).strip()}\n\n"
    if str(row["di_text"]).strip():
        assistant_text += f"Discharge Instructions\n{str(row['di_text']).strip()}"
    assistant_msg = {"role": "assistant", "content": assistant_text.strip()}

    triples.append((system_msg, user_msg, assistant_msg))
print(f"✅ 메시지 트리플 준비 완료! 총 {len(triples):,}개 샘플")

# ====== 6. 모델/토크나이저 로드 ======
# nf4 4bit 로딩 + k-bit 학습 준비
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side     = "right"
tokenizer.truncation_side  = "left"
tokenizer.model_max_length = MAX_SEQ_LENGTH

# ====== 7. 직렬화 함수 ======
def has_chat_template(tok):
    try:
        tmpl = tok.chat_template
        return tmpl is not None and len(str(tmpl)) > 0
    except Exception:
        return False

USE_CHAT_TEMPLATE = has_chat_template(tokenizer)

def serialize_system_user(sys_msg, user_msg):
    if USE_CHAT_TEMPLATE:
        return tokenizer.apply_chat_template(
            [sys_msg, user_msg], tokenize=False, add_generation_prompt=False
        )
    return f"<|system|>\n{sys_msg['content']}\n<|user|>\n{user_msg['content']}\n"

def serialize_assistant(asst_msg):
    if USE_CHAT_TEMPLATE:
        return tokenizer.apply_chat_template(
            [asst_msg], tokenize=False, add_generation_prompt=False
        )
    return f"<|assistant|>\n{asst_msg['content']}"

# 어시스턴트 헤더 길이
assistant_header_text = serialize_assistant({"role": "assistant", "content": ""})
ASSISTANT_HEADER_IDS  = tokenizer.encode(assistant_header_text, add_special_tokens=False)

# ====== 8. 길이 안전 전처리: assistant 100% 보존 + pre(left-trim) ======
def pack_one(sys_msg, user_msg, asst_msg):
    pre_text  = serialize_system_user(sys_msg, user_msg)
    asst_text = serialize_assistant(asst_msg)

    pre_ids   = tokenizer.encode(pre_text,   add_special_tokens=False)
    asst_ids  = tokenizer.encode(asst_text,  add_special_tokens=False)

    header_len  = len(ASSISTANT_HEADER_IDS)
    content_ids = asst_ids[header_len:] if len(asst_ids) >= header_len else []

    # 0) 감독 토큰이 전혀 없으면 학습 무의미 → 스킵 신호
    if len(content_ids) == 0:
        return None

    # 1) 어시스턴트 본문 최소 토큰 보장
    total_allowed   = MAX_SEQ_LENGTH - KEEP_MARGIN
    min_needed_asst = header_len + max(1, MIN_ANSWER_TOKENS)
    if min_needed_asst > total_allowed:
        min_needed_asst = header_len + 1  # 최저 1토큰이라도 보장

    # 2) pre 예산 계산
    budget_for_pre = max(0, total_allowed - min_needed_asst)

    # 3) pre 왼쪽 절단
    if len(pre_ids) > budget_for_pre:
        pre_ids = pre_ids[-budget_for_pre:]

    # 4) 어시스턴트 본문(가능하면 뒤쪽 유지)
    remaining       = total_allowed - len(pre_ids)
    allow_content   = max(1, remaining - header_len)
    allow_content   = min(allow_content, len(content_ids))
    kept_content    = content_ids[-allow_content:] if allow_content > 0 else []

    final_asst_ids  = ASSISTANT_HEADER_IDS + kept_content
    input_ids       = pre_ids + final_asst_ids
    attention_mask  = [1] * len(input_ids)

    # 5) 라벨: pre(-100) + header(-100) + 본문(정답)
    labels = [-100] * len(pre_ids) + [-100] * header_len + kept_content[:]
    if len(labels) < len(input_ids):
        labels += [-100] * (len(input_ids) - len(labels))
    elif len(labels) > len(input_ids):
        labels = labels[:len(input_ids)]

    # Safety clip
    input_ids      = input_ids[:MAX_SEQ_LENGTH]
    attention_mask = attention_mask[:MAX_SEQ_LENGTH]
    labels         = labels[:MAX_SEQ_LENGTH]
    return input_ids, attention_mask, labels

# ====== 9. Dataset 생성 ======
def build_dataset(triples, batch_size=512):
    all_input_ids, all_attn, all_labels = [], [], []
    skipped = 0
    for i in tqdm(range(0, len(triples), batch_size), desc="Packing", ncols=100):
        chunk = triples[i:i+batch_size]
        for sys_msg, user_msg, asst_msg in chunk:
            packed = pack_one(sys_msg, user_msg, asst_msg)
            if packed is None:
                skipped += 1
                continue
            ids, attn, labs = packed
            all_input_ids.append(ids)
            all_attn.append(attn)
            all_labels.append(labs)
    print(f"🚧 Skipped samples (no supervised tokens): {skipped}")
    return Dataset.from_dict({"input_ids": all_input_ids, "attention_mask": all_attn, "labels": all_labels})

tokenized_ds = build_dataset(triples)

# ====== 9-1. 라벨 유효 토큰 통계 출력 ======
def quick_label_stats(ds, n=200):
    n = min(n, len(ds))
    idxs = random.sample(range(len(ds)), n)
    cnts = []
    for i in idxs:
        y = ds[i]["labels"]
        cnts.append(sum(1 for t in y if t != -100))
    return {"checked": n, "min": int(np.min(cnts)), "median": int(np.median(cnts)), "max": int(np.max(cnts))}
print("🔎 Label token stats (non -100):", quick_label_stats(tokenized_ds, n=200))

# ====== 10. LoRA 설정 & 모델에 적용 ======
peft_config = LoraConfig(
    r=64, lora_alpha=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ====== 11. SFT 하이퍼파라미터 ======
# SFTConfig는 TrainingArguments를 상속하므로 그대로 사용해도 됩니다.
# 변수 이름만 sft_args -> training_args 로 변경 (권장)
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    num_train_epochs=EPOCH,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=3,
    logging_steps=10,
    output_dir=OUTPUT_DIR,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_steps=-1,
    report_to="tensorboard",
    packing=False,  # 커스텀 패킹 사용
)

# ====== 12. 커스텀 콜레이터 (패딩) ======
class PadToSameLengthCollator:
    def __init__(self, tokenizer, max_length=None, pad_to_multiple_of=8):
        self.tok = tokenizer
        self.max_length = max_length or tokenizer.model_max_length
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        input_ids_list = [f["input_ids"] for f in features]
        attn_list     = [f.get("attention_mask", [1]*len(f["input_ids"])) for f in features]
        labels_list   = [f["labels"] for f in features]

        max_len = min(max(len(x) for x in input_ids_list), self.max_length)
        if self.pad_to_multiple_of is not None and max_len % self.pad_to_multiple_of != 0:
            max_len = ((max_len // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
            max_len = min(max_len, self.max_length)

        def pad_seq(seq, pad_id, tgt_len):
            if len(seq) >= tgt_len:
                return seq[:tgt_len]
            return seq + [pad_id] * (tgt_len - len(seq))

        batch_input_ids, batch_attention, batch_labels = [], [], []
        for ids, attn, lab in zip(input_ids_list, attn_list, labels_list):
            ids_p  = pad_seq(ids,  self.tok.pad_token_id, tgt_len=max_len)
            attn_p = pad_seq(attn, 0, tgt_len=max_len)
            lab_p  = pad_seq(lab, -100, tgt_len=max_len)

            batch_input_ids.append(ids_p)
            batch_attention.append(attn_p)
            batch_labels.append(lab_p)

        return {
            "input_ids":      torch.tensor(batch_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(batch_attention, dtype=torch.long),
            "labels":         torch.tensor(batch_labels, dtype=torch.long),
        }

data_collator = PadToSameLengthCollator(tokenizer, max_length=MAX_SEQ_LENGTH, pad_to_multiple_of=8)

# ====== 13. SFTTrainer 설정 & 학습 ======
# ✅✅✅ SFTTrainer를 Trainer로 변경 ✅✅✅
trainer = Trainer(
    model=model,
    args=training_args,      # 위에서 변경한 변수 이름
    train_dataset=tokenized_ds,
    data_collator=data_collator,
    # max_seq_length 인자는 여기에 필요 없습니다.
)

print("tokenizer model_max_length:", tokenizer.model_max_length)
print("🪄 학습 시작")
trainer.train()

# ====== 14. 어댑터 저장하기 ======
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print(f"✅ LoRA 어댑터 저장 완료: {ADAPTER_DIR}")


모든 필수 라이브러리를 최신 버전으로 설치

라이브러리 불러오기
✅ TRAIN INPUT rows: 263,204
✅ TRAIN OUTPUT rows: 263,204
✅ TRAIN MERGED rows: 263,204
🎯 최종 선택된 샘플 수: 10,000
🪄 메시지 트리플 생성 중...


Formatting:   0%|                                                         | 0/10000 [00:00<?, ?it/s]

✅ 메시지 트리플 준비 완료! 총 10,000개 샘플


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Packing:   0%|                                                               | 0/20 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8713 > 8192). Running this sequence through the model will result in indexing errors


🚧 Skipped samples (no supervised tokens): 0
🔎 Label token stats (non -100): {'checked': 200, 'min': 40, 'median': 807, 'max': 2884}
trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465
tokenizer model_max_length: 8192
🪄 학습 시작


Step,Training Loss
10,2.0183
20,2.1261
30,1.9531
40,1.9194
50,1.9435
60,1.9175
70,1.7527
80,1.8181
90,1.8474
100,1.787


✅ LoRA 어댑터 저장 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_2


# 학습된 어댑터 출력 확인

## 1번째 버전

In [None]:
# ====== 0. 필요 모델 import & 다운로드 ======
!pip install -U "transformers>=4.42.0" accelerate peft trl datasets pandas bitsandbytes rich tqdm

import os, re, pandas as pd, torch, gc
from typing import List, Tuple
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    StoppingCriteria, StoppingCriteriaList, set_seed
)
from peft import PeftModel

# ====== 1. 필요 변수 선언 ======
MAX_SEQ_LENGTH = 20000              # 토크나이저 상한(안전 마진)
CTX_TOKENS_LIMIT = 8192             # 실제 프롬프트 토큰 상한(안정성/속도)
GEN_MAX_NEW_TOKENS_BHC = 500        # BHC 생성 길이
GEN_MAX_NEW_TOKENS_DI  = 450        # DI 생성 길이
GEN_MIN_NEW_TOKENS     = 120        # 너무 일찍 끊김 방지
SHUFFLE_SEED = 42
TEST_LIMIT = 1
KEYS = ["subject_id", "hadm_id", "note_id"]

MODEL_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct"  # base
ADAPTER_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_1"
BASE_DATA   = "/content/drive/MyDrive/DILAB/MARS/mimic-iv-note_2.2/files/note/input_output"

TEST_INPUT  = f"{BASE_DATA}/discharge_test_input.parquet"
TEST_OUTPUT = f"{BASE_DATA}/discharge_test_output.parquet"

USE_NUMERIC_FILTER = True           # 입력에 없는 숫자가 나타난 줄 제거(필요 시 False)

set_seed(42)  # 재현성

# ====== 2. 모델 로드하기 ======
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = {"": 0} if device == "cuda" else "cpu"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    offload_buffers=True
)
trained_model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_DIR,
    is_trainable=False
)
trained_model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = MAX_SEQ_LENGTH

print("\n✅ Trained Model & 토크나이저 로드 완료!")

# ====== 3. 데이터 로드 & 샘플 ======
test_input_df = pd.read_parquet(TEST_INPUT)
test_output_df = pd.read_parquet(TEST_OUTPUT)
print(f"✅ TEST INPUT rows: {len(test_input_df):,}")
print(f"✅ TEST OUTPUT rows: {len(test_output_df):,}")

test_merged_df = test_input_df.merge(test_output_df, on=KEYS, how="inner")
print(f"✅ TEST MERGED rows: {len(test_merged_df):,}")

both_empty = (test_merged_df["bhc_text"].fillna("").str.len() == 0) & \
             (test_merged_df["di_text"].fillna("").str.len() == 0)
test_merged_df = test_merged_df[~both_empty].reset_index(drop=True)

if TEST_LIMIT is not None and TEST_LIMIT < len(test_merged_df):
    sample_df = test_merged_df.sample(n=TEST_LIMIT, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    sample_df = test_merged_df.copy()
print(f"🎯 최종 선택된 테스트 샘플 수: {len(sample_df):,}")

# ====== 4. 공통 규칙/프롬프트 ======
INSTRUCTION = """You are an expert physician-writer who crafts hospital discharge summaries.

STRICT RULES:
- Use ONLY facts explicitly present in the INPUT (this admission only). If not stated, omit it. No inventions.
- Preserve numbers/units/doses/medication names verbatim when cited.
- Expand abbreviations on first mention.
- English, clinical, concise; prefer bullets; avoid fluff; chronological clarity.
- Mention tubes/lines/diet/ambulation ONLY if present in INPUT.
- No new clinicians/tests/appointments beyond INPUT.
- Do NOT label values as high/low/improved unless INPUT states it.
- Merge duplicates; avoid repeating a problem header more than once.
- Do NOT introduce any new diagnoses, etiologies, problems, tests, or follow-ups that are not explicitly present in INPUT. If unsure, omit.
"""

BHC_TASK = """TASK:
Write only the section "Brief Hospital Course" as concise bullets or short paragraphs.
Cover: presentation → key findings → workup → treatments/changes with rationale → progression by day/stage if described → status at discharge → explicit pending items and follow-ups from INPUT. Include lines/tubes/diet/ambulation only if mentioned.
End your output with </END> (print it literally).
No new subsections like ‘Medications on Admission’, ‘Transitional Issues’.
"""

DI_TASK = """TASK:
Write only the section "Discharge Instructions". Patient-facing, concise, and strictly derived from INPUT directions.
Include only actionable items explicitly present in INPUT (medication directions/changes, labs/procedures with timing, activity/wound care restrictions if stated, monitoring tasks, whom/where to follow up if named). Do NOT restate the full medication list unless provided as instructions.
Use bullets when possible. Avoid invented or generic advice. No extra sections.
End your output with </END> (print it literally).
No extra headings or repeated ‘Instructions’ blocks.
"""

def build_prompt_bhc(input_text: str) -> str:
    return (
        f"<<INSTRUCTION>>\n{INSTRUCTION}\n"
        f"<<INPUT>>\n{input_text.strip()}\n"
        f"<<OUTPUT>>\nBrief Hospital Course\n{BHC_TASK}\n"
    )

def build_prompt_di(input_text: str, bhc_text_for_context: str) -> str:
    # DI 생성 시 BHC를 약간의 맥락으로 제공(입력 사실 반복 방지). 단, DI는 INPUT 사실만 허용됨을 재강조.
    return (
        f"<<INSTRUCTION>>\n{INSTRUCTION}\n"
        f"<<INPUT>>\n{input_text.strip()}\n"
        f"<<BHC (context only; do not invent beyond INPUT)>>\n{bhc_text_for_context.strip()}\n"
        f"<<OUTPUT>>\nDischarge Instructions\n{DI_TASK}\n"
    )

# ====== 5. 정지/후처리/트렁케이션 ======
class StopOnEndTag(StoppingCriteria):
    def __init__(self, tokenizer, stop_strings: List[str]):
        self.stop_ids = [tokenizer.encode(s, add_special_tokens=False) for s in stop_strings]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for s in self.stop_ids:
            L = len(s)
            if L and input_ids.shape[1] >= L:
                if torch.equal(input_ids[0, -L:], torch.tensor(s, device=input_ids.device)):
                    return True
        return False

def collect_eos_ids(tok):
    ids = []
    for sym in ["</think>", "<|im_end|>"]:
        tid = tok.convert_tokens_to_ids(sym)
        if tid is not None and tid != tok.unk_token_id:
            ids.append(tid)
    ids.append(tok.eos_token_id)
    return ids

def strip_think(text: str) -> str:
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def dedup_consecutive_lines(text: str) -> str:
    lines = [l.rstrip() for l in text.splitlines()]
    out, prev = [], None
    for l in lines:
        if l != prev:
            out.append(l)
        prev = l
    return "\n".join(out).strip()

def numeric_whitelist_filter(text: str, input_text: str, enable=True) -> str:
    if not enable:
        return text
    nums = set(re.findall(r"[-+]?\d+(?:\.\d+)?", input_text))
    out_lines = []
    for line in text.splitlines():
        line_nums = set(re.findall(r"[-+]?\d+(?:\.\d+)?", line))
        if not line_nums or line_nums.issubset(nums):
            out_lines.append(line)
    return "\n".join(out_lines).strip()

def truncate_by_tokens(text: str, max_tokens: int) -> str:
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return text
    ids = ids[-max_tokens:]  # 뒤쪽(최근 정보) 보존
    return tokenizer.decode(ids, skip_special_tokens=False)

# ====== 6. 생성 유틸 (단계별) ======
GEN_KWARGS_COMMON = dict(
    do_sample=False,                 # greedy로 지시문 준수↑ (원하면 샘플링 설정으로 변경)
    no_repeat_ngram_size=6,
    repetition_penalty=1.12,
)

def run_generate(model, prompt: str, max_new: int, min_new: int) -> str:
    prompt = truncate_by_tokens(prompt, CTX_TOKENS_LIMIT)
    eos_ids = collect_eos_ids(tokenizer)
    stopping = StoppingCriteriaList([StopOnEndTag(tokenizer, ["</END>"])])

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        gen = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new,
            min_new_tokens=min_new,
            eos_token_id=eos_ids,                # 사고/EOS 토큰에서 정지
            pad_token_id=tokenizer.pad_token_id,
            stopping_criteria=stopping,
            **GEN_KWARGS_COMMON
        )
    gen_only = [o[len(inputs.input_ids[0]):] for o in gen]
    text = tokenizer.decode(gen_only[0], skip_special_tokens=True)
    text = strip_think(text)
    text = text.replace("</END>", "").strip()
    text = dedup_consecutive_lines(text)
    return text

def generate_bhc(model, input_text: str) -> str:
    prompt = build_prompt_bhc(input_text)
    text = run_generate(model, prompt, max_new=GEN_MAX_NEW_TOKENS_BHC, min_new=GEN_MIN_NEW_TOKENS)
    if USE_NUMERIC_FILTER:
        text = numeric_whitelist_filter(text, input_text, enable=True)
    # 헤더 보장
    if not text.lower().startswith("brief hospital course"):
        text = "Brief Hospital Course\n" + text
    return text.strip()

def generate_di(model, input_text: str, bhc_text: str) -> str:
    prompt = build_prompt_di(input_text, bhc_text_for_context=bhc_text)
    text = run_generate(model, prompt, max_new=GEN_MAX_NEW_TOKENS_DI, min_new=GEN_MIN_NEW_TOKENS)
    if USE_NUMERIC_FILTER:
        text = numeric_whitelist_filter(text, input_text, enable=True)
    # 헤더 보장
    if not text.lower().startswith("discharge instructions"):
        text = "Discharge Instructions\n" + text
    return text.strip()

def generate_bhc_and_di(model, input_text: str) -> Tuple[str, str, str]:
    input_text_trunc = truncate_by_tokens(input_text, CTX_TOKENS_LIMIT)
    bhc = generate_bhc(model, input_text_trunc)
    di  = generate_di(model, input_text_trunc, bhc_text=bhc)
    final = f"{bhc}\n\n{di}"
    return bhc, di, final

# ====== 7. 실행 ======
for i, row in sample_df.iterrows():
    print(f"\n\n------------- {i+1}번째 예시 ---------------")
    input_text = row["text"]

    # 학습(LoRA) 모델로만 2-단계 생성 (BHC -> DI)
    t_bhc, t_di, trained_response  = generate_bhc_and_di(trained_model,  input_text)

    print("\n\n✅input:")
    print(input_text)

    print("\n\n✅trained_model 출력 결과:")
    print(trained_response)

    print("\n\n📌output(정답):")
    print("\t📁bhc:")
    print(row["bhc_text"])
    print("\t📁di:")
    print(row["di_text"])

# (선택) 메모리 정리
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


✅ Trained Model & 토크나이저 로드 완료!
✅ TEST INPUT rows: 68,589
✅ TEST OUTPUT rows: 68,589
✅ TEST MERGED rows: 68,589
🎯 최종 선택된 테스트 샘플 수: 1


------------- 1번째 예시 ---------------


✅input:
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: SURGERY
 
Allergies: 
Patient recorded as having No Known Allergies to Drugs
 
Attending: ___.
 
Chief Complaint:
Pancreatic neuroendocrine tumor 
 
Major Surgical or Invasive Procedure:
___:
1. Central pancreatectomy with pancreaticojejunostomy using
   a Roux-en-Y conduit.
2. Cholecystectomy.

 
History of Present Illness:
Ms. ___ is a ___ woman who was in pretty good health 
with the exception of a history of breast cancer in the past.  
On follow-up imaging of a
renal cyst situation, she was found to have an incidental lesion 
in the middle of her pancreas.  This had the hallmark features 
of a neuroendocrine tumor and this was biopsy-proven by Dr.

## 2번째 버전

In [13]:
# ====== 0. 필요 모델 import & 다운로드 ======
!pip install -U "transformers>=4.42.0" accelerate peft trl datasets pandas bitsandbytes rich tqdm

import os, re, pandas as pd, torch, gc
from typing import List, Tuple
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    StoppingCriteria, StoppingCriteriaList, set_seed
)
from peft import PeftModel

# ====== 1. 필요 변수 선언 ======
MAX_SEQ_LENGTH = 20000          # 토크나이저 상한(안전 마진)
CTX_TOKENS_LIMIT = 8192         # 실제 프롬프트 토큰 상한(안정성/속도)
GEN_MAX_NEW_TOKENS_BHC = 800    # (참고용)
GEN_MAX_NEW_TOKENS_DI  = 800    # (참고용)
# [!! NEW !!] BHC와 DI를 합쳐서 생성할 최대 길이
GEN_MAX_NEW_TOKENS_COMBINED = GEN_MAX_NEW_TOKENS_BHC + GEN_MAX_NEW_TOKENS_DI + 100 # 예: 1050 (여유분)

GEN_MIN_NEW_TOKENS     = 120    # 너무 일찍 끊김 방지
SHUFFLE_SEED = 42
TEST_LIMIT = 1
KEYS = ["subject_id", "hadm_id", "note_id"]

MODEL_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct"  # base
ADAPTER_DIR = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/adapter/original_2"
BASE_DATA   = "/content/drive/MyDrive/DILAB/MARS/mimic-iv-note_2.2/files/note/input_output"

TEST_INPUT  = f"{BASE_DATA}/discharge_test_input.parquet"
TEST_OUTPUT = f"{BASE_DATA}/discharge_test_output.parquet"

USE_NUMERIC_FILTER = True       # 입력에 없는 숫자가 나타난 줄 제거(필요 시 False)

set_seed(42)  # 재현성

# ====== 2. 모델 로드하기 ======
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = {"": 0} if device == "cuda" else "cpu"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    offload_buffers=True
)
trained_model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_DIR,
    is_trainable=False
)
trained_model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = MAX_SEQ_LENGTH

print("\n✅ Trained Model & 토크나이저 로드 완료!")

# ====== 3. 데이터 로드 & 샘플 ======
test_input_df = pd.read_parquet(TEST_INPUT)
test_output_df = pd.read_parquet(TEST_OUTPUT)
print(f"✅ TEST INPUT rows: {len(test_input_df):,}")
print(f"✅ TEST OUTPUT rows: {len(test_output_df):,}")

test_merged_df = test_input_df.merge(test_output_df, on=KEYS, how="inner")
print(f"✅ TEST MERGED rows: {len(test_merged_df):,}")

both_empty = (test_merged_df["bhc_text"].fillna("").str.len() == 0) & \
             (test_merged_df["di_text"].fillna("").str.len() == 0)
test_merged_df = test_merged_df[~both_empty].reset_index(drop=True)

if TEST_LIMIT is not None and TEST_LIMIT < len(test_merged_df):
    sample_df = test_merged_df.sample(n=TEST_LIMIT, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    sample_df = test_merged_df.copy()
print(f"🎯 최종 선택된 테스트 샘플 수: {len(sample_df):,}")

# ====== 4. 공통 규칙/프롬프트 ======
# [!! MODIFIED !!] 훈련 스크립트(====== 5. ...)와 동일한 INSTRUCTION을 사용해야 합니다.
EXAMPLES = r"""
Brief Hospital Course
- Admitted with progressive abdominal distension and confusion in the setting of cirrhosis.
- Diagnostic paracentesis negative for SBP; CXR unremarkable; UA/cultures without growth.
- Initiated diuretics: Furosemide 40 mg PO daily; Spironolactone 50 mg PO daily (dose chosen given K+ 4.5).
- Symptomatic improvement with diuresis; electrolytes stable on regimen.
- Outpatient follow-up arranged for liver clinic and endoscopic screening as noted in input.

Discharge Instructions
- Take Furosemide 40 mg PO daily and Spironolactone 50 mg PO daily exactly as prescribed.
- Follow a low-sodium diet as instructed in the input.
- Attend the scheduled outpatient visits listed in the input.
- Return for care if you develop abdominal pain, fever, or confusion as specified in the input.
"""

INSTRUCTION = (
    "You are an expert physician-writer who crafts hospital discharge summaries.\n"
    "[Guide] Adhere to the headings 'Brief Hospital Course' and 'Discharge Instructions'.\n"
    "Use only facts explicitly present in the input; if unknown, write 'not specified'.\n"
    "Do not invent medications, doses, dates, devices, or anticoagulation.\n"
    "Keep timelines consistent (POD, diet advancement, Foley removal, PT/OT, brace).\n"
    "Avoid duplication; do not repeat the 'Discharge Instructions' section.\n\n"
    f"{EXAMPLES}\n"
)

# [!! MODIFIED !!] Llama 3 채팅 템플릿을 사용하도록 프롬프트 빌더 변경
def build_prompt_combined(input_text: str) -> str:
    system_msg = {"role": "system", "content": INSTRUCTION}
    user_msg   = {"role": "user", "content": input_text.strip()}

    # tokenizer.apply_chat_template을 사용하여 훈련 시와 동일한 형식으로 만듭니다.
    # add_generation_prompt=True가 핵심입니다. (Llama3가 <|start_header_id|>assistant<|end_header_id|>를 추가함)
    prompt_text = tokenizer.apply_chat_template(
        [system_msg, user_msg],
        tokenize=False,
        add_generation_prompt=True
    )
    return prompt_text


# ====== 5. 정지/후처리/트렁케이션 ======
# (이 섹션의 함수들은 '====== 6. ...'에서 호출되므로 그대로 둡니다)
class StopOnEndTag(StoppingCriteria):
    def __init__(self, tokenizer, stop_strings: List[str]):
        self.stop_ids = [tokenizer.encode(s, add_special_tokens=False) for s in stop_strings]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for s in self.stop_ids:
            L = len(s)
            if L and input_ids.shape[1] >= L:
                if torch.equal(input_ids[0, -L:], torch.tensor(s, device=input_ids.device)):
                    return True
        return False

def collect_eos_ids(tok):
    ids = []
    for sym in ["</think>", "<|im_end|>"]:
        tid = tok.convert_tokens_to_ids(sym)
        if tid is not None and tid != tok.unk_token_id:
            ids.append(tid)
    # Llama 3의 공식 EOS 토큰
    ids.append(tokenizer.eos_token_id)
    # Llama 3가 훈련 시 사용했을 수 있는 추가적인 EOS 토큰
    ids.append(tokenizer.convert_tokens_to_ids("<|eot_id|>"))
    return list(set(ids)) # 중복 제거

def strip_think(text: str) -> str:
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def dedup_consecutive_lines(text: str) -> str:
    lines = [l.rstrip() for l in text.splitlines()]
    out, prev = [], None
    for l in lines:
        if l != prev:
            out.append(l)
        prev = l
    return "\n".join(out).strip()

def numeric_whitelist_filter(text: str, input_text: str, enable=True) -> str:
    if not enable:
        return text
    nums = set(re.findall(r"[-+]?\d+(?:\.\d+)?", input_text))
    out_lines = []
    for line in text.splitlines():
        line_nums = set(re.findall(r"[-+]?\d+(?:\.\d+)?", line))
        if not line_nums or line_nums.issubset(nums):
            out_lines.append(line)
    return "\n".join(out_lines).strip()

def truncate_by_tokens(text: str, max_tokens: int) -> str:
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return text
    ids = ids[-max_tokens:]  # 뒤쪽(최근 정보) 보존
    return tokenizer.decode(ids, skip_special_tokens=False)

# ====== 6. 생성 유틸 (단일 호출) ======
GEN_KWARGS_COMMON = dict(
    do_sample=False,              # greedy로 지시문 준수↑ (원하면 샘플링 설정으로 변경)
    no_repeat_ngram_size=6,
    repetition_penalty=1.12,
)

def run_generate(model, prompt: str, max_new: int, min_new: int) -> str:
    prompt = truncate_by_tokens(prompt, CTX_TOKENS_LIMIT)

    # [!! MODIFIED !!] Llama 3 채팅 템플릿은 <|eot_id|> 같은 eos_token_id로 정지합니다.
    eos_ids = collect_eos_ids(tokenizer) # Llama 3의 모든 관련 EOS ID 수집

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        gen = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new,
            min_new_tokens=min_new,
            eos_token_id=eos_ids,                # Llama 3의 공식 EOS 토큰
            pad_token_id=tokenizer.pad_token_id,
            # stopping_criteria 삭제 (커스텀 </END> 불필요)
            **GEN_KWARGS_COMMON
        )
    gen_only = [o[len(inputs.input_ids[0]):] for o in gen]
    text = tokenizer.decode(gen_only[0], skip_special_tokens=True)
    text = strip_think(text)
    text = dedup_consecutive_lines(text)
    return text.strip()

# [!! MODIFIED !!] 2단계 파이프라인을 단일 호출 및 후처리로 변경
def generate_bhc_and_di(model, input_text: str) -> Tuple[str, str, str]:
    input_text_trunc = truncate_by_tokens(input_text, CTX_TOKENS_LIMIT)

    # 1. Llama 3 채팅 템플릿 프롬프트 빌드
    prompt = build_prompt_combined(input_text_trunc)

    # 2. 모델 '단일' 호출 (BHC와 DI가 합쳐진 텍스트 생성)
    raw_text = run_generate(
        model,
        prompt,
        max_new=GEN_MAX_NEW_TOKENS_COMBINED, # 합쳐진 최대 길이 사용
        min_new=GEN_MIN_NEW_TOKENS
    )

    # 3. (선택적) 숫자 필터링
    if USE_NUMERIC_FILTER:
        raw_text = numeric_whitelist_filter(raw_text, input_text_trunc, enable=True)

    # 4. 생성된 단일 텍스트를 BHC와 DI로 분리
    bhc_text = ""
    di_text = ""

    # 'Discharge Instructions' 헤더를 기준으로 분리
    di_header_pattern = re.compile(r'Discharge Instructions', re.IGNORECASE)
    match = di_header_pattern.search(raw_text)

    if match:
        bhc_text = raw_text[:match.start()].strip()
        di_text = raw_text[match.start():].strip()

        # BHC 헤더 보장 (모델이 이미 훈련받았으므로, 이 부분은 보험용입니다)
        if not bhc_text.lower().startswith("brief hospital course"):
             bhc_text = "Brief Hospital Course\n" + bhc_text
    else:
        # DI가 생성되지 않았거나, BHC만 생성된 경우
        bhc_text = raw_text.strip()
        if not bhc_text.lower().startswith("brief hospital course"):
             bhc_text = "Brief Hospital Course\n" + bhc_text
        di_text = "Discharge Instructions\n(No DI generated)" # DI 생성 실패 표시

    return bhc_text, di_text, raw_text # raw_text를 trained_response로 반환

# ====== 7. 실행 ======
# (이 섹션은 수정 없이 그대로 사용합니다)
for i, row in sample_df.iterrows():
    print(f"\n\n------------- {i+1}번째 예시 ---------------")

    # [!! DEBUG !!] 모델에 실제 입력되는 원본 텍스트 확인
    # print("\n\n🔥 [DEBUG] 모델에 실제 입력되는 INPUT 텍스트:")
    # print("="*50)
    # print(row["text"])
    # print("="*50)

    # 학습(LoRA) 모델로만 단일 생성
    t_bhc, t_di, trained_response   = generate_bhc_and_di(trained_model,   row["text"])

    # (input_text는 너무 기므로 주석 처리)
    # print("\n\n✅input:")
    # print(row["text"])

    print("\n\n✅trained_model 출력 결과:")
    print(trained_response)

    print("\n\n📌output(정답):")
    print("\t📁bhc:")
    print(row["bhc_text"])
    print("\t📁di:")
    print(row["di_text"])

# (선택) 메모리 정리
gc.collect()
if device == "cuda":
    torch.cuda.empty_cache()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


✅ Trained Model & 토크나이저 로드 완료!
✅ TEST INPUT rows: 68,589
✅ TEST OUTPUT rows: 68,589
✅ TEST MERGED rows: 68,589
🎯 최종 선택된 테스트 샘플 수: 1


------------- 1번째 예시 ---------------


✅trained_model 출력 결과:
The patient underwent central pancreatectomy with
pancreaticojejunostomy and cholecystectomy on ___. The
operation went well without complications. She was transferred to
the PACU post-operatively where she remained hemodynamically
stable. Her NG tube was kept in place due to high output. On
post-operative day one, her NG tube output decreased and it was
removed. She tolerated sips and ice chips. On POD 2, she began
passing flatus and was started on clear liquids which were
tolerated well. She was advanced to regular food which she also
tolerated well but complained of some nausea. Zofran was added
and helped with her symptoms. On POD 3, she continued to tolerate
regular food and her diet was advanced to regular. She was
started on reglan to help stimulate bowel function. She passed
flatus aga

# 결과들 파일로 만들기

## 1번째 버전

In [None]:
# ====== X. 100개 샘플 생성 & 저장 ======
!pip -q install pyarrow  # Parquet 저장용(한 번만 설치하면 됩니다)

import os
import pandas as pd
from tqdm import tqdm

SAVE_PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet"
N_SAMPLES = 100

# 1) 샘플 100개 선택 (데이터가 100개 미만이면 가능한 만큼만 사용)
if len(test_merged_df) >= N_SAMPLES:
    eval_df = test_merged_df.sample(n=N_SAMPLES, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    eval_df = test_merged_df.copy()

print(f"📦 생성 대상 샘플 수: {len(eval_df)}")

# 2) 생성 실행 및 결과 수집
rows = []
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Generating"):
    input_text = row["text"]
    # 이미 위에서 정의한 함수 사용
    bhc_pred, di_pred, pred_full = generate_bhc_and_di(trained_model, input_text)

    rows.append({
        # 키 보존
        "subject_id": row.get("subject_id", None),
        "hadm_id": row.get("hadm_id", None),
        "note_id": row.get("note_id", None),

        # 입력/출력
        "input_text": input_text,
        "bhc_pred": bhc_pred,
        "di_pred": di_pred,
        "pred_full": pred_full,

        # 정답(비교용)
        "bhc_gt": row.get("bhc_text", None),
        "di_gt": row.get("di_text", None),
    })

results_df = pd.DataFrame(rows)

# 3) 저장 디렉토리 생성 & Parquet 저장
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
results_df.to_parquet(SAVE_PATH, index=False)
print(f"✅ 저장 완료: {SAVE_PATH} (rows={len(results_df)})")


📦 생성 대상 샘플 수: 100


Generating: 100%|██████████| 100/100 [2:27:38<00:00, 88.59s/it]

✅ 저장 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet (rows=100)





## 2번쨰 버전

In [14]:
# ====== X. 100개 샘플 생성 & 저장 ======
!pip -q install pyarrow  # Parquet 저장용(한 번만 설치하면 됩니다)

import os
import pandas as pd
from tqdm import tqdm

SAVE_PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet"
N_SAMPLES = 100

# 1) 샘플 100개 선택 (데이터가 100개 미만이면 가능한 만큼만 사용)
if len(test_merged_df) >= N_SAMPLES:
    eval_df = test_merged_df.sample(n=N_SAMPLES, random_state=SHUFFLE_SEED).reset_index(drop=True)
else:
    eval_df = test_merged_df.copy()

print(f"📦 생성 대상 샘플 수: {len(eval_df)}")

# 2) 생성 실행 및 결과 수집
rows = []
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Generating"):
    input_text = row["text"]
    # 이미 위에서 정의한 함수 사용
    bhc_pred, di_pred, pred_full = generate_bhc_and_di(trained_model, input_text)

    rows.append({
        # 키 보존
        "subject_id": row.get("subject_id", None),
        "hadm_id": row.get("hadm_id", None),
        "note_id": row.get("note_id", None),

        # 입력/출력
        "input_text": input_text,
        "bhc_pred": bhc_pred,
        "di_pred": di_pred,
        "pred_full": pred_full,

        # 정답(비교용)
        "bhc_gt": row.get("bhc_text", None),
        "di_gt": row.get("di_text", None),
    })

results_df = pd.DataFrame(rows)

# 3) 저장 디렉토리 생성 & Parquet 저장
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
results_df.to_parquet(SAVE_PATH, index=False)
print(f"✅ 저장 완료: {SAVE_PATH} (rows={len(results_df)})")


📦 생성 대상 샘플 수: 100


Generating: 100%|██████████| 100/100 [2:20:23<00:00, 84.23s/it]

✅ 저장 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet (rows=100)





# 모델 평가

## 데이터 준비

### 1번째 버전

In [None]:
# ============================================
# A) Google Drive 마운트 & 기본 설정
# ============================================
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import pandas as pd

# 결과 Parquet 경로 (필요 시 수정)
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet"

# 확인할 행 개수
VIEW_NUM_ALL = 15   # 전체 열 프린트용 상단 N개
VIEW_NUM_SECT = 5   # 섹션(BHC/DI) 분리 확인용 상단 N개

# ============================================
# B) 데이터 로드 & 기본 정보 출력
# ============================================
df = pd.read_parquet(PATH)
print(f"✅ 로드 완료: {PATH}")
print(f"📦 총 행 수: {len(df)}")
print("🧩 컬럼 목록:")
for i, c in enumerate(df.columns, start=1):
    print(f"  {i:>2}. {c}")

# ============================================
# C) 유틸: 다양한 컬럼명(구버전/신버전) 자동 매핑
#    - 파일 저장 시점마다 컬럼명이 다를 수 있어 안전하게 처리
# ============================================
def first_existing_col(df, candidates, default=None):
    """candidates 중 df에 존재하는 첫 번째 컬럼명을 반환. 없으면 default."""
    for c in candidates:
        if c in df.columns:
            return c
    return default

# 예측(파인튜닝) 전체 출력 텍스트(모델 통합 출력)
COL_PRED_FULL = first_existing_col(
    df,
    candidates=["pred_full", "trained_model_output", "model_output", "llm_output"]
)

# 예측 BHC/DI가 별도 저장된 경우(없을 수 있음)
COL_BHC_PRED = first_existing_col(df, ["bhc_pred", "pred_bhc", "bhc_output"])
COL_DI_PRED  = first_existing_col(df, ["di_pred", "pred_di", "di_output"])

# 정답(GT)
COL_BHC_GT = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_DI_GT  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# 원문 입력 텍스트(있으면 디버깅에 도움)
COL_INPUT = first_existing_col(df, ["input_text", "input", "note_text", "text"])

# 키(있으면 함께 보여주면 편리)
COL_SUBJECT = first_existing_col(df, ["subject_id"])
COL_HADM    = first_existing_col(df, ["hadm_id"])
COL_NOTE    = first_existing_col(df, ["note_id"])

# ============================================
# D) (옵션) 상단 VIEW_NUM_ALL개 행에 대해 모든 열 프린트
#    - 원래 올려주신 "전체 열 프린트" 루틴을 안전하게 정리
# ============================================
print("\n" + "="*80)
print(f"🔎 상단 {VIEW_NUM_ALL}개 행, 모든 열 출력")
print("="*80)

for ridx, row in df.head(VIEW_NUM_ALL).iterrows():
    print(f"\n📌======= {ridx + 1}번 행 =======📌")
    for j, col in enumerate(df.columns, start=1):
        val = row[col]
        print(f"✅{j}. {col}\n{val}\n")

# ============================================
# E) (옵션) 특정 열만 골라 출력
#    - 사용자가 원래 쓰신 'trained_model_output', 'original_bhc', 'original_di'를
#      실제 df에 존재하는 컬럼으로 자동 매핑해 안전하게 처리
# ============================================
# '원하는 열(라벨)' -> '실제 df 컬럼명' 매핑
columns_wanted = {
    "trained_model_output": COL_PRED_FULL,
    "original_bhc": COL_BHC_GT,
    "original_di": COL_DI_GT
}

# 실제 존재하는 열만 추려내기
columns_to_print = [real for label, real in columns_wanted.items() if real is not None]

if columns_to_print:
    print("\n" + "="*80)
    print(f"🧷 상단 {min(VIEW_NUM_ALL, len(df))}개 행, 선택 열만 출력")
    print("="*80)

    # 원래 인덱스(컬럼 순서 기반 번호) 표시를 위해 위치 계산
    col_indices = {col: df.columns.get_loc(col) + 1 for col in columns_to_print}

    for ridx, row in df.head(VIEW_NUM_ALL).iterrows():
        print(f"\n📌======= {ridx + 1}번 행 =======📌")
        for col_name in columns_to_print:
            value = row[col_name]
            original_index = col_indices[col_name]
            print(f"✅{original_index}. {col_name}\n{value}\n")
else:
    print("\n⚠️ 선택 출력용으로 지정한 컬럼이 현재 데이터프레임에 없습니다. (무시하고 다음 단계 진행)")

# ============================================
# F) BHC/DI 섹션 분리 유틸
#    - 다양한 표기(약어/대소문자/콜론/개행 등)에 견고하게 동작하도록 정규식 보강
# ============================================
def split_bhc_di(text: str):
    """
    통합 출력(text)에서 'Brief Hospital Course(=BHC)'와 'Discharge Instructions(=DI)' 구간을 분리.
    - 가능한 프리픽스: 'Brief Hospital Course' | 'BHC'
    - 가능한 프리픽스: 'Discharge Instructions' | 'DI'
    - 콜론(:) 유무, 개행 유무 등 다양한 경우 대응
    """
    if not isinstance(text, str):
        return "", ""

    # \s*와 [:.-]* 등으로 콜론/대시/점 등 가벼운 구두점과 공백 허용
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",  # 'Brief Hospital Course'
        r"(?i)\bBHC\b\s*[:.\-]*"                          # 'BHC'
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",    # 'Discharge Instructions'
        r"(?i)\bDI\b\s*[:.\-]*"                            # 'DI'
    ]

    # 가장 먼저 등장하는 BHC/DI 앵커 탐색
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            if (bhc_match is None) or (m.start() < bhc_match.start()):
                bhc_match = m

    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            if (di_match is None) or (m.start() < di_match.start()):
                di_match = m

    bhc_text = ""
    di_text = ""

    if bhc_match and di_match:
        # 등장 순서에 따라 안전하게 분리
        first, second = (bhc_match, di_match) if bhc_match.start() < di_match.start() else (di_match, bhc_match)
        if first is bhc_match:
            # BHC 먼저, 그 다음 DI
            bhc_start = bhc_match.end()
            bhc_end   = di_match.start()
            bhc_text  = text[bhc_start:bhc_end].strip().lstrip(':').strip()
            di_text   = text[di_match.end():].strip().lstrip(':').strip()
        else:
            # DI 먼저, 그 다음 BHC (드문 경우지만 방어)
            di_start = di_match.end()
            di_end   = bhc_match.start()
            di_text  = text[di_start:di_end].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# G) 상단 VIEW_NUM_SECT개 행에 대해: 예측/정답 BHC/DI 비교 출력
#    - pred_full(통합 출력)에서 분리 시도
#    - pred_full이 없거나 분리가 안 되면 bhc_pred/di_pred 컬럼 사용
# ============================================
print("\n" + "="*80)
print(f"🧪 상단 {VIEW_NUM_SECT}개 행: 예측/정답 BHC·DI 비교")
print("="*80)

for ridx, row in df.head(VIEW_NUM_SECT).iterrows():
    # Key 정보(있을 때만 표시)
    key_info = []
    if COL_SUBJECT: key_info.append(f"subject_id={row[COL_SUBJECT]}")
    if COL_HADM:    key_info.append(f"hadm_id={row[COL_HADM]}")
    if COL_NOTE:    key_info.append(f"note_id={row[COL_NOTE]}")
    key_str = (" | ".join(key_info)) if key_info else "키 정보 없음"

    print(f"\n\n📌======= {ridx + 1}번 행 분할 결과 확인 =======📌")
    print(f"🧷 {key_str}")

    # 1) 파인튜닝 모델 통합 출력에서 분리 (가능하면 우선 사용)
    tm_output = row[COL_PRED_FULL] if COL_PRED_FULL else None
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_output) if tm_output else ("", "")

    # 2) 통합 출력에서 분리 실패 시, 별도 저장된 예측 컬럼 사용
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (row[COL_BHC_PRED] if COL_BHC_PRED else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (row[COL_DI_PRED]  if COL_DI_PRED  else "")

    # 3) 정답(GT)
    gt_bhc = str(row[COL_BHC_GT]) if (COL_BHC_GT and pd.notna(row[COL_BHC_GT])) else ""
    gt_di  = str(row[COL_DI_GT])  if (COL_DI_GT  and pd.notna(row[COL_DI_GT]))  else ""

    # 4) (옵션) 입력 원문도 보고 싶으면 출력
    if COL_INPUT:
        print("\n--- [입력 원문 일부] ---")
        src = str(row[COL_INPUT])
        print(src[:600] + ("..." if len(src) > 600 else ""))

    # 5) 결과 출력
    print("\n--- [파인튜닝 모델 BHC] ---")
    print(tm_bhc if tm_bhc else "(비어있음)")

    print("\n--- [파인튜닝 모델 DI] ---")
    print(tm_di if tm_di else "(비어있음)")

    print("\n--- [정답 BHC] ---")
    print(gt_bhc if gt_bhc else "(비어있음)")

    print("\n--- [정답 DI] ---")
    print(gt_di if gt_di else "(비어있음)")

    print("=" * 60)

# ============================================
# H) (옵션) Parquet 내용 요약 저장/내보내기
#     - CSV로 일부 열만 내보내 미리보기 용도로 활용하고 싶을 때
# ============================================
# 필요 시 주석 해제:
# preview_cols = [c for c in [COL_SUBJECT, COL_HADM, COL_NOTE, COL_INPUT, COL_PRED_FULL, COL_BHC_PRED, COL_DI_PRED, COL_BHC_GT, COL_DI_GT] if c]
# preview_path = PATH.replace(".parquet", "_preview.csv")
# df[preview_cols].head(50).to_csv(preview_path, index=False)
# print(f"\n📝 미리보기 CSV 저장: {preview_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ 로드 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet
📦 총 행 수: 100
🧩 컬럼 목록:
   1. subject_id
   2. hadm_id
   3. note_id
   4. input_text
   5. bhc_pred
   6. di_pred
   7. pred_full
   8. bhc_gt
   9. di_gt

🔎 상단 15개 행, 모든 열 출력

✅1. subject_id
19058186

✅2. hadm_id
20680591

✅3. note_id
19058186-DS-10

✅4. input_text
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: SURGERY
 
Allergies: 
Patient recorded as having No Known Allergies to Drugs
 
Attending: ___.
 
Chief Complaint:
Pancreatic neuroendocrine tumor 
 
Major Surgical or Invasive Procedure:
___:
1. Central pancreatectomy with pancreaticojejunostomy using
   a Roux-en-Y conduit.
2. Cholecystectomy.

 
History of Present Illness:
Ms. ___ is a ___ woman who w

In [None]:
# 평가에 필요한 라이브러리 다운
!pip uninstall transformers bert-score -y
!pip install transformers bert-score

Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
[0mCollecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m118.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, bert-score
Successfully installed bert-score-0.3.13 transformers-4.57.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# ============================================
# BERTScore 계산 스크립트 (견고/가변 컬럼 대응 버전)
# ============================================
import pandas as pd
from bert_score import score
import re
import warnings
import torch
import numpy as np
import os

# 경고 정리
warnings.filterwarnings("ignore", message="Some weights of RobertaModel were not initialized from the model checkpoint")

# --- 설정 ---
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet"
VIEW_NUM = None            # None이면 전체, 숫자면 상위 N개만
MODEL_TYPE = "roberta-large"  # 'microsoft/deberta-xlarge-mnli' 등 변경 가능
LANG = "en"                # bert-score 언어
BATCH_SIZE = 1             # score() 내부 배치 (메모리 여유 있으면 키워도 됨)
# --- 설정 끝 ---

# 장치 설정 (문자열)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device.upper()}  |  Model: {MODEL_TYPE}")

# 데이터 로드
df = pd.read_parquet(PATH)
total_rows = len(df) if VIEW_NUM is None else min(VIEW_NUM, len(df))
print(f"총 {total_rows}개 행에 대한 BERTScore 계산을 시작합니다...")

# ============================================
# 유틸: 컬럼 자동 매핑 (다른 스크립트/시점에 따라 컬럼명이 달라져도 동작)
# ============================================
def first_existing_col(df_, candidates, default=None):
    for c in candidates:
        if c in df_.columns:
            return c
    return default

# 통합 출력(모델 출력 전체) 추정 컬럼
COL_OM_FULL = first_existing_col(df, ["original_model_output", "om_output", "base_output", "original_output"])
COL_TM_FULL = first_existing_col(df, ["trained_model_output", "pred_full", "model_output", "llm_output"])

# BHC/DI가 분리 저장된 경우(없을 수 있음)
COL_OM_BHC = first_existing_col(df, ["om_bhc", "bhc_om", "original_bhc_pred"])
COL_OM_DI  = first_existing_col(df, ["om_di", "di_om", "original_di_pred"])
COL_TM_BHC = first_existing_col(df, ["bhc_pred", "pred_bhc", "tm_bhc"])
COL_TM_DI  = first_existing_col(df, ["di_pred", "pred_di", "tm_di"])

# 정답(GT)
COL_GT_BHC = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_GT_DI  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# ============================================
# BHC/DI 분리 함수 (강화된 정규식)
#  - 'Brief Hospital Course' 또는 'BHC'
#  - 'Discharge Instructions' 또는 'DI'
#  - 콜론/대시/점/공백 유연 처리
# ============================================
def split_bhc_di(text: str):
    if not isinstance(text, str):
        return "", ""
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",
        r"(?i)\bBHC\b\s*[:.\-]*"
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",
        r"(?i)\bDI\b\s*[:.\-]*"
    ]
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (bhc_match is None or m.start() < bhc_match.start()):
            bhc_match = m
    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (di_match is None or m.start() < di_match.start()):
            di_match = m

    bhc_text, di_text = "", ""
    if bhc_match and di_match:
        if bhc_match.start() < di_match.start():
            bhc_text = text[bhc_match.end():di_match.start()].strip().lstrip(':').strip()
            di_text  = text[di_match.end():].strip().lstrip(':').strip()
        else:
            di_text  = text[di_match.end():bhc_match.start()].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# 안전 계산 함수 (bert-score F1만 추출)
# ============================================
def safe_bertscore_f1(candidate: str, reference: str):
    if not candidate or not reference:
        return np.nan
    try:
        _, _, F1 = score(
            [candidate], [reference],
            model_type=MODEL_TYPE,
            lang=LANG,
            verbose=False,
            device=device,
            batch_size=BATCH_SIZE
        )
        return F1[0].item() if F1.numel() > 0 else np.nan
    except Exception as e:
        # 필요 시 상세 로그
        # print(f"[WARN] BERTScore 실패: {e}")
        return np.nan

# ============================================
# 행 루프
#  - 우선: 통합 출력에서 BHC/DI 분리 시도
#  - 실패 시: 별도 저장된 BHC/DI 컬럼 사용
# ============================================
results = []

iter_rows = df.iterrows() if VIEW_NUM is None else df.head(VIEW_NUM).iterrows()

for idx, row in iter_rows:
    if ((idx + 1) % 50 == 0) or (idx == 0):
        print(f"행 {idx + 1}/{total_rows} 처리 중...")

    # 원본/파인튜닝 통합 출력
    om_full = str(row[COL_OM_FULL]) if COL_OM_FULL and pd.notna(row[COL_OM_FULL]) else ""
    tm_full = str(row[COL_TM_FULL]) if COL_TM_FULL and pd.notna(row[COL_TM_FULL]) else ""

    om_bhc_from_full, om_di_from_full = split_bhc_di(om_full) if om_full else ("", "")
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_full) if tm_full else ("", "")

    # 별도 컬럼 폴백
    om_bhc = om_bhc_from_full if om_bhc_from_full else (str(row[COL_OM_BHC]) if COL_OM_BHC and pd.notna(row[COL_OM_BHC]) else "")
    om_di  = om_di_from_full  if om_di_from_full  else (str(row[COL_OM_DI])  if COL_OM_DI  and pd.notna(row[COL_OM_DI])  else "")
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (str(row[COL_TM_BHC]) if COL_TM_BHC and pd.notna(row[COL_TM_BHC]) else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (str(row[COL_TM_DI])  if COL_TM_DI  and pd.notna(row[COL_TM_DI])  else "")

    # 정답
    gt_bhc = str(row[COL_GT_BHC]) if COL_GT_BHC and pd.notna(row[COL_GT_BHC]) else ""
    gt_di  = str(row[COL_GT_DI])  if COL_GT_DI  and pd.notna(row[COL_GT_DI])  else ""

    # --- BERTScore 계산 (6가지 비교) ---
    f1_om_gt_bhc = safe_bertscore_f1(om_bhc, gt_bhc)
    f1_tm_gt_bhc = safe_bertscore_f1(tm_bhc, gt_bhc)
    f1_om_gt_di  = safe_bertscore_f1(om_di,  gt_di)
    f1_tm_gt_di  = safe_bertscore_f1(tm_di,  gt_di)
    f1_om_tm_bhc = safe_bertscore_f1(om_bhc, tm_bhc)
    f1_om_tm_di  = safe_bertscore_f1(om_di,  tm_di)

    results.append({
        "row_index": idx + 1,
        "F1_OM_vs_GT_BHC": f1_om_gt_bhc,
        "F1_TM_vs_GT_BHC": f1_tm_gt_bhc,
        "F1_OM_vs_GT_DI":  f1_om_gt_di,
        "F1_TM_vs_GT_DI":  f1_tm_gt_di,
        "F1_OM_vs_TM_BHC": f1_om_tm_bhc,
        "F1_OM_vs_TM_DI":  f1_om_tm_di,
        # 디버깅에 유용한 길이 정보 (옵션)
        "len_om_bhc": len(om_bhc) if om_bhc else 0,
        "len_tm_bhc": len(tm_bhc) if tm_bhc else 0,
        "len_gt_bhc": len(gt_bhc) if gt_bhc else 0,
        "len_om_di":  len(om_di)  if om_di  else 0,
        "len_tm_di":  len(tm_di)  if tm_di  else 0,
        "len_gt_di":  len(gt_di)  if gt_di  else 0,
    })

print(f"총 {total_rows}개 행에 대한 BERTScore 계산 완료.")

# 결과 DataFrame
results_df = pd.DataFrame(results)

# 상위 5개 행 출력
print("\n===== BERTScore F1 결과 (상위 5개 행) =====")
print(results_df.head())

# 평균( NaN 제외 )
print("\n===== 평균 BERTScore F1 (NaN 제외) =====")
avg = results_df.mean(numeric_only=True)
print(f"원본 모델 BHC vs 정답 BHC: {avg.get('F1_OM_vs_GT_BHC', 0.0):.4f}")
print(f"파인튜닝 모델 BHC vs 정답 BHC: {avg.get('F1_TM_vs_GT_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 정답 DI: {avg.get('F1_OM_vs_GT_DI', 0.0):.4f}")
print(f"파인튜닝 모델 DI vs 정답 DI: {avg.get('F1_TM_vs_GT_DI', 0.0):.4f}")
print(f"원본 모델 BHC vs 파인튜닝 모델 BHC: {avg.get('F1_OM_vs_TM_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 파인튜닝 모델 DI: {avg.get('F1_OM_vs_TM_DI', 0.0):.4f}")

# 각 컬럼별 NaN 개수
print("\n===== 각 비교별 NaN (계산 건너뜀) 개수 =====")
print(results_df[[
    "F1_OM_vs_GT_BHC", "F1_TM_vs_GT_BHC",
    "F1_OM_vs_GT_DI",  "F1_TM_vs_GT_DI",
    "F1_OM_vs_TM_BHC", "F1_OM_vs_TM_DI"
]].isnull().sum())

# (옵션) 결과 저장
out_parquet = PATH.replace(".parquet", "_bertscore.parquet")
out_csv = PATH.replace(".parquet", "_bertscore.csv")
try:
    results_df.to_parquet(out_parquet, index=False)
    print(f"\n💾 저장 완료(Parquet): {out_parquet}")
except Exception as e:
    print(f"\n[WARN] Parquet 저장 실패: {e}")
results_df.to_csv(out_csv, index=False)
print(f"💾 저장 완료(CSV): {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: CUDA:0  |  Model: roberta-large
총 100개 행에 대한 BERTScore 계산을 시작합니다...
행 1/100 처리 중...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

행 50/100 처리 중...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

행 100/100 처리 중...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


총 100개 행에 대한 BERTScore 계산 완료.

===== BERTScore F1 결과 (상위 5개 행) =====
   row_index  F1_OM_vs_GT_BHC  F1_TM_vs_GT_BHC  F1_OM_vs_GT_DI  \
0          1              NaN         0.833945             NaN   
1          2              NaN         0.797073             NaN   
2          3              NaN         0.842362             NaN   
3          4              NaN         0.857543             NaN   
4          5              NaN         0.854245             NaN   

   F1_TM_vs_GT_DI  F1_OM_vs_TM_BHC  F1_OM_vs_TM_DI  len_om_bhc  len_tm_bhc  \
0        0.885874              NaN             NaN           0         725   
1        0.813444              NaN             NaN           0        2209   
2        0.807339              NaN             NaN           0        1880   
3        0.818683              NaN             NaN           0         978   
4        0.849310              NaN             NaN           0        1817   

   len_gt_bhc  len_om_di  len_tm_di  len_gt_di  
0        1705   

### 2번째 버전

In [2]:
import pandas as pd

path = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet"

VIEW_NUM = 15

df = pd.read_parquet(path)

for idx, row in df.iterrows():
  print(f"\n📌======= {idx + 1}번 행 =======📌")

  for i, col in enumerate(df.columns, start=1):
    value = row[col]

    print(f"✅{i}. {col}\n{value}\n")


✅1. subject_id
19058186

✅2. hadm_id
20680591

✅3. note_id
19058186-DS-10

✅4. input_text
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: SURGERY
 
Allergies: 
Patient recorded as having No Known Allergies to Drugs
 
Attending: ___.
 
Chief Complaint:
Pancreatic neuroendocrine tumor 
 
Major Surgical or Invasive Procedure:
___:
1. Central pancreatectomy with pancreaticojejunostomy using
   a Roux-en-Y conduit.
2. Cholecystectomy.

 
History of Present Illness:
Ms. ___ is a ___ woman who was in pretty good health 
with the exception of a history of breast cancer in the past.  
On follow-up imaging of a
renal cyst situation, she was found to have an incidental lesion 
in the middle of her pancreas.  This had the hallmark features 
of a neuroendocrine tumor and this was biopsy-proven by Dr. 
___ an endoscopic ultrasound approach.  I met with 
Ms. ___ week prior to the operation a

In [1]:
# ============================================
# A) Google Drive 마운트 & 기본 설정
# ============================================
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import pandas as pd

# 결과 Parquet 경로 (필요 시 수정)
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet"

# 확인할 행 개수
VIEW_NUM_ALL = 15   # 전체 열 프린트용 상단 N개
VIEW_NUM_SECT = 5   # 섹션(BHC/DI) 분리 확인용 상단 N개

# ============================================
# B) 데이터 로드 & 기본 정보 출력
# ============================================
df = pd.read_parquet(PATH)
print(f"✅ 로드 완료: {PATH}")
print(f"📦 총 행 수: {len(df)}")
print("🧩 컬럼 목록:")
for i, c in enumerate(df.columns, start=1):
    print(f"  {i:>2}. {c}")

# ============================================
# C) 유틸: 다양한 컬럼명(구버전/신버전) 자동 매핑
#    - 파일 저장 시점마다 컬럼명이 다를 수 있어 안전하게 처리
# ============================================
def first_existing_col(df, candidates, default=None):
    """candidates 중 df에 존재하는 첫 번째 컬럼명을 반환. 없으면 default."""
    for c in candidates:
        if c in df.columns:
            return c
    return default

# 예측(파인튜닝) 전체 출력 텍스트(모델 통합 출력)
COL_PRED_FULL = first_existing_col(
    df,
    candidates=["pred_full", "trained_model_output", "model_output", "llm_output"]
)

# 예측 BHC/DI가 별도 저장된 경우(없을 수 있음)
COL_BHC_PRED = first_existing_col(df, ["bhc_pred", "pred_bhc", "bhc_output"])
COL_DI_PRED  = first_existing_col(df, ["di_pred", "pred_di", "di_output"])

# 정답(GT)
COL_BHC_GT = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_DI_GT  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# 원문 입력 텍스트(있으면 디버깅에 도움)
COL_INPUT = first_existing_col(df, ["input_text", "input", "note_text", "text"])

# 키(있으면 함께 보여주면 편리)
COL_SUBJECT = first_existing_col(df, ["subject_id"])
COL_HADM    = first_existing_col(df, ["hadm_id"])
COL_NOTE    = first_existing_col(df, ["note_id"])

# ============================================
# D) (옵션) 상단 VIEW_NUM_ALL개 행에 대해 모든 열 프린트
#    - 원래 올려주신 "전체 열 프린트" 루틴을 안전하게 정리
# ============================================
print("\n" + "="*80)
print(f"🔎 상단 {VIEW_NUM_ALL}개 행, 모든 열 출력")
print("="*80)

for ridx, row in df.head(VIEW_NUM_ALL).iterrows():
    print(f"\n📌======= {ridx + 1}번 행 =======📌")
    for j, col in enumerate(df.columns, start=1):
        val = row[col]
        print(f"✅{j}. {col}\n{val}\n")

# ============================================
# E) (옵션) 특정 열만 골라 출력
#    - 사용자가 원래 쓰신 'trained_model_output', 'original_bhc', 'original_di'를
#      실제 df에 존재하는 컬럼으로 자동 매핑해 안전하게 처리
# ============================================
# '원하는 열(라벨)' -> '실제 df 컬럼명' 매핑
columns_wanted = {
    "trained_model_output": COL_PRED_FULL,
    "original_bhc": COL_BHC_GT,
    "original_di": COL_DI_GT
}

# 실제 존재하는 열만 추려내기
columns_to_print = [real for label, real in columns_wanted.items() if real is not None]

if columns_to_print:
    print("\n" + "="*80)
    print(f"🧷 상단 {min(VIEW_NUM_ALL, len(df))}개 행, 선택 열만 출력")
    print("="*80)

    # 원래 인덱스(컬럼 순서 기반 번호) 표시를 위해 위치 계산
    col_indices = {col: df.columns.get_loc(col) + 1 for col in columns_to_print}

    for ridx, row in df.head(VIEW_NUM_ALL).iterrows():
        print(f"\n📌======= {ridx + 1}번 행 =======📌")
        for col_name in columns_to_print:
            value = row[col_name]
            original_index = col_indices[col_name]
            print(f"✅{original_index}. {col_name}\n{value}\n")
else:
    print("\n⚠️ 선택 출력용으로 지정한 컬럼이 현재 데이터프레임에 없습니다. (무시하고 다음 단계 진행)")

# ============================================
# F) BHC/DI 섹션 분리 유틸
#    - 다양한 표기(약어/대소문자/콜론/개행 등)에 견고하게 동작하도록 정규식 보강
# ============================================
def split_bhc_di(text: str):
    """
    통합 출력(text)에서 'Brief Hospital Course(=BHC)'와 'Discharge Instructions(=DI)' 구간을 분리.
    - 가능한 프리픽스: 'Brief Hospital Course' | 'BHC'
    - 가능한 프리픽스: 'Discharge Instructions' | 'DI'
    - 콜론(:) 유무, 개행 유무 등 다양한 경우 대응
    """
    if not isinstance(text, str):
        return "", ""

    # \s*와 [:.-]* 등으로 콜론/대시/점 등 가벼운 구두점과 공백 허용
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",  # 'Brief Hospital Course'
        r"(?i)\bBHC\b\s*[:.\-]*"                          # 'BHC'
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",    # 'Discharge Instructions'
        r"(?i)\bDI\b\s*[:.\-]*"                            # 'DI'
    ]

    # 가장 먼저 등장하는 BHC/DI 앵커 탐색
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            if (bhc_match is None) or (m.start() < bhc_match.start()):
                bhc_match = m

    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            if (di_match is None) or (m.start() < di_match.start()):
                di_match = m

    bhc_text = ""
    di_text = ""

    if bhc_match and di_match:
        # 등장 순서에 따라 안전하게 분리
        first, second = (bhc_match, di_match) if bhc_match.start() < di_match.start() else (di_match, bhc_match)
        if first is bhc_match:
            # BHC 먼저, 그 다음 DI
            bhc_start = bhc_match.end()
            bhc_end   = di_match.start()
            bhc_text  = text[bhc_start:bhc_end].strip().lstrip(':').strip()
            di_text   = text[di_match.end():].strip().lstrip(':').strip()
        else:
            # DI 먼저, 그 다음 BHC (드문 경우지만 방어)
            di_start = di_match.end()
            di_end   = bhc_match.start()
            di_text  = text[di_start:di_end].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# G) 상단 VIEW_NUM_SECT개 행에 대해: 예측/정답 BHC/DI 비교 출력
#    - pred_full(통합 출력)에서 분리 시도
#    - pred_full이 없거나 분리가 안 되면 bhc_pred/di_pred 컬럼 사용
# ============================================
print("\n" + "="*80)
print(f"🧪 상단 {VIEW_NUM_SECT}개 행: 예측/정답 BHC·DI 비교")
print("="*80)

for ridx, row in df.head(VIEW_NUM_SECT).iterrows():
    # Key 정보(있을 때만 표시)
    key_info = []
    if COL_SUBJECT: key_info.append(f"subject_id={row[COL_SUBJECT]}")
    if COL_HADM:    key_info.append(f"hadm_id={row[COL_HADM]}")
    if COL_NOTE:    key_info.append(f"note_id={row[COL_NOTE]}")
    key_str = (" | ".join(key_info)) if key_info else "키 정보 없음"

    print(f"\n\n📌======= {ridx + 1}번 행 분할 결과 확인 =======📌")
    print(f"🧷 {key_str}")

    # 1) 파인튜닝 모델 통합 출력에서 분리 (가능하면 우선 사용)
    tm_output = row[COL_PRED_FULL] if COL_PRED_FULL else None
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_output) if tm_output else ("", "")

    # 2) 통합 출력에서 분리 실패 시, 별도 저장된 예측 컬럼 사용
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (row[COL_BHC_PRED] if COL_BHC_PRED else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (row[COL_DI_PRED]  if COL_DI_PRED  else "")

    # 3) 정답(GT)
    gt_bhc = str(row[COL_BHC_GT]) if (COL_BHC_GT and pd.notna(row[COL_BHC_GT])) else ""
    gt_di  = str(row[COL_DI_GT])  if (COL_DI_GT  and pd.notna(row[COL_DI_GT]))  else ""

    # 4) (옵션) 입력 원문도 보고 싶으면 출력
    if COL_INPUT:
        print("\n--- [입력 원문 일부] ---")
        src = str(row[COL_INPUT])
        print(src[:600] + ("..." if len(src) > 600 else ""))

    # 5) 결과 출력
    print("\n--- [파인튜닝 모델 BHC] ---")
    print(tm_bhc if tm_bhc else "(비어있음)")

    print("\n--- [파인튜닝 모델 DI] ---")
    print(tm_di if tm_di else "(비어있음)")

    print("\n--- [정답 BHC] ---")
    print(gt_bhc if gt_bhc else "(비어있음)")

    print("\n--- [정답 DI] ---")
    print(gt_di if gt_di else "(비어있음)")

    print("=" * 60)

# ============================================
# H) (옵션) Parquet 내용 요약 저장/내보내기
#     - CSV로 일부 열만 내보내 미리보기 용도로 활용하고 싶을 때
# ============================================
# 필요 시 주석 해제:
# preview_cols = [c for c in [COL_SUBJECT, COL_HADM, COL_NOTE, COL_INPUT, COL_PRED_FULL, COL_BHC_PRED, COL_DI_PRED, COL_BHC_GT, COL_DI_GT] if c]
# preview_path = PATH.replace(".parquet", "_preview.csv")
# df[preview_cols].head(50).to_csv(preview_path, index=False)
# print(f"\n📝 미리보기 CSV 저장: {preview_path}")


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
o If you go 48 hours without a bowel movement, or have pain 
moving the bowels, call your surgeon. 
 
PAIN MANAGEMENT:
o It is normal to feel some discomfort/pain following abdominal 
surgery. This pain is often described as "soreness". 
o Your pain should get better day by day. If you find the pain 
is getting worse instead of better, please contact your surgeon.
o You will receive a prescription for pain medicine to take by 
mouth. It is important to take this medicine as directed. o Do 
not take it more frequently than prescribed. Do not take more 
medicine at one time than prescribed.
o Your pain medicine will work better if you take it before your 
pain gets too severe.
o Talk with your surgeon about how long you will need to take 
prescription pain medicine. Please don't take any other pain 
medicine, including non-prescription pain medicine, unless your 
surgeon has said its okay.
o If you are experiencing no pain, it is okay to 

In [7]:
# 평가에 필요한 라이브러리 다운
!pip uninstall transformers bert-score -y
!pip install transformers bert-score

Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: bert-score 0.3.13
Uninstalling bert-score-0.3.13:
  Successfully uninstalled bert-score-0.3.13
Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: transformers, bert-score
Successfully installed bert-score-0.3.13 transformers-4.57.1


In [1]:
from google.colab import drive
drive.mount('/content/drive')

# ============================================
# BERTScore 계산 스크립트 (견고/가변 컬럼 대응 버전)
# ============================================
import pandas as pd
from bert_score import score
import re
import warnings
import torch
import numpy as np
import os

# 경고 정리
warnings.filterwarnings("ignore", message="Some weights of RobertaModel were not initialized from the model checkpoint")

# --- 설정 ---
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet"
VIEW_NUM = None            # None이면 전체, 숫자면 상위 N개만
MODEL_TYPE = "roberta-large"  # 'microsoft/deberta-xlarge-mnli' 등 변경 가능
LANG = "en"                # bert-score 언어
BATCH_SIZE = 1             # score() 내부 배치 (메모리 여유 있으면 키워도 됨)
# --- 설정 끝 ---

# 장치 설정 (문자열)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device.upper()}  |  Model: {MODEL_TYPE}")

# 데이터 로드
df = pd.read_parquet(PATH)
total_rows = len(df) if VIEW_NUM is None else min(VIEW_NUM, len(df))
print(f"총 {total_rows}개 행에 대한 BERTScore 계산을 시작합니다...")

# ============================================
# 유틸: 컬럼 자동 매핑 (다른 스크립트/시점에 따라 컬럼명이 달라져도 동작)
# ============================================
def first_existing_col(df_, candidates, default=None):
    for c in candidates:
        if c in df_.columns:
            return c
    return default

# 통합 출력(모델 출력 전체) 추정 컬럼
COL_OM_FULL = first_existing_col(df, ["original_model_output", "om_output", "base_output", "original_output"])
COL_TM_FULL = first_existing_col(df, ["trained_model_output", "pred_full", "model_output", "llm_output"])

# BHC/DI가 분리 저장된 경우(없을 수 있음)
COL_OM_BHC = first_existing_col(df, ["om_bhc", "bhc_om", "original_bhc_pred"])
COL_OM_DI  = first_existing_col(df, ["om_di", "di_om", "original_di_pred"])
COL_TM_BHC = first_existing_col(df, ["bhc_pred", "pred_bhc", "tm_bhc"])
COL_TM_DI  = first_existing_col(df, ["di_pred", "pred_di", "tm_di"])

# 정답(GT)
COL_GT_BHC = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_GT_DI  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# ============================================
# BHC/DI 분리 함수 (강화된 정규식)
#  - 'Brief Hospital Course' 또는 'BHC'
#  - 'Discharge Instructions' 또는 'DI'
#  - 콜론/대시/점/공백 유연 처리
# ============================================
def split_bhc_di(text: str):
    if not isinstance(text, str):
        return "", ""
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",
        r"(?i)\bBHC\b\s*[:.\-]*"
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",
        r"(?i)\bDI\b\s*[:.\-]*"
    ]
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (bhc_match is None or m.start() < bhc_match.start()):
            bhc_match = m
    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (di_match is None or m.start() < di_match.start()):
            di_match = m

    bhc_text, di_text = "", ""
    if bhc_match and di_match:
        if bhc_match.start() < di_match.start():
            bhc_text = text[bhc_match.end():di_match.start()].strip().lstrip(':').strip()
            di_text  = text[di_match.end():].strip().lstrip(':').strip()
        else:
            di_text  = text[di_match.end():bhc_match.start()].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# 안전 계산 함수 (bert-score F1만 추출)
# ============================================
def safe_bertscore_f1(candidate: str, reference: str):
    if not candidate or not reference:
        return np.nan
    try:
        _, _, F1 = score(
            [candidate], [reference],
            model_type=MODEL_TYPE,
            lang=LANG,
            verbose=False,
            device=device,
            batch_size=BATCH_SIZE
        )
        return F1[0].item() if F1.numel() > 0 else np.nan
    except Exception as e:
        # 필요 시 상세 로그
        # print(f"[WARN] BERTScore 실패: {e}")
        return np.nan

# ============================================
# 행 루프
#  - 우선: 통합 출력에서 BHC/DI 분리 시도
#  - 실패 시: 별도 저장된 BHC/DI 컬럼 사용
# ============================================
results = []

iter_rows = df.iterrows() if VIEW_NUM is None else df.head(VIEW_NUM).iterrows()

for idx, row in iter_rows:
    if ((idx + 1) % 50 == 0) or (idx == 0):
        print(f"행 {idx + 1}/{total_rows} 처리 중...")

    # 원본/파인튜닝 통합 출력
    om_full = str(row[COL_OM_FULL]) if COL_OM_FULL and pd.notna(row[COL_OM_FULL]) else ""
    tm_full = str(row[COL_TM_FULL]) if COL_TM_FULL and pd.notna(row[COL_TM_FULL]) else ""

    om_bhc_from_full, om_di_from_full = split_bhc_di(om_full) if om_full else ("", "")
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_full) if tm_full else ("", "")

    # 별도 컬럼 폴백
    om_bhc = om_bhc_from_full if om_bhc_from_full else (str(row[COL_OM_BHC]) if COL_OM_BHC and pd.notna(row[COL_OM_BHC]) else "")
    om_di  = om_di_from_full  if om_di_from_full  else (str(row[COL_OM_DI])  if COL_OM_DI  and pd.notna(row[COL_OM_DI])  else "")
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (str(row[COL_TM_BHC]) if COL_TM_BHC and pd.notna(row[COL_TM_BHC]) else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (str(row[COL_TM_DI])  if COL_TM_DI  and pd.notna(row[COL_TM_DI])  else "")

    # 정답
    gt_bhc = str(row[COL_GT_BHC]) if COL_GT_BHC and pd.notna(row[COL_GT_BHC]) else ""
    gt_di  = str(row[COL_GT_DI])  if COL_GT_DI  and pd.notna(row[COL_GT_DI])  else ""

    # --- BERTScore 계산 (6가지 비교) ---
    f1_om_gt_bhc = safe_bertscore_f1(om_bhc, gt_bhc)
    f1_tm_gt_bhc = safe_bertscore_f1(tm_bhc, gt_bhc)
    f1_om_gt_di  = safe_bertscore_f1(om_di,  gt_di)
    f1_tm_gt_di  = safe_bertscore_f1(tm_di,  gt_di)
    f1_om_tm_bhc = safe_bertscore_f1(om_bhc, tm_bhc)
    f1_om_tm_di  = safe_bertscore_f1(om_di,  tm_di)

    results.append({
        "row_index": idx + 1,
        "F1_OM_vs_GT_BHC": f1_om_gt_bhc,
        "F1_TM_vs_GT_BHC": f1_tm_gt_bhc,
        "F1_OM_vs_GT_DI":  f1_om_gt_di,
        "F1_TM_vs_GT_DI":  f1_tm_gt_di,
        "F1_OM_vs_TM_BHC": f1_om_tm_bhc,
        "F1_OM_vs_TM_DI":  f1_om_tm_di,
        # 디버깅에 유용한 길이 정보 (옵션)
        "len_om_bhc": len(om_bhc) if om_bhc else 0,
        "len_tm_bhc": len(tm_bhc) if tm_bhc else 0,
        "len_gt_bhc": len(gt_bhc) if gt_bhc else 0,
        "len_om_di":  len(om_di)  if om_di  else 0,
        "len_tm_di":  len(tm_di)  if tm_di  else 0,
        "len_gt_di":  len(gt_di)  if gt_di  else 0,
    })

print(f"총 {total_rows}개 행에 대한 BERTScore 계산 완료.")

# 결과 DataFrame
results_df = pd.DataFrame(results)

# 상위 5개 행 출력
print("\n===== BERTScore F1 결과 (상위 5개 행) =====")
print(results_df.head())

# 평균( NaN 제외 )
print("\n===== 평균 BERTScore F1 (NaN 제외) =====")
avg = results_df.mean(numeric_only=True)
print(f"원본 모델 BHC vs 정답 BHC: {avg.get('F1_OM_vs_GT_BHC', 0.0):.4f}")
print(f"파인튜닝 모델 BHC vs 정답 BHC: {avg.get('F1_TM_vs_GT_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 정답 DI: {avg.get('F1_OM_vs_GT_DI', 0.0):.4f}")
print(f"파인튜닝 모델 DI vs 정답 DI: {avg.get('F1_TM_vs_GT_DI', 0.0):.4f}")
print(f"원본 모델 BHC vs 파인튜닝 모델 BHC: {avg.get('F1_OM_vs_TM_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 파인튜닝 모델 DI: {avg.get('F1_OM_vs_TM_DI', 0.0):.4f}")

# 각 컬럼별 NaN 개수
print("\n===== 각 비교별 NaN (계산 건너뜀) 개수 =====")
print(results_df[[
    "F1_OM_vs_GT_BHC", "F1_TM_vs_GT_BHC",
    "F1_OM_vs_GT_DI",  "F1_TM_vs_GT_DI",
    "F1_OM_vs_TM_BHC", "F1_OM_vs_TM_DI"
]].isnull().sum())

# (옵션) 결과 저장
out_parquet = PATH.replace(".parquet", "_bertscore.parquet")
out_csv = PATH.replace(".parquet", "_bertscore.csv")
try:
    results_df.to_parquet(out_parquet, index=False)
    print(f"\n💾 저장 완료(Parquet): {out_parquet}")
except Exception as e:
    print(f"\n[WARN] Parquet 저장 실패: {e}")
results_df.to_csv(out_csv, index=False)
print(f"💾 저장 완료(CSV): {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: CUDA:0  |  Model: roberta-large
총 100개 행에 대한 BERTScore 계산을 시작합니다...
행 1/100 처리 중...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

행 50/100 처리 중...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

행 100/100 처리 중...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


총 100개 행에 대한 BERTScore 계산 완료.

===== BERTScore F1 결과 (상위 5개 행) =====
   row_index  F1_OM_vs_GT_BHC  F1_TM_vs_GT_BHC  F1_OM_vs_GT_DI  \
0          1              NaN         0.846594             NaN   
1          2              NaN         0.816348             NaN   
2          3              NaN         0.841213             NaN   
3          4              NaN         0.834872             NaN   
4          5              NaN         0.860443             NaN   

   F1_TM_vs_GT_DI  F1_OM_vs_TM_BHC  F1_OM_vs_TM_DI  len_om_bhc  len_tm_bhc  \
0        0.936990              NaN             NaN           0        2448   
1        0.842119              NaN             NaN           0         343   
2        0.874338              NaN             NaN           0        1735   
3        0.906795              NaN             NaN           0         755   
4        0.852825              NaN             NaN           0        1555   

   len_gt_bhc  len_om_di  len_tm_di  len_gt_di  
0        1705   

## 결과 시각화

In [None]:
# ============================================
# BERTScore 결과 박스플롯 (matplotlib-only / no subplots / no colors)
# ============================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# (필요 시 경로 수정) BERTScore 스크립트에서 저장한 결과 파일
RESULTS_CSV     = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results_bertscore.csv"
RESULTS_PARQUET = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results_bertscore.parquet"

# 출력 경로(폴더). Colab이라면 /content 아래가 빠릅니다.
OUTPUT_DIR = "/content"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- 1) results_df 확보 ----------
if 'results_df' not in globals():
    if os.path.exists(RESULTS_CSV):
        results_df = pd.read_csv(RESULTS_CSV)
        print(f"✅ CSV에서 로드: {RESULTS_CSV}")
    elif os.path.exists(RESULTS_PARQUET):
        results_df = pd.read_parquet(RESULTS_PARQUET)
        print(f"✅ Parquet에서 로드: {RESULTS_PARQUET}")
    else:
        raise RuntimeError(
            "오류: 메모리에 'results_df'가 없고, RESULTS_CSV/RESULTS_PARQUET 경로에서도 찾을 수 없습니다. "
            "먼저 BERTScore 스크립트를 실행해 결과 파일을 저장해 주세요."
        )
else:
    print("✅ 메모리의 results_df 사용")

# ---------- 2) 헬퍼: matplotlib 박스플롯 ----------
def simple_boxplot(data_dict, title, ylabel, out_name):
    """
    data_dict: {label(str): 1D array-like of floats}
    유효 데이터(NaN 제외)가 있는 항목만 그림. 없으면 건너뜀.
    """
    prepared, labels = [], []
    for lab, vals in data_dict.items():
        arr = np.asarray(vals, dtype=float)
        arr = arr[~np.isnan(arr)]  # NaN 제거
        if arr.size > 0:
            prepared.append(arr)
            labels.append(lab)

    if not prepared:
        print(f"[SKIP] '{title}'에 사용할 유효 데이터가 없습니다.")
        return None

    fig, ax = plt.subplots(figsize=(8, 6))   # 한 그림에 하나의 차트
    ax.boxplot(prepared, labels=labels, showmeans=True)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.0, 1.0)
    fig.tight_layout()

    out_path = os.path.join(OUTPUT_DIR, out_name)
    fig.savefig(out_path)
    plt.close(fig)
    print(f"💾 저장 완료: {out_path}")
    return out_path

# ---------- 3) BHC 데이터 준비 (OM 컬럼 없어도 OK) ----------
bhc_cols = [c for c in ["F1_OM_vs_GT_BHC", "F1_TM_vs_GT_BHC"] if c in results_df.columns]
label_map_bhc = {
    "F1_OM_vs_GT_BHC": "Original vs. GT",
    "F1_TM_vs_GT_BHC": "Fine-tuned vs. GT",
}
bhc_data = {label_map_bhc[c]: results_df[c].values for c in bhc_cols}

# ---------- 4) DI 데이터 준비 ----------
di_cols = [c for c in ["F1_OM_vs_GT_DI", "F1_TM_vs_GT_DI"] if c in results_df.columns]
label_map_di = {
    "F1_OM_vs_GT_DI": "Original vs. GT",
    "F1_TM_vs_GT_DI": "Fine-tuned vs. GT",
}
di_data = {label_map_di[c]: results_df[c].values for c in di_cols}

# ---------- 5) 그리기 (각각 별도 그림) ----------
bhc_png = simple_boxplot(
    data_dict=bhc_data,
    title="BHC: BERTScore F1 Comparison (vs. Ground Truth)",
    ylabel="BERTScore F1",
    out_name="bertscore_BHC_vs_GT_boxplot.png"
)

di_png = simple_boxplot(
    data_dict=di_data,
    title="DI: BERTScore F1 Comparison (vs. Ground Truth)",
    ylabel="BERTScore F1",
    out_name="bertscore_DI_vs_GT_boxplot.png"
)

# ---------- 6) 안내 ----------
if not bhc_data:
    print("⚠️ BHC 관련 컬럼을 찾지 못했습니다: F1_OM_vs_GT_BHC, F1_TM_vs_GT_BHC")
if not di_data:
    print("⚠️ DI 관련 컬럼을 찾지 못했습니다: F1_OM_vs_GT_DI, F1_TM_vs_GT_DI")

print("완료.")


✅ 메모리의 results_df 사용
💾 저장 완료: /content/bertscore_BHC_vs_GT_boxplot.png


  ax.boxplot(prepared, labels=labels, showmeans=True)
  ax.boxplot(prepared, labels=labels, showmeans=True)


💾 저장 완료: /content/bertscore_DI_vs_GT_boxplot.png
완료.


## BELU 평가

In [3]:
!pip install nltk



### 1번째

In [None]:
# ============================================
# BLEU-4 계산 스크립트 (가변 컬럼/결측 안전/정규식 보강)
# ============================================
# 필요 시 최초 1회:
# !pip -q install nltk

import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# --- NLTK 데이터 준비 ---
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# --- 설정 ---
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet"
VIEW_NUM = None  # None이면 전체, 숫자면 상위 N개만
SAVE_RESULTS = True  # 결과 저장 여부
# --- 설정 끝 ---

# ============================================
# 유틸: 컬럼 자동 매핑
# ============================================
def first_existing_col(df_, candidates, default=None):
    """candidates 중 df에 존재하는 첫 번째 컬럼명을 반환. 없으면 default."""
    for c in candidates:
        if c in df_.columns:
            return c
    return default

# ============================================
# BHC/DI 분리 함수 (강화된 정규식)
#  - 'Brief Hospital Course' 또는 'BHC'
#  - 'Discharge Instructions' 또는 'DI'
#  - 콜론/대시/점/공백 유연 처리
# ============================================
def split_bhc_di(text: str):
    if not isinstance(text, str):
        return "", ""
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",
        r"(?i)\bBHC\b\s*[:.\-]*",
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",
        r"(?i)\bDI\b\s*[:.\-]*",
    ]
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (bhc_match is None or m.start() < bhc_match.start()):
            bhc_match = m
    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (di_match is None or m.start() < di_match.start()):
            di_match = m

    bhc_text = ""
    di_text = ""
    if bhc_match and di_match:
        if bhc_match.start() < di_match.start():
            bhc_text = text[bhc_match.end():di_match.start()].strip().lstrip(':').strip()
            di_text  = text[di_match.end():].strip().lstrip(':').strip()
        else:
            di_text  = text[di_match.end():bhc_match.start()].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# BLEU 계산 함수 (NaN/오류 안전)
# ============================================
chencherry = SmoothingFunction()
def calc_bleu(hypothesis_str: str, reference_str: str):
    if not hypothesis_str or not reference_str:
        return np.nan
    try:
        # 간단한 공백 토크나이즈; 필요 시 word_tokenize로 교체 가능
        hyp = hypothesis_str.split()
        ref = [reference_str.split()]
        return sentence_bleu(ref, hyp, smoothing_function=chencherry.method1)
    except Exception as e:
        # 필요 시 상세 로그 출력
        # print(f"[WARN] BLEU 실패: {e}")
        return np.nan

# ============================================
# 데이터 로드
# ============================================
df = pd.read_parquet(PATH)
print(f"✅ 로드 완료: {PATH}")
print(f"📦 총 행 수: {len(df)}")

# 컬럼 자동 매핑
# 통합 출력(모델 전체 출력)
COL_OM_FULL = first_existing_col(df, ["original_model_output", "om_full", "base_output", "original_output"])
COL_TM_FULL = first_existing_col(df, ["trained_model_output", "pred_full", "model_output", "llm_output"])

# 별도 저장된 BHC/DI 예측 (없을 수 있음)
COL_OM_BHC = first_existing_col(df, ["om_bhc", "bhc_om", "original_bhc_pred"])
COL_OM_DI  = first_existing_col(df, ["om_di", "di_om", "original_di_pred"])
COL_TM_BHC = first_existing_col(df, ["bhc_pred", "pred_bhc", "tm_bhc"])
COL_TM_DI  = first_existing_col(df, ["di_pred", "pred_di", "tm_di"])

# 정답(GT)
COL_GT_BHC = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_GT_DI  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# 처리할 행 결정
iter_rows = df.iterrows() if VIEW_NUM is None else df.head(VIEW_NUM).iterrows()
total_rows = len(df) if VIEW_NUM is None else min(VIEW_NUM, len(df))
print(f"총 {total_rows}개 행에 대한 BLEU-4 점수 계산을 시작합니다...")

# ============================================
# 루프
# ============================================
records = []
for idx, row in iter_rows:
    if ((idx + 1) % 50 == 0) or (idx == 0):
        print(f"행 {idx + 1}/{total_rows} 처리 중...")

    # 원본/파인튜닝 통합 출력에서 분리 시도
    om_full = str(row[COL_OM_FULL]) if COL_OM_FULL and pd.notna(row[COL_OM_FULL]) else ""
    tm_full = str(row[COL_TM_FULL]) if COL_TM_FULL and pd.notna(row[COL_TM_FULL]) else ""

    om_bhc_from_full, om_di_from_full = split_bhc_di(om_full) if om_full else ("", "")
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_full) if tm_full else ("", "")

    # 별도 컬럼 폴백
    om_bhc = om_bhc_from_full if om_bhc_from_full else (str(row[COL_OM_BHC]) if COL_OM_BHC and pd.notna(row[COL_OM_BHC]) else "")
    om_di  = om_di_from_full  if om_di_from_full  else (str(row[COL_OM_DI])  if COL_OM_DI  and pd.notna(row[COL_OM_DI])  else "")
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (str(row[COL_TM_BHC]) if COL_TM_BHC and pd.notna(row[COL_TM_BHC]) else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (str(row[COL_TM_DI])  if COL_TM_DI  and pd.notna(row[COL_TM_DI])  else "")

    # 정답
    gt_bhc = str(row[COL_GT_BHC]) if COL_GT_BHC and pd.notna(row[COL_GT_BHC]) else ""
    gt_di  = str(row[COL_GT_DI])  if COL_GT_DI  and pd.notna(row[COL_GT_DI])  else ""

    # BLEU 계산
    bleu_om_gt_bhc = calc_bleu(om_bhc, gt_bhc)
    bleu_tm_gt_bhc = calc_bleu(tm_bhc, gt_bhc)
    bleu_om_gt_di  = calc_bleu(om_di,  gt_di)
    bleu_tm_gt_di  = calc_bleu(tm_di,  gt_di)

    records.append({
        "row_index": idx + 1,
        "BLEU4_OM_vs_GT_BHC": bleu_om_gt_bhc,
        "BLEU4_TM_vs_GT_BHC": bleu_tm_gt_bhc,
        "BLEU4_OM_vs_GT_DI":  bleu_om_gt_di,
        "BLEU4_TM_vs_GT_DI":  bleu_tm_gt_di,
        # 디버깅용 길이 정보(선택)
        "len_om_bhc": len(om_bhc) if om_bhc else 0,
        "len_tm_bhc": len(tm_bhc) if tm_bhc else 0,
        "len_gt_bhc": len(gt_bhc) if gt_bhc else 0,
        "len_om_di":  len(om_di)  if om_di  else 0,
        "len_tm_di":  len(tm_di)  if tm_di  else 0,
        "len_gt_di":  len(gt_di)  if gt_di  else 0,
    })

print(f"총 {total_rows}개 행에 대한 BLEU-4 점수 계산 완료.")

# 결과 DF
bleu_results_df = pd.DataFrame(records)

# 결과 미리보기
print("\n===== BLEU-4 점수 결과 (상위 5개 행) =====")
print(bleu_results_df.head())

# 평균( NaN 제외 )
print("\n===== 평균 BLEU-4 점수 (NaN 제외) =====")
avg = bleu_results_df.mean(numeric_only=True)
print(f"원본 모델 BHC vs 정답 BHC: {avg.get('BLEU4_OM_vs_GT_BHC', 0.0):.4f}")
print(f"파인튜닝 모델 BHC vs 정답 BHC: {avg.get('BLEU4_TM_vs_GT_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 정답 DI: {avg.get('BLEU4_OM_vs_GT_DI', 0.0):.4f}")
print(f"파인튜닝 모델 DI vs 정답 DI: {avg.get('BLEU4_TM_vs_GT_DI', 0.0):.4f}")

print("\n===== 각 비교별 NaN (계산 건너뜀) 개수 =====")
print(bleu_results_df[[
    "BLEU4_OM_vs_GT_BHC", "BLEU4_TM_vs_GT_BHC",
    "BLEU4_OM_vs_GT_DI",  "BLEU4_TM_vs_GT_DI"
]].isnull().sum())

# (옵션) 결과 저장
if SAVE_RESULTS:
    out_parquet = PATH.replace(".parquet", "_bleu.parquet")
    out_csv     = PATH.replace(".parquet", "_bleu.csv")
    try:
        bleu_results_df.to_parquet(out_parquet, index=False)
        print(f"\n💾 저장 완료(Parquet): {out_parquet}")
    except Exception as e:
        print(f"\n[WARN] Parquet 저장 실패: {e}")
    bleu_results_df.to_csv(out_csv, index=False)
    print(f"💾 저장 완료(CSV): {out_csv}")


✅ 로드 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_1_results.parquet
📦 총 행 수: 100
총 100개 행에 대한 BLEU-4 점수 계산을 시작합니다...
행 1/100 처리 중...
행 50/100 처리 중...
행 100/100 처리 중...
총 100개 행에 대한 BLEU-4 점수 계산 완료.

===== BLEU-4 점수 결과 (상위 5개 행) =====
   row_index  BLEU4_OM_vs_GT_BHC  BLEU4_TM_vs_GT_BHC  BLEU4_OM_vs_GT_DI  \
0          1                 NaN            0.008771                NaN   
1          2                 NaN            0.008608                NaN   
2          3                 NaN            0.087148                NaN   
3          4                 NaN            0.289226                NaN   
4          5                 NaN            0.097239                NaN   

   BLEU4_TM_vs_GT_DI  len_om_bhc  len_tm_bhc  len_gt_bhc  len_om_di  \
0           0.370533           0         725        1705          0   
1           0.026980           0        2209         948          0   
2           0.036398           0        1880        2067          0   
3      

### 2번쨰

In [4]:
# ============================================
# BLEU-4 계산 스크립트 (가변 컬럼/결측 안전/정규식 보강)
# ============================================
# 필요 시 최초 1회:
# !pip -q install nltk

import os
import re
import numpy as np
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# --- NLTK 데이터 준비 ---
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# --- 설정 ---
PATH = "/content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet"
VIEW_NUM = None  # None이면 전체, 숫자면 상위 N개만
SAVE_RESULTS = True  # 결과 저장 여부
# --- 설정 끝 ---

# ============================================
# 유틸: 컬럼 자동 매핑
# ============================================
def first_existing_col(df_, candidates, default=None):
    """candidates 중 df에 존재하는 첫 번째 컬럼명을 반환. 없으면 default."""
    for c in candidates:
        if c in df_.columns:
            return c
    return default

# ============================================
# BHC/DI 분리 함수 (강화된 정규식)
#  - 'Brief Hospital Course' 또는 'BHC'
#  - 'Discharge Instructions' 또는 'DI'
#  - 콜론/대시/점/공백 유연 처리
# ============================================
def split_bhc_di(text: str):
    if not isinstance(text, str):
        return "", ""
    bhc_patterns = [
        r"(?i)\bBrief\s*Hospital\s*Course\b\s*[:.\-]*",
        r"(?i)\bBHC\b\s*[:.\-]*",
    ]
    di_patterns = [
        r"(?i)\bDischarge\s*Instructions\b\s*[:.\-]*",
        r"(?i)\bDI\b\s*[:.\-]*",
    ]
    bhc_match = None
    di_match = None

    for pat in bhc_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (bhc_match is None or m.start() < bhc_match.start()):
            bhc_match = m
    for pat in di_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m and (di_match is None or m.start() < di_match.start()):
            di_match = m

    bhc_text = ""
    di_text = ""
    if bhc_match and di_match:
        if bhc_match.start() < di_match.start():
            bhc_text = text[bhc_match.end():di_match.start()].strip().lstrip(':').strip()
            di_text  = text[di_match.end():].strip().lstrip(':').strip()
        else:
            di_text  = text[di_match.end():bhc_match.start()].strip().lstrip(':').strip()
            bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif bhc_match:
        bhc_text = text[bhc_match.end():].strip().lstrip(':').strip()
    elif di_match:
        di_text = text[di_match.end():].strip().lstrip(':').strip()

    return bhc_text, di_text

# ============================================
# BLEU 계산 함수 (NaN/오류 안전)
# ============================================
chencherry = SmoothingFunction()
def calc_bleu(hypothesis_str: str, reference_str: str):
    if not hypothesis_str or not reference_str:
        return np.nan
    try:
        # 간단한 공백 토크나이즈; 필요 시 word_tokenize로 교체 가능
        hyp = hypothesis_str.split()
        ref = [reference_str.split()]
        return sentence_bleu(ref, hyp, smoothing_function=chencherry.method1)
    except Exception as e:
        # 필요 시 상세 로그 출력
        # print(f"[WARN] BLEU 실패: {e}")
        return np.nan

# ============================================
# 데이터 로드
# ============================================
df = pd.read_parquet(PATH)
print(f"✅ 로드 완료: {PATH}")
print(f"📦 총 행 수: {len(df)}")

# 컬럼 자동 매핑
# 통합 출력(모델 전체 출력)
COL_OM_FULL = first_existing_col(df, ["original_model_output", "om_full", "base_output", "original_output"])
COL_TM_FULL = first_existing_col(df, ["trained_model_output", "pred_full", "model_output", "llm_output"])

# 별도 저장된 BHC/DI 예측 (없을 수 있음)
COL_OM_BHC = first_existing_col(df, ["om_bhc", "bhc_om", "original_bhc_pred"])
COL_OM_DI  = first_existing_col(df, ["om_di", "di_om", "original_di_pred"])
COL_TM_BHC = first_existing_col(df, ["bhc_pred", "pred_bhc", "tm_bhc"])
COL_TM_DI  = first_existing_col(df, ["di_pred", "pred_di", "tm_di"])

# 정답(GT)
COL_GT_BHC = first_existing_col(df, ["bhc_gt", "original_bhc", "gt_bhc"])
COL_GT_DI  = first_existing_col(df, ["di_gt", "original_di", "gt_di"])

# 처리할 행 결정
iter_rows = df.iterrows() if VIEW_NUM is None else df.head(VIEW_NUM).iterrows()
total_rows = len(df) if VIEW_NUM is None else min(VIEW_NUM, len(df))
print(f"총 {total_rows}개 행에 대한 BLEU-4 점수 계산을 시작합니다...")

# ============================================
# 루프
# ============================================
records = []
for idx, row in iter_rows:
    if ((idx + 1) % 50 == 0) or (idx == 0):
        print(f"행 {idx + 1}/{total_rows} 처리 중...")

    # 원본/파인튜닝 통합 출력에서 분리 시도
    om_full = str(row[COL_OM_FULL]) if COL_OM_FULL and pd.notna(row[COL_OM_FULL]) else ""
    tm_full = str(row[COL_TM_FULL]) if COL_TM_FULL and pd.notna(row[COL_TM_FULL]) else ""

    om_bhc_from_full, om_di_from_full = split_bhc_di(om_full) if om_full else ("", "")
    tm_bhc_from_full, tm_di_from_full = split_bhc_di(tm_full) if tm_full else ("", "")

    # 별도 컬럼 폴백
    om_bhc = om_bhc_from_full if om_bhc_from_full else (str(row[COL_OM_BHC]) if COL_OM_BHC and pd.notna(row[COL_OM_BHC]) else "")
    om_di  = om_di_from_full  if om_di_from_full  else (str(row[COL_OM_DI])  if COL_OM_DI  and pd.notna(row[COL_OM_DI])  else "")
    tm_bhc = tm_bhc_from_full if tm_bhc_from_full else (str(row[COL_TM_BHC]) if COL_TM_BHC and pd.notna(row[COL_TM_BHC]) else "")
    tm_di  = tm_di_from_full  if tm_di_from_full  else (str(row[COL_TM_DI])  if COL_TM_DI  and pd.notna(row[COL_TM_DI])  else "")

    # 정답
    gt_bhc = str(row[COL_GT_BHC]) if COL_GT_BHC and pd.notna(row[COL_GT_BHC]) else ""
    gt_di  = str(row[COL_GT_DI])  if COL_GT_DI  and pd.notna(row[COL_GT_DI])  else ""

    # BLEU 계산
    bleu_om_gt_bhc = calc_bleu(om_bhc, gt_bhc)
    bleu_tm_gt_bhc = calc_bleu(tm_bhc, gt_bhc)
    bleu_om_gt_di  = calc_bleu(om_di,  gt_di)
    bleu_tm_gt_di  = calc_bleu(tm_di,  gt_di)

    records.append({
        "row_index": idx + 1,
        "BLEU4_OM_vs_GT_BHC": bleu_om_gt_bhc,
        "BLEU4_TM_vs_GT_BHC": bleu_tm_gt_bhc,
        "BLEU4_OM_vs_GT_DI":  bleu_om_gt_di,
        "BLEU4_TM_vs_GT_DI":  bleu_tm_gt_di,
        # 디버깅용 길이 정보(선택)
        "len_om_bhc": len(om_bhc) if om_bhc else 0,
        "len_tm_bhc": len(tm_bhc) if tm_bhc else 0,
        "len_gt_bhc": len(gt_bhc) if gt_bhc else 0,
        "len_om_di":  len(om_di)  if om_di  else 0,
        "len_tm_di":  len(tm_di)  if tm_di  else 0,
        "len_gt_di":  len(gt_di)  if gt_di  else 0,
    })

print(f"총 {total_rows}개 행에 대한 BLEU-4 점수 계산 완료.")

# 결과 DF
bleu_results_df = pd.DataFrame(records)

# 결과 미리보기
print("\n===== BLEU-4 점수 결과 (상위 5개 행) =====")
print(bleu_results_df.head())

# 평균( NaN 제외 )
print("\n===== 평균 BLEU-4 점수 (NaN 제외) =====")
avg = bleu_results_df.mean(numeric_only=True)
print(f"원본 모델 BHC vs 정답 BHC: {avg.get('BLEU4_OM_vs_GT_BHC', 0.0):.4f}")
print(f"파인튜닝 모델 BHC vs 정답 BHC: {avg.get('BLEU4_TM_vs_GT_BHC', 0.0):.4f}")
print(f"원본 모델 DI vs 정답 DI: {avg.get('BLEU4_OM_vs_GT_DI', 0.0):.4f}")
print(f"파인튜닝 모델 DI vs 정답 DI: {avg.get('BLEU4_TM_vs_GT_DI', 0.0):.4f}")

print("\n===== 각 비교별 NaN (계산 건너뜀) 개수 =====")
print(bleu_results_df[[
    "BLEU4_OM_vs_GT_BHC", "BLEU4_TM_vs_GT_BHC",
    "BLEU4_OM_vs_GT_DI",  "BLEU4_TM_vs_GT_DI"
]].isnull().sum())

# (옵션) 결과 저장
if SAVE_RESULTS:
    out_parquet = PATH.replace(".parquet", "_bleu.parquet")
    out_csv     = PATH.replace(".parquet", "_bleu.csv")
    try:
        bleu_results_df.to_parquet(out_parquet, index=False)
        print(f"\n💾 저장 완료(Parquet): {out_parquet}")
    except Exception as e:
        print(f"\n[WARN] Parquet 저장 실패: {e}")
    bleu_results_df.to_csv(out_csv, index=False)
    print(f"💾 저장 완료(CSV): {out_csv}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


✅ 로드 완료: /content/drive/MyDrive/DILAB/llama3-8b-instruct/results/original_2_results.parquet
📦 총 행 수: 100
총 100개 행에 대한 BLEU-4 점수 계산을 시작합니다...
행 1/100 처리 중...
행 50/100 처리 중...
행 100/100 처리 중...
총 100개 행에 대한 BLEU-4 점수 계산 완료.

===== BLEU-4 점수 결과 (상위 5개 행) =====
   row_index  BLEU4_OM_vs_GT_BHC  BLEU4_TM_vs_GT_BHC  BLEU4_OM_vs_GT_DI  \
0          1                 NaN            0.205003                NaN   
1          2                 NaN            0.004184                NaN   
2          3                 NaN            0.072213                NaN   
3          4                 NaN            0.043399                NaN   
4          5                 NaN            0.077662                NaN   

   BLEU4_TM_vs_GT_DI  len_om_bhc  len_tm_bhc  len_gt_bhc  len_om_di  \
0           0.371599           0        2448        1705          0   
1           0.040277           0         343         948          0   
2           0.132945           0        1735        2067          0   
3      