# baseline v3

이 베이스라인 코드는 `사전학습 모델 로드`, `배치 학습`, `파인튜닝`, `양자화`, `PEFT` 등이 적용된 버전입니다.

Colab의 GPU 환경에서 개발되었습니다.
- 런타임 - 런타임 유형 변경 - GPU로 변경(T4 GPU 등)



# 환경 준비

개발 환경에 필요한 라이브러리 버전을 고정하고 최신 버전으로 라이브러리를 업데이트합니다.

- 아래 셀 실행
- 실행 완료 후 런타임 - 세션 다시 시작

In [1]:
!pip -q install "transformers>=4.44.2" "accelerate>=0.34.2" "peft>=0.13.2" "bitsandbytes>=0.43.1" datasets pillow pandas torch torchvision albumentations --upgrade


# 데이터 준비

개발에 필요한 데이터를 준비합니다.

- train.csv, train 폴더
- test.csv, test 폴더
- sample_submission.csv

본 베이스라인은 colab에서 구글 드라이브를 마운트하여 사용합니다.

데이터를 압축 해제하는데 몇 분 정도의 시간이 소요됩니다.

#### 실습 참고 내용

    챕터 2-2 합성 데이터 실습
    - 구글 드라이브 마운트 : drive()

In [None]:
# 구글드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 압축 해제
!unzip "/content/aa.zip" -d "/content/"

# 라이브러리, 데이터, 설정

In [2]:
import os, re, math, random
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import torch
from typing import Dict, List, Any
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from albumentations import Compose, RandomBrightnessContrast, ShiftScaleRotate, HorizontalFlip, VerticalFlip

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

torch.set_float32_matmul_precision("high")
torch.backends.cudnn.benchmark = True

Image.MAX_IMAGE_PIXELS = None

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
IMAGE_SIZE = 384
MAX_NEW_TOKENS = 8

train_df = pd.read_csv("./train.csv")
test_df  = pd.read_csv("./test.csv")
train_df = train_df.reset_index(drop=True)


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


  self.setter(val)


# 모델, Processor

7.5GB 정도의 모델 다운로드가 진행됩니다. 10~20분 정도가 소요됩니다.

#### 실습 참고 내용

    챕터 5-1 PEFT(파라미터 효율적 튜닝)
    - LoRA 구현 : LoraConfig()

In [3]:
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=IMAGE_SIZE * IMAGE_SIZE,
    max_pixels=IMAGE_SIZE * IMAGE_SIZE,
    trust_remote_code=True
)

base_model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

base_model = prepare_model_for_kbit_training(base_model)
base_model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.05s/it]


trainable params: 37,152,768 || all params: 3,791,775,744 || trainable%: 0.9798


# 프롬프트 템플릿

#### 실습 참고 내용

    챕터 5-1 PEFT(파라미터 효율적 튜닝)
    - 프롬프트 템플릿 : convert_to_chatml(), formatting_prompts_func()

In [4]:
SYSTEM_INSTRUCT = (
    "You are a visual question answering assistant.\n"
    "Answer the question by choosing exactly one letter: a, b, c, or d.\n"
    "Output only the letter — no punctuation, spaces, or explanation.\n"
    "If unsure, pick the most likely letter.\n"
)

def build_mc_prompt(question, a, b, c, d):
    q = question.strip()
    return (
        f"{q}\n\n"
        f"A. {a}\n"
        f"B. {b}\n"
        f"C. {c}\n"
        f"D. {d}\n\n"
        "정답을 소문자 a, b, c, d 중 하나로만 출력하시오:"
    )


# Custom Dataset, Collator

#### 실습 참고 내용

    챕터 1-2 MLP 구현
    - TensorDataset()

    챕터 5-2 데이터 생성 및 파인튜닝 (향후 학습 분량)
    - IntentDataset()

In [5]:
from albumentations import (
    Compose, RandomResizedCrop, HorizontalFlip, VerticalFlip,
    RandomBrightnessContrast, HueSaturationValue, RGBShift,
    GaussNoise, MotionBlur, GaussianBlur, Downscale,
    ElasticTransform, GridDistortion,
    CoarseDropout, RandomFog, RandomShadow, ToFloat
)

class VQAMCDataset(Dataset):
    def __init__(self, df, processor, train=True, augmentations=None):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.train = train
        self.augmentations = augmentations

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        img = Image.open(row["path"]).convert("RGB")
        arr = np.array(img, dtype=np.uint8)

        if self.augmentations:
            arr = self.augmentations(image=arr)["image"]

        q = row["question"].strip()
        a = row["a"].strip()
        b = row["b"].strip()
        c = row["c"].strip()
        d = row["d"].strip()
        user_text = build_mc_prompt(q, a, b, c, d)

        messages = [
            {"role":"system","content":[{"type":"text","text":SYSTEM_INSTRUCT}]},
            {"role":"user","content":[
                {"type":"image","image":arr},
                {"type":"text","text":user_text}
            ]}
        ]
        if self.train:
            gold = row["answer"].strip().lower()
            messages.append({"role":"assistant","content":[{"type":"text","text":gold}]})

        return {"messages": messages, "image": arr}


augmentations = Compose([
    RandomResizedCrop((384, 384), scale=(0.8, 1.0), p=0.7),
    HorizontalFlip(p=0.5),
    RandomBrightnessContrast(p=0.5),
    HueSaturationValue(p=0.4),
    GaussianBlur(p=0.2),
    ToFloat(max_value=255)
])


@dataclass
class DataCollator:
    processor: Any
    train: bool = True

    def __call__(self, batch):
        texts = [self.processor.apply_chat_template(sample["messages"], tokenize=False, add_generation_prompt=False) for sample in batch]
        images = [sample["image"] for sample in batch]

        enc = self.processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        )

        if self.train:
            enc["labels"] = enc["input_ids"].clone()

        return enc


# DataLoader

#### 실습 참고 내용

    챕터 3-1 Transfer Learning 기반의 CNN 모델 학습
    - 데이터로더 정의 : DataLoader()

In [6]:
# 데이터 분리
split = int(len(train_df) * 0.9)
train_subset = train_df.iloc[:split]
valid_subset = train_df.iloc[split:]

# Dataset 생성
train_ds = VQAMCDataset(
    train_subset,
    processor,
    train=True,
    augmentations=augmentations
)
valid_ds = VQAMCDataset(
    valid_subset,
    processor,
    train=True
)

# DataLoader 설정 (Windows 최적화)
train_loader = DataLoader(
    train_ds,
    batch_size=4,                        # GPU 메모리에 맞는 값 (12GB 기준 안전값)
    shuffle=True,
    collate_fn=DataCollator(processor, True),
    num_workers=0,                       # ✅ Windows에서는 반드시 0 추천
    pin_memory=True                      # GPU 전송 속도 최적화
    # persistent_workers는 num_workers=0일 때 의미 없음 → 제거
)

valid_loader = DataLoader(
    valid_ds,
    batch_size=4,
    shuffle=False,
    collate_fn=DataCollator(processor, True), 
    num_workers=0,
    pin_memory=True
)


# fine-tuning

- 200개만 학습 : 10~20분 소요

#### 실습 참고 내용

    챕터 1-2 MLP 구현
    - 모델 정의 : SimpleMLP(), SequentialMLP()

    챕터 3-1 Transfer Learning 기반의 CNN 모델 학습
    - 학습 루프 : 문제 6: 모델 학습을 위한 반복문
    - 추론 : with torch.no_grad(), model.eval()

In [7]:
from tqdm.auto import tqdm
import numpy as np
import torch
import math

model = model.to(device)
GRAD_ACCUM = 8  # 기존 4 → 8로 확장 (효과적인 batch size 증가)

# 옵티마이저 & 스케줄러
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
num_training_steps = 15 * math.ceil(len(train_loader) / GRAD_ACCUM)

# Cosine Annealing 스케줄러
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_training_steps)

# 자동 혼합 정밀도 스케일러
scaler = torch.cuda.amp.GradScaler(enabled=True)

# Early Stopping & Checkpoint
best_val_loss = float("inf")
patience = 3
counter = 0
SAVE_DIR = "./qwen2_5_vl_3b_lora_best"

num_epochs = 2
global_step = 0

for epoch in range(num_epochs):
    model.train()
    running = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [train]", unit="batch")
    for step, batch in enumerate(progress_bar, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model(**batch)
            loss = outputs.loss / GRAD_ACCUM

        scaler.scale(loss).backward()
        running += loss.item()

        if step % GRAD_ACCUM == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

            avg_loss = running / GRAD_ACCUM
            progress_bar.set_postfix({"loss": f"{avg_loss:.4f}"})
            running = 0.0

    # ============================
    # Validation + Early Stopping
    # ============================
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        for vb in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{num_epochs} [valid]", unit="batch"):
            vb = {k: v.to(device) for k, v in vb.items()}
            val_loss += model(**vb).loss.item()
            val_steps += 1

    val_loss /= val_steps
    print(f"[Epoch {epoch+1}] Validation Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(SAVE_DIR)
        processor.save_pretrained(SAVE_DIR)
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# 학습 완료 후 best 모델 저장 확인
print(f"Best validation loss: {best_val_loss:.4f}")
print("Best model saved at:", SAVE_DIR)


  scaler = torch.cuda.amp.GradScaler(enabled=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Epoch 1/2 [train]: 100%|██████████| 875/875 [36:06<00:00,  2.48s/batch, loss=0.5382]
Epoch 1/2 [valid]: 100%|██████████| 98/98 [01:02<00:00,  1.56batch/s]


[Epoch 1] Validation Loss: 4.4755


Epoch 2/2 [train]: 100%|██████████| 875/875 [18:59<00:00,  1.30s/batch, loss=0.5451]
Epoch 2/2 [valid]: 100%|██████████| 98/98 [01:02<00:00,  1.57batch/s]


[Epoch 2] Validation Loss: 4.4550
Best validation loss: 4.4550
Best model saved at: ./qwen2_5_vl_3b_lora_best


In [9]:
SAVE_DIR = "./qwen2_5_vl_3b_lora_manual_epoch8_step28"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("✅ 중단된 지점까지의 모델을 저장했습니다.")


✅ 중단된 지점까지의 모델을 저장했습니다.


# inference

30분~1시간 소요

#### 실습 참고 내용

    챕터4-1 RAG 기반 Customer Service AI 에이전트 개발
    - 데이터 파서 : langchain_core.output_parsers(), StrOutputParser()

    챕터 3-1 Transfer Learning 기반의 CNN 모델 학습
    - 학습 루프 : 문제 6: 모델 학습을 위한 반복문
    - 추론 : with torch.no_grad(), model.eval()

In [None]:
# 데이터 파서 : 모델의 응답에서 선지를 추출
def extract_choice(text: str) -> str:
    text = text.strip().lower()

    lines = [l.strip() for l in text.splitlines() if l.strip()]
    if not lines:
        return "a"
    last = lines[-1]
    if last in ["a", "b", "c", "d"]:
        return last

    tokens = last.split()
    for tok in tokens:
        if tok in ["a", "b", "c", "d"]:
            return tok
    return "a"

# 추론을 위해 모든 레이어 활성화
model.eval()
preds = []

# 추론 루프
for i in tqdm(range(len(test_df)), desc="Inference", unit="sample"):
    row = test_df.iloc[i]
    img = Image.open(row["path"]).convert("RGB")
    user_text = build_mc_prompt(row["question"], row["a"], row["b"], row["c"], row["d"])

    messages = [
        {"role":"system","content":[{"type":"text","text":SYSTEM_INSTRUCT}]},
        {"role":"user","content":[
            {"type":"image","image":img},
            {"type":"text","text":user_text}
        ]}
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[img], return_tensors="pt").to(device)

    with torch.no_grad():
        out_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False,
                                 eos_token_id=processor.tokenizer.eos_token_id)
    output_text = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
    preds.append(extract_choice(output_text))

# 제출 파일 생성
submission = pd.DataFrame({"id": test_df["id"], "answer": preds})
submission.to_csv("./submission.csv", index=False)
print("Saved ./submission.csv")


Inference:   0%|          | 0/3887 [00:00<?, ?sample/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Inference:  16%|█▌        | 628/3887 [03:48<19:17,  2.82sample/s]

In [None]:
# 모델 응답 예시
print(output_text)

system
You are a helpful visual question answering assistant. Answer using exactly one letter among a, b, c, or d. No explanation.
user
이 사진의 주요 상황은 무엇인가요?
(a) 수업 시간에 공부하고 있다
(b) 회의에 참석하고 있다
(c) 졸업식 준비 중이다
(d) 시험을 치르고 있다

정답을 반드시 a, b, c, d 중 하나의 소문자 한 글자로만 출력하세요.
assistant
c
