# Pre-screening: 한국 문화 질문 50개

## 목적
- Student 모델(Qwen3-4B-Thinking)이 Knowledge 없이 각 질문에 답변
- 답변 품질을 확인하여 "적절히 답하지 못하는" 질문 선별
- LUPI-SKD 학습용 데이터셋 구성

## 평가 기준
- 0점: 완전히 틀림 / "모르겠다"
- 1점: 부분적으로 맞음, 핵심 누락 ← 타겟
- 2점: 대체로 맞음, 세부사항 부족 ← 타겟
- 3점: 정확하고 구체적 ← 제외

In [None]:
# Cell 1: Import 및 환경 설정
import sys
import os
import json
from datetime import datetime

# GPU selection (edit as needed). Set here, ahead of the torch import that
# follows later in this cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "6,7"})

# Project root is the parent of the current working directory; put it on
# sys.path once so project modules can be imported from the notebook.
PROJECT_ROOT = os.path.split(os.getcwd())[0]
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Choose the compute device and report the environment in one go.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(
    f"Using device: {device}",
    f"Available GPUs: {torch.cuda.device_count()}",
    f"Project root: {PROJECT_ROOT}",
    sep="\n",
)

# Reproducibility: seed the default generator, and the CUDA one when a GPU
# is in use.
torch.manual_seed(42)
if device == "cuda":
    torch.cuda.manual_seed(42)

In [None]:
# Cell 2: Load the student model (Qwen3-4B-Thinking)

student_model_name = "Qwen/Qwen3-4B-Thinking-2507"

print(f"Loading Student Model: {student_model_name}")

tokenizer = AutoTokenizer.from_pretrained(student_model_name)
# Fall back to EOS only when the tokenizer ships without a pad token.
# Overwriting an existing pad token would make padding indistinguishable
# from end-of-sequence.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load in bfloat16, let the loader place the weights on the visible devices,
# and switch to eval mode straight away (this notebook only runs inference).
# NOTE(review): "flash_attention_2" presumably requires the flash-attn
# package to be installed — confirm for the target environment.
student_model = AutoModelForCausalLM.from_pretrained(
    student_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
).eval()

print("✓ Student Model Loaded")
print(f"  Params: {sum(p.numel() for p in student_model.parameters())/1e9:.2f}B")
print(f"  Device: {student_model.device}")

In [None]:
# Cell 3: Load the candidate questions

candidates_path = os.path.join(PROJECT_ROOT, "data", "korean_culture_candidates.json")

with open(candidates_path, 'r', encoding='utf-8') as f:
    candidates = json.load(f)

print(f"Loaded {len(candidates)} candidate questions")

# Per-category counts, in first-seen order.
print(f"\nCategories:")
from collections import Counter
category_counts = Counter(c['category'] for c in candidates)
for cat, count in category_counts.items():
    print(f"  {cat}: {count}개")

# Preview at most three questions; slicing (instead of indexing 0..2) keeps
# this cell from raising IndexError when fewer than three were loaded.
print(f"\nFirst 3 questions:")
for preview_item in candidates[:3]:
    print(f"  [{preview_item['id']}] {preview_item['query']}")

In [None]:
# Cell 4: 답변 생성 함수 (test_lupi_skd.ipynb 기반)

def generate_student_response(
    query: str,
    model,
    tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
):
    """Generate the student model's answer to *query* with no knowledge context.

    The query is wrapped as a single user message, rendered through the
    tokenizer's chat template (generation prompt added, thinking enabled),
    and sampled from ``model``.

    Args:
        query: The user question.
        model: Object exposing ``training``, ``eval()``, ``train()`` and
            ``generate(...)``; inputs are moved to its own device.
        tokenizer: Object exposing ``apply_chat_template``, ``decode``,
            ``eos_token_id`` and ``pad_token_id``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Returns:
        The decoded text of the tokens generated after the prompt, with
        special tokens skipped.
        NOTE(review): for a thinking model this presumably still contains the
        reasoning trace — confirm whether callers want it stripped.
    """
    # Query-only conversation (no knowledge is injected).
    messages = [{"role": "user", "content": query}]

    inputs = tokenizer.apply_chat_template(
        messages,
        return_dict=True,
        tokenize=True,
        return_tensors="pt",
        add_generation_prompt=True,
        enable_thinking=True,
    )

    # Send inputs to the device of the model that was passed in, rather than
    # relying on a module-level global that may not match it.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device

    input_ids = inputs["input_ids"].to(target_device)
    # Prefer the mask produced by the tokenizer; fall back to all-ones only
    # when none was returned.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(target_device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                # temperature/top_p only apply when sampling; make that
                # explicit instead of depending on the checkpoint's default
                # generation config.
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        # Restore the caller's mode even if generation raised.
        if was_training:
            model.train()

    # Decode only what was generated after the prompt.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(
        generated_ids[0][prompt_len:],
        skip_special_tokens=True,
    )

# Smoke test: run one question through the student model and show the
# first 500 characters of what comes back.
test_query = "설날에 세배를 드릴 때 어떤 예절을 지켜야 하나요?"
test_response = generate_student_response(
    query=test_query,
    model=student_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)
print(
    "[Test Query]",
    test_query,
    "\n[Student Response]",
    test_response[:500],
    sep="\n",
)

In [None]:
# Cell 5: 50개 질문에 대한 답변 생성 (메인 실행)

from tqdm import tqdm
import time

# Number of questions actually loaded; used everywhere a count is reported
# so the messages stay correct when the candidate file is not exactly 50.
total_questions = len(candidates)

print("="*80)
print(f"Starting Pre-screening: Generating {total_questions} Responses")
print("="*80)

results = []
start_time = time.time()

for i, candidate in enumerate(tqdm(candidates, desc="Generating responses")):
    query = candidate['query']

    # Generate the student's answer from the query alone.
    response = generate_student_response(
        query=query,
        model=student_model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )

    # Record the answer together with the candidate's metadata.
    result = {
        "id": candidate['id'],
        "category": candidate['category'],
        "query": query,
        "knowledge_hint": candidate['knowledge_hint'],
        "student_response": response,
        "response_length": len(response),
        "timestamp": datetime.now().isoformat(),
    }
    results.append(result)

    # Progress report every 10 questions, with a simple linear ETA.
    if (i + 1) % 10 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / (i + 1)
        remaining = avg_time * (total_questions - i - 1)
        print(f"\n[Progress] {i+1}/{total_questions} 완료 | 평균 {avg_time:.1f}s/질문 | 예상 남은 시간: {remaining/60:.1f}분")

total_time = time.time() - start_time
print(f"\n✓ 완료! 총 소요 시간: {total_time/60:.1f}분")
# Skip the per-question average when nothing was processed (avoids a
# division by zero on an empty candidate list).
if total_questions:
    print(f"평균 응답 생성 시간: {total_time/total_questions:.1f}초/질문")

In [None]:
# Cell 6: Save the results and print response-length statistics
import statistics

output_path = os.path.join(PROJECT_ROOT, "data", "prescreening_responses.json")

# ensure_ascii=False keeps the Korean text readable in the output file.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✓ 결과 저장 완료: {output_path}")
print(f"  총 {len(results)}개 응답")

lengths = [r['response_length'] for r in results]
if lengths:
    print(f"  평균 응답 길이: {sum(lengths) / len(lengths):.0f} 문자")

    # Length distribution. statistics.median averages the two middle values
    # for an even number of responses, unlike sorted(x)[n // 2], which picks
    # the upper one.
    print("\n[응답 길이 분포]")
    print(f"  최소: {min(lengths)}")
    print(f"  최대: {max(lengths)}")
    print(f"  중앙값: {statistics.median(lengths)}")
else:
    # With no responses, the mean/min/max/median are undefined; say so
    # instead of raising.
    print("  (no responses to summarize)")

In [None]:
# Cell 7: Spot-check samples — the first response encountered for each
# category, stopping once five categories have been shown.

print("="*80, "Sample Responses (1 per category)", "="*80, sep="\n")

categories_seen = set()
for result in results:
    cat = result['category']
    # Only the first response of each category is displayed.
    if cat in categories_seen:
        continue
    categories_seen.add(cat)

    print(
        f"\n[{cat}] {result['id']}",
        f"Q: {result['query']}",
        f"A: {result['student_response'][:300]}...",
        "-" * 80,
        sep="\n",
    )

    if len(categories_seen) >= 5:
        break