In [None]:
# Print the tokenizer's vocabulary object.
# NOTE(review): in the visible notebook `tk` is first assigned in the next cell, so on a
# fresh kernel this cell only works after that cell has been executed.
print(tk.vocab)

In [None]:
from utils import OktTokenizer, create_okt_dataloader

# Read the whole corpus file into memory.
with open("verdict3.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Pipeline: preprocess -> tokenize -> build the vocab -> encode the tokens with that vocab.
tk = OktTokenizer(vocab_size=20000)
pre_text = tk.preprocess(text)
tokens = tk.tokenize(pre_text)
tk.build_vocab(tokens)
encoded = tk.encode(tokens, tk.vocab)


# Create the dataloader and sanity-check it
okt_dataloader = create_okt_dataloader(encoded, batch_size=4)
for batch in okt_dataloader:
    print(tk.decode(batch[0][0].numpy()))  # restore the text of the first sample
    print(batch)
    break  # only the first batch is inspected


In [None]:
# Compare the requested vocab_size with the size of the vocabulary that was actually built.
# NOTE(review): the second label reads "unique tokens in the corpus", but the value printed is
# len(tk.vocab); whether the two coincide depends on build_vocab — confirm.
print("내가 지정한 vocab_size:", tk.vocab_size)
print("실제 코퍼스 내 unique 토큰 수:", len(tk.vocab))


In [None]:
# For tokenizers (such as the OktTokenizer class) that keep their vocab internally:
# print the vocab size and the first 30 keys as a preview.
print("Total vocab size:", len(tk.vocab))
print("Vocab 예시 30개:", list(tk.vocab.keys())[:30])


In [None]:
from importlib.metadata import PackageNotFoundError, version

import torch
from utils import GPTModel

# Report the installed versions of the packages this notebook relies on.
pkgs = ["torch",           # PyTorch - deep learning framework
        "numpy",           # numerical computing
        "matplotlib"       # plotting (optional)
       ]
for p in pkgs:
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        # matplotlib is marked optional, so a missing package must not abort the cell.
        print(f"{p} version: not installed")

# Show which torch installation the kernel actually imported.
print(torch.__file__)


In [None]:
# 1) Create the tokenizer object (the corpus vocab must be built before it can be used!)
tokenizer = OktTokenizer(vocab_size=20000)

# 2) Read all of verdict3.txt and build the tokenizer's vocab from it
with open("verdict3.txt", encoding="utf-8") as f:
    raw_text = f.read()

pre_text = tokenizer.preprocess(raw_text)
tokens = tokenizer.tokenize(pre_text)
tokenizer.build_vocab(tokens)

# 3) Prepare a single test sentence
start_context = "복잡한, 구두점이 '여기' 있습니다! (인식을 잘 할까~) 문장의 끝도 잘 인식할까요??"

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a batch containing one sequence of token ids.

    The text is run through the tokenizer's own ``preprocess`` -> ``tokenize`` ->
    ``encode`` pipeline, so the tokenizer's vocab must already have been built
    (in this notebook: from verdict3.txt).

    Args:
        text: Raw input string.
        tokenizer: Object exposing ``preprocess``, ``tokenize``, ``encode`` and ``vocab``.

    Returns:
        ``torch.long`` tensor of shape (1, sequence_length).
    """
    pre = tokenizer.preprocess(text)
    toks = tokenizer.tokenize(pre)
    encoded = tokenizer.encode(toks, tokenizer.vocab)
    # Pin the dtype: an empty encoding would otherwise be inferred as float32,
    # which is not a valid index dtype for downstream embedding lookups.
    return torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # (1, sequence_length)

# 4) Run the test
token_ids = text_to_token_ids(start_context, tokenizer)
print("Test context shape:", token_ids.shape)
print("토큰 ID 리스트:", token_ids)
# squeeze(0) drops only the batch axis, so a one-token context still yields a list of ids
print("디코드 결과:", tokenizer.decode(token_ids.squeeze(0).tolist()))


In [None]:
def token_ids_to_text(token_ids, tk):
    """Decode a tensor of token ids into one space-separated string.

    Args:
        token_ids: Tensor of ids; a leading batch axis of size 1 is dropped
            with ``squeeze(0)`` before decoding.
        tk: Tokenizer whose ``decode`` maps a list of ids to a list of token strings.

    Returns:
        The decoded tokens joined with single spaces.
    """
    flat_ids = token_ids.squeeze(0).tolist()
    return " ".join(tk.decode(flat_ids))

# Decode with the same tokenizer instance that produced `token_ids`
print(token_ids_to_text(token_ids, tokenizer))

In [None]:
import torch

# Use CUDA when it is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


In [None]:
from utils import OktTokenizer, create_okt_dataloader

# Build a fresh tokenizer and vocabulary from the raw corpus text.
tk = OktTokenizer()
tokens = tk.tokenize(tk.preprocess(raw_text))  # must be a list of individual tokens
tk.build_vocab(tokens)  # build the vocab dictionary

ids = tk.encode(tokens)  # converted to a list of ints

# 90% / 10% train / validation split over the id sequence
split_idx = int(0.9 * len(ids))
train_ids, val_ids = ids[:split_idx], ids[split_idx:]

train_loader = create_okt_dataloader(
    train_ids,
    batch_size=2,
    max_length=256,
    stride=256,
    drop_last=True,
    shuffle=True,
)
val_loader = create_okt_dataloader(
    val_ids,
    batch_size=2,
    max_length=256,
    stride=256,
    drop_last=False,
    shuffle=False,
)



In [None]:
# Hyperparameter dictionary handed to GPTModel.
GPT_CONFIG_124M = {
    "vocab_size": len(tk.vocab),  # follows the vocabulary built by the tokenizer `tk`
    "context_length": 256,  # same value as the max_length passed to the dataloaders
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}
model = GPTModel(GPT_CONFIG_124M)  # implement it yourself or take it from ch. 4


In [None]:
# Size of the current tokenizer vocabulary (the expression used for "vocab_size" in GPT_CONFIG_124M).
print(len(tk.vocab))

In [None]:
# Report how many batches each loader yields, then print the tensor shapes of the first few.
print("Train loader batch 수:", len(train_loader))
print("Val loader batch 수:", len(val_loader))
for i, (inputs, targets) in enumerate(train_loader):
    print(f"Train batch {i}: shape {inputs.shape}, {targets.shape}")
    if i > 4: break  # printing only the first few is enough (stops after index 5)
for i, (inputs, targets) in enumerate(val_loader):
    print(f"Val batch {i}: shape {inputs.shape}, {targets.shape}")
    if i > 4: break

In [None]:
# Count the input elements served by each loader over one full pass.
train_tokens = sum(input_batch.numel() for input_batch, _targets in train_loader)
val_tokens = sum(input_batch.numel() for input_batch, _targets in val_loader)

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

In [None]:
import torch.nn.functional as F

def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor fed to the model after being moved to ``device``.
        target_batch: Tensor of target class ids, one per logit position.
        model: Callable mapping the inputs to logits whose last axis is the vocab axis.
        device: Device both batches are moved to before the forward pass.

    Returns:
        The loss tensor returned by ``F.cross_entropy`` over all positions.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    logits = model(inputs)
    # logits: (batch, seq, vocab) -> (batch*seq, vocab); targets: (batch, seq) -> (batch*seq,)
    # reshape() copies when necessary, whereas view() raises on non-contiguous tensors.
    return F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        targets.reshape(-1),
    )


In [None]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over ``data_loader`` with gradients disabled.

    Args:
        data_loader: Iterable yielding (input_batch, target_batch) pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device the batches are moved to.
        num_batches: Stop once this many batches have been evaluated;
            ``None`` walks the whole loader.

    Returns:
        The mean batch loss as a float, or ``float("inf")`` when no batch was evaluated.
    """
    loss_sum = 0.
    seen = 0

    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            batch_loss = calc_loss_batch(
                input_batch.to(device), target_batch.to(device), model, device
            )
            loss_sum += batch_loss.item()
            seen += 1
            # The limit is checked after counting, so at least one batch is always evaluated.
            if num_batches is not None and seen >= num_batches:
                break

    if seen == 0:
        return float("inf")
    return loss_sum / seen


In [None]:
from utils import generate_text_simple
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimizer steps the train/validation losses are
    estimated with ``evaluate_model`` and logged; after each epoch a text
    sample is generated from ``start_context`` and printed.

    Args:
        model: Model to optimize.
        train_loader: Iterable of (input_batch, target_batch) training pairs.
        val_loader: Loader used for the validation loss.
        optimizer: Optimizer stepped once per training batch.
        device: Device passed to the loss and generation helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the global step is a multiple of this value.
        eval_iter: Batch limit forwarded to ``evaluate_model``.
        start_context: Prompt for the per-epoch sample.
        tokenizer: Tokenizer used to encode the prompt and decode the sample.

    Returns:
        Three parallel lists: train losses, validation losses, and the number
        of tokens seen at each evaluation point.
    """
    # Histories, appended to at every evaluation point
    train_history, val_history, tokens_history = [], [], []
    seen_so_far = 0
    global_step = -1  # incremented before the check, so the first batch is step 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs} 시작")
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            seen_so_far += input_batch.numel()
            global_step += 1

            # Log losses only on evaluation steps
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_history.append(train_loss)
            val_history.append(val_loss)
            tokens_history.append(seen_so_far)
            print(
                f"Ep {epoch+1} (Step {global_step:06d}): "
                f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
            )

        # Generate a sample at the end of every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_history, val_history, tokens_history

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the train and validation losses with the model in eval mode.

    Args:
        model: Model to evaluate; switched to eval mode for the measurement
            and back to train mode before returning.
        train_loader: Loader used for the training-loss estimate.
        val_loader: Loader used for the validation-loss estimate.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Maximum number of batches evaluated per loader.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        # Train loader first, then validation loader
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    train_loss, val_loss = losses
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens from ``start_context`` and print them on one line.

    The model is put in eval mode for generation and switched back to train
    mode afterwards.

    Args:
        model: Model exposing ``pos_emb``; also passed to ``generate_text_simple``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
    """
    model.eval()
    # The first dimension of the positional-embedding weight is used as the context size
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated, tokenizer)
    print(sample.replace("\n", " "))  # keep the sample on a single output line
    model.train()


In [None]:
import time

import torch

# Time the whole run (optional)
# perf_counter() is monotonic, so system clock adjustments cannot skew the elapsed time.
start_time = time.perf_counter()

torch.manual_seed(123)  # (or use a set_seed helper for stricter reproducibility)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=100, eval_iter=5,
    start_context="직접경험은 중요해지고 있다. 나는 너를", tokenizer=tk
)

end_time = time.perf_counter()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


# 5.3.3 Modifying the text generation function

# 5.4 Loading and saving model weights in PyTorch

In [None]:
# Write the model's state_dict to disk.
# NOTE(review): the target path "model.name" looks like a placeholder — confirm the intended filename.
torch.save(model.state_dict(), "model.name")

In [None]:
# Show each state_dict entry's name and shape instead of dumping every weight tensor.
{name: tuple(tensor.shape) for name, tensor in model.state_dict().items()}