In [None]:
import torch
from utils import GPTModel


from importlib.metadata import version

from importlib.metadata import PackageNotFoundError

# Packages whose installed versions are reported for reproducibility.
pkgs = ["torch",           # PyTorch - deep learning framework
        "numpy",           # numerical computing
        "matplotlib"       # plotting (optional)
       ]
for p in pkgs:
    # An optional package (e.g. matplotlib) may be absent; report that instead
    # of aborting the whole cell with PackageNotFoundError.
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        print(f"{p} version: not installed")

# Show which torch installation the kernel actually picked up. torch is already
# imported at the top of this cell, so it is not imported a second time here.
print(torch.__file__)


In [None]:
# Hyperparameters of the small GPT model built in this notebook.
GPT_CONFIG_META = dict(
    vocab_size=1203,        # used as-is; keep model, tokenizer and config in sync
    context_length=256,     # suggested for sentence/paragraph/chatbot text (128 is fine for short text or low memory)
    emb_dim=256,            # embedding dimension (suggested for small-model experiments)
    n_heads=4,              # attention heads (4-8 is typical for this emb_dim)
    n_layers=4,             # number of transformer blocks (suggested for small-model experiments)
    drop_rate=0.1,          # 0.1 for small setups; 0.2 suggested when overfitting is strong
    qkv_bias=False,         # usually left False (default setting)
)

# Seed the global torch RNG before the model is constructed.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_META)
model.eval();  # Disable dropout during inference

In [None]:
from utils import CustomTokenizer

# Train the project's CustomTokenizer on the corpus and run an encode/decode
# round trip on a sample sentence.

# 1) Initialise
tokenizer = CustomTokenizer()

# 2) Load verdict4.txt
with open("verdict4.txt", encoding="utf-8") as f:
    txt = f.read()

# 3) Preprocess (for text that already contains <EOS>/<PARA_END> markers)
preprocessed = tokenizer.preprocess(txt)

# 4) Learn subword merge rules (vocab_size=300 for a small test)
tokenizer.learn_bpe(preprocessed, vocab_size=300)
print("학습된 병합 규칙 수:", len(tokenizer.merges))

# 5) Tokenisation test: encode a sample, then decode it back
sample = "직접경험은 중요해지고 있다. 나는 너를 사랑한다. <EOS> <PARA_END>"
encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

# Only the first 40 ids / first 80 characters are shown to keep the output short.
print("\n입력 문장:", sample)
print("→ 토큰 IDs:", encoded[:40])
print("→ 복원 결과:", decoded[:80])


In [None]:
# Demo prompt with mixed punctuation, used to exercise the tokenizer.
start_context = "복잡한, 구두점이 '여기' 있습니다! (인식을 잘 할까~) 문장의 끝도 잘 인식할까요??"



def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a batched ``(1, seq_len)`` tensor.

    Args:
        text: String to tokenize.
        tokenizer: Object with an ``encode(text)`` method; it is expected to
            return a sequence of integer token ids.

    Returns:
        ``torch.long`` tensor of shape ``(1, number_of_ids)``.
    """
    encoded = tokenizer.encode(text)
    # Pin the dtype: torch.tensor([]) would otherwise default to float32, which
    # cannot serve as embedding indices when the text encodes to no tokens.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # Add batch dimension
    return encoded_tensor

# Encode the demo prompt and check the shape of the resulting tensor.
token_ids = text_to_token_ids(start_context, tokenizer)
print(token_ids.shape)  # Should be (1, sequence_length)

In [None]:
# Disabled alternative (kept for reference): switch to MetaTokenizer before
# encoding the same prompt string as the previous cell.
#   from utils import MetaTokenizer
#   tokenizer = MetaTokenizer('mymeta_tokenizer4.model')
#   start_context = <same prompt string as in the previous cell>


# NOTE(review): this re-defines the helper of the same name from the previous
# cell; consider keeping a single definition.
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a batched ``(1, seq_len)`` tensor.

    Args:
        text: String to tokenize.
        tokenizer: Object with an ``encode(text)`` method; it is expected to
            return a sequence of integer token ids.

    Returns:
        ``torch.long`` tensor of shape ``(1, number_of_ids)``.
    """
    encoded = tokenizer.encode(text)
    # Pin the dtype: torch.tensor([]) would otherwise default to float32, which
    # cannot serve as embedding indices when the text encodes to no tokens.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # Add batch dimension
    return encoded_tensor

# Re-encode the prompt with whichever tokenizer is currently bound and check the shape.
token_ids = text_to_token_ids(start_context, tokenizer)
print(token_ids.shape)  # Should be (1, sequence_length)

In [None]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a batched token-id tensor back into text.

    The leading dimension is squeezed away (a no-op unless its size is 1) and
    the remaining ids are passed to ``tokenizer.decode`` as a plain Python list.
    """
    ids_without_batch = token_ids.squeeze(0)
    return tokenizer.decode(ids_without_batch.tolist())

# Round-trip check: decode the encoded prompt; the bare expression is shown as the cell output.
token_ids_to_text(token_ids, tokenizer)

In [None]:
from utils import generate_text_simple

# Generate with max_new_tokens=50 from the encoded prompt. Sampling needs no
# gradients, so run under no_grad (as generate_and_print_sample does) to avoid
# building an autograd graph.
# NOTE(review): token_ids is rebound to the generation result, so re-running this
# cell presumably continues from the previous output rather than the prompt.
with torch.no_grad():
    token_ids = generate_text_simple(
        model=model,
        idx=token_ids,
        max_new_tokens=50,
        context_size=GPT_CONFIG_META['context_length'],
    )



In [None]:
# token_ids is a batched tensor; Python's built-in min()/max() iterate over its
# first dimension (rows) instead of its elements, so they print a whole row (or
# raise for more than one row). Reduce with the tensor methods instead.
print("min:", token_ids.min().item(), "max:", token_ids.max().item())


In [None]:
import sentencepiece as spm
# Load the SentencePiece model file directly and print its piece (vocabulary) count.
# NOTE(review): presumably this should equal GPT_CONFIG_META["vocab_size"] — confirm.
sp = spm.SentencePieceProcessor()
sp.load("mymeta_tokenizer4.model")
print(sp.get_piece_size())


# 5.2 Training an LLM

In [None]:
# Report CUDA availability and, when present, GPU 0's name and the GPU count.
print(torch.cuda.is_available())
if not torch.cuda.is_available():
    print("CUDA is NOT available")
else:
    print(torch.cuda.get_device_name(0))  # name of GPU 0
    print(torch.cuda.device_count())      # number of GPUs


In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


In [None]:
import os

from utils import MetaTokenizer, create_dataloader_v2

# Tokenize the whole corpus with MetaTokenizer and build train/validation loaders.
# Note: this rebinds the notebook-level `tokenizer` name.
tokenizer = MetaTokenizer("mymeta_tokenizer4.model")
file_path = "verdict4.txt"
# Read through file_path so the corpus location is defined in exactly one place
# (previously the variable was assigned but the literal was repeated in open()).
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

ids = tokenizer.encode(raw_text)

# Train/validation split: first 90% of the token ids for training, the rest for validation.
split_idx = int(0.9 * len(ids))
ids_train = ids[:split_idx]
ids_val = ids[split_idx:]

# Training batches are shuffled and incomplete batches dropped; validation keeps
# its order and its final partial batch.
train_loader = create_dataloader_v2(
    ids_train, batch_size=32, max_length=256, stride=128, shuffle=True, drop_last=True
)
val_loader = create_dataloader_v2(
    ids_val, batch_size=32, max_length=256, stride=128, shuffle=False, drop_last=False
)


In [None]:
# Sanity-check the loaders: batch counts, then tensor shapes of the first six batches of each.
print("Train loader batch 수:", len(train_loader))
print("Val loader batch 수:", len(val_loader))
for i, (inputs, targets) in enumerate(train_loader):
    print(f"Train batch {i}: shape {inputs.shape}, {targets.shape}")
    if i >= 5:  # the first few batches are enough
        break
for i, (inputs, targets) in enumerate(val_loader):
    print(f"Val batch {i}: shape {inputs.shape}, {targets.shape}")
    if i >= 5:
        break


In [None]:
# Count the elements of the input tensors each loader yields in one full pass.
train_tokens = sum(batch_inputs.numel() for batch_inputs, _ in train_loader)
val_tokens = sum(batch_inputs.numel() for batch_inputs, _ in val_loader)

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

In [None]:
import torch.nn.functional as F

def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor of inputs passed to ``model``.
        target_batch: Tensor of target class indices, one per input position.
        model: Callable returning logits of shape ``(batch, seq, vocab)``.
        device: Device both batches are moved to before the forward pass.

    Returns:
        Scalar loss tensor (``F.cross_entropy`` with its default mean reduction).
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    logits = model(inputs)
    # Collapse (batch, seq, vocab) -> (batch*seq, vocab) and (batch, seq) -> (batch*seq,).
    # reshape() is used instead of view() so that non-contiguous tensors (e.g. a
    # transposed or sliced batch) do not raise a RuntimeError.
    loss = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        targets.reshape(-1)
    )
    return loss


In [None]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the batches of ``data_loader``.

    Args:
        data_loader: Iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate; ``None`` evaluates
            the whole loader. A value <= 0 evaluates nothing.

    Returns:
        Mean of the per-batch losses as a float, or ``float("inf")`` when no
        batch was evaluated.
    """
    # A non-positive budget means "evaluate nothing"; previously one batch was
    # still processed because the limit was only checked after the first batch.
    if num_batches is not None and num_batches <= 0:
        return float("inf")

    total_loss, batches_counted = 0., 0

    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            # calc_loss_batch moves the batch to `device` itself, so no extra
            # transfer is done here.
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
            batches_counted += 1
            # Stop right after the last requested batch so no extra batch is fetched.
            if num_batches is not None and batches_counted >= num_batches:
                break

    avg_loss = total_loss / batches_counted if batches_counted > 0 else float("inf")
    return avg_loss


In [None]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic loss logging.

    Every ``eval_freq`` optimizer steps (including the very first one) the train
    and validation losses are obtained from ``evaluate_model`` with ``eval_iter``
    as its batch limit, then recorded and printed. At the end of each epoch a
    text sample continuing ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three parallel lists
        with one entry per evaluation point; the last one holds the running total
        of ``input_batch.numel()`` at that point.
    """
    # Histories, one entry per evaluation point
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = 0  # index of the optimizer step being taken

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            # One optimisation step
            optimizer.zero_grad()
            calc_loss_batch(input_batch, target_batch, model, device).backward()
            optimizer.step()
            tokens_seen += input_batch.numel()

            # Log losses only on evaluation steps
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(
                    f"Ep {epoch+1} (Step {global_step:06d}): "
                    f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
                )
            global_step += 1

        # Generate a sample at the end of every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate train and validation loss with ``eval_iter`` as the batch limit.

    The model is switched to eval mode for the measurement and afterwards put
    back into whatever mode it was in on entry, even if evaluation raises.
    (Previously it was unconditionally left in train mode.)

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
            val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    finally:
        # train(mode) restores the caller's mode: train(True) or train(False).
        model.train(was_training)
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a continuation of ``start_context`` generated with ``max_new_tokens=50``.

    The model is switched to eval mode while generating and afterwards put back
    into whatever mode it was in on entry, even if generation raises.
    (Previously it was unconditionally left in train mode.)
    """
    was_training = model.training
    model.eval()
    try:
        # Context window taken from the number of rows of the positional-embedding weight.
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate_text_simple(
                model=model, idx=encoded,
                max_new_tokens=50, context_size=context_size
            )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # keep the sample on a single line
    finally:
        # train(mode) restores the caller's mode: train(True) or train(False).
        model.train(was_training)


In [None]:
import torch
# 시간 측정 (선택사항)
import time
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted during a long run.
start_time = time.perf_counter()

torch.manual_seed(123)  # (or use a set_seed helper for finer control)
# Build a fresh model on the selected device and train it from scratch.
model = GPTModel(GPT_CONFIG_META)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="직접경험은 중요해지고 있다. 나는 너를", tokenizer=tokenizer
)

end_time = time.perf_counter()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


In [None]:
import sentencepiece as spm
# Quick SentencePiece round trip: print the encoding of a sample string, then decode it back.
# NOTE(review): this loads 'mymeta_tokenizer2.model' while the rest of the notebook
# uses 'mymeta_tokenizer4.model' — confirm this is intentional.
sp = spm.SentencePieceProcessor()
sp.load('mymeta_tokenizer2.model')
print(sp.encode('여기에 호호 입력'))
print(sp.decode(sp.encode('여기에 호호 입력')))
