## 2.2 데이터 준비와 모델 구성

In [None]:
!git clone https://github.com/wikibook/llm-finetuning.git

In [None]:
!pip install -q datasets

In [None]:
from datasets import load_dataset

# Load the "daekeun-ml/naver-news-summarization-ko" dataset by its identifier.
dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")

In [None]:
# Alias the loaded dataset as `data`; the bare expression displays it as the cell output.
data = dataset
data

In [None]:
# Show the first entry of the "document" column in the train split.
data['train']['document'][0]

In [None]:
# Concatenate every training document into one string and derive the
# character-level vocabulary (the sorted set of distinct characters) from it.
ko_text = "".join(data["train"]["document"])
ko_chars = sorted(set(ko_text))  # sorted() accepts any iterable; no intermediate list needed
ko_vocab_size = len(ko_chars)

print("총 글자 수 :", ko_vocab_size)

In [None]:
# Lookup tables between characters and integer ids; ids follow the order of ko_chars.
character_to_ids = {char: i for i, char in enumerate(ko_chars)}
ids_to_character = dict(enumerate(ko_chars))


def token_encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises:
        KeyError: if ``s`` contains a character missing from ``character_to_ids``.
    """
    return [character_to_ids[c] for c in s]


def token_decode(l):
    """Return the string spelled by the sequence of integer ids ``l``.

    Raises:
        KeyError: if ``l`` contains an id missing from ``ids_to_character``.
    """
    return "".join(ids_to_character[i] for i in l)


# Round-trip check: encode a sample sentence, then decode it back.
print(token_encode("안녕하세요. 함께 인공지능을 배워봅시다."))
print(token_decode(token_encode("안녕하세요. 함께 인공지능을 배워봅시다.")))

In [None]:
!pip install torch

In [None]:
import torch

# Encode the whole corpus string into one 1-D tensor of int64 character ids.
tokenized_data = torch.tensor(token_encode(ko_text), dtype=torch.long)
print(tokenized_data.shape, tokenized_data.dtype)
print(tokenized_data[:100])  # peek at the first 100 ids

In [None]:
# Split the token stream: the first 90% is the training set, the remaining tail is the test set.
n = int(0.9 * len(tokenized_data))
train_dataset = tokenized_data[:n]
test_dataset = tokenized_data[n:]

In [None]:
# Fix the RNG seed so the randomly sampled batches are reproducible.
torch.manual_seed(1234)

batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sampled sequence

def batch_function(mode):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        mode: "train" selects ``train_dataset``; any other value selects
            ``test_dataset``.

    Returns:
        A pair ``(x, y)`` of stacked windows, ``batch_size`` rows of
        ``block_size`` tokens each. ``y`` holds the same windows as ``x``
        shifted one position to the right (the next-token targets).
    """
    if mode == "train":
        source = train_dataset
    else:
        source = test_dataset

    # Random window start offsets; the upper bound leaves room for the
    # one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    input_windows = []
    target_windows = []
    for start in starts:
        input_windows.append(source[start : start + block_size])
        target_windows.append(source[start + 1 : start + block_size + 1])

    return torch.stack(input_windows), torch.stack(target_windows)

# Draw one sample batch from the training split.
example_x, example_y = batch_function("train")

## 2.3 언어 모델 만들기

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Bigram-style language model backed by a single embedding table.

    Each token id indexes a row of a ``(vocab_length, vocab_length)``
    embedding table, and that row is used directly as the logits over the
    next token.
    """

    def __init__(self, vocab_length):
        """Create the token-to-logits lookup table.

        Args:
            vocab_length: number of distinct token ids.
        """
        super().__init__()
        self.embedding_token_table = nn.Embedding(vocab_length, vocab_length)

    def forward(self, inputs, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            inputs: integer tensor of token ids, shape ``(batch, seq_length)``.
            targets: optional integer tensor with the same shape as ``inputs``.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            ``(batch, seq_length, vocab_length)`` and ``loss`` is ``None``.
            With ``targets``, ``logits`` is returned flattened to
            ``(batch * seq_length, vocab_length)`` and ``loss`` is a scalar.
        """
        logits = self.embedding_token_table(inputs)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets, so the
            # batch and sequence dimensions are merged first.
            batch, seq_length, vocab_length = logits.shape
            logits = logits.view(batch * seq_length, vocab_length)
            targets = targets.view(batch * seq_length)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        """Extend ``inputs`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            inputs: integer tensor of token ids, shape ``(batch, seq_length)``.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape ``(batch, seq_length + max_new_tokens)``.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(inputs)
            logits = logits[:, -1, :]  # use only the logits of the last token

            # Sample the next token from the predicted distribution.
            probs = F.softmax(logits, dim=-1)
            next_inputs = torch.multinomial(probs, num_samples=1)
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs

# Sanity check with an untrained model: compute the loss on the example batch.
model = semiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

# Generate 10 new tokens starting from a single token with id 0 and decode them to text.
token_decode(model.generate(torch.zeros((1,1),
                                        dtype=torch.long),
                            max_new_tokens=10)[0].tolist())

## 2.4 Optimizer 추가하기

In [None]:
# Start from a freshly initialized model and optimize all of its parameters with AdamW.
learning_rate = 1e-2
model = semiGPT(ko_vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
from tqdm.auto import tqdm

# Overrides the earlier batch_size; batch_function reads this global on each call.
batch_size = 32

# Run 10,000 optimization steps on random training batches.
for steps in tqdm(range(10000)):
    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)

    # Clear gradients left over from the previous step before backpropagating.
    optimizer.zero_grad(set_to_none=True)

    loss.backward()

    optimizer.step()

# Loss of the last training batch only (not an average).
print(loss.item())

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [24]:
def batch_function(mode):
    """Sample a random batch of (input, target) windows and move it to ``device``.

    Args:
        mode: "train" selects ``train_dataset``; any other value selects
            ``test_dataset``.

    Returns:
        A pair ``(x, y)`` of stacked windows on ``device``, ``batch_size``
        rows of ``block_size`` tokens each. ``y`` holds the same windows as
        ``x`` shifted one position to the right (the next-token targets).
    """
    source = test_dataset
    if mode == "train":
        source = train_dataset

    # Random window start offsets; the upper bound leaves room for the
    # one-step-shifted target window.
    offsets = torch.randint(len(source) - block_size, (batch_size,))

    input_windows = [source[o : o + block_size] for o in offsets]
    target_windows = [source[o + 1 : o + block_size + 1] for o in offsets]

    x = torch.stack(input_windows).to(device)
    y = torch.stack(target_windows).to(device)
    return x, y

In [25]:
max_iteration = 50000  # number of steps run by the training loop
eval_interval = 300  # losses are evaluated and printed every this many steps
# NOTE: the optimizer was already constructed earlier with its own lr;
# reassigning this variable does not change that optimizer.
learning_rate = 1e-2
device = "cuda" if torch.cuda.is_available() else "cpu"  # same expression as the earlier device cell
eval_iteration = 200  # batches averaged per split in compute_loss_metrics

In [29]:
@torch.no_grad()
def compute_loss_metrics():
    """Estimate the mean loss on the train and eval splits.

    Switches the global ``model`` to eval mode, averages the loss over
    ``eval_iteration`` random batches per split, then switches the model
    back to train mode.

    Returns:
        dict mapping "train" and "eval" to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    results = {}
    for split in ("train", "eval"):
        # batch_function treats any mode other than "train" as the held-out split.
        split_losses = torch.zeros(eval_iteration)
        for i in range(eval_iteration):
            batch_x, batch_y = batch_function(split)
            _, batch_loss = model(batch_x, batch_y)
            split_losses[i] = batch_loss.item()
        results[split] = split_losses.mean()
    model.train()
    return results

In [31]:
# batch_function returns tensors on `device`, so the model's parameters must
# live there too; otherwise the forward pass fails with a device mismatch
# whenever CUDA is available. Nothing changes when both are already on the
# same device type.
if next(model.parameters()).device.type != torch.device(device).type:
    model.to(device)
    # Rebuild the optimizer so its internal state is created on the same
    # device as the moved parameters.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iteration):
    # Periodically report the averaged train / eval losses.
    if step % eval_interval == 0:
        losses = compute_loss_metrics()
        print(f"Step {step}, Train Loss: {losses['train']:.4f}, val Loss: {losses['eval']:.4f}")

    # One optimization step on a random training batch.
    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate 100 new tokens starting from a single token with id 0 and print the decoded text.
inputs = torch.zeros((1, 1), dtype=torch.long, device=device)
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))

Step 0, Train Loss: 3.3945, val Loss: 3.4197
Step 300, Train Loss: 3.4070, val Loss: 3.4426
Step 600, Train Loss: 3.4236, val Loss: 3.4080
Step 900, Train Loss: 3.4031, val Loss: 3.4192
Step 1200, Train Loss: 3.4088, val Loss: 3.4182
Step 1500, Train Loss: 3.4133, val Loss: 3.4178
Step 1800, Train Loss: 3.4078, val Loss: 3.4151
Step 2100, Train Loss: 3.4154, val Loss: 3.4132
Step 2400, Train Loss: 3.4204, val Loss: 3.3878
Step 2700, Train Loss: 3.3978, val Loss: 3.4118
Step 3000, Train Loss: 3.4107, val Loss: 3.4079
Step 3300, Train Loss: 3.3878, val Loss: 3.4001
Step 3600, Train Loss: 3.3968, val Loss: 3.4118
Step 3900, Train Loss: 3.3951, val Loss: 3.4164
Step 4200, Train Loss: 3.4017, val Loss: 3.4064
Step 4500, Train Loss: 3.4213, val Loss: 3.4084
Step 4800, Train Loss: 3.4005, val Loss: 3.4023
Step 5100, Train Loss: 3.4081, val Loss: 3.3958
Step 5400, Train Loss: 3.3977, val Loss: 3.4008
Step 5700, Train Loss: 3.4046, val Loss: 3.4053
Step 6000, Train Loss: 3.4055, val Loss: 3.409