# 1

Original Transformer와 GPT-1의 차이점:

# 아키텍처:

-원래 Transformer는 Encoder + Decoder 구조로 이루어져 있는데

GPT-1에서는 Encoder를 완전히 제거하고 Decoder만 남긴다.

-원래 Transformer의 Decoder는 Masked Self-Attention과 Cross-Attention을 포함하는데

GPT1에서는 Cross-Attention을 제거한다.

따라서 Masked Self-Attention만 남는다.



# 입력 표현:

두 경우 모두 Token Embedding + Positional Embedding을 사용한다.

그러나 GPT-1은 사인/코사인 기반의 고정 위치 인코딩 대신 학습 가능한 위치 임베딩을 사용한다.


- 즉, GPT-1은 고정된 사인/코사인 함수 대신 파라미터로 학습되는 위치 벡터를 사용한다.


# 레이어 내부 블록

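GPT-1의 블록 구성은 다음과 같다 (서브레이어 뒤에 LayerNorm을 두는 Post-LN). 단, 아래 구현의 DecoderLayer는 LayerNorm을 각 서브레이어 앞에 적용하는 Pre-LN 방식이다: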

Masked Self-Attention

Add & LayerNorm

Feed Forward Network (FFN)

Add & LayerNorm

# 2. 전처리

In [None]:
import re
import os
import pandas as pd
import sentencepiece as spm
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
import re
def normalize_text(s):
    """Return a lower-cased, whitespace-normalized copy of *s*.

    Only Hangul syllables, ASCII letters and digits, the punctuation
    ``. , ? !`` and whitespace are kept; every other character (emoji,
    symbols, ...) is replaced by a space. Runs of whitespace are then
    collapsed to a single space and the result is stripped.

    Args:
        s: Text to clean. Non-string values are converted with ``str()``;
            ``None`` and float NaN (how pandas represents a missing cell)
            yield ``""``.

    Returns:
        The cleaned string, possibly empty.
    """
    # Missing values must become empty strings, not the literal words
    # "none"/"nan", so that later empty-row filtering can drop them.
    # NaN is the only float that compares unequal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.lower()
    s = re.sub(r"[^가-힣a-z0-9.,?!\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

In [None]:
# Sanity check for normalize_text: the emoji and "~" fall outside the allowed
# character set and "Hello" contains upper-case letters.
txt = "안녕하세요!!! 😀 테스트입니다~ 123 🚗 Hello!!"
print(normalize_text(txt))

안녕하세요!!! 테스트입니다 123 hello!!


In [None]:
import pandas as pd
# Load the chatbot CSV into a DataFrame; the cells below use its 'Q' and 'A'
# columns.
data=pd.read_csv('/content/ChatbotData.csv')
# Notebook display of the first rows.
data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [None]:
# Clean both text columns in place with normalize_text.
data['Q']=data['Q'].apply(normalize_text)
data['A']=data['A'].apply(normalize_text)

# Notebook display of the normalized rows.
data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,ppl 심하네,눈살이 찌푸려지죠.,0


In [None]:
import os
import sentencepiece as spm

# Plain-text corpus file that is written below and fed to the SentencePiece trainer.
CORPUS_TXT = "/content/corpus_ko.txt"
# Path prefix for the trained tokenizer; "<prefix>.model" is loaded later.
MODEL_PREFIX = "/content/spm_ko"
# Passed to the trainer as vocab_size.
VOCAB_SIZE = 8000
# Passed to the trainer as model_type.
MODEL_TYPE = "unigram"

In [None]:
# Work on a copy holding only the two text columns, with missing values
# replaced by empty strings.
df_c=data[['A','Q']].fillna('').copy()
# Keep only rows where both question and answer are non-empty, then renumber
# the rows from 0.
df_c=df_c[(df_c['Q'].str.len()>0)&(df_c['A'].str.len()>0)].reset_index(drop=True)

In [None]:
# Dump every non-empty question, then every non-empty answer, one sentence per
# line, as the tokenizer training corpus. The encoding is fixed to UTF-8 so the
# Korean text is written the same way regardless of the platform's default
# locale encoding.
with open(CORPUS_TXT, "w", encoding="utf-8") as f:
    for col in ("Q", "A"):
        for sentence in df_c[col].astype(str):
            sentence = sentence.strip()
            if sentence:
                f.write(sentence + "\n")

In [None]:
# Train the SentencePiece tokenizer on the corpus file. The output files are
# written under MODEL_PREFIX ("<MODEL_PREFIX>.model" is loaded in the next
# cell). The special-token ids are pinned explicitly: pad=0, bos=1, eos=2,
# unk=3.
spm.SentencePieceTrainer.Train(input=CORPUS_TXT,
    model_prefix=MODEL_PREFIX,
    vocab_size=VOCAB_SIZE,
    character_coverage=1.0,
    model_type=MODEL_TYPE,
    pad_id=0, bos_id=1, eos_id=2, unk_id=3,
    max_sentence_length=999999)

In [None]:
# Load the trained tokenizer and print its special-token ids as a check.
sp = spm.SentencePieceProcessor()
sp.load(f"{MODEL_PREFIX}.model")
print("ids: pad/bos/eos/unk =", sp.pad_id(), sp.bos_id(), sp.eos_id(), sp.unk_id())

# Show how the first question is split into pieces and mapped to ids.
sample = df_c["Q"].iloc[0]
print("sample:", sample)
print("pieces:", sp.encode(sample, out_type=str))
print("ids   :", sp.encode(sample, out_type=int))

ids: pad/bos/eos/unk = 0 1 2 3
sample: 12시 땡!
pieces: ['▁12', '시', '▁', '땡', '!']
ids   : [4275, 549, 5, 7825, 64]


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
# Special-token ids read from the loaded tokenizer.
PAD, BOS, EOS= sp.pad_id(), sp.bos_id(), sp.eos_id()
# Default max_len of KoChatDataset: examples are padded to this many ids
# (BOS/EOS included) and longer ones are skipped.
MAX_LEN=40

def encode_with_bos_eos(text):
    """Tokenize *text* and wrap the ids in BOS ... EOS.

    Uses the module-level tokenizer ``sp`` and the module-level ``BOS`` /
    ``EOS`` ids. *text* is converted with ``str()`` before encoding.

    Returns:
        A list of ids: ``BOS``, the token ids of the text, then ``EOS``.
    """
    token_ids = sp.encode(str(text), out_type=int)
    return [BOS, *token_ids, EOS]

class KoChatDataset(Dataset):
    """Fixed-length next-token-prediction examples built from Q/A pairs.

    Each row's question and answer are joined with a single space, encoded
    with ``encode_with_bos_eos`` and right-padded with ``PAD`` to ``max_len``
    ids. Rows whose encoding is longer than ``max_len`` are skipped. Every
    item is an ``(input_ids, target_ids)`` pair of ``torch.long`` tensors of
    length ``max_len - 1``, where the target is the input shifted left by one
    position.

    Args:
        df: Object with ``'Q'`` and ``'A'`` columns (e.g. a DataFrame).
        max_len: Padded length of each example before the one-step shift.
    """

    def __init__(self, df, max_len=MAX_LEN):
        self.items = []
        # Read from the `df` argument so the dataset reflects what the caller
        # passed in, not whatever frame happens to exist at module level.
        for q, a in zip(df['Q'], df['A']):
            # Question and answer form one sequence for the decoder-only model.
            text = f'{q} {a}'
            ids = encode_with_bos_eos(text)
            if len(ids) > max_len:
                continue

            # Right-pad to the fixed length.
            ids = ids + [PAD] * (max_len - len(ids))

            # Decoder input drops the last id; the target drops the first.
            input_ids = ids[:-1]
            target_ids = ids[1:]
            self.items.append((
                torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(target_ids, dtype=torch.long),
            ))

    def __len__(self):
        """Return the number of examples that were kept."""
        return len(self.items)

    def __getitem__(self, i):
        """Return the ``(input_ids, target_ids)`` pair at index *i*."""
        return self.items[i]


In [None]:
from torch.utils.data import random_split
# Build the dataset from the cleaned Q/A frame and hold out 10% of it for
# validation.
full_ds=KoChatDataset(df_c)
n_total=len(full_ds)
n_val=int(0.1*n_total)
n_train=n_total-n_val

# NOTE(review): no generator is passed to random_split, so the split
# presumably depends on the global RNG state and differs between runs.
train_ds,val_ds=random_split(full_ds,[n_train,n_val])
# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps every example in order.
train_dl=DataLoader(train_ds,batch_size=64,shuffle=True, drop_last=True)
val_dl=DataLoader(val_ds,batch_size=64,shuffle=False)

# 모델 설계

In [None]:
import math
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

def create_padding_mask(x, pad_id):
    """Mark padding positions of *x* with 1.0 and all others with 0.0.

    Two singleton axes are inserted after the first axis, so a ``(B, L)``
    input yields a float mask of shape ``(B, 1, 1, L)``.
    """
    is_pad = x.eq(pad_id).to(torch.float32)
    return is_pad[:, None, None]

def create_look_ahead_mask(tgt_len, device=None):
    """Return a ``(1, 1, tgt_len, tgt_len)`` mask that is 1.0 above the diagonal.

    Entry ``[.., i, j]`` is 1.0 when ``j > i`` (a later position) and 0.0
    otherwise.
    """
    all_ones = torch.ones(tgt_len, tgt_len, device=device)
    future_only = all_ones.triu(diagonal=1)
    return future_only[None, None]

def combine_masks(causal, pad):
    """Merge two masks by element-wise maximum (with broadcasting).

    If either argument is ``None`` the other one is returned unchanged
    (which is ``None`` when both are ``None``).
    """
    if causal is not None and pad is not None:
        return torch.max(causal, pad)
    return pad if causal is None else causal


def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q, K, V: Query, key and value tensors; the last axis of ``K`` is d_k.
        mask: Optional tensor broadcastable to the score matrix. It is
            multiplied by -1e9 and added to the scores, so entries equal to 1
            are effectively excluded from the softmax.

    Returns:
        ``(output, attention_weights)``.
    """
    d_k = K.shape[-1]
    logits = (Q @ K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        logits = logits + mask * (-1e9)
    weights = F.softmax(logits, dim=-1)
    return weights @ V, weights


class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate Q/K/V projections and an output projection.

    The model dimension is split into ``num_heads`` heads of size
    ``d_model // num_heads``. Dropout is applied to the per-head attention
    output before the heads are merged and projected by ``Wo``.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        # Attribute names and creation order are kept stable: they determine
        # the parameter names and the order in which weights are initialized.
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.Wo = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(dropout)

    def split_heads(self, x):
        """Reshape ``(B, L, D)`` into ``(B, num_heads, L, depth)``."""
        batch, length, _ = x.shape
        per_head = x.view(batch, length, self.num_heads, self.depth)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: ``(B, H, L, d)`` back to ``(B, L, H*d)``."""
        batch, heads, length, depth = x.shape
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch, length, heads * depth)

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, and return the projected result."""
        q_heads = self.split_heads(self.Wq(q))
        k_heads = self.split_heads(self.Wk(k))
        v_heads = self.split_heads(self.Wv(v))
        context, _ = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        context = self.drop(context)
        return self.Wo(self.combine_heads(context))


class PositionwiseFFN(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Maps ``d_model`` features to ``ff_dim`` and back to ``d_model``.
    """

    def __init__(self, d_model, ff_dim, dropout= 0.1):
        super().__init__()
        self.lin1 = nn.Linear(d_model, ff_dim)
        self.lin2 = nn.Linear(ff_dim, d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the feed-forward block to the last axis of *x*."""
        hidden = F.relu(self.lin1(x))
        hidden = self.drop(hidden)
        return self.lin2(hidden)


class DecoderLayer(nn.Module):
    """Decoder-only Transformer block with pre-LayerNorm residual sub-layers.

    The block applies ``x + Dropout(SelfAttention(LayerNorm(x)))`` followed by
    ``x + Dropout(FFN(LayerNorm(x)))``. There is no cross-attention.
    """

    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFFN(d_model, ff_dim, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, enc_out=None, tgt_mask=None):
        """Run the block on *x*.

        Args:
            x: Input hidden states.
            enc_out: Ignored. It is kept (now optional) only so existing call
                sites keep working; this block never reads it.
            tgt_mask: Optional attention mask forwarded to self-attention.
                Pass it by keyword (or as the third positional argument): a
                mask given as the second positional argument lands in
                ``enc_out`` and is silently ignored.

        Returns:
            The transformed hidden states, same shape as *x*.
        """
        # Self-attention sub-layer (LayerNorm first, then the residual add).
        normed = self.ln1(x)
        x = x + self.drop1(self.self_mha(normed, normed, normed, tgt_mask))
        # Feed-forward sub-layer, same pre-norm residual pattern.
        normed = self.ln2(x)
        return x + self.drop2(self.ffn(normed))

class Decoder(nn.Module):
    """Decoder-only language model: embeddings, N decoder layers, final LayerNorm, vocab projection.

    Args:
        vocab_size: Number of token ids.
        d_model: Hidden size.
        num_layers: Number of ``DecoderLayer`` blocks.
        num_heads: Attention heads per block.
        ff_dim: Inner size of each block's feed-forward network.
        pad_id: Padding token id; used as the embedding's ``padding_idx`` and
            to build the padding mask. Exposed as ``self.pad``.
        dropout: Dropout probability for the embeddings and every block.
        max_len: Number of learned position embeddings, i.e. the longest
            sequence ``forward`` accepts.
        weight_tying: If true, the output projection shares its weight matrix
            with the token embedding.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, ff_dim, pad_id, dropout=0.1, max_len=512, weight_tying=True):
        super().__init__()
        self.pad = pad_id
        self.max_len = max_len
        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.drop = nn.Dropout(dropout)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, ff_dim, dropout) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.out = nn.Linear(d_model, vocab_size)
        if weight_tying:
            self.out.weight = self.tok_emb.weight

    def forward(self, tgt):
        """Return next-token logits of shape ``(B, L, vocab_size)`` for ids ``tgt`` of shape ``(B, L)``.

        Raises:
            IndexError: If ``L`` exceeds ``max_len`` (no position embedding
                exists for the extra positions).
        """
        B, L = tgt.size()
        if L > self.max_len:
            # Fail with a readable message instead of an opaque embedding
            # index error.
            raise IndexError(f"sequence length {L} exceeds max_len={self.max_len}")
        device = tgt.device
        pos_ids = torch.arange(L, device=device).unsqueeze(0).expand(B, L)
        x = self.tok_emb(tgt) + self.pos_emb(pos_ids)
        x = self.drop(x)

        # 1 = "do not attend": future positions (causal) or padding keys.
        causal = create_look_ahead_mask(L, device=device)
        padmask = create_padding_mask(tgt, self.pad)
        tgt_mask = combine_masks(causal, padmask)

        for layer in self.layers:
            # The mask must be passed by keyword: the layer's second
            # positional parameter is the unused `enc_out`, so a positional
            # mask would be dropped and attention would see future tokens.
            x = layer(x, enc_out=None, tgt_mask=tgt_mask)

        x = self.ln_f(x)
        logits = self.out(x)
        return logits


class TransformerWarmupLR(torch.optim.lr_scheduler.LambdaLR):
    """LambdaLR with the warmup / inverse-square-root multiplier.

    For the 1-based step ``t`` the multiplier is
    ``d_model**-0.5 * min(t**-0.5, t * warmup_steps**-1.5)``.
    """

    def __init__(self, optimizer, d_model, warmup_steps=4000):
        def multiplier(step_index):
            # LambdaLR counts from 0; shift so the first step is t = 1.
            t = step_index + 1
            decay = t ** -0.5
            warmup = t * (warmup_steps ** -1.5)
            return (d_model ** -0.5) * min(decay, warmup)

        super().__init__(optimizer, lr_lambda=multiplier)

def train_step(model, batch, optimizer, loss_fn, device, scheduler=None):
    """Run one optimization step on *batch* and return ``(loss, accuracy)``.

    Args:
        model: Module mapping input ids to logits of shape ``(B, L, V)``; its
            ``pad`` attribute is the padding id excluded from the accuracy.
        batch: ``(input_ids, target_ids)`` pair of tensors.
        optimizer: Optimizer to step.
        loss_fn: Callable taking ``(B*L, V)`` logits and ``(B*L,)`` targets.
        device: Device the batch is moved to.
        scheduler: Optional LR scheduler stepped after the optimizer. When
            omitted, a module-level variable named ``scheduler`` is used if
            one exists; otherwise no scheduler is stepped.

    Returns:
        The loss as a float and the token accuracy over non-padding targets.
    """
    # Callers that do not pass a scheduler get the module-level one, if any,
    # so the five-argument call keeps stepping the learning-rate schedule.
    lr_scheduler = scheduler if scheduler is not None else globals().get("scheduler")

    model.train()
    inp, tgt = (t.to(device) for t in batch)
    optimizer.zero_grad()
    logits = model(inp)
    B, L, V = logits.shape
    loss = loss_fn(logits.reshape(B * L, V), tgt.reshape(B * L))
    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    if lr_scheduler is not None:
        lr_scheduler.step()

    with torch.no_grad():
        preds = logits.argmax(-1)
        mask = tgt != model.pad
        # clamp avoids 0/0 (NaN) if a batch has no non-padding target.
        acc = (preds.eq(tgt) & mask).float().sum() / mask.float().sum().clamp(min=1.0)
    return loss.item(), acc.item()

In [None]:
# Hyperparameters handed to Decoder(...) in the next cell.
# vocab_size, pad_id and max_len repeat values fixed earlier in the notebook
# (tokenizer VOCAB_SIZE, pad_id=0, MAX_LEN) and must stay in sync with them.
vocab_size = 8000
d_model    = 192
n_layers   = 3
n_heads    = 6
ff_dim     = 768
pad_id     = 0
max_len    = 40


In [None]:
# Use the GPU when one is available.
device='cuda' if torch.cuda.is_available() else 'cpu'
# Positional arguments after pad_id are dropout (0.1) and max_len.
model=Decoder(vocab_size, d_model, n_layers, n_heads, ff_dim, pad_id, 0.1, max_len, weight_tying=True).to(device)
# The base lr is 1.0 because the scheduler below is a LambdaLR, whose lambda
# value scales this base rate.
optimizer=torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)
# train_step reads this module-level name when it steps the schedule.
scheduler=TransformerWarmupLR(optimizer, d_model, warmup_steps=4000)
# Targets equal to pad_id are ignored by the loss.
loss_fn=nn.CrossEntropyLoss(ignore_index=pad_id).to(device)

In [None]:
# Print the module tree to check the architecture.
print(model)

Decoder(
  (tok_emb): Embedding(8000, 192, padding_idx=0)
  (pos_emb): Embedding(40, 192)
  (drop): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-2): 3 x DecoderLayer(
      (self_mha): MultiHeadAttention(
        (Wq): Linear(in_features=192, out_features=192, bias=True)
        (Wk): Linear(in_features=192, out_features=192, bias=True)
        (Wv): Linear(in_features=192, out_features=192, bias=True)
        (Wo): Linear(in_features=192, out_features=192, bias=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
      (ffn): PositionwiseFFN(
        (lin1): Linear(in_features=192, out_features=768, bias=True)
        (lin2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      (drop1): Dropout(p=0.1, inplace=False)
      (drop2): Dropout(p=0.1, inplace=False)
    )
  )
 

In [None]:
# Training constants.
# NOTE(review): in the visible cells the data loaders, optimizer, gradient
# clipping, validation split and epoch loop use literal values (64, lr=1.0,
# 1.0, 0.1, range(20)) instead of BATCH_SIZE, LR, CLIP_NORM, VAL_FRACTION and
# EPOCHS. MAX_LEN and VOCAB_SIZE repeat earlier definitions.
BATCH_SIZE   = 64
EPOCHS       = 5
MAX_LEN      = 40
LR           = 1.0
CLIP_NORM    = 1.0
VAL_FRACTION = 0.1
VOCAB_SIZE = 8000

@torch.no_grad()
def eval_step(model, loader, loss_fn, device):
    """Evaluate *model* over *loader* without gradient tracking.

    Loss and non-padding token accuracy are computed per batch and averaged
    with equal weight per batch. An empty loader yields ``(0.0, 0.0, 1.0)``.

    Returns:
        ``(avg_loss, avg_acc, ppl)`` where ``ppl = exp(min(20, avg_loss))``;
        the cap keeps the exponential finite.
    """
    model.eval()
    loss_total, acc_total, num_batches = 0.0, 0.0, 0
    for batch in loader:
        inp, tgt = (t.to(device) for t in batch)
        logits = model(inp)
        batch_size, seq_len, vocab = logits.shape
        flat_len = batch_size * seq_len
        batch_loss = loss_fn(logits.view(flat_len, vocab), tgt.view(flat_len))

        not_pad = tgt != model.pad
        correct = logits.argmax(-1).eq(tgt) & not_pad
        batch_acc = correct.float().sum() / not_pad.float().sum()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()
        num_batches += 1

    denom = max(1, num_batches)
    avg_loss = loss_total / denom
    avg_acc = acc_total / denom
    ppl = math.exp(min(20, avg_loss))
    return avg_loss, avg_acc, ppl


# 학습

In [None]:
# Train for up to 20 epochs, logging running training metrics every 100 steps
# and validation metrics at the end of each epoch.
for epoch in range(20):
    run_loss, run_acc, n = 0.0, 0.0, 0
    for step, batch in enumerate(train_dl, 1):
        loss, acc = train_step(model, batch, optimizer, loss_fn, device)
        run_loss += loss; run_acc += acc; n += 1
        if step % 100 == 0:
            # Report the mean over the last window, then start a new window.
            print(f"[Epoch {epoch} | Step {step}] train_loss={run_loss/n:.4f}  train_acc={run_acc/n:.4f}")
            run_loss, run_acc, n = 0.0, 0.0, 0

    # val_ppl is exp(val_loss) capped at exp(20) by eval_step.
    val_loss, val_acc, val_ppl = eval_step(model, val_dl, loss_fn, device)
    print(f"Epoch {epoch}: val_loss={val_loss:.4f}  val_acc={val_acc:.4f}  val_ppl={val_ppl:.2f}")

[Epoch 0 | Step 100] train_loss=110.2687  train_acc=0.0013
Epoch 0: val_loss=39.0556  val_acc=0.0158  val_ppl=485165195.41
[Epoch 1 | Step 100] train_loss=31.5911  train_acc=0.0559
Epoch 1: val_loss=21.3716  val_acc=0.1420  val_ppl=485165195.41
[Epoch 2 | Step 100] train_loss=21.3437  train_acc=0.1054
Epoch 2: val_loss=16.9599  val_acc=0.1569  val_ppl=23206513.94
[Epoch 3 | Step 100] train_loss=17.1669  train_acc=0.1276
Epoch 3: val_loss=13.2518  val_acc=0.1610  val_ppl=569088.48
[Epoch 4 | Step 100] train_loss=13.5001  train_acc=0.1382
Epoch 4: val_loss=10.3455  val_acc=0.1666  val_ppl=31117.86
[Epoch 5 | Step 100] train_loss=10.8688  train_acc=0.1520
Epoch 5: val_loss=8.4711  val_acc=0.2009  val_ppl=4774.65
[Epoch 6 | Step 100] train_loss=9.0159  train_acc=0.1807
Epoch 6: val_loss=6.3665  val_acc=0.2993  val_ppl=582.01
[Epoch 7 | Step 100] train_loss=5.9523  train_acc=0.3430
Epoch 7: val_loss=1.2009  val_acc=0.8236  val_ppl=3.32
[Epoch 8 | Step 100] train_loss=1.2113  train_acc=0.815

# 테스트

In [None]:
@torch.no_grad()
def generate(model, sp, prompt: str, max_len_ctx: int, device: str, max_new_tokens=30, temperature=1.0, eos_id=None):
    """Sample a continuation of *prompt* one token at a time.

    The prompt is encoded as BOS + tokens and truncated to
    ``max_len_ctx - 1`` ids. Each step samples from the softmax of the last
    position's logits divided by ``max(1e-5, temperature)``. Sampling stops
    after ``max_new_tokens`` tokens, when the sequence reaches ``max_len_ctx``
    ids, or when ``eos_id`` (if given) is sampled.

    Returns:
        The full id list: prompt ids followed by the sampled ids.
    """
    model.eval()
    prompt_ids = ([sp.bos_id()] + sp.encode(prompt, out_type=int))[: max_len_ctx - 1]
    seq = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, :]
    safe_temperature = max(1e-5, temperature)
    for _ in range(max_new_tokens):
        last_logits = model(seq)[:, -1, :] / safe_temperature
        sampled = torch.multinomial(torch.softmax(last_logits, dim=-1), num_samples=1)
        seq = torch.cat([seq, sampled], dim=1)
        # Stop at the context limit first, then on end-of-sequence.
        if seq.size(1) >= max_len_ctx or (eos_id is not None and sampled.item() == eos_id):
            break
    return seq[0].tolist()

In [None]:
# Sample up to 30 new tokens after the prompt at temperature 0.8, stopping
# early on the tokenizer's EOS id, then decode the ids back to text.
out_ids = generate(model, sp, prompt="안녕, 뭐", max_len_ctx=MAX_LEN,
                     device=device, max_new_tokens=30, temperature=0.8, eos_id=sp.eos_id())
print("GEN:", sp.decode(out_ids))


GEN: 안녕, 뭐 뭐 안녕 뭐 뭐 안녕 뭐 맛있는 맛있는 맛있는 뭐 안녕 뭐 뭐 뭐 안녕 맛있는 맛있는 뭐푸 맛있는, 앞에 맛있는 뭐 안녕 안녕 뭐 뭐 안녕 안녕
