# 環境の説明
```
conda create -n LLMfromScratch python
```
ここからpipを使ってインストールする

インストールしたもの

`pip install tiktoken`

`pip install torch`

# テキストデータの読み込み

urllibを使ってtxtデータに変換する

In [1]:
!pwd

/Users/kakuayato/Documents/GitHub/CurioSync_LLM/public/llm


In [2]:
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):

    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length,stride)

    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

with open("data/small-text-sample.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

vocab_size = 50257
output_dim = 256
max_len = 1024
context_length = max_len


token_embedding_layer = nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
for batch in dataloader:
    x, y = batch

    token_embeddings = token_embedding_layer(x)
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    input_embeddings = token_embeddings + pos_embeddings

    break

# Self_Attention

訓練可能な重みを持たない単純なself-attentionメカニズム

In [4]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [5]:
query = inputs[1]

attn_score_2 = torch.empty(inputs.shape[0])
print("attn_score_2.shape", attn_score_2.shape)
for i, x_i in enumerate(inputs):
    attn_score_2[i] = torch.dot(x_i, query)

print(attn_score_2)

attn_score_2.shape torch.Size([6])
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [6]:
attn_weights_2_tmp = attn_score_2 / attn_score_2.sum()
print(attn_weights_2_tmp)
print(attn_weights_2_tmp.sum())

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
tensor(1.0000)


In [7]:
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

In [8]:
attn_weights_2 = torch.softmax(attn_score_2, dim=0)
print(attn_weights_2)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [9]:
# クエリに対するコンテキストベクトルを求める
'''
多分必要なのは
1, attn_weight_2
2, inputs
だね
'''
context_2 = torch.matmul(attn_weights_2,inputs)

# 上のやつ自分でも実装してみる
context_2_tmp = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_2_tmp += attn_weights_2[i]*x_i
print(context_2)
print(context_2_tmp)

tensor([0.4419, 0.6515, 0.5683])
tensor([0.4419, 0.6515, 0.5683])


全てのトークンに対して同じことをする


In [10]:
# とりあえず今までの流れをここに書いておく
inputs 
query = inputs[1]

print(inputs.shape)
attn_score_2 = torch.zeros(inputs.shape[0])
attn_weights_2 = torch.zeros(inputs.shape[0])
print(attn_score_2)
for i, x_i in enumerate(inputs):
    attn_score_2[i] = torch.dot(query, x_i)
attn_weights_2 = torch.softmax(attn_score_2,dim=0)
print(attn_weights_2)

context_2 = torch.matmul(attn_weights_2,inputs)
print(context_2)

torch.Size([6, 3])
tensor([0., 0., 0., 0., 0., 0.])
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([0.4419, 0.6515, 0.5683])


In [11]:
# 上の流れを参考に全てのトークンに対して行う
attn_score = torch.zeros(6,6)
attn_weights = torch.zeros(6,6)
context = torch.zeros(6,3)
# print(attn_score)

for token_i, query in enumerate(inputs):
    for i, x_i in enumerate(inputs):
        attn_score[token_i][i] = torch.dot(query, x_i)
    
    attn_weights[token_i] = torch.softmax(attn_score[token_i],dim=0)
    context[token_i] = torch.matmul(attn_weights[token_i],inputs)

print(context)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


## 訓練可能な重みを持つself-attentionを実装する

In [12]:
x_2 = inputs[2]
d_in = inputs.shape[1] #入力埋め込みのサイズ
d_out = 2

print("d_in:",d_in)
print("d_out:",d_out)

d_in: 3
d_out: 2


In [13]:
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in,d_out), requires_grad=False)

In [14]:
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value
print(query_2)
print(key_2)

tensor([0.4300, 1.4343])
tensor([0.4361, 1.1156])


In [15]:
keys = inputs @ W_key
values = inputs @ W_value
print(keys)

tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])


In [16]:
keys_2 = keys[1]
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(1.8284)


In [17]:
attn_scores_2 = query_2 @ keys.T
print(attn_score_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [18]:
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)

tensor([0.1503, 0.2256, 0.2192, 0.1315, 0.0914, 0.1819])


In [19]:
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

tensor([0.3058, 0.8203])


### コンパクトなself attention pythonクラスを実装する

In [20]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in,d_out))
        self.W_key = nn.Parameter(torch.rand(d_in,d_out))
        self.W_value = nn.Parameter(torch.rand(d_in,d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        print("keys:",keys)
        print("queries:", queries)
        print("values:", values)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5,dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

In [21]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in,d_out)
print(sa_v1(inputs))

keys: tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]], grad_fn=<MmBackward0>)
queries: tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]], grad_fn=<MmBackward0>)
values: tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]], grad_fn=<MmBackward0>)
tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [22]:
class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in,d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5,dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

In [23]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in,d_out)
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


練習問題3-1 selfattention_v1とselfattention_v2を比較する

# Causal Attention

In [24]:
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)

# attn_score = torch.dot(keys, queries)
# 二次元以上のベクトルだとただのdot積は当然使えないよねってことで以下の式になる
attn_score = queries @ keys.T

# 
attn_weights = torch.softmax(attn_score / keys.shape[-1]**0.5, dim=-1)
print(attn_score)
print(attn_weights)

tensor([[ 0.2899,  0.0716,  0.0760, -0.0138,  0.1344, -0.0511],
        [ 0.4656,  0.1723,  0.1751,  0.0259,  0.1771,  0.0085],
        [ 0.4594,  0.1703,  0.1731,  0.0259,  0.1745,  0.0090],
        [ 0.2642,  0.1024,  0.1036,  0.0186,  0.0973,  0.0122],
        [ 0.2183,  0.0874,  0.0882,  0.0177,  0.0786,  0.0144],
        [ 0.3408,  0.1270,  0.1290,  0.0198,  0.1290,  0.0078]],
       grad_fn=<MmBackward0>)
tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [25]:
context_length = attn_score.shape[0]
mask_simple = torch.tril(torch.ones(context_length, context_length))
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [26]:
masked_simple = attn_weights*mask_simple
print(masked_simple)

tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.0000, 0.0000],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<MulBackward0>)


In [27]:
row_sums = masked_simple.sum()
print(row_sums)
row_sums = masked_simple.sum(dim=-1, keepdim=True)
print(row_sums)
masked_simple_norm = masked_simple / row_sums
print(masked_simple_norm)

tensor(3.6169, grad_fn=<SumBackward0>)
tensor([[0.1921],
        [0.3700],
        [0.5357],
        [0.6775],
        [0.8415],
        [1.0000]], grad_fn=<SumBackward1>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<DivBackward0>)


In [28]:
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
print(mask)
masked = attn_score.masked_fill(mask.bool(), -torch.inf)
print(masked)

attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim=1)
print(attn_weights)

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])
tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],
        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


ドロップアウトの実装

In [29]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)
example = torch.ones(6,6)
print(dropout(example))

tensor([[2., 2., 0., 2., 2., 0.],
        [0., 0., 0., 2., 0., 2.],
        [2., 2., 2., 2., 0., 2.],
        [0., 2., 2., 0., 0., 2.],
        [0., 2., 0., 2., 0., 2.],
        [0., 2., 2., 2., 2., 0.]])


In [30]:
print(dropout(attn_weights))

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6194, 0.6206, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.4925, 0.4638, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.3941, 0.0000],
        [0.3869, 0.3327, 0.0000, 0.3084, 0.3331, 0.3058]],
       grad_fn=<MulBackward0>)


コンパクトなcasual attentionクラスの実装

In [31]:
batch = torch.stack((inputs,inputs), dim=0)
print(batch.shape)

torch.Size([2, 6, 3])


In [32]:
class CasualAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout_rate,qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.context_length = context_length
        self.dropout = nn.Dropout(dropout_rate)
        self.W_query = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in,d_out, bias= qkv_bias)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )
    
    def forward(self,x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1,2)
        attn_scores.masked_fill(mask.bool()[:num_tokens,:num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[0]** 0.5, dim=1)
        attn_weights_dropout = dropout(attn_weights)
        context_vec = attn_weights_dropout @ values

        return context_vec

In [33]:
torch.manual_seed(123)
context_length = batch.shape[1]
ca = CasualAttention(d_in, d_out, context_length, 0.5)
context_vec = ca(batch)
print(context_vec)

tensor([[[-0.8829, -0.1528],
         [-0.7054, -0.0983],
         [-0.4090, -0.1216],
         [-0.6491, -0.0905],
         [-0.5084, -0.0418],
         [-0.1360, -0.0485]],

        [[-0.8433, -0.2534],
         [-0.7869, -0.1118],
         [-0.3974,  0.0070],
         [-0.7167, -0.2219],
         [-0.7539, -0.1042],
         [-0.6393, -0.0931]]], grad_fn=<UnsafeViewBackward0>)


# Multi-head Attentionに拡張する

In [34]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout_rate, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList(
            [CasualAttention(d_in,d_out,context_length,dropout_rate,qkv_bias)
            for _ in range(num_heads)]
        )
    
    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [35]:
torch.manual_seed(123)
print("batch_shape:", batch.shape)
context_length = batch.shape[1]
d_in, d_out = 3,2
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.5, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)

batch_shape: torch.Size([2, 6, 3])
tensor([[[-0.6678, -0.1957,  0.2299,  0.1691],
         [-0.5192, -0.1717,  0.4769,  0.3899],
         [-0.7840, -0.2420,  0.5868,  0.4548],
         [-0.1405,  0.0689,  0.5010,  0.3751],
         [-0.5319, -0.1664,  0.4613,  0.3782],
         [-0.8569, -0.2716,  0.8855,  0.5773]],

        [[-0.5385, -0.1780,  0.3153,  0.1353],
         [-0.3082, -0.1052,  0.7724,  0.4710],
         [-0.8356, -0.1508,  0.3578,  0.2863],
         [-0.4962, -0.1614,  0.7102,  0.5198],
         [-0.6693, -0.0947,  0.5000,  0.3742],
         [-0.4129, -0.1493,  0.6025,  0.4248]]], grad_fn=<CatBackward0>)


より効率的なmulti-head attentionクラス

In [36]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout_rate, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in,d_out, bias= qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout_rate)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1,2)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)

        attn_scores = queries @ keys.transpose(2,3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        attn_scores.masked_fill(mask_bool, -torch.inf)
                                
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]** 0.5, dim=-1)
        attn_weights = dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1,2)
        context_vec = context_vec.contiguous().view(
            b, num_tokens, self.d_out
        )
        context_vec = self.out_proj(context_vec)

        return context_vec


# 練習問題3-3:GTP-2と同じ規模のAttentionモジュールを初期化する

In [37]:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [38]:
context_length = 1024
num_heads = 12
d_in, d_out = 768, 768
batch = torch.rand(6,1024,768)
print(batch.shape)

batch_size = 6
mha = MultiHeadAttention(d_in,d_out,context_length, 0.0,num_heads)

torch.Size([6, 1024, 768])
