In [1]:
import torch
import gpt1

torch.manual_seed(10)

<torch._C.Generator at 0x756801b2f170>

# GPT



In [2]:
device = 'cpu'
batch_size = 2
seq_len = 40

max_seq_len = 300
vocab_size = 100
emb_size = 128

head_size = 4
num_heads = 3
num_layers = 5
dropout = 0.3

In [3]:
# на входе тензор токенов (batch_size,seq_len)
x = torch.randint(low=0,high=vocab_size-1,size=(batch_size,seq_len), dtype=torch.long, device=device)
x.shape, x.dtype

(torch.Size([2, 40]), torch.int64)

In [4]:
gpt = gpt1.GPT(
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    emb_size=emb_size,
    num_heads=num_heads,
    head_size=head_size,
    num_layers=num_layers,
    dropout=dropout,
    device=device)

In [5]:
#  на выходе loggits batch_size x seq_len x vocab_size
y = gpt.forward(x)
y.shape, y.dtype, torch.isnan(y).any().item()

(torch.Size([2, 40, 100]), torch.float32, False)

# GPT.generate

In [6]:
max_new_tokens = 20
#  на выходе []int batch_size x (seq_len + max_new_tokens)
new_seq = gpt.generate(x=x,max_new_tokens=max_new_tokens)
new_seq.shape, new_seq.dtype, torch.isnan(new_seq).any().item(), torch.unique(new_seq), len(torch.unique(new_seq))

(torch.Size([2, 60]),
 torch.int64,
 False,
 tensor([ 0,  1,  4,  5,  6,  7,  9, 10, 11, 12, 13, 15, 18, 20, 21, 22, 23, 24,
         25, 26, 27, 28, 31, 32, 33, 34, 35, 40, 41, 42, 43, 46, 47, 48, 49, 50,
         52, 53, 54, 55, 57, 58, 60, 62, 63, 65, 67, 68, 69, 74, 76, 79, 80, 81,
         82, 83, 84, 85, 89, 94, 95, 98]),
 62)

In [7]:
# заготовка, чтоб посмотреть как работает softmax
# batch_size x seq_len x vocab_size
logits = torch.rand([2,4,3])
logits.shape,logits.dtype

(torch.Size([2, 4, 3]), torch.float32)

In [8]:
# берём последний токен
last_log = logits[:,-1,:]
last_log.shape, last_log

(torch.Size([2, 3]),
 tensor([[0.8099, 0.0461, 0.0193],
         [0.2994, 0.9919, 0.3432]]))

In [9]:
# сумма по каждой строке 1
# для каждого batch один следующий токен
prob = torch.softmax(last_log,dim=-1)
prob

tensor([[0.5210, 0.2427, 0.2363],
        [0.2473, 0.4943, 0.2584]])

In [10]:
0.5210+ 0.2427+ 0.2363

1.0

In [11]:
# keep_dim чтоб потом можно было сделать cat c batch_size x seq_len -> batch_size x (seq_len+1)
next_token = torch.argmax(prob,dim=-1,keepdim=True)
next_token, next_token.shape

(tensor([[0],
         [1]]),
 torch.Size([2, 1]))

# GPT.generate (multinominal sample)

In [12]:
# включили сэмплирование - набор токенов в ответе стал разнообразней (не только самые вероятные)
new_seq = gpt.generate(x=x,max_new_tokens=max_new_tokens,do_sample=True)
new_seq.shape, new_seq.dtype, torch.isnan(new_seq).any().item(), torch.unique(new_seq),len(torch.unique(new_seq))

(torch.Size([2, 60]),
 torch.int64,
 False,
 tensor([ 1,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21,
         22, 23, 24, 25, 26, 27, 29, 31, 32, 33, 35, 40, 41, 42, 45, 46, 47, 48,
         49, 50, 51, 52, 53, 54, 55, 56, 58, 60, 62, 65, 67, 68, 69, 70, 72, 74,
         75, 77, 79, 80, 81, 82, 83, 84, 85, 89, 94, 95, 98, 99]),
 68)

In [13]:
# заготовка для Multinomial sampling (вероятностного сэмплирования)
prob

tensor([[0.5210, 0.2427, 0.2363],
        [0.2473, 0.4943, 0.2584]])

In [14]:
# здесь с размерностью всё хорошо batch_size x 1
next_token=torch.multinomial(prob,num_samples=1)
next_token,next_token.shape

(tensor([[1],
         [0]]),
 torch.Size([2, 1]))

# GPT.generate (temperature)

In [15]:
# с температурой можем получить промежуточный результат
new_seq = gpt.generate(x=x,max_new_tokens=max_new_tokens,do_sample=True, temperature=0.3)
new_seq.shape, new_seq.dtype, torch.isnan(new_seq).any().item(), torch.unique(new_seq), len(torch.unique(new_seq))

(torch.Size([2, 60]),
 torch.int64,
 False,
 tensor([ 1,  2,  3,  4,  6,  7,  9, 10, 11, 12, 13, 15, 18, 20, 21, 22, 23, 24,
         25, 26, 27, 28, 30, 32, 33, 35, 37, 38, 40, 41, 42, 45, 47, 48, 49, 50,
         52, 53, 54, 55, 57, 58, 60, 62, 63, 65, 67, 68, 69, 70, 74, 76, 79, 80,
         81, 82, 83, 84, 89, 93, 94, 95, 96, 98]),
 64)

# GPT.generate (top_k)

In [16]:
# не всё так однозначно - кол-во различных токенов может и нерасти растёт с ростом top_k
new_seq = gpt.generate(x=x,max_new_tokens=max_new_tokens,do_sample=True, top_k=2)
new_seq.shape, new_seq.dtype, torch.isnan(new_seq).any().item(), torch.unique(new_seq), len(torch.unique(new_seq))

(torch.Size([2, 60]),
 torch.int64,
 False,
 tensor([ 1,  4,  6,  7,  9, 10, 11, 12, 13, 15, 18, 20, 21, 22, 23, 24, 25, 26,
         27, 32, 33, 34, 35, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 57, 58, 60, 62, 63, 65, 66, 67, 68, 69, 70, 73, 74, 76, 79, 80,
         81, 82, 83, 84, 86, 89, 92, 94, 95, 98]),
 64)

In [17]:
new_seq = gpt.generate(x=x,max_new_tokens=max_new_tokens,do_sample=True, top_k=25)
new_seq.shape, new_seq.dtype, torch.isnan(new_seq).any().item(), torch.unique(new_seq), len(torch.unique(new_seq))

(torch.Size([2, 60]),
 torch.int64,
 False,
 tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15, 17, 18, 20,
         21, 22, 23, 24, 25, 26, 27, 32, 33, 35, 40, 41, 42, 45, 47, 48, 49, 50,
         52, 53, 54, 55, 57, 58, 60, 62, 65, 67, 68, 69, 70, 72, 74, 78, 79, 80,
         81, 82, 83, 84, 88, 89, 90, 93, 94, 95]),
 64)

In [18]:
# заготовка
# batch_size x seq_len x vocab_size
logits = torch.rand([2,4,5])
# batch_size x vocab_size
logits = logits[:,-1,:]
logits.shape,logits.dtype,logits

(torch.Size([2, 5]),
 torch.float32,
 tensor([[0.2331, 0.8004, 0.7314, 0.6114, 0.2042],
         [0.3388, 0.9878, 0.0284, 0.1518, 0.7985]]))

In [19]:
# индексы трёх максимальных токенов
_, top_k_ind = torch.topk(logits,k=3, dim=-1)
top_k_ind, top_k_ind.shape

(tensor([[1, 2, 3],
         [1, 4, 0]]),
 torch.Size([2, 3]))

In [20]:
filtered = torch.full_like(logits,float('-inf'))
filtered.scatter_(dim=-1,index=top_k_ind, src=logits)
filtered

tensor([[  -inf, 0.2331, 0.8004, 0.7314,   -inf],
        [0.0284, 0.3388,   -inf,   -inf, 0.9878]])

# GPT.generate (top_p)

In [22]:
# заготовка
# batch_size x seq_len x vocab_size
logits = torch.rand([2,4,5])
# batch_size x vocab_size
logits = logits[:,-1,:]
logits.shape,logits.dtype,logits

(torch.Size([2, 5]),
 torch.float32,
 tensor([[0.4911, 0.9985, 0.3851, 0.7970, 0.3258],
         [0.0306, 0.9679, 0.9019, 0.9500, 0.2201]]))

In [23]:
prob = torch.softmax(logits, dim=-1)
sorted_prob, sorted_indices = torch.sort(prob, descending=True, dim=-1)
prob,sorted_indices

(tensor([[0.1734, 0.2881, 0.1560, 0.2355, 0.1470],
         [0.1035, 0.2643, 0.2474, 0.2596, 0.1251]]),
 tensor([[1, 3, 0, 2, 4],
         [1, 3, 2, 4, 0]]))

In [26]:
cumulative_prob = torch.cumsum(sorted_prob, dim=-1)
sorted_prob,cumulative_prob

(tensor([[0.2881, 0.2355, 0.1734, 0.1560, 0.1470],
         [0.2643, 0.2596, 0.2474, 0.1251, 0.1035]]),
 tensor([[0.2881, 0.5236, 0.6970, 0.8530, 1.0000],
         [0.2643, 0.5239, 0.7714, 0.8965, 1.0000]]))