In [51]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Pick the compute device: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Batch geometry: sequences drawn per batch and tokens per sequence.
batch_size = 4
block_size = 8

cuda


In [52]:
# Load the whole training corpus into memory as one string.
with open('dotu.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is every distinct character in the corpus, in sorted order.
chars = list(set(text))
chars.sort()
print(chars)

vocab_size = len(chars)
print(vocab_size)

['\t', '\n', '\x1e', ' ', '!', '%', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'X', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '©', '«', '±', '»', 'Ё', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', '—', '“', '”', '…', '№']
152


In [53]:
# Character <-> integer id lookup tables built from the sorted vocabulary.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


def encode(s):
    """Map each character of ``s`` to its integer id and return the ids as a list."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Join the characters for the ids in ``l`` into a single string."""
    return ''.join(int_to_string[i] for i in l)


# Round-trip a sample word to sanity-check the two mappings.
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)

print(encoded_hello)
print(decoded_hello)

[59, 56, 62, 62, 65]
hello


In [54]:
# Encode the entire corpus as a 1-D tensor of 64-bit integer token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
# Peek at the first 100 ids.
print(data[0:100])

tensor([  3,   3,   1,  86, 128, 131, 132, 114, 132, 128, 137, 127, 128,   3,
        128, 115, 139, 114, 145,   3, 132, 119, 128, 130, 122, 145,   3, 133,
        129, 130, 114, 116, 125, 119, 127, 122, 145,   1,  51,  51,  51,  51,
         51,  51,  51,  51,  51,  51,  51,  51,  51,  51,   1,  97, 128, 131,
        132, 114, 127, 128, 116, 128, 137, 127, 141, 119,   3, 126, 114, 132,
        119, 130, 122, 114, 125, 141,   3, 133, 137, 119, 115, 127, 128, 117,
        128,   3, 124, 133, 130, 131, 114,   3,   1, 134, 114, 124, 133, 125,
        142, 132])


In [55]:
# Hold out the final 20% of the token stream for validation.
n = int(0.8 * len(data))  # index where the validation split begins
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) sequences from one split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors moved to ``device``, each stacking
        ``batch_size`` windows of ``block_size`` tokens. ``y`` is the same
        window as ``x`` shifted one position to the right, so ``y[b, t]`` is
        the token that follows ``x[b, t]`` in the source data.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Random window start offsets; the upper bound keeps the one-step-shifted
    # target window inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # Shows the sampled offsets on every call (noisy if called in a loop).
    print(starts)

    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and show the inputs next to their shifted targets.
x, y = get_batch('train')
print('inputs: ', x, sep='\n')
print('targets: ', y, sep='\n')

tensor([647540, 155174, 614556, 353802])
inputs: 
tensor([[  3, 118, 128, 115, 114, 116, 122, 132],
        [126,   3, 131, 126, 141, 131, 125, 119],
        [133,  10,   3, 127, 114, 133, 124, 133],
        [  3, 116, 141, 137, 122, 131, 125, 119]], device='cuda:0')
targets: 
tensor([[118, 128, 115, 114, 116, 122, 132,   3],
        [  3, 131, 126, 141, 131, 125, 119,   7],
        [ 10,   3, 127, 114, 133, 124, 133,  10],
        [116, 141, 137, 122, 131, 125, 119, 127]], device='cuda:0')


In [56]:
# Walk through a single block to show that it packs block_size training
# examples: the first t+1 tokens are the context, the next token is the target.
block_size = 8
x, y = train_data[:block_size], train_data[1:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], y[t]
    # Keep positional print args: str() of a 0-dim tensor renders as
    # "tensor(3)", whereas formatting it inside an f-string renders as "3".
    print('when input is', context, 'target is', target)

when input is tensor([3]) target is tensor(3)
when input is tensor([3, 3]) target is tensor(1)
when input is tensor([3, 3, 1]) target is tensor(86)
when input is tensor([ 3,  3,  1, 86]) target is tensor(128)
when input is tensor([  3,   3,   1,  86, 128]) target is tensor(131)
when input is tensor([  3,   3,   1,  86, 128, 131]) target is tensor(132)
when input is tensor([  3,   3,   1,  86, 128, 131, 132]) target is tensor(114)
when input is tensor([  3,   3,   1,  86, 128, 131, 132, 114]) target is tensor(132)


In [65]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape (vocab_size, vocab_size): looking up token ``i``
    returns the unnormalised scores (logits) for the token that follows ``i``.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) next-token logit table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            index: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With ``targets``, ``logits``
            is flattened to (B*T, C), the layout ``F.cross_entropy`` expects,
            and ``loss`` is the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip autograd bookkeeping
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the original context followed by
            the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Call the module itself rather than .forward() so that any
            # registered forward hooks are run.
            logits, _loss = self(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)

        return index
    
# Build the model and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Start from a single token with id 0 and sample 500 more tokens from the
# untrained model, then decode the ids back to text.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context, max_new_tokens=500)[0].tolist()
)
print(generated_chars)

	zldь/>ъ_тскы»-Щ>вlMe…d.A©ьВNlСк*9«пKвб!A>Щ/ю©AД» %bЦay9*мeЁп.<кщ7l©»4щ*Уot-+nAчS-uсщXAяБШN ЗMФО…н6±…хS»
оюцtцееSn5TРйз0ж=В	хЖPЕД) ЕшmцгсyщУч©щNКвчшаPSвЧyЪХruмUp—rЖc-м«ЩRИ.ЙИxщXрu=Pxы(mlnН- ИЭфLД%tТZщM)7[UfГs1zНNvЭш/hр№dЩьшиИ?u)t
?ъз>ъ8+KKЮйУeVD1Рd+sЪ*ъ9оэТ]Л©кр»VЫ ХЮ0<pЕИ
З/C!»еMФg,вЫI%»d]щЬПdяD1с0gърЯM/-эoAЦфлIБ=яT)Ё%Ь©В1%”cцН!Лmь>д=SЛЬ8жЁк*vafц
XwОвъ№vмЭ:/ж©E8E(—rЖЧ чсp]ВрhXпдXЯЭлyщ

уyь5EУЧЦ±э
З-:п”ImlСO1гдгOн6Мu©фr»gа+:lf3Т[<8_7ЦфЁА%Кв±fu8NгВКж±Вш_Aърь%nP вc]тa(.C)Мw	S=жыLыLАЁНю ьВ1.!S=bdч
