In [40]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
input = encoding.encode("tiktoken is great!")
input, encoding.decode(input)

([83, 1609, 5963, 374, 2294, 0], 'tiktoken is great!')

In [41]:
import torch
data = torch.tensor(encoding.encode("tiktoken is great!"),dtype=torch.long)
print(data.shape,data.type)
print(data)

torch.Size([6]) <built-in method type of Tensor object at 0x000001E77CCC32A0>
tensor([  83, 1609, 5963,  374, 2294,    0])


In [42]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115393


In [43]:
data = torch.tensor(encoding.encode(text),dtype=torch.long)
print(data.shape,data.type)
print(data[:100])
print(data.size())


torch.Size([301829]) <built-in method type of Tensor object at 0x000001E77CC77110>
tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726,    11,  6865,
          757,  6604,   382,  2460,   512, 96945,    11,  6604,   382,  5451,
        47317,   512,  2675,   527,   682, 20250,  4856,   311,  2815,  1109,
          311,  2138,   819,  1980,  2460,   512, 66494,    13, 20250,   382,
         5451, 47317,   512,  5451,    11,   499,  1440,   356,  2192,   355,
         2947,  5979,   355,   374, 10388,  9354,   311,   279,  1274,   382,
         2460,   512,  1687,  1440,   956,    11,   584,  1440,   956,   382,
         5451, 47317,   512, 10267,   603,  5622,  1461,    11,   323,   584,
         3358,   617, 14095,   520,  1057,  1866,  3430,   627,  3957,   956,
          264, 36543,  1980,  2460,   512,  2822,   810,  7556,   389,   956])
torch.Size([301829])


In [44]:
# create a mapping from characters to integers
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [45]:
length = data.size()[0]
n = length*.9
train_data = data[:int(n)]
val_data = data[int(n):]

In [46]:
block_size = 8
train_data[:block_size+1]

tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726,    11])

In [47]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = x[:i+1]
    target = y[i]
    print(f"{context} -> {target}")

tensor([5451]) -> 47317
tensor([ 5451, 47317]) -> 512
tensor([ 5451, 47317,   512]) -> 10438
tensor([ 5451, 47317,   512, 10438]) -> 584
tensor([ 5451, 47317,   512, 10438,   584]) -> 10570
tensor([ 5451, 47317,   512, 10438,   584, 10570]) -> 904
tensor([ 5451, 47317,   512, 10438,   584, 10570,   904]) -> 4726
tensor([ 5451, 47317,   512, 10438,   584, 10570,   904,  4726]) -> 11


In [48]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 4989,   358,  1097,   701,  1695,  1543,   382,  2732],
        [ 4999,    32, 43384, 37482,     0, 32140,   449,  1077],
        [ 1148,   358,  1097,   345,    40,  1053,  6562,   757],
        [ 2460,   369,  1057,   348, 25843,    13,  5112,    11]])
targets:
torch.Size([4, 8])
tensor([[  358,  1097,   701,  1695,  1543,   382,  2732,   512],
        [   32, 43384, 37482,     0, 32140,   449,  1077,    11],
        [  358,  1097,   345,    40,  1053,  6562,   757,  1193],
        [  369,  1057,   348, 25843,    13,  5112,    11,   304]])
----
when input is [4989] the target: 358
when input is [4989, 358] the target: 1097
when input is [4989, 358, 1097] the target: 701
when input is [4989, 358, 1097, 701] the target: 1695
when input is [4989, 358, 1097, 701, 1695] the target: 1543
when input is [4989, 358, 1097, 701, 1695, 1543] the target: 382
when input is [4989, 358, 1097, 701, 1695, 1543, 382] the target: 2732
when input is [4989, 358, 1097, 

In [49]:
print(xb) # our input to the transformer

tensor([[ 4989,   358,  1097,   701,  1695,  1543,   382,  2732],
        [ 4999,    32, 43384, 37482,     0, 32140,   449,  1077],
        [ 1148,   358,  1097,   345,    40,  1053,  6562,   757],
        [ 2460,   369,  1057,   348, 25843,    13,  5112,    11]])


In [50]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


IndexError: index out of range in self

: 