In [2]:
import tiktoken
import torch

In [3]:
# Read the whole story into memory. The encoding is pinned explicitly so that
# decoding does not depend on the platform's default locale encoding.
with open( "datashort_story2.txt", "r", encoding="utf-8" ) as f:
    raw_text = f.read()

# Peek at the first 50 characters to confirm the file loaded as expected.
raw_text[:50]

"'My Lord Chancellor,\n\n'When I consider the Affair "

In [4]:
# Load tiktoken's "gpt2" encoding; the cells below reuse it for both encoding and decoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Encode the full text into a list of integer token IDs.
enc_text = tokenizer.encode(raw_text)

In [6]:
# Peek at the first 20 token IDs.
print(enc_text[:20])

[6, 3666, 4453, 19477, 11, 198, 198, 6, 2215, 314, 2074, 262, 6708, 958, 286, 281, 4479, 731, 86, 6346]


In [7]:
# Decode the first two token IDs back to text as a quick round-trip check.
print( tokenizer.decode( enc_text[:2]))

'My


In [8]:
# Total number of token IDs produced for the whole text.
len( enc_text)

7407

In [9]:
# Illustrate next-token prediction: for a growing prefix of the text, the
# "target" is simply the token that immediately follows that prefix.
for split in range(1, 10):
    context_ids, next_id = enc_text[:split], enc_text[split]
    print("Input:", tokenizer.decode(context_ids), "Target:", tokenizer.decode([next_id]))

Input: ' Target: My
Input: 'My Target:  Lord
Input: 'My Lord Target:  Chancellor
Input: 'My Lord Chancellor Target: ,
Input: 'My Lord Chancellor, Target: 

Input: 'My Lord Chancellor,
 Target: 

Input: 'My Lord Chancellor,

 Target: '
Input: 'My Lord Chancellor,

' Target: When
Input: 'My Lord Chancellor,

'When Target:  I


In [10]:

from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once, then cut into windows of ``max_length`` tokens
    whose start positions advance by ``stride``. For each window the target is
    the same window shifted one token to the right, so position ``k`` of the
    target holds the token that follows position ``k`` of the input.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-build every input/target window.

        Args:
            txt: Text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of integer token IDs.
            max_length: Number of tokens in every input and target window.
            stride: Step between the start positions of consecutive windows.

        Raises:
            AssertionError: If the text does not tokenize to more than
                ``max_length`` tokens (at least one full window plus the
                shifted target is required).
        """
        # Encode the full text in one pass; the end-of-text marker is allowed as a special token.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Start offsets stop early enough that the shifted target window is always full length.
        window_starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(ids[start:start + max_length]) for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1]) for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows that were built."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [12]:
# Build an unshuffled loader: 8 windows per batch, each 4 tokens long.
# With stride equal to max_length, consecutive input windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Fetch only the first batch and show both halves of the (inputs, targets) pair.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[    6,  3666,  4453, 19477],
        [   11,   198,   198,     6],
        [ 2215,   314,  2074,   262],
        [ 6708,   958,   286,   281],
        [ 4479,   731,    86,  6346],
        [  262,   734,  7973,    11],
        [  355,   340,   318,  6241],
        [  287,   262,  1811, 22698]])

Targets:
 tensor([[ 3666,  4453, 19477,    11],
        [  198,   198,     6,  2215],
        [  314,  2074,   262,  6708],
        [  958,   286,   281,  4479],
        [  731,    86,  6346,   262],
        [  734,  7973,    11,   355],
        [  340,   318,  6241,   287],
        [  262,  1811, 22698, 15370]])


In [13]:
# Decode each row of the `inputs` batch back to text. The rows are tensors, so each one
# is converted to a plain Python list with `.tolist()` before it is handed to the tokenizer's decoder:
for row in inputs:
    print( tokenizer.decode( row.tolist() ) )

'My Lord Chancellor
,

'
When I consider the
 Affair of an
 Union betwixt
 the two Nations,
 as it is expressed
 in the several Articles


In [14]:

# we don't send these IDs to the LLM for training; we associate a vector (a.k.a. tensor) with each ID and then train the LLM on the vectors
# as a first example, let's create embedding vectors of length 3 for each token in a vocabulary of 6 tokens
torch.manual_seed(123)  # the embedding weights are initialised randomly; seed so a re-run reproduces the same matrix
vocab_size = 6
output_dim = 3
embedding = torch.nn.Embedding( vocab_size, output_dim )
print(embedding.weight)

Parameter containing:
tensor([[-0.4449, -0.3830, -0.6007],
        [ 0.0633,  0.3001,  0.8977],
        [-0.3109,  1.0373, -0.2158],
        [-0.2947, -0.1635, -0.4387],
        [-0.0367, -1.2870, -0.8663],
        [-0.3627, -0.9396,  0.0416]], requires_grad=True)


In [15]:
# if you just want the tensor part of this without the requires_grad=True bit
# method 1: the `.data` attribute
embedding.weight.data

tensor([[-0.4449, -0.3830, -0.6007],
        [ 0.0633,  0.3001,  0.8977],
        [-0.3109,  1.0373, -0.2158],
        [-0.2947, -0.1635, -0.4387],
        [-0.0367, -1.2870, -0.8663],
        [-0.3627, -0.9396,  0.0416]])

In [16]:
# if you just want the tensor part of this without the requires_grad=True bit
# method 2: the `.detach()` method
embedding.weight.detach()

tensor([[-0.4449, -0.3830, -0.6007],
        [ 0.0633,  0.3001,  0.8977],
        [-0.3109,  1.0373, -0.2158],
        [-0.2947, -0.1635, -0.4387],
        [-0.0367, -1.2870, -0.8663],
        [-0.3627, -0.9396,  0.0416]])

In [17]:
# Bind the detached weight matrix to A for the indexing examples below:
A = embedding.weight.detach()

In [18]:
# first row (index 0):
A[0]

tensor([-0.4449, -0.3830, -0.6007])

In [19]:
# second row (index 1):
A[1]

tensor([0.0633, 0.3001, 0.8977])

In [20]:
# first column (every row, column index 0):
A[:,0]

tensor([-0.4449,  0.0633, -0.3109, -0.2947, -0.0367, -0.3627])

In [21]:
# element in row 2, column 3 (counting from 1), i.e. zero-based index [1, 2]:
A[1,2]

tensor(0.8977)

In [22]:

# to create a tensor directly from a Python list (two 2-element vectors for the dot-product example):
x = torch.tensor([1.2,2.1])
y = torch.tensor([2.7,1.5])
print(x) 
print(y)

tensor([1.2000, 2.1000])
tensor([2.7000, 1.5000])


In [23]:

# Dot product of x and y.
torch.dot( x,y)

tensor(6.3900)

In [24]:
# check by hand: the same dot product written out with plain Python floats
1.2*2.7 + 2.1*1.5

6.390000000000001

In [25]:
# Second example: a 4-token vocabulary with 8-dimensional embedding vectors.
torch.manual_seed(123)  # the embedding weights are initialised randomly; seed so a re-run reproduces the same matrix
vocab_size = 4
output_dim = 8
inputs = torch.nn.Embedding( vocab_size, output_dim )
print( inputs.weight )

Parameter containing:
tensor([[ 1.0210,  0.7720,  0.3229,  0.5406,  1.8349, -0.1785, -0.4625, -0.3050],
        [-0.6549, -0.0324, -0.6208,  0.3278,  0.6982, -0.6581,  0.0054, -1.3516],
        [-1.1034,  0.4322,  1.5395, -1.0670, -0.1930,  0.4849,  0.9935, -0.1667],
        [ 0.5642,  1.3899, -0.2606, -1.9455,  1.2959,  1.2877,  1.3191, -1.1651]],
       requires_grad=True)


Embedding transforms each token ID into a continuous vector representation, so
in this example, with a vocabulary size of 4 and an output dimension of 8, each of the 4 tokens is represented as an 8-dimensional vector.
This is necessary because neural networks can't learn meaningful patterns from integer IDs alone. Embeddings allow the model to work in a continuous space where distances and directions represent relationships, which enables it to generalize and learn contextual meaning.

In [27]:
# Replace the Embedding module bound to `inputs` with its raw weight matrix.
# The guard keeps this cell re-runnable: after the first run `inputs` is already
# a plain tensor, and `inputs.weight` would raise AttributeError.
if isinstance(inputs, torch.nn.Embedding):
    inputs = inputs.weight.data
inputs

tensor([[ 1.0210,  0.7720,  0.3229,  0.5406,  1.8349, -0.1785, -0.4625, -0.3050],
        [-0.6549, -0.0324, -0.6208,  0.3278,  0.6982, -0.6581,  0.0054, -1.3516],
        [-1.1034,  0.4322,  1.5395, -1.0670, -0.1930,  0.4849,  0.9935, -0.1667],
        [ 0.5642,  1.3899, -0.2606, -1.9455,  1.2959,  1.2877,  1.3191, -1.1651]])

In [28]:
# Shape check: (number of rows, number of columns) of the weight matrix.
inputs.shape

torch.Size([4, 8])

In [29]:
# Print each row as a plain Python list, which shows more digits than the tensor display above.
for row in inputs:
    print(row.tolist())

[1.0210223197937012, 0.7719696760177612, 0.32292377948760986, 0.5405638813972473, 1.8348582983016968, -0.1784828007221222, -0.4625184237957001, -0.30498939752578735]
[-0.6548746228218079, -0.03241903707385063, -0.6208429932594299, 0.3278284966945648, 0.6982240080833435, -0.6581456065177917, 0.005382075440138578, -1.3515597581863403]
[-1.1034115552902222, 0.43220487236976624, 1.5394843816757202, -1.0669578313827515, -0.19296561181545258, 0.4849236309528351, 0.9934572577476501, -0.16671492159366608]
[0.5642220377922058, 1.3898890018463135, -0.2606104016304016, -1.9454749822616577, 1.2959389686584473, 1.2876970767974854, 1.31905198097229, -1.1650508642196655]


In [30]:
# Two small vectors for a second dot-product check. Built with the `torch.tensor`
# factory, as elsewhere in this notebook, rather than the legacy `torch.Tensor` constructor.
x = torch.tensor([1.1, 2.3])
y = torch.tensor([3.4,-2.1])

In [31]:

# Dot product of x and y written out by hand, for comparison with torch.dot below.
1.1*3.4 + 2.3*(-2.1)

-1.0899999999999999

In [32]:
# The same dot product computed by torch.
torch.dot( x, y)

tensor(-1.0900)

In [33]:
# Use the third row of `inputs` (index 2) as the query vector.
query = inputs[2]
print(query)

tensor([-1.1034,  0.4322,  1.5395, -1.0670, -0.1930,  0.4849,  0.9935, -0.1667])


In [34]:
# Score the query against every row of `inputs` with a dot product.
for token_vec in inputs:
    print( torch.dot( query, token_vec ) )

tensor(-1.7218)
tensor(-0.8202)
tensor(6.1999)
tensor(3.5317)


In [36]:
# Collect the query's dot-product score against each row of `inputs` into one vector.
attention_scores_2 = torch.zeros( len(inputs) )
for position, token_vec in enumerate( inputs ):
    attention_scores_2[position] = torch.dot( query, token_vec )
print( attention_scores_2)

tensor([-1.7218, -0.8202,  6.1999,  3.5317])


In [37]:
# Normalise the scores with a softmax over dim 0 to obtain the attention weights.
attention_weights_2 = torch.softmax( attention_scores_2, dim = 0 )
attention_weights_2

tensor([3.3885e-04, 8.3481e-04, 9.3402e-01, 6.4802e-02])

In [38]:

# Check: the softmax weights add up to 1.
attention_weights_2.sum()

tensor(1.0000)

In [39]:
# Context vector for the query: the sum of the rows of `inputs`, each scaled by its attention weight.
context_vector_2 = torch.zeros( query.shape )
for weight, token_vec in zip( attention_weights_2, inputs ):
    context_vector_2 += weight * token_vec
context_vector_2

tensor([-0.9943,  0.4940,  1.4206, -1.1222, -0.0951,  0.5358,  1.0132, -0.2324])

In [40]:
# Show `inputs` again for comparison with its transpose in the next cell.
inputs

tensor([[ 1.0210,  0.7720,  0.3229,  0.5406,  1.8349, -0.1785, -0.4625, -0.3050],
        [-0.6549, -0.0324, -0.6208,  0.3278,  0.6982, -0.6581,  0.0054, -1.3516],
        [-1.1034,  0.4322,  1.5395, -1.0670, -0.1930,  0.4849,  0.9935, -0.1667],
        [ 0.5642,  1.3899, -0.2606, -1.9455,  1.2959,  1.2877,  1.3191, -1.1651]])

In [41]:
# Transpose: the rows of `inputs` become columns.
inputs.T

tensor([[ 1.0210, -0.6549, -1.1034,  0.5642],
        [ 0.7720, -0.0324,  0.4322,  1.3899],
        [ 0.3229, -0.6208,  1.5395, -0.2606],
        [ 0.5406,  0.3278, -1.0670, -1.9455],
        [ 1.8349,  0.6982, -0.1930,  1.2959],
        [-0.1785, -0.6581,  0.4849,  1.2877],
        [-0.4625,  0.0054,  0.9935,  1.3191],
        [-0.3050, -1.3516, -0.1667, -1.1651]])

In [42]:
# get all of the attention scores via a matrix multiplication:
# entry [i, j] is the dot product of row i and row j of `inputs`
attention_scores = inputs @ inputs.T
attention_scores

tensor([[ 5.7404,  1.0914, -1.7218,  2.4065],
        [ 1.0914,  3.6702, -0.8202,  0.7486],
        [-1.7218, -0.8202,  6.1999,  3.5317],
        [ 2.4065,  0.7486,  3.5317, 12.5378]])

In [43]:
# Softmax over the last dimension, so each row of scores is normalised independently.
attention_weights = torch.softmax( attention_scores, dim = -1 )
attention_weights

tensor([[9.5621e-01, 9.1518e-03, 5.4920e-04, 3.4092e-02],
        [6.6491e-02, 8.7649e-01, 9.8305e-03, 4.7193e-02],
        [3.3885e-04, 8.3481e-04, 9.3402e-01, 6.4802e-02],
        [3.9807e-05, 7.5845e-06, 1.2264e-04, 9.9983e-01]])

In [44]:
# Check: the weights in the first row add up to 1.
attention_weights[0].sum()

tensor(1.0000)

In [45]:
# All context vectors at once: row i is the sum of the rows of `inputs` weighted by attention_weights[i].
# Row 2 matches context_vector_2 computed with the explicit loop earlier.
context_vectors = attention_weights @ inputs
context_vectors

tensor([[ 0.9889,  0.7855,  0.2951,  0.4530,  1.8050, -0.1325, -0.3967, -0.3438],
        [-0.4903,  0.0928, -0.5199,  0.2210,  0.7932, -0.5232,  0.0460, -1.2615],
        [-0.9943,  0.4940,  1.4206, -1.1222, -0.0951,  0.5358,  1.0132, -0.2324],
        [ 0.5640,  1.3897, -0.2604, -1.9453,  1.2958,  1.2875,  1.3189, -1.1649]])