## 1 Downloading The Dataset

In [None]:
#Downloading dataset to train on.

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
with open('../dataset.txt','r',encoding='utf-8') as data:
    text = data.read()
    
print("Total number of characters in the dataset : ",len(text))

Total number of characters in the dataset :  1115394


In [2]:
print('First 100 characters : ', text[:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

### 2.1 Building Vocabulary 

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print('Number of unique characters in the dataset =',vocab_size)
print('\nWhich are following : \n',''.join(chars))

Number of unique characters in the dataset = 65

Which are following : 
 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [4]:
#mapping individual charaters to integers 

encoder = { char:i for i,char in enumerate(chars)}
decoder = { i:char for i,char in enumerate(chars)}

encode = lambda string: [encoder[char] for char in string]
decode = lambda integers: [decoder[i] for i in integers]

In [5]:
zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


### 2.3 Encoding The Dataset 

In [6]:
import tensorflow as tf

In [7]:
data = tf.convert_to_tensor(encode(text), dtype=tf.int64)
print(data.shape, data.dtype)

(1115394,) <dtype: 'int64'>


In [8]:
print(data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [9]:
limit = int(0.9 * len(data))

train_data = data[:limit]
val_data = data[limit:]

In [10]:
print(train_data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [11]:
block_size = 8
train_data[:block_size+1]

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([18, 47, 56, 57, 58,  1, 15, 47, 58], dtype=int64)>

In [12]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for token in range(block_size):
    context = x[:token+1]
    target = y[token]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [18]   target is:  47
for input:  [18, 47]   target is:  56
for input:  [18, 47, 56]   target is:  57
for input:  [18, 47, 56, 57]   target is:  58
for input:  [18, 47, 56, 57, 58]   target is:  1
for input:  [18, 47, 56, 57, 58, 1]   target is:  15
for input:  [18, 47, 56, 57, 58, 1, 15]   target is:  47
for input:  [18, 47, 56, 57, 58, 1, 15, 47]   target is:  58


#### Implementation 

In [13]:
batch_size = 4 #Number of independent input sequences to process in parallel for GPU
block_size = 8 #Maximum context length to make predictions
n_embd = 32

def get_batch(split):
    #generate small batches of input x & target y
    data = train_data if split == 'train' else val_data
    randPos = tf.dtypes.cast(tf.random.uniform((batch_size,), minval=0, maxval=(len(data)-block_size)), dtype=tf.int32)
    #print(randPos) # random positions in the whole datasets to grab block size chunks
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])
    
    return xbatch, ybatch

xbatch, ybatch = get_batch('train')

print('inputs:')
print(xbatch.shape)
print(xbatch)

print('targets:')
print(ybatch.shape)
print(ybatch)

inputs:
(4, 8)
tf.Tensor(
[[ 1 41 56 53 61 52  8  0]
 [ 0  0 13 26 32 21 19 27]
 [57 47 42 43 56  1 44 59]
 [ 1 47 52 41 56 43 39 57]], shape=(4, 8), dtype=int64)
targets:
(4, 8)
tf.Tensor(
[[41 56 53 61 52  8  0 35]
 [ 0 13 26 32 21 19 27 26]
 [47 42 43 56  1 44 59 56]
 [47 52 41 56 43 39 57 43]], shape=(4, 8), dtype=int64)


In [14]:
for row in range(batch_size):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [1]   target is:  41
for input:  [1, 41]   target is:  56
for input:  [1, 41, 56]   target is:  53
for input:  [1, 41, 56, 53]   target is:  61
for input:  [1, 41, 56, 53, 61]   target is:  52
for input:  [1, 41, 56, 53, 61, 52]   target is:  8
for input:  [1, 41, 56, 53, 61, 52, 8]   target is:  0
for input:  [1, 41, 56, 53, 61, 52, 8, 0]   target is:  35
for input:  [0]   target is:  0
for input:  [0, 0]   target is:  13
for input:  [0, 0, 13]   target is:  26
for input:  [0, 0, 13, 26]   target is:  32
for input:  [0, 0, 13, 26, 32]   target is:  21
for input:  [0, 0, 13, 26, 32, 21]   target is:  19
for input:  [0, 0, 13, 26, 32, 21, 19]   target is:  27
for input:  [0, 0, 13, 26, 32, 21, 19, 27]   target is:  26
for input:  [57]   target is:  47
for input:  [57, 47]   target is:  42
for input:  [57, 47, 42]   target is:  43
for input:  [57, 47, 42, 43]   target is:  56
for input:  [57, 47, 42, 43, 56]   target is:  1
for input:  [57, 47, 42, 43, 56, 1]   target is:  44

## 3 Bigram Language Model 

### 3.1 Model Architecture  

In [None]:

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size, n_embd):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embd)
        self.lm_head = tf.keras.layers.Dense(n_embd, vocab_size)

    def call(self, idx, targets=None, training=False):
        B, T = idx.shape
        
        tok_emd = self.token_embedding_table(idx)  # (B, T, C)
        pos_emd = self.position_embedding_table(tf.range(T)) #(T, C)
        x = tok_emd + pos_emd #(B,T,C) # Holds Token identities & position
        logits = self.lm_head(x) # (B, T, vocab_size) say C=n_embd

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B*T, C))
            targets = tf.reshape(targets, (B*T,))
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
            )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # (B, C)
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            idx_next = tf.random.categorical(tf.math.log(probs), 1)  # (B, 1)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


### 3.2 Model Initialization  

In [None]:
model = BigramLanguageModel(vocab_size)


logits, loss = model(xbatch, ybatch)
print(logits.shape)
print(loss)

#### Untrained Model Results 

In [None]:
print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist())))

#Total Garbage

### 3.3 Model Training 

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
batch_size = 32
num_steps = 10000

for step in range(num_steps):
    # Sample a batch of data
    xbatch, ybatch = get_batch('train')  # Assuming you have a function get_batch

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Compute gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print('Loss = 'loss.numpy())


In [None]:
#Somewhat stuctured results

print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=500).numpy()[0].tolist())))

# Adding Transformer  

## 4 Maths for self-attention mechanism

In [None]:
#We want a method to preserve context for each word and it's former words in the time block

In [20]:
# example of words and their context

for row in range(batch_size-3):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [1]   target is:  41
for input:  [1, 41]   target is:  56
for input:  [1, 41, 56]   target is:  53
for input:  [1, 41, 56, 53]   target is:  61
for input:  [1, 41, 56, 53, 61]   target is:  52
for input:  [1, 41, 56, 53, 61, 52]   target is:  8
for input:  [1, 41, 56, 53, 61, 52, 8]   target is:  0
for input:  [1, 41, 56, 53, 61, 52, 8, 0]   target is:  35


In [34]:
xbatch.shape

TensorShape([4, 8])

### 4.1 Aggregating for context (simplest method) 

#### Version 1 

In [43]:
B, T, C = 4, 8, 2  # batch, time, channels
x = tf.random.normal((B, T, C), dtype=tf.float64)
print(x.shape)


(4, 8, 2)


In [44]:
xbow = tf.zeros((B, T, C), dtype=tf.float64)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1, :]  # (t, C)
        mean_value = tf.reduce_mean(xprev, axis=0)
        xbow = tf.tensor_scatter_nd_add(xbow, indices=[[b, t]], updates=[mean_value])


In [45]:
x[0]
#original data

<tf.Tensor: shape=(8, 2), dtype=float64, numpy=
array([[-1.14924009, -1.36499935],
       [-0.80522729,  0.25705269],
       [ 0.46568142, -0.49686646],
       [-0.94495844,  0.50613577],
       [ 0.36614164, -0.81278141],
       [ 0.41970264, -0.05147407],
       [-1.13084527, -0.83186565],
       [ 1.37927137,  1.15312283]])>

In [47]:
#aggregated data
xbow
#each upcoming row is contains the aggregate of all previous 

<tf.Tensor: shape=(4, 8, 2), dtype=float64, numpy=
array([[[-1.14924009, -1.36499935],
        [-0.97723369, -0.55397333],
        [-0.49626199, -0.53493771],
        [-0.6084361 , -0.27466934],
        [-0.41352055, -0.38229175],
        [-0.27465002, -0.32715547],
        [-0.39696363, -0.39925693],
        [-0.17493425, -0.20520946]],

       [[-0.11562781,  0.63718183],
        [ 0.35645997,  0.8532918 ],
        [-0.11393791,  0.167791  ],
        [-0.11917604,  0.11351969],
        [-0.41449325, -0.02157758],
        [-0.72929212,  0.08856227],
        [-0.7496213 , -0.01448659],
        [-0.6781139 , -0.12349563]],

       [[-2.18980136,  1.69369341],
        [-0.87205705,  0.59489347],
        [-0.61073565,  1.12653932],
        [-0.76403204,  0.94550927],
        [-0.86164589,  0.91549957],
        [-0.9433301 ,  0.50503605],
        [-0.68396538,  0.50864559],
        [-0.7222557 ,  0.15176859]],

       [[-0.05210129,  0.40729748],
        [ 0.26322566,  0.12043353],
       

### 4.2 Efficient aggregating for context (simplest method) 

#### Concept (Same achived through matrix multiplication trick)

In [None]:
import numpy as np

# Create a lower triangular matrix
a = tf.linalg.band_part(tf.ones((3, 3), dtype=tf.float64), -1, 0)
a = a / tf.reduce_sum(a, axis=1, keepdims=True)

# Create a random matrix with integer values between 0 and 10
b = tf.constant(np.random.randint(0, 10, size=(3, 2)).astype(np.float64))

# Perform matrix multiplication
c = tf.matmul(a, b)

# Print the results
print('a=')
print(a.numpy())
print('--')
print('b=')
print(b.numpy())
print('--')
print('c=')
print(c.numpy())

#### Version 2 (Reproducing xbow via matrix mulitplication instead of loop multiplication)

In [None]:
weights = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0) 
weights = weights / tf.reduce_sum(weights, axis=1, keepdims=True)

# Round each value to 3 decimal points for better visibility
weights_numpy_rounded = np.round(weights.numpy().tolist(), decimals=3)
weights_numpy_rounded

In [None]:
xbow2 = tf.matmul(weights, x) # Here (B,T,T) x (B,T,C) = (B,T,C).   where B in (B,T,T) was automatically created by tf

In [None]:
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
print('xbow2 = ',xbow2[0].numpy())

#### Version 3 (Using softmax) 

In [None]:
# Create lower triangular matrix and initialize wei with zeros
tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
weights = tf.zeros((T, T), dtype=tf.float64)

# Mask the upper triangular part with -inf
weights = tf.where(tf.equal(tril, 0), float('-inf'), weights)

# Apply softmax along the last dimension
weights = tf.nn.softmax(weights, axis=-1)

# Perform matrix multiplication
xbow3 = tf.matmul(weights, x)

In [None]:
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
print('xbow2 = ',xbow3[0].numpy())

### 4.3 Self-attention mechanism context (better method)  

#### Version 4 (Reproducing xbow via matrix mulitplication instead of loop multiplication)

In [None]:
# Create lower triangular matrix and initialize wei with zeros
tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
weights = tf.zeros((T, T), dtype=tf.float64)

# Mask the upper triangular part with -inf
weights = tf.where(tf.equal(tril, 0), float('-inf'), weights)

# Apply softmax along the last dimension
weights = tf.nn.softmax(weights, axis=-1)

# Perform matrix multiplication
xbow3 = tf.matmul(weights, x)

In [None]:
xbow3[0]

In [70]:
tf.random.set_seed(1337)

B, T, C = 4, 8, 32  # batch, time, channels
x = tf.random.normal((B, T, C))
x.shape

TensorShape([4, 8, 32])

In [74]:

# Let's see a single Head perform self-attention
head_size = 16

key = tf.keras.layers.Dense(head_size, use_bias=False)
query = tf.keras.layers.Dense(head_size, use_bias=False)
value = tf.keras.layers.Dense(head_size, use_bias=False)

print('x.shape : ', x.shape)
k = key(x)   # (B, T, 32) => (B, T, 16)
q = query(x) # (B, T, 32) => (B, T, 32)
print('k.shape : ', k.shape)
print('q.shape : ', q.shape)

wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1]))  # (B, T, T)

print('wei.shape : ',wei.shape)

tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
wei = tf.where(tf.equal(tril, 0), float('-inf'), wei)

# Apply softmax along the last dimension
wei = tf.nn.softmax(wei, axis=-1)

v = value(x)
out = tf.matmul(wei, v)

print(out.shape)

x.shape :  (4, 8, 32)
k.shape :  (4, 8, 16)
q.shape :  (4, 8, 16)
wei.shape :  (4, 8, 8)
(4, 8, 16)


In [75]:
wei[0].numpy()

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [9.8463178e-01, 1.5368149e-02, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [4.5601367e-03, 2.5054205e-02, 9.7038573e-01, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [9.7832882e-01, 2.1538427e-02, 2.3295534e-06, 1.3044475e-04,
        0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [1.9321831e-09, 9.9971753e-01, 2.8247354e-04, 2.3293320e-10,
        2.6625002e-10, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       [1.6792955e-06, 1.8783492e-05, 3.6958933e-02, 2.0531880e-02,
        9.3633085e-01, 6.1577610e-03, 0.0000000e+00, 0.0000000e+00],
       [3.9879506e-04, 3.6720667e-06, 4.4227920e-05, 9.7426558e-01,
        1.3631139e-02, 1.1653907e-02, 2.6310336e-06, 0.0000000e+00],
       [9.0681368e-01, 3.0532675e-03, 1.0

# Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [90]:
k = tf.random.normal((B, T, head_size))
q = tf.random.normal((B, T, head_size))

wei = q @ tf.transpose(k, perm=[0, 2, 1]) * head_size**-0.5 #scaling after transpose, check below to see whyf

In [91]:
tf.math.reduce_variance(k).numpy()

0.9746931

In [92]:
tf.math.reduce_variance(q).numpy()

1.0030558

In [93]:
tf.math.reduce_variance(wei).numpy()

1.0009656

In [98]:
tf.nn.softmax(tf.constant([0.1, -0.2, 0.3, -0.2, 0.5]), axis=-1).numpy()

array([0.1924978 , 0.14260589, 0.23511736, 0.14260589, 0.287173  ],
      dtype=float32)

In [99]:
#without scaling the softmax favours larger values big time as visible
tf.nn.softmax(tf.constant([0.1, -0.2, 0.3, -0.2, 0.5])*8, axis=-1).numpy() 

array([0.03260834, 0.00295816, 0.1615102 , 0.00295816, 0.79996514],
      dtype=float32)