## 1 Downloading The Dataset

In [1]:
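# Download the dataset to train on. Uncomment the command below to fetch it; the next cell reads the file from '../dataset.txt', so save or move it there.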

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# Read the entire corpus into one in-memory string.
with open('../dataset.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

corpus_length = len(text)
print("Total number of characters in the dataset : ", corpus_length)

Total number of characters in the dataset :  1115394


In [3]:
# Peek at the start of the corpus to sanity-check that the file loaded as readable text.
print('First 100 characters : ', text[:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

### 2.1 Building Vocabulary 

In [4]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print('Number of unique characters in the dataset =', vocab_size)
vocab_preview = ''.join(chars)
print('\nWhich are following : \n', vocab_preview)

Number of unique characters in the dataset = 65

Which are following : 
 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [5]:
# Character-level tokenizer: every character maps to its index in `chars`.

encoder = dict(zip(chars, range(len(chars))))
decoder = dict(enumerate(chars))


def encode(string):
    """Return the list of integer ids for the characters of `string`."""
    return [encoder[char] for char in string]


def decode(integers):
    """Return the list of characters for the ids in `integers` (join them to get a string)."""
    return [decoder[i] for i in integers]

In [6]:
# Round-trip check: encoding a string and decoding the ids should give the string back.
zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
# decode() returns a list of characters, so join them to rebuild the string.
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


### 2.3 Encoding The Dataset 

In [7]:
import tensorflow as tf

In [8]:
# Encode the whole corpus as one 1-D int64 tensor of token ids (one id per character).
data = tf.convert_to_tensor(encode(text), dtype=tf.int64)
print(data.shape, data.dtype)

(1115394,) <dtype: 'int64'>


In [9]:
# First 100 token ids: the encoded form of the first 100 characters of the corpus.
print(data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [10]:
# Keep the first 90% of the token stream for training; the remaining tail is for validation.
limit = int(0.9 * len(data))
train_data, val_data = data[:limit], data[limit:]

In [11]:
# The training split starts at the beginning of the corpus, so this matches data[:100].
print(train_data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [12]:
# A block of `block_size` inputs needs block_size + 1 tokens, because the target
# for each input position is the token that follows it.
block_size = 8
train_data[:block_size+1]

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([18, 47, 56, 57, 58,  1, 15, 47, 58], dtype=int64)>

In [13]:
# Inputs are the first `block_size` tokens; targets are the same window shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x is a training context and the matching entry of y is its next token.
for token, target in enumerate(y):
    context = x[:token+1]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [18]   target is:  47
for input:  [18, 47]   target is:  56
for input:  [18, 47, 56]   target is:  57
for input:  [18, 47, 56, 57]   target is:  58
for input:  [18, 47, 56, 57, 58]   target is:  1
for input:  [18, 47, 56, 57, 58, 1]   target is:  15
for input:  [18, 47, 56, 57, 58, 1, 15]   target is:  47
for input:  [18, 47, 56, 57, 58, 1, 15, 47]   target is:  58


#### Implementation 

In [14]:
batch_size = 4 # Number of independent input sequences to process in parallel for GPU
block_size = 8 # Maximum context length to make predictions
n_embd = 32    # Width of each embedding vector

def get_batch(split):
    """Sample a random batch of (input, target) blocks.

    Args:
        split: 'train' draws from `train_data`; any other value draws from `val_data`.

    Returns:
        (xbatch, ybatch), each of shape (batch_size, block_size). `ybatch` holds
        the same windows as `xbatch`, shifted one token to the right.
    """
    data = train_data if split == 'train' else val_data
    # Draw integer start offsets directly. Sampling floats and casting can round up
    # to `maxval` itself (leaving the target slice one token short) and cannot
    # represent large offsets exactly. `maxval` is exclusive, so the largest start
    # still leaves block_size + 1 tokens available.
    randPos = tf.random.uniform(
        (batch_size,), minval=0, maxval=len(data) - block_size, dtype=tf.int32
    )
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])

    return xbatch, ybatch

xbatch, ybatch = get_batch('train')

print('inputs:')
print(xbatch.shape)
print(xbatch)

print('targets:')
print(ybatch.shape)
print(ybatch)

inputs:
(4, 8)
tf.Tensor(
[[ 1 21  1 39 51  1 45 39]
 [14 59 58  1 39 57  1 58]
 [ 0 32 46 53 59  1 42 43]
 [42  1 54 56 53 57 54 43]], shape=(4, 8), dtype=int64)
targets:
(4, 8)
tf.Tensor(
[[21  1 39 51  1 45 39 50]
 [59 58  1 39 57  1 58 46]
 [32 46 53 59  1 42 43 57]
 [ 1 54 56 53 57 54 43 56]], shape=(4, 8), dtype=int64)


In [15]:
# Spell out every (context, target) training pair contained in the sampled batch.
for row in range(batch_size):
    inputs_row, targets_row = xbatch[row], ybatch[row]
    for token in range(block_size):
        context = inputs_row[:token+1]
        target = targets_row[token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [1]   target is:  21
for input:  [1, 21]   target is:  1
for input:  [1, 21, 1]   target is:  39
for input:  [1, 21, 1, 39]   target is:  51
for input:  [1, 21, 1, 39, 51]   target is:  1
for input:  [1, 21, 1, 39, 51, 1]   target is:  45
for input:  [1, 21, 1, 39, 51, 1, 45]   target is:  39
for input:  [1, 21, 1, 39, 51, 1, 45, 39]   target is:  50
for input:  [14]   target is:  59
for input:  [14, 59]   target is:  58
for input:  [14, 59, 58]   target is:  1
for input:  [14, 59, 58, 1]   target is:  39
for input:  [14, 59, 58, 1, 39]   target is:  57
for input:  [14, 59, 58, 1, 39, 57]   target is:  1
for input:  [14, 59, 58, 1, 39, 57, 1]   target is:  58
for input:  [14, 59, 58, 1, 39, 57, 1, 58]   target is:  46
for input:  [0]   target is:  32
for input:  [0, 32]   target is:  46
for input:  [0, 32, 46]   target is:  53
for input:  [0, 32, 46, 53]   target is:  59
for input:  [0, 32, 46, 53, 59]   target is:  1
for input:  [0, 32, 46, 53, 59, 1]   target is:  42
for 

## 3 Bigram Language Model 

### 3.1 Model Architecture  

In [16]:

class BigramLanguageModel(tf.keras.Model):
    """Character-level language model: token + position embeddings and a linear head.

    The logits at each position depend only on that position's token and index;
    nothing is mixed across time steps.
    """

    def __init__(self, vocab_size, n_embd=32):
        """Build the embedding tables and the output projection.

        Args:
            vocab_size: number of distinct tokens; also the size of the logits' last axis.
            n_embd: width of the token and position embeddings. Defaults to 32 so
                the model can also be constructed from `vocab_size` alone.
        """
        super().__init__()
        # The position table has one row per position, so the notebook-level
        # `block_size` is the longest sequence `call` can embed.
        self.context_length = block_size
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = tf.keras.layers.Embedding(self.context_length, n_embd)
        # Dense infers its input width (n_embd) on first use; only the output size is given.
        self.lm_head = tf.keras.layers.Dense(vocab_size)

    def call(self, idx, targets=None, training=False):
        """Compute logits and, when targets are given, the mean cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T), with T <= block_size.
            targets: optional integer token ids of shape (B, T).
            training: accepted for Keras compatibility; not used.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape

        tok_emd = self.token_embedding_table(idx)  # (B, T, n_embd)
        pos_emd = self.position_embedding_table(tf.range(T))  # (T, n_embd)
        x = tok_emd + pos_emd  # (B, T, n_embd): token identity plus position
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # Flatten batch and time so each position is one classification example.
        C = logits.shape[-1]
        logits = tf.reshape(logits, (-1, C))
        targets = tf.reshape(targets, (-1,))
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        )
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens) holding the context followed
            by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Feed only the last `context_length` tokens: the position table has no
            # entries beyond that, so a longer context would index out of range.
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # (B, vocab_size): prediction for the next token
            # tf.random.categorical takes unnormalised log-probabilities, so the
            # logits can be passed directly (no softmax followed by log).
            idx_next = tf.random.categorical(logits, num_samples=1)  # (B, 1)
            idx_next = tf.cast(idx_next, idx.dtype)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


### 3.2 Model Initialization  

In [17]:
# The constructor needs the embedding width as well as the vocabulary size.
model = BigramLanguageModel(vocab_size, n_embd)

# One forward pass on the sample batch to check the output shape and the initial loss.
logits, loss = model(xbatch, ybatch)
print(logits.shape)
print(loss)

(32, 65)
tf.Tensor(4.1730795, shape=(), dtype=float32)


#### Untrained Model Results 

In [18]:
# Start from a single token with id 0 and sample 100 more from the untrained model.
start_idx = tf.zeros((1, 1), dtype=tf.int64)
generated = model.generate(idx=start_idx, max_new_tokens=100)
print(''.join(decode(generated.numpy()[0].tolist())))

# Total garbage, as expected before any training


vEz
O.ZpziNV3!;QfmrpUUh&RSP.A
U,jdMjQXxXAZiQctAIxtUibxqgjs,kVjS3:?lKK,cDmMU ;rNYVS
!-&wVd-tL;'YBN-:D


### 3.3 Model Training 

In [19]:
# Adam optimizer with a fixed learning rate of 1e-3, used by the training loop below.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [22]:
batch_size = 32  # get_batch reads this global, so training batches hold 32 sequences
num_steps = 10000

for step in range(num_steps):
    # Sample a fresh random training batch
    xbatch, ybatch = get_batch('train')

    # Run the forward pass under the tape so the loss can be differentiated
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Compute gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Loss of the final training step only (a single batch, not an average)
print('Loss = ', loss.numpy())


2.5484118


In [20]:
# Somewhat structured results

# Start from a single token with id 0 and sample 500 more from the trained model.
start_idx = tf.zeros((1, 1), dtype=tf.int64)
generated = model.generate(idx=start_idx, max_new_tokens=500)
print(''.join(decode(generated.numpy()[0].tolist())))


GRLiKWlrD.oVShwTA?lWEGrNUYtmccK;GQ:!aQ,e$gL3 K
zz MGh!K;?,cFg?QdGoO:jnDswzqbuHcv?GClPAimtN$fBEz3jlr!d.qfzavAaaG$OHSdykcIJZCyPlOZ.$'kDFReW
WVQVG!apWROy;
BVSJ
K.fB!KKMdOZgvroDgtATz'soNXSGQXScLayV$vQ:Zqp,FQ,uI&KP?CAsdHhFfHzRzFTiX&tEcfOYgLOEH,.pl w'TDbei,FlYNrFwIOlHjscx3QWSr.GL
vFGltYBkFhiYAH.GBr'.T.ULFdAphl'yFAtm',TtdRLKAiCCtWQY?zOVmaMEyM&jRP!LxDtizrMoFk,TSb!yvUMDuKU.a'sDT?3vAHOxE!G
IFB&WvC SMJTfAQ.stMFlgrnAX:MUCTtggWAifhF:D;LYzRFP
yFXO,iT$Cc.VgBHjHC?;iv wIZkDi w$NgfkHpeQpactdN::vNF:-y$eajBjIMcUgZV


# Adding Transformer  

## 4 Maths for self-attention mechanism

In [81]:
# We want a method that preserves, for each token, the context of the tokens that precede it in the time block

In [88]:
# Example of tokens and their context, shown for the first sequence of the batch only.
# The row is fixed explicitly rather than derived from `batch_size`, because the
# training cell changes `batch_size` and the row count would change with it.

row = 0
for token in range(block_size):
    context = xbatch[row, :token+1]
    target = ybatch[row, token]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [1]   target is:  21
for input:  [1, 21]   target is:  1
for input:  [1, 21, 1]   target is:  39
for input:  [1, 21, 1, 39]   target is:  51
for input:  [1, 21, 1, 39, 51]   target is:  1
for input:  [1, 21, 1, 39, 51, 1]   target is:  45
for input:  [1, 21, 1, 39, 51, 1, 45]   target is:  39
for input:  [1, 21, 1, 39, 51, 1, 45, 39]   target is:  50


### 4.1 Aggregating for context (simplest method) 

#### Version 1 

In [132]:
# Toy random input for illustrating context aggregation.
# Note: this rebinds `x`, which the earlier chunking example used for token ids.
B, T, C = 4, 8, 2  # batch, time, channels
x = tf.random.normal((B, T, C), dtype=tf.float64)
print(x.shape)


(4, 8, 2)


In [133]:
# Bag-of-words context: xbow[b, t] is the mean of x[b, 0..t] (current and earlier steps).
batch_rows = []
for b in range(B):
    prefix_means = []
    for t in range(T):
        xprev = x[b, :t+1, :]  # (t+1, C)
        prefix_means.append(tf.reduce_mean(xprev, axis=0))  # (C,)
    batch_rows.append(tf.stack(prefix_means))  # (T, C)
xbow = tf.stack(batch_rows)  # (B, T, C)


In [134]:
# Original data: the first batch element of x, shape (T, C)
x[0]

<tf.Tensor: shape=(8, 2), dtype=float64, numpy=
array([[-1.32613346, -0.09177541],
       [-0.88821307, -0.95390373],
       [-0.53244129,  2.39326375],
       [-0.59578473, -1.95157164],
       [-0.22948155, -0.74213509],
       [ 0.23734349, -1.18848048],
       [ 0.12768125, -0.54226973],
       [ 0.09396947,  0.55996733]])>

In [135]:
# Aggregated data: row t is the mean of rows 0..t of the original x[0]
xbow[0]

<tf.Tensor: shape=(8, 2), dtype=float64, numpy=
array([[-1.32613346, -0.09177541],
       [-1.10717326, -0.52283957],
       [-0.91559594,  0.44919487],
       [-0.83564314, -0.15099676],
       [-0.71441082, -0.26922442],
       [-0.5557851 , -0.42243377],
       [-0.45814705, -0.43955319],
       [-0.38913249, -0.31461312]])>

### 4.2 Efficient aggregating for context (simplest method) 

#### Concept (Same achieved through the matrix multiplication trick)

In [136]:
import numpy as np

# Lower-triangular matrix of ones with each row normalised to sum to 1,
# so row i averages rows 0..i of whatever it multiplies.
a = tf.linalg.band_part(tf.ones((3, 3), dtype=tf.float64), -1, 0)
a = a / tf.reduce_sum(a, axis=1, keepdims=True)

# Random matrix with integer values from 0 to 9 (randint's upper bound is exclusive)
b = tf.constant(np.random.randint(0, 10, size=(3, 2)).astype(np.float64))

# Matrix multiplication: row i of c is the mean of rows 0..i of b
c = tf.matmul(a, b)

# Print the results
print('a=')
print(a.numpy())
print('--')
print('b=')
print(b.numpy())
print('--')
print('c=')
print(c.numpy())

a=
[[1.         0.         0.        ]
 [0.5        0.5        0.        ]
 [0.33333333 0.33333333 0.33333333]]
--
b=
[[8. 9.]
 [1. 7.]
 [7. 1.]]
--
c=
[[8.         9.        ]
 [4.5        8.        ]
 [5.33333333 5.66666667]]


#### Version 2 (Reproducing xbow via matrix multiplication instead of an explicit loop)

In [144]:
# Averaging weights: a lower-triangular matrix of ones with every row scaled to sum to 1.
lower_tri = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
row_totals = tf.reduce_sum(lower_tri, axis=1, keepdims=True)
weights = lower_tri / row_totals

# Round each value to 3 decimal places for readability
weights_numpy_rounded = np.round(weights.numpy(), decimals=3)
weights_numpy_rounded

array([[1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.5  , 0.5  , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.333, 0.333, 0.333, 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.25 , 0.25 , 0.25 , 0.25 , 0.   , 0.   , 0.   , 0.   ],
       [0.2  , 0.2  , 0.2  , 0.2  , 0.2  , 0.   , 0.   , 0.   ],
       [0.167, 0.167, 0.167, 0.167, 0.167, 0.167, 0.   , 0.   ],
       [0.143, 0.143, 0.143, 0.143, 0.143, 0.143, 0.143, 0.   ],
       [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]])

In [145]:
# (T, T) @ (B, T, C) -> (B, T, C): tf.matmul broadcasts `weights` across the batch dimension.
xbow2 = tf.matmul(weights, x)

In [146]:
# Compare the loop-based and matmul-based aggregates for the first batch element.
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
print('xbow2 = ',xbow2[0].numpy())

xbow =  [[-1.32613346 -0.09177541]
 [-1.10717326 -0.52283957]
 [-0.91559594  0.44919487]
 [-0.83564314 -0.15099676]
 [-0.71441082 -0.26922442]
 [-0.5557851  -0.42243377]
 [-0.45814705 -0.43955319]
 [-0.38913249 -0.31461312]]

Same as

xbow2 =  [[-1.32613346 -0.09177541]
 [-1.10717326 -0.52283957]
 [-0.91559594  0.44919487]
 [-0.83564314 -0.15099676]
 [-0.71441082 -0.26922442]
 [-0.5557851  -0.42243377]
 [-0.45814705 -0.43955319]
 [-0.38913249 -0.31461312]]


#### Version 3 (Using softmax) 

In [147]:
# Create a lower-triangular mask and initialise the weights with zeros
tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
weights = tf.zeros((T, T), dtype=tf.float64)

# Mask the upper triangular part with -inf so softmax gives those positions zero weight
weights = tf.where(tf.equal(tril, 0), float('-inf'), weights)

# Softmax along the last dimension: the zeros left in row t become a uniform 1/(t+1) over positions 0..t
weights = tf.nn.softmax(weights, axis=-1)

# (T, T) @ (B, T, C) -> (B, T, C): the same prefix averages as xbow
xbow3 = tf.matmul(weights, x)

In [148]:
# Compare the loop-based and softmax-based aggregates for the first batch element.
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
# The value printed here is xbow3, so the label must say xbow3.
print('xbow3 = ',xbow3[0].numpy())

xbow =  [[-1.32613346 -0.09177541]
 [-1.10717326 -0.52283957]
 [-0.91559594  0.44919487]
 [-0.83564314 -0.15099676]
 [-0.71441082 -0.26922442]
 [-0.5557851  -0.42243377]
 [-0.45814705 -0.43955319]
 [-0.38913249 -0.31461312]]

Same as

xbow2 =  [[-1.32613346 -0.09177541]
 [-1.10717326 -0.52283957]
 [-0.91559594  0.44919487]
 [-0.83564314 -0.15099676]
 [-0.71441082 -0.26922442]
 [-0.5557851  -0.42243377]
 [-0.45814705 -0.43955319]
 [-0.38913249 -0.31461312]]
