## 1 Downloading The Dataset

In [None]:
#Downloading dataset to train on.

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
with open('../dataset.txt','r',encoding='utf-8') as data:
    text = data.read()
    
print("Total number of characters in the dataset : ",len(text))

Total number of characters in the dataset :  1115394


In [3]:
print('First 100 characters : ', text[:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

### 2.1 Building Vocabulary 

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print('Number of unique characters in the dataset =',vocab_size)
print('\nWhich are following : \n',''.join(chars))

Number of unique characters in the dataset = 65

Which are following : 
 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [5]:
#mapping individual charaters to integers 

encoder = { char:i for i,char in enumerate(chars)}
decoder = { i:char for i,char in enumerate(chars)}

encode = lambda string: [encoder[char] for char in string]
decode = lambda integers: [decoder[i] for i in integers]

In [6]:
zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


### 2.3 Encoding The Dataset 

In [7]:
import tensorflow as tf

In [8]:
data = tf.convert_to_tensor(encode(text), dtype=tf.int64)
print(data.shape, data.dtype)

(1115394,) <dtype: 'int64'>


In [9]:
print(data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [10]:
limit = int(0.9 * len(data))

train_data = data[:limit]
val_data = data[limit:]

In [11]:
print(train_data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [12]:
block_size = 128
train_data[:block_size+1]

<tf.Tensor: shape=(129,), dtype=int64, numpy=
array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
       44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39,
       52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1,
       51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31,
       54, 43, 39, 49,  6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56,
       57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39,
       56, 43,  1, 39, 50, 50,  1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56,
       39, 58, 46, 43, 56,  1, 58, 53,  1, 42], dtype=int64)>

In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for token in range(block_size):
    context = x[:token+1]
    target = y[token]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [18]   target is:  47
for input:  [18, 47]   target is:  56
for input:  [18, 47, 56]   target is:  57
for input:  [18, 47, 56, 57]   target is:  58
for input:  [18, 47, 56, 57, 58]   target is:  1
for input:  [18, 47, 56, 57, 58, 1]   target is:  15
for input:  [18, 47, 56, 57, 58, 1, 15]   target is:  47
for input:  [18, 47, 56, 57, 58, 1, 15, 47]   target is:  58
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58]   target is:  47
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47]   target is:  64
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64]   target is:  43
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43]   target is:  52
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52]   target is:  10
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10]   target is:  0
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0]   target is:  14
for input:  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14]   target

#### Implementation 

In [14]:
batch_size = 4 #Number of independent input sequences to process in parallel for GPU
block_size = 8 #Maximum context length to make predictions
n_embd = 32 


def get_batch(split):
    #generate small batches of input x & target y
    data = train_data if split == 'train' else val_data
    randPos = tf.dtypes.cast(tf.random.uniform((batch_size,), minval=0, maxval=(len(data)-block_size)), dtype=tf.int32)
    #print(randPos) # random positions in the whole datasets to grab block size chunks
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])
    
    return xbatch, ybatch

xbatch, ybatch = get_batch('train')

print('inputs:')
print(xbatch.shape)
print(xbatch)

print('targets:')
print(ybatch.shape)
print(ybatch)

inputs:
(8, 128)
tf.Tensor(
[[47 53 52 ... 50 42 52]
 [43  1 58 ... 20 17 31]
 [12  1 57 ... 49 47 52]
 ...
 [43 39 57 ... 56 43 44]
 [53 56  1 ... 58 46 43]
 [20 13 30 ...  1 53 44]], shape=(8, 128), dtype=int64)
targets:
(8, 128)
tf.Tensor(
[[53 52  8 ... 42 52 43]
 [ 1 58 46 ... 17 31 31]
 [ 1 57 54 ... 47 52 42]
 ...
 [39 57 39 ... 43 44 53]
 [56  1 58 ... 46 43  1]
 [13 30 16 ... 53 44  1]], shape=(8, 128), dtype=int64)


In [130]:
for row in range(batch_size):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [63]   target is:  1
for input:  [63, 1]   target is:  61
for input:  [63, 1, 61]   target is:  43
for input:  [63, 1, 61, 43]   target is:  39
for input:  [63, 1, 61, 43, 39]   target is:  49
for input:  [63, 1, 61, 43, 39, 49]   target is:  52
for input:  [63, 1, 61, 43, 39, 49, 52]   target is:  43
for input:  [63, 1, 61, 43, 39, 49, 52, 43]   target is:  57
for input:  [43]   target is:  6
for input:  [43, 6]   target is:  1
for input:  [43, 6, 1]   target is:  51
for input:  [43, 6, 1, 51]   target is:  63
for input:  [43, 6, 1, 51, 63]   target is:  1
for input:  [43, 6, 1, 51, 63, 1]   target is:  50
for input:  [43, 6, 1, 51, 63, 1, 50]   target is:  53
for input:  [43, 6, 1, 51, 63, 1, 50, 53]   target is:  56
for input:  [1]   target is:  58
for input:  [1, 58]   target is:  46
for input:  [1, 58, 46]   target is:  43
for input:  [1, 58, 46, 43]   target is:  51
for input:  [1, 58, 46, 43, 51]   target is:  1
for input:  [1, 58, 46, 43, 51, 1]   target is:  44
for

## 3 Without Transformer  

## Maths for self-attention mechanism

In [149]:
#We want a method to preserve context for each word and it's former words in the time block

In [150]:
# example of words and their context

for row in range(batch_size-3):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [63]   target is:  1
for input:  [63, 1]   target is:  61
for input:  [63, 1, 61]   target is:  43
for input:  [63, 1, 61, 43]   target is:  39
for input:  [63, 1, 61, 43, 39]   target is:  49
for input:  [63, 1, 61, 43, 39, 49]   target is:  52
for input:  [63, 1, 61, 43, 39, 49, 52]   target is:  43
for input:  [63, 1, 61, 43, 39, 49, 52, 43]   target is:  57


In [151]:
xbatch.shape

TensorShape([4, 8])

### 3.1 Aggregating for context (simplest method) 

#### Version 1 

In [152]:
B, T, C = 4, 8, 2  # batch, time, channels
x = tf.random.normal((B, T, C), dtype=tf.float64)
print(x.shape)


(4, 8, 2)


In [153]:
xbow = tf.zeros((B, T, C), dtype=tf.float64)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1, :]  # (t, C)
        mean_value = tf.reduce_mean(xprev, axis=0)
        xbow = tf.tensor_scatter_nd_add(xbow, indices=[[b, t]], updates=[mean_value])


In [154]:
x[0]
#original data

<tf.Tensor: shape=(8, 2), dtype=float64, numpy=
array([[ 0.95310103, -0.33585035],
       [-0.89537386, -0.10109913],
       [-1.08746957, -0.23392829],
       [ 0.55400938, -3.05065275],
       [-0.4705944 , -1.86886873],
       [ 2.04610976, -0.4255422 ],
       [-1.76064349,  0.43410932],
       [-1.65781871,  0.70082522]])>

In [155]:
#aggregated data
xbow
#each upcoming row is contains the aggregate of all previous 

<tf.Tensor: shape=(4, 8, 2), dtype=float64, numpy=
array([[[ 0.95310103, -0.33585035],
        [ 0.02886359, -0.21847474],
        [-0.34324746, -0.22362592],
        [-0.11893325, -0.93038263],
        [-0.18926548, -1.11807985],
        [ 0.18329706, -1.00265691],
        [-0.09440874, -0.79740459],
        [-0.28983498, -0.61012586]],

       [[ 0.24474915,  0.5304757 ],
        [-0.0215186 ,  0.89519942],
        [-0.29395758,  0.44847173],
        [ 0.17669387,  0.5498084 ],
        [ 0.18008692,  0.3448098 ],
        [-0.0328576 ,  0.4573222 ],
        [-0.09852322,  0.57093864],
        [ 0.09555441,  0.39454034]],

       [[-0.45030911, -0.85046188],
        [ 0.61036958, -0.90442377],
        [ 0.601554  , -1.25840561],
        [ 0.25857243, -0.86086016],
        [ 0.28360905, -0.79047013],
        [ 0.316442  , -0.62021193],
        [ 0.02984569, -0.50866335],
        [-0.3375226 , -0.46764786]],

       [[ 0.18904454, -1.09819457],
        [ 0.18068477,  0.22778135],
       

### 3.2 Efficient aggregating for context (simplest method) 

#### Concept (Same achived through matrix multiplication trick)

In [156]:
import numpy as np

# Create a lower triangular matrix
a = tf.linalg.band_part(tf.ones((3, 3), dtype=tf.float64), -1, 0)
a = a / tf.reduce_sum(a, axis=1, keepdims=True)

# Create a random matrix with integer values between 0 and 10
b = tf.constant(np.random.randint(0, 10, size=(3, 2)).astype(np.float64))

# Perform matrix multiplication
c = tf.matmul(a, b)

# Print the results
print('a=')
print(a.numpy())
print('--')
print('b=')
print(b.numpy())
print('--')
print('c=')
print(c.numpy())

a=
[[1.         0.         0.        ]
 [0.5        0.5        0.        ]
 [0.33333333 0.33333333 0.33333333]]
--
b=
[[7. 8.]
 [5. 0.]
 [2. 7.]]
--
c=
[[7.         8.        ]
 [6.         4.        ]
 [4.66666667 5.        ]]


#### Version 2 (Reproducing xbow via matrix mulitplication instead of loop multiplication)

In [157]:
weights = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0) 
weights = weights / tf.reduce_sum(weights, axis=1, keepdims=True)

# Round each value to 3 decimal points for better visibility
weights_numpy_rounded = np.round(weights.numpy().tolist(), decimals=3)
weights_numpy_rounded

array([[1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.5  , 0.5  , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.333, 0.333, 0.333, 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.25 , 0.25 , 0.25 , 0.25 , 0.   , 0.   , 0.   , 0.   ],
       [0.2  , 0.2  , 0.2  , 0.2  , 0.2  , 0.   , 0.   , 0.   ],
       [0.167, 0.167, 0.167, 0.167, 0.167, 0.167, 0.   , 0.   ],
       [0.143, 0.143, 0.143, 0.143, 0.143, 0.143, 0.143, 0.   ],
       [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]])

In [158]:
xbow2 = tf.matmul(weights, x) # Here (B,T,T) x (B,T,C) = (B,T,C).   where B in (B,T,T) was automatically created by tf

In [159]:
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
print('xbow2 = ',xbow2[0].numpy())

xbow =  [[ 0.95310103 -0.33585035]
 [ 0.02886359 -0.21847474]
 [-0.34324746 -0.22362592]
 [-0.11893325 -0.93038263]
 [-0.18926548 -1.11807985]
 [ 0.18329706 -1.00265691]
 [-0.09440874 -0.79740459]
 [-0.28983498 -0.61012586]]

Same as

xbow2 =  [[ 0.95310103 -0.33585035]
 [ 0.02886359 -0.21847474]
 [-0.34324746 -0.22362592]
 [-0.11893325 -0.93038263]
 [-0.18926548 -1.11807985]
 [ 0.18329706 -1.00265691]
 [-0.09440874 -0.79740459]
 [-0.28983498 -0.61012586]]


#### Version 3 (Using softmax) 

In [160]:
# Create lower triangular matrix and initialize wei with zeros
tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
weights = tf.zeros((T, T), dtype=tf.float64)

# Mask the upper triangular part with -inf
weights = tf.where(tf.equal(tril, 0), float('-inf'), weights)

# Apply softmax along the last dimension
weights = tf.nn.softmax(weights, axis=-1)

# Perform matrix multiplication
xbow3 = tf.matmul(weights, x)

In [161]:
print('xbow = ',xbow[0].numpy())
print('\nSame as\n')
print('xbow2 = ',xbow3[0].numpy())

xbow =  [[ 0.95310103 -0.33585035]
 [ 0.02886359 -0.21847474]
 [-0.34324746 -0.22362592]
 [-0.11893325 -0.93038263]
 [-0.18926548 -1.11807985]
 [ 0.18329706 -1.00265691]
 [-0.09440874 -0.79740459]
 [-0.28983498 -0.61012586]]

Same as

xbow2 =  [[ 0.95310103 -0.33585035]
 [ 0.02886359 -0.21847474]
 [-0.34324746 -0.22362592]
 [-0.11893325 -0.93038263]
 [-0.18926548 -1.11807985]
 [ 0.18329706 -1.00265691]
 [-0.09440874 -0.79740459]
 [-0.28983498 -0.61012586]]


### 3.3 Self-attention mechanism context (better method)  

#### Version 4 (Using Key, Query, Value upon version 3)

In [33]:
tf.random.set_seed(1337)

B, T, C = 4, 8, 32  # batch, time, channels 
x = tf.random.normal((B, T, C))
x.shape

TensorShape([4, 8, 32])

In [34]:

# Let's see a single Head perform self-attention
head_size = 16 

key = tf.keras.layers.Dense(head_size, use_bias=False)
query = tf.keras.layers.Dense(head_size, use_bias=False)
value = tf.keras.layers.Dense(head_size, use_bias=False)

print('x.shape : ', x.shape)
k = key(x)   # (B, T, 32) => (B, T, 16)
q = query(x) # (B, T, 32) => (B, T, 16)
print('k.shape : ', k.shape)
print('q.shape : ', q.shape)

wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1])* head_size**-0.5)  # (B, T, T)  #also scaling after transpose, check below to see why

# print('wei.shape : ',wei.shape)

tril = tf.linalg.band_part(tf.ones((T, T), dtype=tf.float64), -1, 0)
wei = tf.where(tf.equal(tril, 0), float('-inf'), wei)

# Apply softmax along the last dimension
wei = tf.nn.softmax(wei, axis=-1)

v = value(x)

print("wei.shape: ",wei.shape," v.shape: ", v.shape )
out = tf.matmul(wei, v)

print(out.shape)

x.shape :  (4, 8, 32)
k.shape :  (4, 8, 16)
q.shape :  (4, 8, 16)
wei.shape:  (4, 8, 8)  v.shape:  (4, 8, 16)
(4, 8, 16)


In [35]:
wei[0].numpy()

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.7286488 , 0.2713512 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.18743417, 0.70567065, 0.10689516, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.0590855 , 0.24319807, 0.34693316, 0.35078323, 0.        ,
        0.        , 0.        , 0.        ],
       [0.533458  , 0.06640321, 0.2022393 , 0.12293093, 0.07496864,
        0.        , 0.        , 0.        ],
       [0.66182786, 0.07579562, 0.12158484, 0.05262179, 0.07480336,
        0.0133665 , 0.        , 0.        ],
       [0.21512643, 0.02906227, 0.08814932, 0.32106763, 0.0705865 ,
        0.1328572 , 0.14315067, 0.        ],
       [0.02739257, 0.1601774 , 0.23865585, 0.08972444, 0.12573364,
        0.0443315 , 0.19535215, 0.11863239]], dtype=float32)

In [36]:
#without scaling 
tf.nn.softmax(tf.constant([0.1, -0.2, 0.3, -0.2, 0.5]), axis=-1).numpy()

array([0.1924978 , 0.14260589, 0.23511736, 0.14260589, 0.287173  ],
      dtype=float32)

In [37]:
#without scaling the softmax favours larger values big time as visible
tf.nn.softmax(tf.constant([0.1, -0.2, 0.3, -0.2, 0.5])*8, axis=-1).numpy() 

array([0.03260834, 0.00295816, 0.1615102 , 0.00295816, 0.79996514],
      dtype=float32)

# Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

## 4 Bigram Language Model (With Transformer Block) 

### 4.1 Transformer Head Block 

In [38]:

class Head(tf.keras.Model):

    def __init__(self, head_size):
        super(Head, self).__init__()
        
        self.key = tf.keras.layers.Dense(head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(head_size, use_bias=False)
        self.tril = tf.linalg.band_part(tf.ones((block_size, block_size), dtype=tf.float64), -1, 0)

    def call(self, x):
        B, T, C = x.shape
        
        k = key(x)   # (B, T, C) => (B, T, head_size)
        q = query(x) # (B, T, C) => (B, T, head_size)
        
        #Compute Attention Score (Affinities)
        wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1])* C**-0.5)  # (B,T,C) @ (B, C, T) => (B, T, T)
        
        wei = tf.where(tf.equal(self.tril[:T,:T], 0), float('-inf'), wei) #Here we cut of future context which makes this head a decoder block

        # Apply softmax along the last dimension
        wei = tf.nn.softmax(wei, axis=-1)
        #Perform Weighted Aggregation of the values
        v = value(x) # (B,T,C)
        out = tf.matmul(wei, v) #(B,T,T) @ (B,T,C) -> (B,T,C)

        return out



### 4.2 Bigram Language Model  

In [39]:

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size, n_embd):
        super(BigramLanguageModel, self).__init__()
        
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embd)
        self.sa_head = tf.keras.layers.Dense(n_embd)
        self.lm_head = tf.keras.layers.Dense(vocab_size)

    def call(self, idx, targets=None, training=False):
        B, T = idx.shape
        
        tok_emd = self.token_embedding_table(idx)  # (B, T, C)
        pos_emd = self.position_embedding_table(tf.range(T)) #(T, C)
        x = tok_emd + pos_emd #(B,T,C) # Holds Token identities & position
        x = self.sa_head(x) #Apply one head of self attention. (B,T,C) 
        logits = self.lm_head(x) # (B, T, vocab_size) # finally decoded here

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B*T, C))
            targets = tf.reshape(targets, (B*T,))
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
            )

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] #Because if idx is more than block size then our positional embd above
                                            #is gonna run out of scope because it has embeddings of upto block size
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]  # (B, C)
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            idx_next = tf.random.categorical(tf.math.log(probs), 1)  # (B, 1)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


In [40]:
model = BigramLanguageModel(vocab_size, n_embd)


logits, loss = model(xbatch, ybatch)
print(logits.shape)
print(loss)

(1024, 65)
tf.Tensor(4.1711574, shape=(), dtype=float32)


In [41]:
print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist())))

#Total Garbage


ZiQlH
XYbrOU&py,dn-ovbsESOAqBKsk:yxV'-lNES!OU$3?xB,CpHqR

y;TFC,MjPQ3FbQ,SjCJ.hetJ.Mv!WQ?q,&A.M KHBO


In [42]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

batch_size = 32
num_steps = 10000

for step in range(num_steps):
    # Sample a batch of data
    xbatch, ybatch = get_batch('train')  # Assuming you have a function get_batch

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Compute gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print('Loss = ', loss.numpy())


Loss =  2.440852


In [43]:
#Somewhat stuctured results

print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=500).numpy()[0].tolist())))


A or INGLOrlo bere inmard meande LAnghall ysathomfousave foisld t henee INom:
ONo cowil are.
Jusoeder re yoom rne leme whe myoul tth tinch t, ix te rig:

CHaur beetay pathe Anen. mak
HAs as P qu ben y mefusomy tont fthal n I eathe thet Th g ya'sour s:
Cllt IXEY:
II woupor-be lll.

An
Sef 'douthe.
Pof Whonthed asive ntss a llamuine,

DENou d ce il wid hatorul n tucor he se rt lin h y weaver y ok? fore.
ARee y V:
I t ware f a hate mey thasor'd undpirthouse thr lve lyore.
He tld Maldig ore sth SI'd


In [None]:
f