# 1 Download & Read Dataset 

In [13]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('../dataset.txt', 'r', encoding='utf-8') as f:
    text = f.read()

        
print("Total number of characters in the dataset : ",len(text))

print('\n\nFirst 100 characters : ', text[:100])

Total number of characters in the dataset :  1115394


First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# ------------------------------------------- 

In [1]:
import tensorflow as tf
from tensorflow.keras import layers

### For reproducibility  

In [4]:
tf.random.set_seed(1337)

# --------------------------------------------

# 2 Data Preprocessing 

## 2.1 Tokenization  

In [158]:
#all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Vocabulary Size : ',vocab_size)

print('\nWhole Vocabulary :-\n\n',''.join(chars))

Vocabulary Size :  65

Whole Vocabulary :-

 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [159]:
# mapping characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

In [160]:
# Encode & Decode Example

zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


## 2.2 Train / Val Dataset Split 

In [162]:
# Train and test splits
data = tf.constant(encode(text), dtype=tf.int64)

n = int(0.9 * len(data))  # First 90% will be train, rest will be for validation

train_data = data[:n]
val_data = data[n:]

In [163]:
print('A sample of train data : \n\n', train_data[:50].numpy())
print('\n\nA sample of train data decoded : \n\n', ''.join(decode(train_data[:50].numpy())))

A sample of train data : 

 [18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56]


A sample of train data decoded : 

 First Citizen:
Before we proceed any further, hear


# 3 Model Utilities

In [164]:
# Hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0

In [166]:
# data loading

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((batch_size,), maxval=len(data) - block_size, dtype=tf.int64)
    x = tf.gather(data, ix[:, tf.newaxis] + tf.range(block_size, dtype=tf.int64))
    y = tf.gather(data, ix[:, tf.newaxis] + tf.range(1, block_size + 1, dtype=tf.int64))
    return x, y

# Loss estimator during training

@tf.function
def estimate_loss():
    out = {}
    model.trainable = False
    for split in ['train', 'val']:
        losses = tf.TensorArray(tf.float32, size=eval_iters)
        for k in tf.range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses = losses.write(k, loss)
        out[split] = tf.reduce_mean(losses.stack())
    model.trainable = True
    return out

In [167]:
class Head(layers.Layer):
    """ One head of self-attention """

    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = layers.Dense(head_size, use_bias=False)
        self.query = layers.Dense(head_size, use_bias=False)
        self.value = layers.Dense(head_size, use_bias=False)
        self.tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1])) * C ** -0.5
        wei = tf.where(self.tril == 0, float('-inf'), wei)
        wei = tf.nn.softmax(wei, axis=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = tf.matmul(wei, v)
        return out

class MultiHeadAttention(layers.Layer):
    """ Multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = layers.Dense(n_embd)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(layers.Layer):
    """ A simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super(FeedForward, self).__init__()
        self.net = tf.keras.Sequential([
            layers.Dense(4 * n_embd),
            layers.ReLU(),
            layers.Dense(n_embd),
            layers.Dropout(dropout),
        ])

    def call(self, x):
        return self.net(x)

class Block(layers.Layer):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super(Block, self).__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


# 4 Model Implementation

In [168]:
#Simple Bigram Model
class BigramLanguageModel(tf.keras.Model):

    def __init__(self):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = tf.keras.Sequential([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = layers.LayerNormalization()
        self.lm_head = layers.Dense(vocab_size)

    def call(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int64))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B * T, C))
            targets = tf.reshape(targets, (B * T,))
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            
            idx_cond = idx[:, -block_size:]
            
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64)
            idx = tf.concat([idx, idx_next], axis=1)
        return idx

# 5 Model Training

## 5.1 Initialization 

In [59]:
model = BigramLanguageModel()

## 5.2 Training Loop 

In [60]:

optimizer = tf.keras.optimizers.Adam(learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))



step 0: train loss 4.4587, val loss 4.4448
step 100: train loss 2.8185, val loss 2.8346
step 200: train loss 2.5527, val loss 2.5598
step 300: train loss 2.4448, val loss 2.4497
step 400: train loss 2.3628, val loss 2.3753
step 500: train loss 2.3051, val loss 2.3296
step 600: train loss 2.2380, val loss 2.2507
step 700: train loss 2.1879, val loss 2.2221
step 800: train loss 2.1431, val loss 2.1859
step 900: train loss 2.0943, val loss 2.1468
step 1000: train loss 2.0831, val loss 2.1352
step 1100: train loss 2.0312, val loss 2.0911
step 1200: train loss 1.9957, val loss 2.0636
step 1300: train loss 1.9721, val loss 2.0572
step 1400: train loss 1.9514, val loss 2.0407
step 1500: train loss 1.9273, val loss 2.0016
step 1600: train loss 1.9077, val loss 2.0031
step 1700: train loss 1.8813, val loss 1.9927
step 1800: train loss 1.8717, val loss 1.9631
step 1900: train loss 1.8423, val loss 1.9568
step 2000: train loss 1.8457, val loss 1.9572
step 2100: train loss 1.8151, val loss 1.9548


## 5.3 Model Size 

In [174]:
# print the number of parameters in the model
# print(sum(tf.reduce_prod(p.shape) for p in model.trainable_variables) / 1e6, 'M parameters')

model.summary()

Model: "bigram_language_model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    multiple                  4160      
                                                                 
 embedding_15 (Embedding)    multiple                  2048      
                                                                 
 sequential_39 (Sequential)  (None, 32, 64)            199168    
                                                                 
 layer_normalization_71 (Lay  multiple                 128       
 erNormalization)                                                
                                                                 
 dense_487 (Dense)           multiple                  4225      
                                                                 
Total params: 209,729
Trainable params: 209,729
Non-trainable params: 0
_____________________________________

# 6 Testing 

In [185]:
# generate from the model
context = tf.ones((1, 32), dtype=tf.int64)
print(context.shape)


(1, 32)


In [187]:
generated_indices = model.generate(context, max_new_tokens=500)[0].numpy()
print(decode(generated_indices.tolist()))

                                aut daughters a suckemble of my seept.

DUKE VINCENTIO:
At our lose Lis, I for have slagte and denter,
And proul, for he take dischles! is,
Thy will marciuf'd and Angue,
As name to butio is fear so, and thy corsomervy,
And urse my shall here: sit, is weet a for deather.
The staint not That, beenhem eye
to courted makes aliCliff shame,
And with may suve thank mustillor.

MENENIO:
O, is he noble peoppleasul! This kince,
I'll'd to will the waltory is sue
Come, make he spery: thou news for
All nonge
