# 1 Download & Read Dataset 

In [1]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('../dataset.txt', 'r', encoding='utf-8') as f:
    text = f.read()

        
print("Total number of characters in the dataset : ",len(text))

print('\n\nFirst 100 characters : ', text[:100])

Total number of characters in the dataset :  1115394


First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# ------------------------------------------- 

In [2]:
import tensorflow as tf
from tensorflow.keras import layers

### For reproducibility  

In [3]:
tf.random.set_seed(1337)

# --------------------------------------------

# 2 Data Preprocessing 

## 2.1 Tokenization  (1st)

In [158]:
#all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Vocabulary Size : ',vocab_size)

print('\nWhole Vocabulary :-\n\n',''.join(chars))

Vocabulary Size :  65

Whole Vocabulary :-

 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [159]:
# mapping characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

In [160]:
# Encode & Decode Example

zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


## 2.2 Tokenization  (2nd)

### 1 

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in dataset:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(len(word_freqs))        
print(word_freqs)

In [None]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(len(alphabet))
print(alphabet)

In [None]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [None]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [None]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

In [None]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

In [None]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [None]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [None]:
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

In [None]:
vocab_size = 10000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [None]:
print(merges)

### or 2 

In [4]:
from tokenizers import ByteLevelBPETokenizer

In [5]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator([text], vocab_size=10000, min_frequency=3, special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]"])


In [6]:
# Get vocabulary from the trained tokenizer
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)

In [7]:
print('Number of unique tokens in the dataset =', vocab_size)
print('\nWhich are following : \n', ' '.join(sorted(vocab.keys())))

Number of unique tokens in the dataset = 9787

Which are following : 


In [8]:
# Encode the text
encoded_text = tokenizer.encode(text).ids

In [9]:
len(encoded_text)

312505

## 2.2 Train / Val Dataset Split 

In [11]:
# Train and test splits
# data = tf.constant(encode(text), dtype=tf.int64)
data = tf.constant(encoded_text, dtype=tf.int64)

n = int(0.9 * len(data))  # First 90% will be train, rest will be for validation

train_data = data[:n]
val_data = data[n:]

In [15]:
print('A sample of train data : \n\n', train_data[:50].numpy())
print('\n\nA sample of train data decoded : \n\n', ''.join(tokenizer.decode((train_data[:50].numpy()))))

A sample of train data : 

 [ 675 1200   29  202 2346  335 2751  806 2306   15  678  321  620   17
  202  202 1235   29  202 2542   15  620   17  202  202  675 1200   29
  202  569  422  399 4196 1502  291  968  531  291 7922   34  202  202
 1235   29  202 3666 5591   17 4196   17]


A sample of train data decoded : 

 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.


# 3 Model Utilities

In [16]:
# Hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0

In [17]:
# data loading

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((batch_size,), maxval=len(data) - block_size, dtype=tf.int64)
    x = tf.gather(data, ix[:, tf.newaxis] + tf.range(block_size, dtype=tf.int64))
    y = tf.gather(data, ix[:, tf.newaxis] + tf.range(1, block_size + 1, dtype=tf.int64))
    return x, y

# Loss estimator during training

@tf.function
def estimate_loss():
    out = {}
    model.trainable = False
    for split in ['train', 'val']:
        losses = tf.TensorArray(tf.float32, size=eval_iters)
        for k in tf.range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses = losses.write(k, loss)
        out[split] = tf.reduce_mean(losses.stack())
    model.trainable = True
    return out

In [18]:
class Head(layers.Layer):
    """ One head of self-attention """

    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = layers.Dense(head_size, use_bias=False)
        self.query = layers.Dense(head_size, use_bias=False)
        self.value = layers.Dense(head_size, use_bias=False)
        self.tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1])) * C ** -0.5
        wei = tf.where(self.tril == 0, float('-inf'), wei)
        wei = tf.nn.softmax(wei, axis=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = tf.matmul(wei, v)
        return out

class MultiHeadAttention(layers.Layer):
    """ Multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = layers.Dense(n_embd)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(layers.Layer):
    """ A simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super(FeedForward, self).__init__()
        self.net = tf.keras.Sequential([
            layers.Dense(4 * n_embd),
            layers.ReLU(),
            layers.Dense(n_embd),
            layers.Dropout(dropout),
        ])

    def call(self, x):
        return self.net(x)

class Block(layers.Layer):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super(Block, self).__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


# 4 Model Implementation

In [19]:
#Simple Bigram Model
class BigramLanguageModel(tf.keras.Model):

    def __init__(self):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = tf.keras.Sequential([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = layers.LayerNormalization()
        self.lm_head = layers.Dense(vocab_size)

    def call(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int64))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B * T, C))
            targets = tf.reshape(targets, (B * T,))
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            
            idx_cond = idx[:, -block_size:]
            
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64)
            idx = tf.concat([idx, idx_next], axis=1)
        return idx

# 5 Model Training

## 5.1 Initialization 

In [20]:
model = BigramLanguageModel()

## 5.2 Training Loop 

In [21]:

optimizer = tf.keras.optimizers.Adam(learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))



step 0: train loss 9.1671, val loss 9.1647
step 100: train loss 6.2462, val loss 6.2700
step 200: train loss 5.8310, val loss 5.8627
step 300: train loss 5.5686, val loss 5.6292
step 400: train loss 5.4157, val loss 5.4998
step 500: train loss 5.2896, val loss 5.3997
step 600: train loss 5.1451, val loss 5.3067
step 700: train loss 5.0478, val loss 5.2347
step 800: train loss 4.9518, val loss 5.1551
step 900: train loss 4.8668, val loss 5.1349
step 1000: train loss 4.8240, val loss 5.0422
step 1100: train loss 4.7463, val loss 5.0526
step 1200: train loss 4.6814, val loss 4.9798
step 1300: train loss 4.6309, val loss 4.9833
step 1400: train loss 4.5974, val loss 4.9628
step 1500: train loss 4.5385, val loss 4.9650
step 1600: train loss 4.5119, val loss 4.9722
step 1700: train loss 4.5099, val loss 4.9801
step 1800: train loss 4.4270, val loss 4.9048
step 1900: train loss 4.4319, val loss 4.9432
step 2000: train loss 4.3657, val loss 4.9071
step 2100: train loss 4.3387, val loss 4.9682


## 5.3 Model Size 

In [22]:
# print the number of parameters in the model
# print(sum(tf.reduce_prod(p.shape) for p in model.trainable_variables) / 1e6, 'M parameters')

model.summary()

Model: "bigram_language_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  626368    
                                                                 
 embedding_1 (Embedding)     multiple                  2048      
                                                                 
 sequential_4 (Sequential)   (16, 32, 64)              199168    
                                                                 
 layer_normalization_8 (Laye  multiple                 128       
 rNormalization)                                                 
                                                                 
 dense_60 (Dense)            multiple                  636155    
                                                                 
Total params: 1,463,867
Trainable params: 1,463,867
Non-trainable params: 0
___________________________________

# 6 Testing 

In [23]:
# generate from the model
context = tf.ones((1, 32), dtype=tf.int64)
print(context.shape)


(1, 32)


In [24]:
generated_indices = model.generate(context, max_new_tokens=500)[0].numpy()
# print(decode(generated_indices.tolist()))


In [25]:
print(tokenizer.decode(generated_indices.tolist()))


wing this. See that it hath the duke? I think, wife and
The punhis pile wars of age. Est;
Hold us away, some good brawling side: but so Lewis
My service but so much worse, a cuving gold in
cottage me with the warden-cure.
DrawSir, this knows our high blood--beition, our serious flesh
Lies dishonour'd to know again by it.

Fourth Citizen:
Ay, an Poit rebel!
Inartter, whom they convey away of life,
And when the by the hideous air is new spoil,
Which throng as every Jove was wandering flowion.

NORTHUMBERLAND:
How, Marcius, with thy heaven! his death's cold.

LADY ANNE:
This, wilt appear against their blood of Gloucester
That you make it faired in brief souls,
At thy having ope his prayers from farewell.
Our heads I be ready by such a parties not flesh or
A sea's lield forches, heappers within the pin
And made opposit, score up my temper with Russia's heart,
How an 'Thanks, you must haveednesday a man,
For mercy of the utmost of one.

AUTOLYCUS:
It is a quartermn, but, patricians,
For so