# 1 Download & Read Dataset 

In [114]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('../dataset.txt', 'r', encoding='utf-8') as f:
    text = f.read()

        
print("Total number of characters in the dataset : ",len(text))

print('\n\nFirst 100 characters : ', text[:100])

Total number of characters in the dataset :  1115394


First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# ------------------------------------------- 

In [115]:
import tensorflow as tf
from tensorflow.keras import layers

### For reproducibility  

In [116]:
tf.random.set_seed(1337)

# --------------------------------------------

# 2 Data Preprocessing 

## 2.1 Tokenization  (1st)

In [117]:
#all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Vocabulary Size : ',vocab_size)

print('\nWhole Vocabulary :-\n\n',''.join(chars))

Vocabulary Size :  65

Whole Vocabulary :-

 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [118]:
# mapping characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

In [119]:
# Encode & Decode Example

zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ',zk)
print('\nDecoded ',zk,'\n=', ''.join(decode(zk)))

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


## 2.2 Tokenization  (2nd)

### 1 

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in dataset:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(len(word_freqs))        
print(word_freqs)

In [None]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(len(alphabet))
print(alphabet)

In [None]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [None]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [None]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

In [None]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

In [None]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [None]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [None]:
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

In [None]:
vocab_size = 10000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [None]:
print(merges)

### or 2 

In [None]:
from tokenizers import ByteLevelBPETokenizer

In [None]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator([text], min_frequency=3) #vocab_size=10000, , special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]"]


In [None]:
# Get vocabulary from the trained tokenizer
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)

In [None]:
print('Number of unique tokens in the dataset =', vocab_size)
print('\nWhich are following : \n', ' '.join(sorted(vocab.keys())))

In [None]:
# Encode the text
encoded_text = tokenizer.encode(text).ids

In [None]:
len(encoded_text)

## 2.2 Train / Val Dataset Split 

In [120]:
# Train and test splits
data = tf.constant(encode(text), dtype=tf.int64)
# data = tf.constant(encoded_text, dtype=tf.int64)

n = int(0.9 * len(data))  # First 90% will be train, rest will be for validation

train_data = data[:n]
val_data = data[n:]

In [121]:
print('A sample of train data : \n\n', train_data[:50].numpy())
print('\n\nA sample of train data decoded : \n\n', ''.join(decode((train_data[:50].numpy()))))
# print('\n\nA sample of train data decoded : \n\n', ''.join(tokenizer.decode((train_data[:50].numpy()))))

A sample of train data : 

 [18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56]


A sample of train data decoded : 

 First Citizen:
Before we proceed any further, hear


# 3 Model Utilities

In [122]:
# Hyperparameters
# batch_size = 16
# block_size = 32
# max_iters = 5000
# eval_interval = 100
# learning_rate = 1e-3
# eval_iters = 200
# n_embd = 64
# n_head = 4
# n_layer = 4
# dropout = 0.0

batch_size = 8
block_size = 256
max_iters = 5000
eval_interval = 100
learning_rate = 5e-4
eval_iters = 200
n_embd = 512
n_head = 8
n_layer = 8
dropout = 0.2

In [123]:
# data loading

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((batch_size,), maxval=len(data) - block_size, dtype=tf.int64)
    x = tf.gather(data, ix[:, tf.newaxis] + tf.range(block_size, dtype=tf.int64))
    y = tf.gather(data, ix[:, tf.newaxis] + tf.range(1, block_size + 1, dtype=tf.int64))
    return x, y

# Loss estimator during training

@tf.function
def estimate_loss():
    out = {}
    model.trainable = False
    for split in ['train', 'val']:
        losses = tf.TensorArray(tf.float32, size=eval_iters)
        for k in tf.range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses = losses.write(k, loss)
        out[split] = tf.reduce_mean(losses.stack())
    model.trainable = True
    return out

In [124]:
class Head(layers.Layer):
    """ One head of self-attention """

    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = layers.Dense(head_size, use_bias=False)
        self.query = layers.Dense(head_size, use_bias=False)
        self.value = layers.Dense(head_size, use_bias=False)
        self.tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        B, T, C = x.shape
#         print('x.shape : ', x.shape)
        k = self.key(x)
#         print('k.shape : ', k.shape)
        q = self.query(x)
#         print('q.shape : ', q.shape)
        
        wei = tf.matmul(q, tf.transpose(k, perm=[0, 2, 1])* C ** -0.5 ) 
#         print('wei.shape before : ',wei.shape) #(1, 1, 1)
        
        wei = tf.where(self.tril[:T,:T] == 0, float('-inf'), wei)
#         print('wei.shape tril : ',wei.shape) #(1, 1, 1)
        wei = tf.nn.softmax(wei, axis=-1)
        wei = self.dropout(wei)
        v = self.value(x)
#         print("wei.shape: ",wei.shape," v.shape: ", v.shape ) # wei.shape:  (1, 1, 128)  v.shape:  (1, 1, 32)
        out = tf.matmul(wei, v)
        return out

class MultiHeadAttention(layers.Layer):
    """ Multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = layers.Dense(n_embd)
        self.dropout = layers.Dropout(dropout)

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(layers.Layer):
    """ A simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super(FeedForward, self).__init__()
        self.net = tf.keras.Sequential([
            layers.Dense(4 * n_embd),
            layers.ReLU(),
            layers.Dense(n_embd),
            layers.Dropout(dropout),
        ])

    def call(self, x):
        return self.net(x)

class Block(layers.Layer):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super(Block, self).__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


# 4 Model Implementation

In [125]:
#Simple Bigram Model
class BigramLanguageModel(tf.keras.Model):

    def __init__(self):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = tf.keras.Sequential([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = layers.LayerNormalization()
        self.lm_head = layers.Dense(vocab_size)

    def call(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int64))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B * T, C))
            targets = tf.reshape(targets, (B * T,))
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            
            idx_cond = idx[:, -block_size:]

#             if(len(idx) >= block_size):                
#                 idx_cond = idx[:, -len(idx):]
#             else:
#                 idx_cond = idx[:, -block_size:]
            
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64)
            idx = tf.concat([idx, idx_next], axis=1)
        return idx

# 5 Model Training

## 5.1 Initialization 

In [126]:
model = BigramLanguageModel()

## 5.2 Training Loop 

In [127]:

optimizer = tf.keras.optimizers.Adam(learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))



step 0: train loss 5.0823, val loss 5.0635
step 100: train loss 3.0803, val loss 3.1245
step 200: train loss 2.5818, val loss 2.5875
step 300: train loss 2.5124, val loss 2.5094
step 400: train loss 2.4913, val loss 2.5042
step 500: train loss 2.4806, val loss 2.4934
step 600: train loss 2.4470, val loss 2.4617
step 700: train loss 2.4102, val loss 2.4402
step 800: train loss 2.3735, val loss 2.4219
step 900: train loss 2.2519, val loss 2.3091
step 1000: train loss 2.1470, val loss 2.2237
step 1100: train loss 2.0463, val loss 2.1494
step 1200: train loss 1.9664, val loss 2.0929
step 1300: train loss 1.8948, val loss 2.0086
step 1400: train loss 1.8251, val loss 1.9420
step 1500: train loss 1.7565, val loss 1.9182
step 1600: train loss 1.7130, val loss 1.8770
step 1700: train loss 1.6738, val loss 1.8653
step 1800: train loss 1.6370, val loss 1.8342
step 1900: train loss 1.5866, val loss 1.7766
step 2000: train loss 1.5670, val loss 1.7618
step 2100: train loss 1.5428, val loss 1.7419


## 5.3 Model Size 

In [128]:
# print the number of parameters in the model
# print(sum(tf.reduce_prod(p.shape) for p in model.trainable_variables) / 1e6, 'M parameters')

model.summary()

Model: "bigram_language_model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_30 (Embedding)    multiple                  33280     
                                                                 
 embedding_31 (Embedding)    multiple                  131072    
                                                                 
 sequential_143 (Sequential)  (8, 256, 512)            25206784  
                                                                 
 layer_normalization_271 (La  multiple                 1024      
 yerNormalization)                                               
                                                                 
 dense_3471 (Dense)          multiple                  33345     
                                                                 
Total params: 25,405,505
Trainable params: 25,405,505
Non-trainable params: 0
______________________________

# 6 Testing 

In [129]:
# generate from the model
context = tf.zeros((1, 1), dtype=tf.int64)
print(context.shape)


(1, 1)


In [130]:
generated_indices = model.generate(context, max_new_tokens=500)[0].numpy()
# print(decode(generated_indices.tolist()))


In [112]:
# print(tokenizer.decode(generated_indices.tolist()))
print(decode(generated_indices.tolist()))

 OC?f3E!3fb
