## 1 Downloading The Dataset

In [None]:
#Download the dataset to train on.

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [13]:
with open('../dataset.txt','r',encoding='utf-8') as data:
    dataset =data.read()
    dataset = [dataset]
print("Total number of characters in the dataset : ",len(dataset))

Total number of characters in the dataset :  1


In [14]:
print('First 100 characters : ', dataset[0][:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [18]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in dataset:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(len(word_freqs))        
print(word_freqs)

15057


In [32]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(len(alphabet))
print(alphabet)

65
['!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ċ', 'Ġ']


In [33]:
vocab = ["<|endoftext|>"] + alphabet.copy()

In [34]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [35]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [36]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

('F', 'i'): 308
('i', 'r'): 2700
('r', 's'): 2649
('s', 't'): 6610
('Ġ', 'C'): 693
('C', 'i'): 122


In [37]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('Ġ', 't') 23837


In [38]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [39]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [40]:
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [None]:
vocab_size = 10000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [45]:
print(merges)

{('Ġ', 't'): 'Ġt', ('h', 'e'): 'he', ('Ġ', 'a'): 'Ġa', ('o', 'u'): 'ou', ('Ġ', 's'): 'Ġs', ('Ġ', 'm'): 'Ġm', ('i', 'n'): 'in', ('Ġ', 'w'): 'Ġw', ('r', 'e'): 're', ('h', 'a'): 'ha', ('n', 'd'): 'nd', ('Ġt', 'he'): 'Ġthe', ('Ġ', 'b'): 'Ġb', ('i', 's'): 'is', ('o', 'r'): 'or', ('Ġ', 'f'): 'Ġf', ('e', 'r'): 'er', ('l', 'l'): 'll', ('i', 't'): 'it', ('o', 'n'): 'on', ('Ġ', 'd'): 'Ġd', ('Ġ', 'c'): 'Ġc', ('e', 's'): 'es', ('e', 'n'): 'en', ('Ġ', 'n'): 'Ġn', ('Ġ', 'l'): 'Ġl', ('Ġ', 'y'): 'Ġy', ('Ġt', 'h'): 'Ġth', ('a', 'r'): 'ar', ('Ġ', 'h'): 'Ġh', ('Ġ', 'o'): 'Ġo', ('Ġt', 'o'): 'Ġto', ('Ġy', 'ou'): 'Ġyou', ('Ġ', 'p'): 'Ġp'}


In [43]:
print(vocab)

['<|endoftext|>', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ċ', 'Ġ', 'Ġt']


### 2.1 Building Vocabulary 

In [None]:
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Tokenize using Byte Level BPE Tokenizer

# Get unique tokens of whole words
# unique_words = list(set(text.split()))
# unique_text = ' '.join(unique_words)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator([text], vocab_size=10000, min_frequency=3, special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]"])



In [None]:
# Get vocabulary from the trained tokenizer
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)

In [None]:
print('Number of unique tokens in the dataset =', vocab_size)
print('\nWhich are following : \n', ' '.join(sorted(vocab.keys())))

In [None]:
# chars = sorted(list(set(text.split())))
# vocab_size = len(chars)

# print('Number of unique characters in the dataset =',vocab_size)
# print('\nWhich are following : \n',' '.join(chars))

### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [None]:
# Encode the text
encoded_text = tokenizer.encode(text).ids

In [None]:
len(encoded_text)

### 2.3 Encoding The Dataset 

In [None]:
import tensorflow as tf

In [None]:
data = tf.convert_to_tensor(encoded_text, dtype=tf.int64)
print(data.shape, data.dtype)

In [None]:
print(data[:100])

### 2.4 Splitting the dataset (Train,Validate) 

In [None]:
limit = int(0.9 * len(data))

train_data = data[:limit]
val_data = data[limit:]

In [None]:
print(train_data[:100])

### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [None]:
block_size = 8
train_data[:block_size+1]

In [None]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for token in range(block_size):
    context = x[:token+1]
    target = y[token]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

#### Implementation 

In [None]:
batch_size = 4 #Number of independent input sequences to process in parallel for GPU
block_size = 8 #Maximum context length to make predictions

def get_batch(split):
    #generate small batches of input x & target y
    data = train_data if split == 'train' else val_data
    randPos = tf.dtypes.cast(tf.random.uniform((batch_size,), minval=0, maxval=(len(data)-block_size)), dtype=tf.int32)
    #print(randPos) # random positions in the whole datasets to grab block size chunks
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])
    
    return xbatch, ybatch

xbatch, ybatch = get_batch('train')

print('inputs:')
print(xbatch.shape)
print(xbatch)

print('targets:')
print(ybatch.shape)
print(ybatch)

In [None]:
for row in range(batch_size):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

## 3 Bigram Language Model 

### 3.1 Model Architecture  

In [None]:
import tensorflow as tf

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None, training=False):
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B*T, C))
            targets = tf.reshape(targets, (B*T,))
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
            )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # (B, C)
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            idx_next = tf.random.categorical(tf.math.log(probs), 1)  # (B, 1)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


### 3.2 Model Initialization  

In [None]:
model = BigramLanguageModel(vocab_size)


logits, loss = model(xbatch, ybatch)
print(logits.shape)
print(loss)

#### Untrained Model Results 

In [None]:
data = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(data)

In [None]:
decoded_text

#Total Garbage

### 3.3 Model Training 

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
batch_size = 4
num_steps = 10000

for step in range(num_steps):
    # Sample a batch of data
    xbatch, ybatch = get_batch('train')  # Assuming you have a function get_batch

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Compute gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print('Loss = ',loss.numpy())


In [None]:
#Somewhat stuctured results

#print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=500).numpy()[0].tolist())))

In [None]:
data = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(data)

In [None]:
decoded_text