## 1 Downloading The Dataset

In [None]:
#Download the dataset to train on.

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
with open('../dataset.txt','r',encoding='utf-8') as data:
    dataset =data.read()
    dataset = [dataset]
print("Total number of characters in the dataset : ",len(dataset))

Total number of characters in the dataset :  1


In [2]:
print('First 100 characters : ', dataset[0][:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from collections import defaultdict

word_freqs = defaultdict(int)

for text in dataset:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(len(word_freqs))        
print(word_freqs)

15057


In [16]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(len(alphabet))
print(alphabet)

65
['!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ċ', 'Ġ']


In [17]:
vocab = ["<|endoftext|>"] + alphabet.copy()

In [18]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [19]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [20]:
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

('F', 'i'): 308
('i', 'r'): 2700
('r', 's'): 2649
('s', 't'): 6610
('Ġ', 'C'): 693
('C', 'i'): 122


In [21]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('Ġ', 't') 23837


In [22]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [23]:
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [24]:
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [14]:
vocab_size = 10000

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [15]:
print(merges)



In [25]:
print(vocab)

['<|endoftext|>', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ċ', 'Ġ', 'Ġt']


### 2.1 Building Vocabulary 

In [29]:
from tokenizers import ByteLevelBPETokenizer

In [30]:
# Tokenize using Byte Level BPE Tokenizer

# Get unique tokens of whole words
# unique_words = list(set(text.split()))
# unique_text = ' '.join(unique_words)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator([text], vocab_size=10000, min_frequency=3, special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]"])



In [31]:
# Get vocabulary from the trained tokenizer
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)

In [32]:
print('Number of unique tokens in the dataset =', vocab_size)
print('\nWhich are following : \n', ' '.join(sorted(vocab.keys())))

Number of unique tokens in the dataset = 9787

Which are following : 


In [None]:
# chars = sorted(list(set(text.split())))
# vocab_size = len(chars)

# print('Number of unique characters in the dataset =',vocab_size)
# print('\nWhich are following : \n',' '.join(chars))

### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [33]:
# Encode the text
encoded_text = tokenizer.encode(text).ids

In [34]:
len(encoded_text)

312505

### 2.3 Encoding The Dataset 

In [35]:
import tensorflow as tf

In [36]:
data = tf.convert_to_tensor(encoded_text, dtype=tf.int64)
print(data.shape, data.dtype)

(312505,) <dtype: 'int64'>


In [37]:
print(data[:100])

tf.Tensor(
[ 675 1200   29  202 2346  335 2751  806 2306   15  678  321  620   17
  202  202 1235   29  202 2542   15  620   17  202  202  675 1200   29
  202  569  422  399 4196 1502  291  968  531  291 7922   34  202  202
 1235   29  202 3666 5591   17 4196   17  202  202  675 1200   29  202
  675   15  292  508 3996 1421  328 3757 1945  291  271 1196   17  202
  202 1235   29  202  671  508  670   15  335  508  670   17  202  202
  675 1200   29  202  948  534 1318  364   15  300  335  459  359 3148
  463  416], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [38]:
limit = int(0.9 * len(data))

train_data = data[:limit]
val_data = data[limit:]

In [39]:
print(train_data[:100])

tf.Tensor(
[ 675 1200   29  202 2346  335 2751  806 2306   15  678  321  620   17
  202  202 1235   29  202 2542   15  620   17  202  202  675 1200   29
  202  569  422  399 4196 1502  291  968  531  291 7922   34  202  202
 1235   29  202 3666 5591   17 4196   17  202  202  675 1200   29  202
  675   15  292  508 3996 1421  328 3757 1945  291  271 1196   17  202
  202 1235   29  202  671  508  670   15  335  508  670   17  202  202
  675 1200   29  202  948  534 1318  364   15  300  335  459  359 3148
  463  416], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [40]:
block_size = 8
train_data[:block_size+1]

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([ 675, 1200,   29,  202, 2346,  335, 2751,  806, 2306], dtype=int64)>

In [41]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for token in range(block_size):
    context = x[:token+1]
    target = y[token]
    print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [675]   target is:  1200
for input:  [675, 1200]   target is:  29
for input:  [675, 1200, 29]   target is:  202
for input:  [675, 1200, 29, 202]   target is:  2346
for input:  [675, 1200, 29, 202, 2346]   target is:  335
for input:  [675, 1200, 29, 202, 2346, 335]   target is:  2751
for input:  [675, 1200, 29, 202, 2346, 335, 2751]   target is:  806
for input:  [675, 1200, 29, 202, 2346, 335, 2751, 806]   target is:  2306


#### Implementation 

In [42]:
batch_size = 4 #Number of independent input sequences to process in parallel for GPU
block_size = 8 #Maximum context length to make predictions

def get_batch(split):
    #generate small batches of input x & target y
    data = train_data if split == 'train' else val_data
    randPos = tf.dtypes.cast(tf.random.uniform((batch_size,), minval=0, maxval=(len(data)-block_size)), dtype=tf.int32)
    #print(randPos) # random positions in the whole datasets to grab block size chunks
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])
    
    return xbatch, ybatch

xbatch, ybatch = get_batch('train')

print('inputs:')
print(xbatch.shape)
print(xbatch)

print('targets:')
print(ybatch.shape)
print(ybatch)

inputs:
(4, 8)
tf.Tensor(
[[  15  202  842 1346  300  918  342 6891]
 [  15  271 5365  202 2498   15  310 1219]
 [9637  311 6045   10  930   30  202 2299]
 [ 202  652  343 2476  388  720 2268  331]], shape=(4, 8), dtype=int64)
targets:
(4, 8)
tf.Tensor(
[[ 202  842 1346  300  918  342 6891   30]
 [ 271 5365  202 2498   15  310 1219   15]
 [ 311 6045   10  930   30  202 2299 4433]
 [ 652  343 2476  388  720 2268  331  271]], shape=(4, 8), dtype=int64)


In [43]:
for row in range(batch_size):
    for token in range(block_size):
        context = xbatch[row, :token+1]
        target = ybatch[row, token]
        print('for input: ',context.numpy().tolist(),'  target is: ',target.numpy().tolist())

for input:  [15]   target is:  202
for input:  [15, 202]   target is:  842
for input:  [15, 202, 842]   target is:  1346
for input:  [15, 202, 842, 1346]   target is:  300
for input:  [15, 202, 842, 1346, 300]   target is:  918
for input:  [15, 202, 842, 1346, 300, 918]   target is:  342
for input:  [15, 202, 842, 1346, 300, 918, 342]   target is:  6891
for input:  [15, 202, 842, 1346, 300, 918, 342, 6891]   target is:  30
for input:  [15]   target is:  271
for input:  [15, 271]   target is:  5365
for input:  [15, 271, 5365]   target is:  202
for input:  [15, 271, 5365, 202]   target is:  2498
for input:  [15, 271, 5365, 202, 2498]   target is:  15
for input:  [15, 271, 5365, 202, 2498, 15]   target is:  310
for input:  [15, 271, 5365, 202, 2498, 15, 310]   target is:  1219
for input:  [15, 271, 5365, 202, 2498, 15, 310, 1219]   target is:  15
for input:  [9637]   target is:  311
for input:  [9637, 311]   target is:  6045
for input:  [9637, 311, 6045]   target is:  10
for input:  [9637

## 3 Bigram Language Model 

### 3.1 Model Architecture  

In [44]:
import tensorflow as tf

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None, training=False):
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = tf.reshape(logits, (B*T, C))
            targets = tf.reshape(targets, (B*T,))
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
            )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # (B, C)
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            idx_next = tf.random.categorical(tf.math.log(probs), 1)  # (B, 1)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


### 3.2 Model Initialization  

In [45]:
model = BigramLanguageModel(vocab_size)


logits, loss = model(xbatch, ybatch)
print(logits.shape)
print(loss)

(32, 9787)
tf.Tensor(9.192961, shape=(), dtype=float32)


#### Untrained Model Results 

In [46]:
data = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(data)

In [47]:
decoded_text

#Total Garbage

'ciansbraincertain government� old abundance allegianceorgecalled nest belAND morrow patiently Lartius first urge awads ascend drew disloyal ByEvenlliments prot rapier mistress canker fain undergo roaringaul hail divines Your� acquainted concludedokeso alliance OFblind tailor kindredTwenty cellMeantimeuddy commission callwhy Will multitude oft slanders hor liberal wondering lent nothingGlHappCfan tatter ROS fighting breaksRA knownastian considerpp begot loath acqupiring drinks hornclvost remedies bottom save surfeit note bladehisCommke centreNone kissing hourly commonsOr'

### 3.3 Model Training 

In [48]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [49]:
batch_size = 4
num_steps = 10000

for step in range(num_steps):
    # Sample a batch of data
    xbatch, ybatch = get_batch('train')  # Assuming you have a function get_batch

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Compute gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print('Loss = ',loss.numpy())


Loss =  6.96172


In [None]:
#Somewhat stuctured results

#print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=500).numpy()[0].tolist())))

In [None]:
data = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(data)

In [None]:
decoded_text