## 1 Downloading The Dataset

In [None]:
# Download the dataset to train on (saved to the path the next cell reads from).

#!wget -O ../dataset.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
# Read the whole corpus into memory as one string.
with open('../dataset.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print("Total number of characters in the dataset : ", len(text))

Total number of characters in the dataset :  1115394


In [2]:
print('First 100 characters : ', text[:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

### 2.1 Building Vocabulary 

In [3]:
from tokenizers import ByteLevelBPETokenizer

In [7]:
# Train a byte-level BPE tokenizer on the full corpus.
# `vocab_size` is the requested vocabulary size; the number of tokens actually
# learned is read back from the tokenizer in the next cell.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    [text],
    vocab_size=10000,
    min_frequency=3,
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]"],
)



In [8]:
# Get vocabulary from the trained tokenizer
# The keys of `vocab` are the token strings (they are sorted and printed in the next cell).
vocab = tokenizer.get_vocab()
# Number of entries in the trained vocabulary; later passed to the model constructor.
vocab_size = len(vocab)

In [9]:
print('Number of unique tokens in the dataset =', vocab_size)
print('\nWhich are following : \n', ' '.join(sorted(vocab.keys())))

Number of unique tokens in the dataset = 9787

Which are following : 


In [10]:
# chars = sorted(list(set(text.split())))
# vocab_size = len(chars)

# print('Number of unique characters in the dataset =',vocab_size)
# print('\nWhich are following : \n',' '.join(chars))

### 2.2 Using the Tokenizer (BPE Encoder, Decoder)

In [11]:
# Encode the text
encoded_text = tokenizer.encode(text).ids

In [12]:
len(encoded_text)

312505

### 2.3 Encoding The Dataset 

In [13]:
import tensorflow as tf

In [14]:
data = tf.convert_to_tensor(encoded_text, dtype=tf.int64)
print(data.shape, data.dtype)

(312505,) <dtype: 'int64'>


In [15]:
print(data[:100])

tf.Tensor(
[ 675 1200   29  202 2346  335 2751  806 2306   15  678  321  620   17
  202  202 1235   29  202 2542   15  620   17  202  202  675 1200   29
  202  569  422  399 4196 1502  291  968  531  291 7922   34  202  202
 1235   29  202 3666 5591   17 4196   17  202  202  675 1200   29  202
  675   15  292  508 3996 1421  328 3757 1945  291  271 1196   17  202
  202 1235   29  202  671  508  670   15  335  508  670   17  202  202
  675 1200   29  202  948  534 1318  364   15  300  335  459  359 3148
  463  416], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [16]:
# Positional 90/10 split: the first 90% of the token stream is used for training
# and the remaining tail is held out for validation (no shuffling).
limit = int(0.9 * len(data))
train_data, val_data = data[:limit], data[limit:]

In [17]:
print(train_data[:100])

tf.Tensor(
[ 675 1200   29  202 2346  335 2751  806 2306   15  678  321  620   17
  202  202 1235   29  202 2542   15  620   17  202  202  675 1200   29
  202  569  422  399 4196 1502  291  968  531  291 7922   34  202  202
 1235   29  202 3666 5591   17 4196   17  202  202  675 1200   29  202
  675   15  292  508 3996 1421  328 3757 1945  291  271 1196   17  202
  202 1235   29  202  671  508  670   15  335  508  670   17  202  202
  675 1200   29  202  948  534 1318  364   15  300  335  459  359 3148
  463  416], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [18]:
block_size = 8
train_data[:block_size+1]

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([ 675, 1200,   29,  202, 2346,  335, 2751,  806, 2306], dtype=int64)>

In [19]:
# Inputs are the first `block_size` tokens; targets are the same window shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x is one training example, labelled with the token that follows it.
x_tokens = x.numpy().tolist()
y_tokens = y.numpy().tolist()
for end in range(1, block_size + 1):
    print('for input: ', x_tokens[:end], '  target is: ', y_tokens[end - 1])

for input:  [675]   target is:  1200
for input:  [675, 1200]   target is:  29
for input:  [675, 1200, 29]   target is:  202
for input:  [675, 1200, 29, 202]   target is:  2346
for input:  [675, 1200, 29, 202, 2346]   target is:  335
for input:  [675, 1200, 29, 202, 2346, 335]   target is:  2751
for input:  [675, 1200, 29, 202, 2346, 335, 2751]   target is:  806
for input:  [675, 1200, 29, 202, 2346, 335, 2751, 806]   target is:  2306


#### Implementation 

In [20]:
batch_size = 4 #Number of independent input sequences to process in parallel for GPU
block_size = 8 #Maximum context length to make predictions

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Reads the module-level `batch_size` and `block_size` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(xbatch, ybatch)` of tensors with shape (batch_size, block_size).
        `ybatch` is `xbatch` shifted one position to the right in the token
        stream, so `ybatch[b, t]` is the token following `xbatch[b, :t+1]`.

    Raises:
        ValueError: if the selected split is too short to hold one window
            of `block_size` inputs plus the final target token.
    """
    source = train_data if split == 'train' else val_data

    # A start position needs block_size tokens for x plus one more for the last y.
    num_starts = len(source) - block_size
    if num_starts <= 0:
        raise ValueError(
            f"split {split!r} has {len(source)} tokens; "
            f"need at least {block_size + 1} for block_size={block_size}"
        )

    # Draw integer start offsets in [0, num_starts) directly instead of
    # sampling floats and truncating them.
    starts = tf.random.uniform(
        (batch_size,), minval=0, maxval=num_starts, dtype=tf.int32
    )

    # (batch_size, 1) + (1, block_size) broadcasts to a grid of positions,
    # so each batch is built with a single gather instead of a Python loop.
    positions = starts[:, tf.newaxis] + tf.range(block_size)[tf.newaxis, :]
    xbatch = tf.gather(source, positions)
    ybatch = tf.gather(source, positions + 1)

    return xbatch, ybatch

# Draw one training batch and show both halves of it.
xbatch, ybatch = get_batch('train')

for heading, batch in (('inputs:', xbatch), ('targets:', ybatch)):
    print(heading)
    print(batch.shape)
    print(batch)

inputs:
(4, 8)
tf.Tensor(
[[ 202  202 1030 1782   29  202 3406  665]
 [ 415   17  295 1632  415   15 2013   15]
 [6996   34  202   37 3420  928  422 7580]
 [  29  202   50 1169   15 1169   15 2183]], shape=(4, 8), dtype=int64)
targets:
(4, 8)
tf.Tensor(
[[ 202 1030 1782   29  202 3406  665  865]
 [  17  295 1632  415   15 2013   15  202]
 [  34  202   37 3420  928  422 7580   15]
 [ 202   50 1169   15 1169   15 2183 3399]], shape=(4, 8), dtype=int64)


In [21]:
# Spell out every (context, target) pair contained in the sampled batch.
x_rows = xbatch.numpy().tolist()
y_rows = ybatch.numpy().tolist()
for row in range(batch_size):
    for end in range(1, block_size + 1):
        print('for input: ', x_rows[row][:end], '  target is: ', y_rows[row][end - 1])

for input:  [202]   target is:  202
for input:  [202, 202]   target is:  1030
for input:  [202, 202, 1030]   target is:  1782
for input:  [202, 202, 1030, 1782]   target is:  29
for input:  [202, 202, 1030, 1782, 29]   target is:  202
for input:  [202, 202, 1030, 1782, 29, 202]   target is:  3406
for input:  [202, 202, 1030, 1782, 29, 202, 3406]   target is:  665
for input:  [202, 202, 1030, 1782, 29, 202, 3406, 665]   target is:  865
for input:  [415]   target is:  17
for input:  [415, 17]   target is:  295
for input:  [415, 17, 295]   target is:  1632
for input:  [415, 17, 295, 1632]   target is:  415
for input:  [415, 17, 295, 1632, 415]   target is:  15
for input:  [415, 17, 295, 1632, 415, 15]   target is:  2013
for input:  [415, 17, 295, 1632, 415, 15, 2013]   target is:  15
for input:  [415, 17, 295, 1632, 415, 15, 2013, 15]   target is:  202
for input:  [6996]   target is:  34
for input:  [6996, 34]   target is:  202
for input:  [6996, 34, 202]   target is:  37
for input:  [699

## 3 Bigram Language Model 

### 3.1 Model Architecture  

In [22]:
import tensorflow as tf

class BigramLanguageModel(tf.keras.Model):
    """Bigram language model: the current token alone determines the next-token logits.

    The only layer is an embedding table of shape (vocab_size, vocab_size), so
    looking up token id `i` yields the unnormalized scores for the token after `i`.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; used both as the number
                of rows in the table and as the width of each row of logits.
        """
        super().__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None, training=False):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).
            training: accepted for Keras compatibility; not used by this model.

        Returns:
            `(logits, loss)`. Without targets, logits have shape (B, T, C) and
            loss is None. With targets, logits are flattened to (B*T, C) and
            loss is the mean sparse softmax cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # Flatten batch and time so every position is one classification example.
        # Using -1 lets TensorFlow infer B*T, so this also works when the batch
        # or time dimension is not statically known.
        num_classes = logits.shape[-1]
        logits = tf.reshape(logits, (-1, num_classes))
        targets = tf.reshape(targets, (-1,))
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Tensor of shape (B, T + max_new_tokens) with the same dtype as `idx`.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            last_logits = logits[:, -1, :]  # (B, C): only the final position matters
            # tf.random.categorical expects unnormalized log-probabilities, so the
            # logits are passed as-is; a softmax followed by a log is redundant and
            # turns underflowed probabilities into -inf.
            idx_next = tf.random.categorical(last_logits, num_samples=1)  # (B, 1)
            # The sampler returns int64 by default; match the prompt's dtype so the
            # concat also works for int32 prompts.
            idx_next = tf.cast(idx_next, idx.dtype)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


### 3.2 Model Initialization  

In [23]:
# Build the model with one row of next-token logits per vocabulary entry.
model = BigramLanguageModel(vocab_size)


# Forward pass on the batch sampled earlier. Because targets are supplied, the
# returned logits are flattened to (batch_size*block_size, vocab_size).
logits, loss = model(xbatch, ybatch)
print(logits.shape)
# Sanity check: predicting every token with equal probability would give a loss of
# ln(vocab_size) (ln(9787) is about 9.19), so an untrained model should land near that.
print(loss)

(32, 9787)
tf.Tensor(9.19186, shape=(), dtype=float32)


#### Untrained Model Results 

In [24]:
# Sample 100 new tokens from the untrained model, starting from the single token id 0.
# NOTE(review): id 0 is presumably "[PAD]", the first special token registered with
# the tokenizer -- confirm with tokenizer.id_to_token(0).
# The result gets its own name so the dataset tensor `data` is not overwritten.
generated_ids = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(generated_ids)

In [25]:
decoded_text

#Total Garbage

'ozapable Haarden GreenSpseeessed noon ignawful trembles smileial Was car entreaties crad sepulark VI little smallest wasted nap wit unseen PrBelikefatherkind innocency slipZABETHStri B remediesBehold occup horseshi alter w masqu poundsians draws heigh bids please rates Then hon intentsMasterAumerleBel tong tink remedthough drinks horncl just Bret specialward whole greyh perish Amazon requiredothe cheque sideence� freedomBeing Overdone abilityerbyLAdis turning shieldable run suollow sessisomeTrue shQUEEN dryBAGOnow kinsmen'

### 3.3 Model Training 

In [26]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [27]:
batch_size = 4
num_steps = 10000

for step in range(num_steps):
    # Draw a fresh random batch from the training split
    xbatch, ybatch = get_batch('train')

    # Record the forward pass so the loss can be differentiated
    with tf.GradientTape() as tape:
        logits, loss = model(xbatch, ybatch)

    # Backpropagate and take one optimizer step
    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

# Loss of the last sampled batch only
print('Loss = ',loss.numpy())


Loss =  6.9134645


In [28]:
#Somewhat structured results

#print(''.join(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=500).numpy()[0].tolist())))

In [29]:
# Sample 100 new tokens from the trained model, starting from the single token id 0.
# The result gets its own name so the dataset tensor `data` is not overwritten.
generated_ids = model.generate(idx=tf.zeros((1, 1), dtype=tf.int64), max_new_tokens=100).numpy()[0].tolist()
decoded_text = tokenizer.decode(generated_ids)

In [30]:
decoded_text

'etru bo tonLEY purposes effeminateayed soundifter Milong knowingContressed dust notwithopegraELAND Hastoint membTheseouches immortalasonable soilVERpes deck wishesomachConfessurstoPetruchioForgiveshi turns my boy whoresonGaThough confer deserve amb1 unjustlyURcent budge Cle judgment store forms squthw imag spices swords supplyNurse condemnedfound labour turned tiedushaching characousepoorhest� orchardSupp mad tiger Gremio crab Pompeyappare bigger every removetw Tewksbury victory enjoys afternoonWIC stabb myselfusualdist promisedcy revive speechless'