## 1 Downloading The Dataset

In [13]:
#Downloading dataset to train on.

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-18 20:37:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt.1'

     0K .......... .......... .......... .......... ..........  4%  368K 3s
    50K .......... .......... .......... .......... ..........  9%  769K 2s
   100K .......... .......... .......... .......... .......... 13%  753K 2s
   150K .......... .......... .......... .......... .......... 18%  691K 2s
   200K .......... .......... .......... .......... .......... 22%  813K 1s
   250K .......... .......... .......... .......... .......... 27%  738K 1s
   300K .......... .......... .......... .......... .......... 32%  749K 1s
   350K .......... 

In [1]:
# Read the whole corpus into memory as a single string.
# The handle is named `corpus_file` so that the name `data` is left free for
# the encoded token tensor that is built later in the notebook.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print("Total number of characters in the dataset : ",len(text))

Total number of characters in the dataset :  1115394


In [2]:
# Peek at the opening of the corpus as a sanity check on the load.
print('First 100 characters : ', text[0:100])

First 100 characters :  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## 2 Processing The Dataset 

### 2.1 Building Vocabulary 

In [51]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print('Number of unique characters in the dataset =', vocab_size)
print('\nWhich are following : \n', ''.join(chars))

Number of unique characters in the dataset = 65

Which are following : 
 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


### 2.2 Building Tokenizer  (Custom Encoder, Decoder)

In [52]:
# Map individual characters to integers and back.
# A character's id is its position in the sorted vocabulary `chars`.

encoder = dict(zip(chars, range(len(chars))))
decoder = dict(enumerate(chars))


def encode(string):
    """Return the list of integer ids for the characters of `string`."""
    return [encoder[char] for char in string]


def decode(integers):
    """Return the list of characters for the ids in `integers` (join them to get a string)."""
    return [decoder[i] for i in integers]

In [53]:
# Round-trip a sample string through the tokenizer.
zk = encode('Zain Khalid')
print('Encoded Zain Khalid \n= ', zk)

zk_text = ''.join(decode(zk))
print('\nDecoded ', zk, '\n=', zk_text)

Encoded Zain Khalid 
=  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42]

Decoded  [38, 39, 47, 52, 1, 23, 46, 39, 50, 47, 42] 
= Zain Khalid


### 2.3 Encoding The Dataset 

In [54]:
import tensorflow as tf

In [55]:
# Encode the full corpus and hold it as a 1-D int64 tensor of token ids.
data = tf.convert_to_tensor(
    value=encode(text),
    dtype=tf.int64,
)
print(data.shape, data.dtype)

(1115394,) <dtype: 'int64'>


In [56]:
# First 100 token ids of the encoded corpus.
print(data[0:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.4 Splitting the dataset (Train,Validate) 

In [57]:
# Train on the first 90% of the tokens; hold out the remaining tail for validation.
limit = int(len(data) * 0.9)

train_data, val_data = data[:limit], data[limit:]

In [58]:
# First 100 token ids of the training split.
print(train_data[0:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int64)


### 2.5 Chunking Dataset in Blocks (x,y) (To Train Transformer) 

#### Concept 

In [59]:
# A chunk of block_size + 1 tokens yields block_size (context, target) examples.
block_size = 8
train_data[0:block_size + 1]

<tf.Tensor: shape=(9,), dtype=int64, numpy=array([18, 47, 56, 57, 58,  1, 15, 47, 58], dtype=int64)>

In [60]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x, y = train_data[:block_size], train_data[1:block_size+1]

# Every prefix of x is a context whose target is the token that follows it.
for token in range(block_size):
    context, target = x[:token+1], y[token]
    print('for input: ', context.numpy().tolist(), '  target is: ', target.numpy().tolist())

for input:  [18]   target is:  47
for input:  [18, 47]   target is:  56
for input:  [18, 47, 56]   target is:  57
for input:  [18, 47, 56, 57]   target is:  58
for input:  [18, 47, 56, 57, 58]   target is:  1
for input:  [18, 47, 56, 57, 58, 1]   target is:  15
for input:  [18, 47, 56, 57, 58, 1, 15]   target is:  47
for input:  [18, 47, 56, 57, 58, 1, 15, 47]   target is:  58


#### Implementation 

In [61]:
# batch_size: number of independent input sequences to process in parallel for GPU.
# block_size: maximum context length to make predictions.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of (input, target) blocks from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (xbatch, ybatch): two tensors of shape (batch_size, block_size). Row r of
        ybatch is row r of xbatch shifted one position forward in the source
        sequence, so ybatch[r, t] is the token that follows xbatch[r, :t+1].

    Raises:
        ValueError: if the selected split is too short to hold one block plus
            its shifted target.
    """
    data = train_data if split == 'train' else val_data

    # Exclusive upper bound for start offsets: the target slice reads
    # block_size + 1 tokens starting at the offset.
    max_start = len(data) - block_size
    if max_start <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} tokens; need more than block_size={block_size}"
        )

    # Draw integer offsets directly instead of drawing float32 and casting:
    # a float draw can round up to the exclusive bound (making the target slice
    # one token short) and float32 cannot represent every offset of a large corpus.
    randPos = tf.random.uniform((batch_size,), minval=0, maxval=max_start, dtype=tf.int32)
    print(randPos) # random positions in the whole datasets to grab block size chunks
    xbatch = tf.stack([data[i:i+block_size] for i in randPos])
    ybatch = tf.stack([data[i+1:i+block_size+1] for i in randPos])

    return xbatch, ybatch

# Draw one training batch and show the inputs alongside their shifted targets.
xbatch, ybatch = get_batch('train')

for label, batch in (('inputs:', xbatch), ('targets:', ybatch)):
    print(label)
    print(batch.shape)
    print(batch)

tf.Tensor([454602 558724 397180 598054], shape=(4,), dtype=int32)
inputs:
(4, 8)
tf.Tensor(
[[59 57 43 11  1 46 43 39]
 [49 52 53 61  5 57 58  1]
 [50 53 60 43  6  1 40 59]
 [61 53 53 42 41 53 41 49]], shape=(4, 8), dtype=int64)
targets:
(4, 8)
tf.Tensor(
[[57 43 11  1 46 43 39 56]
 [52 53 61  5 57 58  1 51]
 [53 60 43  6  1 40 59 58]
 [53 53 42 41 53 41 49  1]], shape=(4, 8), dtype=int64)


In [62]:
# Spell out every (context, target) pair contained in the batch, one row at a time.
for row in range(batch_size):
    for token in range(block_size):
        context, target = xbatch[row, :token+1], ybatch[row, token]
        print('for input: ', context.numpy().tolist(), '  target is: ', target.numpy().tolist())

for input:  [59]   target is:  57
for input:  [59, 57]   target is:  43
for input:  [59, 57, 43]   target is:  11
for input:  [59, 57, 43, 11]   target is:  1
for input:  [59, 57, 43, 11, 1]   target is:  46
for input:  [59, 57, 43, 11, 1, 46]   target is:  43
for input:  [59, 57, 43, 11, 1, 46, 43]   target is:  39
for input:  [59, 57, 43, 11, 1, 46, 43, 39]   target is:  56
for input:  [49]   target is:  52
for input:  [49, 52]   target is:  53
for input:  [49, 52, 53]   target is:  61
for input:  [49, 52, 53, 61]   target is:  5
for input:  [49, 52, 53, 61, 5]   target is:  57
for input:  [49, 52, 53, 61, 5, 57]   target is:  58
for input:  [49, 52, 53, 61, 5, 57, 58]   target is:  1
for input:  [49, 52, 53, 61, 5, 57, 58, 1]   target is:  51
for input:  [50]   target is:  53
for input:  [50, 53]   target is:  60
for input:  [50, 53, 60]   target is:  43
for input:  [50, 53, 60, 43]   target is:  6
for input:  [50, 53, 60, 43, 6]   target is:  1
for input:  [50, 53, 60, 43, 6, 1]   

## 3 Bigram Language Model 

In [63]:
import tensorflow as tf

class BigramLanguageModel(tf.keras.Model):
    """Bigram model: the embedding row looked up for a token is used directly as next-token logits."""

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) embedding table that holds the logits.

        Args:
            vocab_size: number of distinct tokens; used as both the number of
                embedding rows and the embedding width.
        """
        super().__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None, training=False):
        """Compute next-token logits and, when targets are given, the mean cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.
            training: accepted for the Keras call signature; not used here.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is None.
            With targets, logits is flattened to (B*T, C) and loss is a scalar.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # Flatten with -1 rather than a static B*T product so the reshape also
        # works when the batch or time dimension is not statically known.
        C = logits.shape[-1]
        logits = tf.reshape(logits, (-1, C))
        targets = tf.reshape(targets, (-1,))
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        )

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids to continue from.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: idx with the sampled ids appended.
        """
        idx = tf.convert_to_tensor(idx)
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # (B, C): only the last position predicts the next token
            # tf.random.categorical takes unnormalized log-probabilities, so the
            # logits are passed as-is; softmax followed by log describes the same
            # distribution but can underflow to -inf. Sampling in idx's dtype
            # keeps the concat below valid for int32 as well as int64 inputs.
            idx_next = tf.random.categorical(logits, num_samples=1, dtype=idx.dtype)  # (B, 1)
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


# Build the model and run one forward pass on the sample batch.
model = BigramLanguageModel(vocab_size)
logits, loss = model(xbatch, ybatch)

# With targets supplied, the logits come back flattened to (B*T, C).
print(logits.shape, loss, sep='\n')


(32, 65)
tf.Tensor(4.185641, shape=(), dtype=float32)


In [71]:
# Sample 100 new tokens starting from a single token with id 0, then decode them to text.
start_idx = tf.zeros((1, 1), dtype=tf.int64)
generated = model.generate(idx=start_idx, max_new_tokens=100)
print(''.join(decode(generated.numpy()[0].tolist())))


iIOjlt!EciPA!BLMaNG-BcPA3hHA:FlSmLukkW:'T$Yes3FWszmHS;xQMWoS&PmD&?rWMEBlXPFljZHNdzll-GQmmzPLKw!qj:EM
