<a href="https://colab.research.google.com/github/amozhdehi/nanoGPT/blob/main/nanoGPT_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Initialization

In [11]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import math
from tqdm.notebook import tqdm_notebook

# --- Batching -----------------------------------------------------------------
batch_size = 32          # number of sequences drawn per batch
block_size = 16          # context length: tokens per training sequence

# --- Training schedule --------------------------------------------------------
# NOTE: this cell used to assign `max_iters = 5000` and then overwrite it with
# 600000 a few lines later. Only the effective (last) value is kept so the
# setting is defined exactly once.
max_iters = 600000       # number of optimisation steps
eval_interval = 100      # run the train/val loss estimate every N steps
eval_iters = 200         # batches averaged per split in each loss estimate
log_interval = 1         # not referenced by the cells visible in this notebook
out_dir = 'out'          # not referenced by the cells visible in this notebook

# String label only; nothing in the visible cells reads it for device placement.
device = 'cuda' if tf.test.gpu_device_name() else 'cpu'

# --- Model size ---------------------------------------------------------------
embed_dim = 128          # width of token / position embeddings
num_heads = 8            # attention heads per block
num_layers = 4           # number of stacked attention blocks
dropout = 0.2            # dropout rate used by the attention layers and the model

# --- Optimiser ----------------------------------------------------------------
learning_rate = 6e-4
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95

# Fix the global TensorFlow seed so sampling and initialisation are repeatable.
tf.random.set_seed(1337)


Dataset grabbing and tokenization

In [12]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
tokens = sorted(list(set(corpus)))
tokens_size = len(tokens)
char_to_int = {char:i for i,char in enumerate(tokens)}
int_to_char = {i:char for i,char in enumerate(tokens)}
tokenize = lambda input: [char_to_int[char] for char in input]
detokenize = lambda input: ''.join([int_to_char[char] for char in input])
tokenized_corpus = tf.constant(tokenize(corpus), dtype=tf.int64)
tokenized_corpus = tf.constant(tokenize(corpus), dtype=tf.int64)
test_lenght = int(0.9 * len(tokenized_corpus))
train_data = tokenized_corpus[:test_lenght]
val_data = tokenized_corpus[test_lenght:]

--2023-04-29 05:54:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-04-29 05:54:10 (20.4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



Dataloading

In [13]:
def get_batch(split, batch_size):
    """Sample a random batch of input/target sequences.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.
        batch_size: number of sequences to draw.

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size). `y` is `x`
        shifted one position to the right, i.e. the next-token targets.
    """
    data = train_data if split == 'train' else val_data
    # Start offsets; the exclusive upper bound keeps `start + block_size` a
    # valid index so the shifted target window never runs past the data.
    ix = tf.random.uniform((batch_size,), minval=0, maxval=len(data) - block_size, dtype=tf.int64)
    # Build every window's indices at once and gather them in one op instead of
    # slicing the tensor 2 * batch_size times in a Python loop.
    positions = ix[:, tf.newaxis] + tf.range(block_size, dtype=tf.int64)[tf.newaxis, :]
    x = tf.gather(data, positions)
    y = tf.gather(data, positions + 1)
    return x, y
# Draw one sample batch. `train_data` / `val_data` are already built in the
# tokenization cell; the identical re-split that used to follow this line was
# redundant and has been removed.
xb, yb = get_batch('train', batch_size)

Single-Head attention

In [14]:
class Head(tf.keras.layers.Layer):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        head_embd_size: output width of the key, query and value projections.
    """

    def __init__(self, head_embd_size):
        super().__init__()
        self.head_embd_size = head_embd_size
        self.key = tf.keras.layers.Dense(head_embd_size, use_bias=False)
        self.query = tf.keras.layers.Dense(head_embd_size, use_bias=False)
        self.value = tf.keras.layers.Dense(head_embd_size, use_bias=False)
        # Lower-triangular matrix of ones: position t may attend to positions <= t.
        # Sized by the notebook-level `block_size`, the longest supported sequence.
        tril = tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0)
        self.tril = tf.Variable(tril, trainable=False)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x):
        """Apply causal self-attention to `x`.

        Args:
            x: rank-3 tensor; axis 1 is the sequence axis (length <= block_size).

        Returns:
            Tensor with the same leading two axes as `x` and last axis of size
            `head_embd_size`.
        """
        sequence_size = tf.shape(x)[1]
        k = self.key(x)
        q = self.query(x)
        # Scale by the key/query width. The previous code scaled by the width of
        # the *input* (x.shape[-1]), which is the full embedding size rather than
        # the per-head size whenever more than one head is used.
        wei = tf.matmul(q, k, transpose_b=True) * self.head_embd_size ** -0.5
        # Mask out future positions. The (T, T) mask and the scalar fill value
        # broadcast against (B, T, T), so no static batch size is required.
        causal_mask = self.tril[:sequence_size, :sequence_size] != 0
        neg_inf = tf.constant(float('-inf'), dtype=wei.dtype)
        wei = tf.where(causal_mask, wei, neg_inf)
        wei = tf.nn.softmax(wei, axis=-1)  # (B, T, T)
        wei = self.dropout(wei)
        v = self.value(x)
        out = tf.matmul(wei, v)
        return out

Multi-Head Attention

In [15]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Several `Head`s run in parallel, concatenated and linearly projected.

    Args:
        num_heads: number of parallel attention heads.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = None  # created in build(), once the input width is known
        self.dropout = tf.keras.layers.Dropout(dropout)

    def build(self, input_shape):
        # Project back to the width of the layer's own input so the result can
        # be added to that input in a residual connection. This used to read
        # the notebook-level `embed_dim`, which silently breaks any model built
        # with a different embedding width than the global one.
        self.proj = tf.keras.layers.Dense(input_shape[-1])
        super().build(input_shape)

    def call(self, x):
        """Return the projected concatenation of all heads' outputs for `x`."""
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.dropout(self.proj(out))
        return out

GeLU activation function

In [16]:
def gelu(x):
    """Tanh approximation of the GELU activation, applied element-wise to `x`."""
    inner = x + 0.044715 * tf.pow(x, 3.0)
    gate = 1.0 + tf.tanh(math.sqrt(2.0 / math.pi) * inner)
    return 0.5 * x * gate

A fully-connected network

In [17]:
class FullyConnected(tf.keras.layers.Layer):
    """Feed-forward sub-layer: widen to 4 * n_embd with GELU, project back, dropout.

    Args:
        n_embd: input and output width.
        dropout: rate of the trailing Dropout layer (defaults to 0.1).
    """

    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        hidden_units = 4 * n_embd
        expand = tf.keras.layers.Dense(hidden_units, activation=gelu)
        project = tf.keras.layers.Dense(n_embd)
        drop = tf.keras.layers.Dropout(dropout)
        self.ff = tf.keras.Sequential([expand, project, drop])

    def call(self, x):
        """Run `x` through the feed-forward stack."""
        transformed = self.ff(x)
        return transformed

Attention block

In [18]:
class AttentionBlock(tf.keras.layers.Layer):
    """Pre-norm residual block: self-attention followed by a feed-forward layer.

    Args:
        embed_dim: width of the block's input/output; must be divisible by
            `num_heads`.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        head_size, leftover = divmod(embed_dim, num_heads)
        assert leftover == 0
        self.sa = MultiHeadAttention(num_heads, head_size)
        self.fc = FullyConnected(embed_dim)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x):
        """Add the attention output, then the feed-forward output, to `x`."""
        attended = self.sa(self.norm1(x))
        x = x + attended
        fed_forward = self.fc(self.norm2(x))
        return x + fed_forward

Training

In [19]:
class BigramLanguageModel(tf.keras.Model):
    """Character-level language model built from stacked attention blocks.

    Args:
        tokens_size: vocabulary size (number of distinct token ids).
        embed_dim: width of the token and position embeddings.
        block_size: maximum context length; size of the position table.
        num_heads: attention heads per block.
        num_layers: number of stacked `AttentionBlock`s.
    """

    def __init__(self, tokens_size, embed_dim, block_size, num_heads, num_layers):
        super().__init__()
        # Remembered so generate() crops the context to what the position
        # table of *this* model supports, not to a notebook-level global.
        self.block_size = block_size
        self.token_embedding_table = tf.keras.layers.Embedding(tokens_size, embed_dim)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, embed_dim)
        self.blocks = tf.keras.Sequential([AttentionBlock(embed_dim, num_heads=num_heads) for _ in range(num_layers)])
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.lm_head = tf.keras.layers.Dense(tokens_size)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, idx, targets=None, training=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids, T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.
            training: forwarded to the dropout layer and the block stack;
                pass True during optimisation steps to enable dropout.

        Returns:
            (logits, loss). Without `targets`, logits is (B, T, tokens_size) and
            loss is None. With `targets`, logits is flattened to
            (B*T, tokens_size) and loss is the mean cross-entropy.
        """
        T = idx.shape[1]
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Position ids are integer indices (they were previously built as float32).
        pos_emb = self.position_embedding_table(tf.range(T))  # (T, C)
        x = self.dropout(tok_emb + pos_emb, training=training)  # (B, T, C)
        x = self.blocks(x, training=training)  # (B, T, C)
        x = self.norm(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, tokens_size)

        if targets is None:
            loss = None
        else:
            C = logits.shape[-1]
            # -1 lets TensorFlow infer B*T, so a static batch size is not needed.
            logits = tf.reshape(logits, (-1, C))
            targets = tf.reshape(targets, (-1,))
            loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids with the dtype of `idx`.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.block_size:]
            # get the predictions (inference mode: dropout disabled)
            logits, _ = self(idx_cond, training=False)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # tf.random.categorical expects unnormalised log-probabilities
            # (logits). Feeding it softmax *probabilities*, as before, samples
            # from softmax(probs) - a much flatter distribution than the model's.
            idx_next = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)  # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, idx.dtype)], axis=1)  # (B, T+1)
        return idx

# Instantiate the network and its optimiser from the notebook-level settings.
model = BigramLanguageModel(
    tokens_size=tokens_size,
    embed_dim=embed_dim,
    block_size=block_size,
    num_heads=num_heads,
    num_layers=num_layers,
)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=beta1,
    beta_2=beta2,
    weight_decay=weight_decay,
)
# `step` replaces the former loop variable `iter`, which shadowed the builtin.
for step in tqdm_notebook(range(max_iters)):

    # Periodically estimate the loss on both splits by averaging `eval_iters`
    # random batches, with dropout disabled.
    if step % eval_interval == 0 or step == max_iters - 1:
        out = {}
        for split in ['train', 'val']:
            losses = []
            for k in range(eval_iters):
                X, Y = get_batch(split, batch_size)
                _, eval_loss = model(X, Y, training=False)
                losses.append(eval_loss)
            out[split] = float(tf.reduce_mean(losses))
        print(f"Step {step}: Train loss {out['train']:.4f}, Validation loss {out['val']:.4f}")

    # One optimisation step. The forward pass must run *inside* the tape so the
    # tape can record it; previously the tape was empty and the loss came from
    # the last evaluation batch, so every gradient was None.
    xb, yb = get_batch('train', batch_size)
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb, training=True)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
# Sample 2000 new tokens, starting from a context holding the single token id 0,
# and print the decoded text of the first (only) sequence in the batch.
context = tf.zeros((1, 1), dtype=tf.int32)
generated = model.generate(context, max_new_tokens=2000)
sampled_ids = generated[0].numpy().tolist()
print(detokenize(sampled_ids))

  0%|          | 0/600000 [00:00<?, ?it/s]

KeyboardInterrupt: ignored