In [21]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import mmap
import random 

block_size = 64
batch_size = 32
max_iters = 1500
learning_rate = 5e-5
eval_iters = 100
n_embd = 384
n_layer = 1
n_head = 1
dropout = 0.2
tf.random.set_seed(2002)

In [22]:
with open("/Users/buketcalp/vocab.txt", 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join([int_to_string[i] for i in l])

def get_random_chunk(split):
    filename = "/Users/buketcalp/output_train.txt" if split == "train" else "/Users/buketcalp/output_val.txt"
    with open(filename, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            start_pos = random.randint(0, file_size - block_size * batch_size)
            mm.seek(start_pos)
            block = mm.read(block_size * batch_size)
            decode_block = block.decode("utf-8", errors="ignore").replace("\r", "")
            data = np.array([string_to_int[c] for c in decode_block], dtype=np.int32)
    return data

def get_batch(split, batch_size=batch_size):
    data = get_random_chunk(split)
    min_index = max(len(data) - block_size, 0)
    ix = np.random.randint(min_index, size=(batch_size,)) if min_index > 0 else np.random.randint(1, size=(batch_size,))
    x = np.stack([data[i:i + block_size] for i in ix])
    y = np.stack([data[i + 1:i + block_size + 1] for i in ix])
    sequence_length = min(x.shape[1], y.shape[1])
    x = x[:, :sequence_length]
    y = y[:, :sequence_length]
    return x, y

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.query_dense = layers.Dense(num_heads * head_size)
        self.key_dense = layers.Dense(num_heads * head_size)
        self.value_dense = layers.Dense(num_heads * head_size)
        self.dense = layers.Dense(n_embd)
        self.dropout = layers.Dropout(dropout)
        self.tril = np.tril(np.ones((block_size, block_size)))

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(depth)

        mask = tf.convert_to_tensor(self.tril, dtype=tf.float32)
        mask = mask[:tf.shape(scaled_attention_logits)[-2], :tf.shape(scaled_attention_logits)[-1]]
        scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        attention_weights = self.dropout(attention_weights)

        scaled_attention = tf.matmul(attention_weights, value)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.num_heads * self.head_size))

        output = self.dense(concat_attention)
        return output

class FeedForward(tf.keras.layers.Layer):
    def __init__(self, n_embd):
        super(FeedForward, self).__init__()
        self.seq = tf.keras.Sequential([
            layers.Dense(4 * n_embd, activation='relu'),
            layers.Dense(n_embd),
            layers.Dropout(dropout)
        ])

    def call(self, inputs):
        return self.seq(inputs)

class Block(tf.keras.layers.Layer):
    def __init__(self, n_embd, n_head):
        super(Block, self).__init__()
        head_size = n_embd // n_head
        self.mha = MultiHeadAttention(n_head, head_size)
        self.ffn = FeedForward(n_embd)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        attn_output = self.mha(self.layernorm1(inputs))
        out1 = inputs + attn_output
        ffn_output = self.ffn(self.layernorm2(out1))
        return out1 + ffn_output

class GPTLanguageModel(tf.keras.Model):
    def __init__(self, vocab_size):
        super(GPTLanguageModel, self).__init__()
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = [Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        self.ln_f = layers.LayerNormalization(epsilon=1e-6)
        self.lm_head = layers.Dense(vocab_size)

    def call(self, index, targets=None):
        B = tf.shape(index)[0]
        T = tf.shape(index)[1]
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(tf.range(T))
        x = tok_emb + pos_emb

        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is not None:
            logits_flat = tf.reshape(logits, [-1, tf.shape(logits)[-1]])
            targets_flat = tf.reshape(targets, [-1])
            loss = tf.keras.losses.sparse_categorical_crossentropy(targets_flat, logits_flat, from_logits=True)
            return logits, tf.reduce_mean(loss)
        else:
            return logits

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            index_cond = index[:, -block_size:]
            logits = self(index_cond)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            index_next = tf.random.categorical(probs, num_samples=1)
            index = tf.concat([index, index_next], axis=1)
        return index


In [23]:
model = GPTLanguageModel(vocab_size)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)

@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        logits, loss = model(inputs, targets)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

@tf.function
def eval_step(inputs, targets):
    logits, loss = model(inputs, targets)
    return loss

for iter in range(max_iters):
    if iter % eval_iters == 0:
        train_losses = [eval_step(*get_batch('train')) for _ in range(eval_iters)]
        val_losses = [eval_step(*get_batch('val')) for _ in range(eval_iters)]
        print(f"step: {iter}, train loss: {np.mean(train_losses):.3f}, val loss: {np.mean(val_losses):.3f}")

    xb, yb = get_batch('train')
    train_step(xb, yb)


model.save_weights('model-01.h5')
print('Model saved')

step: 0, train loss: 10.439, val loss: 10.443
step: 100, train loss: 6.380, val loss: 6.383
step: 200, train loss: 3.277, val loss: 3.288
step: 300, train loss: 1.777, val loss: 1.792
step: 400, train loss: 0.960, val loss: 0.976
step: 500, train loss: 0.650, val loss: 0.695
step: 600, train loss: 0.481, val loss: 0.588
step: 700, train loss: 0.406, val loss: 0.382
step: 800, train loss: 0.309, val loss: 0.322
step: 900, train loss: 0.241, val loss: 0.240
step: 1000, train loss: 0.212, val loss: 0.200
step: 1100, train loss: 0.305, val loss: 0.215
step: 1200, train loss: 0.275, val loss: 0.154
step: 1300, train loss: 0.147, val loss: 0.156
step: 1400, train loss: 0.136, val loss: 0.139
Model saved
