In [2]:
import os
from typing import Union
os.environ["KERAS_BACKEND"] = "torch"

import numpy as np
import torch
import tensorflow as tf
import keras as K
from keras import layers
from keras import losses
from keras import metrics
from keras import optimizers
from keras import initializers
# import keras_nlp
from dataclasses import dataclass

2023-09-20 14:48:17.902270: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-20 14:48:17.959160: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using PyTorch backend.


In [16]:
data_dir = "../data"
dataset_name = "shakespeare"

if os.environ["KERAS_BACKEND"] == "torch":
    token_dtype = np.int32
    train_path = os.path.join(data_dir, dataset_name, "train_int32.bin")
    val_path = os.path.join(data_dir, dataset_name, "val_int32.bin")
else:
    token_dtype = np.uint16
    train_path = os.path.join(data_dir, dataset_name, "train.bin")
    val_path = os.path.join(data_dir, dataset_name, "val.bin")

In [17]:
@dataclass
class GPTConfig:
    # GPT configs
    block_size: int = 512 # 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 8 # 12
    n_head: int = 12
    hidden_size: int = 512 # 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    layer_norm_epsilon: float = 1e-05

    # Train configs
    n_epoch = 1
    batch_size = 4
    weight_decay = 1e-01
    beta1 = 0.9
    beta2 = 0.95
    decay_lr = True # whether to decay the learning rate
    warmup_iters = 100 # 2000 # how many steps to warm up for
    verbose = 100 # 10

config = GPTConfig()

In [18]:
class Block(layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.ln1 = layers.LayerNormalization(epsilon=config.layer_norm_epsilon) # TODO: bias?
        self.ln2 = layers.LayerNormalization(epsilon=config.layer_norm_epsilon)
        mha = layers.MultiHeadAttention(num_heads=config.n_head, key_dim=config.hidden_size // config.n_head)
        self.attn = lambda x, training: mha(x, x, training=training, use_causal_mask=True)
        self.mlp = K.Sequential([
            layers.Dense(
                units=4*config.hidden_size, use_bias=True, activation="gelu",
                kernel_initializer=initializers.RandomNormal(mean=0.0, stddev=0.02),
                bias_initializer=initializers.Zeros(),
            ),
            layers.Dense(
                units=config.hidden_size, use_bias=True,
                kernel_initializer=initializers.RandomNormal(mean=0.0, stddev=0.02),
                bias_initializer=initializers.Zeros(),
            ),
            layers.Dropout(config.dropout)
        ])

    def call(self, x, training=None):
        x = x + self.attn(self.ln1(x), training=training)
        x = x + self.mlp(self.ln2(x), training=training)
        return x
    

class GPT(K.Model):
    def __init__(self, config: GPTConfig, **kwargs):
        super().__init__(name="coreGPT", **kwargs)
        self.config = config

        # input embedding
        self.tok_emb = K.layers.Embedding(
            input_dim=config.vocab_size, output_dim=config.hidden_size,
            embeddings_initializer=K.initializers.RandomNormal(mean=0.0, stddev=0.02),
            name="embedding",
        )
        self.drop = layers.Dropout(config.dropout)
        # transformer blocks
        self.blocks = [Block(config) for _ in range(config.n_layer)]
        # decoder head
        self.ln_f = layers.LayerNormalization(axis=-1)
        self.head = layers.Dense(
            units=config.vocab_size, use_bias=False,
            kernel_initializer=initializers.RandomNormal(mean=0.0, stddev=0.02),
        )

    def build(self, input_shape):
        super().build(input_shape)
        self.pos_emb = self.add_weight(
            name="positional",
            shape=(1, self.config.block_size, self.config.hidden_size),
            initializer=K.initializers.RandomNormal(mean=0.0, stddev=0.02),
            trainable=True,
        )

    def call(self, inputs, training=None):
        B, T = inputs.shape
        # embed sentence
        wte = self.tok_emb(inputs)
        wpe = self.pos_emb[:, :T, :]
        x = self.drop(wte + wpe, training=training)
        # attention
        for block in self.blocks:
            x = block(x, training=training)
        # compute logits
        x = self.ln_f(x)
        x = self.head(x)
        return x
    
    def summary(self):
        x = K.Input(shape=[self.config.block_size], batch_size=self.config.batch_size, dtype="int32")
        dummy = K.Model(inputs=x, outputs=self.call(x), name=self.name)
        return dummy.summary()

In [19]:
inputs = K.Input(shape=[config.block_size], dtype="int32")

model = GPT(config)
model.build((config.batch_size, config.block_size))
model.compile(
    optimizer=optimizers.AdamW(learning_rate=6e-4, weight_decay=config.weight_decay),
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[metrics.SparseCategoricalAccuracy(name='accuracy')],
)
# model.summary()

In [21]:
def load_data(config, train_path, val_path):
    train_data = np.memmap(train_path, dtype=token_dtype, mode='r')
    val_data = np.memmap(val_path, dtype=token_dtype, mode='r')

    n_batch_train = (len(train_data)-config.block_size)//config.batch_size
    n_batch_val = (len(val_data)-config.block_size)//config.batch_size

    def get_windowed_tf_dataset(data: Union[np.memmap, np.array]):
        x = (
            tf.data.Dataset.from_tensor_slices(data[:-1])
            .window(config.block_size, shift=1, stride=1, drop_remainder=True)
            .flat_map(lambda x: x.batch(config.block_size))
        )
        y = (
            tf.data.Dataset.from_tensor_slices(data[1:])
            .window(config.block_size, shift=1, stride=1, drop_remainder=True)
            .flat_map(lambda x: x.batch(config.block_size))
        )

        return (
            tf.data.Dataset
            .zip((x, y))
            .batch(batch_size=config.batch_size,
                drop_remainder=True,
                num_parallel_calls=tf.data.AUTOTUNE)
            .repeat()
            .prefetch(buffer_size=tf.data.AUTOTUNE)
        )

    train_dataset = get_windowed_tf_dataset(train_data)
    val_dataset = get_windowed_tf_dataset(val_data)

    return train_dataset, val_dataset, n_batch_train, n_batch_val

train_dataset, val_dataset, n_batch_train, n_batch_val = load_data(config, train_path, val_path)

In [24]:
# Need to "build" the model before fit()
if os.environ["KERAS_BACKEND"] == "torch":
    inp = next(iter(train_dataset))[0]
    _ = model(inp)

In [25]:
history = model.fit(
    train_dataset,
    steps_per_epoch=n_batch_train,
    epochs=config.n_epoch,
    validation_data=val_dataset,
    validation_steps=n_batch_val,
    verbose=1
)

[1m  20/8427[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:46:42[0m 2s/step - accuracy: 0.1550 - loss: 7.8354

KeyboardInterrupt: 

In [84]:
history.history

{'accuracy': [0.7466796636581421, 0.799023449420929],
 'loss': [0.8615785241127014, 0.6976556181907654],
 'val_accuracy': [0.074462890625, 0.106689453125],
 'val_loss': [9.359951972961426, 9.639775276184082]}

In [15]:
inp = np.random.random_integers(0, 100, size=(1, config.n_seq,))
inp.shape

  inp = np.random.random_integers(0, 100, size=(1, config.n_seq,))


(1, 256)

In [17]:
out = model.predict(inp, batch_size=1)
out.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step


(1, 256, 50304)

In [18]:
out[0][0]

array([ 0.29266614, -0.02570427, -0.08666821, ...,  0.2424404 ,
       -0.1005659 ,  0.25224355], dtype=float32)

In [None]:
for _ in range(2):
    

In [45]:
inp = np.array(
    [
        [
            [0.0, 0.0, 0.0, 0.0],
            [100, 200, 300, 400],
            [0.1, 0.3, 0.7, 0.9],
        ]
    ]
)

print(inp.shape)
inp

(1, 3, 4)


array([[[0.e+00, 0.e+00, 0.e+00, 0.e+00],
        [1.e+02, 2.e+02, 3.e+02, 4.e+02],
        [1.e-01, 3.e-01, 7.e-01, 9.e-01]]])

In [46]:
model.predict([inp], batch_size=3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step


array([[[   0.      ,    0.      ,    0.      ,    0.      ],
        [ -51.026886, -188.48923 ,   14.233276, -163.07217 ],
        [ -50.939663, -188.4063  ,   14.111473, -163.03067 ]]],
      dtype=float32)

In [6]:
inputs = np.ones(shape=(1, 50), dtype="int32")