In [1]:
# Training text is Thus Spoke Zarathustra concatenated with Dostoyevsky's 'The Idiot'
# Guided by Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY

In [2]:
import tensorflow as tf
from typing import Tuple, Optional, Union
from tokenizer import TextProcessor
import warnings

warnings.filterwarnings("ignore")
# warnings.simplefilter("ignore")
tp = TextProcessor("TSZ_input.txt")
data = tp.text

data_encoded = tp.map_to_int(data)
data_encoded = tf.convert_to_tensor(data_encoded)

train_split = int(len(data) * 0.9)
xtrain = data_encoded[:train_split]
val_data = data_encoded[train_split:]

2023-10-08 17:01:35.027853: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-08 17:01:35.070042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-08 17:01:37.528905: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-08 17:01:37.612709: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA no

- The `tensor([76 62 63 1 75 63 60 80 83])` actually contains 7 different training examples in it; lets assume that these char to int 'embeddings' represent the word 'Nietzsche' the different examples would be:
  - `[76]` (aka. `['N']`) -> is likely proceeded by a `[62]`(or `['i']`)
  - `[76 62]` (aka. `['Ni']`) -> is likely proceeded by a `[63]` (or `['e']`)
  - ...
  - `[76 62 63  1 75 63 60 80]` (aka. `['Nietzsch']`) -> is likely proceeded by `[83]` (or `['e']`)
    <br>
    <br>
- transformer will never receive more than chunk_size tokens/inputs at a time
  The below operation is not only done for efficiency but also so the transformer is 'used' to seeing context of size 1..chunk_size


In [3]:
chunk_size = 8
x = xtrain[:chunk_size]
y = xtrain[1 : chunk_size + 1]

# for k in range(chunk_size):
#     context = x[: k + 1]
#     target = y[k]
# print(f'context: {context} target: {target}')
# print(tf.__version__)

In [4]:
tf.random.set_seed(44)
batch_size = 32
# Chunksize = context length, number of tokens to be considered
# chunk_size = 8

# get training batch
# tf.function because the global seed is set and operation seeds are not set
# @tf.function
# However, adding the tf.function decorator causes the following error:
# OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.


def get_batch(dat: str) -> Tuple[tf.Tensor, tf.Tensor]:
    dat = xtrain if dat == "train" else val_data
    randint = tf.random.uniform(
        shape=(batch_size,), maxval=len(dat) - chunk_size, dtype=tf.int64
    )
    x = tf.stack([dat[i : i + chunk_size] for i in randint])
    y = tf.stack([dat[i + 1 : i + chunk_size + 1] for i in randint])
    # print(f"randint: {randint}, randint.shape: {randint.shape}")
    return x, y


xs, ys = get_batch("train")

"""
x=[1,60,69..10]
y = [60,69,..1]
input x[0,0:2]= [1,60] output is y[2] = 69
"""

# for b in range(batch_size):
#     for k in range(chunk_size):
#         context = xs[b, : k + 1]
#         target = ys[b, k]
# print(f"context: {context} target: {target}")

xs.shape, ys.shape, type(xs), type(ys)

(TensorShape([32, 8]),
 TensorShape([32, 8]),
 tensorflow.python.framework.ops.EagerTensor,
 tensorflow.python.framework.ops.EagerTensor)

Stanford n-gram LM [source](https://web.stanford.edu/~jurafsky/slp3/3.pdf)


In [17]:
from tensorflow import keras

# from keras import layers, backend
from keras.layers import Lambda

# from tensorflow.python.ops.numpy_ops import np_config
# np_config.enable_numpy_behavior()
# BATCH, TIME, CONTEXT


# import keras.backend as K
class NgramLangModel(keras.layers.Layer):
    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.dropout = keras.layers.Dropout(0.1)
        self.logits = keras.layers.Dense(vocab_size, activation=None)
        self.token_embedding = keras.layers.Embedding(vocab_size, vocab_size)
        self.ff = keras.layers.Dense(vocab_size, activation=None)
        self.loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # build used in situations where weights depend on the shape of the input tensors
    # def build(self, input_shape):
    #     if subclassers need a "state creation step"
    #     This method is used to create weights that depend on input.shape
    #     __call__ will auto build the layer if it hasn't been built yet

    # ?keras.layers.MultiHeadAttention
    # !inference mode vs training mode
    # Called in __call__, after build has been called?
    # __call__ wraps call
    # Preforms logic of applying the layer to the input tensors
    # from logits to normalized probabilities
    # CategoricalCrossentropy expects labels to be provided in a one hot rep, labels as ints use SparseCategoricalCrossentropy
    # logits = self.token_embedding(xss)
    # # print(f"logits.shape: {logits.shape}")
    # if targets is None:
    #     loss = None
    # else:
    #     B, T, C = logits.shape
    #     logits = tf.reshape(logits, (B * T, C))
    #     targets = tf.reshape(targets, (B * T))
    #     loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    #         logits=logits, labels=targets
    #     )
    #     self.add_loss(loss)
    #     return logits, loss

    def call(
        self, xss: tf.Tensor, targets: Optional[tf.Tensor] = None, training: bool = True
    ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
        xs = self.token_embedding(xss)  # embedded input
        xs = self.dropout(xs, training=training)  # dropout
        xs = self.ff(xs)  # pass through feed-forward
        logits = self.logits(xs)  # output layer
        # print(f"logits.shape: {logits.shape}")

        if targets is not None:
            loss = self.loss_fn(targets, logits)
            return logits, loss
        else:
            return logits
        # loss = tf.nn.softmax_cross_entropy_with_logits
        # loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    def generate(self, xss: tf.Tensor, max_new_toks: int = 128) -> tf.Tensor:
        for _ in range(max_new_toks):
            res = self(xss)  # No targets
            if isinstance(res, tuple):
                logits, loss = res
            else:
                logits = res
            logits = logits[:, -1, :]
            # obtain probability of each token (in this case char)
            logits = tf.nn.softmax(logits)

            next_token = tf.random.categorical(logits, num_samples=1)  # (B, 1)
            # first dimension/axis is the time dimension
            xss = tf.concat([xss, next_token], axis=1)
            """
            xss: current context of some chars in a batch of dimensions (B, T)
            generates (B, T+1)...(B, T+max_new_toks)
            """
        return xss

    # The below func can just be used as a layer. Src: https://blog.codecentric.de/move-n-gram-extraction-into-your-keras-ngram
    # def ngram_block(n, alphabet_size):
    #     def wrapped(inputs):
    #         layer = layers.Conv1D(1, n, use_bias=False, trainable=False)
    #         x = layers.Reshape((-1, 1))(inputs)
    #         x = layer(x)
    #         kernel = np.power(alphabet_size, range(0, n),
    #                           dtype=backend.floatx())
    #         layer.set_weights([kernel.reshape(n, 1, 1)])
    #         return layers.Reshape((-1,))(x)

    #     return wrapped

    def bigram(self, data):
        return tf.convert_to_tensor(
            Lambda(lambda x: [x[:, :-1] + x[:, 1:] * tp.vocab_size])(data)
        )


"""
model.eval() in torch -> training=False in tf
"""

ngram = NgramLangModel(tp.vocab_size)


ngram.call(x, y)


@tf.no_gradient(op_type="*")
def calculate_loss() -> dict:
    eval_iters = 400
    out = {}
    # with tf.GradientTape() as _:
    for split in ["train", "eval"]:
        losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = ngram(X, Y)
            losses.append(loss)
        out[split] = tf.reduce_mean(losses)
    return out


def loss_fn() -> tf.Tensor:
    X, Y = get_batch("train")
    _, loss = ngram(X, Y, training=True)
    return loss
ctx = tf.zeros((1,1), dtype=tf.int64)))

SyntaxError: unmatched ')' (2325061986.py, line 133)

In [None]:
print(tp.decode_mapping(ngram.generate(ctx)[0].numpy().tolist()))

In [16]:
# import os
# os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir=$CONDA_PREFIX_1/pkgs/cuda-nvcc-11.7.99-0"


optimiser = keras.optimizers.AdamW(learning_rate=1e-3)
for timestep in range(1000):
    if timestep % 100 == 0:
        loss = calculate_loss()
        print(
            f"timestep: {timestep}: training loss: {loss['train']:.4f}, eval loss: {loss['eval']:.4f}"
        )
    optimiser.minimize(loss_fn, ngram.trainable_variables)

context = tf.zeros((1, 1), dtype=tf.int64)
# print(tp.decode_mapping(ngram.generate(context, max_new_toks=60)[0].numpy().tolist()))
# ys.shape, xs.shape
# out, loss = ngram(xs, ys)
# init_xss = tf.zeros((1, 1), dtype=tf.int64)
# out = tf.nn.softmax(out)
# x = out[0].numpy().tolist()
# tp.decode_mapping(x)
# ngram.generate(init_xss, max_new_toks=10)
# print(out)
# print(loss.call(ys,out))
# x[:10]

timestep: 0: training loss: 2.5764, eval loss: 2.5907
timestep: 100: training loss: 2.5716, eval loss: 2.5807
timestep: 200: training loss: 2.5532, eval loss: 2.5658
timestep: 300: training loss: 2.5558, eval loss: 2.5540
timestep: 400: training loss: 2.5387, eval loss: 2.5483
timestep: 500: training loss: 2.5459, eval loss: 2.5613
timestep: 600: training loss: 2.5407, eval loss: 2.5563
timestep: 700: training loss: 2.5426, eval loss: 2.5475
timestep: 800: training loss: 2.5383, eval loss: 2.5502
timestep: 900: training loss: 2.5356, eval loss: 2.5468


In [18]:
print(tp.decode_mapping(ngram.generate(context)[0].numpy().tolist()))


fèb5èn””$YW
esZk09t!'jyRJO/%gZCMp5I_d]/Jk”icX“y èI!0-gQ:6/*JPsmsY’NM,yh8$AHéVOLUsOxM“jl0z,)-l‘ TM:NEDB”M—X_aB'65'L$SeCZq:n:àrABY


In [58]:
B, T, C = 4, 8, 2
h = tf.random.normal(shape=(B, T, C), dtype=tf.float32)
"""
for every batch element, for every t'th token we want to avg
all the tokens from 0 to t-1 (inclusive)
keep in mind this is very lossy, we are throwing away a lot of information
such as positional information
"""
bow_lst = []
# xbagOfWords = tf.zeros((B, T, C))

# ?cannot assign to eager tensor
for b in range(B):
    batch_list = []
    for t in range(T):
        # xbagOfWords[b, t] = tf.reduce_mean(h[b, : t + 1], axis=0)
        # mean = tf.reduce_mean(h[b, : t + 1], axis=0)
        # indexing h[b, : t + 1] will change shape to (t,C)
        batch_list.append(tf.reduce_mean(h[b, : t + 1], axis=0))
    bow_lst.append(batch_list)
xbagOfWords = tf.stack(bow_lst)
assert all(xbagOfWords[0][-1] == tf.reduce_mean(h[0, :8], axis=0))
# since we are just averaging and the last element in BOW is the average of all prev

In [59]:
# MATMUL TIME FOR EFFICIENCY 😄
