In [16]:
# Guided by Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY
# Thus Spoke Zarathustra concatenated with Dostoyevsky's 'The Idiot'

In [17]:
import tensorflow as tf
import numpy as np
from tokenizer import TextProcessor

tp = TextProcessor("TSZ_input.txt")
data = tp.text

data_enc = tp.map_to_int(data)
data_enc = tf.convert_to_tensor(data_enc)

train_split = int(len(data) * 0.9)
xtrain = data_enc[:train_split]
val_data = data_enc[train_split:]

* The `tensor([76 62 63 1 75 63 60 80])` actually contains 7 different training examples in it; lets assume that these char to int 'embeddings' represent the word 'Nietzsche' the different examples would be:
  * `[76]` (aka. `['N']`) -> is likely proceeded by a `[62]`(or `['i']`)
  * `[76 62]` (aka. `['Ni']`) -> is likely proceeded by a `[63]` (or `['e']`)
  * ...
  * `[76 62 63  1 75 63 60]` (aka. `['Nietzsch']`) -> is likely proceeded by `[83]` (or `['e']`)
<br>
<br>
* transformer will never receive more than chunk_size tokens/inputs at a time
 The below operation is not only done for efficiency but also so the transformer is 'used' to seeing context of size 1..chunk_size

In [18]:
chunk_size = 8
x = xtrain[:chunk_size]
y = xtrain[1:chunk_size+1]

for k in range(chunk_size):
    context = x[:k+1]
    target = y[k]
    # print(f'context: {context} target: {target}')
# print(tf.__version__)
# tp.stoi['T']

In [19]:
tf.random.set_seed(44)
batch_size = 32
# Chunksize = context length, number of tokens to be considered
# chunk_size = 8

# get training batch
# tf.function because the global seed is set and operation seeds are not set
# @tf.function
# However, adding the tf.function decorator causes the following error:
# OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.

def get_batch(dat):
    dat = xtrain if dat == 'train' else val_data
    randint = tf.random.uniform(shape=(batch_size,), maxval=len(dat)-chunk_size, dtype=tf.int64)
    x = tf.stack([dat[i:i+chunk_size] for i in randint])
    y = tf.stack([dat[i+1:i+chunk_size+1] for i in randint])
    # print(f"randint: {randint}, randint.shape: {randint.shape}")
    return x,y

xs, ys = get_batch('train')

"""
x=[1,60,69..10]
y = [60,69,..1]
input x[0,0:2]= [1,60] output is y[2] = 69
"""

print('========================')

for b in range(batch_size):
    for k in range(chunk_size):
        context = xs[b,:k+1]
        target = ys[b,k]
        # print(f'context: {context} target: {target}')

xs.shape,ys.shape, type(xs), type(ys)



(TensorShape([32, 8]),
 TensorShape([32, 8]),
 tensorflow.python.framework.ops.EagerTensor,
 tensorflow.python.framework.ops.EagerTensor)

Stanford n-gram LM [source](https://web.stanford.edu/~jurafsky/slp3/3.pdf)

In [28]:
from tensorflow import keras
# from keras import layers, backend
from keras.layers import Lambda
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
# import keras.backend as K
# rc: https://www.youtube.com/watch?v=PaCmpygFfXo&t=182s -> Bigram model details by Karpathy

class ngramLangModel(keras.layers.Layer):
    def __init__(self, vocab_size):
        # `super` call to inherit from keras.layers.Layer
        super(ngramLangModel,self).__init__()

        self.token_embedding = keras.layers.Embedding(vocab_size, vocab_size)
    # build used in situations where weights depend on the shape of the input tensors
    def build(self, input_shape):
        # if subclassers need a "state creation step"
        # This method is used to create weights that depend on input.shape
        # __call__ will auto build the layer if it hasn't been built yet

        pass
# ?keras.layers.MultiHeadAttention
    # !inference mode vs training mode
    def call(self, xss, targets=None):
        # Called in __call__, after build has been called?
        # __call__ wraps call
        # Preforms logic of applying the layer to the input tensors

        if targets is None:
            loss = None
        logits = self.token_embedding(xss)
        print(logits.shape)

        # from logits to normalized probabilities
        # CategoricalCrossentropy expects labels to be provided in a one hot rep, labels as ints use SparseCategoricalCrossentropy

        """
        # Figure out dimensions
        # the err thrown is "EagerTensor object has no attribute 'reshape'."
        # the above enabling of numpy behavior has fixed this
        """
        # loss = tf.nn.softmax_cross_entropy_with_logits
        # loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        B, T, C = logits.shape
        logits = tf.reshape(logits, (B*T, C))
        targets = tf.reshape(targets, (B*T))
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        self.add_loss(loss)
        return logits,loss

    def generate(self, xss, max_new_toks):

        for _ in range(max_new_toks):
            logits, loss = self(xss)
            logits = logits[:, -1, :]
            # obtain probability of each token (in this case char)
            probs = tf.nn.softmax(logits)
            next_token = tf.random.categorical(probs, num_samples=1) #(B, 1)
            # first dimension/axis is the time dimension
            xss = tf.concat([xss, next_token], axis=1)

        return xss

    # The below func can just be used as a layer. Src: https://blog.codecentric.de/move-n-gram-extraction-into-your-keras-model
    # def ngram_block(n, alphabet_size):
    #     def wrapped(inputs):
    #         layer = layers.Conv1D(1, n, use_bias=False, trainable=False)
    #         x = layers.Reshape((-1, 1))(inputs)
    #         x = layer(x)
    #         kernel = np.power(alphabet_size, range(0, n),
    #                           dtype=backend.floatx())
    #         layer.set_weights([kernel.reshape(n, 1, 1)])
    #         return layers.Reshape((-1,))(x)

    #     return wrapped

    def bigram(self, data):
        return tf.convert_to_tensor(Lambda(lambda x: [x[:,:-1] + x[:,1:] * tp.vocab_size])(data))



ngram = ngramLangModel(tp.vocab_size)

ys.shape,xs.shape
out,loss = ngram(xs,ys)
init_xss = tf.zeros((1, 1), dtype=tf.int64)
out = tf.nn.softmax(out)
x = out[0].numpy().tolist()
# tp.decode_mapping(x)
# ngram.generate(init_xss, max_new_toks=10)
# print(out)
# print(loss.call(ys,out))
# type(bgramd)
x[:10]

(32, 8, 93)


[0.010245682671666145,
 0.010266579687595367,
 0.010572953149676323,
 0.010884030722081661,
 0.010892984457314014,
 0.010577243752777576,
 0.010940386913716793,
 0.011271117255091667,
 0.01028439961373806,
 0.010864926502108574]

In [None]:
# train on gpu
# with tf.device('/GPU:0'):
#     model = get_model()
#     model.compile()
#     model.fit()