In [26]:
# Guided by Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY
# Thus Spoke Zarathustra concatenated with Dostoyevsky's 'The Idiot'

In [23]:
import tensorflow as tf
import numpy as np
from tokenizer import TextProcessor

tp = TextProcessor("TSZ_input.txt")
data = tp.text

data_enc = tp.map_to_int(data)
data_enc = np.array(data_enc)

train_split = int(len(data) * 0.9)
xtrain = data_enc[:train_split]
val_data = data_enc[train_split:]

In [24]:
chunk_size = 8
x = xtrain[:chunk_size]
y = xtrain[1:chunk_size+1]

# transformer will never receive more than chunk_size tokens/inputs at a time
# The below operation is not only done for efficiency but also so the transformer is 'used' to seeing context of size 1..chunk_size
for k in range(chunk_size):
    context = x[:k+1]
    target = y[k]
    # print(f'context: {context} target: {target}')
# print(tf.__version__)
# tp.stoi['T']

In [25]:
tf.random.set_seed(44)
batch_size = 32
# Chunksize = context length, number of tokens to be considered
# chunk_size = 8

# get training batch
# tf.function because the global seed is set and operation seeds are not set
# @tf.function
# However, adding the tf.function decorator causes the following error:
# OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
def get_batch(dat):
    dat = xtrain if dat == 'train' else val_data
    randint = tf.random.uniform(shape=(batch_size,), maxval=len(dat)-chunk_size, dtype=tf.int64)
    x = tf.stack([dat[i:i+chunk_size] for i in randint])
    y = tf.stack([dat[i+1:i+chunk_size+1] for i in randint])
    print(f"randint: {randint}, randint.shape: {randint.shape}")
    return x,y

xs, ys = get_batch('train')
# print('inputs:')
# print(xs.shape)
# print(xs)
# print('targets:')
# print(ys.shape)
# print(ys)

"""
x=[1,60,69..10]
y = [60,69,..1]
input x[0,0:2]= [1,60] output is y[2] = 69
"""

print('========================')

for b in range(batch_size):
    for k in range(chunk_size):
        context = xs[b,:k+1]
        target = ys[b,k]
        print(f'context: {context} target: {target}')

# xs.shape,ys.shape

randint: [ 440729  233150  255930 1737213   84626  586070 1703100 1721474 1718721
  827026 1317033 1378346  416373 1223467 1700808  817080  137852 1298681
  803705 1431576  142775  151798 1691730 1798182 1784864  330892 1206681
 1743925  871660  670485  110458  940298], randint.shape: (32,)
context: [73] target: 59
context: [73 59] target: 10
context: [73 59 10] target: 1
context: [73 59 10  1] target: 56
context: [73 59 10  1 56] target: 67
context: [73 59 10  1 56 67] target: 67
context: [73 59 10  1 56 67 67] target: 1
context: [73 59 10  1 56 67 67  1] target: 62
context: [58] target: 70
context: [58 70] target: 73
context: [58 70 73] target: 73
context: [58 70 73 73] target: 64
context: [58 70 73 73 64] target: 59
context: [58 70 73 73 64 59] target: 70
context: [58 70 73 73 64 59 70] target: 73
context: [58 70 73 73 64 59 70 73] target: 74
context: [59] target: 74
context: [59 74] target: 12
context: [59 74 12] target: 0
context: [59 74 12  0] target: 0
context: [59 74 12  0  0] 

In [10]:
# torch.no_grad() ~= tf.stop_gradient()

In [20]:
from tensorflow import keras
# from keras import layers, backend
from keras.layers import Lambda
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
# import keras.backend as K
# rc: https://www.youtube.com/watch?v=PaCmpygFfXo&t=182s -> Bigram model details by Karpathy

# N-gram language model https://web.stanford.edu/~jurafsky/slp3/3.pdf
class ngramLangModel(keras.layers.Layer):
    def __init__(self, vocab_size):
        # `super` call to inherit from keras.layers.Layer
        super(ngramLangModel,self).__init__()

        self.token_embedding = keras.layers.Embedding(vocab_size, vocab_size)
    def build(self, input_shape):
        # if subclassers need a "state creation step"
        # This method is used to create weights that depend on input.shape
        # __call__ will auto build the layer if it hasn't been built yet

        pass

# ?keras.layers.MultiHeadAttention

    # !inference mode vs training mode
    def call(self, xss, targets=None):
        # Called in __call__, after build has been called?
        # __call__ wraps call
        # Preforms logic of applying the layer to the input tensors

        if targets is None:
            loss = None
        logits = self.token_embedding(xss)
        print(logits.shape)

        # from logits to normalized probabilities
        # CategoricalCrossentropy expects labels to be provided in a one hot rep, labels as ints use SparseCategoricalCrossentropy

        """
        # Figure out dimensions
        # the err thrown is "EagerTensor object has no attribute 'reshape'."
        # the above enabling of numpy behavior has fixed this
        """

        # loss = tf.nn.softmax_cross_entropy_with_logits
        # loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        B, T, C = logits.shape
        logits = tf.reshape(logits, (B*T, C))
        targets = tf.reshape(targets, (B*T))
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets)
        self.add_loss(loss)
        return logits,loss

    def generate(self, xss, max_new_toks):

        for _ in range(max_new_toks):
            logits, loss = self(xss)
            logits = logits[:, -1, :]
            # obtain probability of each token (in this case char)
            probs = tf.nn.softmax(logits)
            next_token = tf.random.categorical(probs, num_samples=1) #(B, 1)
            # first dimension/axis is the time dimension
            xss = tf.concat([xss, next_token], axis=1)

        return xss

    # The below func can just be used as a layer. Src: https://blog.codecentric.de/move-n-gram-extraction-into-your-keras-model
    # def ngram_block(n, alphabet_size):
    #     def wrapped(inputs):
    #         layer = layers.Conv1D(1, n, use_bias=False, trainable=False)
    #         x = layers.Reshape((-1, 1))(inputs)
    #         x = layer(x)
    #         kernel = np.power(alphabet_size, range(0, n),
    #                           dtype=backend.floatx())
    #         layer.set_weights([kernel.reshape(n, 1, 1)])
    #         return layers.Reshape((-1,))(x)

    #     return wrapped

    def bigram(self, data):
        return tf.convert_to_tensor(Lambda(lambda x: [x[:,:-1] + x[:,1:] * vocab_size])(data))



ngram = ngramLangModel(vocab_size)

# ys.shape,xs.shape
# out,loss = ngram(xs,ys)
# init_xss = tf.zeros((1, 1), dtype=tf.int64)
# out = tf.nn.softmax(out)
# x = out[0].numpy().tolist()
# decode(x)
# ngram.generate(init_xss, max_new_toks=10)
# print(outt)
# print(loss.call(ys,out))
# type(bgramd)

(32, 8, 93)


InvalidArgumentError: Exception encountered when calling layer "ngram_lang_model_9" (type ngramLangModel).

logits and labels must be broadcastable: logits_size=[256,93] labels_size=[1,256] [Op:SoftmaxCrossEntropyWithLogits]

Call arguments received by layer "ngram_lang_model_9" (type ngramLangModel):
  • xss=tf.Tensor(shape=(32, 8), dtype=int32)
  • targets=tf.Tensor(shape=(32, 8), dtype=int32)

In [None]:
# train on gpu
# with tf.device('/GPU:0'):
#     model = get_model()
#     model.compile()
#     model.fit()