In [3]:
# Guided by Andrej Karpathy: https://www.youtube.com/watch?v=kCc8FmEb1nY
# Thus spoke zarathustra concatenated with Dostoyevsky's 'The Idiot'

with open('TSZ_input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("character count: ", len(text))

print(text[:1000])

character count:  2020697
Title: Thus Spake Zarathustra
       A Book for All and None

“Zarathustra” is my brother’s most personal work; it is the history of
his most individual experiences, of his friendships, ideals, raptures,
bitterest disappointments and sorrows. Above it all, however, there
soars, transfiguring it, the image of his greatest hopes and remotest
aims. My brother had the figure of Zarathustra in his mind from his very
earliest youth: he once told me that even as a child he had dreamt of
him. At different periods in his life, he would call this haunter of his
dreams by different names; “but in the end,” he declares in a note on
the subject, “I had to do a PERSIAN the honour of identifying him with
this creature of my fancy. Persians were the first to take a broad and
comprehensive view of history. Every series of evolutions, according
to them, was presided over by a prophet; and every prophet had his
‘Hazar,’--his dynasty of a thousand years.”

All Zarathustra’s views

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

# chars = [c for c in chars if c != 'É' and  c != 'à' and c != 'ç' and c != 'è' and c != 'é' and c != 'ê']

print(''.join(chars))
print(vocab_size)



 !"$%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÉàçèéê—‘’“”
93


In [6]:
# Tokenization, char to int

# In practice a subword tokenization is used, where the size of the available tokens is larger but the list of ints representing a string is shorter
stoi = {c:i for i,c in enumerate(chars)}
itos = {i:c for i,c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda w: ''.join([itos[i] for i in w])

In [7]:
import tensorflow as tf
import numpy as np
data = encode(text)
data = np.array(data)

train_split = int(len(data) * 0.9)
xtrain = data[:train_split]
val_data = data[train_split:]
# data = tf.Tensor(data, 0,dtype=tf.int64)

In [8]:
chunk_size = 8
xtrain[:chunk_size+1]

array([46, 64, 75, 67, 60, 24,  1, 46, 63])

In [9]:
x = xtrain[:chunk_size]
y = xtrain[1:chunk_size+1]

# transformer will never receive more than chunk_size tokens/inputs at a time
# The below operation is not only done for efficiency but also so the transformer is 'used' to seeing context of size 1..chunk_size
for k in range(chunk_size):
    context = x[:k+1]
    target = y[k]
    print(f'context: {context} target: {target}')

context: [46] target: 64
context: [46 64] target: 75
context: [46 64 75] target: 67
context: [46 64 75 67] target: 60
context: [46 64 75 67 60] target: 24
context: [46 64 75 67 60 24] target: 1
context: [46 64 75 67 60 24  1] target: 46
context: [46 64 75 67 60 24  1 46] target: 63


In [24]:
tf.random.set_seed(4646)
batch_size = 4
chunk_size = 8
def get_batch(dat):
    dat = xtrain if dat == 'train' else val_data
    randint = tf.random.uniform(shape=(batch_size,), minval=0, maxval=len(dat)-chunk_size-1, dtype=tf.int64)
    x = tf.stack([dat[i:i+chunk_size] for i in randint])
    y = tf.stack([dat[i+1:i+chunk_size+1] for i in randint])
    # print(f"randint: {randint}, randint.shape: {randint.shape}")
    return x,y

xs, ys = get_batch('train')
print('inputs:')
print(xs.shape)
print(xs)
print('targets:')
print(ys.shape)
print(ys)
"""
x=[1,60,69..10]
y = [60,69,..1]
input x[0,0:2]= [1,60] output is y[2] = 69
"""

print('()_________()')

for b in range(batch_size):
    for k in range(chunk_size):
        context = xs[b,:k+1]
        target = ys[b,k]
        print(f'context: {context} target: {target}')


inputs:
(4, 8)
tf.Tensor(
[[ 1 60 69 70 76 62 63 10]
 [73  1 75 63 60  1 67 56]
 [78 60 77 60 73 11 11 63]
 [67 80  1 64 69  1 56  1]], shape=(4, 8), dtype=int32)
targets:
(4, 8)
tf.Tensor(
[[60 69 70 76 62 63 10  1]
 [ 1 75 63 60  1 67 56 74]
 [60 77 60 73 11 11 63 70]
 [80  1 64 69  1 56  1 58]], shape=(4, 8), dtype=int32)
()_________()
context: [1] target: 60
context: [ 1 60] target: 69
context: [ 1 60 69] target: 70
context: [ 1 60 69 70] target: 76
context: [ 1 60 69 70 76] target: 62
context: [ 1 60 69 70 76 62] target: 63
context: [ 1 60 69 70 76 62 63] target: 10
context: [ 1 60 69 70 76 62 63 10] target: 1
context: [73] target: 1
context: [73  1] target: 75
context: [73  1 75] target: 63
context: [73  1 75 63] target: 60
context: [73  1 75 63 60] target: 1
context: [73  1 75 63 60  1] target: 67
context: [73  1 75 63 60  1 67] target: 56
context: [73  1 75 63 60  1 67 56] target: 74
context: [78] target: 60
context: [78 60] target: 77
context: [78 60 77] target: 60
context: [7

In [14]:
def get_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64),
        tf.keras.layers.LSTM(1024, return_sequences=True),
        tf.keras.layers.Dense(vocab_size)
    ])
    model.summary()
    return model
model = get_model()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 64)          5952      
                                                                 
 lstm_1 (LSTM)               (None, None, 1024)        4460544   
                                                                 
 dense_1 (Dense)             (None, None, 93)          95325     
                                                                 
Total params: 4,561,821
Trainable params: 4,561,821
Non-trainable params: 0
_________________________________________________________________


In [53]:
from tensorflow import keras
from keras import layers, backend
from keras.layers import Lambda
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
# import keras.backend as K
# rc: https://www.youtube.com/watch?v=PaCmpygFfXo&t=182s -> Bigram model details by Karpathy

# N-gram language model https://web.stanford.edu/~jurafsky/slp3/3.pdf
class ngramLangModel(keras.layers.Layer):
    def __init__(self, vocab_size):
        # `super` call to inherit from keras.layers.Layer
        super(ngramLangModel,self).__init__()

        self.token_embedding = keras.layers.Embedding(vocab_size, vocab_size)
    def build(self, input_shape):
        # if subclassers need a "state creation step"
        # This method is used to create weights that depend on input.shape
        # __call__ will auto build the layer if it hasn't been built yet

        pass

    # !inference mode vs training mode
    def call(self, xss, targets=None):
        # Called in __call__, after build has been called?
        # Preforms logic of applying the layer to the input tensors

        if targets is None:
            loss = None
        logits = self.token_embedding(xss)

        # from logits to normalized probabilities
        # CategoricalCrossentropy expects labels to be provided in a one hot rep, labels as ints use SparseCategoricalCrossentropy
        """
        # Figure out dimensions
        # the err thrown is "EagerTensor object has no attribute 'reshape'."
        # the above enabling of numpy behavior has fixed this, I presume.
        """
        loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        B, T, C = logits.shape
        logits = tf.reshape(logits, (B*T, C))
        targets = tf.reshape(targets, (B*T))
        print(f"logits.shape: {logits.shape}, targets.shape: {targets.shape}")
        self.add_loss(loss)

        return logits,loss

    def generate(self, xss, max_toks):

        for _ in range(max_toks):
            logits, loss = self(xss)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits)
            next_token = tf.random.categorical(probs, num_samples=1) #(B, 1)
            # first dimension/axis is the time dimension
            xss = tf.concat([xss, next_token], axis=1)

        return xss

    # The below func can just be used as a layer. Src: https://blog.codecentric.de/move-n-gram-extraction-into-your-keras-model
    # def ngram_block(n, alphabet_size):
    #     def wrapped(inputs):
    #         layer = layers.Conv1D(1, n, use_bias=False, trainable=False)
    #         x = layers.Reshape((-1, 1))(inputs)
    #         x = layer(x)
    #         kernel = np.power(alphabet_size, range(0, n),
    #                           dtype=backend.floatx())
    #         layer.set_weights([kernel.reshape(n, 1, 1)])
    #         return layers.Reshape((-1,))(x)

    #     return wrapped

    def bigram(self, data):
        return tf.convert_to_tensor(Lambda(lambda x: [x[:,:-1] + x[:,1:] * vocab_size])(data))



ngram = ngramLangModel(vocab_size)

out,loss = ngram(xs,ys)
# type(bgramd)

logits.shape: (32, 93), targets.shape: (32,)


<tf.Tensor: shape=(32, 93), dtype=float32, numpy=
array([[ 0.00926419,  0.01209891,  0.0227098 , ..., -0.00062816,
        -0.02765806,  0.01870995],
       [ 0.049946  , -0.04618012, -0.02935493, ...,  0.01709548,
         0.02467797,  0.03596773],
       [ 0.03407184,  0.01388774, -0.04212189, ...,  0.03161688,
        -0.01108854, -0.02865429],
       ...,
       [ 0.00926419,  0.01209891,  0.0227098 , ..., -0.00062816,
        -0.02765806,  0.01870995],
       [-0.03403344,  0.02255275, -0.00292721, ..., -0.02929988,
        -0.04217792,  0.01755125],
       [ 0.00926419,  0.01209891,  0.0227098 , ..., -0.00062816,
        -0.02765806,  0.01870995]], dtype=float32)>

In [None]:
# train on gpu
# with tf.device('/GPU:0'):
#     model = get_model()
#     model.compile()
#     model.fit()