In [92]:
# Importing Libraries
import tensorflow as tf
import keras
from keras import layers, models, optimizers
import numpy as np
import matplotlib.pyplot as plt

In [93]:
# hyperparameters
batch_size = 16  # how many independent sequences will we process in parallel?
block_size = 32  # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100

Epochs = max_iters // eval_interval

learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

In [94]:
"""shell
wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
"""
# Read the text file
with open("dataset_french/data.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Unique characters
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [
    stoi[c] for c in s
]  # encoder: take a string, output a list of integers
decode = lambda l: "".join(
    [itos[i] for i in l]
)  # decoder: take a list of integers, output a string

print(encode("hi"))
print(decode([6, 59, 60, 6]))

[59, 60]
'hi'


In [95]:
# Train and test splits
data = np.array(encode(text), dtype=np.int64)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
# val_data.tolist()

In [96]:
# Data loading
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = np.random.randint(0, len(data) - block_size, batch_size)
    x = np.stack([data[i : i + block_size] for i in ix])
    y = np.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


# Prepare train/val dataset
def train_data_generator():
    while True:
        yield get_batch("train")


def val_data_generator():
    while True:
        yield get_batch("val")



In [97]:
train_data_generator = tf.data.Dataset.from_generator(
    train_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
    ),
)

val_data_generator = tf.data.Dataset.from_generator(
    val_data_generator,
    output_signature=(
        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
    ),
)


In [98]:
# %% Model components
class FeedForward(layers.Layer):
    """A simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = models.Sequential(
            [
                layers.Dense(4 * n_embd, activation="relu"),
                layers.Dense(n_embd),
                layers.Dropout(dropout),
            ]
        )

    def call(self, x):
        return self.net(x)


In [99]:
class Block(layers.Layer):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        self.sa = layers.MultiHeadAttention(
            num_heads=n_head, key_dim=n_embd // n_head, dropout=dropout
        )
        self.ffwd = FeedForward(n_embd)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x):
        attn_output = self.sa(
            self.ln1(x), self.ln1(x), use_causal_mask=True
        )  # use causal mask to ensure each token can only see previous tokens
        x = x + attn_output
        x = x + self.ffwd(self.ln2(x))
        return x


In [106]:
# Bigram Language Model
class BigramLanguageModel(keras.Model):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = layers.Embedding(block_size, n_embd)
        self.blocks = [Block(n_embd, n_head) for _ in range(n_layer)]
        self.ln_f = layers.LayerNormalization()
        self.lm_head = layers.Dense(vocab_size)

    def call(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(
            tf.range(T)[tf.newaxis, :]
        )  # initially (T,C) adding new axis and get # (1,T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        for block in self.blocks:  # (B,T,C)
            x = block(x)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            return logits, None

        logits_flat = tf.reshape(logits, [-1, logits.shape[-1]])
        targets_flat = tf.reshape(targets, [-1])
        loss = keras.losses.sparse_categorical_crossentropy(
            targets_flat, logits_flat, from_logits=True
        )
        return logits, tf.reduce_mean(loss)

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            logits, loss = self(x, y)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {"loss": loss}

    def test_step(self, data):
        x, y = data
        logits, loss = self(x, y)
        return {"loss": loss}

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # sample from the distribution
            idx_next = tf.random.categorical(logits, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx


In [109]:
# Initialize the model train and plotting loss curves
model = BigramLanguageModel()
# print the number of parameters in the model
model.build((batch_size, block_size))
print("Number of trainable parameters:", model.count_params())

# Compile the model
model.compile(optimizer=optimizers.Adam(learning_rate))

Number of trainable parameters: 0




In [85]:
model.summary()

In [86]:
# Train the model
Hist = model.fit(
    train_data_generator,
    epochs=2,
    steps_per_epoch=eval_interval,
    validation_data=val_data_generator,
    validation_steps=eval_iters,
)


Epoch 1/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 105ms/step - loss: 2.9274 - val_loss: 0.0000e+00
Epoch 2/2
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 96ms/step - loss: 2.4266 - val_loss: 0.0000e+00


In [None]:
# Plot learning curve
plt.figure()
plt.plot(
    np.arange(1, Epochs + 1),
    np.vstack((Hist.history["loss"], Hist.history["val_loss"])).T,
)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["train_loss", "val_loss"])

In [71]:
"""
# Geneating Shakespeare-like text!
"""
# Generate text
context = np.zeros((1, 1), dtype=np.int64)
generated = model.generate(context, max_new_tokens=200)
print(decode(generated[0].numpy().tolist()))

te uspecté,
La saffre écladiste envée au lout bravé d’ici Fonfle l'aigne à Périter les cielles

outombe 1Aux beaux
Pourcier le poème une morts 
Ruge 
Assaire 
seul, près, 
Victor Hugo 
Un gerris pout 


In [131]:
print(model.get_config())

{'trainable': True, 'dtype': 'float32'}


In [136]:
from tensorflow.keras import models

custom_objects = {'trainable': True, 'dtype': 'float32'}

with keras.saving.custom_object_scope(custom_objects):
    reconstructed_model = keras.models.load_model("mod.keras")

TypeError: Could not locate class 'BigramLanguageModel'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'BigramLanguageModel', 'config': {'trainable': True, 'dtype': 'float32'}, 'registered_name': 'BigramLanguageModel', 'build_config': {'input_shape': [16, 32]}, 'compile_config': {'optimizer': {'module': 'keras.optimizers', 'class_name': 'Adam', 'config': {'name': 'adam', 'learning_rate': 0.0010000000474974513, 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'loss_scale_factor': None, 'gradient_accumulation_steps': None, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}, 'registered_name': None}, 'loss': None, 'loss_weights': None, 'metrics': None, 'weighted_metrics': None, 'run_eagerly': False, 'steps_per_execution': 1, 'jit_compile': False}}