In [1]:
import re
import sentencepiece as sp
import tensorflow as tf
import tensorflow_text as text
import math
import tqdm

2023-02-19 23:16:49.348890: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-19 23:16:49.525419: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-19 23:16:49.525438: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-19 23:16:50.256620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
with open("bible.txt", "r") as f:
    bible_input =  f.read()

In [3]:
def prepare_data(data):
    data = data.lower()
    data = re.sub("\d+:\d+ ", "", data)
    data = data.replace("\n\n", "\n")
    data = data.replace("\n\n", "\n")
    data = re.sub(" +", " ", data)
    
    special_characters = ",.:;!?"
    for c in special_characters:
        data = data.replace(c, "")
    return data

In [115]:
preprocessed_data = prepare_data(bible_input)

with open("bible_prep.txt", "w") as f:
    f.write(preprocessed_data)
    f.close()

In [116]:
VOCAB_SIZE = 2000

sp.SentencePieceTrainer.train(input="bible_prep.txt", model_prefix="tokenizer", model_type="unigram", vocab_size=VOCAB_SIZE)
sp_model = tf.io.gfile.GFile("tokenizer.model", "rb").read()
tokenizer = text.SentencepieceTokenizer(sp_model)
tokens = tokenizer.tokenize(preprocessed_data)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: bible_prep.txt
  input_format: 
  model_prefix: tokenizer
  model_type: UNIGRAM
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_cl

In [117]:
m = 32
sliding_window = text.sliding_window(data=tokens, width=m+1)

input_data = sliding_window[:, :m]
input_ds = tf.data.Dataset.from_tensor_slices(input_data)

target_data = sliding_window[:, 1:]
target_ds = tf.data.Dataset.from_tensor_slices(target_data)
dataset = tf.data.Dataset.zip((input_ds, target_ds))
dataset = dataset.shuffle(1024).batch(32).prefetch(tf.data.AUTOTUNE)

train_size = math.floor(len(dataset) * 0.8)
test_size = math.floor(len(dataset) * 0.2)
train_dataset = dataset.take(train_size)
test_dataset = dataset.skip(train_size).take(test_size)

In [118]:
class Embedding(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

        self.embed_token = tf.keras.layers.Embedding(VOCAB_SIZE, 64)
        self.embed_position = tf.keras.layers.Embedding(m, 64)

    def call(self, input):
        indices = tf.range(0, input.shape[1])
        token_embedding = self.embed_token(input)
        position_embedding = self.embed_position(indices)
        return token_embedding + position_embedding

In [119]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(4, 64)
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(64)
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)
        self.layer_normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, input, training):
        x = self.mha(input, input, use_causal_mask=True)
        x = self.dropout1(x, training=training)
        x = tf.math.add(x, input)
        x = self.layer_normalization1(x)
        y = self.dense1(x)
        y = self.dense2(y)
        y = self.dropout2(y)
        x = tf.math.add(x, y)
        return self.layer_normalization2(x)




In [132]:
class Transformer(tf.keras.Model):
    def __init__(self, tokenizer):
        super().__init__()

        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
        self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.tokenizer = tokenizer
        
        self.metrics_list = [
            tf.keras.metrics.SparseCategoricalCrossentropy(name="loss"),
            tf.keras.metrics.CategoricalAccuracy(name="accuracy")
        ]
        self.embedding = Embedding()
        self.transformer_block = TransformerBlock()
        self.out = tf.keras.layers.Dense(2000)

    def call(self, input, training):
        x = self.embedding(input)
        x = self.transformer_block(x)
        return self.out(x)
    
    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_states()

    def train_step(self, data):
        input, target = data

        with tf.GradientTape() as tape:
            output = self(input, training=True)
            loss = self.loss_function(target, output)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        for metric in self.metrics_list:
            metric.update_state(target, output)

        return {m.name : m.result() for m in self.metrics}
    
    def generate_text(self, prompt, output_length, top_k):
        prompt = self.tokenizer.tokenize(prompt)
        prompt = tf.expand_dims(prompt, 0)

        while prompt.shape[1] < output_length:
            paddings = tf.constant([[0, 0],
                        [max(self.sequence_length - prompt.shape[1], 0), 0]])
      
        padded_prompt = tf.pad(prompt, paddings, constant_values=-1)
        logits = self(padded_prompt)
        top_logits, top_indices = tf.math.top_k(logits, k=top_k, sorted=True)
        next_token = tf.random.categorical(top_logits, 1)
        next_token = top_indices[0, tf.squeeze(next_token)]
        next_token = tf.reshape(next_token, (1,1))
        prompt = tf.concat([prompt, next_token], axis = 1)
        
        return self.tokeniser.detokenize(prompt)

In [136]:
def training_loop(model, train_ds):
    for e in range(100):
        print(f"Epoch {e}:")

        for data in tqdm.tqdm(train_ds):
            metrics = model.train_step(data)

        for metric in model.metrics:
            print(f"{metric.name}: {metric.result()}")

        model.reset_metrics()

In [138]:
model = Transformer(tokenizer)

training_loop(model, train_dataset)

Epoch 0:


  0%|          | 18/23115 [00:02<50:15,  7.66it/s] 


KeyboardInterrupt: 