In [3]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Layer, Dense, Dropout, Embedding


# GPT only uses the decoder part of the Transformer architecture
# We can remove the Encoder and EncoderLayer classes

def positional_encoding(position, d_model):
    angle_rads = tf.range(position, dtype=tf.float32)[:, tf.newaxis] * 1 / tf.pow(10000, (2 * tf.range(0, d_model, dtype=tf.float32)) / d_model)
    angle_rads_even = tf.math.sin(angle_rads[:, 0::2])
    angle_rads_odd = tf.math.cos(angle_rads[:, 1::2])
    angle_rads = tf.stack([angle_rads_even, angle_rads_odd], axis=-1)
    angle_rads = tf.reshape(angle_rads, (-1, d_model))
    pos_encoding = angle_rads[tf.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

# Decoder layer
class GPTLayer(Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.15):
        super(GPTLayer, self).__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        
    def build(self, input_shape):
        self.mha._build_from_signature(input_shape, input_shape, input_shape)
        super(GPTLayer, self).build(input_shape)

    def call(self, x, training, mask):
        attn_output = self.mha(x, x, x, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

def create_look_ahead_mask(size):
    if size == 1:
        return tf.zeros((1, 1))
    else:
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask  # (seq_len, seq_len)

# GPT model
# TODO do I need to mask padding?
class GPT(Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, max_position_encoding, rate=0.15):
        super(GPT, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_position_encoding, d_model)

        self.gpt_layers = [GPTLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)

        self.final_layer = Dense(vocab_size)

    def create_masks(self, inp):
        padding_mask = tf.cast(tf.math.equal(inp, 0), tf.float32)
        look_ahead_mask = create_look_ahead_mask(tf.shape(inp)[1])
        combined_mask = tf.maximum(look_ahead_mask, padding_mask)
        return combined_mask


    def call(self, x, training):
        seq_len = tf.shape(x)[1]
        mask = self.create_masks(x)

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.gpt_layers[i](x, training, mask)

        final_output = self.final_layer(x)
        last_position_logits = final_output[:, -1, :]
        return last_position_logits

In [1]:
import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Dropout, Embedding, Layer
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import adam_v2

print("TensorFlow version: ", tf.__version__)
print("Connecting to TPU...")
resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu='node-8',zone='us-central1-f')
strategy = tf.distribute.TPUStrategy(resolver)
print("Done!")
print("Number of accelerators: ", strategy.num_replicas_in_sync)

TensorFlow version:  2.12.0
Connecting to TPU...
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Initializing the TPU system: node-8


INFO:tensorflow:Initializing the TPU system: node-8


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


Done!
Number of accelerators:  8


In [4]:
# import the GPT2 tokenizer
from transformers import GPT2Tokenizer
# use the gpt2 tokenizer
tokenizer: GPT2Tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

with strategy.scope():
    # Create the GPT model using the provided parameters
    num_layers = 6
    d_model = 256
    num_heads = 8
    dff = 1024
    vocab_size = tokenizer.vocab_size
    max_position_encoding = 16
    dropout_rate = 0.2
    learning_rate = 5e-5
    batch_size = 8
    epochs = 3
    warmup_steps = 200
    model = GPT(num_layers, d_model, num_heads, dff, vocab_size, max_position_encoding, dropout_rate)

    # Define the optimizer, loss function, and metric
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    metric = tf.keras.metrics.SparseCategoricalCrossentropy()

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss_object, metrics=[metric])

In [5]:
data = ["What year was the signing of the Declaration of Independence? 1776",
"What year was the storming of the Bastille? 1789",
"What year was the Battle of Waterloo? 1815",
"What year was the assassination of Abraham Lincoln? 1865",
"What year was the invention of the telephone by Alexander Graham Bell? 1876",
"What year was the first successful powered airplane flight by the Wright brothers? 1903",
"What year was the sinking of the Titanic? 1912",
"What year was the beginning of World War I? 1914",
"What year was the Russian Revolution? 1917",
"What year was the end of World War I? 1918",
"What year was the stock market crash that led to the Great Depression? 1929",
"What year was the beginning of World War II? 1939",
"What year was the attack on Pearl Harbor? 1941",
"What year was the D-Day invasion during World War II? 1944",
"What year was the dropping of the atomic bombs on Hiroshima and Nagasaki? 1945",
"What year was the end of World War II? 1945",
"What year was the establishment of the United Nations? 1945",
"What year was the beginning of the Korean War? 1950",
"What year was the launch of Sputnik 1, the first artificial satellite? 1957",
"What year was the Cuban Missile Crisis? 1962",
"What year was the assassination of John F. Kennedy? 1963",
"What year was the first moon landing by Apollo 11? 1969",
"What year was the end of the Vietnam War? 1975",
"What year was the fall of the Berlin Wall? 1989",
"What year was the dissolution of the Soviet Union? 1991",
"What year was the terrorist attacks on September 11? 2001",
"What year was the beginning of the Iraq War? 2003",
"What year was the invention of the World Wide Web by Tim Berners-Lee? 1989",
"What year was the assassination of Martin Luther King Jr.? 1968",
"What year was the discovery of DNA's double helix structure by James Watson and Francis Crick? 1953",
"What year was the first human heart transplant performed by Dr. Christiaan Barnard? 1967",
"What year was the Chernobyl nuclear disaster? 1986",
"What year was the launch of the Hubble Space Telescope? 1990",
"What year was the Rwandan Genocide? 1994",
"What year was the Oklahoma City bombing? 1995",
"What year was the cloning of Dolly the sheep? 1996",
"What year was the death of Princess Diana? 1997",
"What year was the Euro currency introduced? 1999",
"What year was the Indian Ocean earthquake and tsunami? 2004",
"What year was the election of Pope Francis? 2013",
"What year was the Paris Agreement on climate change signed? 2016",
"What year was the Brexit referendum? 2016",
"What year was the first iPhone released? 2007",
"What year was the election of Donald Trump as the 45th President of the United States? 2016",
"What year was the completion of the Human Genome Project? 2003",
"What year was the founding of the World Health Organization? 1948",
"What year was the assassination of Archduke Franz Ferdinand? 1914",
"What year was the start of the California Gold Rush? 1848",
"What year was the completion of the Panama Canal? 1914",
"What year was the discovery of penicillin by Alexander Fleming? 1928",
"What year was the Montgomery Bus Boycott? 1955",
"What year was the assassination of Mahatma Gandhi? 1948",
"What year was the formation of the European Union? 1993",
"What year was the release of the first Harry Potter book by J.K. Rowling? 1997",
"What year was the start of the American Civil War? 1861"]

def create_tf_dataset(data, tokenizer):
    def split_input_target(input_string):
        parts = input_string.strip().split("? ")
        event, year = " ".join(parts[:-1]), int(parts[-1])
        return event, year

    events, years = zip(*[split_input_target(item) for item in data])
    
    # Encode events using GPT-2 tokenizer
    encoded_events = [tokenizer.encode(event) for event in events]
    
    events_max_length = max([len(event) for event in encoded_events])
    
    encoded_events = [[0] * (events_max_length - len(event)) + event for event in encoded_events]
    
    encoded_years = [int(year) for year in years]
    encoded_years = tf.expand_dims(encoded_years, -1)
    events_tensor = tf.data.Dataset.from_tensor_slices([encoded_events])
    years_tensor = tf.data.Dataset.from_tensor_slices([encoded_years])

    dataset = tf.data.Dataset.zip((events_tensor, years_tensor))

    return dataset
dataset = create_tf_dataset(data, tokenizer)
[print(item) for item in dataset.as_numpy_iterator()]
# dataset = dataset.batch(batch_size, drop_remainder=True)

(array([[    0,     0,     0, ..., 24720,   286, 20153],
       [    0,     0,     0, ...,   262, 17520,  8270],
       [    0,     0,     0, ...,  5838,   286, 36782],
       ...,
       [    0,     0,     0, ...,   262,  3427,  4479],
       [    0,     0,  2061, ...,    42,    13, 41558],
       [    0,     0,     0, ...,  1605,  7511,  1810]], dtype=int32), array([[1776],
       [1789],
       [1815],
       [1865],
       [1876],
       [1903],
       [1912],
       [1914],
       [1917],
       [1918],
       [1929],
       [1939],
       [1941],
       [1944],
       [1945],
       [1945],
       [1945],
       [1950],
       [1957],
       [1962],
       [1963],
       [1969],
       [1975],
       [1989],
       [1991],
       [2001],
       [2003],
       [1989],
       [1968],
       [1953],
       [1967],
       [1986],
       [1990],
       [1994],
       [1995],
       [1996],
       [1997],
       [1999],
       [2004],
       [2013],
       [2016],
       [2016],
      

Exception ignored in: <function Executor.__del__ at 0x7f1c68398700>
Traceback (most recent call last):
  File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/tensorflow/python/eager/executor.py", line 46, in __del__
    self.wait()
  File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/tensorflow/python/eager/executor.py", line 65, in wait
    pywrap_tfe.TFE_ExecutorWaitForAllPendingNodes(self._handle)
tensorflow.python.framework.errors_impl.OutOfRangeError: End of sequence


[None]

In [6]:
model.fit(dataset, epochs=1)

ValueError: in user code:

    File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in run_step
        outputs = model.train_step(data)
    File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/home/adrian_fagerland/.local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filepe0tk9ch.py", line 16, in tf__call
        x += self.pos_encoding[:, :seq_len, :]

    ValueError: Exception encountered when calling layer 'gpt' (type GPT).
    
    in user code:
    
        File "<ipython-input-3-0312b98c80e0>", line 87, in call  *
            x += self.pos_encoding[:, :seq_len, :]
    
        ValueError: Dimensions must be equal, but are 19 and 16 for '{{node gpt/add}} = AddV2[T=DT_FLOAT](gpt/mul, gpt/strided_slice_2)' with input shapes: [?,19,256], [1,16,256].
    
    
    Call arguments received by layer 'gpt' (type GPT):
      • x=tf.Tensor(shape=(None, 19), dtype=int32)
      • training=True


In [None]:
def predict_next_word(input_text, transformer, tokenizer, top_k=5, max_length=128):
    input_tokens_full = tokenizer.encode(input_text, return_tensors="tf")
    if input_tokens_full.shape[1] > max_length:
        input_tokens = input_tokens_full[:, -max_length:]
    else:
        input_tokens = input_tokens_full
    seq_len = input_tokens.shape[1]
    logits = transformer(input_tokens, training=False)
    logits = logits[0, :]  # Get the logits for the last token
    print(logits)
    top_k_indices = tf.math.top_k(logits, k=top_k).indices
    print(top_k_indices.numpy())
    top_k_tokens = [tokenizer.decode([token_id]) for token_id in top_k_indices.numpy()]
    
    return top_k_tokens


input_text = """Anarchism was in 1912,"""
predicted_words = predict_next_word(input_text, transformer, tokenizer, top_k=50)
print(f"Input: {input_text}")
print("Predicted next words:")
for i, word in enumerate(predicted_words):
    print(f"{i + 1}. {word}")

tf.Tensor(
[-0.85107505 -0.0668224  -0.75011593 ... -0.5247153  -0.77267295
 -0.6266086 ], shape=(50257,), dtype=float32)
[   11   290   262    13   355  1042   286   284 41661  2312   257   393
    12   319  1690   338 27770  2116 16171  1479   407   606   587  1912
 22849   276   340  1290  4318  3417   318 26177  6712 11009   422   281
 13568  1964 13584   469   959   357 21218  8876   998   351 39935 42856
   584  5734]
Input: Anarchism was in 1912,
Predicted next words:
1. ,
2.  and
3.  the
4. .
5.  as
6. ism
7.  of
8.  to
9.  anarchism
10.  These
11.  a
12.  or
13. -
14.  on
15.  often
16. 's
17.  communism
18.  self
19.  voluntary
20.  free
21.  not
22.  them
23.  been
24.  based
25.  cooperative
26. ed
27.  it
28.  far
29.  central
30.  described
31.  is
32.  anarchist
33.  institutions
34.  advocates
35.  from
36.  an
37.  harmful
38.  political
39.  mutual
40. ge
41. ier
42.  (
43.  unjust
44.  philosophy
45. arch
46.  with
47. managed
48.  anarchy
49.  other
50.  specificall