In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model, Sequential
import numpy as np
import transformers

In [2]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
bert = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert.trainable = False
bert2 = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert2.trainable = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, MultiHeadAttention, Dense, Concatenate, GlobalAveragePooling1D

In [4]:
!wget https://www.gutenberg.org/cache/epub/100/pg100.txt

--2024-12-19 04:55:33--  https://www.gutenberg.org/cache/epub/100/pg100.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5638516 (5.4M) [text/plain]
Saving to: ‘pg100.txt’


2024-12-19 04:55:34 (9.25 MB/s) - ‘pg100.txt’ saved [5638516/5638516]



In [5]:
!ls -ltrh /content/pg100.txt

-rw-r--r-- 1 root root 5.4M Dec  1 09:00 /content/pg100.txt


In [6]:
with open('/content/pg100.txt', 'r') as f:
  txt = f.read()

In [7]:
txt[:100]

'\ufeffThe Project Gutenberg eBook of The Complete Works of William Shakespeare\n    \nThis ebook is for the'

In [8]:
words = tokenizer.tokenize(txt)

In [9]:
x = np.array(["[CLS]"] + words).reshape([-1, 1])
y = np.array(words + ["[FIN]"]).reshape([-1, 1])

In [10]:
x = x[:50_000, ...]
y = y[:50_000, ...]

In [11]:
def tokenize_x_input(word):
   result = tokenizer(word[0], padding='max_length', return_tensors='tf', truncation=True, max_length=32)
   return tf.reshape(tf.concat([result['input_ids'], result['attention_mask']], axis=1), (64,))

def tokenize_y_input(word):
   return tf.reshape(tokenizer.convert_tokens_to_ids(word[0]), (1,))

def return_xy_data(data):
  for i in range(len(x)):
    input, output = next(data)
    yield tokenize_x_input(input), tokenize_y_input(output)

data = tf.data.Dataset.from_generator(
    lambda: return_xy_data(zip(x, y)),
    output_signature=(
        tf.TensorSpec(shape=(64,), dtype=tf.int32),
        tf.TensorSpec(shape=(1,), dtype=tf.int32)
    ))
data = data.cache().prefetch(buffer_size=tf.data.AUTOTUNE).batch(32)

In [12]:
for train, label in data.take(10):
  print(train.shape, label.shape)
  print('='*50)

(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)
(32, 64) (32, 1)


In [13]:
class PositionalEncoding(layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        self.encoding = self.positional_encoding(position, d_model)

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis],
                                     np.arange(d_model)[np.newaxis, :],
                                     d_model)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def call(self, inputs):
        return inputs + self.encoding[:, :tf.shape(inputs)[1], :]


In [14]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_size, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size)
        self.ffn = Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_size),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [15]:
class CusotmModel(Model):
  def __init__(self, tokenizer, distills):
    super().__init__()
    self.tokenizer = tokenizer
    self.distills = distills
    self.decoder_model = self.create_decoder((32, 1536))

  def encoder(self, x):
    concat = Concatenate()
    result = []
    for distill in self.distills:
      result.append(distill(x)[0])
    return concat(result)

  def create_decoder(self, input_shape):
    inputs = layers.Input(shape=input_shape)
    x = PositionalEncoding(position=32, d_model=1536)(inputs)
    x = TransformerBlock(embed_size=1536, num_heads=8, ff_dim=2048)(x, training=True)
    x = GlobalAveragePooling1D()(x)
    outputs = layers.Dense(self.tokenizer.vocab_size, activation='softmax')(x)
    return Model(inputs=inputs, outputs=outputs)

  def decoder(self, x):
    return self.decoder_model(x)

  def call(self, x):
    x = (x[..., :32], x[..., 32:])
    e = self.encoder(x)
    return self.decoder(e)


In [16]:
model = CusotmModel(tokenizer, [bert, bert2])

In [17]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
for train, label in data.take(1):
  token_ids = model(train)
  decoded_texts = tokenizer.decode(tf.argmax(token_ids, axis=1)[1], skip_special_tokens=True)
  print(decoded_texts)
  print(token_ids.shape)
  print(label.shape)

##tre
(32, 30522)
(32, 1)


In [20]:
model.fit(data, epochs=20)

Epoch 1/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1000 - loss: 5.8689
Epoch 2/20


  self.gen.throw(typ, value, traceback)


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1105 - loss: 5.7054
Epoch 3/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 232ms/step - accuracy: 0.1159 - loss: 5.5942
Epoch 4/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 233ms/step - accuracy: 0.1170 - loss: 5.4928
Epoch 5/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1193 - loss: 5.4052
Epoch 6/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1218 - loss: 5.3184
Epoch 7/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1257 - loss: 5.2407
Epoch 8/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1262 - loss: 5.1682
Epoch 9/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1290 - loss: 5.0961
Epo

<keras.src.callbacks.history.History at 0x788f0c222da0>

In [38]:
x = tokenize_x_input("airplane")
token_ids = model(tf.expand_dims(x, axis=0), training=False)
token_ids.shape

TensorShape([1, 30522])

In [39]:
decoded_texts = tokenizer.decode(tf.argmax(token_ids, axis=1)[0], skip_special_tokens=True)
decoded_texts

'paper'

In [77]:
for distill in model.distills:
  distill.trainable = True
model.decoder_model.trainable = False

In [43]:
model.fit(data, epochs=2)

Epoch 1/2
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 233ms/step - accuracy: 0.1362 - loss: 4.6581
Epoch 2/2


  self.gen.throw(typ, value, traceback)


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 232ms/step - accuracy: 0.1360 - loss: 4.6306


<keras.src.callbacks.history.History at 0x789040dfeaa0>

In [46]:
bert0 = model.distills[0].weights
bert1 = model.distills[1].weights

In [66]:
same = []
for i in range(len(bert1)):
  same.append(tf.reduce_all(tf.equal(bert0[1], bert1[1])).numpy())
sum(same)/len(same)

In [80]:
sum(same)/len(same)

1.0

In [78]:
for distill in model.distills:
  distill.trainable = True
model.decoder_model.trainable = True

In [79]:
model.fit(data, epochs=5)

Epoch 1/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 235ms/step - accuracy: 0.1378 - loss: 4.6107
Epoch 2/10


  self.gen.throw(typ, value, traceback)


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 233ms/step - accuracy: 0.1388 - loss: 4.5885
Epoch 3/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 232ms/step - accuracy: 0.1394 - loss: 4.5685
Epoch 4/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 233ms/step - accuracy: 0.1417 - loss: 4.5437
Epoch 5/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 232ms/step - accuracy: 0.1418 - loss: 4.5283
Epoch 6/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 232ms/step - accuracy: 0.1431 - loss: 4.5253
Epoch 7/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 232ms/step - accuracy: 0.1439 - loss: 4.5252
Epoch 8/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 232ms/step - accuracy: 0.1436 - loss: 4.5144
Epoch 9/10
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 232ms/step - accuracy: 0.1448 - loss: 4.4920
Epo

<keras.src.callbacks.history.History at 0x788e7aae2950>

In [82]:
same = []
for i in range(len(bert1)):
  same.append(tf.reduce_all(tf.equal(bert0[1], bert1[1])).numpy())
sum(same)/len(same)

1.0