In [1]:
import numpy as np 
import tensorflow as tf
import tensorflow.keras as tfk
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast
from tokenizers import Tokenizer
from tqdm import tqdm_notebook as tqdm

In [2]:
# Show which GPU devices TensorFlow can see before loading the model.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Load the original GPT-2 tokenizer from disk; its vocab is treated as the
# "old" vocab when the embedding matrix is remapped further down.
tokenizer = GPT2TokenizerFast.from_pretrained('../model/en_tokenizer')

In [4]:
# Size of the old vocab, for comparison with the new tokenizer's vocab size.
tokenizer.vocab_size

50257

In [5]:
# Load the tokenizer saved as ../model/oscar-corpus-tokenizer.json; its vocab
# is the "new" vocab that the embedding matrix is rebuilt for.
oscar_corpus_tokenizer = Tokenizer.from_file('../model/oscar-corpus-tokenizer.json')

In [6]:
# Size of the new vocab; it should match the old vocab size so the embedding
# matrix keeps the same shape.
oscar_corpus_tokenizer.get_vocab_size()

50257

### Load gpt2 model

In [8]:
# Load the GPT-2 language-model checkpoint stored in ../model/.
model = TFGPT2LMHeadModel.from_pretrained('../model/')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [9]:
# Number of transformer blocks in the loaded model.
len(model.transformer.h)

12

In [10]:
# Snapshot the current input-embedding matrix as a NumPy array. Each row is
# the embedding of one old-vocab token id.
weights = model.transformer.get_input_embeddings().weight.numpy()

## Create new embedding matrix

We first compute the mean of the existing embedding vectors; it is used as the initial value for tokens in the new vocab that are not in the old vocab.

In [11]:
# Average the embedding matrix over its first axis (one row per old-vocab
# token) to get a single mean embedding vector.
mean_weights = tf.reduce_mean(weights, axis = 0).numpy()
# Display the shape as a sanity check.
mean_weights.shape

(768,)

Create the new embedding matrix for the new vocab.

In [12]:
# Token -> id mappings for both tokenizers.
# old_vocab: ids index rows of the existing embedding matrix (`weights`).
old_vocab = tokenizer.get_vocab()
# new_vocab: ids index rows of the matrix being built.
new_vocab = oscar_corpus_tokenizer.get_vocab()

In [13]:
# Allocate a zero-filled float32 matrix with one row per new-vocab token and
# the same width as the existing embeddings; the rows are filled in below.
new_embeddings = np.zeros((len(new_vocab), mean_weights.shape[0]), dtype=np.float32)
new_embeddings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [14]:
# Fill the new embedding matrix one token at a time:
#   * token also present in the old vocab -> copy its existing embedding row
#   * token missing from the old vocab    -> start from the mean embedding
# The two lists record which tokens fell into each case.
in_en_vocab = []
not_in_en_vocab = []
for token, new_id in tqdm(new_vocab.items()):
    # -1 is a sentinel meaning "token is absent from the old vocab".
    old_id = old_vocab.get(token, -1)

    if old_id < 0:
        new_embeddings[new_id] = mean_weights
        not_in_en_vocab.append(token)
        continue

    new_embeddings[new_id] = weights[old_id]
    in_en_vocab.append(token)



  0%|          | 0/50257 [00:00<?, ?it/s]

In [15]:
# Report how many new-vocab tokens were found in the old vocab (first line)
# and how many were not (second line).
print(len(in_en_vocab), len(not_in_en_vocab), sep="\n")

10244
40013


Set the new embeddings.

In [16]:
# Write the remapped rows into the model's existing embedding variable in
# place instead of handing the model a `tf.constant`.
#
# NOTE(review): `set_input_embeddings(tf.constant(...))` appears to rebind the
# embedding layer's weight to a constant tensor rather than updating the
# variable, which would leave the token embeddings (wte) untrainable even
# though the fine-tuning setup below keeps wte trainable. Confirm against the
# installed transformers version.
#
# `assign` requires the new matrix to have the same shape as the current one.
# That holds here because both vocabularies have the same size; it raises if
# the shapes ever differ.
model.transformer.get_input_embeddings().weight.assign(new_embeddings)

In [17]:
# Encode two sample sentences with the new tokenizer. Padding is enabled so
# both id sequences have the same length and stack into one 2-D array.
oscar_corpus_tokenizer.enable_padding()
sample_texts = ['naunsa naman ka diha?', 'okay ra ka?']
inputs = np.array(
    [encoding.ids for encoding in oscar_corpus_tokenizer.encode_batch(sample_texts)]
)

In [19]:
# Run a forward pass on the sample batch using the remapped embeddings.
result = model(inputs)

Test predictions: take the highest-scoring token id at every position.

In [20]:
# Turn the logits into probabilities over the vocab (last axis), then pick the
# most probable token id at every position of every sequence.
foo = tf.nn.softmax(result.logits, axis=-1)
tf.argmax(foo, axis=-1)

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[   12, 22996,    13,   362,   199],
       [   12,   326,    31,   199,    33]])>

### Freezing weights for fine tuning

For this implementation, only freeze the transformer blocks (`model.transformer.h`), including the layer norms inside them. Do not freeze the final layer norm (`ln_f`), `wte`, or `wpe`.

In [36]:
def freeze_weights_vanilla(model):
    """Freeze the transformer blocks and keep the outer layers trainable.

    Sets ``trainable = False`` on every block in ``model.transformer.h`` and
    ``trainable = True`` on ``model.transformer.wte``, ``wpe`` and ``ln_f``.

    Args:
        model: Object exposing ``transformer.h`` (an iterable of blocks) and
            ``transformer.wte`` / ``transformer.wpe`` / ``transformer.ln_f``.

    Returns:
        None. The model is modified in place.
    """
    transformer = model.transformer

    # Freeze every block in the stack.
    for block in transformer.h:
        block.trainable = False

    # Explicitly mark the layers outside the stack as trainable.
    for layer_name in ("wte", "wpe", "ln_f"):
        getattr(transformer, layer_name).trainable = True
    
# Apply the freeze, then confirm that no transformer block is still trainable
# (the cell displays True when every block is frozen).
freeze_weights_vanilla(model)
not any(block.trainable for block in model.transformer.h)


True

### train-test split

In [45]:
# Inspect the shapes of every variable held by the first transformer block.
[variable.shape for variable in model.transformer.h[0].variables]

[TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 2304]),
 TensorShape([1, 2304]),
 TensorShape([768, 768]),
 TensorShape([1, 768]),
 TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 3072]),
 TensorShape([1, 3072]),
 TensorShape([3072, 768]),
 TensorShape([1, 768])]