In [3]:
import numpy as np 
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast
from tokenizers import Tokenizer
from tqdm import tqdm_notebook as tqdm

In [4]:
# List the physical GPU devices that TensorFlow can see in this environment.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
# Directory holding the saved GPT-2 tokenizer files.
EN_TOKENIZER_DIR = '../model/en_tokenizer'

# Load the original (old-vocabulary) tokenizer from disk.
tokenizer = GPT2TokenizerFast.from_pretrained(EN_TOKENIZER_DIR)

In [6]:
# Vocabulary size reported by the GPT-2 tokenizer loaded from '../model/en_tokenizer'.
tokenizer.vocab_size

50257

In [7]:
# Serialized tokenizer file that defines the new vocabulary.
OSCAR_TOKENIZER_FILE = '../model/oscar-corpus-tokenizer.json'

# Load the new-vocabulary tokenizer from its JSON file.
oscar_corpus_tokenizer = Tokenizer.from_file(OSCAR_TOKENIZER_FILE)

In [8]:
# Vocabulary size reported by the tokenizer loaded from oscar-corpus-tokenizer.json.
oscar_corpus_tokenizer.get_vocab_size()

25000

### Load the GPT-2 model

In [9]:
# Directory containing the saved GPT-2 checkpoint.
MODEL_DIR = '../model/'

# Load the TensorFlow GPT-2 language model from the local checkpoint.
model = TFGPT2LMHeadModel.from_pretrained(MODEL_DIR)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [10]:
# Grab the model's input-embedding layer, then copy its weight matrix out as a
# NumPy array. stop_gradient is applied to the values before the conversion.
input_embedding_layer = model.transformer.get_input_embeddings()
weights = tf.stop_gradient(input_embedding_layer.weight.value()).numpy()

In [11]:
# Shape of the extracted embedding matrix.
weights.shape

(50257, 768)

## Create new embedding matrix

We first compute the mean of the existing embeddings; it serves as the initial value for tokens that are not in the old vocabulary.

In [12]:
# Average the embedding matrix along axis 0, collapsing all rows into a single
# vector, then convert the result to NumPy.
mean_tensor = tf.reduce_mean(weights, axis=0)
mean_weights = mean_tensor.numpy()

# Display the shape of the mean embedding vector.
mean_weights.shape

(768,)

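Create the new embedding matrix for the new vocabulary: tokens that also exist in the old vocabulary reuse their existing embedding, and all other tokens start from the mean embedding.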

In [13]:
# Token -> id mapping of the original GPT-2 tokenizer.
old_vocab = tokenizer.get_vocab()

# Token -> id mapping of the new tokenizer.
new_vocab = oscar_corpus_tokenizer.get_vocab()

In [14]:
# Allocate a zero-filled float32 matrix with one row per token of the new
# vocabulary and as many columns as the mean embedding vector has entries.
embedding_dim = mean_weights.shape[0]
new_embeddings = np.zeros((len(new_vocab), embedding_dim), dtype=np.float32)
new_embeddings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
# Fill the new embedding matrix row by row, recording which tokens also exist
# in the old vocabulary and which do not.
in_en_vocab, not_in_en_vocab = [], []
for word, idx_new in tqdm(new_vocab.items()):
    # -1 marks a token that is missing from the old vocabulary.
    idx_old = old_vocab.get(word, -1)

    if idx_old < 0:
        # Token not in the old vocab: initialise its row with the mean embedding.
        new_embeddings[idx_new] = mean_weights
        not_in_en_vocab.append(word)
        continue

    # Token shared with the old vocab: copy its row from the old matrix.
    new_embeddings[idx_new] = weights[idx_old]
    in_en_vocab.append(word)



  0%|          | 0/25000 [00:00<?, ?it/s]

In [16]:
len(in_en_vocab)

7198

In [17]:
len(not_in_en_vocab)

17802