In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.data import Dataset
import tensorflow.keras as tfk
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast
from tokenizers import Tokenizer
from tqdm import tqdm_notebook as tqdm
import pandas as pd

In [2]:
# Show the GPU devices TensorFlow can see; an empty list means no GPU is visible.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Load the GPT-2 fast tokenizer saved under ../model/en_tokenizer
# (presumably the original English vocabulary, judging by the path — confirm).
tokenizer = GPT2TokenizerFast.from_pretrained('../model/en_tokenizer')

In [4]:
# Vocabulary size of the tokenizer loaded from ../model/en_tokenizer.
tokenizer.vocab_size

50257

In [5]:
# Load the new tokenizer from its serialized JSON file; its vocabulary is the
# one the embedding matrix is re-indexed to further down.
oscar_corpus_tokenizer = Tokenizer.from_file('../model/oscar-corpus-tokenizer.json')

In [6]:
# Vocabulary size of the new tokenizer, for comparison with the old tokenizer's size.
oscar_corpus_tokenizer.get_vocab_size()

50257

### Load gpt2 model

In [7]:
# Load the GPT-2 language-model checkpoint stored in ../model/ as a TensorFlow model.
model = TFGPT2LMHeadModel.from_pretrained('../model/')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [8]:
# Number of entries in model.transformer.h (the layers frozen later for fine-tuning).
len(model.transformer.h)

12

In [9]:
# Snapshot the pretrained input-embedding matrix as a NumPy array so rows can
# be copied into the new vocabulary's matrix without touching the model.
embedding_layer = model.transformer.get_input_embeddings()
weights = embedding_layer.weight.numpy()

## Create new embedding matrix

We first compute the mean of the pretrained embeddings; it is used as the initial value for tokens that are not in the old vocab.

In [10]:
# Average the embedding matrix over axis 0 (its rows), giving one mean
# embedding vector; it is used below for tokens missing from the old vocab.
mean_weights = tf.reduce_mean(weights, axis = 0).numpy()
mean_weights.shape

(768,)

Create new embedding matrix with new vocab

In [11]:
# Vocabulary lookups for both tokenizers; the loop below uses them as
# token-string -> id mappings to match tokens across vocabularies.
new_vocab = oscar_corpus_tokenizer.get_vocab()
old_vocab = tokenizer.get_vocab()

In [12]:
# Allocate a zero-filled float32 matrix with one row per new-vocab token and
# the same embedding width as the pretrained model; rows are filled in below.
embedding_dim = mean_weights.shape[0]
new_embeddings = np.zeros((len(new_vocab), embedding_dim), dtype=np.float32)
new_embeddings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
# Fill the new embedding matrix row by row. Tokens that also exist in the old
# vocab reuse their pretrained embedding; every other token starts from the
# mean embedding. The two lists record which tokens fell into each group.
in_en_vocab, not_in_en_vocab = [], []
for token, new_id in tqdm(new_vocab.items()):
    old_id = old_vocab.get(token, -1)
    if old_id < 0:
        # Token unknown to the old vocab: initialise with the mean embedding.
        new_embeddings[new_id, :] = mean_weights
        not_in_en_vocab.append(token)
        continue
    # Token present in both vocabs: carry over its pretrained embedding.
    new_embeddings[new_id, :] = weights[old_id, :]
    in_en_vocab.append(token)



  0%|          | 0/50257 [00:00<?, ?it/s]

In [14]:
# Token counts: first those reused from the old vocab, then those initialised
# from the mean embedding.
print(len(in_en_vocab), len(not_in_en_vocab), sep='\n')

10244
40013


Set the new embeddings on the model.

In [15]:
# Write the new embedding matrix into the model's existing embedding variable.
#
# Handing a `tf.constant` to `set_input_embeddings` swaps the layer's weight
# for a plain tensor. The forward pass would then read from a value that is
# neither trainable nor tracked for checkpointing, even though the plan below
# is to keep `wte` trainable during fine-tuning. Assigning in place keeps the
# original `tf.Variable` (and everything tied to it) intact.
embedding_weight = model.transformer.get_input_embeddings().weight
if tuple(embedding_weight.shape) != new_embeddings.shape:
    # In-place assignment only works when old and new vocab sizes match.
    raise ValueError(
        f"new embeddings have shape {new_embeddings.shape}, "
        f"but the model expects {tuple(embedding_weight.shape)}"
    )
# Trailing `;` keeps the notebook from echoing the whole matrix as cell output.
embedding_weight.assign(new_embeddings);

In [16]:
# Turn on padding so every sequence in a batch is encoded to the same length
# and can be stacked into a rectangular array. Note this changes the
# tokenizer's state for all later `encode_batch` calls as well.
oscar_corpus_tokenizer.enable_padding()
sample_texts = ['naunsa naman ka diha?', 'okay ra ka?']
sample_encodings = oscar_corpus_tokenizer.encode_batch(sample_texts)
inputs = np.array([encoding.ids for encoding in sample_encodings])

In [17]:
# Smoke test: run the two sample sentences through the model with the new
# embeddings and display the output logits.
result = model(inputs)
result.logits

<tf.Tensor: shape=(2, 5, 50257), dtype=float32, numpy=
array([[[ -34.42376 ,  -33.993847,  -33.56087 , ...,  -39.639835,
          -40.578823,  -40.578823],
        [-109.75688 , -108.54434 , -108.19703 , ..., -113.4416  ,
         -115.05064 , -115.05064 ],
        [ -93.59099 ,  -91.837746,  -94.07659 , ..., -100.13715 ,
         -100.39827 , -100.39827 ],
        [-112.08247 , -111.186195, -111.84502 , ..., -118.42682 ,
         -118.531456, -118.531456],
        [ -96.70638 ,  -98.734085,  -99.56179 , ..., -105.150246,
         -105.75707 , -105.75707 ]],

       [[ -36.31277 ,  -34.806576,  -35.164993, ...,  -41.10214 ,
          -42.398705,  -42.398705],
        [ -81.05285 ,  -80.31885 ,  -80.06083 , ...,  -87.19076 ,
          -87.26317 ,  -87.26317 ],
        [ -80.86438 ,  -77.07138 ,  -79.10552 , ...,  -85.11696 ,
          -87.3448  ,  -87.3448  ],
        [ -96.84128 ,  -98.45837 ,  -98.81676 , ..., -104.71257 ,
         -106.112885, -106.112885],
        [ -93.81322 ,  -9

test predictions

In [18]:
# Normalise the logits over the last (vocabulary) axis, then take the most
# probable token id at every position of every sequence.
foo = tf.nn.softmax(result.logits, axis=-1)
tf.argmax(foo, axis=-1)

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[   12, 22996,    13,   362,   199],
       [   12,   326,    31,   199,    33]])>

### Freezing weights for fine tuning

For this implementation, only freeze the transformer blocks (`model.transformer.h`). Do not freeze the final layer norm (`ln_f`), `wte`, or `wpe`.

In [19]:
def freeze_weights_vanilla(model):
    """Freeze the transformer blocks while keeping the outer layers trainable.

    Every layer in ``model.transformer.h`` is marked non-trainable. The token
    embedding (``wte``), position embedding (``wpe``) and final layer norm
    (``ln_f``) are explicitly marked trainable.

    Args:
        model: Object exposing ``transformer.h``, ``transformer.wte``,
            ``transformer.wpe`` and ``transformer.ln_f``.

    Returns:
        None. The ``trainable`` flags are modified in place.
    """
    transformer = model.transformer
    for block in transformer.h:
        block.trainable = False
    for outer_layer in (transformer.wte, transformer.wpe, transformer.ln_f):
        outer_layer.trainable = True
    
# Apply the freeze, then confirm that no transformer block is still trainable.
freeze_weights_vanilla(model)
all(not block.trainable for block in model.transformer.h)


True

### train-test split

In [20]:
# Read the corpus with one document per line (trailing newlines are kept, as
# before). The encoding is stated explicitly because the text contains
# non-ASCII characters; without it, decoding depends on the platform's default
# locale and can fail or garble text on machines where that is not UTF-8.
with open('../shuff-dedup/ceb/ceb_dedup.txt', 'r', encoding='utf-8') as f:
    text_lines = [line for line in tqdm(f)]
df_txt = pd.DataFrame({'document': text_lines})
df_txt.head()

0it [00:00, ?it/s]

Unnamed: 0,document
0,matamwa lg ko mayung gabe sa tanan....balikbay...
1,Maayong kaagahon liwat diri sa payag ICCC...na...
2,"tani ari si pre idol toto jericp SR, para siya..."
3,Kapital sa munisipyo ang Āmol (Pinulongang Per...
4,↑ Kalkulado gikan sa pakigbingkil sa tanan nga...


In [21]:
def PreprocessData(ids):
    """Tokenize the documents found at the given row positions of ``df_txt``.

    Args:
        ids: Integer row positions, used with ``df_txt.iloc``.

    Returns:
        A pair ``(x, attn_mask)`` of NumPy arrays built from the token ids and
        the attention masks of the encoded documents. The arrays are only
        rectangular if the tokenizer pads each batch to a common length
        (assumes ``enable_padding()`` has been called on it beforehand).
    """
    batch_docs = df_txt.iloc[ids, :]["document"].tolist()
    encodings = oscar_corpus_tokenizer.encode_batch(batch_docs)

    token_ids, masks = [], []
    for encoding in encodings:
        token_ids.append(encoding.ids)
        masks.append(encoding.attention_mask)

    return np.array(token_ids), np.array(masks)



In [22]:
# Split document row positions into disjoint train/test sets.
#
# The document count comes from the DataFrame that the ids will index into.
# Re-reading the corpus through a bare `open(...)` left a file handle that was
# never closed. A fixed seed makes the split reproducible across kernel
# restarts.
SPLIT_SEED = 42
num_lines = len(df_txt)
train_split = 0.8
train_num_docs = int(num_lines * train_split)
split_rng = np.random.default_rng(SPLIT_SEED)
# Sample without replacement so no document appears twice in the training set.
train_ids = split_rng.choice(num_lines, train_num_docs, replace=False)
# Everything not drawn for training becomes the test set.
test_ids = np.setdiff1d(np.arange(num_lines), train_ids)



In [23]:
# Sizes of the split: number of training documents, then number of test documents.
print(train_ids.shape[0], test_ids.shape[0], sep='\n')

60176
15044


In [24]:
train_batch_size = 8


def _encode_id_batch(doc_ids):
    """Tokenize one batch of document ids via the Python-side preprocessor."""
    return tf.py_function(PreprocessData, [doc_ids], [tf.int32, tf.int32])


# Pipeline over document ids: shuffle, group into batches, then tokenize each
# batch lazily so the text is only encoded when the batch is consumed.
df_train = (
    Dataset.from_tensor_slices(train_ids)
    .shuffle(10000)
    .batch(train_batch_size)
    .map(_encode_id_batch)
)

In [29]:
# Pull a single batch from the pipeline and print the shape of each of its
# components (token ids and attention mask).
foo = list(df_train.take(1))
first_batch = foo[0]
for component in first_batch:
    print(component.shape)

(8, 131)
(8, 131)


In [26]:
# Shapes of every variable held by the first transformer block.
first_block = model.transformer.h[0]
[variable.shape for variable in first_block.variables]

[TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 2304]),
 TensorShape([1, 2304]),
 TensorShape([768, 768]),
 TensorShape([1, 768]),
 TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 3072]),
 TensorShape([1, 3072]),
 TensorShape([3072, 768]),
 TensorShape([1, 768])]

## Training routine