In [31]:
import numpy as np 
import tensorflow as tf
import re
from tensorflow.data import Dataset
import tensorflow.keras as tfk
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from tokenizers import Tokenizer
from tqdm import tqdm_notebook as tqdm
import pandas as pd

In [32]:
# Show the GPU devices TensorFlow can see (the result is the cell's displayed output).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [33]:
# Load the original GPT-2 fast tokenizer from disk and display its vocabulary size.
tokenizer = GPT2TokenizerFast.from_pretrained('../model/en_tokenizer')
tokenizer.vocab_size

50257

In [34]:
# Load the tokenizer stored in oscar-corpus-tokenizer.json (the source of the new vocabulary)
# and display its vocabulary size.
oscar_corpus_tokenizer = Tokenizer.from_file('../model/oscar-corpus-tokenizer.json')
oscar_corpus_tokenizer.get_vocab_size()

50257

### Load the GPT-2 model

In [35]:
# Load the pretrained TF GPT-2 LM-head model from ../model/ and display the number of
# entries in model.transformer.h (the transformer blocks).
model = TFGPT2LMHeadModel.from_pretrained('../model/')
len(model.transformer.h)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


12

In [36]:
# Snapshot the current input-embedding matrix as a plain NumPy array.
weights = model.transformer.get_input_embeddings().weight.numpy()

Model parameters: shapes of the variables in the first transformer block.

In [37]:
# Shape of every variable held by the first transformer block.
[variable.shape for variable in model.transformer.h[0].variables]

[TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 2304]),
 TensorShape([1, 2304]),
 TensorShape([768, 768]),
 TensorShape([1, 768]),
 TensorShape([768]),
 TensorShape([768]),
 TensorShape([768, 3072]),
 TensorShape([1, 3072]),
 TensorShape([3072, 768]),
 TensorShape([1, 768])]

## Create new embedding matrix

We first compute the mean of all existing embeddings; it serves as the initial value for tokens that are not in the old vocabulary.

In [38]:
# Average over the token axis to obtain a single embedding-sized vector.
mean_weights = tf.math.reduce_mean(input_tensor=weights, axis=0).numpy()
mean_weights.shape

(768,)

Create new embedding matrix with new vocab

In [39]:
# Token -> id mappings of the new tokenizer and of the original GPT-2 tokenizer.
new_vocab = oscar_corpus_tokenizer.get_vocab()
old_vocab = tokenizer.get_vocab()

In [40]:
# Smoke-test the new tokenizer on one sample sentence and display the resulting token ids.
enc = oscar_corpus_tokenizer.encode('hala mao ba?<|endoftext|>')
enc.ids

[2288, 376, 334, 32, 0]

In [41]:
# Encode two sample sentences in a single call and collect their token ids.
encoding = oscar_corpus_tokenizer.encode_batch([
    'naunsa ka<|endoftext|>',
    'hello nimo dodong kahibaw ka<|endoftext|>',
])
foo = [item.ids for item in encoding]
foo

[[20568, 276, 0], [10760, 1331, 21099, 35894, 276, 0]]

In [42]:
# Zero-filled float32 buffer: one row per new-vocabulary token, one column per embedding dimension.
new_embeddings = np.zeros((len(new_vocab), mean_weights.shape[0]), dtype=np.float32)
new_embeddings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [43]:
in_en_vocab, not_in_en_vocab = [], []
for token, new_row in tqdm(new_vocab.items()):
    old_row = old_vocab.get(token, -1)
    shared = old_row >= 0
    # Tokens shared with the old vocabulary keep their pretrained vector;
    # every other token starts from the mean embedding.
    new_embeddings[new_row, :] = weights[old_row, :] if shared else mean_weights
    (in_en_vocab if shared else not_in_en_vocab).append(token)



  0%|          | 0/50257 [00:00<?, ?it/s]

In [44]:
# Number of new-vocabulary tokens found in / missing from the old vocabulary, one per line.
print(len(in_en_vocab), len(not_in_en_vocab), sep='\n')

10244
40013


Set the new embeddings on the model.

In [45]:
# Copy the remapped matrix into the existing embedding variable in place.
# NOTE(review): passing a tf.constant to set_input_embeddings() replaces the layer's
# tf.Variable with a constant tensor, which can no longer receive gradient updates even
# though the embeddings are meant to stay trainable during fine-tuning. Both
# vocabularies have the same size here, so an in-place assign keeps the variable intact.
# The trailing semicolon suppresses the notebook display of the returned variable.
model.transformer.get_input_embeddings().weight.assign(new_embeddings);

test out model

In [46]:
# Encode two sample sentences as one padded batch of token ids.
sample_texts = ['kamusta naman ka?<|endoftext|>', 'maayong buntag!<|endoftext|>']
oscar_corpus_tokenizer.enable_padding()
encoded = oscar_corpus_tokenizer.encode_batch(sample_texts)
inputs = np.array([sample.ids for sample in encoded])
# print(inputs)

# Forward pass left disabled:
# result = model(inputs)
# print(result.logits)

### Freezing weights for fine tuning

In [None]:
# enable_truncation() needs an explicit max_length; calling it with no arguments fails.
# 1024 matches the limit this notebook sets again right before preprocessing.
oscar_corpus_tokenizer.enable_truncation(max_length=1024)

For this implementation, freeze only the inner transformer blocks. Do not freeze the final layer norm (`ln_f`), the token embeddings (`wte`), or the position embeddings (`wpe`).

In [47]:
def freeze_weights_vanilla(model):
    """Freeze every transformer block but keep embeddings and the final layer norm trainable.

    Sets ``trainable = False`` on each entry of ``model.transformer.h`` and
    ``trainable = True`` on ``wte``, ``wpe`` and ``ln_f``. The model is modified
    in place and nothing is returned.
    """
    backbone = model.transformer
    for block in backbone.h:
        block.trainable = False
    for attr_name in ('wte', 'wpe', 'ln_f'):
        getattr(backbone, attr_name).trainable = True
    
# Apply the freeze, then confirm that no transformer block is still trainable.
freeze_weights_vanilla(model)
all(not block.trainable for block in model.transformer.h)


True

### train-test split

In [60]:
# Strip the trailing newline and bracketed numeric markers such as "[12]" from each line.
ptt = re.compile(r'\n$|\[\d+\]')
# Decode explicitly as UTF-8: the corpus contains non-ASCII characters, and the
# platform default encoding is not guaranteed to handle them.
with open('../shuff-dedup/ceb/ceb_dedup.txt', 'r', encoding='utf-8') as f:
    text_lines = [ptt.sub('', line) for line in tqdm(f)]
df_txt = pd.DataFrame({'document': text_lines})

# Append the end-of-text token to every document.
df_txt['truncated_doc'] = df_txt['document'] + '<|endoftext|>'
df_txt.head()

0it [00:00, ?it/s]

Unnamed: 0,document,truncated_doc
0,matamwa lg ko mayung gabe sa tanan....balikbay...,matamwa lg ko mayung gabe sa tanan....balikbay...
1,Maayong kaagahon liwat diri sa payag ICCC...na...,Maayong kaagahon liwat diri sa payag ICCC...na...
2,"tani ari si pre idol toto jericp SR, para siya...","tani ari si pre idol toto jericp SR, para siya..."
3,Kapital sa munisipyo ang Āmol (Pinulongang Per...,Kapital sa munisipyo ang Āmol (Pinulongang Per...
4,↑ Kalkulado gikan sa pakigbingkil sa tanan nga...,↑ Kalkulado gikan sa pakigbingkil sa tanan nga...


In [64]:
# Cap every encoding produced by the tokenizer at 1024 tokens.
oscar_corpus_tokenizer.enable_truncation(max_length=1024)
def PreprocessData(ids):
    """Tokenize the selected documents into shifted (input, label) pairs for LM training.

    Args:
        ids: Integer row positions into ``df_txt``. May be a list, a NumPy array,
            an eager tensor (as handed over by ``tf.py_function``) or a single integer.

    Returns:
        Tuple ``(input_ids, labels, attn_mask)`` of post-padded 2-D arrays with one
        row per document. ``labels`` holds the same tokens as ``input_ids`` shifted
        left by one position; ``attn_mask`` is padded with 0.

    Raises:
        ValueError: If the tokenizer has no ``<pad>`` token to pad with.
    """
    # Normalise to a 1-D NumPy array so positional indexing always yields a Series,
    # whatever container the ids arrived in.
    row_ids = np.atleast_1d(np.asarray(ids))
    docs = (df_txt['document'].iloc[row_ids] + '<|endoftext|>').tolist()

    pad_token_id = oscar_corpus_tokenizer.token_to_id('<pad>')
    if pad_token_id is None:
        raise ValueError("tokenizer has no '<pad>' token to pad with")

    input_ids, labels, attn_mask = [], [], []
    for doc in docs:
        encoded = oscar_corpus_tokenizer.encode(doc)
        # Next-token prediction: the token at position t is scored against token t + 1.
        input_ids.append(encoded.ids[:-1])
        labels.append(encoded.ids[1:])
        attn_mask.append(encoded.attention_mask[:-1])

    input_ids = pad_sequences(input_ids, value=pad_token_id, padding='post')
    labels = pad_sequences(labels, value=pad_token_id, padding='post')
    attn_mask = pad_sequences(attn_mask, value=0, padding='post')

    return input_ids, labels, attn_mask
    


In [67]:
num_lines = len(text_lines)
train_split = 0.8
train_num_docs = int(num_lines * train_split)
# Fixed seed so the train/test partition is identical on every Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
# Sample training rows without replacement; the remaining rows form the test set.
train_ids = split_rng.choice(num_lines, train_num_docs, replace=False)
test_ids = np.setdiff1d(np.arange(num_lines), train_ids)



In [68]:
# Sizes of the training and test partitions, one per line.
print(len(train_ids), len(test_ids), sep='\n')

60176
15044


In [69]:
# NOTE(review): disabled exploration kept as a bare string literal, so none of it runs.
# If re-enabled, it would preprocess the training ids one document at a time and
# summarise the resulting sequence lengths with describe().
"""tst_df = Dataset.from_tensor_slices(train_ids) \
    .batch(1) \
    .map(lambda x: tf.py_function(PreprocessData, [x], [tf.int32, tf.int32, tf.int32]))

length = []
for inp, _, __ in tqdm(tst_df):
    length.append(inp.shape[1])
pd.Series(length).describe()
"""

In [None]:
train_batch_size = 1
# Pipeline: training row ids -> shuffle -> batch -> tokenize on the fly via PreprocessData.
df_train = (
    Dataset.from_tensor_slices(train_ids)
    .shuffle(10000)
    .batch(train_batch_size)
    .map(lambda batch_ids: tf.py_function(PreprocessData, [batch_ids], [tf.int32] * 3))
)

## Training routine

In [None]:
def cross_entropy_loss(y_true, y_pred, mask=None):
    """Mean sparse softmax cross-entropy between integer labels and logits.

    Args:
        y_true: Integer class ids, one per scored position.
        y_pred: Unnormalised logits with one extra trailing class axis.
        mask: Optional tensor shaped like ``y_true``; positions where it is 0
            (e.g. padding) are excluded from the average. When omitted, every
            position counts, exactly as before.

    Returns:
        Scalar tensor with the (optionally masked) mean loss.
    """
    per_token = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
    if mask is None:
        return tf.reduce_mean(per_token)
    keep = tf.cast(mask, per_token.dtype)
    # divide_no_nan yields 0 instead of NaN when every position is masked out.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token * keep), tf.reduce_sum(keep))


In [None]:
epochs = 1
# Keep the learning rate a float: int(1e-4) evaluates to 0, which disables every update.
lr = 1e-4
num_batches = train_ids.shape[0] // train_batch_size
optimizer = tfk.optimizers.Adam(learning_rate=lr)

for i in range(epochs):
    # Per-epoch record of batch losses, kept for inspection after the loop.
    batch_losses = []
    # Only the first 10 batches are consumed per epoch.
    for inp, label, attn in tqdm(df_train.take(10)):
        with tf.GradientTape() as tape:
            # The attention mask must be the mask from preprocessing, not the labels.
            results = model(input_ids=inp, attention_mask=attn)
            loss = cross_entropy_loss(label, results.logits)

        # Differentiate and update outside the tape so these ops are not recorded.
        trainable_vars = model.trainable_variables
        grads = tape.gradient(loss, trainable_vars)
        optimizer.apply_gradients(zip(grads, trainable_vars))
        batch_losses.append(float(loss))
             