## Transformer for Kwere

In [18]:
#imports
import os
import string
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [19]:
VOCAB_SIZE = 48
ALPHABET = [i for i in string.ascii_lowercase +string.digits + r' !"(),-.:;?'+ "'"]
#[ !"'(),-.0-9:;?a-z]


In [24]:
class BPETokenizer():
    # hugging face API code
    # is taken and modified from instructor's 
    # https://colab.research.google.com/drive/1-TgwCXqYd8ON-58TFzLk413mEqC-7r5F?usp=sharing#scrollTo=0AetkU9nu8OD
    
    def __init__(self,):
        self.tokenizer = Tokenizer(BPE(
        )) #byte pair encoding
        self.tokenizer.normalizer = Sequence([Lowercase()])  # normalization
        self.tokenizer.pre_tokenizer = ByteLevel() #pre-tokenizer
        self.tokenizer.decoder = ByteLevelDecoder() #decoder

    def bpe_train(self, paths):
        trainer = BpeTrainer(vocab_size=VOCAB_SIZE,
        initial_alphabet=ALPHABET,
        )
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)


In [36]:
kwere_path = ["./train-04/cwe-train.txt"]
swahili_path = ["./train-04/sw-train.txt"]
tokenizer = BPETokenizer()
tokenizer.bpe_train(kwere_path)
tokenizer.save_tokenizer("pretrained_cwe")







In [38]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./pretrained_cwe/")
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_embd=256,
    n_layer=4,
    n_head=4
)


In [78]:
# creating the model
model = TFGPT2LMHeadModel(config)
inputs = tokenizer.encode(
    "kuishi vema maisha ya kikristo", return_tensors="tf")
model(inputs)
model.summary()


Model: "tfgpt2lm_head_model_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 3434496   
 r)                                                              
                                                                 
Total params: 3,434,496
Trainable params: 3,434,496
Non-trainable params: 0
_________________________________________________________________


In [77]:
single_string = ''
for filename in kwere_path:
  with open(filename, "r", encoding='utf-8') as f:
    x = f.read()
  single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)


In [80]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [88]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[
              loss, *[None] * model.config.n_layer], metrics=[metric])


In [89]:
num_epoch = 10
history = model.fit(dataset, epochs=num_epoch)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
