In [None]:
!pip install --quiet git+https://github.com/huggingface/transformers sentencepiece
!pip install --quiet datasets tiktoken wandb
!pip install --quiet sacrebleu evaluate
!pip install sentence_transformers

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
#@title Imports
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from transformers import SeamlessM4Tv2ForTextToText, AutoProcessor, SeamlessM4TTokenizer, GenerationConfig, AutoTokenizer
from datetime import datetime
from tokenizers import Tokenizer, pre_tokenizers, SentencePieceBPETokenizer, models, processors, ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from vllm import LLM, SamplingParams
import os
from tqdm.notebook import tqdm
from multiprocessing import Pool
import numpy as np
import nltk
import os
import pickle
import tiktoken
import wandb
import random
import json




In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True

In [None]:
tokenizer = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large").to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Convert the model's floating-point weights to half-precision (fp16) in place.
model.half()

# Enable gradient checkpointing.
# NOTE(review): the only use of `model` visible in this notebook is
# `model.generate(...)` under `torch.no_grad()`; no backward pass is run on it
# here, so confirm this call is still needed.
model.gradient_checkpointing_enable()

**Load Dataset**

In [None]:
dataset = load_dataset("roneneldan/TinyStories")

In [None]:
train_data = dataset['train']

In [None]:
train_data

Dataset({
    features: ['text'],
    num_rows: 2119719
})

In [None]:
# Keep the first 250,000 rows by index. Same rows as filtering on idx <= 249999,
# but without calling a Python lambda on every one of the ~2.1M examples.
exx_input = train_data.select(range(250000))

In [None]:
exx_input[0:2]

{'text': ['One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
  'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leav

In [None]:
train_texts = exx_input['text']

In [None]:
train_texts[0:2]

['One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that we

In [None]:
len(train_texts)

250000

In [None]:
ds = load_dataset("roneneldan/TinyStoriesInstruct")

**Chunking Stories**

In [None]:
class ChunkingStories(Dataset):
    """Minimal map-style dataset exposing a sequence of stories by index."""

    def __init__(self, stories):
        # Only a reference is kept; the sequence is not copied.
        self.stories = stories

    def __getitem__(self, idx):
        """Return the story stored at position ``idx``."""
        story = self.stories[idx]
        return story

    def __len__(self):
        """Return how many stories the dataset holds."""
        count = len(self.stories)
        return count

def chunk_stories(stories):
    """Flatten stories into one list of sentences with "" between stories.

    Each story is split with ``nltk.sent_tokenize``; an empty string is placed
    after every story as a boundary marker, and the final marker is removed so
    the list does not end with the marker of the last story.

    Args:
        stories: Iterable of story strings.

    Returns:
        list[str]: sentences in order, with "" separating consecutive stories.
    """
    flat = []
    for text in stories:
        flat += nltk.sent_tokenize(text)
        flat.append("")

    # Every iteration ends by appending "", so a non-empty list always ends
    # with the marker; slicing also handles the empty case.
    return flat[:-1]

def process_batch(batch):
    """Split one batch of stories into sentences by delegating to ``chunk_stories``.

    This is the callable that ``process_chunkstories`` submits to the worker
    pool via ``pool.apply_async``.
    """
    return chunk_stories(batch)

def process_chunkstories(stories, batch_size, num_workers):
    """Split every story into sentences using a pool of worker processes.

    Stories are grouped into batches of ``batch_size``; each batch is handed to
    ``process_batch`` in a ``multiprocessing.Pool`` of ``num_workers``
    processes. Results are collected in submission order and concatenated into
    one flat list, with an empty string ``""`` between consecutive stories -
    including across batch boundaries.

    Args:
        stories: Indexable sequence of story strings.
        batch_size: Number of stories per worker task.
        num_workers: Number of processes in the pool.

    Returns:
        list[str]: all sentences in story order, with "" marking story boundaries.
    """
    dataset = ChunkingStories(stories)
    # The DataLoader is used for batching only (num_workers=0); the parallelism
    # comes from the Pool below.
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)

    all_sentences = []

    with Pool(processes=num_workers) as pool:
        futures = [
            pool.apply_async(process_batch, (batch,))
            for batch in tqdm(dataloader, desc="Chunking Stories")
        ]

        for batch_index, future in enumerate(tqdm(futures, desc="Collecting Results")):
            sentences = future.get()
            # chunk_stories removes the trailing "" of each batch. Re-insert the
            # boundary between batches, otherwise the last story of one batch
            # and the first story of the next are merged when stories are
            # re-joined later.
            if batch_index > 0:
                all_sentences.append("")
            all_sentences.extend(sentences)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return all_sentences

In [None]:
chunked_stories = process_chunkstories(train_texts, batch_size=32, num_workers=32)

Chunking Stories:   0%|          | 0/7813 [00:00<?, ?it/s]

Collecting Results:   0%|          | 0/7813 [00:00<?, ?it/s]

In [None]:
len(chunked_stories)

5143535

In [None]:
chunked_stories[0:20]

In [None]:
# Persist the chunked sentences, one list entry per line. The encoding is
# explicit (as in the other text writes in this notebook) so the result does
# not depend on the platform default.
with open('chunked_stories.txt', 'w', encoding='utf-8') as f:
  for story in chunked_stories:
    f.write(story + '\n')

In [None]:
len(chunked_stories)

5143535

In [None]:
torch.cuda.empty_cache()

**Translating Stories**

In [None]:
class TranslatingStories(Dataset):
    """Map-style dataset over raw texts, tagged with a source and target language."""

    def __init__(self, texts, src_lang, tgt_lang):
        # Store the texts together with the language pair they belong to.
        self.texts, self.src_lang, self.tgt_lang = texts, src_lang, tgt_lang

    def __getitem__(self, idx):
        """Return the untokenized text at position ``idx``."""
        text = self.texts[idx]
        return text

    def __len__(self):
        """Return the number of texts."""
        count = len(self.texts)
        return count

In [None]:
def process_texts(batch, src_lang='eng'):
    """Collate a list of raw sentences into padded inputs via the global ``tokenizer``.

    Args:
        batch: List of strings forming one DataLoader batch.
        src_lang: Source-language code forwarded to the processor. Defaults to
            ``'eng'``, the value that used to be hard-coded.

    Returns:
        The processor output as PyTorch tensors (``return_tensors="pt"``),
        padded within the batch and truncated to ``max_length=512``.
    """
    return tokenizer(
        text=batch,
        src_lang=src_lang,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

def translate_stories(batch, tgt_lang='yor'):
    """Translate one tokenized batch with the global ``model`` and decode it to text.

    Args:
        batch: Mapping of input names to tensors, as returned by ``process_texts``.
        tgt_lang: Target-language code forwarded to ``model.generate``. Defaults
            to ``'yor'``, the value that used to be hard-coded.

    Returns:
        list[str]: decoded translations with special tokens removed.
    """
    inputs = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        generated_outputs = model.generate(
            **inputs,
            num_beams=4,
            early_stopping=True,
            tgt_lang=tgt_lang
        )
    return tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)

def process_in_batches(texts, batch_size=64, num_workers=16, src_lang='eng', tgt_lang='yor'):
    """Translate ``texts`` batch by batch and return the translations in input order.

    Args:
        texts: Indexable sequence of strings to translate.
        batch_size: Number of texts per batch.
        num_workers: DataLoader worker processes used for tokenization.
        src_lang: Source-language code (default ``'eng'``).
        tgt_lang: Target-language code (default ``'yor'``).

    Returns:
        list[str]: one translation per input text.
    """
    from functools import partial

    # The language pair stored on the dataset now drives tokenization and
    # generation; it was previously stored but never read.
    dataset = TranslatingStories(texts, src_lang, tgt_lang)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=partial(process_texts, src_lang=dataset.src_lang),
        num_workers=num_workers,
    )

    results = []
    for batch in tqdm(dataloader, desc="Translating"):
        batch_outputs = translate_stories(batch, tgt_lang=dataset.tgt_lang)
        results.extend(batch_outputs)
        torch.cuda.empty_cache()
    return results

In [None]:
input_texts = chunked_stories
translated_batches = process_in_batches(input_texts, batch_size=32, num_workers=4)

Translating:   0%|          | 0/160736 [00:00<?, ?it/s]

Buffered data was truncated after reaching the output size limit.

In [None]:
# translated_batches[0:100]

In [None]:
# with open('yuroba_translations.pkl','wb') as f:
#   pickle.dump(translated_batches, f)

In [None]:
# with open('translations.pkl','rb') as f:
#   translated_batches = pickle.load(f)

In [None]:
len(translated_batches)

5143535

In [None]:
translated_batches[0:20]

**Joining Stories**

In [None]:
def join_stories(tokenized_stories, separator=' '):
    """Rebuild whole stories from a flat list of sentences.

    Sentences are accumulated until a boundary marker is met; the accumulated
    sentences are then joined with ``separator`` into one story. A boundary
    marker is any entry that is empty or whitespace-only, so both the ``""``
    emitted by ``chunk_stories`` and the ``" "`` this function originally
    looked for are recognised. Consecutive markers never yield empty stories.

    Args:
        tokenized_stories: Iterable of sentence strings with boundary markers.
        separator: String placed between the sentences of one story.

    Returns:
        list[str]: one joined string per story, in input order.
    """
    joined_stories = []
    current_story = []

    for sentence in tokenized_stories:
        if sentence.strip():
            current_story.append(sentence)
        elif current_story:
            # Boundary reached: close the story collected so far.
            joined_stories.append(separator.join(current_story))
            current_story = []

    # The input does not end with a marker, so flush the last story.
    if current_story:
        joined_stories.append(separator.join(current_story))

    return joined_stories


In [None]:
joined_stories = join_stories(translated_batches,' ')

In [None]:
joined_stories[0:2]

['lọ́jọ́ kan, ọmọdébìnrin kan tó ń jẹ́ lily rí abẹ́rẹ́ kan nínú yàrá rẹ̀. Ó mọ̀ pé ó ṣòro láti fi ṣeré nítorí pé ó mú gan-an. Lily fẹ́ pín abẹ́rẹ́ náà pẹ̀lú màmá rẹ̀, kó lè se bọ́tìn kan sára ẹ̀wù rẹ̀. Lily lọ sọ́dọ̀ ìyá rẹ̀, ó sì sọ fún un pé, "Màmá, mo rí abẹ́rẹ́ yìí. Ṣé o lè bá mi pín in, kó o sì se ẹ̀wù mi?" "màmá rẹ̀ rẹ́rìn-ín músẹ́, ó sì sọ fún un pé: ""bẹ́ẹ̀ ni, lily, a lè jọ lo abẹ́rẹ́ náà, ká sì tún aṣọ rẹ ṣe. """ Wọ́n jọ fi abẹ́rẹ́ náà ṣe abẹ́rẹ́ náà, wọ́n sì tún se bọ́tìnnì tó wà lára aṣọ Lily. Kò ṣòro fún wọn nítorí pé wọ́n ń pín nǹkan pa pọ̀, wọ́n sì ń ran ara wọn lọ́wọ́. Lẹ́yìn tí wọ́n parí ọ̀rọ̀ wọn, Lily dúpẹ́ lọ́wọ́ màmá rẹ̀ pé ó fún òun ní abẹ́rẹ́ náà, ó sì tún aṣọ rẹ̀ ṣe. Inú wọn dùn gan-an nítorí pé wọ́n jọ ń ṣiṣẹ́ pa pọ̀.',
 'Ìgbà kan wà tí ọkọ̀ ayọ́kẹ́lẹ́ kékeré kan wà tí wọ́n ń pè ní Beep. Beep fẹ́ràn láti máa sáré kánkán, kó sì máa ṣeré lábẹ́ oòrùn. Beep jẹ́ ọkọ̀ ayọ́kẹ́lẹ́ tó gbámúṣé nítorí pé ó máa ń lo epo tó dára. Epo tó dára máa ń mú kí Beep láyọ̀, ó sì máa

In [None]:
# with open('joined_yoruba_stories.pkl','wb') as f:
#   pickle.dump(joined_stories, f)

# with open('joinedstories.pkl','wb') as f:
#   pickle.dump(joined_stories, f)

In [None]:
# with open('joined_yoruba_stories.pkl','rb') as f:
#   yor_data = pickle.load(f)

# with open('joinedstories.pkl','rb') as f:
#   zul_data = pickle.load(f)

In [None]:
#@title split into train and eval (both sets 250k)
# NOTE: `zul_data` and `yor_data` are not assigned by any active cell in this
# notebook; the only assignments are in the commented-out pickle-loading cell
# above, which must be run first.
random.seed(42)

# In-place shuffles. With a fixed seed, the order of these two calls determines
# both resulting orders.
random.shuffle(zul_data)
random.shuffle(yor_data)


split = int(0.9*len(zul_data))

#For Zulu dataset
# Training stories get an explicit end-of-story marker; eval stories do not.
zulu_stories = zul_data[:split]
zulu_stories = [story + ' <|endofstory|>' for story in zulu_stories]
zulu_stories = '\n\n'.join(zulu_stories)

eval_ZuluMax = zul_data[split:]
eval_ZuluMax = '\n\n'.join(eval_ZuluMax)

with open('eval_ZuluMax.txt', 'w', encoding='utf-8') as f:
  f.write(eval_ZuluMax)

with open('zulu_stories.txt', 'w', encoding='utf-8') as f:
  f.write(zulu_stories)

# Lengths are character counts of the joined text, not story counts.
print(f"Zulu stories Length: {len(zulu_stories)}")
print(f"Zulu evaluation Length: {len(eval_ZuluMax)}")


#For Yoruba dataset
split = int(0.9*len(yor_data))

yoruba_stories = yor_data[:split]
yoruba_stories = [story + ' <|endofstory|>' for story in yoruba_stories]
yoruba_stories = '\n\n'.join(yoruba_stories)

eval_YorubaMax = yor_data[split:]
eval_YorubaMax = '\n\n'.join(eval_YorubaMax)

with open('eval_YorubaMax.txt', 'w', encoding='utf-8') as f:
  f.write(eval_YorubaMax)

with open('yoruba_stories.txt', 'w', encoding='utf-8') as f:
  f.write(yoruba_stories)

print(f"Yoruba stories Length: {len(yoruba_stories)}")
print(f"Yoruba evaluation Length: {len(eval_YorubaMax)}")

Zulu stories Length: {226488925}
Zulu evalution Length: {24878226}
Yoruba stories Length: {227105696}
Yoruba evalution Length: {25014373}


In [None]:
#@title extracting 10,000 stories from 250k, and splitting into train and eval (both sets)
# NOTE: `zul_data` and `yor_data` must already exist in the kernel; no active
# cell in this notebook assigns them (see the commented-out pickle-loading cell).

random.seed(42)

# Draw 10,000 stories from each corpus without replacement.
zul_sample = random.sample(zul_data, 10000)
yor_sample = random.sample(yor_data, 10000)


split = int(0.9*len(zul_sample))

#For Zulu dataset
# Training stories get an explicit end-of-story marker; eval stories do not.
zulu_10kstories = zul_sample[:split]
zulu_10kstories = [story + ' <|endofstory|>' for story in zulu_10kstories]
zulu_10kstories = '\n\n'.join(zulu_10kstories)

eval_ZuluMini = zul_sample[split:]
eval_ZuluMini = '\n\n'.join(eval_ZuluMini)

with open('eval_ZuluMini.txt', 'w', encoding='utf-8') as f:
  f.write(eval_ZuluMini)

with open('zulu_10kstories.txt', 'w', encoding='utf-8') as f:
  f.write(zulu_10kstories)

# Lengths are character counts of the joined text, not story counts.
print(f"MiniZulu stories Length: {len(zulu_10kstories)}")
print(f"MiniZulu evaluation Length: {len(eval_ZuluMini)}")


#For Yoruba dataset
split = int(0.9*len(yor_sample))

yoruba_10kstories = yor_sample[:split]
yoruba_10kstories = [story + ' <|endofstory|>' for story in yoruba_10kstories]
yoruba_10kstories = '\n\n'.join(yoruba_10kstories)

eval_YorubaMini = yor_sample[split:]
eval_YorubaMini = '\n\n'.join(eval_YorubaMini)

with open('eval_YorubaMini.txt', 'w', encoding='utf-8') as f:
  f.write(eval_YorubaMini)

with open('yoruba_10kstories.txt', 'w', encoding='utf-8') as f:
  f.write(yoruba_10kstories)

print(f"MiniYoruba stories Length: {len(yoruba_10kstories)}")
print(f"MiniYoruba evaluation Length: {len(eval_YorubaMini)}")

MiniZulu stories Length: {9372459}
MiniZulu evalution Length: {1027087}
MiniYoruba stories Length: {9370684}
MiniYoruba evalution Length: {1041663}


**Pre-Processing & Training Translated Texts**

In [None]:
#@title zulu tokenizer code
# Fit a byte-level BPE tokenizer on the Zulu training text and save it as JSON.
zulu_tokeniser = ByteLevelBPETokenizer()
zulu_tokeniser.train(
    files='zulu_stories.txt',
    vocab_size=52000,
    min_frequency=2,
    special_tokens=["<unk>", "<cls>", "<sep>", "<pad>", "<mask>", "<|endofstory|>"],
)
zulu_tokeniser.save("zulu_tokenizer.json")
zulu_tokeniser.get_vocab_size()

# `vocab` and `stoi` name the same token -> id dict; `itos` is its inverse.
stoi = vocab = zulu_tokeniser.get_vocab()
itos = dict((idx, token) for token, idx in stoi.items())

# Metadata bundle describing the tokenizer, pickled to zulu_meta.pkl below.
meta_info = dict(
    vocab_size=zulu_tokeniser.get_vocab_size(),
    tokenizer_type='ByteLevelBPE',
    eos_token='<sep>',
    bos_token='<cls>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    end_of_story_token='<|endofstory|>',
    stoi=stoi,
    itos=itos,
    vocab=vocab,
    max_token_length=512,
    dataset_name='zulu_corpus',
    version='1.0',
    date_created=str(datetime.now()),
    tokenizer_file='zulu_tokenizer.json',
)

with open("zulu_meta.pkl", 'wb') as f:
    pickle.dump(meta_info, f)


In [None]:
#@title yoruba tokenizer code

# Fit a SentencePiece-style BPE tokenizer on the Yoruba training text and save it as JSON.
yor_tokeniser = SentencePieceBPETokenizer()
yor_tokeniser.train(
    files='yoruba_stories.txt',
    vocab_size=52000,
    min_frequency=2,
    special_tokens=["<unk>", "<cls>", "<sep>", "<pad>", "<mask>", "<|endofstory|>"],
)
yor_tokeniser.save("yoruba_tokenizer.json")
yor_tokeniser.get_vocab_size()

# `y_vocab` and `y_stoi` name the same token -> id dict; `y_itos` is its inverse.
y_stoi = y_vocab = yor_tokeniser.get_vocab()
y_itos = dict((idx, token) for token, idx in y_stoi.items())

# Metadata bundle describing the tokenizer, pickled to yoruba_meta.pkl below.
meta_info = dict(
    vocab_size=yor_tokeniser.get_vocab_size(),
    tokenizer_type='SentencePieceBPE',
    eos_token='<sep>',
    bos_token='<cls>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    end_of_story_token='<|endofstory|>',
    stoi=y_stoi,
    itos=y_itos,
    vocab=y_vocab,
    max_token_length=512,
    dataset_name='yoruba_corpus',
    version='1.0',
    date_created=str(datetime.now()),
    tokenizer_file='yoruba_tokenizer.json',
)

with open("yoruba_meta.pkl", 'wb') as f:
    pickle.dump(meta_info, f)


In [None]:
#@title Testing tokenizers
# Round-trip check for the Zulu tokenizer: encode a sentence containing the
# end-of-story marker, then decode it while keeping special tokens.
test_sentence = "Lena indaba yokuhlola. Ukuzama nokuhlola izinto. <|endofstory|> Ngizama nje ukuba muhle."
encoded = zulu_tokeniser.encode(test_sentence)
decoded = zulu_tokeniser.decode(encoded.ids, skip_special_tokens=False)

print(f"Encoded: {encoded.ids}")
print(f"Decoded: {decoded}")


# Same round trip for the Yoruba tokenizer, on text that contains the literal
# special tokens <unk> and <|endofstory|>. Note that `encoded` is reused here.
encoded = yor_tokeniser.encode("lọ́jọ́ kan, <unk> kan tó ń jẹ́ lily rí abẹ́rẹ́ kan nínú yàrá rẹ̀. <|endofstory|> \nÓ mọ̀ pé ó ṣòro láti fi ṣeré nítorí pé ó mú gan-an.")
# cleaned_tokens = [token.replace('Ġ', '').replace('Ċ', '') for token in encoded.tokens]
# print("\nEncoded tokens with <unk>:", cleaned_tokens)
print("Token IDs with <unk>:", encoded.ids)

decoded_text = yor_tokeniser.decode(encoded.ids, skip_special_tokens=False)
print("Decoded text:", decoded_text)

Encoded: [10897, 3347, 18527, 19, 1019, 606, 6238, 675, 19, 226, 5, 30861, 565, 459, 4479, 19]
Decoded: Lena indaba yokuhlola. Ukuzama nokuhlola izinto. <|endofstory|> Ngizama nje ukuba muhle.
Token IDs with <unk>: [770, 274, 139, 0, 167, 172, 158, 217, 2747, 208, 2969, 167, 266, 615, 279, 139, 5, 289, 107, 519, 174, 162, 1608, 207, 219, 271, 559, 174, 162, 237, 319]
Decoded text: lọ́jọ́ kan, <unk> kan tó ń jẹ́ lily rí abẹ́rẹ́ kan nínú yàrá rẹ̀. <|endofstory|> 
Ó mọ̀ pé ó ṣòro láti fi ṣeré nítorí pé ó mú gan-an.


In [None]:
!git clone https://github.com/karpathy/nanoGPT.git

In [None]:
!cd /content/nanoGPT/data/zulu/ && python prepare.py

train has 28,809,839 tokens
val has 7,200,689 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [  461   423   438  1029   843   641   613 19418    19 12722]
First 10 tokens in validation data: [  74  513  613  927   19 2775  867 7375 1202 4409]


In [None]:
!cd /content/nanoGPT/data/yoruba/ && python prepare.py

train has 41,056,131 tokens
val has 10,264,075 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [ 404  167  214  201  773  214  728 1300 1703  172]
First 10 tokens in validation data: [139   0 201 187 219 872 680 176 634 174]


In [None]:
!cat /content/nanoGPT/config/train_zulu_stories.py

# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-zulu'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'zulu model'

dataset = 'zulu'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

alway

In [None]:
#@title training zulu stories
!cd /content/nanoGPT && python train.py config/train_zulu_stories.py

Overriding config with config/train_zulu_stories.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-zulu'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'zulu model'

dataset = 'zulu'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-zulu --ckpoint='ckpt_iter_5000.pt' >>sample_ZuluMax.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


In [None]:
#@title training yoruba stories
!cd /content/nanoGPT && python train.py config/train_yoruba_stories.py

Overriding config with config/train_yoruba_stories.py:
out_dir = 'out-yoruba'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'yoruba model'

dataset = 'yoruba'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

always_save_checkpoint = True # if True, always save 

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-yoruba --ckpoint='ckpt_iter_5000.pt' >>sample_YorubaMax.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


**Preprocessing, Training and sampling from 10k dataset**

In [None]:
#@title Zulu data
!cd /content/nanoGPT/data/zulu_10k/ && python prepare.py

train has 1,193,167 tokens
val has 297,977 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [  461   423   438   695   679   916   494 18019    19  2549]
First 10 tokens in validation data: [   74   280  2397    19   509   941  6447   360 46833   276]


In [None]:
!cd /content/nanoGPT && python train.py config/train_zulu10k.py

Overriding config with config/train_zulu10k.py:
out_dir = 'out-zulu10k'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'zulu 10k'

dataset = 'zulu_10k'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

always_save_checkpoint = True # if True, always save a checkp

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-zulu10k --ckpoint='ckpt_iter_5000.pt' >>sample_ZuluMin.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


In [None]:
#@title Yoruba data
!cd /content/nanoGPT/data/yoruba_10k/ && python prepare.py

train has 1,694,424 tokens
val has 424,016 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [2759  213  238  224 2486  187  161  183  158  307]
First 10 tokens in validation data: [ 243 1361 4124  248  639  158  182  266 2469 1948]


In [None]:
!cd /content/nanoGPT && python train.py config/train_yor10k.py

Overriding config with config/train_yor10k.py:
out_dir = 'out-yoruba10k'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'yoruba 10k'

dataset = 'yoruba_10k'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

always_save_checkpoint = True # if True, always save a c

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-yoruba10k --ckpoint='ckpt_iter_5000.pt' >>sample_YorubaMin.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


**Prompting a Multilingual Model to generate stories**

In [None]:
#@title Prompting Afro Llama to generate zulu stories
class JHModel:
    """Wrapper around the Jacaranda/AfroLlama_V1 checkpoint served through vLLM.

    Builds chat-formatted prompts (system + user turns) and returns the text of
    a single sampled completion per instruction.
    """

    def __init__(self):
        model_id = "Jacaranda/AfroLlama_V1"
        self.eval_model = model_id
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.ft_model = LLM(model=model_id, tokenizer=model_id, tensor_parallel_size=1)

        # Token ids at which generation stops: the tokenizer's EOS and <|eot_id|>.
        eos_id = self.tokenizer.eos_token_id
        eot_id = self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        self.terminators = [eos_id, eot_id]

        # Prompt templates for each chat role.
        self.DEFAULT_SYSTEM_PROMPT = ""
        self.system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
        self.user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
        self.assistant_format = '{content}<|eot_id|>'

    def generate_prompt_new(self, instruction):
        """Return the full prompt: default system turn followed by the user turn."""
        system_part = self.system_format.format(content=self.DEFAULT_SYSTEM_PROMPT)
        user_part = self.user_format.format(content=instruction)
        return system_part + user_part

    def generate(self, instruction):
        """Sample one completion for ``instruction`` and return its text."""
        sampling = SamplingParams(
            temperature=0.7,
            top_k=40,
            top_p=0.9,
            max_tokens=512,
            presence_penalty=1.0,
            stop_token_ids=self.terminators,
            stop=["<|eot_id|>", "<|end_of_text|>"],
        )
        prompt = self.generate_prompt_new(instruction)

        with torch.no_grad():
            outputs = self.ft_model.generate([prompt], sampling, use_tqdm=False)
        # One prompt was submitted, so take its first (only) completion.
        return outputs[0].outputs[0].text

def generate_zulu_stories(model, num_stories, output_file):
    """Generate ``num_stories`` Zulu stories and write them to ``output_file``.

    The ten prompts are used round-robin. Each story is written and flushed as
    soon as it is generated, so an interrupted or failed run keeps the stories
    produced so far instead of losing all of them.

    Args:
        model: Object exposing ``generate(prompt)`` that returns the story text.
        num_stories: Number of stories to generate.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    prompts = [
        "Bhala indaba yezingane ngesiZulu emayelana noLily noMax abathola isipho esingalindelekile, inesilungiso esihle.",
        "Xoxa inganekwane yezingane ngesiZulu lapho uEmma edinga ukuxolisa kumngani wakhe uThabo, inesilungiso esibuhlungu.",
        "Bhala indaba emfushane yezingane ngesiZulu lapho uZandile noOliver behlangana nesilwane esikhulumayo, inesilungiso esimangalisayo.",
        "Xoxa indaba yezingane ngesiZulu emayelana noNomsa ofunda isifundo esibalulekile ngokwabelana.",
        "Bhala inganekwane yezingane ngesiZulu lapho uJack noAmahle betholana endaweni emangalisayo, eneqhinga elihlekisayo.",
        "Xoxa indaba yezingane ngesiZulu emayelana nobungane obusha phakathi kukaLiam noZinhle.",
        "Bhala indaba yezingane ngesiZulu lapho uSipho encoba ubunzima ukuze afeze iphupho lakhe.",
        "Xoxa inganekwane yezingane ngesiZulu emayelana noAva noKwezi abathola indlela entsha yokuxazulula inkinga, inesilungiso esingalindelekile.",
        "Bhala indaba emfushane yezingane ngesiZulu lapho uNeo noSarah befunda ukubaluleka kokuxolelana.",
        "Xoxa indaba yezingane ngesiZulu emayelana noEthan noZama abafunda ukubekezela ngenkathi belinde isipho esikhethekile."
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(num_stories):
            prompt = prompts[i % len(prompts)]
            story = model.generate(prompt)
            # Same "Story N:" layout as before, but persisted immediately.
            f.write(f"Story {i+1}:\n{story}\n\n")
            f.flush()
            print(f"Generated story {i+1}/{num_stories}")

    print(f"Generated {num_stories} stories and saved them to {output_file}")

def main():
    """Instantiate ``JHModel`` and generate 10,000 Zulu stories into 'generated_zulu_stories.txt'."""
    model = JHModel()
    num_stories = 10000
    output_file = 'generated_zulu_stories.txt'
    generate_zulu_stories(model, num_stories, output_file)

# In a notebook kernel `__name__` is "__main__", so this runs when the cell executes.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 09-13 20:40:39 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Jacaranda/AfroLlama_V1', speculative_config=None, tokenizer='Jacaranda/AfroLlama_V1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Jacaranda/AfroLlama_V1, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 09-13 20:40:40 model_runner.py:720] Starting to load model Jacaranda/AfroLlama_V1...
INFO 09-13 20:40:40 weight_utils.py:225] Using model w

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Generated story 5002/10000
Generated story 5003/10000
Generated story 5004/10000
Generated story 5005/10000
Generated story 5006/10000
Generated story 5007/10000
Generated story 5008/10000
Generated story 5009/10000
Generated story 5010/10000
Generated story 5011/10000
Generated story 5012/10000
Generated story 5013/10000
Generated story 5014/10000
Generated story 5015/10000
Generated story 5016/10000
Generated story 5017/10000
Generated story 5018/10000
Generated story 5019/10000
Generated story 5020/10000
Generated story 5021/10000
Generated story 5022/10000
Generated story 5023/10000
Generated story 5024/10000
Generated story 5025/10000
Generated story 5026/10000
Generated story 5027/10000
Generated story 5028/10000
Generated story 5029/10000
Generated story 5030/10000
Generated story 5031/10000
Generated story 5032/10000
Generated story 5033/10000
Generated story 5034/10000
Generated story 5035/10000
Generated story 5

In [None]:
#@title Prompting Afro Llama to generate yoruba stories
# NOTE(review): this class is a line-for-line duplicate of the `JHModel` defined
# in the Zulu generation cell above; running this cell rebinds the same name.
# Consider keeping a single definition.
class JHModel:
    """Wrapper around the Jacaranda/AfroLlama_V1 checkpoint served through vLLM."""

    def __init__(self):
        self.eval_model = "Jacaranda/AfroLlama_V1"
        self.tokenizer = AutoTokenizer.from_pretrained(self.eval_model)
        self.ft_model = LLM(
            model=self.eval_model,
            tokenizer=self.eval_model,
            tensor_parallel_size=1)
        # Token ids at which generation stops: the tokenizer's EOS and <|eot_id|>.
        self.terminators = [
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        # Prompt templates for each chat role; `assistant_format` is not used
        # by the methods of this class.
        self.DEFAULT_SYSTEM_PROMPT = ""
        self.system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
        self.user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
        self.assistant_format = '{content}<|eot_id|>'

    def generate_prompt_new(self, instruction):
        """Return the full prompt: default system turn followed by the user turn."""
        return self.system_format.format(content=self.DEFAULT_SYSTEM_PROMPT) + self.user_format.format(content=instruction)

    def generate(self, instruction):
        """Sample one completion for ``instruction`` and return its text."""
        prompt = self.generate_prompt_new(instruction)
        generation_config = dict(temperature=0.7, top_k=40, top_p=0.9, max_tokens=512, presence_penalty=1.0,)
        generation_config["stop_token_ids"] = self.terminators
        generation_config["stop"] = ["<|eot_id|>", "<|end_of_text|>"]

        with torch.no_grad():
            output = self.ft_model.generate([prompt], SamplingParams(**generation_config), use_tqdm=False)
            # One prompt was submitted, so take its first completion.
            response = output[0].outputs[0].text
            return response

def generate_yoruba_stories(model, num_stories, output_file, prompts=None, log_every=1):
    """Generate stories with ``model`` and stream them to ``output_file``.

    Each story is written (and flushed) as soon as it is generated, in the
    format ``"Story <n>:\\n<text>\\n\\n"``. Because nothing is buffered until
    the end, an interruption part-way through a long run leaves every story
    produced so far on disk.

    Args:
        model: Object exposing ``generate(prompt) -> str``.
        num_stories: Number of stories to generate; prompts are cycled
            round-robin when this exceeds the number of prompts.
        output_file: Path of the UTF-8 text file to (over)write.
        prompts: Optional list of instructions. Defaults to the ten built-in
            Yoruba children's-story prompts.
        log_every: Print a progress line every ``log_every`` stories. The
            default of 1 prints one line per story; 0 or None disables the
            per-story lines.

    Raises:
        ValueError: If ``prompts`` is given but empty.
    """
    if prompts is None:
        prompts = [
            "Kọ itan awọn ọmọde ni Yoruba nipa Lily ati Max ti o gba ẹbun airotẹlẹ, o ni ipari ti o dara.",
            "Sọ itan awọn ọmọde ni Yoruba nibiti Emma nilo lati gafara fun ọrẹ rẹ Thabo, o ni opin irora.",
            "Kọ itan kukuru kan fun awọn ọmọde ni Yoruba nibiti Zandile ati Oliver pade eranko ti n sọrọ, o ni ipari ti o dara julọ.",
            "Sọ itan awọn ọmọde ni Yoruba nipa Nomsa ti o kọ ẹkọ pataki nipasẹ pinpin.",
            "Kọ itan awọn ọmọde kan ni Yoruba nibiti Jack ati Amahle ti wa ara wọn ni ibi ti o dara julọ, pẹlu igbimọ alarinrin.",
            "Sọ itan awọn ọmọde kan ni Yoruba nipa ọrẹ tuntun laarin Liam ati Zinhle.",
            "Kọ itan awọn ọmọde ni Yoruba nibiti Sipho tiraka lati ṣaṣeyọri ala rẹ.",
            "Sọ itan awọn ọmọde ni Yoruba nipa Ava ati Kwezi ti o wa ọna titun lati yanju iṣoro kan, pẹlu ojutu airotẹlẹ.",
            "Kọ itan kukuru kan fun awọn ọmọde ni Yoruba nibiti Neo ati Sarah ti kọ ẹkọ pataki idariji.",
            "Sọ itan awọn ọmọde ni Yoruba nipa Ethan ati Zama ti o kọ ẹkọ sũru lakoko ti o nduro fun ẹbun pataki."
        ]
    if not prompts:
        # An empty list would otherwise surface as a ZeroDivisionError in the modulo below.
        raise ValueError("prompts must contain at least one instruction")

    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(num_stories):
            prompt = prompts[i % len(prompts)]
            story = model.generate(prompt)
            f.write(f"Story {i + 1}:\n{story}\n\n")
            # Flush per story so progress survives a kernel interrupt or crash.
            f.flush()
            if log_every and (i + 1) % log_every == 0:
                print(f"Generated story {i+1}/{num_stories}")

    print(f"Generated {num_stories} stories and saved them to {output_file}")

def main():
    """Build the AfroLlama wrapper and generate 10000 stories into 'generated_yoruba_stories.txt'."""
    generate_yoruba_stories(
        JHModel(),
        10000,
        'generated_yoruba_stories.txt',
    )


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 09-24 22:30:57 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Jacaranda/AfroLlama_V1', speculative_config=None, tokenizer='Jacaranda/AfroLlama_V1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Jacaranda/AfroLlama_V1, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 09-24 22:30:58 model_runner.py:720] Starting to load model Jacaranda/AfroLlama_V1...
INFO 09-24 22:30:58 weight_utils.py:225] Using model w

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Generated story 5002/10000
Generated story 5003/10000
Generated story 5004/10000
Generated story 5005/10000
Generated story 5006/10000
Generated story 5007/10000
Generated story 5008/10000
Generated story 5009/10000
Generated story 5010/10000
Generated story 5011/10000
Generated story 5012/10000
Generated story 5013/10000
Generated story 5014/10000
Generated story 5015/10000
Generated story 5016/10000
Generated story 5017/10000
Generated story 5018/10000
Generated story 5019/10000
Generated story 5020/10000
Generated story 5021/10000
Generated story 5022/10000
Generated story 5023/10000
Generated story 5024/10000
Generated story 5025/10000
Generated story 5026/10000
Generated story 5027/10000
Generated story 5028/10000
Generated story 5029/10000
Generated story 5030/10000
Generated story 5031/10000
Generated story 5032/10000
Generated story 5033/10000
Generated story 5034/10000
Generated story 5035/10000
Generated story 5

In [None]:
import re

In [None]:
# NOTE(review): the "Cleaning and Splitting" cell below reads `gen_zul_stories`
# and `gen_yor_stories` as raw text. In the visible notebook these loads are the
# only place that assigns them, so on a fresh kernel they must be re-enabled
# (or the variables defined by an earlier cell) before that cell can run.

# with open('/content/generated_zulu_stories.txt', 'r', encoding='utf-8') as f:
#     gen_zul_stories = f.read()

# with open('/content/generated_yoruba_stories.txt', 'r', encoding='utf-8') as f:
#     gen_yor_stories = f.read()

In [None]:
#@title Cleaning and Splitting Prompted Afrollama Stories

def _flatten_stories(stories):
    """Join stories into one text blob.

    Runs of two or more newlines become a single space and every
    ``Story <n>:`` header is replaced by a newline.
    """
    text = '\n\n'.join(stories)
    text = re.sub(r"\n{2,}", " ", text)
    return re.sub(r"Story \d+:", "\n", text)


def _split_clean_and_save(raw_text, train_path, eval_path, train_fraction=0.9):
    """Split raw generated text into stories, shuffle, and write train/eval files.

    Args:
        raw_text: Full contents of a generated-stories file.
        train_path: Output path for the training text (each story gets a
            trailing ' <|endofstory|>' marker).
        eval_path: Output path for the held-out text.
        train_fraction: Share of the shuffled stories used for training.

    Returns:
        ``(stories, train_text, eval_text)`` where ``stories`` is the shuffled
        list and the two texts are the exact strings written to disk.
    """
    # NOTE(review): splitting on the literal 'Story ' also cuts inside any story
    # body that happens to contain that text - confirm this is acceptable.
    stories = ['Story ' + chunk.strip() for chunk in raw_text.split('Story ') if chunk.strip()]
    random.shuffle(stories)

    cut = int(train_fraction * len(stories))
    train_text = _flatten_stories([story + ' <|endofstory|>' for story in stories[:cut]])
    # NOTE(review): held-out stories get no ' <|endofstory|>' marker - confirm intended.
    eval_text = _flatten_stories(stories[cut:])

    with open(eval_path, 'w', encoding='utf-8') as f:
        f.write(eval_text)
    with open(train_path, 'w', encoding='utf-8') as f:
        f.write(train_text)

    return stories, train_text, eval_text


# Seed once; the Zulu shuffle runs first and the Yoruba shuffle second.
random.seed(42)

# gen_*_stories are rebound from raw text to the shuffled story list, so the raw
# text has to be reloaded before this cell can be run a second time.

#For Zulu dataset
gen_zul_stories, zulu_gen_stories, eval_ZuluPrompt = _split_clean_and_save(
    gen_zul_stories, 'zulu_gen_stories.txt', 'eval_ZuluPrompt.txt'
)

# Lengths are character counts of the flattened texts, not story counts.
print(f"Zulu stories Length: {len(zulu_gen_stories)}")
print(f"Zulu evaluation Length: {len(eval_ZuluPrompt)}")


#For Yoruba dataset
gen_yor_stories, yoruba_gen_stories, eval_YorubaPrompt = _split_clean_and_save(
    gen_yor_stories, 'yoruba_gen_stories.txt', 'eval_YorubaPrompt.txt'
)

print(f"Yoruba stories Length: {len(yoruba_gen_stories)}")
print(f"Yoruba evaluation Length: {len(eval_YorubaPrompt)}")

Zulu stories Length: {7630559}
Zulu evalution Length: {838102}
Yoruba stories Length: {6322300}
Yoruba evalution Length: {684750}


In [None]:
!cd /content/nanoGPT/data/zulu_prompted && python prepare.py

train has 1,035,290 tokens
val has 259,003 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [  204   204 20378 12512    17 22986  8352   662   267    18]
First 10 tokens in validation data: [ 390   84 1204 4066   19  851 1175   17 1413  368]


In [None]:
!cd /content/nanoGPT && python train.py config/train_zulu_prompted.py

Overriding config with config/train_zulu_prompted.py:
out_dir = 'out-zulu-prompted'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'zulu prompted'

dataset = 'zulu_prompted'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

always_save_checkpoint = True # if True

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-zulu-prompted --ckpoint='ckpt_iter_5000.pt' >>sample_ZuluPrompt.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


In [None]:
!cd /content/nanoGPT/data/yoruba_prompted && python prepare.py

train has 1,208,521 tokens
val has 302,424 tokens
Training data saved to train.bin
Validation data saved to val.bin
First 10 tokens in training data: [  289     6    48   136  1583   274  7223 34620  1423  7877]
First 10 tokens in validation data: [ 1063   227  2604   397  4719  1731   919    17   385 35014]


In [None]:
!cd /content/nanoGPT && python train.py config/train_yoruba_prompted.py

Overriding config with config/train_yoruba_prompted.py:
out_dir = 'out-yoruba-prompted'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = True # override via command line if you like
wandb_project = 'AIMS-THESIS'
wandb_run_name = 'yoruba prompted'

dataset = 'yoruba_prompted'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

always_save_checkpoint = True #

In [None]:
!cd /content/nanoGPT && python sample.py --out_dir=out-yoruba-prompted --ckpoint='ckpt_iter_5000.pt' >>sample_YorubaPrompt.txt

  checkpoint = torch.load(ckpt_path, map_location=device)


In [None]:
#@title Pre-process Generated Stories
!cd /content/nanoGPT/sampledata/ && python sample_prepare.py

Zulumax.txt sample stories has 118,240 tokens
Zulumax.txt sample stories data saved to sample_ZuluMax.bin

Yorubamax.txt sample stories has 196,214 tokens
Yorubamax.txt sample stories data saved to sample_YorubaMax.bin

Zuluprompt.txt sample stories has 146,244 tokens
Zuluprompt.txt sample stories data saved to sample_ZuluPrompt.bin

Yorubaprompt.txt sample stories has 161,300 tokens
Yorubaprompt.txt sample stories data saved to sample_YorubaPrompt.bin

Zulumini.txt sample stories has 158,894 tokens
Zulumini.txt sample stories data saved to sample_ZuluMini.bin

Yorubamini.txt sample stories has 202,748 tokens
Yorubamini.txt sample stories data saved to sample_YorubaMini.bin



In [None]:
#@title Pre-process Evaluation Set
!cd /content/nanoGPT/evaluationdata/ && python eval_prepare.py

Zulumax.txt evaluation stories has 3,928,142 tokens
Zulumax.txt evaluation stories data saved to eval_ZuluMax.bin

Yorubamax.txt evaluation stories has 5,658,137 tokens
Yorubamax.txt evaluation stories data saved to eval_YorubaMax.bin

Zuluprompt.txt evaluation stories has 141,535 tokens
Zuluprompt.txt evaluation stories data saved to eval_ZuluPrompt.bin

Yorubaprompt.txt evaluation stories has 164,395 tokens
Yorubaprompt.txt evaluation stories data saved to eval_YorubaPrompt.bin

Zulumini.txt evaluation stories has 161,736 tokens
Zulumini.txt evaluation stories data saved to eval_ZuluMini.bin

Yorubamini.txt evaluation stories has 235,508 tokens
Yorubamini.txt evaluation stories data saved to eval_YorubaMini.bin



**Evaluation Metrics for Samples and Evaluation Sets**

In [None]:
#@title Perplexity for Generated Samples
!cd /content/nanoGPT/sampledata && python perplexity.py

Loading model from /content/nanoGPT/out-zulu/ckpt_iter_5000.pt
  checkpoint = torch.load(ckpt_path, map_location=device)
number of parameters: 30.59M
Perplexity for sample ZuluMax: 22.35295298557113

Loading model from /content/nanoGPT/out-yoruba/ckpt_iter_5000.pt
number of parameters: 29.20M
Perplexity for sample YorubaMax: 5.418219873736509

Loading model from /content/nanoGPT/out-zulu-prompted/ckpt_iter_5000.pt
number of parameters: 30.59M
Perplexity for sample ZuluPrompt: 9.87113996748309

Loading model from /content/nanoGPT/out-yoruba-prompted/ckpt_iter_5000.pt
number of parameters: 29.20M
Perplexity for sample YorubaPrompt: 12.392298081384514

Loading model from /content/nanoGPT/out-zulu10k/ckpt_iter_5000.pt
number of parameters: 30.59M
Perplexity for sample ZuluMini: 18.21545331537137

Loading model from /content/nanoGPT/out-yoruba10k/ckpt_iter_5000.pt
number of parameters: 29.20M
Perplexity for sample YorubaMini: 5.294009050040105



In [None]:
#@title Perplexity for Evaluation Set
!cd /content/nanoGPT/evaluationdata && python perplexity.py

Loading model from /content/nanoGPT/out-zulu/ckpt_iter_5000.pt
  checkpoint = torch.load(ckpt_path, map_location=device)
number of parameters: 30.59M
Perplexity for isiZulu_250k on shared evaluation set: 16.17108038977846

Loading model from /content/nanoGPT/out-zulu10k/ckpt_iter_5000.pt
number of parameters: 30.59M
Perplexity for isiZulu_10k on shared evaluation set: 303.02187496388876

Loading model from /content/nanoGPT/out-zulu-prompted/ckpt_iter_5000.pt
number of parameters: 30.59M
Perplexity for isiZulu_AfroLLaMA on shared evaluation set: 114520.70701548738

Loading model from /content/nanoGPT/out-yoruba/ckpt_iter_5000.pt
number of parameters: 29.20M
Perplexity for Yoruba_250k on shared evaluation set: 7.493713900193406

Loading model from /content/nanoGPT/out-yoruba10k/ckpt_iter_5000.pt
number of parameters: 29.20M
Perplexity for Yoruba_10k on shared evaluation set: 14.474591268985938

Loading model from /content/nanoGPT/out-yoruba-prompted/ckpt_iter_5000.pt
number of parameters

In [None]:
#@title Diversity Scores for Generated Samples
!cd /content/nanoGPT/sampledata && python text_diversity.py

2024-10-23 01:07:15.265467: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-23 01:07:15.316057: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-23 01:07:15.316106: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-23 01:07:15.317542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-23 01:07:15.325085: I tensorflow/core/platform/cpu_feature_guar

In [None]:
#@title Diversity Scores for Evaluation Set
!cd /content/nanoGPT/evaluationdata && python text_diversity.py

2024-10-23 01:35:53.704662: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-23 01:35:53.756348: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-23 01:35:53.756398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-23 01:35:53.757975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-23 01:35:53.765768: I tensorflow/core/platform/cpu_feature_guar

## Back Translation of Stories to English

In [None]:
def process_texts(texts, src_lang):
    """Tokenize ``texts`` with the file-level ``tokenizer`` and move the batch to ``device``.

    The batch is padded, truncated with ``max_length=1024`` and returned as
    PyTorch tensors.
    """
    encoded_batch = tokenizer(
        text=texts,
        src_lang=src_lang,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    )
    return encoded_batch.to(device)

def translate_texts(inputs, tgt_lang):
    """Translate an encoded batch with the file-level ``model`` using 4-beam search.

    Args:
        inputs: Encoded batch exposing ``input_ids`` and ``attention_mask``.
        tgt_lang: Target language code passed through to ``model.generate``.

    Returns:
        ``(translated_texts, generated_outputs)``: the decoded strings (special
        tokens skipped) and the raw output of ``model.generate``.
    """
    source_ids = inputs.input_ids
    generation_kwargs = dict(
        attention_mask=inputs.attention_mask,
        num_beams=4,
        early_stopping=True,
        tgt_lang=tgt_lang,
    )
    generated_outputs = model.generate(source_ids, **generation_kwargs)

    translated_texts = []
    for sequence in generated_outputs:
        translated_texts.append(tokenizer.decode(sequence, skip_special_tokens=True))
    return translated_texts, generated_outputs

def process_in_batches(texts, src_lang, tgt_lang, batch_size=16):
    """Translate ``texts`` batch by batch, keeping outputs aligned with inputs.

    Args:
        texts: Sequence of source-language strings.
        src_lang: Source language code handed to ``process_texts``.
        tgt_lang: Target language code handed to ``translate_texts``.
        batch_size: Number of texts translated per call.

    Returns:
        A list of translated strings. When translating a batch raises, the
        error is printed and one empty string per input of that batch is
        inserted instead, so ``results[k]`` always corresponds to ``texts[k]``.
    """
    results = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Process & Translate Texts"):
        batch_texts = texts[i:i + batch_size]

        inputs = process_texts(batch_texts, src_lang=src_lang)

        try:
            batch_outputs, _ = translate_texts(inputs, tgt_lang)
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {e}")
            # Placeholders keep later translations lined up with their sources;
            # silently dropping the batch would shift every following index.
            batch_outputs = [""] * len(batch_texts)

        results.extend(batch_outputs)

        # Release cached GPU memory between batches.
        torch.cuda.empty_cache()

    return results

def load_stories_from_file(file_path, separator='---------------'):
    """Read a UTF-8 text file and split it into stories on ``separator``.

    Chunks that contain only whitespace are dropped, so a file that ends with
    the separator (or is empty) does not yield a blank trailing "story".
    Remaining chunks are returned unmodified, including surrounding newlines.

    Args:
        file_path: Path of the file to read.
        separator: Delimiter string between stories.

    Returns:
        List of non-blank story strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        chunks = f.read().split(separator)
    return [chunk for chunk in chunks if chunk.strip()]

def save_translated_stories(translated_stories, output_file):
    """Write translations to ``output_file`` as numbered, separator-delimited entries.

    Each entry is a ``Translated Story <n>:`` header (numbered from 1), the
    story text, and a line of fifteen dashes.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        for number, text in enumerate(translated_stories, start=1):
            header_and_body = f"Translated Story {number}:\n{text}\n"
            sink.write(header_and_body)
            sink.write("---------------\n")


In [None]:
# Load the ZuluMax samples, translate them ('zul' -> 'eng') in batches of 16,
# and save the translations to ZMax_trans.txt in the working directory.
ZMax = load_stories_from_file('/content/nanoGPT/sampledata/sample_ZuluMax.txt')
ZMax_trans = process_in_batches(ZMax, src_lang='zul', tgt_lang='eng', batch_size=16)
save_translated_stories(ZMax_trans, 'ZMax_trans.txt')

Process & Translate Texts:   0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Display the first four loaded ZuluMax chunks as a sanity check.
ZMax[0:4]

['Kwakunjalo". Ikati lathi, "Kulungile, ngizoqapha. Ngiyabonga ngokungisindisa". Kusukela ngalolo suku kuqhubeke, ikati alizange liphinde livumele lihlakazeke ehlathini. \n',
 '\nKwake kwaba khona intombazanyana egama layo linguLily. Wayethanda ukudlala ngaphandle elangeni. Ngolunye usuku, wabona inyoni encane enephiko elilimele. Le nyoni yayingakwazi ukundiza. ULily wabuza inyoni, "Kungani udabukile, nyoni encane?" Inyoni yathi, "Ngilahlekelwe inyoni encane". ULily wazizwa edabukile ngenyoni encane futhi wayefuna ukusiza le nyoni. "Lily wathi, ""Ungakhathazeki, inyoni encane.""" Ngizokusiza ukuba uthole inyoni encane". Bafuna futhi bafuna, kodwa abakwazanga ukuthola inyoni. ULily wayedabukile futhi engazi ukuthi enzeni. Ngokungazelelwe, kwaqhamuka umoya omkhulu futhi washaya inyoni. Le nyoni encane yayilimele futhi yayingasakwazi ukundiza. ULily wafunda ukuthi kubalulekile ukulalela iseluleko sikamama futhi uqaphele lapho useduze. \n',
 '\nKwake kwaba khona intombazanyana egama layo l

In [None]:
# Load the ZuluMini samples, translate them ('zul' -> 'eng') in batches of 16,
# and save the translations to ZMin_trans.txt in the working directory.
ZMin = load_stories_from_file('/content/nanoGPT/sampledata/sample_ZuluMini.txt')
ZMin_trans = process_in_batches(ZMin, src_lang='zul', tgt_lang='eng', batch_size=16)
save_translated_stories(ZMin_trans, 'ZMin_trans.txt')

Process & Translate Texts:   0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Load the ZuluPrompt samples, translate them ('zul' -> 'eng') in batches of 16,
# and save the translations to ZPrompt_trans.txt in the working directory.
ZPrompt = load_stories_from_file('/content/nanoGPT/sampledata/sample_ZuluPrompt.txt')
ZPrompt_trans = process_in_batches(ZPrompt, src_lang='zul', tgt_lang='eng', batch_size=16)
save_translated_stories(ZPrompt_trans, 'ZPrompt_trans.txt')

Process & Translate Texts:   0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Load the YorubaMax samples. The translate/save steps are currently disabled,
# so this cell does not produce `YMax_trans` or YMax_trans.txt.
# NOTE(review): re-enable the two lines below if the YorubaMax back-translation is needed.
YMax = load_stories_from_file('/content/nanoGPT/sampledata/sample_YorubaMax.txt')
# YMax_trans = process_in_batches(YMax, src_lang='yor', tgt_lang='eng', batch_size=16)
# save_translated_stories(YMax_trans, 'YMax_trans.txt')

In [None]:
# Load the YorubaMini samples, translate them ('yor' -> 'eng') in batches of 16,
# and save the translations to YMin_trans.txt in the working directory.
YMin = load_stories_from_file('/content/nanoGPT/sampledata/sample_YorubaMini.txt')
YMin_trans = process_in_batches(YMin, src_lang='yor', tgt_lang='eng', batch_size=16)
save_translated_stories(YMin_trans, 'YMin_trans.txt')

Process & Translate Texts:   0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Load the YorubaPrompt samples, translate them ('yor' -> 'eng') in batches of 16,
# and save the translations to YPrompt_trans.txt in the working directory.
YPrompt = load_stories_from_file('/content/nanoGPT/sampledata/sample_YorubaPrompt.txt')
YPrompt_trans = process_in_batches(YPrompt, src_lang='yor', tgt_lang='eng', batch_size=16)
save_translated_stories(YPrompt_trans, 'YPrompt_trans.txt')

Process & Translate Texts:   0%|          | 0/63 [00:00<?, ?it/s]