In [1]:
import ast

from datasets import load_dataset
from tqdm import tqdm
from transformers import LlamaTokenizerFast, BertTokenizerFast, AutoTokenizer

# Load the fast Llama tokenizer by its Hub repository id. Later cells pass
# `tokenizer_llama` to `calculate_corpus_fertility`.
tokenizer_llama = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
# Alternative tokenizers, currently disabled:
# tokenizer_bloom = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
# tokenizer_Bert = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [3]:

def _parse_choices(raw_choices):
    """
    Convert the ``choices`` field of one example into a list of strings.

    A string value is expected to be the textual form of a Python list
    (for example ``"['a', 'b']"``). It is parsed with ``ast.literal_eval``,
    which only accepts literals, so text coming from the dataset is never
    executed as code. A value that is already a list or tuple is used as-is.

    Args:
        raw_choices (str | list | tuple): The raw ``choices`` value.

    Returns:
        list[str]: The individual choices, each converted with ``str``.

    Raises:
        ValueError: If the value cannot be parsed into a list or tuple.
    """
    if isinstance(raw_choices, str):
        try:
            parsed = ast.literal_eval(raw_choices)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise ValueError(f"Could not parse choices field: {raw_choices!r}") from exc
    else:
        parsed = raw_choices

    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"Choices field is not a list: {raw_choices!r}")

    return [str(choice) for choice in parsed]


def calculate_corpus_fertility(dataset_name, language_code, tokenizer, split='dev', max_docs=10000):
    """
    Calculate fertility for a dataset and tokenizer.

    Fertility is computed over the whole evaluated sample as the total number
    of tokens produced by ``tokenizer.tokenize`` divided by the total number
    of whitespace-separated words. Each example contributes the text
    ``"<question> <choices joined by spaces> <answer>"``.

    Args:
        dataset_name (str): The dataset to load.
        language_code (str): The specific language code (dataset configuration).
        tokenizer: An object exposing ``tokenize(text)`` that returns a
            sequence of tokens.
        split (str): The split of the dataset to evaluate (default: "dev").
        max_docs (int | None): Maximum number of examples to evaluate
            (default: 10,000). ``None`` evaluates every example.

    Returns:
        float: The fertility score for the tokenizer on the dataset.

    Raises:
        ValueError: If the evaluated examples contain no words, or if a
            ``choices`` field cannot be parsed as a list.
    """
    # Only the dataset is loaded here; the tokenizer is supplied by the caller.
    dataset = load_dataset(dataset_name, language_code, split=split)

    total_tokens = 0
    total_words = 0

    # Process up to max_docs examples (all of them when max_docs is None).
    for i, example in enumerate(tqdm(dataset, desc=f"Calculating fertility for {language_code}")):
        if max_docs is not None and i >= max_docs:
            break

        # Combine all text fields into one string.
        question = example["question"]
        choices = " ".join(_parse_choices(example["choices"]))
        answer = example["answer"]
        combined_text = f"{question} {choices} {answer}"

        # Tokens as produced by the tokenizer under test.
        total_tokens += len(tokenizer.tokenize(combined_text))

        # Words are counted by plain whitespace splitting.
        total_words += len(combined_text.split())

    # Guard against division by zero (empty split or max_docs == 0).
    if total_words == 0:
        raise ValueError("Total words in the dataset is zero. Cannot compute fertility.")

    return total_tokens / total_words

# Example usage
# Single-language check: fertility of the Llama tokenizer on one split.
# `dataset_name` and `max_docs` are also read by the per-language loop below.
dataset_name = "masakhane/afrimmlu"
language_code = "eng"  # English configuration of the dataset
split = "dev"  # which split of the dataset to evaluate
max_docs = 10000  # upper bound on the number of examples processed

fertility_score = calculate_corpus_fertility(
    dataset_name=dataset_name,
    language_code=language_code,
    tokenizer=tokenizer_llama,
    split=split,
    max_docs=max_docs,
)
print(fertility_score)

README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

data/eng/val.tsv:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

data/eng/dev.tsv:   0%|          | 0.00/7.99k [00:00<?, ?B/s]

data/eng/test.tsv:   0%|          | 0.00/136k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for eng: 100%|██████████| 25/25 [00:00<00:00, 438.16it/s]

1.4142628205128205





In [6]:
# Dataset configuration code -> human-readable language name.
languages = {
    "amh": "Amharic",
    "eng": "English",
    "ewe": "Ewe",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kin": "Kinyarwanda",
    "lin": "Lingala",
    "lug": "Luganda",
    "orm": "Oromo",
    "sna": "Shona",
    "sot": "Southern Sotho",
    "swa": "Swahili",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "zul": "Zulu",
}

# Fertility per language code on the "test" split.
# NOTE(review): despite the name, these scores are computed with
# `tokenizer_llama`; the name is kept in case other cells read it.
# Confirm which tokenizer was intended.
results_Bloom_Tokenizer = {}
for lang_code, lang_name in languages.items():
    print(f"Processing language: {lang_name} ({lang_code})")
    try:
        fertility_score = calculate_corpus_fertility(
            dataset_name=dataset_name,
            language_code=lang_code,
            tokenizer=tokenizer_llama,
            split='test',
            max_docs=max_docs,
        )
    except Exception as e:
        # Best effort: report the failure and move on to the next language.
        print(f"Failed for {lang_name} ({lang_code}): {e}")
    else:
        results_Bloom_Tokenizer[lang_code] = fertility_score
        print(f"Fertility Score for {lang_name}: {fertility_score}")

# Summary of every language that was scored successfully.
print("\nFertility Scores by Language:")
for lang_code, score in results_Bloom_Tokenizer.items():
    print(f"{languages[lang_code]} ({lang_code}): {score}")

Processing language: Amharic (amh)


data/amh/val.tsv:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

data/amh/dev.tsv:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

data/amh/test.tsv:   0%|          | 0.00/211k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for amh: 100%|██████████| 500/500 [00:01<00:00, 334.71it/s]


Fertility Score for Amharic: 11.52529601722282
Processing language: English (eng)


Calculating fertility for eng: 100%|██████████| 500/500 [00:00<00:00, 957.12it/s]


Fertility Score for English: 1.475627916211655
Processing language: Ewe (ewe)


data/ewe/val.tsv:   0%|          | 0.00/23.7k [00:00<?, ?B/s]

data/ewe/dev.tsv:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

data/ewe/test.tsv:   0%|          | 0.00/150k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for ewe: 100%|██████████| 500/500 [00:01<00:00, 413.71it/s]


Fertility Score for Ewe: 2.970092121314096
Processing language: French (fra)


data/fra/val.tsv:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

data/fra/dev.tsv:   0%|          | 0.00/9.45k [00:00<?, ?B/s]

data/fra/test.tsv:   0%|          | 0.00/166k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for fra: 100%|██████████| 500/500 [00:00<00:00, 892.83it/s]


Fertility Score for French: 1.8337786826144125
Processing language: Hausa (hau)


data/hau/val.tsv:   0%|          | 0.00/24.6k [00:00<?, ?B/s]

data/hau/dev.tsv:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

data/hau/test.tsv:   0%|          | 0.00/145k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for hau: 100%|██████████| 500/500 [00:00<00:00, 608.05it/s]


Fertility Score for Hausa: 2.550557802569003
Processing language: Igbo (ibo)


data/ibo/val.tsv:   0%|          | 0.00/27.1k [00:00<?, ?B/s]

data/ibo/dev.tsv:   0%|          | 0.00/9.21k [00:00<?, ?B/s]

data/ibo/test.tsv:   0%|          | 0.00/153k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for ibo: 100%|██████████| 500/500 [00:00<00:00, 1084.05it/s]


Fertility Score for Igbo: 2.965458006406983
Processing language: Kinyarwanda (kin)


data/kin/val.tsv:   0%|          | 0.00/25.3k [00:00<?, ?B/s]

data/kin/dev.tsv:   0%|          | 0.00/8.93k [00:00<?, ?B/s]

data/kin/test.tsv:   0%|          | 0.00/152k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for kin: 100%|██████████| 500/500 [00:00<00:00, 657.52it/s]


Fertility Score for Kinyarwanda: 3.2254943652987453
Processing language: Lingala (lin)


data/lin/val.tsv:   0%|          | 0.00/24.9k [00:00<?, ?B/s]

data/lin/dev.tsv:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

data/lin/test.tsv:   0%|          | 0.00/150k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for lin: 100%|██████████| 500/500 [00:00<00:00, 778.55it/s]


Fertility Score for Lingala: 2.0521419207510467
Processing language: Luganda (lug)


data/lug/val.tsv:   0%|          | 0.00/31.3k [00:00<?, ?B/s]

data/lug/dev.tsv:   0%|          | 0.00/9.97k [00:00<?, ?B/s]

data/lug/test.tsv:   0%|          | 0.00/178k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for lug: 100%|██████████| 500/500 [00:00<00:00, 1195.74it/s]


Fertility Score for Luganda: 3.521403576051004
Processing language: Oromo (orm)


data/orm/val.tsv:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

data/orm/dev.tsv:   0%|          | 0.00/9.22k [00:00<?, ?B/s]

data/orm/test.tsv:   0%|          | 0.00/155k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for orm: 100%|██████████| 500/500 [00:00<00:00, 1624.13it/s]


Fertility Score for Oromo: 3.381484265922028
Processing language: Shona (sna)


data/sna/val.tsv:   0%|          | 0.00/26.2k [00:00<?, ?B/s]

data/sna/dev.tsv:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

data/sna/test.tsv:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for sna: 100%|██████████| 500/500 [00:00<00:00, 1202.59it/s]


Fertility Score for Shona: 3.419693445458341
Processing language: Southern Sotho (sot)


data/sot/val.tsv:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

data/sot/dev.tsv:   0%|          | 0.00/8.35k [00:00<?, ?B/s]

data/sot/test.tsv:   0%|          | 0.00/145k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/75 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for sot: 100%|██████████| 500/500 [00:00<00:00, 1269.86it/s]


Fertility Score for Southern Sotho: 2.2515918230563003
Processing language: Swahili (swa)


data/swa/val.tsv:   0%|          | 0.00/25.3k [00:00<?, ?B/s]

data/swa/dev.tsv:   0%|          | 0.00/8.56k [00:00<?, ?B/s]

data/swa/test.tsv:   0%|          | 0.00/143k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for swa: 100%|██████████| 500/500 [00:00<00:00, 929.14it/s]


Fertility Score for Swahili: 2.5851301823028225
Processing language: Twi (twi)


data/twi/val.tsv:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

data/twi/dev.tsv:   0%|          | 0.00/8.98k [00:00<?, ?B/s]

data/twi/test.tsv:   0%|          | 0.00/144k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for twi: 100%|██████████| 500/500 [00:00<00:00, 978.01it/s]


Fertility Score for Twi: 2.542355665705115
Processing language: Wolof (wol)


data/wol/val.tsv:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

data/wol/dev.tsv:   0%|          | 0.00/8.76k [00:00<?, ?B/s]

data/wol/test.tsv:   0%|          | 0.00/160k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for wol: 100%|██████████| 500/500 [00:00<00:00, 1123.31it/s]


Fertility Score for Wolof: 2.289836977847537
Processing language: Xhosa (xho)


data/xho/val.tsv:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

data/xho/dev.tsv:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

data/xho/test.tsv:   0%|          | 0.00/147k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for xho: 100%|██████████| 500/500 [00:00<00:00, 1329.91it/s]


Fertility Score for Xhosa: 3.9811791689152725
Processing language: Yoruba (yor)


data/yor/val.tsv:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

data/yor/dev.tsv:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

data/yor/test.tsv:   0%|          | 0.00/182k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for yor: 100%|██████████| 500/500 [00:00<00:00, 1130.99it/s]


Fertility Score for Yoruba: 3.638583532478818
Processing language: Zulu (zul)


data/zul/val.tsv:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

data/zul/dev.tsv:   0%|          | 0.00/8.28k [00:00<?, ?B/s]

data/zul/test.tsv:   0%|          | 0.00/147k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/83 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Calculating fertility for zul: 100%|██████████| 500/500 [00:00<00:00, 1179.66it/s]

Fertility Score for Zulu: 3.8692230743581195

Fertility Scores by Language:
Amharic (amh): 11.52529601722282
English (eng): 1.475627916211655
Ewe (ewe): 2.970092121314096
French (fra): 1.8337786826144125
Hausa (hau): 2.550557802569003
Igbo (ibo): 2.965458006406983
Kinyarwanda (kin): 3.2254943652987453
Lingala (lin): 2.0521419207510467
Luganda (lug): 3.521403576051004
Oromo (orm): 3.381484265922028
Shona (sna): 3.419693445458341
Southern Sotho (sot): 2.2515918230563003
Swahili (swa): 2.5851301823028225
Twi (twi): 2.542355665705115
Wolof (wol): 2.289836977847537
Xhosa (xho): 3.9811791689152725
Yoruba (yor): 3.638583532478818
Zulu (zul): 3.8692230743581195



