In [2]:
import ast
import os

import pandas as pd
from anthropic import Anthropic
from datasets import load_dataset
from dotenv import load_dotenv
from tqdm import tqdm

# Run python-dotenv first so the API key lookup below can see variables it loads.
load_dotenv()
# The key is read from the environment rather than hard-coded in the notebook.
client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'))

## Fertility Section

#### **Tokenization and Word Counting**
- **Token Count**: The number of tokens generated by the tokenizer for the combined text.
- **Word Count**: The number of words in the combined text, determined by splitting the text using whitespace.

By comparing token counts to word counts, the function derives a fertility score for each subject, helping to analyze how efficiently the tokenizer processes different types of text data.

- Fertility measures the number of tokens generated per word ($\text{fertility} = \text{token count} / \text{word count}$), providing insights into the tokenization efficiency.
- This function helps in understanding the tokenizer's behavior across different subjects and languages.

### Why Tokenize by Subject?
In this analysis, we tokenize on the **subject** to:
- **Maximize Data Points**: By calculating fertility for each subject individually, we can gather more granular data points across various subjects. This allows for a detailed understanding of how the tokenizer behaves with different types of content.
- **Facilitate Visualization**: Tokenizing by subject enables us to create more insightful visualizations. We can compare fertility scores across subjects, highlighting differences in how efficiently the tokenizer processes specific topics or content areas.



These fertility scores have been generated using the Anthropic tokenizer (via the token counting API).

Tokenizer by language

In [3]:
def calculate_corpus_fertility_anthropic(dataset_name, language_code, client, split='dev', max_docs=10000,
                                         model_name="claude-3-sonnet-20240229"):
    """
    Calculate corpus-level fertility for a dataset using Anthropic's API for token counting.

    Fertility is the total number of tokens divided by the total number of
    whitespace-separated words, accumulated over the processed records.

    Args:
        dataset_name (str): The dataset to load.
        language_code (str): The dataset configuration (language) to load.
        client: Anthropic client instance; must expose `messages.count_tokens`.
        split (str): The split of the dataset to evaluate (default: "dev").
        max_docs (int): Maximum number of records to process (default: 10,000).
        model_name (str): Model whose tokenizer is used for counting
            (default: "claude-3-sonnet-20240229").

    Returns:
        float: total tokens / total words over the processed records.

    Raises:
        ValueError: If the processed records contain no words, or if a
            string-valued `choices` field is not a plain Python literal.
    """
    # Load dataset
    dataset = load_dataset(dataset_name, language_code, split=split)

    total_tokens = 0
    total_words = 0

    # Process up to max_docs
    for i, example in enumerate(tqdm(dataset, desc=f"Calculating fertility for {language_code}")):
        if i >= max_docs:
            break

        # `choices` is parsed with ast.literal_eval instead of eval(): the text comes
        # from an external dataset and must never be executed as code. A value that
        # is already a list is used as-is.
        raw_choices = example["choices"]
        choice_list = ast.literal_eval(raw_choices) if isinstance(raw_choices, str) else raw_choices
        choices = " ".join(str(choice) for choice in choice_list)

        # Combine all text fields
        question = example["question"]
        answer = example["answer"]
        combined_text = f"{question} {choices} {answer}"

        # Count tokens using Anthropic API
        # NOTE(review): input_tokens is the count for the whole request, so it may
        # include per-message overhead beyond the raw text — confirm against the
        # token counting documentation.
        token_count_obj = client.messages.count_tokens(
            messages=[{"role": "user", "content": combined_text}],
            model=model_name
        )
        total_tokens += token_count_obj.input_tokens

        # Count words using whitespace splitting
        total_words += len(combined_text.split())

    # Compute fertility
    if total_words == 0:
        raise ValueError("Total words in the dataset is zero. Cannot compute fertility.")

    return total_tokens / total_words

# Example usage: corpus-level fertility for the English configuration, "dev" split.
dataset_name, language_code, split = "masakhane/afrimmlu", "eng", "dev"

fertility_score = calculate_corpus_fertility_anthropic(dataset_name, language_code, client, split=split)
print(fertility_score)

Calculating fertility for eng: 100%|██████████| 25/25 [00:04<00:00,  6.17it/s]

1.4647435897435896





In [34]:
from collections import defaultdict
from datasets import load_dataset
from tqdm import tqdm

def calculate_fertility_by_subject_anthropic(
    dataset_name,
    language_code,
    client,
    split='dev',
    max_docs=10000,
    model_name="claude-3-sonnet-20240229"
):
    """
    Calculate fertility for each subject in the dataset using Anthropic's token counting API.
    Each record's fertility (tokens / whitespace words) is calculated, then averaged by subject.

    Args:
        dataset_name (str): The dataset to load.
        language_code (str): The specific language code for the dataset.
        client (anthropic.Client): The Anthropic client instance to use for token counting.
        split (str): The split of the dataset to evaluate (default: "dev").
        max_docs (int): Maximum number of documents to evaluate (default: 10,000).
        model_name (str): The Anthropic model to use for token counting (default: "claude-3-sonnet-20240229").

    Returns:
        dict: A dictionary where keys are subjects and values are mean fertility scores for that subject.
            Records without a "subject" field are grouped under "unknown"; records whose combined
            text has no words are skipped.

    Raises:
        ValueError: If a string-valued `choices` field is not a plain Python literal.
    """
    # Load dataset
    dataset = load_dataset(dataset_name, language_code, split=split)

    # Per-record fertility scores, grouped by subject
    subject_fertility = defaultdict(list)

    # Process up to max_docs
    for i, example in enumerate(tqdm(dataset, desc=f"Calculating fertility for {language_code}")):
        if i >= max_docs:
            break

        # `choices` is parsed with ast.literal_eval instead of eval(): the text comes
        # from an external dataset and must never be executed as code. A value that
        # is already a list is used as-is.
        raw_choices = example["choices"]
        choice_list = ast.literal_eval(raw_choices) if isinstance(raw_choices, str) else raw_choices
        choices = " ".join(str(choice) for choice in choice_list)

        # Combine all text fields
        combined_text = f"{example['question']} {choices} {example['answer']}"

        # Get the subject
        subject = example.get("subject", "unknown")  # Default to "unknown" if no subject is provided

        # Count tokens using Anthropic API
        # NOTE(review): input_tokens is the count for the whole request, so it may
        # include per-message overhead beyond the raw text — confirm against the
        # token counting documentation.
        token_count_obj = client.messages.count_tokens(
            messages=[{"role": "user", "content": combined_text}],
            model=model_name
        )

        # Count words using whitespace splitting
        word_count = len(combined_text.split())

        # Calculate fertility for this record (skip records with no words)
        if word_count > 0:
            subject_fertility[subject].append(token_count_obj.input_tokens / word_count)

    # Every list holds at least one score (a subject only appears once a score is
    # appended), so the mean is always defined.
    return {
        subject: sum(scores) / len(scores)
        for subject, scores in subject_fertility.items()
    }

# Example usage: per-subject mean fertility for one language.
if __name__ == "__main__":
    # An Anthropic client must already be bound to the name `client`, e.g.:
    #   from anthropic import Client
    #   client = Client(api_key="YOUR_ANTHROPIC_API_KEY")

    # Amharic configuration, evaluated on the "test" split.
    dataset_name, language_code, split = "masakhane/afrimmlu", "amh", "test"

    # Token counts come from the Anthropic API; the result maps subject -> mean fertility.
    fertility_scores = calculate_fertility_by_subject_anthropic(
        dataset_name, language_code, client, split=split
    )

    # Report one line per subject.
    for subject, fertility in fertility_scores.items():
        print(f"Subject: {subject}, Mean Fertility: {fertility}")

Calculating fertility for amh: 100%|██████████| 500/500 [04:00<00:00,  2.08it/s]

Subject: elementary_mathematics, Mean Fertility: 4.655221115622704
Subject: high_school_geography, Mean Fertility: 6.456467084200093
Subject: international_law, Mean Fertility: 6.204829715904315
Subject: global_facts, Mean Fertility: 5.613975072668583
Subject: high_school_microeconomics, Mean Fertility: 6.504111127447751





In [35]:
import csv

# Dataset language code -> human-readable language name, in processing order.
languages = dict(
    amh="Amharic",
    eng="English",
    ewe="Ewe",
    fra="French",
    hau="Hausa",
    ibo="Igbo",
    kin="Kinyarwanda",
    lin="Lingala",
    lug="Luganda",
    orm="Oromo",
    sna="Shona",
    sot="Southern Sotho",
    swa="Swahili",
    twi="Twi",
    wol="Wolof",
    xho="Xhosa",
    yor="Yoruba",
    zul="Zulu",
)

# Filled in per language code with that language's {subject: mean fertility} mapping.
results_anthropic_tokenizer = dict()

# Requires an Anthropic client bound to `client`, for example:
#   from anthropic import Client
#   client = Client(api_key="YOUR_ANTHROPIC_API_KEY")

# Compute per-subject fertility for every configured language. A failure for one
# language is reported and the loop moves on, so the remaining languages still run.
for lang_code, lang_name in languages.items():
    try:
        print(f"Processing language: {lang_name} ({lang_code})")

        # 'test' split here; switch to 'dev' if that is the split you need.
        fertility_scores_by_subject = calculate_fertility_by_subject_anthropic(
            "masakhane/afrimmlu", lang_code, client, split='test'
        )

        # Keep the per-subject mapping under this language's code.
        results_anthropic_tokenizer[lang_code] = fertility_scores_by_subject

        # Show this language's scores as soon as they are available.
        print(f"Fertility Scores for {lang_name}:")
        for subject, fertility in fertility_scores_by_subject.items():
            print(f"  {subject}: {fertility}")

    except Exception as exc:
        print(f"Failed for {lang_name} ({lang_code}): {exc}")

# Print all fertility results by language and write one CSV row per
# (language, subject) pair. Languages that failed above have no entry in
# `results_anthropic_tokenizer` and are therefore absent from the file.
print("\nFertility Scores by Language:")
# Explicit UTF-8 so the file contents do not depend on the platform's default
# encoding; newline='' is what the csv module expects for files it writes.
with open('anthropic_language_subject_fertility.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['language', 'subject', 'mean_fertility'])

    for lang_code, fertility_scores in results_anthropic_tokenizer.items():
        print(f"\n{languages[lang_code]} ({lang_code}):")
        for subject, fertility in fertility_scores.items():
            print(f"  {subject}: {fertility}")
            writer.writerow([lang_code, subject, fertility])

Processing language: Amharic (amh)


Calculating fertility for amh: 100%|██████████| 500/500 [04:00<00:00,  2.08it/s]


Fertility Scores for Amharic:
  elementary_mathematics: 4.655221115622704
  high_school_geography: 6.456467084200093
  international_law: 6.204829715904315
  global_facts: 5.613975072668583
  high_school_microeconomics: 6.504111127447751
Processing language: English (eng)


Calculating fertility for eng: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for English:
  global_facts: 1.8575151967698371
  international_law: 1.3356384626441982
  high_school_microeconomics: 1.4450216747955205
  high_school_geography: 1.6429125178060202
  elementary_mathematics: 2.1364648298695696
Processing language: Ewe (ewe)


Calculating fertility for ewe: 100%|██████████| 500/500 [04:58<00:00,  1.67it/s]


Fertility Scores for Ewe:
  elementary_mathematics: 3.161848332379317
  high_school_geography: 4.048028991758661
  international_law: 3.8293272310746276
  global_facts: 3.352552191383268
  high_school_microeconomics: 3.567263896677433
Processing language: French (fra)


Calculating fertility for fra: 100%|██████████| 500/500 [04:56<00:00,  1.68it/s]


Fertility Scores for French:
  elementary_mathematics: 2.374683199749252
  high_school_geography: 2.2033310005003446
  international_law: 1.930887139520635
  global_facts: 2.199811343677241
  high_school_microeconomics: 1.9543735770947779
Processing language: Hausa (hau)


Calculating fertility for hau: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Hausa:
  elementary_mathematics: 2.843816947802056
  high_school_geography: 2.801808892495653
  international_law: 2.7009566847678332
  global_facts: 2.5989485937989882
  high_school_microeconomics: 2.40932316233429
Processing language: Igbo (ibo)


Calculating fertility for ibo: 100%|██████████| 500/500 [04:57<00:00,  1.68it/s]


Fertility Scores for Igbo:
  elementary_mathematics: 3.357371850688904
  high_school_geography: 3.2989926880327607
  international_law: 3.131062516858652
  global_facts: 3.0659096950289264
  high_school_microeconomics: 3.3795464423653505
Processing language: Kinyarwanda (kin)


Calculating fertility for kin: 100%|██████████| 500/500 [04:57<00:00,  1.68it/s]


Fertility Scores for Kinyarwanda:
  elementary_mathematics: 3.1495489785636583
  high_school_geography: 3.4773355874252547
  international_law: 3.384046170407935
  global_facts: 3.1224342437360497
  high_school_microeconomics: 3.4453187660941746
Processing language: Lingala (lin)


Calculating fertility for lin: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Lingala:
  elementary_mathematics: 2.4690481895660157
  high_school_geography: 2.22609102122758
  international_law: 2.0534058463739133
  global_facts: 2.3255190268069286
  high_school_microeconomics: 2.0583037918643443
Processing language: Luganda (lug)


Calculating fertility for lug: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Luganda:
  elementary_mathematics: 3.231471310603535
  high_school_geography: 3.880871562884966
  international_law: 3.6955589443181225
  global_facts: 3.2811653539694547
  high_school_microeconomics: 3.7645650564929958
Processing language: Oromo (orm)


Calculating fertility for orm: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Oromo:
  elementary_mathematics: 3.323268464994895
  high_school_geography: 3.700021297151904
  international_law: 3.4692383474789805
  global_facts: 3.5131667104137767
  high_school_microeconomics: 3.320626342572426
Processing language: Shona (sna)


Calculating fertility for sna: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Shona:
  elementary_mathematics: 3.237981093034845
  high_school_geography: 3.548753436583149
  international_law: 3.41366004019341
  global_facts: 3.5351143849782085
  high_school_microeconomics: 3.586010687606505
Processing language: Southern Sotho (sot)


Calculating fertility for sot: 100%|██████████| 500/500 [04:58<00:00,  1.67it/s]


Fertility Scores for Southern Sotho:
  elementary_mathematics: 2.5926158995419724
  high_school_geography: 2.344908423218838
  international_law: 2.259718210141331
  global_facts: 2.443388900136568
  high_school_microeconomics: 2.3402555654977175
Processing language: Swahili (swa)


Calculating fertility for swa: 100%|██████████| 500/500 [04:58<00:00,  1.67it/s]


Fertility Scores for Swahili:
  elementary_mathematics: 2.779690673003809
  global_facts: 2.865135696607639
  high_school_geography: 2.8436629307160475
  high_school_microeconomics: 2.595500194740923
  international_law: 2.5759650973299797
Processing language: Twi (twi)


Calculating fertility for twi: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Twi:
  elementary_mathematics: 3.40552706462953
  high_school_geography: 3.5400562853630206
  international_law: 3.210372819963722
  global_facts: 3.222682705418969
  high_school_microeconomics: 3.1507782491554672
Processing language: Wolof (wol)


Calculating fertility for wol: 100%|██████████| 500/500 [04:58<00:00,  1.68it/s]


Fertility Scores for Wolof:
  elementary_mathematics: 2.5374441167996475
  high_school_geography: 2.421551278001745
  international_law: 2.2973813144068886
  global_facts: 2.4800598837671206
  high_school_microeconomics: 2.252358192132977
Processing language: Xhosa (xho)


Calculating fertility for xho: 100%|██████████| 500/500 [04:57<00:00,  1.68it/s]


Fertility Scores for Xhosa:
  elementary_mathematics: 3.8421804321620203
  high_school_geography: 4.218842278447361
  international_law: 4.149637867187156
  global_facts: 4.442821357197474
  high_school_microeconomics: 3.956619589371472
Processing language: Yoruba (yor)


Calculating fertility for yor: 100%|██████████| 500/500 [04:58<00:00,  1.67it/s]


Fertility Scores for Yoruba:
  elementary_mathematics: 3.7903506266292597
  high_school_geography: 4.832122943999086
  international_law: 4.3423874269506255
  global_facts: 4.298834761041142
  high_school_microeconomics: 4.145022673213099
Processing language: Zulu (zul)


Calculating fertility for zul: 100%|██████████| 500/500 [04:57<00:00,  1.68it/s]

Fertility Scores for Zulu:
  elementary_mathematics: 3.399386397452383
  high_school_geography: 4.1530064250423875
  international_law: 4.080595826717445
  global_facts: 3.9765399042329603
  high_school_microeconomics: 4.076263623867853

Fertility Scores by Language:

Amharic (amh):
  elementary_mathematics: 4.655221115622704
  high_school_geography: 6.456467084200093
  international_law: 6.204829715904315
  global_facts: 5.613975072668583
  high_school_microeconomics: 6.504111127447751

English (eng):
  global_facts: 1.8575151967698371
  international_law: 1.3356384626441982
  high_school_microeconomics: 1.4450216747955205
  high_school_geography: 1.6429125178060202
  elementary_mathematics: 2.1364648298695696

Ewe (ewe):
  elementary_mathematics: 3.161848332379317
  high_school_geography: 4.048028991758661
  international_law: 3.8293272310746276
  global_facts: 3.352552191383268
  high_school_microeconomics: 3.567263896677433

French (fra):
  elementary_mathematics: 2.374683199749252




# Output

The results are written to a CSV file (`anthropic_language_subject_fertility.csv`) containing the mean fertility score for each language–subject pair.