In [None]:
%pip install tiktoken
%pip install huggingface-hub
%pip install datasets
# login using hf auth login (in the terminal) and paste your token

In [15]:
from datasets import load_dataset

ds = load_dataset("nayeon212/BLEnD", "annotations")

Generating DZ split: 500 examples [00:00, 19035.60 examples/s]
Generating DZ split: 500 examples [00:00, 19035.60 examples/s]
Generating AS split: 500 examples [00:00, 11681.67 examples/s]
Generating AS split: 500 examples [00:00, 11681.67 examples/s]
Generating AZ split: 0 examples [00:00, ? examples/s]



DatasetGenerationError: An error occurred while generating the dataset

In [14]:
# Display first few rows of the dataset
print(f"Dataset splits: {list(ds.keys())}")
print(f"\nAZ split size: {len(ds['AZ'])} rows")
print(f"\nColumn names: {ds['AZ'].column_names}")
print("\nFirst 3 rows from AZ split:")
for i in range(3):
    print(f"\n--- Row {i} ---")
    print(ds['AZ'][i])

Dataset splits: ['DZ', 'AS', 'AZ', 'CN', 'ET', 'GR', 'ID', 'IR', 'MX', 'KP', 'NG', 'KR', 'ES', 'GB', 'US', 'JB']

AZ split size: 500 rows

Column names: ['ID', 'Topic', 'Source', 'Question', 'Translation']

First 3 rows from AZ split:

--- Row 0 ---
{'ID': 'Al-en-01', 'Topic': 'Food', 'Source': 'English (US)', 'Question': 'Azərbaycanda məktəbəqədər uşaqlar üçün tipik qəlyanaltı nədir?', 'Translation': 'What is a common snack for preschool kids in Azerbaijan?'}

--- Row 1 ---
{'ID': 'Al-en-02', 'Topic': 'Food', 'Source': 'English (US)', 'Question': 'Azərbaycanda pivə ilə yaxşı gedən populyar yemək nədir?', 'Translation': 'What is a popular food to go with beer in Azerbaijan?'}

--- Row 2 ---
{'ID': 'Al-en-04', 'Topic': 'Food', 'Source': 'English (US)', 'Question': 'Azərbaycanda ən populyar meyvə hansıdır?', 'Translation': 'What is the most popular fruit in Azerbaijan?'}


In [10]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [None]:
# Calculate average input and output tokens across ALL splits
encoding_name = "o200k_base"  # GPT-4o, 4.1, 5 encoding

# Get all available splits
all_splits = list(ds.keys())
print(f"Processing {len(all_splits)} splits: {all_splits}\n")

# Overall totals across all splits
overall_total_input_tokens = 0
overall_total_output_tokens = 0
overall_total_input_words = 0
overall_total_output_words = 0
overall_total_samples = 0

# Store per-split results
split_results = []

for split in all_splits:
    print(f"Processing split '{split}'...")
    
    sample_size = len(ds[split])
    total_input_tokens = 0
    total_output_tokens = 0
    total_input_words = 0
    total_output_words = 0
    
    for i in range(sample_size):
        input_text = ds[split][i]['query']
        output_text = ds[split][i]['answer']
        
        total_input_tokens += num_tokens_from_string(input_text, encoding_name)
        total_output_tokens += num_tokens_from_string(output_text, encoding_name)
        total_input_words += len(input_text.split())
        total_output_words += len(output_text.split())
        
        if (i + 1) % 5000 == 0:
            print(f"  Processed {i + 1}/{sample_size} samples...")
    
    # Calculate averages for this split
    avg_input_tokens = total_input_tokens / sample_size if sample_size > 0 else 0
    avg_output_tokens = total_output_tokens / sample_size if sample_size > 0 else 0
    avg_input_words = total_input_words / sample_size if sample_size > 0 else 0
    avg_output_words = total_output_words / sample_size if sample_size > 0 else 0
    
    split_results.append({
        'split': split,
        'samples': sample_size,
        'avg_query_tokens': avg_input_tokens,
        'avg_answer_tokens': avg_output_tokens,
        'avg_query_words': avg_input_words,
        'avg_answer_words': avg_output_words
    })
    
    # Add to overall totals
    overall_total_input_tokens += total_input_tokens
    overall_total_output_tokens += total_output_tokens
    overall_total_input_words += total_input_words
    overall_total_output_words += total_output_words
    overall_total_samples += sample_size
    
    print(f"  Done: {sample_size} samples, Avg query tokens: {avg_input_tokens:.2f}, Avg answer tokens: {avg_output_tokens:.2f}\n")

# Calculate overall averages
overall_avg_input_tokens = overall_total_input_tokens / overall_total_samples if overall_total_samples > 0 else 0
overall_avg_output_tokens = overall_total_output_tokens / overall_total_samples if overall_total_samples > 0 else 0
overall_avg_input_words = overall_total_input_words / overall_total_samples if overall_total_samples > 0 else 0
overall_avg_output_words = overall_total_output_words / overall_total_samples if overall_total_samples > 0 else 0

print("=" * 70)
print("OVERALL RESULTS ACROSS ALL SPLITS")
print("=" * 70)
print(f"Total splits processed: {len(split_results)}")
print(f"Total samples processed: {overall_total_samples}")
print(f"\nOverall average query tokens: {overall_avg_input_tokens:.2f}")
print(f"Overall average answer tokens: {overall_avg_output_tokens:.2f}")
print(f"Overall average query words: {overall_avg_input_words:.2f}")
print(f"Overall average answer words: {overall_avg_output_words:.2f}")
print("=" * 70)

# Display per-split breakdown
import pandas as pd
df_splits = pd.DataFrame(split_results)
print("\nPer-split breakdown:")
print(df_splits.to_string(index=False))

Processing 100231 samples...
Processed 10000 samples...
Processed 10000 samples...
Processed 20000 samples...
Processed 20000 samples...
Processed 30000 samples...
Processed 30000 samples...
Processed 40000 samples...
Processed 40000 samples...
Processed 50000 samples...
Processed 50000 samples...
Processed 60000 samples...
Processed 60000 samples...
Processed 70000 samples...
Processed 70000 samples...
Processed 80000 samples...
Processed 80000 samples...
Processed 90000 samples...
Processed 90000 samples...
Processed 100000 samples...

Results:
Average input tokens (query): 10.06
Average output tokens (answer): 136.87
Average input words (query): 9.06
Average output words (answer): 100.34
Total samples processed: 100231
Processed 100000 samples...

Results:
Average input tokens (query): 10.06
Average output tokens (answer): 136.87
Average input words (query): 9.06
Average output words (answer): 100.34
Total samples processed: 100231
