In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

## Load System Prompt

In [None]:
# Read the Polish proofreading system prompt as UTF-8 text and drop
# leading/trailing whitespace so it does not count towards the length.
prompt_path = Path('system_prompts/proofreading/system_prompt_pl_proofreading')
SYSTEM_PROMPT = prompt_path.read_text(encoding='utf-8').strip()

# Quick sanity check: total size plus a short preview of the prompt.
print(f"System prompt length: {len(SYSTEM_PROMPT)} characters")
print(f"System prompt (first 200 chars):\n{SYSTEM_PROMPT[:200]}...")

## Load All Datasets

In [None]:
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Each non-blank line of the file is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a stray empty line at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # `line.strip()` is empty for blank lines, which carry no record.
        return [json.loads(line) for line in f if line.strip()]

# Read the four task A splits in one go; the order of the paths matches the
# order of the names on the left-hand side.
train_data, dev_data, test_a_data, test_b_data = map(load_jsonl, (
    'data/taskA/train.jsonl',
    'data/taskA/dev.jsonl',
    'data/taskA/test_A.jsonl',
    'data/taskA/test_gold_standard_normalised_B.jsonl',
))

# Report how many examples each split contains.
print(
    f"Loaded {len(train_data)} training examples",
    f"Loaded {len(dev_data)} validation examples",
    f"Loaded {len(test_a_data)} test A examples",
    f"Loaded {len(test_b_data)} test B examples (with targets)",
    sep="\n",
)

## Calculate Lengths

In [None]:
def calculate_lengths(data, system_prompt, has_target=True):
    """Collect per-example character lengths for one dataset split.

    Args:
        data: Iterable of dict records; each must provide 'source' and
            'prompt', and may provide 'target'.
        system_prompt: System prompt text whose length is added to every
            example's prompt and total-input length.
        has_target: When False, 'target' fields are ignored entirely.

    Returns:
        A dict of lists keyed by 'source', 'prompt', 'system_plus_prompt' and
        'total_input' (one entry per record). A 'target' list is added only
        when at least one target length was collected; records whose target
        is missing or None contribute nothing to it.
    """
    records = list(data)
    sys_len = len(system_prompt)

    src_lens = [len(rec['source']) for rec in records]
    task_lens = [len(rec['prompt']) for rec in records]

    lengths = {
        'source': src_lens,
        'prompt': task_lens,
        'system_plus_prompt': [sys_len + task for task in task_lens],
        'total_input': [sys_len + task + src for task, src in zip(task_lens, src_lens)],
    }

    if has_target:
        tgt_lens = [
            len(rec['target']) for rec in records if rec.get('target') is not None
        ]
        # The key is left out altogether when no target was found.
        if tgt_lens:
            lengths['target'] = tgt_lens

    return lengths

# Train, dev and test B are processed with the default has_target=True.
train_lengths, dev_lengths, test_b_lengths = (
    calculate_lengths(split, SYSTEM_PROMPT)
    for split in (train_data, dev_data, test_b_data)
)
# Test A is processed with target collection switched off.
test_a_lengths = calculate_lengths(test_a_data, SYSTEM_PROMPT, has_target=False)

print("Length calculations completed for all datasets")

## Statistics - Training Set

In [None]:
def print_statistics(lengths, dataset_name="Dataset"):
    """Print summary statistics (in characters) for one dataset split.

    For every available measurement the mean, median, standard deviation,
    min, max and the 95th/99th percentiles are printed. When target lengths
    are available, the average absolute and relative length change from
    source to target is printed as well.

    Args:
        lengths: Dict of length lists with the keys 'source', 'prompt',
            'system_plus_prompt' and 'total_input'. 'target' is optional:
            if it is missing or empty, the target section and the
            target-vs-source comparison are skipped instead of raising
            KeyError.
        dataset_name: Label used in the banner (printed upper-cased).
    """
    rule = "=" * 80
    target = lengths.get('target')
    has_target = target is not None and len(target) > 0

    print(rule)
    print(f"{dataset_name.upper()} - SEQUENCE LENGTH STATISTICS (in characters)")
    print(rule)

    stats = [("Source text", lengths['source'])]
    if has_target:
        stats.append(("Target text", target))
    stats.extend([
        ("Task prompt", lengths['prompt']),
        ("System prompt + Task prompt", lengths['system_plus_prompt']),
        ("Total input (System + Task prompt + Source)", lengths['total_input']),
    ])

    for name, values in stats:
        print(f"\n{name}:")
        print(f"  Mean:     {np.mean(values):8.2f}")
        print(f"  Median:   {np.median(values):8.2f}")
        print(f"  Std Dev:  {np.std(values):8.2f}")
        print(f"  Min:      {np.min(values):8.0f}")
        print(f"  Max:      {np.max(values):8.0f}")
        print(f"  95th %:   {np.percentile(values, 95):8.2f}")
        print(f"  99th %:   {np.percentile(values, 99):8.2f}")

    print("\n" + rule)
    if has_target:
        # The comparison is only meaningful when targets exist.
        mean_source = np.mean(lengths['source'])
        mean_target = np.mean(target)
        print(f"Average length increase (Target - Source): {mean_target - mean_source:.2f} chars")
        print(f"Relative increase: {(mean_target / mean_source - 1) * 100:.2f}%")
        print(rule)

# Full report for the training split.
print_statistics(lengths=train_lengths, dataset_name="Training Set")

## Statistics - Validation Set

In [None]:
# Full report for the validation (dev) split.
print_statistics(lengths=dev_lengths, dataset_name="Validation Set")

## Statistics - Test A Set

In [None]:
def print_test_statistics(lengths, dataset_name="Test Set"):
    """Print source-length statistics for a split that has no targets.

    Args:
        lengths: Dict of length lists; only the 'source' entry is read.
        dataset_name: Label used in the banner (printed upper-cased).
    """
    divider = "=" * 80
    print(divider)
    print(f"{dataset_name.upper()} - SOURCE LENGTH STATISTICS (in characters)")
    print(divider)

    print("\nSource text:")
    source = lengths['source']
    # (line prefix, reducer, format spec) for each reported statistic.
    report = (
        ("  Mean:     ", np.mean, "8.2f"),
        ("  Median:   ", np.median, "8.2f"),
        ("  Std Dev:  ", np.std, "8.2f"),
        ("  Min:      ", np.min, "8.0f"),
        ("  Max:      ", np.max, "8.0f"),
    )
    for prefix, reducer, spec in report:
        print(prefix + format(reducer(source), spec))
    print(divider)

# Source-only report for test A, whose lengths were computed without targets.
print_test_statistics(lengths=test_a_lengths, dataset_name="Test A Set")

## Statistics - Test B Set

In [None]:
# Full report for test B, which was loaded from the gold-standard file.
print_statistics(lengths=test_b_lengths, dataset_name="Test B Set")

## Token Estimation

Estimate token counts (rough approximation: 1 token ≈ 4 characters for English, but Polish may differ).

In [None]:
# Heuristic conversion factor: estimated tokens = characters / CHARS_PER_TOKEN.
CHARS_PER_TOKEN = 4  # This is an approximation


def _est_tokens(char_count):
    """Convert a character count into a rough token count."""
    return char_count / CHARS_PER_TOKEN


rule = "=" * 80
print(rule)
print("ESTIMATED TOKEN COUNTS (rough approximation)")
print(rule)
print(f"\nAssuming ~{CHARS_PER_TOKEN} characters per token (adjust based on actual tokenizer)\n")

print("Training Set:")
print(f"  Average source tokens:              ~{_est_tokens(np.mean(train_lengths['source'])):.0f}")
print(f"  Average target tokens:              ~{_est_tokens(np.mean(train_lengths['target'])):.0f}")
print(f"  Average system+prompt tokens:       ~{_est_tokens(np.mean(train_lengths['system_plus_prompt'])):.0f}")
print(f"  Average total input tokens:         ~{_est_tokens(np.mean(train_lengths['total_input'])):.0f}")
print(f"  Max total input tokens:             ~{_est_tokens(np.max(train_lengths['total_input'])):.0f}")
print(f"  99th percentile total input tokens: ~{_est_tokens(np.percentile(train_lengths['total_input'], 99)):.0f}")

print("\nValidation Set:")
print(f"  Average source tokens:              ~{_est_tokens(np.mean(dev_lengths['source'])):.0f}")
print(f"  Average target tokens:              ~{_est_tokens(np.mean(dev_lengths['target'])):.0f}")
print(f"  Average system+prompt tokens:       ~{_est_tokens(np.mean(dev_lengths['system_plus_prompt'])):.0f}")
print(f"  Average total input tokens:         ~{_est_tokens(np.mean(dev_lengths['total_input'])):.0f}")
print(f"  Max total input tokens:             ~{_est_tokens(np.max(dev_lengths['total_input'])):.0f}")
print(rule)

## Visualizations

In [None]:
def _plot_length_panel(ax, values, title, color=None, bins=50, extra_lines=()):
    """Draw one length histogram with a dashed red line at the mean.

    Args:
        ax: Matplotlib axes to draw on.
        values: Sequence of lengths (in characters) to histogram.
        title: Panel title.
        color: Bar colour; None keeps matplotlib's default.
        bins: Number of histogram bins.
        extra_lines: Iterable of (x, color, label) tuples drawn as additional
            dashed vertical lines after the mean line.
    """
    mean_value = np.mean(values)
    ax.hist(values, bins=bins, edgecolor='black', alpha=0.7, color=color)
    ax.axvline(mean_value, color='red', linestyle='--', label=f"Mean: {mean_value:.0f}")
    for x_pos, line_color, line_label in extra_lines:
        ax.axvline(x_pos, color=line_color, linestyle='--', label=line_label)
    ax.set_xlabel('Length (characters)')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


# Token budget marked on the total-input panel. The character position and the
# legend text are both derived from CHARS_PER_TOKEN so they cannot drift apart.
CONTEXT_LIMIT_TOKENS = 4096
context_limit_chars = CONTEXT_LIMIT_TOKENS * CHARS_PER_TOKEN

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Sequence Length Distributions - Training Set', fontsize=16)

_plot_length_panel(axes[0, 0], train_lengths['source'], 'Source Text Length')
_plot_length_panel(axes[0, 1], train_lengths['target'], 'Target Text Length', color='green')
_plot_length_panel(
    axes[1, 0],
    train_lengths['system_plus_prompt'],
    'System Prompt + Task Prompt Length',
    color='orange',
    bins=30,
)
_plot_length_panel(
    axes[1, 1],
    train_lengths['total_input'],
    'Total Input Length (System + Task Prompt + Source)',
    color='purple',
    extra_lines=[(
        context_limit_chars,
        'orange',
        f"{CONTEXT_LIMIT_TOKENS} tokens (~{context_limit_chars:.0f} chars)",
    )],
)

plt.tight_layout()
plt.savefig('sequence_length_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plot saved as 'sequence_length_distributions.png'")

## Check Examples That Exceed Token Limits

In [None]:
# Count training examples whose total input is longer than each token budget,
# with budgets converted to characters via CHARS_PER_TOKEN.
token_limits = [512, 1024, 2048, 4096, 8192]

rule = "=" * 80
print(rule)
print("EXAMPLES EXCEEDING TOKEN LIMITS (Training Set)")
print(rule)

total_inputs = train_lengths['total_input']
for limit in token_limits:
    char_limit = limit * CHARS_PER_TOKEN
    # Summing booleans counts the examples strictly above the limit.
    exceeding = sum(length > char_limit for length in total_inputs)
    percentage = (exceeding / len(total_inputs)) * 100
    print(f"\nToken limit: {limit:5d} (~{char_limit:6d} chars)")
    print(f"  Exceeding examples: {exceeding:5d} / {len(total_inputs)} ({percentage:.2f}%)")

print("\n" + rule)

## Summary for Paper

Key statistics to report in the paper:

In [None]:
# Headline numbers for the paper, all taken from the training split.
# Token figures are character counts divided by CHARS_PER_TOKEN.
def _as_tokens(char_count):
    """Convert a character count into an estimated token count."""
    return char_count / CHARS_PER_TOKEN


rule = "=" * 80
summary_source = train_lengths['source']
summary_target = train_lengths['target']
summary_prompt = train_lengths['prompt']
summary_total = train_lengths['total_input']

print(rule)
print("SUMMARY FOR PAPER")
print(rule)

print("\nDataset size:")
print(f"  Training examples:   {len(train_data):,}")
print(f"  Validation examples: {len(dev_data):,}")

print("\nSystem prompt:")
print(f"  Length: {len(SYSTEM_PROMPT)} characters (~{_as_tokens(len(SYSTEM_PROMPT)):.0f} tokens)")

print("\nAverage task prompt:")
print(f"  Length: {np.mean(summary_prompt):.2f} characters (~{_as_tokens(np.mean(summary_prompt)):.0f} tokens)")

print("\nSource texts (to be transformed):")
print(f"  Mean length:   {np.mean(summary_source):.2f} characters (~{_as_tokens(np.mean(summary_source)):.0f} tokens)")
print(f"  Median length: {np.median(summary_source):.2f} characters (~{_as_tokens(np.median(summary_source)):.0f} tokens)")
print(f"  Max length:    {np.max(summary_source)} characters (~{_as_tokens(np.max(summary_source)):.0f} tokens)")

print("\nTarget texts (gender-inclusive):")
mean_source_len = np.mean(summary_source)
mean_target_len = np.mean(summary_target)
print(f"  Mean length:   {mean_target_len:.2f} characters (~{_as_tokens(mean_target_len):.0f} tokens)")
print(f"  Average increase: {mean_target_len - mean_source_len:.2f} chars ({(mean_target_len / mean_source_len - 1) * 100:.2f}%)")

print("\nTotal input length (System + Task prompt + Source):")
print(f"  Mean:   {np.mean(summary_total):.2f} characters (~{_as_tokens(np.mean(summary_total)):.0f} tokens)")
print(f"  Median: {np.median(summary_total):.2f} characters (~{_as_tokens(np.median(summary_total)):.0f} tokens)")
print(f"  95th percentile: {np.percentile(summary_total, 95):.2f} characters (~{_as_tokens(np.percentile(summary_total, 95)):.0f} tokens)")
print(f"  Max:    {np.max(summary_total)} characters (~{_as_tokens(np.max(summary_total)):.0f} tokens)")

print("\n" + rule)
# The note quotes the constant actually used above, so it stays true if
# CHARS_PER_TOKEN is changed.
print(f"Note: Token estimates assume ~{CHARS_PER_TOKEN} characters per token.")
print("For accurate token counts, use the actual tokenizer (e.g., Qwen3 tokenizer).")
print(rule)

## Generate Comprehensive LaTeX Table for Paper

In [None]:
# Emit a LaTeX table of character-length statistics for all four splits.
# Every section shares the same seven statistics, so the rows are generated
# from one (label, reducer) table instead of being written out by hand.
_TABLE_STATS = (
    ("Mean", np.mean),
    ("Std Dev", np.std),
    ("Median", np.median),
    ("Min", np.min),
    ("Max", np.max),
    ("95th \\%", lambda values: np.percentile(values, 95)),
    ("99th \\%", lambda values: np.percentile(values, 99)),
)

# (lengths key, section heading, LaTeX comment printed before the heading or
# None, whether the Test A column has data for this section).
_TABLE_SECTIONS = (
    ('source', "\\multicolumn{5}{l}{\\textbf{\\textit{Source text}}} \\\\", None, True),
    ('target', "\\multicolumn{5}{l}{\\textbf{\\textit{Target text}}\\textsuperscript{2}} \\\\", None, False),
    ('prompt', "\\multicolumn{5}{l}{\\textbf{\\textit{Task prompt}}} \\\\", None, True),
    (
        'total_input',
        "\\multicolumn{5}{l}{\\textbf{\\textit{Total input}}\\textsuperscript{1}} \\\\",
        "% Using a footnote marker for the long definition to save space",
        True,
    ),
)


def _latex_stat_rows(columns):
    """Build one LaTeX row per statistic for the given columns.

    Args:
        columns: Length lists in column order (Train, Val, Test A, Test B);
            a None entry is rendered as '---'.

    Returns:
        A list of row strings, each ending in a LaTeX line break.
    """
    rows = []
    for label, reducer in _TABLE_STATS:
        cells = ["---" if values is None else f"{reducer(values):.2f}" for values in columns]
        rows.append(" & ".join([label, *cells]) + " \\\\")
    return rows


print("\\begin{table}[!htbp]")
print("\\centering")
print("\\small")
print("\\caption{Dataset statistics: character-level sequence lengths}")
print("\\label{tab:dataset_stats}")
print()
print("\\begin{tabularx}{\\columnwidth}{Xrrrr}")
print("\\toprule")
print("\\textbf{Metric} & \\textbf{Train} & \\textbf{Val} & \\textbf{Test A} & \\textbf{Test B} \\\\")
print("\\midrule")

for section_index, (key, heading, latex_comment, test_a_available) in enumerate(_TABLE_SECTIONS):
    if latex_comment is not None:
        print(latex_comment)
    print(heading)
    section_columns = [
        train_lengths[key],
        dev_lengths[key],
        test_a_lengths[key] if test_a_available else None,
        test_b_lengths[key],
    ]
    for row in _latex_stat_rows(section_columns):
        print(row)
    # Sections are separated by \midrule; the last one closes with \bottomrule.
    is_last_section = section_index == len(_TABLE_SECTIONS) - 1
    print("\\bottomrule" if is_last_section else "\\midrule")

print("\\end{tabularx}")
print()
print("\\vspace{2pt}")
print("\\raggedright")
print(f"\\footnotesize{{\\textsuperscript{{1}} System ({len(SYSTEM_PROMPT)} char.) + Task prompt + Source}}")
print()
print("\\footnotesize{\\textsuperscript{2} Blank fields denote the fact that test sets were shared with no target sequences available to the participants at the time of the competition}")
print("\\end{table}")