# Analyze Target Text Length Distribution

This notebook analyzes the length distribution (in tokens and in characters) of the generated target texts stored in a predictions JSONL file.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Load the predictions file
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
predictions_file = Path("/mnt/d/Pobrane/poleval-gender/solution/task_proofreading/02_inference/predictions_test_B.jsonl")

# Read all entries: the file is parsed as JSON Lines (one JSON document per line).
# Whitespace-only lines (e.g. a stray blank line at the end of the file) are
# skipped, because json.loads would raise JSONDecodeError on them.
with open(predictions_file, 'r', encoding='utf-8') as f:
    entries = [json.loads(line) for line in f if line.strip()]

print(f"Total entries: {len(entries)}")

In [None]:
# Calculate token counts for target texts
#
# This cell needs two names from earlier cells:
#   * `entries`   - the list of dicts parsed from the predictions file
#   * `tokenizer` - an object exposing `encode(text)`; it is NOT created in any
#                   visible cell, so check for it up front and fail with an
#                   actionable message instead of a bare NameError mid-loop.
try:
    tokenizer
except NameError:
    raise NameError(
        "`tokenizer` is not defined. Create the tokenizer (an object with an "
        "`encode(text)` method) in an earlier cell before running this cell."
    ) from None

target_token_counts = []
target_char_lengths = []

for entry in entries:
    # A missing 'target' key and an explicit JSON null are both treated as
    # empty text, so neither the tokenizer nor len() receives None.
    target_text = entry.get('target') or ''
    # Count tokens
    target_token_counts.append(len(tokenizer.encode(target_text)))
    target_char_lengths.append(len(target_text))

# Create DataFrame for analysis (one row per entry, in file order, so the
# default integer index of `df` matches positions in `entries`)
df = pd.DataFrame({
    'ipis_id': [e['ipis_id'] for e in entries],
    'target_tokens': target_token_counts,
    'target_chars': target_char_lengths
})

print("Token count statistics:")
print(df['target_tokens'].describe())
print(f"\nEntries with target tokens > 500: {(df['target_tokens'] > 500).sum()}")
print(f"Entries with target tokens > 1000: {(df['target_tokens'] > 1000).sum()}")
print("\nCharacter count statistics:")
print(df['target_chars'].describe())

In [None]:
# Visualize the distribution
# Four panels: full histogram, histogram limited to 0-1000 tokens, box plot,
# and cumulative distribution. Every panel marks the 500-token level in red.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hist, ax_zoom), (ax_box, ax_cdf) = axes

token_series = df['target_tokens']
ref_line_kwargs = dict(color='red', linestyle='--', label='500 tokens', linewidth=2)

# Histogram of all token lengths
ax_hist.hist(token_series, bins=50, edgecolor='black')
ax_hist.set(
    xlabel='Target Length (tokens)',
    ylabel='Frequency',
    title='Distribution of Target Text Lengths (Tokens)',
)
ax_hist.axvline(x=500, **ref_line_kwargs)
ax_hist.legend()

# Histogram zoomed to 0-1000
ax_zoom.hist(token_series, bins=50, range=(0, 1000), edgecolor='black')
ax_zoom.set(
    xlabel='Target Length (tokens)',
    ylabel='Frequency',
    title='Distribution (0-1000 tokens)',
)
ax_zoom.axvline(x=500, **ref_line_kwargs)
ax_zoom.legend()

# Box plot (vertical, so the reference line is horizontal here)
ax_box.boxplot(token_series, vert=True)
ax_box.set(
    ylabel='Target Length (tokens)',
    title='Box Plot of Target Lengths',
)
ax_box.axhline(y=500, **ref_line_kwargs)
ax_box.legend()

# Cumulative distribution: the k-th smallest value (1-based) is plotted
# against k / total * 100 percent.
sorted_tokens = sorted(token_series)
total = len(sorted_tokens)
cumulative = [rank / total * 100 for rank in range(1, total + 1)]
ax_cdf.plot(sorted_tokens, cumulative)
ax_cdf.set(
    xlabel='Target Length (tokens)',
    ylabel='Cumulative Percentage (%)',
    title='Cumulative Distribution',
)
ax_cdf.axvline(x=500, **ref_line_kwargs)
ax_cdf.grid(True, alpha=0.3)
ax_cdf.legend()

plt.tight_layout()
plt.show()

In [None]:
# Show examples of longest targets
print("Top 10 longest target texts (by token count):\n")
longest = df.nlargest(10, 'target_tokens')
for idx, row in longest.iterrows():
    print(f"{row['ipis_id']}: {row['target_tokens']} tokens ({row['target_chars']} chars)")
    # `df` is built from `entries` in order with the default integer index,
    # so the row label `idx` is also the position of the entry in `entries`.
    # Use the same tolerant lookup as the counting cell: a missing or null
    # 'target' is shown as an empty preview instead of raising KeyError/TypeError.
    target_text = entries[idx].get('target') or ''
    print(f"Text preview: {target_text[:200]}...")
    print("-" * 80)

In [None]:
# Token count distribution by ranges
# Buckets are half-open intervals [low, high); the final bucket is unbounded.
bucket_edges = [0, 50, 100, 200, 300, 400, 500, 750, 1000, 2000]
ranges = [
    (low, high, f'{low}-{high}')
    for low, high in zip(bucket_edges, bucket_edges[1:])
]
ranges.append((bucket_edges[-1], float('inf'), '2000+'))

print("\nDistribution by token count ranges:")
print("-" * 50)
token_col = df['target_tokens']
for low, high, label in ranges:
    in_bucket = token_col.ge(low) & token_col.lt(high)
    count = in_bucket.sum()
    percentage = (count / len(df)) * 100
    print(f"{label:12s}: {count:4d} entries ({percentage:5.2f}%)")