In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# -------------------------------
# Resources shared by the analysis below:
# the Hugging Face dataset and the tokenizer
# whose token counts are being measured.
# -------------------------------
DATASET_ID = "neuralcatcher/hateful_memes"
TOKENIZER_ID = "bert-base-uncased"

dataset = load_dataset(DATASET_ID)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

# -------------------------------
# Function to compute token lengths
# -------------------------------
def get_length_stats(split, report_threshold=88):
    """Print and return token-length statistics for one dataset split.

    Each row's ``"text"`` field is tokenized with the module-level
    ``tokenizer`` (special tokens added, truncation disabled) and the number
    of resulting input ids is recorded. Summary statistics are printed to
    stdout.

    Args:
        split: Key into the module-level ``dataset`` (e.g. ``"train"``).
        report_threshold: Rows whose token count is greater than or equal to
            this value are printed in full so they can be inspected by hand.
            Pass ``None`` to disable the per-row dump. Defaults to 88, the
            value that used to be hard-coded.

    Returns:
        A 1-D NumPy integer array with one token count per row. The array is
        empty when the split contains no rows; in that case only the header
        and sample count are printed.
    """
    lengths = []

    for row in dataset[split]:
        tokens = tokenizer(
            row["text"],
            add_special_tokens=True,   # CLS + SEP are part of the count
            truncation=False,          # measure the full, untruncated length
        )["input_ids"]
        n_tokens = len(tokens)

        # Surface unusually long rows for manual inspection.
        if report_threshold is not None and n_tokens >= report_threshold:
            print(row)
        lengths.append(n_tokens)

    # Explicit dtype keeps an empty result an integer array as well.
    lengths = np.array(lengths, dtype=int)

    print(f"\n=== Split: {split} ===")
    print(f"Samples: {len(lengths)}")

    if lengths.size == 0:
        # min()/max() raise ValueError on a zero-size array, so stop here
        # instead of crashing on an empty split.
        print("No rows in this split; skipping length statistics.")
        return lengths

    print(f"Min length:       {lengths.min()}")
    print(f"Max length:       {lengths.max()}")
    print(f"Mean length:      {lengths.mean():.2f}")
    print(f"Median length:    {np.median(lengths):.2f}")
    print(f"Std deviation:    {lengths.std():.2f}")
    print(f"95th percentile:  {np.percentile(lengths, 95):.2f}")
    print(f"99th percentile:  {np.percentile(lengths, 99):.2f}")

    return lengths

# -------------------------------
# Report length statistics for every
# split present in the loaded dataset
# -------------------------------
for split in dataset:
    get_length_stats(split)


Repo card metadata block was not found. Setting CardData to empty.


{'id': '84291', 'img': 'img/84291.png', 'label': 1, 'text': "housing, free gas, free electricity, free healthcare and free education for my wives and children. to show graditude for your generosity, i'll groom your 12 year old daughters, blow up your trains, planes and buses and preach hate through a dawah stall in your local city and town centres with the purpose of turning your generous country into the same shithole i originally took refuge from, allahu akbar!"}

=== Split: train ===
Samples: 8500
Min length:       3
Max length:       88
Mean length:      16.57
Median length:    15.00
Std deviation:    8.86
95th percentile:  33.00
99th percentile:  46.00

=== Split: validation ===
Samples: 1040
Min length:       3
Max length:       54
Mean length:      13.60
Median length:    12.00
Std deviation:    7.12
95th percentile:  26.00
99th percentile:  42.22

=== Split: test ===
Samples: 3000
Min length:       3
Max length:       74
Mean length:      13.67
Median length:    12.00
Std devia