In [1]:
import pandas as pd
from collections import Counter
from datasets import Dataset, load_dataset

dataset = load_dataset("neuralcatcher/hateful_memes")

# De-duplicate every split: round-trip the rows through pandas, drop rows
# that repeat across all columns, then rebuild the Dataset without carrying
# the pandas index along as an extra column.
for i_split in list(dataset):
    i_data = dataset[i_split]
    unique_rows = pd.DataFrame(i_data).drop_duplicates()
    dataset[i_split] = Dataset.from_pandas(unique_rows, preserve_index=False)

# Class balance per split: absolute count and share of every label value
for split, data in dataset.items():
    labels = data["label"]      # label column of this split
    total = len(labels)
    counts = Counter(labels)    # label value -> number of rows

    # Build the whole report for the split, then emit it in one write.
    report_lines = [f"\nSplit: {split}"]
    for label, count in counts.items():
        report_lines.append(f"  Label {label}: {count} ({count/total:.2%})")
    print("\n".join(report_lines))

Repo card metadata block was not found. Setting CardData to empty.



Split: train
  Label 0: 5481 (64.48%)
  Label 1: 3019 (35.52%)

Split: validation
  Label 1: 247 (38.59%)
  Label 0: 393 (61.41%)

Split: test
  Label 1: 1240 (41.33%)
  Label 0: 1760 (58.67%)


In [2]:
# "Balanced" class weights: n_samples / (n_classes * class_count), using the
# train-split sizes (8500 rows: 5481 with label 0, 3019 with label 1).
n_samples, n_classes = 8500, 2
n_label0, n_label1 = 5481, 3019

w0 = n_samples / (n_classes * n_label0)
w1 = n_samples / (n_classes * n_label1)
w = w0 + w1

# Raw weights, followed by the same weights rescaled to sum to 1
w0, w1, w0 / w, w1 / w

(0.7754059478197409,
 1.4077509108976483,
 0.3551764705882353,
 0.6448235294117648)

In [3]:
# Inverse-frequency class weights: 1 / class_count, using the train-split
# label counts (5481 rows with label 0, 3019 rows with label 1).
n_label0, n_label1 = 5481, 3019

w0, w1 = 1 / n_label0, 1 / n_label1
w = w0 + w1

# Raw weights, followed by the same weights rescaled to sum to 1
w0, w1, w0 / w, w1 / w

(0.00018244845831052726,
 0.0003312355084465055,
 0.35517647058823526,
 0.6448235294117647)

In [4]:
from transformers import AutoTokenizer
import numpy as np

# Checkpoint whose tokenizer is used for the token-length statistics below.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# -------------------------------
# Function to compute token lengths
# -------------------------------
def get_length_stats(split, long_threshold=88):
    """Print token-length statistics for one split and return the lengths.

    Each row's ``text`` field in ``dataset[split]`` is tokenized with the
    module-level ``tokenizer`` (special tokens added, no truncation) and the
    number of resulting ``input_ids`` is recorded.

    Args:
        split: Key of the split inside the module-level ``dataset``.
        long_threshold: Rows with at least this many tokens are printed in
            full so unusually long samples can be inspected. Defaults to 88,
            the value that used to be hard-coded here. Pass ``None`` to
            disable the row dump.

    Returns:
        1-D integer NumPy array holding one token count per row, in row
        order. The array is empty when the split has no rows.
    """
    lengths = []

    for row in dataset[split]:
        token_ids = tokenizer(
            row["text"],
            add_special_tokens=True,   # count CLS + SEP as well
            truncation=False,          # measure the full, untruncated length
        )["input_ids"]
        n_tokens = len(token_ids)

        if long_threshold is not None and n_tokens >= long_threshold:
            print(row)
        lengths.append(n_tokens)

    # dtype=int keeps an empty split as an integer array instead of float64.
    lengths = np.array(lengths, dtype=int)

    print(f"\n=== Split: {split} ===")
    print(f"Samples: {len(lengths)}")

    # min/max/percentile raise on zero-size arrays, so stop after the count.
    if lengths.size == 0:
        return lengths

    print(f"Min length:       {lengths.min()}")
    print(f"Max length:       {lengths.max()}")
    print(f"Mean length:      {lengths.mean():.2f}")
    print(f"Median length:    {np.median(lengths):.2f}")
    print(f"Std deviation:    {lengths.std():.2f}")
    print(f"95th percentile:  {np.percentile(lengths, 95):.2f}")
    print(f"99th percentile:  {np.percentile(lengths, 99):.2f}")

    return lengths

# -------------------------------
# Run for all splits HF provides
# -------------------------------
# Iterating the mapping yields its split names; the returned arrays are discarded.
for split in dataset:
    get_length_stats(split)


{'id': '84291', 'img': 'img/84291.png', 'label': 1, 'text': "housing, free gas, free electricity, free healthcare and free education for my wives and children. to show graditude for your generosity, i'll groom your 12 year old daughters, blow up your trains, planes and buses and preach hate through a dawah stall in your local city and town centres with the purpose of turning your generous country into the same shithole i originally took refuge from, allahu akbar!"}

=== Split: train ===
Samples: 8500
Min length:       3
Max length:       88
Mean length:      16.57
Median length:    15.00
Std deviation:    8.86
95th percentile:  33.00
99th percentile:  46.00

=== Split: validation ===
Samples: 640
Min length:       3
Max length:       54
Mean length:      13.86
Median length:    12.00
Std deviation:    7.20
95th percentile:  28.00
99th percentile:  40.61

=== Split: test ===
Samples: 3000
Min length:       3
Max length:       74
Mean length:      13.67
Median length:    12.00
Std deviat