# Dataset Pipeline: Test/Train Split & HuggingFace Upload

Run this notebook after each annotation session to:
1. Load and clean labeled data
2. Create a fixed test split (once, never modified)
3. Build the train split from remaining annotations
4. Push both to HuggingFace

In [1]:
import os
import sys
from datetime import datetime

import pandas as pd
from datasets import Dataset, DatasetDict

# === CONFIG ===
HF_REPO = "Zorryy/news_articles_2025_elections_germany"
TEST_SAMPLES_PER_CLASS = 50
RANDOM_SEED = 42

# === PATHS ===
DATA_DIR = os.path.join("..", "..", "data", "articles")
RAW_CSV = os.path.join(DATA_DIR, "cleaned_articles.csv")
LABELED_CSV = os.path.join(DATA_DIR, "cleaned_articles_labeled.csv")
TEST_SPLIT_CSV = os.path.join(DATA_DIR, "test_split.csv")

# Verify files exist
assert os.path.exists(RAW_CSV), f"Raw CSV not found: {RAW_CSV}"
assert os.path.exists(LABELED_CSV), f"Labeled CSV not found: {LABELED_CSV}"
print(f"Raw articles: {RAW_CSV}")
print(f"Labeled articles: {LABELED_CSV}")
print(f"Test split: {TEST_SPLIT_CSV}")

  from .autonotebook import tqdm as notebook_tqdm


Raw articles: ../../data/articles/cleaned_articles.csv
Labeled articles: ../../data/articles/cleaned_articles_labeled.csv
Test split: ../../data/articles/test_split.csv


## Step 1: Load labeled data & remove non-labels

In [2]:
# Increase CSV field size limit for large text fields
import csv
csv.field_size_limit(sys.maxsize)

labeled_df = pd.read_csv(LABELED_CSV, encoding="utf-8")
print(f"Total rows in labeled CSV: {len(labeled_df)}")

# Remove non-labels (skipped, not_clean)
non_labels = ["skipped", "not_clean"]
labeled_df = labeled_df[~labeled_df["label"].isin(non_labels)].copy()
print(f"After removing skipped/not_clean: {len(labeled_df)}")

# Deduplicate: keep last label per article ID
labeled_df = labeled_df.drop_duplicates(subset="id", keep="last")
print(f"After deduplication: {len(labeled_df)}")

# Show distribution
print("\nLabel distribution:")
print(labeled_df["label"].value_counts().to_string())

Total rows in labeled CSV: 1386
After removing skipped/not_clean: 782
After deduplication: 782

Label distribution:
label
Andere                      193
Gesundheitswesen, Pflege     62
Klima / Energie              60
Wirtschaftslage              50
Ukraine/Krieg/Russland       50
Soziales Gefälle             50
Kosten/Löhne/Preise          50
Zuwanderung                  50
Renten                       50
Arbeitslosigkeit             50
Bundeswehr/Verteidigung      50
AfD/Rechte                   50
Politikverdruss              17


## Step 2: Create or load test split

In [3]:
if os.path.exists(TEST_SPLIT_CSV):
    # Load existing test split - NEVER modify it
    test_ids_df = pd.read_csv(TEST_SPLIT_CSV, encoding="utf-8")
    print(f"Test split already exists. Loaded {len(test_ids_df)} entries.")
    print("(Test split is frozen and will not be modified.)")
else:
    # Create new test split: up to TEST_SAMPLES_PER_CLASS per class
    print(f"Creating new test split with up to {TEST_SAMPLES_PER_CLASS} samples per class...")
    test_samples = []
    for label in sorted(labeled_df["label"].unique()):
        class_df = labeled_df[labeled_df["label"] == label]
        n = min(len(class_df), TEST_SAMPLES_PER_CLASS)
        sampled = class_df.sample(n=n, random_state=RANDOM_SEED)
        test_samples.append(sampled)
        print(f"  {label}: {n} samples (available: {len(class_df)})")

    test_ids_df = pd.concat(test_samples)[["id", "label"]]
    test_ids_df.to_csv(TEST_SPLIT_CSV, index=False, encoding="utf-8")
    print(f"\nTest split saved: {len(test_ids_df)} total entries")

print("\nTest split distribution:")
print(test_ids_df["label"].value_counts().to_string())

Test split already exists. Loaded 617 entries.
(Test split is frozen and will not be modified.)

Test split distribution:
label
AfD/Rechte                  50
Andere                      50
Arbeitslosigkeit            50
Bundeswehr/Verteidigung     50
Gesundheitswesen, Pflege    50
Klima / Energie             50
Kosten/Löhne/Preise         50
Renten                      50
Soziales Gefälle            50
Ukraine/Krieg/Russland      50
Wirtschaftslage             50
Zuwanderung                 50
Politikverdruss             17


## Step 3: Build train split (all labeled data NOT in test)

In [4]:
test_ids = set(test_ids_df["id"].tolist())
train_labels_df = labeled_df[~labeled_df["id"].isin(test_ids)][["id", "label"]].copy()

print(f"Train split: {len(train_labels_df)} labeled articles")
print(f"Test split:  {len(test_ids_df)} labeled articles")
print(f"Total:       {len(train_labels_df) + len(test_ids_df)}")

print("\nTrain split distribution:")
print(train_labels_df["label"].value_counts().to_string())

Train split: 165 labeled articles
Test split:  617 labeled articles
Total:       782

Train split distribution:
label
Andere                      143
Gesundheitswesen, Pflege     12
Klima / Energie              10


## Step 4: Join with raw article data

In [None]:
raw_df = pd.read_csv(RAW_CSV, encoding="utf-8")
print(f"Raw articles loaded: {len(raw_df)}")

# Join test split with raw data (keep all columns)
test_df = test_ids_df.merge(raw_df, on="id", how="left")
missing_test = test_df["text"].isna().sum()
if missing_test > 0:
    print(f"WARNING: {missing_test} test articles not found in raw data!")
    test_df = test_df.dropna(subset=["text"])

# Join train split with raw data (keep all columns)
train_df = train_labels_df.merge(raw_df, on="id", how="left")
missing_train = train_df["text"].isna().sum()
if missing_train > 0:
    print(f"WARNING: {missing_train} train articles not found in raw data!")
    train_df = train_df.dropna(subset=["text"])

# Build raw split: all articles that have NO label yet
all_labeled_ids = set(labeled_df["id"].tolist())
unlabeled_df = raw_df[~raw_df["id"].isin(all_labeled_ids)].copy()
unlabeled_df["label"] = ""

# Ensure consistent column order across all splits
columns = ["id", "domain", "url", "date_time", "headline", "author", "text", "text_length", "label"]
train_df = train_df[columns]
test_df = test_df[columns]
unlabeled_df = unlabeled_df[columns]

print(f"\nFinal train:     {len(train_df):>7} articles")
print(f"Final test:      {len(test_df):>7} articles")
print(f"Final raw:       {len(unlabeled_df):>7} articles (unlabeled)")
print(f"Total in dataset:{len(train_df) + len(test_df) + len(unlabeled_df):>7}")

## Step 5: Create HuggingFace Dataset & push

In [6]:
ds = DatasetDict({
    "raw": Dataset.from_pandas(unlabeled_df, preserve_index=False),
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

print(ds)
for split in ds:
    print(f"\n{split} columns: {ds[split].column_names}")

DatasetDict({
    raw: Dataset({
        features: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label'],
        num_rows: 260015
    })
    train: Dataset({
        features: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label'],
        num_rows: 165
    })
    test: Dataset({
        features: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label'],
        num_rows: 617
    })
})

raw columns: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label']

train columns: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label']

test columns: ['id', 'domain', 'url', 'date_time', 'headline', 'author', 'text', 'text_length', 'label']


In [8]:
from huggingface_hub import HfApi

today = datetime.now().strftime("%Y-%m-%d")
commit_msg = (
    f"Update {today}: "
    f"{len(train_df)} train / {len(test_df)} test / {len(unlabeled_df)} raw"
)

# Delete old data files to force clean upload
api = HfApi()
existing_files = api.list_repo_files(HF_REPO, repo_type="dataset")
data_files = [f for f in existing_files if f.startswith("data/")]
if data_files:
    from huggingface_hub import CommitOperationDelete
    operations = [CommitOperationDelete(path_in_repo=f) for f in data_files]
    api.create_commit(HF_REPO, operations=operations, repo_type="dataset",
                      commit_message="Clear old data before re-upload")
    print(f"Deleted {len(data_files)} old data files: {data_files}")

print(f"Pushing to: https://huggingface.co/datasets/{HF_REPO}")
print(f"Commit: {commit_msg}")

ds.push_to_hub(HF_REPO, commit_message=commit_msg)

print(f"\nDone! Dataset available at:")
print(f"https://huggingface.co/datasets/{HF_REPO}")

Pushing to: https://huggingface.co/datasets/Zorryy/news_articles_2025_elections_germany
Commit: Update 2026-02-14: 165 train / 617 test / 260015 raw


ValueError: All datasets in `DatasetDict` should have the same features but features for 'raw' and 'train' don't match: {'id': Value('int64'), 'domain': Value('string'), 'url': Value('string'), 'date_time': Value('string'), 'headline': Value('string'), 'author': Value('string'), 'text': Value('string'), 'text_length': Value('int64'), 'label': Value('null')} != {'id': Value('int64'), 'domain': Value('string'), 'url': Value('string'), 'date_time': Value('string'), 'headline': Value('string'), 'author': Value('string'), 'text': Value('string'), 'text_length': Value('int64'), 'label': Value('string')}

## Summary

In [None]:
# Final summary table
all_labels = sorted(set(train_df["label"].unique()) | set(test_df["label"].unique()))
summary = pd.DataFrame({
    "Label": all_labels,
    "Train": [len(train_df[train_df["label"] == l]) for l in all_labels],
    "Test": [len(test_df[test_df["label"] == l]) for l in all_labels],
})
summary["Total"] = summary["Train"] + summary["Test"]
summary.loc[len(summary)] = ["TOTAL", summary["Train"].sum(), summary["Test"].sum(), summary["Total"].sum()]
print(summary.to_string(index=False))
print(f"\nRaw (unlabeled): {len(unlabeled_df)} articles")