In [None]:
import os
import shutil
import random
import csv
from pathlib import Path

# -----------------------------
# CONFIGURATION
# -----------------------------
DATA_DIR = Path("guardian_dataset/Guardian_extended")
OUTPUT_DIR = Path("guardian_dataset/guardian_processed")
VAL_RATIO = 0.2
TEST_RATIO = 0.2

random.seed(42)

# -----------------------------
# COLLECT ALL ARTICLES
# -----------------------------
author_articles = {}
for topic in DATA_DIR.iterdir():
    if topic.is_dir():
        for author in topic.iterdir():
            if author.is_dir():
                for article in author.iterdir():
                    author_articles.setdefault(author.name, []).append((topic.name, article))

# print number of articles per author
print("Number of articles per author:")
for author, articles in author_articles.items():
    print(f"{author}: {len(articles)}")

# -----------------------------
# DETERMINE BALANCED SPLIT COUNTS
# -----------------------------
min_count = min(len(arts) for arts in author_articles.values())
val_count = max(1, int(min_count * VAL_RATIO))
test_count = max(1, int(min_count * TEST_RATIO))

print(f"Balancing validation set: {val_count} texts per author")
print(f"Balancing test set: {test_count} texts per author")

# -----------------------------
# CREATE ROOT FOLDERS
# -----------------------------
for split in ["train", "val", "test_with_author"]:
    for author in author_articles:
        (OUTPUT_DIR / split / author / "manual").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "test" / "manual").mkdir(parents=True, exist_ok=True)  # flat test

# -----------------------------
# PROCESS AUTHORS
# -----------------------------
val_labels = []
test_labels = []

partial_list_authors_20 = ["nickcohen","zoewilliams", "pollytoynbee","peterpreston", "royhattersley", "simonhoggart"] 

for author, articles in author_articles.items():
    articles = articles.copy()
    random.shuffle(articles)
    
    # select test and validation
    test_files = articles[:test_count]
    val_files = articles[test_count:test_count + val_count]
    train_files = articles[test_count + val_count:]
    
    # If author is in partial_list_authors_20 keep only 20% of texts
    if author in partial_list_authors_20:
        take_n = max(1, int(len(train_files) * 0.2))
        train_files = train_files[:take_n]

    

    # -----------------------------
    # TRAIN
    # -----------------------------
    for topic, file_path in train_files:
        out_dir = OUTPUT_DIR / "train" / author / "manual" / f"{topic}_{file_path.stem}"
        out_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy(file_path, out_dir / "test_text.txt")

    # -----------------------------
    # VALIDATION
    # -----------------------------
    for topic, file_path in val_files:
        out_dir = OUTPUT_DIR / "val" / author / "manual" / f"{topic}_{file_path.stem}"
        out_dir.mkdir(parents=True, exist_ok=True)
        dest_file = out_dir / "test_text.txt"
        shutil.copy(file_path, dest_file)
        val_labels.append({"filename": str(dest_file.relative_to(OUTPUT_DIR / "val")), "author": author})

    # -----------------------------
    # TEST (author-wise + flat)
    # -----------------------------
    for topic, file_path in test_files:
        # test_with_author
        out_dir = OUTPUT_DIR / "test_with_author" / author / "manual" / f"{topic}_{file_path.stem}"
        out_dir.mkdir(parents=True, exist_ok=True)
        dest_file = out_dir / "test_text.txt"
        shutil.copy(file_path, dest_file)
        test_labels.append({"filename": str(dest_file.relative_to(OUTPUT_DIR / "test_with_author")), "author": author})

        # test (flat)
        out_dir_flat = OUTPUT_DIR / "test" / "manual" / f"{author}_{topic}_{file_path.stem}"
        out_dir_flat.mkdir(parents=True, exist_ok=True)
        shutil.copy(file_path, out_dir_flat / "test_text.txt")

# -----------------------------
# WRITE LABELS.CSV
# -----------------------------
for split_name, labels in [("val", val_labels), ("test_with_author", test_labels)]:
    csv_path = OUTPUT_DIR / split_name / "labels.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["filename", "author"])
        writer.writeheader()
        writer.writerows(labels)

print("Balanced processing complete! Check folder:", OUTPUT_DIR)


Number of articles per author:
martinkettle: 35
simonhoggart: 40
catherinebennett: 40
hugoyoung: 37
royhattersley: 40
georgemonbiot: 40
willhutton: 40
maryriddell: 39
peterpreston: 39
pollytoynbee: 40
jonathanfreedland: 38
nickcohen: 40
zoewilliams: 40
Balancing validation set: 7 texts per author
Balancing test set: 7 texts per author
Balanced processing complete! Check folder: guardian dataset/guardian_processed


In [3]:
import os
import shutil
import random
import csv
from pathlib import Path

# -----------------------------
# CONFIGURATION
# -----------------------------
DATA_DIR = Path("guardian_dataset/Guardian_extended")

random.seed(42)

# -----------------------------
# COLLECT ALL ARTICLES
# -----------------------------
author_articles = {}
for topic in DATA_DIR.iterdir():
    if topic.is_dir():
        for author in topic.iterdir():
            if author.is_dir():
                for article in author.iterdir():
                    author_articles.setdefault(author.name, []).append((topic.name, article))

# print number of articles per author
print("Number of articles per author:")
for author, articles in author_articles.items():
    print(f"{author}: {len(articles)}")
    
# print the total number of articles
total_articles = sum(len(arts) for arts in author_articles.values())
print(f"Total number of articles: {total_articles}")
    
# Print the average number of articles per author and standard deviation
import numpy as np
article_counts = [len(arts) for arts in author_articles.values()]
print(f"Average number of articles per author: {np.mean(article_counts):.2f}")
print(f"Standard deviation of articles per author: {np.std(article_counts):.2f}")

# print the average number of articles per topic
topic_articles = {}
for topic in DATA_DIR.iterdir():
    if topic.is_dir():
        count = 0
        for author in topic.iterdir():
            if author.is_dir():
                count += len(list(author.iterdir()))
        topic_articles[topic.name] = count
print("Number of articles per topic:")
for topic in topic_articles:
    print(f"{topic}: {topic_articles[topic]}")
print(f"Average number of articles per topic: {np.mean(list(topic_articles.values())):.2f}")
print(f"Standard deviation of articles per topic: {np.std(list(topic_articles.values())):.2f}")

# print the average number of words per article
word_counts = []
for articles in author_articles.values():
    for topic, article in articles:
        with open(article, "r", encoding="utf-8") as f:
            text = f.read()
            word_counts.append(len(text.split()))
print(f"Average number of words per article: {np.mean(word_counts):.2f}")
print(f"Standard deviation of words per article: {np.std(word_counts):.2f}")

# print average word length
word_lengths = []


Number of articles per author:
martinkettle: 35
simonhoggart: 40
catherinebennett: 40
hugoyoung: 37
royhattersley: 40
georgemonbiot: 40
willhutton: 40
maryriddell: 39
peterpreston: 39
pollytoynbee: 40
jonathanfreedland: 38
nickcohen: 40
zoewilliams: 40
Total number of articles: 508
Average number of articles per author: 39.08
Standard deviation of articles per author: 1.49
Number of articles per topic:
World: 130
Society: 118
Politics: 130
UK: 130
Average number of articles per topic: 127.00
Standard deviation of articles per topic: 5.20
Average number of words per article: 1038.43
Standard deviation of words per article: 371.09


In [4]:
import os
import shutil
import random
import csv
from pathlib import Path

# -----------------------------
# CONFIGURATION
# -----------------------------
DATA_DIR = Path("guardian_dataset/Guardian_extended")

random.seed(42)

# -----------------------------
# COLLECT ALL ARTICLES
# -----------------------------
author_articles = {}
for topic in DATA_DIR.iterdir():
    if topic.is_dir():
        for author in topic.iterdir():
            if author.is_dir():
                for article in author.iterdir():
                    author_articles.setdefault(author.name, []).append((topic.name, article))

# print number of articles per author
print("Number of articles per author:")
for author, articles in author_articles.items():
    print(f"{author}: {len(articles)}")
    
    

Number of articles per author:
martinkettle: 35
simonhoggart: 40
catherinebennett: 40
hugoyoung: 37
royhattersley: 40
georgemonbiot: 40
willhutton: 40
maryriddell: 39
peterpreston: 39
pollytoynbee: 40
jonathanfreedland: 38
nickcohen: 40
zoewilliams: 40


In [6]:
import csv
import numpy as np

total_data = []

for author, articles in author_articles.items():
    print(f"Analyzing author: {author}")
    total_articles = len(articles)

    # store per-article stats first
    article_token_counts = []
    article_unique_counts = []
    article_type_token_ratios = []
    all_tokens = []
    all_word_lengths = []

    for topic, article in articles:
        with open(article, "r", encoding="utf-8") as f:
            tokens = f.read().split()
            article_token_counts.append(len(tokens))
            article_unique_counts.append(len(set(tokens)))
            all_tokens.extend(tokens)
            all_word_lengths.extend([len(t) for t in tokens])
            type_token_ratio = len(set(tokens)) / len(tokens) if len(tokens) > 0 else 0
            article_type_token_ratios.append(type_token_ratio)

    total_tokens = sum(article_token_counts)
    total_unique_tokens = len(set(all_tokens))
    

    # averages and std devs
    avg_tokens = np.mean(article_token_counts) if article_token_counts else 0
    std_tokens = np.std(article_token_counts) if article_token_counts else 0
    avg_unique_tokens = np.mean(article_unique_counts) if article_unique_counts else 0
    std_unique_tokens = np.std(article_unique_counts) if article_unique_counts else 0
    avg_type_token_ratio = np.mean(article_type_token_ratios) if article_type_token_ratios else 0
    std_type_token_ratio = np.std(article_type_token_ratios) if article_type_token_ratios else 0
    avg_word_length = np.mean(all_word_lengths) if all_word_lengths else 0
    std_word_length = np.std(all_word_lengths) if all_word_lengths else 0

    # print results
    print(f"  Total articles: {total_articles}")
    print(f"  Average tokens per article: {avg_tokens:.2f} (±{std_tokens:.2f})")
    print(f"  Average unique tokens per article: {avg_unique_tokens:.2f} (±{std_unique_tokens:.2f})")
    print(f"  Type-token ratio: {avg_type_token_ratio:.4f}")
    print(f"  Average word length: {avg_word_length:.2f} (±{std_word_length:.2f})")

    # save to list
    total_data.append({
        "author": author,
        "total_articles": total_articles,
        "avg_tokens": f"{avg_tokens:.2f}",
        "std_tokens": f"{std_tokens:.2f}",
        "avg_unique_tokens": f"{avg_unique_tokens:.2f}",
        "std_unique_tokens": f"{std_unique_tokens:.2f}",
        "avg_type_token_ratio": f"{avg_type_token_ratio:.2f}",
        "std_type_token_ratio": f"{std_type_token_ratio:.2f}",
        "avg_word_length": f"{avg_word_length:.2f}",
        "std_word_length": f"{std_word_length:.2f}"
    })

# order by author name
total_data.sort(key=lambda x: x["author"])

# save to CSV
with open("guardian_dataset/author_features.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=total_data[0].keys())
    writer.writeheader()
    writer.writerows(total_data)

print("✅ Author-level features saved to author_features.csv")

Analyzing author: martinkettle
  Total articles: 35
  Average tokens per article: 956.40 (±340.27)
  Average unique tokens per article: 482.83 (±154.46)
  Type-token ratio: 0.5184
  Average word length: 4.82 (±2.74)
Analyzing author: simonhoggart
  Total articles: 40
  Average tokens per article: 619.42 (±137.23)
  Average unique tokens per article: 380.50 (±65.81)
  Type-token ratio: 0.6191
  Average word length: 4.70 (±2.57)
Analyzing author: catherinebennett
  Total articles: 40
  Average tokens per article: 1109.30 (±352.78)
  Average unique tokens per article: 624.02 (±176.08)
  Type-token ratio: 0.5771
  Average word length: 5.02 (±2.86)
Analyzing author: hugoyoung
  Total articles: 37
  Average tokens per article: 1145.92 (±123.60)
  Average unique tokens per article: 606.30 (±63.84)
  Type-token ratio: 0.5303
  Average word length: 4.91 (±2.75)
Analyzing author: royhattersley
  Total articles: 40
  Average tokens per article: 898.35 (±255.92)
  Average unique tokens per article