# Project Gutenberg bookshelf metadata
- Download the RDF metadata bundle (~2 GB) from https://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2
- Extract it to a folder, e.g., `D:\Narrator\rdf-files`
- After extraction you should see `cache/epub/<id>/` folders containing `.rdf` files (e.g., `.../cache/epub/123/pg123.rdf`)
- Update `data_dir` in the next cell to point at that extracted `gutenberg_metadata` folder
- Run the cells below to list all bookshelf categories and their counts

In [1]:
import xml.etree.ElementTree as ET
from pathlib import Path
from collections import Counter, defaultdict
import re

In [2]:
# Path to your RDF folder
rdf_folder = Path(r"D:\Narrator\rdf-files")

# Recursively get all RDF files
rdf_files_paths = list(rdf_folder.rglob("*.rdf"))
print(f"Found {len(rdf_files_paths)} RDF files under {rdf_folder}")
if not rdf_files_paths:
    print(f"No RDF files found under {rdf_folder}. Update the path if needed.")
else:
    # RDF namespaces
    ns = {
        "pgterms": "http://www.gutenberg.org/2009/pgterms/",
        "dcterms": "http://purl.org/dc/terms/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    }

    def is_english(ebook):
        lang_el = ebook.find('.//dcterms:language/rdf:Description/rdf:value', namespaces=ns)
        return lang_el is not None and lang_el.text == 'en'

    def extract_shelves(ebook):
        shelves_local = []
        for bookshelf in ebook.findall('pgterms:bookshelf/rdf:Description', namespaces=ns):
            value = bookshelf.find('rdf:value', namespaces=ns)
            if value is not None and value.text:
                shelves_local.append(value.text)
        return shelves_local

    shelves = []
    for rdf_file_path in rdf_files_paths:
        try:
            root = ET.parse(rdf_file_path).getroot()
            for ebook in root.findall('pgterms:ebook', namespaces=ns):
                if is_english(ebook):
                    shelves.extend(extract_shelves(ebook))
        except ET.ParseError:
            print(f"Warning: Could not parse {rdf_file_path}, skipping.")
            continue

    if not shelves:
        print("No English bookshelf tags found.")
    else:
        # Count occurrences of each unique bookshelf
        shelf_counts = Counter(shelves)
        print(f"\nFound {len(shelf_counts)} unique English bookshelves across {len(rdf_files_paths)} RDF files:\n")
        for shelf, count in sorted(shelf_counts.items(), key=lambda x: x[0].lower()):
            print(f"{shelf}: {count}")
        print("\nFirst few shelves found:", sorted(shelf_counts.keys())[:10])


Found 77383 RDF files under D:\Narrator\rdf-files

Found 338 unique English bookshelves across 77383 RDF files:

6 Best Loved Spanish Literary Classics: 10
Adventure: 82
Africa: 60
African American Writers: 45
Ainslee's: 4
American Revolutionary War: 5
Anarchism: 13
Animal: 242
Animals-Domestic: 50
Animals-Wild: 55
Animals-Wild-Birds: 82
Animals-Wild-Insects: 26
Animals-Wild-Mammals: 111
Animals-Wild-Reptiles and Amphibians: 37
Animals-Wild-Trapping: 26
Anthropology: 36
Archaeology: 23
Architecture: 35
Argentina: 2
Armour's Monthly Cook Book: 1
Art: 71
Arthurian Legends: 20
Astounding Stories: 83
Astronomy: 36
Atheism: 8
Australia: 125
Bahá'í Faith: 35
Banned Books from Anne Haight's list: 130
Banned Books List from the American Library Association: 15
Best Books Ever Listings: 167
Bestsellers, American, 1895-1923: 426
Bibliomania: 38
Biographies: 55
Biology: 69
Bird-Lore: 1
Birds, Illustrated by Color Photography: 14
Blackwood's Edinburgh Magazine: 100
Boer War: 42
Botany: 23
British 

In [12]:
# manuallt curated set of children friendly shelves
children_friendly_shelves = {
    # Animals
    "Animal",
    "Animals-Domestic",
    "Animals-Wild",
    "Animals-Wild-Birds",
    "Animals-Wild-Insects",
    "Animals-Wild-Mammals",
    "Animals-Wild-Reptiles and Amphibians",

    # Children-specific
    "Category: Children & Young Adult Reading",
    "Category: Mythology, Legends & Folklore",
    "Category: Nature/Gardening/Animals",
    "Child's Own Book of Great Musicians",
    "Children's Anthologies",
    "Children's Biography",
    "Children's Book Series",
    "Children's Fiction",
    "Children's History",
    "Children's Instructional Books",
    "Children's Literature",
    "Children's Myths, Fairy Tales, etc.",
    "Children's Picture Books",
    "Children's Verse",

    # Story/Fantasy
    "Arthurian Legends",
    "Fantasy",
    "Folklore",
    "Mythology",

    # Nature/Wholesome
    "Christmas",
    "School Stories",
    "Scouts",

    # Children's Magazines
    "Dew Drops",
    "Golden Days for Boys and Girls",
    "Harper's Young People",
    "Little Folks",
    "Our Young Folks",
    "St. Nicholas Magazine for Boys and Girls",
    "The Nursery",
}

In [4]:
# English book counts per curated shelf
if 'rdf_files_paths' not in globals() or not rdf_files_paths:
    print("rdf_files_paths is empty. Run the earlier cells first.")
elif 'ns' not in globals() or 'pgterms' not in ns or 'dcterms' not in ns or 'rdf' not in ns:
    print("Namespaces missing. Re-run the cell that defines ns.")
elif 'children_friendly_shelves' not in globals():
    print("children_friendly_shelves not found. Re-run its cell.")
elif 'is_english' not in globals() or 'extract_shelves' not in globals():
    print("Helper functions missing. Re-run the parsing cell.")
else:
    shelf_to_ids = defaultdict(set)
    total_english = 0
    def book_id_from_path(path: Path) -> str:
        m = re.search(r"pg(\d+)\.rdf$", path.name)
        return m.group(1) if m else path.stem
    for rdf_file_path in rdf_files_paths:
        try:
            root = ET.parse(rdf_file_path).getroot()
            for ebook in root.findall('pgterms:ebook', namespaces=ns):
                if not is_english(ebook):
                    continue
                total_english += 1
                shelves_local = extract_shelves(ebook)
                if not shelves_local:
                    continue
                bid = book_id_from_path(rdf_file_path)
                for s in shelves_local:
                    if s in children_friendly_shelves:
                        shelf_to_ids[s].add(bid)
        except ET.ParseError:
            continue
    print(f"Total English ebooks processed: {total_english}")
    print("Books per curated shelf (counts are per-book, not per-occurrence):")
    for shelf in sorted(children_friendly_shelves, key=lambda x: x.lower()):
        print(f"  {shelf}: {len(shelf_to_ids[shelf])}")

Total English ebooks processed: 61557
Books per curated shelf (counts are per-book, not per-occurrence):
  Adventure: 82
  Animal: 242
  Animals-Domestic: 50
  Animals-Wild: 55
  Animals-Wild-Birds: 82
  Animals-Wild-Insects: 26
  Animals-Wild-Mammals: 111
  Animals-Wild-Reptiles and Amphibians: 37
  Arthurian Legends: 20
  Camping: 6
  Category: Children & Young Adult Reading: 6323
  Category: Humour: 4022
  Category: Mythology, Legends & Folklore: 2463
  Category: Nature/Gardening/Animals: 1814
  Child's Own Book of Great Musicians: 12
  Children's Anthologies: 50
  Children's Biography: 8
  Children's Book Series: 509
  Children's Fiction: 336
  Children's History: 132
  Children's Instructional Books: 97
  Children's Literature: 301
  Children's Myths, Fairy Tales, etc.: 51
  Children's Picture Books: 177
  Children's Verse: 20
  Christmas: 142
  Dew Drops: 9
  Fantasy: 97
  Folklore: 59
  Golden Days for Boys and Girls: 4
  Harper's Young People: 83
  Humor: 165
  Little Folks: 6


In [5]:
# FRE buckets for English ebooks that have at least one curated shelf
if 'rdf_files_paths' not in globals() or not rdf_files_paths:
    print("rdf_files_paths is empty. Run the earlier cells first.")
elif 'ns' not in globals() or 'pgterms' not in ns or 'dcterms' not in ns or 'rdf' not in ns:
    print("Namespaces missing. Re-run the cell that defines ns.")
elif 'children_friendly_shelves' not in globals():
    print("children_friendly_shelves not found. Re-run its cell.")
elif 'is_english' not in globals() or 'extract_shelves' not in globals():
    print("Helper functions missing. Re-run the parsing cell.")
else:
    fre_bins = {
        "0-30": 0,
        "31-50": 0,
        "51-60": 0,
        "61-70": 0,
        "71-80": 0,
        "81-90": 0,
        "91-100": 0,
        "unknown": 0,
    }
    total_selected = 0
    for rdf_file_path in rdf_files_paths:
        try:
            root = ET.parse(rdf_file_path).getroot()
            for ebook in root.findall('pgterms:ebook', namespaces=ns):
                if not is_english(ebook):
                    continue
                shelves_local = extract_shelves(ebook)
                if not shelves_local or not any(s in children_friendly_shelves for s in shelves_local):
                    continue
                total_selected += 1
                fre_el = ebook.find('pgterms:marc908', namespaces=ns)
                if fre_el is not None and fre_el.text:
                    m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", fre_el.text)
                    if m:
                        score = float(m.group(1))
                        # bucket by score
                        if score <= 30:
                            fre_bins['0-30'] += 1
                        elif score <= 50:
                            fre_bins['31-50'] += 1
                        elif score <= 60:
                            fre_bins['51-60'] += 1
                        elif score <= 70:
                            fre_bins['61-70'] += 1
                        elif score <= 80:
                            fre_bins['71-80'] += 1
                        elif score <= 90:
                            fre_bins['81-90'] += 1
                        else:
                            fre_bins['91-100'] += 1
                    else:
                        fre_bins['unknown'] += 1
                else:
                    fre_bins['unknown'] += 1
        except ET.ParseError:
            continue
    print("Flesch Reading Ease (FRE) buckets: for the curated shelves (children)")
    for bucket in ["0-30","31-50","51-60","61-70","71-80","81-90","91-100","unknown"]:
        print(f"  {bucket}: {fre_bins[bucket]}")

Flesch Reading Ease (FRE) buckets: for the curated shelves (children)
  0-30: 8
  31-50: 173
  51-60: 930
  61-70: 2712
  71-80: 5347
  81-90: 4403
  91-100: 701
  unknown: 104


In [13]:
# Book IDs for English ebooks with curated shelves and FRE in 81-100 (per bucket)
if 'rdf_files_paths' not in globals() or not rdf_files_paths:
    print("rdf_files_paths is empty. Run the earlier cells first.")
elif 'ns' not in globals() or 'pgterms' not in ns or 'dcterms' not in ns or 'rdf' not in ns:
    print("Namespaces missing. Re-run the cell that defines ns.")
elif 'children_friendly_shelves' not in globals():
    print("children_friendly_shelves not found. Re-run its cell.")
elif 'is_english' not in globals() or 'extract_shelves' not in globals():
    print("Helper functions missing. Re-run the parsing cell.")
else:
    ids_by_bucket = {"81-90": [], "91-100": []}
    shelf_to_ids = defaultdict(set)

    def book_id_from_path(path: Path) -> str:
        m = re.search(r"pg(\d+)\.rdf$", path.name)
        return m.group(1) if m else path.stem

    for rdf_file_path in rdf_files_paths:
        try:
            root = ET.parse(rdf_file_path).getroot()
            for ebook in root.findall('pgterms:ebook', namespaces=ns):
                if not is_english(ebook):
                    continue
                shelves_local = extract_shelves(ebook)
                qualifying_shelves = {s for s in shelves_local if s in children_friendly_shelves}
                if not qualifying_shelves:
                    continue
                fre_el = ebook.find('pgterms:marc908', namespaces=ns)
                if fre_el is None or not fre_el.text:
                    continue
                m = re.search(r"([0-9]+(?:\\.[0-9]+)?)", fre_el.text)
                if not m:
                    continue
                score = float(m.group(1))
                if score <= 80:
                    continue
                bid = book_id_from_path(rdf_file_path)
                if score <= 90:
                    ids_by_bucket["81-90"].append(bid)
                else:
                    ids_by_bucket["91-100"].append(bid)
                for shelf in qualifying_shelves:
                    shelf_to_ids[shelf].add(bid)
        except ET.ParseError:
            continue

    total_qualified = sum(len(v) for v in ids_by_bucket.values())
    print("Qualified book IDs (English, curated shelves, FRE 81-100):")
    print(f"Total: {total_qualified}")
    for bucket in ["81-90", "91-100"]:
        print(f"  {bucket}: {len(ids_by_bucket[bucket])}")

    print("\nShelf counts for qualifying books (per curated category):")
    for shelf in sorted(children_friendly_shelves, key=lambda x: x.lower()):
        print(f"  {shelf}: {len(shelf_to_ids[shelf])}")

    print("\nFirst 20 IDs per bucket:")
    for bucket in ["81-90", "91-100"]:
        print(f"  {bucket}: {ids_by_bucket[bucket][:20]}")

    # Save IDs to a newline-delimited file for the standalone downloader script
    unique_ids = sorted({bid for bucket_ids in ids_by_bucket.values() for bid in bucket_ids})
    ids_file = Path(r"D:\Narrator\curated_ids.txt")
    if unique_ids:
        ids_file.write_text("\n".join(unique_ids) + "\n", encoding="utf-8")
        print(f"\nSaved {len(unique_ids)} IDs (one per line) to {ids_file}.")
    else:
        print("\nNo qualifying IDs to save.")

Qualified book IDs (English, curated shelves, FRE 81-100):
Total: 4036
  81-90: 3450
  91-100: 586

Shelf counts for qualifying books (per curated category):
  Animal: 6
  Animals-Domestic: 12
  Animals-Wild: 3
  Animals-Wild-Birds: 6
  Animals-Wild-Insects: 3
  Animals-Wild-Mammals: 0
  Animals-Wild-Reptiles and Amphibians: 0
  Arthurian Legends: 5
  Category: Children & Young Adult Reading: 3386
  Category: Mythology, Legends & Folklore: 725
  Category: Nature/Gardening/Animals: 83
  Child's Own Book of Great Musicians: 11
  Children's Anthologies: 20
  Children's Biography: 3
  Children's Book Series: 377
  Children's Fiction: 99
  Children's History: 9
  Children's Instructional Books: 31
  Children's Literature: 124
  Children's Myths, Fairy Tales, etc.: 30
  Children's Picture Books: 76
  Children's Verse: 7
  Christmas: 55
  Dew Drops: 9
  Fantasy: 23
  Folklore: 27
  Golden Days for Boys and Girls: 1
  Harper's Young People: 10
  Little Folks: 0
  Mythology: 1
  Our Young Folks