In [1]:
import polars as pl
from huggingface_hub import snapshot_download
from datasets import load_dataset
import os
import sys
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
sys.path.append("..")
%load_ext autoreload
%autoreload 2


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os


# Load the Project Gutenberg catalog CSV directly from its feed URL.
catalog_df = pl.read_csv(source="https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv")

In [None]:
def parse_bookshelves(bookshelves_str: str | None) -> dict[str, list[str]]:
    """Parse a bookshelves string into a dict of category -> list of values."""
    bookshelves_dict: dict[str, list[str]] = {}
    if not bookshelves_str:
        return bookshelves_dict
    
    # Split by semicolon to get individual entries
    entries = bookshelves_str.split(';')
    for entry in entries:
        entry = entry.strip()
        if ':' in entry:
            # Split by colon to separate category from value
            category, value = entry.split(':', 1)
            category = category.strip()
            value = value.strip()
            
            # Add to dict, creating list if category doesn't exist
            if category not in bookshelves_dict:
                bookshelves_dict[category] = []
            bookshelves_dict[category].append(value)
        else:
            # Handle entries without a category (like "Nobel Prizes in Literature")
            if entry:  # Skip empty entries
                if "Other" not in bookshelves_dict:
                    bookshelves_dict["Other"] = []
                bookshelves_dict["Other"].append(entry)
    
    return bookshelves_dict


# Run the parser over every "Bookshelves" value and store the resulting dicts
# in a new "bookshelves_parsed" column (Object dtype, since the values are Python dicts).
catalog_df = catalog_df.with_columns(
    bookshelves_parsed=pl.col("Bookshelves").map_elements(
        parse_bookshelves,
        return_dtype=pl.Object,
    )
)

Text#,Title,Bookshelves,bookshelves_parsed
i64,str,str,object
1,"""The Declaration of Independenc…","""Politics; American Revolutiona…","{'Other': ['Politics', 'American Revolutionary War', 'United States Law'], 'Category': ['Essays, Letters & Speeches', 'History - American', 'History - Modern (1750+)', 'Philosophy & Ethics', 'Politics']}"
2,"""The United States Bill of Righ…","""Politics; American Revolutiona…","{'Other': ['Politics', 'American Revolutionary War', 'United States Law'], 'Category': ['History - American', 'Law & Criminology']}"
3,"""John F. Kennedy's Inaugural Ad…","""Category: Essays, Letters & Sp…","{'Category': ['Essays, Letters & Speeches', 'History - American', 'Politics']}"
4,"""Lincoln's Gettysburg Address …","""US Civil War; Category: Essays…","{'Other': ['US Civil War'], 'Category': ['Essays, Letters & Speeches', 'History - American', 'History - Modern (1750+)']}"
5,"""The United States Constitution""","""United States; Politics; Ameri…","{'Other': ['United States', 'Politics', 'American Revolutionary War', 'United States Law'], 'Category': ['History - American', 'Law & Criminology', 'Politics']}"
6,"""Give Me Liberty or Give Me Dea…","""American Revolutionary War; Ca…","{'Other': ['American Revolutionary War'], 'Category': ['Essays, Letters & Speeches', 'History - American']}"
7,"""The Mayflower Compact""","""Category: History - American; …","{'Category': ['History - American', 'History - Early Modern (c. 1450-1750)', 'History - Religious']}"
8,"""Abraham Lincoln's Second Inaug…","""US Civil War; Category: Essays…","{'Other': ['US Civil War'], 'Category': ['Essays, Letters & Speeches', 'History - American', 'History - Modern (1750+)', 'Politics']}"
9,"""Abraham Lincoln's First Inaugu…","""US Civil War; Category: Essays…","{'Other': ['US Civil War'], 'Category': ['Essays, Letters & Speeches', 'History - American', 'Politics']}"
10,"""The King James Version of the …","""Banned Books List from the Ame…","{'Other': ['Banned Books List from the American Library Association'], 'Category': ['Classics of Literature', 'Religion/Spirituality']}"


In [15]:
# Discard catalog rows that carry no bookshelf information.
catalog_df = catalog_df.drop_nulls(subset=["Bookshelves"])

In [28]:
# Keep books whose "Category" shelf mentions Science-Fiction, Fantasy or
# American Literature, unless they also sit on any of the shelves listed here.
excluded_categories = {
    'Romance',
    'Crime, Thrillers and Mystery',
    'Essays, Letters & Speeches',
    'Poetry',
    'British Literature',
    'Biographies',
    'Mythology, Legends & Folklore',
    'Travel Writing',
    'Plays/Films/Dramas',
    'Classics of Literature',
    'History - American',
    'Journals',
    'Sports/Hobbies',
    'Philosophy & Ethics',
    'Religion/Spirituality',
    'History - Warfare',
    'Politics',
    'History - Modern (1750+)',
    'Journalism/Media/Writing',
    'Gender & Sexuality Studies',
}

catalog_df = catalog_df.filter(
    pl.col("bookshelves_parsed").map_elements(
        lambda shelves: (
            "Category" in shelves
            # At least one category name matches a wanted genre (substring match)...
            and any(
                "Science-Fiction" in name
                or "Fantasy" in name
                or "American Literature" in name
                for name in shelves["Category"]
            )
            # ...and none of the categories is on the exclusion list (exact match).
            and excluded_categories.isdisjoint(shelves["Category"])
        ),
        return_dtype=pl.Boolean,
    )
)

# English-language titles only.
catalog_df = catalog_df.filter(pl.col("Language").eq("en"))
print(f"Filtered to {len(catalog_df)} books")
# Tally how often each "Category" value occurs among the remaining books
from collections import Counter

# Flatten every book's "Category" list into one long list, skipping rows whose
# parsed dict is empty/missing or has no "Category" key.
all_categories = [
    category
    for row in catalog_df.iter_rows(named=True)
    if row["bookshelves_parsed"] and "Category" in row["bookshelves_parsed"]
    for category in row["bookshelves_parsed"]["Category"]
]

category_counts = Counter(all_categories)

# Bare last expression so the notebook renders the counts.
category_counts

Filtered to 7607 books


Counter({'Novels': 5595,
         'American Literature': 4638,
         'Science-Fiction & Fantasy': 3266,
         'Adventure': 1746,
         'Children & Young Adult Reading': 1131,
         'Short Stories': 1032,
         'Historical Novels': 733,
         'Humour': 727,
         'Parenthood & Family Relations': 27,
         'French Literature': 22,
         'Encyclopedias/Dictionaries/Reference': 18,
         'Nature/Gardening/Animals': 14,
         'German Literature': 13,
         'Art': 12,
         'Music': 11,
         'History - Schools & Universities': 11,
         'Psychiatry/Psychology': 11,
         'Old Age & the Elderly': 10,
         'How To ...': 10,
         'Cooking & Drinking': 7,
         'Teaching & Education': 6,
         'Drugs/Alcohol/Pharmacology': 5,
         'Architecture': 5,
         'Health & Medicine': 5,
         'Law & Criminology': 3,
         'Language & Communication': 3,
         'Environmental Issues': 2,
         'Sociology': 2,
         'Engine

In [None]:
import tiktoken
from IPython.display import Markdown
from synthetic_data.tasks.writing import GutenbergBacktranslationFromTxt

tiktoken_encoder = tiktoken.get_encoding("o200k_base")

first_sample = gutenberg_pq.sample(n=1)
task = GutenbergBacktranslationFromTxt()
total = 0

for row in gutenberg_pq.sample(10).iter_rows(named=True):
    print(row.keys())
    display(Markdown(f"**{row['title']}**"))
    row_out = await task.preprocess_row(row)
    for row in row_out:
        n_tokens = sum([len(tiktoken_encoder.encode(par)) for par in row['text']])
        # print("-"*100)
        # display(Markdown(row['text']))
        # print(f"n_tokens: {n_tokens}")
        total += 1
    print(total)

print(total)


In [None]:
import re
from synthetic_data.utils import ldictl

# Named `sample_rows` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
sample_rows = first_sample.to_dicts()
# NOTE(review): `ldictl` presumably turns the list of row dicts into a dict of
# lists before formatting — confirm against synthetic_data.utils.
formatted_out = task.format_input_conversation(ldictl(sample_rows))

In [None]:
# Display the result of task.format_input_conversation(...) for the sampled book.
formatted_out