# Phase 0: CSS Class Inventory for Tharpa Publications EPUBs

**PURPOSE**: Systematically document ALL CSS classes used in the EPUB to answer:
1. What classes are used for chapter titles?
2. What classes are used for section headings?
3. What classes are used for body paragraphs?
4. Are there multiple heading levels?
5. What other structural classes exist?

**Run this BEFORE any re-extraction work.**

Output: `00_css_class_inventory.json` (machine-readable inventory)

In [1]:
# ============================================================
# CONFIGURATION - Update this path for your system
# ============================================================
import os

# Input EPUB location and the name of the inventory file this notebook writes.
EPUB_FILE = "Clear_Light_of_Bliss.epub"
EPUB_DIR = os.path.expanduser("~/Documents/gesha_la_rag/epub_directory/")
EPUB_PATH = os.path.join(EPUB_DIR, EPUB_FILE)
OUTPUT_FILE = "00_css_class_inventory.json"

# Echo the resolved path and whether it exists, so a wrong path is caught
# before the EPUB is opened.
for label, value in (("Target", EPUB_PATH), ("Exists", os.path.exists(EPUB_PATH))):
    print(f"{label}: {value}")

Target: /home/matt/Documents/gesha_la_rag/epub_directory/Clear_Light_of_Bliss.epub
Exists: True


In [2]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
import json
import re

# Load the EPUB and materialize its document items into a list, because the
# later cells iterate over `items` and also take its length.
book = epub.read_epub(EPUB_PATH)
items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))

print(f"‚úì EPUB loaded successfully")
# The title lookup is best-effort: a missing or oddly shaped metadata entry
# must not stop the inventory. Catch Exception instead of using a bare
# `except:` so KeyboardInterrupt and SystemExit still propagate.
try:
    title = book.get_metadata('DC', 'title')
    print(f"Title: {title[0][0] if title else 'Unknown'}")
except Exception:
    print("Title: Unable to extract")
print(f"‚úì Found {len(items)} document sections")

‚úì EPUB loaded successfully
Title: Clear Light of Bliss
‚úì Found 89 document sections


## Helper Functions

`classify_css_role()` uses heuristic matching on class name patterns. If Tharpa uses unexpected naming, unmatched classes will appear as UNKNOWN — that's by design so we can review them.

In [3]:
def classify_css_role(class_name):
    """
    Guess the structural role of a CSS class from its name.

    The name is lower-cased and tested against an ordered list of rules.
    The first rule that matches decides the role, so earlier rules take
    precedence over later ones (for example, a name that starts with
    ``text`` is reported as body text even if it also contains ``list``).

    Args:
        class_name: CSS class name as it appears in the document.

    Returns:
        A role label string such as 'CHAPTER_TITLE' or 'BODY_TEXT', or
        'UNKNOWN' when no rule matches.
    """
    lowered = class_name.lower()

    def has_any(*needles):
        # True when at least one needle occurs anywhere in the lowered name.
        return any(needle in lowered for needle in needles)

    # (role, matched) pairs in precedence order; do not reorder.
    rules = (
        # Chapter-level headings
        ('CHAPTER_TITLE', has_any('chapter-title', 'chapter_title', 'chaptertitle')),
        ('PART_TITLE', has_any('part-title', 'part_title', 'parttitle')),
        # Section headings
        ('SECTION_HEADING', has_any('section-head', 'section_head', 'sectionhead',
                                    'sub-head', 'sub_head', 'subhead')),
        ('HEADING_OTHER', 'head' in lowered and 'chapter' not in lowered),
        # Body text
        ('BODY_FIRST_PARA', has_any('text-1st', 'text_1st', 'first-para', 'first_para')),
        ('BODY_TEXT', lowered.startswith('text') or has_any('body', 'para')),
        # Special content
        ('VERSE_POETRY', has_any('verse', 'poem', 'stanza', 'dedication')),
        ('BLOCK_QUOTE', has_any('quote', 'block', 'extract', 'indent')),
        ('FOOTNOTE', has_any('footnote', 'endnote', 'note')),
        ('TABLE_OF_CONTENTS', has_any('toc', 'contents')),
        ('FRONT_MATTER', has_any('copyright', 'colophon', 'imprint')),
        ('BACK_MATTER', has_any('glossary', 'index', 'appendix', 'bibliography')),
        ('IMAGE_CAPTION', has_any('image', 'figure', 'caption', 'illustration')),
        ('LIST', has_any('list', 'bullet', 'enum')),
    )

    return next((role for role, matched in rules if matched), 'UNKNOWN')


def extract_heading_level(class_name):
    """
    Pull a numeric heading level out of a CSS class name, if one is present.

    Three patterns are tried in order against the lower-cased name, and the
    first one that matches wins: digits after ``level``, digits after
    ``head`` (each optionally separated by ``_`` or ``-``), and finally a
    trailing ``-<digits>`` suffix.

    e.g., 'TOC-Level-1' -> 1, 'Section-Head-2' -> 2

    Returns:
        The level as an int, or None when no pattern matches.
    """
    lowered = class_name.lower()
    patterns = (r'level[_-]?(\d+)', r'head[_-]?(\d+)', r'-(\d+)$')
    attempts = (re.search(pattern, lowered) for pattern in patterns)
    first_hit = next((found for found in attempts if found), None)
    return int(first_hit.group(1)) if first_hit else None

print("‚úì Helper functions defined")

‚úì Helper functions defined


## Pass 1: Collect ALL CSS Classes

Scan every element in every section of the EPUB. Record class names, which HTML tags use them, how often they appear, and sample text content.

In [4]:
# Limits on the sample text kept for each class
MAX_SAMPLES = 3
SAMPLE_MAX_LEN = 120

# Accumulators filled by the scan below
class_counter = Counter()            # class_name -> total occurrences
class_by_tag = defaultdict(Counter)  # tag -> {class_name -> count}
class_samples = defaultdict(list)    # class_name -> [sample dicts]
class_sections = defaultdict(set)    # class_name -> {section_names}
all_tags_used = Counter()            # tag -> count
classes_by_section = {}              # section_name -> sorted classes found

for idx, item in enumerate(items):
    section_name = item.get_name()
    soup = BeautifulSoup(item.get_content(), 'html.parser')
    seen_here = set()

    # find_all(True) yields every tag in the document
    for element in soup.find_all(True):
        all_tags_used[element.name] += 1

        for cls in element.get('class', []):
            class_counter[cls] += 1
            class_by_tag[element.name][cls] += 1
            class_sections[cls].add(section_name)
            seen_here.add(cls)

            # Keep only the first few non-empty text samples for each class
            samples = class_samples[cls]
            if len(samples) >= MAX_SAMPLES:
                continue
            text = element.get_text(strip=True)
            if not text:
                continue
            if len(text) > SAMPLE_MAX_LEN:
                text = text[:SAMPLE_MAX_LEN] + "..."
            samples.append({
                'text': text,
                'section': section_name,
                'section_idx': idx
            })

    classes_by_section[section_name] = sorted(seen_here)

top_tags = ', '.join(f'{t}({c})' for t, c in all_tags_used.most_common(10))
print(f"‚úì Scanned {len(items)} sections")
print(f"‚úì Found {len(class_counter)} unique CSS classes")
print(f"‚úì HTML tags used: {top_tags}")

‚úì Scanned 89 sections
‚úì Found 121 unique CSS classes
‚úì HTML tags used: a(4019), p(3529), span(992), div(299), html(89), head(89), body(89), img(37), li(35), nav(2)


## Pass 2: Classify by Structural Role

In [5]:
# Group every class under its guessed role, most frequent classes first.
classified = defaultdict(list)

for cls, count in class_counter.most_common():
    role = classify_css_role(cls)
    classified[role].append({
        'class_name': cls,
        'count': count,
        'role': role,
        'heading_level': extract_heading_level(cls),
        # One "tag(count)" entry per HTML tag that carries this class
        'tags': [
            f"{tag}({per_class[cls]})"
            for tag, per_class in class_by_tag.items()
            if cls in per_class
        ],
        'sections_count': len(class_sections[cls]),
        'samples': class_samples.get(cls, [])
    })

# Total occurrences per role, used for both ordering and display
role_totals = {
    name: sum(entry['count'] for entry in entries)
    for name, entries in classified.items()
}

print("Classes by role:")
for role in sorted(classified, key=lambda name: -role_totals[name]):
    print(f"  {role}: {len(classified[role])} classes, {role_totals[role]} total occurrences")

Classes by role:
  UNKNOWN: 47 classes, 1475 total occurrences
  BACK_MATTER: 8 classes, 1238 total occurrences
  BODY_TEXT: 33 classes, 1053 total occurrences
  VERSE_POETRY: 6 classes, 352 total occurrences
  BODY_FIRST_PARA: 4 classes, 251 total occurrences
  HEADING_OTHER: 6 classes, 146 total occurrences
  IMAGE_CAPTION: 6 classes, 57 total occurrences
  CHAPTER_TITLE: 3 classes, 36 total occurrences
  SECTION_HEADING: 1 classes, 23 total occurrences
  FRONT_MATTER: 1 classes, 6 total occurrences
  LIST: 2 classes, 6 total occurrences
  BLOCK_QUOTE: 4 classes, 4 total occurrences


## Results: Full CSS Class Inventory

Every class found, grouped by structural role, with sample text for verification.

In [6]:
# Display order for the roles; roles that no class was assigned to are skipped.
role_order = [
    'CHAPTER_TITLE', 'PART_TITLE', 'SECTION_HEADING', 'HEADING_OTHER',
    'BODY_FIRST_PARA', 'BODY_TEXT', 'VERSE_POETRY', 'BLOCK_QUOTE',
    'TABLE_OF_CONTENTS', 'FRONT_MATTER', 'BACK_MATTER',
    'FOOTNOTE', 'IMAGE_CAPTION', 'LIST', 'UNKNOWN'
]

for role in role_order:
    # Membership test (not indexing) so the defaultdict gains no empty entries
    if role not in classified:
        continue
    items_in_role = classified[role]

    print(f"\n{'‚îÄ' * 70}")
    print(f"  {role} ({len(items_in_role)} classes)")
    print(f"{'‚îÄ' * 70}")

    # Most frequently used classes first within each role
    for info in sorted(items_in_role, key=lambda x: -x['count']):
        # Compare against None rather than relying on truthiness: a detected
        # level of 0 is falsy but is still a level worth showing.
        level_str = f" [Level {info['heading_level']}]" if info['heading_level'] is not None else ""
        print(f"\n  üìå {info['class_name']}{level_str}")
        print(f"     Count: {info['count']} | In {info['sections_count']} sections | Tags: {', '.join(info['tags'])}")

        # Show at most two of the stored samples to keep the listing short
        for sample in info['samples'][:2]:
            print(f'     Sample: "{sample["text"]}"')
            print(f"             (section {sample['section_idx']}: {sample['section']})")


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
  CHAPTER_TITLE (3 classes)
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

  üìå Chapter-title-TOC-Level-1 [Level 1]
     Count: 30 | In 30 sections | Tags: p(30)
     Sample: "About the Author"
             (section 2: Clear_Light_of_Bliss_Text_2019-08-1.xhtml)
     Sample: "Contents"
             (section 6: Clear_Light_of_Bliss_Text_2019-08-5.xhtml)

  üìå Chapter-title-TOC-Level-2 [Level 2]
     Count: 4 | In 4 sections | Tags: p(4)
     Sample: "The Condensed Meaning of the Text"
             (section 69: Clear_Light_of_Bliss_Text_2019-08-68.xhtml)
     Sample: "Prayers of Request to the Mahamudra Lineage Gur

## Phase 0 Answers

These are the critical questions we need answered before re-extraction.

In [7]:
def _print_class_entries(entries, sample_limit):
    """Print one line per class entry, then up to `sample_limit` of its sample texts."""
    for entry in entries:
        print(f"   ‚Üí {entry['class_name']} (used {entry['count']} times)")
        for sample in entry['samples'][:sample_limit]:
            print(f'     e.g., "{sample["text"]}"')


banner = "=" * 70
print(banner)
print("PHASE 0 ANSWERS")
print(banner)

print("\n1. CHAPTER TITLE CLASSES:")
if 'CHAPTER_TITLE' in classified:
    _print_class_entries(classified['CHAPTER_TITLE'], 2)
else:
    print("   ‚ö†Ô∏è  No classes matched 'chapter-title' pattern!")
    print("   Check UNKNOWN classes for possible chapter headings.")

print("\n2. SECTION HEADING CLASSES:")
if 'SECTION_HEADING' in classified:
    _print_class_entries(classified['SECTION_HEADING'], 2)
else:
    print("   ‚ö†Ô∏è  No classes matched section heading pattern!")

print("\n3. BODY PARAGRAPH CLASSES:")
# First-paragraph classes and general body classes are reported together,
# most frequent first, without sample text.
body_classes = classified.get('BODY_FIRST_PARA', []) + classified.get('BODY_TEXT', [])
if body_classes:
    _print_class_entries(sorted(body_classes, key=lambda entry: -entry['count']), 0)
else:
    print("   ‚ö†Ô∏è  No body text classes found!")

print("\n4. HEADING LEVELS DETECTED:")
# Set of (level, class_name) pairs drawn from the heading-like roles only
all_levels = set()
for role in ['CHAPTER_TITLE', 'PART_TITLE', 'SECTION_HEADING', 'HEADING_OTHER']:
    all_levels.update(
        (entry['heading_level'], entry['class_name'])
        for entry in classified.get(role, [])
        if entry['heading_level'] is not None
    )
if all_levels:
    for level, cls in sorted(all_levels):
        print(f"   Level {level}: {cls}")
else:
    print("   ‚ö†Ô∏è  No numeric heading levels detected in class names")

print("\n5. UNKNOWN/UNCLASSIFIED CLASSES (review these!):")
if 'UNKNOWN' in classified:
    _print_class_entries(classified['UNKNOWN'], 1)
else:
    print("   ‚úì All classes classified!")

PHASE 0 ANSWERS

1. CHAPTER TITLE CLASSES:
   ‚Üí Chapter-title-TOC-Level-1 (used 30 times)
     e.g., "About the Author"
     e.g., "Contents"
   ‚Üí Chapter-title-TOC-Level-2 (used 4 times)
     e.g., "The Condensed Meaning of the Text"
     e.g., "Prayers of Request to the Mahamudra Lineage Gurus"
   ‚Üí Chapter-title-TOC-Level-1-no-new-page (used 2 times)
     e.g., "Foreword"
     e.g., "Study Programmes of Kadampa Buddhism"

2. SECTION HEADING CLASSES:
   ‚Üí Index-Section-Head (used 23 times)
     e.g., "A"
     e.g., "B"

3. BODY PARAGRAPH CLASSES:
   ‚Üí Text-2nd-para (used 383 times)
   ‚Üí Text-1st-para-space-below (used 101 times)
   ‚Üí Text-1st-para (used 100 times)
   ‚Üí Text-list-10-to-99-with-parenthesis-middle (used 92 times)
   ‚Üí Text-list-1-to-9-middle (used 86 times)
   ‚Üí Text-list-1-to-9-last-full-space-below (used 59 times)
   ‚Üí Text-list-1-to-9-with-parenthesis-middle (used 58 times)
   ‚Üí Text-2nd-para-full-space-below (used 58 times)
   ‚Üí Text-list-1

## Save Inventory to JSON

In [8]:
# 'heading_levels' maps each level to a single class name, so when several
# classes share a level only the last one in sorted order survives. That key
# keeps its original shape for existing readers of the JSON;
# 'heading_level_classes' records every class per level.
heading_level_classes = {}
for level, cls in sorted(all_levels):
    heading_level_classes.setdefault(str(level), []).append(cls)

output = {
    'epub_file': EPUB_FILE,
    'epub_path': EPUB_PATH,
    'total_sections': len(items),
    'total_unique_classes': len(class_counter),
    'html_tags_used': dict(all_tags_used.most_common()),
    'class_inventory': {},   # filled below: one entry per class
    'classes_by_role': {},   # filled below: compact per-role listing
    'phase0_answers': {
        'chapter_title_classes': [i['class_name'] for i in classified.get('CHAPTER_TITLE', [])],
        'section_heading_classes': [i['class_name'] for i in classified.get('SECTION_HEADING', [])],
        'body_text_classes': [i['class_name'] for i in
                              classified.get('BODY_TEXT', []) + classified.get('BODY_FIRST_PARA', [])],
        'heading_levels': {str(level): cls for level, cls in sorted(all_levels)},
        'heading_level_classes': heading_level_classes,
    }
}

# Full inventory, most frequent classes first
for cls, count in class_counter.most_common():
    role = classify_css_role(cls)
    output['class_inventory'][cls] = {
        'count': count,
        'role': role,
        'heading_level': extract_heading_level(cls),
        'tags': [t for t in class_by_tag if cls in class_by_tag[t]],
        'sections_count': len(class_sections[cls]),
        # Only the sample text is saved; section details stay in memory
        'samples': [s['text'] for s in class_samples.get(cls, [])]
    }

# Grouped by role
for role, items_list in classified.items():
    output['classes_by_role'][role] = [
        {'class': i['class_name'], 'count': i['count'], 'level': i['heading_level']}
        for i in items_list
    ]

# ensure_ascii=False writes non-ASCII sample text as-is rather than as \u escapes
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"‚úì Saved to {OUTPUT_FILE}")
print(f"  {len(output['class_inventory'])} classes documented")

‚úì Saved to 00_css_class_inventory.json
  121 classes documented


## 🚦 Validation Gate 0

Must pass ALL checks before proceeding to re-extraction.

In [9]:
gate_rule = "=" * 70
print(gate_rule)
print("üö¶ VALIDATION GATE 0")
print(gate_rule)

# (description, passed) pairs; the gate passes only when every one is true
checks = [
    ('Chapter title CSS classes identified', bool(classified.get('CHAPTER_TITLE', []))),
    ('Section heading CSS classes identified', bool(classified.get('SECTION_HEADING', []))),
    ('Body text CSS classes identified', bool(body_classes)),
    ('Heading levels detected', bool(all_levels)),
]

for desc, passed in checks:
    print(f"  {'‚úì' if passed else '‚úó'} {desc}")

all_pass = all(passed for _, passed in checks)

if all_pass:
    print("\n  ‚úÖ ALL CHECKS PASSED - Ready to proceed to re-extraction!")
else:
    print("\n  ‚ö†Ô∏è  SOME CHECKS FAILED")
    print("  Review the UNKNOWN classes above ‚Äî the heading patterns may")
    print("  use different naming than expected.")
    print("\n  ‚Üí Paste the output from this notebook into Claude")
    print("    and we'll map the classes together.")

üö¶ VALIDATION GATE 0
  ‚úì Chapter title CSS classes identified
  ‚úì Section heading CSS classes identified
  ‚úì Body text CSS classes identified
  ‚úì Heading levels detected

  ‚úÖ ALL CHECKS PASSED - Ready to proceed to re-extraction!
