In [1]:
import os
import re
import logging
import time
from pathlib import Path
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
from torch import multiprocessing as mp
from tqdm import tqdm
import csv

In [5]:
# ========== CONFIGURATION ==========
BASE_DIR = Path(r"C:/Users/ayush/OneDrive/Desktop/Stories")
CLEANED_DIR = BASE_DIR / "cleaned_files"
TAGGED_DIR = BASE_DIR / "cleaned_files_tagged"
CSV_LOG = BASE_DIR / "tagging_report.csv"
DETAILED_CSV = BASE_DIR / "tag_details.csv"
LOG_FILE = BASE_DIR / "phase2_tagging.log"

In [6]:
TAGGED_DIR.mkdir(parents=True, exist_ok=True)

# ========== STRUCTURAL TAGS ==========
TAG_PATTERNS = {
    "#chapter_start": re.compile(r"^\s*(CHAPTER|BOOK|PART|SCENE)[\s:.]+[\w\d\s,'\"-]+", re.IGNORECASE),
    "#dialogue": re.compile(r"^\s*[\"'].*[\"']\s*$"),
    "#quote": re.compile(r"^[^a-zA-Z0-9]*[\w\s,'\"“”‘’\-–—]{5,100}[.?!…\"'”]$"),
    "#poem": re.compile(r"^(\s{2,}|\t+).{1,80}$"),
    "#story_within": re.compile(r"\b(Once upon a time|There was a|In a land|Long ago|Far away)\b", re.IGNORECASE)
}


In [1]:
import os
import re
import csv
import logging
import time
from pathlib import Path
from collections import defaultdict
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading

# ========== CONFIG ==========
BASE_DIR = Path(r"C:/Users/ayush/OneDrive/Desktop/Stories")
CLEANED_DIR = BASE_DIR / "cleaned_files"
TAGGED_DIR = BASE_DIR / "cleaned_files_tagged"
CSV_LOG = BASE_DIR / "tagging_report.csv"
DETAILED_CSV = BASE_DIR / "tag_details.csv"
LOG_FILE = BASE_DIR / "phase2_tagging.log"
TAGGED_DIR.mkdir(parents=True, exist_ok=True)

# Thread-safe locking mechanism
DETAILED_LOCK = threading.Lock()

# ========== TAGGING RULES ==========
TAG_PATTERNS = {
    "#chapter_start": re.compile(r"^\s*(CHAPTER|BOOK|PART|SCENE)[\s:.]+[\w\d\s,'\"-]+", re.IGNORECASE),
    "#dialogue": re.compile(r"^\s*[\"\'\u201c\u2018].*[\"\'\u201d\u2019]\s*$"),
    "#poem": re.compile(r"^(\s{2,}|\t+)(?=.{1,80}$).+", re.MULTILINE),
    "#story_within": re.compile(r"\b(Once upon a time|In a land|Long ago|Far away|A tale|There lived|A fable)\b", re.IGNORECASE)
}

# ========== LOGGING ==========
def setup_logging():
    logger = logging.getLogger("GodspeedLogger")
    logger.setLevel(logging.INFO)

    if logger.hasHandlers():
        logger.handlers.clear()

    file_handler = logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8')
    file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger

# ========== TAGGING FUNCTION ==========
def tag_file(file_path: Path) -> dict:
    fname = file_path.name
    tag_counts = defaultdict(int)
    detailed_records = []

    try:
        with file_path.open(encoding="utf-8-sig", errors="replace") as f:
            lines = f.readlines()

        tagged_lines = []
        for idx, line in enumerate(lines):
            stripped = line.strip()
            matched_tag = None

            if stripped:
                for tag, pattern in TAG_PATTERNS.items():
                    if pattern.search(stripped):
                        matched_tag = tag
                        tag_counts[tag] += 1
                        detailed_records.append((fname, idx + 1, tag, stripped[:100] + ("..." if len(stripped) > 100 else "")))
                        break

            if matched_tag:
                tagged_lines.append(matched_tag + "\n")
            tagged_lines.append(line)

        output_path = TAGGED_DIR / fname
        with output_path.open("w", encoding="utf-8-sig") as f:
            f.writelines(tagged_lines)

        if detailed_records:
            with DETAILED_LOCK:
                with open(DETAILED_CSV, "a", encoding="utf-8-sig", newline="") as f:
                    writer = csv.writer(f)
                    if f.tell() == 0:
                        writer.writerow(["File", "Line", "Tag", "Content"])
                    writer.writerows(detailed_records)

        total_lines = len(lines)
        stats = {"file": fname, "status": "success", "total_lines": total_lines}

        for tag in TAG_PATTERNS:
            count = tag_counts.get(tag, 0)
            stats[f"{tag}_count"] = count
            stats[f"{tag}_density"] = round(count / total_lines * 100, 4) if total_lines else 0

        return stats

    except Exception as e:
        return {
            "file": fname,
            "status": "error",
            "error": str(e),
            "total_lines": 0,
            **{f"{tag}_count": 0 for tag in TAG_PATTERNS},
            **{f"{tag}_density": 0.0 for tag in TAG_PATTERNS}
        }

# ========== MAIN DRIVER ==========
def main():
    logger = setup_logging()
    logger.info("\n🚀 GODSPEED TAGGING v7.0 — FINALIZED")

    CSV_LOG.unlink(missing_ok=True)
    DETAILED_CSV.unlink(missing_ok=True)

    try:
        files = list(CLEANED_DIR.glob("*.txt"))
        total_files = len(files)
        if total_files == 0:
            logger.error("❌ No files found in CLEANED_DIR")
            return
        logger.info(f"📦 Files Queued: {total_files}\n")
    except Exception as e:
        logger.error(f"❌ Directory scanning failed: {str(e)}")
        return

    start_time = time.perf_counter()
    processed_files, error_files = 0, 0
    results, tag_totals = [], defaultdict(int)

    with ThreadPoolExecutor(max_workers=min(12, os.cpu_count() * 4)) as executor:
        futures = {executor.submit(tag_file, f): f.name for f in files}

        with tqdm(total=total_files, desc="⚡ Tagging Progress", unit="file", dynamic_ncols=True) as pbar:
            for i, future in enumerate(as_completed(futures)):
                try:
                    result = future.result()
                    results.append(result)
                    if result["status"] == "success":
                        processed_files += 1
                        for tag in TAG_PATTERNS:
                            tag_totals[tag] += result.get(f"{tag}_count", 0)
                    else:
                        error_files += 1
                        logger.error(f"❌ {result['file']}: {result.get('error', 'Unknown error')}")
                except Exception as e:
                    error_files += 1
                    logger.error(f"❌ Crash in processing: {str(e)}")

                if (i + 1) % 100 == 0:
                    logger.info(f"\n📍 INTERMEDIATE STATUS @ {i+1} files:")
                    logger.info(f"  ✅ Processed: {processed_files} | ❌ Errors: {error_files}")
                    for tag, count in tag_totals.items():
                        logger.info(f"  {tag}: {count} total occurrences")

                pbar.update(1)

    try:
        if results:
            pd.DataFrame(results).to_csv(CSV_LOG, index=False, encoding='utf-8-sig')
    except Exception as e:
        logger.error(f"❌ Failed to write CSV report: {str(e)}")

    duration = time.perf_counter() - start_time
    logger.info(f"\n{'✅' if error_files == 0 else '⚠️'} PROCESSING COMPLETE")
    logger.info(f"⏱️  Duration: {duration:.2f}s | ⚡ Speed: {total_files/max(duration, 0.001):.2f} files/s")
    logger.info(f"📊 Processed: {processed_files} | Errors: {error_files}")
    logger.info(f"📈 Tag Summary: {CSV_LOG}")
    logger.info(f"📝 Tag Details: {DETAILED_CSV}")

    logger.info("\n🏷️ FINAL TAG DISTRIBUTION:")
    for tag, count in tag_totals.items():
        logger.info(f"  {tag}: {count} tags")

if __name__ == "__main__":
    main()


[2025-07-29 22:27:40,903] 
🚀 GODSPEED TAGGING v7.0 — FINALIZED
[2025-07-29 22:27:41,029] 📦 Files Queued: 9492

⚡ Tagging Progress:   1%|          | 98/9492 [00:08<17:08,  9.13file/s] [2025-07-29 22:27:51,900] 
📍 INTERMEDIATE STATUS @ 100 files:
[2025-07-29 22:27:51,973]   ✅ Processed: 100 | ❌ Errors: 0
[2025-07-29 22:27:52,115]   #chapter_start: 1805 total occurrences
[2025-07-29 22:27:52,205]   #dialogue: 6641 total occurrences
[2025-07-29 22:27:52,391]   #poem: 0 total occurrences
[2025-07-29 22:27:52,447]   #story_within: 695 total occurrences
⚡ Tagging Progress:   2%|▏         | 199/9492 [00:20<10:37, 14.58file/s][2025-07-29 22:28:03,641] 
📍 INTERMEDIATE STATUS @ 200 files:
[2025-07-29 22:28:03,710]   ✅ Processed: 200 | ❌ Errors: 0
[2025-07-29 22:28:03,754]   #chapter_start: 3946 total occurrences
[2025-07-29 22:28:03,780]   #dialogue: 14281 total occurrences
[2025-07-29 22:28:03,843]   #poem: 0 total occurrences
[2025-07-29 22:28:03,941]   #story_within: 1236 total occurrences
⚡ T