In [None]:
from pathlib import Path
import os
import re
import logging
import shutil
import hashlib
import pandas as pd
from chardet import detect

# === CONFIGURATION ===
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
RAW_DIR = BASE_DIR 
CLEANED_DIR = BASE_DIR / "cleaned_phase1"
CORRUPTED_DIR = CLEANED_DIR/ "corrupted_files"
META_CSV = CLEANED_DIR/ "file_meta_phase1.csv"
LOG_FILE = CLEANED_DIR/"phase1_cleaning.log"

# === SETUP ===
CLEANED_DIR.mkdir(parents=True, exist_ok=True)
CORRUPTED_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

# === STRIP GUTENBERG NOISE ===
def strip_gutenberg_noise(text):
    # Handle empty files immediately
    if not text:
        return ""
    
    # Improved patterns with explicit project gutenberg markers
    start_pattern = r"\*\*\*\s*START\s+OF\s+(THIS|THE)\s+PROJECT\s+GUTENBERG.+?\*\*\*"
    end_pattern = r"\*\*\*\s*END\s+OF\s+(THIS|THE)\s+PROJECT\s+GUTENBERG.+?\*\*\*"
    
    # Use DOTALL flag to match across lines
    start_match = re.search(start_pattern, text, re.IGNORECASE | re.DOTALL)
    end_match = re.search(end_pattern, text, re.IGNORECASE | re.DOTALL)
    
    # Handle overlapping matches
    if start_match and end_match and start_match.end() > end_match.start():
        # Only remove start if markers overlap
        return text[start_match.end():].strip()
    
    if start_match:
        text = text[start_match.end():]
        
    if end_match:
        text = text[:end_match.start()]
        
    return text.strip()

# === DETECT ENCODING ===
def detect_encoding(filepath):
    try:
        with open(filepath, 'rb') as f:
            return detect(f.read(10000))['encoding'] or 'utf-8'
    except Exception as e:
        logger.error(f"Encoding detection failed for {filepath.name}: {e}")
        return 'utf-8'

# === CLEAN FILE ===
def clean_file(src_path, dest_path):
    try:
        # Handle empty files
        if src_path.stat().st_size == 0:
            raise ValueError("File is empty (0 bytes)")
            
        encoding = detect_encoding(src_path)
        
        # Read with fallback encodings
        try:
            with open(src_path, 'r', encoding=encoding, errors='replace') as f:
                raw_text = f.read()
        except UnicodeDecodeError:
            # Try fallback encodings
            for alt_enc in ['utf-16', 'latin-1', 'iso-8859-1', 'cp1252']:
                try:
                    with open(src_path, 'r', encoding=alt_enc, errors='replace') as f:
                        raw_text = f.read()
                    encoding = alt_enc  # Update encoding
                    break
                except:
                    continue
            else:
                raise UnicodeDecodeError("All encoding attempts failed")

        clean_text = strip_gutenberg_noise(raw_text)
        
        # Handle files with no gutenberg markers
        if clean_text == raw_text.strip():
            logger.warning(f"⚠️ No Gutenberg markers found in {src_path.name}")
            
        # Normalize whitespace (preserve paragraph breaks)
        clean_text = re.sub(r"[^\S\n]+", " ", clean_text)  # Compress non-newline whitespace
        clean_text = re.sub(r"\n{3,}", "\n\n", clean_text)  # Reduce excessive newlines
        clean_text = clean_text.strip()

        # Write processed file
        with open(dest_path, 'w', encoding='utf-8') as f:
            f.write(clean_text)

        return {
            "filename": src_path.name,
            "status": "cleaned",
            "encoding": encoding,
            "original_size": len(raw_text),
            "cleaned_size": len(clean_text),
            "char_diff": len(raw_text) - len(clean_text),
            "hash": hashlib.md5(clean_text.encode('utf-8')).hexdigest()
        }
    except Exception as e:
        logger.error(f"❌ Error cleaning {src_path.name}: {str(e)}")
        try:
            if src_path.exists():
                shutil.copy(src_path, CORRUPTED_DIR / src_path.name)
        except Exception as copy_error:
            logger.error(f"❌ Failed to copy corrupted file: {copy_error}")
            
        return {
            "filename": src_path.name,
            "status": "corrupted",
            "error_type": str(type(e).__name__),
            "error_msg": str(e),
            "encoding": None,
            "original_size": None,
            "cleaned_size": None,
            "char_diff": None,
            "hash": None
        }

# === MAIN PIPELINE ===
def run_phase1_cleaning():
    all_files = list(RAW_DIR.glob("*.txt"))
    total_files = len(all_files)
    logger.info(f"📁 Total raw files detected: {total_files}")
    meta_records = []

    for i, file_path in enumerate(all_files, 1):
        logger.info(f"🧹 Processing file {i}/{total_files}: {file_path.name}")
        dest_path = CLEANED_DIR / file_path.name
        record = clean_file(file_path, dest_path)
        meta_records.append(record)
        
        # Periodic saving
        if i % 100 == 0 or i == total_files:
            pd.DataFrame(meta_records).to_csv(META_CSV, index=False)
            logger.info(f"💾 Saved metadata checkpoint ({i} files processed)")

    # Final save
    pd.DataFrame(meta_records).to_csv(META_CSV, index=False)
    
    # Generate summary report
    cleaned_count = sum(1 for r in meta_records if r['status'] == 'cleaned')
    corrupted_count = sum(1 for r in meta_records if r['status'] == 'corrupted')
    
    logger.info(f"✅ Phase 1 cleaning complete")
    logger.info(f"📊 Results: {cleaned_count} cleaned, {corrupted_count} corrupted")
    logger.info(f"💾 Metadata saved to {META_CSV}")

if __name__ == "__main__":
    try:
        run_phase1_cleaning()
    except Exception as e:
        logger.critical(f"🛑 Fatal pipeline error: {str(e)}", exc_info=True)

In [4]:
from pathlib import Path
import os
import logging
import hashlib
import pandas as pd
import random
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# === CONFIGURATION ===
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
CLEANED_DIR = BASE_DIR / "cleaned_phase1"
TAGGED_DIR = BASE_DIR / "tagged_phase2"
TAG_MATRIX_CSV = BASE_DIR / "tag_matrix.csv"
ENTROPY_CSV = BASE_DIR / "tag_entropy.csv"
PHASE2_LOG = BASE_DIR / "phase2_tagging.log"
TAG_DETAILS_FILE = BASE_DIR / "tagging_summary.txt"
TAG_DENSITY_PNG = BASE_DIR / "tag_density_heatmap.png"
TAG_FREQ_PNG = BASE_DIR / "tag_frequency_plot.png"

# Import 100+ TAG_MAP separately or define here directly if needed
TAG_MAP = {
    # Structural / Narrative Anchors
    "#chapter_start": [],
    "#chapter_end": [],
    "#dialogue": ["said", "replied", "asked", "shouted", "whispered", "murmured", "exclaimed"],
    "#story_within": ["legend", "myth", "tale", "fable"],
    "#poem": ["verse", "stanza", "rhyme", "lyric"],
    "#quote": ["wisdom", "truth", "saying", "motto"],

    # Power Themes
    "#battle": ["sword", "blood", "clash", "siege", "arrow", "enemy", "war", "blade"],
    "#magic": ["spell", "sorcery", "wizard", "enchantment", "hex", "conjure", "potion"],
    "#curse": ["curse", "cursed", "blight", "bane", "doom", "mark"],
    "#prophecy": ["prophecy", "vision", "fate", "destiny", "omen", "oracle", "seer"],
    "#ritual": ["ritual", "ceremony", "chant", "offering", "sacrifice"],
    "#transformation": ["transform", "shapeshift", "reborn", "metamorphosis", "become"],
    "#vision": ["hallucination", "dream", "foresight", "premonition"],
    "#curse_break": ["liberated", "cleansed", "shattered spell", "redemption"],

    # Characters & Archetypes
    "#hero": ["hero", "champion", "savior", "redeemer"],
    "#villain": ["villain", "tyrant", "usurper", "corrupt", "nemesis"],
    "#mentor": ["mentor", "teacher", "sage", "elder"],
    "#outsider": ["stranger", "exile", "outcast", "wanderer"],
    "#title": ["king", "queen", "lord", "emperor", "high priest", "warlord"],

    # World Building
    "#place": ["fortress", "temple", "forest", "island", "valley", "realm", "village", "citadel"],
    "#noble_house": ["house", "clan", "dynasty", "lineage"],
    "#artifact": ["amulet", "scepter", "relic", "blade", "chalice", "scroll"],
    "#weapon": ["dagger", "spear", "sword", "bow", "axe"],
    "#creature": ["dragon", "beast", "wolf", "golem", "giant", "serpent"],
    "#god": ["god", "goddess", "divine", "pantheon", "deity"],
    "#element": ["fire", "water", "air", "earth", "ice", "lightning"],

    # Plot Devices & Themes
    "#death": ["corpse", "dead", "slain", "sacrifice", "funeral", "mourning"],
    "#betrayal": ["betray", "traitor", "deceive", "oath broken"],
    "#alliance": ["treaty", "pact", "truce", "union"],
    "#rebellion": ["revolt", "rebels", "uprising", "insurgents"],
    "#trial": ["judgment", "court", "accused", "guilty", "verdict"],
    "#oath": ["vow", "oath", "promise", "pledge"],
    "#revenge": ["vengeance", "payback", "retribution", "grudge"],

    # Social Constructs
    "#lineage": ["ancestor", "descendant", "bloodline", "heir"],
    "#legacy": ["legacy", "remains", "inheritance", "echo"],
    "#festival": ["festival", "feast", "celebration", "banquet"],
    "#law": ["decree", "edict", "rule", "commandment"],
    "#funeral": ["rites", "cremation", "burial", "mourning"],

    # Emotions & Conflict
    "#love": ["love", "desire", "affection", "passion"],
    "#grief": ["grief", "sorrow", "lament", "weep"],
    "#hope": ["hope", "light", "faith", "miracle"],
    "#fear": ["fear", "terror", "dread", "panic"],
    "#madness": ["madness", "insane", "lunacy", "deranged"],

    # Time & Space
    "#past": ["ancient", "old", "forgotten", "before"],
    "#future": ["forthcoming", "will", "soon", "destined"],
    "#present": ["now", "current", "this day"],
    "#travel": ["journey", "voyage", "path", "wander"],
    "#portal": ["portal", "gate", "doorway", "rift"],

    # Abstract / Cosmic
    "#fate": ["fate", "destiny", "threads", "woven"],
    "#chaos": ["chaos", "entropy", "discord", "anarchy"],
    "#order": ["order", "harmony", "balance", "structure"],
    "#void": ["void", "emptiness", "nothingness", "abyss"],
    "#creation": ["birth", "creation", "beginning", "genesis"],
    "#apocalypse": ["end", "apocalypse", "collapse", "cataclysm"],

    # Meta / Non-narrative
    "#editor_note": [],
    "#annotation": [],
    "#translation_note": []
} 


ALL_TAGS = list(TAG_MAP.keys()) + ["#chapter_start", "#chapter_end"]
TAG_COLUMNS = [tag for tag in ALL_TAGS if tag not in ("#chapter_start", "#chapter_end")]
TAGGED_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(PHASE2_LOG, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

def inject_tags(text):
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    tagged_lines = ["#chapter_start"]
    log_lines = []
    for i, para in enumerate(paragraphs):
        tags = {tag for tag, triggers in TAG_MAP.items() if any(t in para.lower() for t in triggers)}
        if not tags and i % 5 == 0:
            tags.add(random.choice(ALL_TAGS))
            log_lines.append(f"✨ Forced tag in paragraph {i}: {tags}")
        log_lines.append(f"📌 Para {i+1}/{len(paragraphs)} → Tags: {', '.join(tags) if tags else 'None'}")
        tagged_lines.extend(sorted(tags))
        tagged_lines.append(para)
    tagged_lines.append("#chapter_end")
    return "\n".join(tagged_lines), log_lines

def get_file_hash(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def visualize_tags(df):
    logger.info("📈 Generating tag frequency bar plot...")
    tag_counts = df[TAG_COLUMNS].sum().sort_values(ascending=False)
    tag_counts.plot(kind='bar', figsize=(12, 5), title='Tag Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(TAG_FREQ_PNG)
    plt.close()

    logger.info("🌡️ Generating tag density heatmap...")
    sns.heatmap(df[TAG_COLUMNS].astype(bool), cmap="YlGnBu", cbar=False)
    plt.title("Tag Density Heatmap")
    plt.xlabel("Tags")
    plt.ylabel("Files")
    plt.tight_layout()
    plt.savefig(TAG_DENSITY_PNG)
    plt.close()

def main():
    records, logs = [], []
    all_files = sorted(CLEANED_DIR.glob("*.txt"))
    total_files = len(all_files)
    logger.info(f"📂 Total files to process: {total_files}")

    for idx, file in enumerate(all_files, 1):
        logger.info(f"🚧 Step {idx}/{total_files}: Processing {file.name}")
        try:
            with open(file, encoding='utf-8') as f:
                content = f.read()
            tagged, lines = inject_tags(content)
            with open(TAGGED_DIR / file.name, 'w', encoding='utf-8') as out:
                out.write(tagged)
            counts = Counter(l for l in tagged.splitlines() if l.startswith("#"))
            record = {"filename": file.name, "hash": get_file_hash(tagged)}
            record.update({tag: counts.get(tag, 0) for tag in ALL_TAGS})
            records.append(record)
            logs.append(f"✅ {file.name}: {sum(counts.values())} tags\n" + "\n".join(lines))
            logger.info(f"✅ Tags added: {sum(counts.values())}, Unique: {len(counts)}")
        except Exception as e:
            logs.append(f"❌ {file.name} failed: {e}")
            logger.error(f"❌ Failed to process {file.name}: {e}")

    df = pd.DataFrame(records)
    df.to_csv(TAG_MATRIX_CSV, index=False)
    df["unique_tags"] = df[TAG_COLUMNS].astype(bool).sum(axis=1)
    df[["filename", "unique_tags"]].to_csv(ENTROPY_CSV, index=False)
    visualize_tags(df)

    with open(TAG_DETAILS_FILE, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(logs))

    logger.info("\n🎯 PHASE 2 TAGGING + VISUALIZATION COMPLETE")
    logger.info(f"📊 Tag matrix CSV saved to: {TAG_MATRIX_CSV}")
    logger.info(f"📈 Entropy CSV saved to: {ENTROPY_CSV}")
    logger.info(f"📝 Summary log saved to: {TAG_DETAILS_FILE}")
    logger.info(f"🖼️ Visuals saved to: {TAG_FREQ_PNG}, {TAG_DENSITY_PNG}")

if __name__ == "__main__":
    main()


[2025-07-26 23:39:53,090] INFO: 📂 Total files to process: 7524
[2025-07-26 23:39:53,091] INFO: 🚧 Step 1/7524: Processing 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-26 23:39:53,746] INFO: ✅ Tags added: 4165, Unique: 57
[2025-07-26 23:39:53,746] INFO: 🚧 Step 2/7524: Processing 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-26 23:39:54,485] INFO: ✅ Tags added: 4931, Unique: 59
[2025-07-26 23:39:54,485] INFO: 🚧 Step 3/7524: Processing 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-26 23:39:55,465] INFO: ✅ Tags added: 6240, Unique: 59
[2025-07-26 23:39:55,465] INFO: 🚧 Step 4/7524: Processing 04 Harry Potter and the Goblet of Fire.txt
[2025-07-26 23:39:57,012] INFO: ✅ Tags added: 11293, Unique: 59
[2025-07-26 23:39:57,012] INFO: 🚧 Step 5/7524: Processing 05 Harry Potter and the Order of the Phoenix.txt
[2025-07-26 23:39:58,375] INFO: ✅ Tags added: 15658, Unique: 59
[2025-07-26 23:39:58,375] INFO: 🚧 Step 6/7524: Processing 06 Harry Potter and the Half-Blood Prince

In [9]:
from pathlib import Path
for file in Path("tagged_phase2").glob("*.txt"):
    content = file.read_text(encoding="utf-8")
    if "#chapter_start" not in content or "#chapter_end" not in content:
        print(f"❌ Missing block tags: {file.name}")

In [5]:
from pathlib import Path
import logging
import random
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import json

# ======================= CONFIGURATION =======================
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
TAGGED_DIR = BASE_DIR / "tagged_phase2"
PHASE3_DIR = BASE_DIR / "split_phase3"
PHASE3_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = PHASE3_DIR / "train.txt"
VAL_PATH = PHASE3_DIR / "val.txt"
META_CSV = PHASE3_DIR / "split_metadata.csv"
TRAIN_FILES = PHASE3_DIR / "train_files.csv"
VAL_FILES = PHASE3_DIR / "val_files.csv"
BALANCE_REPORT = PHASE3_DIR / "split_balance_report.txt"
BALANCE_PLOT = PHASE3_DIR / "tag_balance_val_vs_train.png"
PHASE3_LOG = PHASE3_DIR / "phase3_split.log"

SPLIT_RATIO = 0.95
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

logging.basicConfig(
    filename=str(PHASE3_LOG),
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    filemode='w'
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)
logger = logging.getLogger()

# ======================= HELPERS =======================
def hash_block(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def parse_chapter_blocks(file_path):
    with open(file_path, encoding='utf-8') as f:
        content = f.read()
    blocks = []
    block_lines = []
    in_block = False
    for line in content.splitlines():
        if line.strip() == "#chapter_start":
            block_lines = [line]
            in_block = True
        elif line.strip() == "#chapter_end":
            block_lines.append(line)
            blocks.append("\n".join(block_lines))
            in_block = False
        elif in_block:
            block_lines.append(line)
    return blocks

def extract_metadata(block, filename):
    lines = block.splitlines()
    tags = set(l.strip() for l in lines if l.startswith("#") and not l in ["#chapter_start", "#chapter_end"])
    tag_counts = Counter(tags)
    token_count = len(block.split())
    return {
        "filename": filename,
        "block_hash": hash_block(block),
        "tokens": token_count,
        "entropy": len(tags),
        "tags": list(tags),
        "tag_counts": json.dumps(tag_counts),
        "text": block
    }

# ======================= MAIN PIPELINE =======================
def main():
    all_files = sorted(TAGGED_DIR.glob("*.txt"))
    block_records = []
    logger.info(f"📂 Total files to parse: {len(all_files)}")

    for idx, file in enumerate(all_files, 1):
        logger.info(f"🔍 Parsing file {idx}/{len(all_files)}: {file.name}")
        try:
            blocks = parse_chapter_blocks(file)
            for block in blocks:
                meta = extract_metadata(block, file.name)
                block_records.append(meta)
        except Exception as e:
            logger.error(f"❌ Failed to parse {file.name}: {e}")

    df = pd.DataFrame(block_records)
    df.to_csv(META_CSV, index=False)
    logger.info(f"📊 Metadata CSV saved: {META_CSV}")

    # Stratified shuffling based on entropy
    df = df.sort_values("entropy")
    bucketed = []
    BUCKETS = 5
    bucket_size = len(df) // BUCKETS
    for i in range(BUCKETS):
        start = i * bucket_size
        end = (i + 1) * bucket_size if i < BUCKETS - 1 else len(df)
        bucket = df.iloc[start:end].sample(frac=1, random_state=RANDOM_SEED)
        bucketed.append(bucket)
    df_shuffled = pd.concat(bucketed).reset_index(drop=True)

    total_tokens = df_shuffled["tokens"].sum()
    train_tokens_target = total_tokens * SPLIT_RATIO
    cum_tokens, train_idx = 0, 0
    for i, row in df_shuffled.iterrows():
        cum_tokens += row["tokens"]
        if cum_tokens >= train_tokens_target:
            train_idx = i
            break

    df_train = df_shuffled.iloc[:train_idx+1]
    df_val = df_shuffled.iloc[train_idx+1:]

    with open(TRAIN_PATH, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(df_train["text"]))
    with open(VAL_PATH, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(df_val["text"]))

    df_train[["filename", "block_hash"]].to_csv(TRAIN_FILES, index=False)
    df_val[["filename", "block_hash"]].to_csv(VAL_FILES, index=False)

    # Tag balance
    def flatten_tags(tag_list):
        return [tag for sublist in tag_list for tag in sublist]
    train_tag_freq = Counter(flatten_tags(df_train["tags"]))
    val_tag_freq = Counter(flatten_tags(df_val["tags"]))

    all_tags = sorted(set(train_tag_freq.keys()).union(set(val_tag_freq.keys())))
    balance_df = pd.DataFrame({
        "tag": all_tags,
        "train_count": [train_tag_freq.get(t, 0) for t in all_tags],
        "val_count": [val_tag_freq.get(t, 0) for t in all_tags]
    })
    balance_df.to_csv(PHASE3_DIR / "tag_balance.csv", index=False)

    # Plot tag comparison
    balance_df.set_index("tag").plot(kind="bar", figsize=(14, 6), title="Tag Balance: Train vs Val")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(BALANCE_PLOT)
    plt.close()

    # Summary Log
    with open(BALANCE_REPORT, 'w', encoding='utf-8') as f:
        f.write(f"Total Blocks: {len(df)}\n")
        f.write(f"Train Blocks: {len(df_train)}, Tokens: {df_train['tokens'].sum()}\n")
        f.write(f"Val Blocks:   {len(df_val)}, Tokens: {df_val['tokens'].sum()}\n")
        f.write(f"Train Mean Entropy: {df_train['entropy'].mean():.2f}\n")
        f.write(f"Val Mean Entropy:   {df_val['entropy'].mean():.2f}\n")
        f.write("\nMissing Tags in Val:\n")
        missing_tags = [t for t in all_tags if val_tag_freq.get(t, 0) == 0]
        for t in missing_tags:
            f.write(f"❌ {t}\n")

    logger.info("\n🎯 PHASE 3 SEMANTIC SPLIT COMPLETE")
    logger.info(f"📂 Train set saved: {TRAIN_PATH}")
    logger.info(f"📂 Val set saved: {VAL_PATH}")
    logger.info(f"📊 Metadata: {META_CSV}")
    logger.info(f"🧠 Balance report: {BALANCE_REPORT}")
    logger.info(f"🖼️ Tag Plot: {BALANCE_PLOT}")

if __name__ == "__main__":
    main()


[2025-07-27 01:06:42,955] INFO: 📂 Total files to parse: 7524
[2025-07-27 01:06:42,955] INFO: 📂 Total files to parse: 7524
[2025-07-27 01:06:42,955] INFO: 🔍 Parsing file 1/7524: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 01:06:42,955] INFO: 🔍 Parsing file 1/7524: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 01:06:43,032] INFO: 🔍 Parsing file 2/7524: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 01:06:43,032] INFO: 🔍 Parsing file 2/7524: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 01:06:43,074] INFO: 🔍 Parsing file 3/7524: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 01:06:43,074] INFO: 🔍 Parsing file 3/7524: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 01:06:43,108] INFO: 🔍 Parsing file 4/7524: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 01:06:43,108] INFO: 🔍 Parsing file 4/7524: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 01:06:43,142] INFO: 🔍 Parsing file 5/7524: 05 Harry Potter and