In [None]:
from pathlib import Path
import os
import re
import logging
import shutil
import hashlib
import pandas as pd
from chardet import detect

# === CONFIGURATION ===
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
RAW_DIR = BASE_DIR 
CLEANED_DIR = BASE_DIR / "cleaned_phase1"
CORRUPTED_DIR = CLEANED_DIR/ "corrupted_files"
META_CSV = CLEANED_DIR/ "file_meta_phase1.csv"
LOG_FILE = CLEANED_DIR/"phase1_cleaning.log"

# === SETUP ===
CLEANED_DIR.mkdir(parents=True, exist_ok=True)
CORRUPTED_DIR.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

# === STRIP GUTENBERG NOISE ===
def strip_gutenberg_noise(text):
    # Handle empty files immediately
    if not text:
        return ""
    
    # Improved patterns with explicit project gutenberg markers
    start_pattern = r"\*\*\*\s*START\s+OF\s+(THIS|THE)\s+PROJECT\s+GUTENBERG.+?\*\*\*"
    end_pattern = r"\*\*\*\s*END\s+OF\s+(THIS|THE)\s+PROJECT\s+GUTENBERG.+?\*\*\*"
    
    # Use DOTALL flag to match across lines
    start_match = re.search(start_pattern, text, re.IGNORECASE | re.DOTALL)
    end_match = re.search(end_pattern, text, re.IGNORECASE | re.DOTALL)
    
    # Handle overlapping matches
    if start_match and end_match and start_match.end() > end_match.start():
        # Only remove start if markers overlap
        return text[start_match.end():].strip()
    
    if start_match:
        text = text[start_match.end():]
        
    if end_match:
        text = text[:end_match.start()]
        
    return text.strip()

# === DETECT ENCODING ===
def detect_encoding(filepath):
    try:
        with open(filepath, 'rb') as f:
            return detect(f.read(10000))['encoding'] or 'utf-8'
    except Exception as e:
        logger.error(f"Encoding detection failed for {filepath.name}: {e}")
        return 'utf-8'

# === CLEAN FILE ===
def clean_file(src_path, dest_path):
    try:
        # Handle empty files
        if src_path.stat().st_size == 0:
            raise ValueError("File is empty (0 bytes)")
            
        encoding = detect_encoding(src_path)
        
        # Read with fallback encodings
        try:
            with open(src_path, 'r', encoding=encoding, errors='replace') as f:
                raw_text = f.read()
        except UnicodeDecodeError:
            # Try fallback encodings
            for alt_enc in ['utf-16', 'latin-1', 'iso-8859-1', 'cp1252']:
                try:
                    with open(src_path, 'r', encoding=alt_enc, errors='replace') as f:
                        raw_text = f.read()
                    encoding = alt_enc  # Update encoding
                    break
                except:
                    continue
            else:
                raise UnicodeDecodeError("All encoding attempts failed")

        clean_text = strip_gutenberg_noise(raw_text)
        
        # Handle files with no gutenberg markers
        if clean_text == raw_text.strip():
            logger.warning(f"⚠️ No Gutenberg markers found in {src_path.name}")
            
        # Normalize whitespace (preserve paragraph breaks)
        clean_text = re.sub(r"[^\S\n]+", " ", clean_text)  # Compress non-newline whitespace
        clean_text = re.sub(r"\n{3,}", "\n\n", clean_text)  # Reduce excessive newlines
        clean_text = clean_text.strip()

        # Write processed file
        with open(dest_path, 'w', encoding='utf-8') as f:
            f.write(clean_text)

        return {
            "filename": src_path.name,
            "status": "cleaned",
            "encoding": encoding,
            "original_size": len(raw_text),
            "cleaned_size": len(clean_text),
            "char_diff": len(raw_text) - len(clean_text),
            "hash": hashlib.md5(clean_text.encode('utf-8')).hexdigest()
        }
    except Exception as e:
        logger.error(f"❌ Error cleaning {src_path.name}: {str(e)}")
        try:
            if src_path.exists():
                shutil.copy(src_path, CORRUPTED_DIR / src_path.name)
        except Exception as copy_error:
            logger.error(f"❌ Failed to copy corrupted file: {copy_error}")
            
        return {
            "filename": src_path.name,
            "status": "corrupted",
            "error_type": str(type(e).__name__),
            "error_msg": str(e),
            "encoding": None,
            "original_size": None,
            "cleaned_size": None,
            "char_diff": None,
            "hash": None
        }

# === MAIN PIPELINE ===
def run_phase1_cleaning():
    all_files = list(RAW_DIR.glob("*.txt"))
    total_files = len(all_files)
    logger.info(f"📁 Total raw files detected: {total_files}")
    meta_records = []

    for i, file_path in enumerate(all_files, 1):
        logger.info(f"🧹 Processing file {i}/{total_files}: {file_path.name}")
        dest_path = CLEANED_DIR / file_path.name
        record = clean_file(file_path, dest_path)
        meta_records.append(record)
        
        # Periodic saving
        if i % 100 == 0 or i == total_files:
            pd.DataFrame(meta_records).to_csv(META_CSV, index=False)
            logger.info(f"💾 Saved metadata checkpoint ({i} files processed)")

    # Final save
    pd.DataFrame(meta_records).to_csv(META_CSV, index=False)
    
    # Generate summary report
    cleaned_count = sum(1 for r in meta_records if r['status'] == 'cleaned')
    corrupted_count = sum(1 for r in meta_records if r['status'] == 'corrupted')
    
    logger.info(f"✅ Phase 1 cleaning complete")
    logger.info(f"📊 Results: {cleaned_count} cleaned, {corrupted_count} corrupted")
    logger.info(f"💾 Metadata saved to {META_CSV}")

if __name__ == "__main__":
    try:
        run_phase1_cleaning()
    except Exception as e:
        logger.critical(f"🛑 Fatal pipeline error: {str(e)}", exc_info=True)

[2025-07-27 14:22:52,372] INFO: 📁 Total raw files detected: 7535
[2025-07-27 14:22:52,372] INFO: 📁 Total raw files detected: 7535
[2025-07-27 14:22:52,375] INFO: 🧹 Processing file 1/7535: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 14:22:52,375] INFO: 🧹 Processing file 1/7535: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 14:22:52,454] INFO: 🧹 Processing file 2/7535: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 14:22:52,454] INFO: 🧹 Processing file 2/7535: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 14:22:52,534] INFO: 🧹 Processing file 3/7535: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 14:22:52,534] INFO: 🧹 Processing file 3/7535: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 14:22:52,639] INFO: 🧹 Processing file 4/7535: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 14:22:52,639] INFO: 🧹 Processing file 4/7535: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 14:22:52,788] INFO: 🧹 Processi

In [13]:
TAG_MAP = {
    # 🔱 I. Structural & Narrative Tags
    "#chapter_start": ["chapter", "beginning", "opening"],
    "#chapter_end": ["end", "conclusion", "closing"],
    "#dialogue": ["said", "asked", "replied", "shouted", "whispered"],
    "#quote": ["wisdom", "saying", "motto", "truth"],
    "#poem": ["verse", "stanza", "lyric", "rhyme"],
    "#story_within": ["legend", "tale", "myth", "fable"],
    "#dream": ["dream", "nightmare", "asleep", "slumber"],
    "#vision": ["vision", "glimpse", "premonition", "hallucination"],
    "#flashback": ["memory", "recall", "recollection", "past"],
    "#journal": ["diary", "account", "entry", "note"],
    "#scripture": ["scroll", "canon", "holy text", "inscription"],
    "#trial_log": ["testimony", "log", "trial record", "tribunal"],
    "#prophecy_scroll": ["prophecy", "destiny", "scroll", "seer"],
    
    # 🧙 II. Character & Role Tags
    "#character": ["he", "she", "they", "name", "hero", "man", "woman"],
    "#title": ["king", "queen", "lord", "duke", "emperor", "chief", "prince", "shaman"],
    "#hero": ["hero", "champion", "savior", "guardian"],
    "#villain": ["villain", "tyrant", "usurper", "nemesis", "betrayer"],
    "#mentor": ["mentor", "sage", "guide", "teacher", "guru"],
    "#outsider": ["stranger", "exile", "wanderer", "outcast"],

    # 🏰 III. Lore & Kingdom Tags
    "#noble_house": ["house", "clan", "dynasty", "family"],
    "#place": ["city", "kingdom", "realm", "land", "village", "fortress"],
    "#weapon": ["sword", "blade", "axe", "bow", "dagger", "staff", "spear"],
    "#artifact": ["relic", "amulet", "ring", "orb", "chalice", "talisman"],
    "#creature": ["beast", "dragon", "giant", "wolf", "goblin", "serpent", "demon"],
    "#god": ["god", "goddess", "deity", "divine", "pantheon", "lord"],

    # 🔮 IV. Mysticism & Magic Tags
    "#prophecy": ["prophecy", "fate", "destiny", "seer", "foretold", "omen"],
    "#magic": ["spell", "sorcery", "magic", "enchant", "hex", "potion"],
    "#curse": ["curse", "cursed", "plague", "blight", "doom"],
    "#curse_break": ["redeem", "break curse", "release", "liberation"],
    "#ritual": ["ritual", "sacrifice", "invocation", "ceremony", "rite"],
    "#vision": ["vision", "trance", "revelation", "oracle"],
    "#transformation": ["reborn", "shapeshift", "metamorph", "possession"],

    # ⚔️ V. Conflict & War Tags
    "#battle": ["battle", "war", "fight", "army", "blood", "clash"],
    "#duel": ["duel", "challenge", "arena", "contest"],
    "#rebellion": ["rebellion", "uprising", "revolt", "insurgency"],
    "#betrayal": ["betrayal", "treason", "deception", "traitor", "backstab"],
    "#oath": ["oath", "vow", "pledge", "promise"],
    "#trial": ["judgement", "trial", "test", "ordeal", "court"],
    "#alliance": ["alliance", "pact", "treaty", "union"],

    # 🕯️ VI. Culture, Lineage & Emotion Tags
    "#lineage": ["bloodline", "descendant", "ancestor", "heritage", "line"],
    "#legacy": ["legacy", "deed", "memory", "fame"],
    "#law": ["law", "edict", "commandment", "rule", "code"],
    "#festival": ["festival", "celebration", "feast", "ceremony"],
    "#death": ["death", "dead", "died", "slain", "funeral"],
    "#funeral": ["funeral", "burial", "cremation", "last rites"],
    "#emotion": ["grief", "joy", "rage", "sorrow", "longing", "love"],
    "#madness": ["madness", "insane", "lunacy", "hysteria"],
    "#sacrifice": ["sacrifice", "offering", "martyr"],

    # 🌌 VII. Cosmic & Dark Tags
    "#darkness": ["dark", "shadow", "abyss", "blackness", "night"],
    "#light": ["light", "glow", "shine", "radiant", "sun"],
    "#cosmic": ["cosmos", "void", "universe", "galaxy", "astral"],
    "#eldritch": ["eldritch", "horror", "insanity", "ancient"],
    "#divine_judgement": ["judgement", "punishment", "wrath", "fury"],
    "#underworld": ["hell", "underworld", "nether", "abyss"],
    "#ascension": ["ascend", "rise", "heaven", "nirvana"],

    # 📜 VIII. Meta-Narrative Tags
    "#editor_note": ["editor", "note", "annotation"],
    "#translation_note": ["translated", "language", "script"],
    "#annotation": ["comment", "footnote", "aside"],
    "#prologue": ["prologue", "before", "origin"],
    "#epilogue": ["epilogue", "after", "endnote"],

    # 🧾 IX. Esoteric & Forbidden Themes
    "#forbidden": ["forbidden", "taboo", "unspoken"],
    "#occult": ["occult", "esoteric", "mystery", "arcane"],
    "#blood_magic": ["blood", "ritual", "sacrifice", "dark spell"],
    "#necromancy": ["necromancer", "undead", "raise", "grave"],
    "#summoning": ["summon", "conjure", "invoke", "entity"],
    "#coven": ["witch", "coven", "sisters", "circle"],
    "#pact": ["deal", "contract", "bargain", "devil"],

    # 💠 X. Power & Divine Tags
    "#divine_intervention": ["divine", "miracle", "godly", "intervene"],
    "#blessing": ["blessing", "grace", "boon", "gift"],
    "#relic": ["relic", "artifact", "sacred", "holy"],
    "#throne": ["throne", "crown", "seat", "rule"],
    "#heir": ["heir", "succession", "inherit", "next in line"],

    # 🔧 XI. Utility / Functional Tags
    "#metadata": ["title", "author", "date", "source"],
    "#unknown": ["???", "undefined", "unknown"],
    "#breakpoint": ["===BREAK===", "---", "==="],
}


In [14]:
TAG_CLASSES = {
    # I. Structural & Narrative
    "structure": [
        "#chapter_start", "#chapter_end", "#dialogue", "#quote", "#poem", "#story_within",
        "#dream", "#vision", "#flashback", "#journal", "#scripture", "#trial_log", "#prophecy_scroll"
    ],

    # II. Character & Role
    "character": [
        "#character", "#title", "#hero", "#villain", "#mentor", "#outsider"
    ],

    # III. Lore & Kingdom
    "lore": [
        "#noble_house", "#place", "#weapon", "#artifact", "#creature", "#god"
    ],

    # IV. Mysticism & Magic
    "magic": [
        "#prophecy", "#magic", "#curse", "#curse_break", "#ritual", "#vision", "#transformation"
    ],

    # V. Conflict & War
    "war": [
        "#battle", "#duel", "#rebellion", "#betrayal", "#oath", "#trial", "#alliance"
    ],

    # VI. Culture, Emotion, Lineage
    "culture": [
        "#lineage", "#legacy", "#law", "#festival", "#death", "#funeral",
        "#emotion", "#madness", "#sacrifice"
    ],

    # VII. Cosmic & Dark
    "cosmic": [
        "#darkness", "#light", "#cosmic", "#eldritch", "#divine_judgement",
        "#underworld", "#ascension"
    ],

    # VIII. Meta-Narrative
    "meta": [
        "#editor_note", "#translation_note", "#annotation", "#prologue", "#epilogue"
    ],

    # IX. Esoteric & Forbidden
    "occult": [
        "#forbidden", "#occult", "#blood_magic", "#necromancy", "#summoning", "#coven", "#pact"
    ],

    # X. Power & Divine
    "divine": [
        "#divine_intervention", "#blessing", "#relic", "#throne", "#heir"
    ],

    # XI. Utility / System
    "utility": [
        "#metadata", "#unknown", "#breakpoint"
    ]
}


In [17]:
from pathlib import Path
import logging
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from collections import defaultdict, Counter

# ================= CONFIG =====================
BASE_DIR = Path("C:/Users/ayush/OneDrive/Desktop/Stories")
CLEANED_DIR = BASE_DIR / "cleaned"
TAGGED_DIR = BASE_DIR / "tagged_phase2"
TAGGED_DIR.mkdir(parents=True, exist_ok=True)

BERT_MODEL = "mrm8488/bert-mini-finetuned-ner"
LOG_FILE = BASE_DIR / "phase2_semantic_tagging.log"
TAG_MATRIX_CSV = BASE_DIR / "tag_matrix.csv"
TAG_DENSITY_CSV = BASE_DIR / "tag_density_log.csv"
TOP_TAG_PLOT = BASE_DIR / "top_tags_barplot.png"
HEATMAP_PLOT = BASE_DIR / "tag_heatmap.png"

# Import TAG_MAP and TAG_CLASSES externally if split
from Hybrid_Tagger_Pipeline import TAG_CLASSES, TAG_MAP

ALL_TAGS = list(TAG_MAP.keys())

# ================= LOGGING =====================
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    filemode='w'
)
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())

# ================= LOAD BERT =====================
def load_bert_tagger():
    logger.info("📥 Loading BERT semantic tagger...")
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
    model = AutoModelForTokenClassification.from_pretrained(BERT_MODEL)
    tagger = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
    logger.info("✅ BERT tagger loaded")
    return tagger

# ============== RULE-BASED TAGGING ==============
def apply_rule_tags(text):
    matched = set()
    lowered = text.lower()
    for tag, keywords in TAG_MAP.items():
        for kw in keywords:
            if re.search(rf"\\b{re.escape(kw.lower())}\\b", lowered):
                matched.add(tag)
                break
    return list(matched)

# ============== BERT-BASED TAGGING ==============
def apply_bert_tags(tagger, text):
    outputs = tagger(text)
    words = [entity['word'].lower() for entity in outputs]
    matched = set()
    for tag in ALL_TAGS:
        if any(kw in w for w in words for kw in TAG_MAP[tag]):
            matched.add(tag)
    return list(matched)

# ============== PIPELINE LOGIC ==================
def tag_corpus():
    tagger = load_bert_tagger()
    density_log = []
    tag_counter = Counter()
    cooccur_matrix = defaultdict(lambda: defaultdict(int))

    for i, file_path in enumerate(sorted(CLEANED_DIR.glob("*.txt"))):
        logger.info(f"[{i+1}] 🗂️ Tagging file: {file_path.name}")
        text = file_path.read_text(encoding='utf-8', errors='ignore')
        rule_tags = apply_rule_tags(text)
        bert_tags = apply_bert_tags(tagger, text)
        combined_tags = sorted(set(rule_tags + bert_tags))

        # Write new file
        new_file = TAGGED_DIR / file_path.name
        tag_block = "\n".join(combined_tags) + "\n\n"
        new_file.write_text(tag_block + text, encoding='utf-8')

        # Stats
        word_count = len(text.split())
        density = round(len(combined_tags) / max(word_count, 1), 4)
        for t in combined_tags:
            tag_counter[t] += 1
        for t1 in combined_tags:
            for t2 in combined_tags:
                if t1 != t2:
                    cooccur_matrix[t1][t2] += 1

        density_log.append({
            "file": file_path.name,
            "word_count": word_count,
            "num_tags": len(combined_tags),
            "density": density,
            "tags": ", ".join(combined_tags)
        })

    # Save CSVs
    pd.DataFrame(density_log).to_csv(TAG_DENSITY_CSV, index=False)
    logger.info(f"📊 Saved density log to {TAG_DENSITY_CSV}")
    pd.DataFrame.from_dict(tag_counter, orient='index', columns=['count']).sort_values(by='count', ascending=False).to_csv(TAG_MATRIX_CSV)
    logger.info(f"📈 Saved tag frequency matrix to {TAG_MATRIX_CSV}")

    # Plot
    top_tags = tag_counter.most_common(30)
    tags, counts = zip(*top_tags)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=list(tags), y=list(counts))
    plt.xticks(rotation=45)
    plt.title("Top 30 Tags by Frequency")
    plt.tight_layout()
    plt.savefig(TOP_TAG_PLOT)
    logger.info(f"🖼️ Saved tag barplot to {TOP_TAG_PLOT}")

    # Heatmap
    df_heatmap = pd.DataFrame(cooccur_matrix).fillna(0)
    plt.figure(figsize=(14, 10))
    sns.heatmap(df_heatmap, cmap="YlGnBu", linewidths=.5)
    plt.title("Tag Co-occurrence Heatmap")
    plt.tight_layout()
    plt.savefig(HEATMAP_PLOT)
    logger.info(f"🧊 Saved tag co-occurrence heatmap to {HEATMAP_PLOT}")

if __name__ == "__main__":
    tag_corpus()


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'Hybrid_Tagger_Pipeline'

In [16]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.54.0-py3-none-any.whl (11.2 MB)
   ---------------------------------------- 0.0/11.2 MB ? eta -:--:--
   ----- ---------------------------------- 1.6/11.2 MB 9.3 MB/s eta 0:00:02
   ------------- -------------------------- 3.7/11.2 MB 9.5 MB/s eta 0:00:01
   -------------------- ------------------- 5.8/11.2 MB 9.5 MB/s eta 0:00:01
   ------------------------------ --------- 8.4/11.2 MB 10.2 MB/s eta 0:00:01
   ---------------------------------------- 11.2/11.2 MB 11.3 MB/s eta 0:00:00
Download

In [20]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 2.2/2.2 MB 17.6 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets

   -------------------------- ------------- 2/3 [ipywidgets]
   ---------------------------------------- 3/3 [ipywidgets]

Successfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14


In [9]:
from pathlib import Path
for file in Path("tagged_phase2").glob("*.txt"):
    content = file.read_text(encoding="utf-8")
    if "#chapter_start" not in content or "#chapter_end" not in content:
        print(f"❌ Missing block tags: {file.name}")

In [5]:
from pathlib import Path
import logging
import random
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import json

# ======================= CONFIGURATION =======================
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
TAGGED_DIR = BASE_DIR / "tagged_phase2"
PHASE3_DIR = BASE_DIR / "split_phase3"
PHASE3_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = PHASE3_DIR / "train.txt"
VAL_PATH = PHASE3_DIR / "val.txt"
META_CSV = PHASE3_DIR / "split_metadata.csv"
TRAIN_FILES = PHASE3_DIR / "train_files.csv"
VAL_FILES = PHASE3_DIR / "val_files.csv"
BALANCE_REPORT = PHASE3_DIR / "split_balance_report.txt"
BALANCE_PLOT = PHASE3_DIR / "tag_balance_val_vs_train.png"
PHASE3_LOG = PHASE3_DIR / "phase3_split.log"

SPLIT_RATIO = 0.95
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

logging.basicConfig(
    filename=str(PHASE3_LOG),
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    filemode='w'
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)
logger = logging.getLogger()

# ======================= HELPERS =======================
def hash_block(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def parse_chapter_blocks(file_path):
    with open(file_path, encoding='utf-8') as f:
        content = f.read()
    blocks = []
    block_lines = []
    in_block = False
    for line in content.splitlines():
        if line.strip() == "#chapter_start":
            block_lines = [line]
            in_block = True
        elif line.strip() == "#chapter_end":
            block_lines.append(line)
            blocks.append("\n".join(block_lines))
            in_block = False
        elif in_block:
            block_lines.append(line)
    return blocks

def extract_metadata(block, filename):
    lines = block.splitlines()
    tags = set(l.strip() for l in lines if l.startswith("#") and not l in ["#chapter_start", "#chapter_end"])
    tag_counts = Counter(tags)
    token_count = len(block.split())
    return {
        "filename": filename,
        "block_hash": hash_block(block),
        "tokens": token_count,
        "entropy": len(tags),
        "tags": list(tags),
        "tag_counts": json.dumps(tag_counts),
        "text": block
    }

# ======================= MAIN PIPELINE =======================
def main():
    all_files = sorted(TAGGED_DIR.glob("*.txt"))
    block_records = []
    logger.info(f"📂 Total files to parse: {len(all_files)}")

    for idx, file in enumerate(all_files, 1):
        logger.info(f"🔍 Parsing file {idx}/{len(all_files)}: {file.name}")
        try:
            blocks = parse_chapter_blocks(file)
            for block in blocks:
                meta = extract_metadata(block, file.name)
                block_records.append(meta)
        except Exception as e:
            logger.error(f"❌ Failed to parse {file.name}: {e}")

    df = pd.DataFrame(block_records)
    df.to_csv(META_CSV, index=False)
    logger.info(f"📊 Metadata CSV saved: {META_CSV}")

    # Stratified shuffling based on entropy
    df = df.sort_values("entropy")
    bucketed = []
    BUCKETS = 5
    bucket_size = len(df) // BUCKETS
    for i in range(BUCKETS):
        start = i * bucket_size
        end = (i + 1) * bucket_size if i < BUCKETS - 1 else len(df)
        bucket = df.iloc[start:end].sample(frac=1, random_state=RANDOM_SEED)
        bucketed.append(bucket)
    df_shuffled = pd.concat(bucketed).reset_index(drop=True)

    total_tokens = df_shuffled["tokens"].sum()
    train_tokens_target = total_tokens * SPLIT_RATIO
    cum_tokens, train_idx = 0, 0
    for i, row in df_shuffled.iterrows():
        cum_tokens += row["tokens"]
        if cum_tokens >= train_tokens_target:
            train_idx = i
            break

    df_train = df_shuffled.iloc[:train_idx+1]
    df_val = df_shuffled.iloc[train_idx+1:]

    with open(TRAIN_PATH, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(df_train["text"]))
    with open(VAL_PATH, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(df_val["text"]))

    df_train[["filename", "block_hash"]].to_csv(TRAIN_FILES, index=False)
    df_val[["filename", "block_hash"]].to_csv(VAL_FILES, index=False)

    # Tag balance
    def flatten_tags(tag_list):
        return [tag for sublist in tag_list for tag in sublist]
    train_tag_freq = Counter(flatten_tags(df_train["tags"]))
    val_tag_freq = Counter(flatten_tags(df_val["tags"]))

    all_tags = sorted(set(train_tag_freq.keys()).union(set(val_tag_freq.keys())))
    balance_df = pd.DataFrame({
        "tag": all_tags,
        "train_count": [train_tag_freq.get(t, 0) for t in all_tags],
        "val_count": [val_tag_freq.get(t, 0) for t in all_tags]
    })
    balance_df.to_csv(PHASE3_DIR / "tag_balance.csv", index=False)

    # Plot tag comparison
    balance_df.set_index("tag").plot(kind="bar", figsize=(14, 6), title="Tag Balance: Train vs Val")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(BALANCE_PLOT)
    plt.close()

    # Summary Log
    with open(BALANCE_REPORT, 'w', encoding='utf-8') as f:
        f.write(f"Total Blocks: {len(df)}\n")
        f.write(f"Train Blocks: {len(df_train)}, Tokens: {df_train['tokens'].sum()}\n")
        f.write(f"Val Blocks:   {len(df_val)}, Tokens: {df_val['tokens'].sum()}\n")
        f.write(f"Train Mean Entropy: {df_train['entropy'].mean():.2f}\n")
        f.write(f"Val Mean Entropy:   {df_val['entropy'].mean():.2f}\n")
        f.write("\nMissing Tags in Val:\n")
        missing_tags = [t for t in all_tags if val_tag_freq.get(t, 0) == 0]
        for t in missing_tags:
            f.write(f"❌ {t}\n")

    logger.info("\n🎯 PHASE 3 SEMANTIC SPLIT COMPLETE")
    logger.info(f"📂 Train set saved: {TRAIN_PATH}")
    logger.info(f"📂 Val set saved: {VAL_PATH}")
    logger.info(f"📊 Metadata: {META_CSV}")
    logger.info(f"🧠 Balance report: {BALANCE_REPORT}")
    logger.info(f"🖼️ Tag Plot: {BALANCE_PLOT}")

if __name__ == "__main__":
    main()


[2025-07-27 01:06:42,955] INFO: 📂 Total files to parse: 7524
[2025-07-27 01:06:42,955] INFO: 📂 Total files to parse: 7524
[2025-07-27 01:06:42,955] INFO: 🔍 Parsing file 1/7524: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 01:06:42,955] INFO: 🔍 Parsing file 1/7524: 01 Harry Potter and the Sorcerers Stone.txt
[2025-07-27 01:06:43,032] INFO: 🔍 Parsing file 2/7524: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 01:06:43,032] INFO: 🔍 Parsing file 2/7524: 02 Harry Potter and the Chamber of Secrets.txt
[2025-07-27 01:06:43,074] INFO: 🔍 Parsing file 3/7524: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 01:06:43,074] INFO: 🔍 Parsing file 3/7524: 03 Harry Potter and the Prisoner of Azkaban.txt
[2025-07-27 01:06:43,108] INFO: 🔍 Parsing file 4/7524: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 01:06:43,108] INFO: 🔍 Parsing file 4/7524: 04 Harry Potter and the Goblet of Fire.txt
[2025-07-27 01:06:43,142] INFO: 🔍 Parsing file 5/7524: 05 Harry Potter and