In [None]:
import os
import requests
import time
import random
import re
import hashlib
import pandas as pd
from pathlib import Path
from urllib.parse import quote

# === CONFIGURATION ===
# Directory the books are written to. It is created as a side effect of running
# this cell (parents included) if it does not exist yet.
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
BASE_DIR.mkdir(parents=True, exist_ok=True)
# CSV file the main script writes its per-book log to.
CSV_LOG = BASE_DIR / "downloaded_books_log.csv"
# Stop conditions of the main loop: total accepted bytes, or number of accepted books.
TARGET_SIZE = 10 * 1024 * 1024 * 1024  # 10 GiB, expressed in bytes
MAX_BOOKS = 15000
# Seconds slept after every book attempt; also the base of the linearly growing
# pause between failed search retries (DELAY * attempt number).
DELAY = 1.5
TIMEOUT = 25  # timeout in seconds handed to every requests.get call
RETRIES = 3  # attempts per search request in get_gutendex_books
# Sent as the User-Agent header on both search and download requests.
USER_AGENT = "Mozilla/5.0 (compatible; EchoDownloader/1.0; +https://chat.openai.com)"


# === THEMES AND KEYWORDS ===
# Maps a theme label to the search keywords that belong to it. The main script
# picks a theme, then one of its keywords, sends the keyword (URL-quoted) as the
# `search` term of the book query, and records the theme label in the CSV log.
# NOTE(review): many keywords below contain underscores ("frost_giant") and a few
# appear to use a non-ASCII hyphen ("plague‑doctor"). They are sent to the search
# endpoint verbatim - confirm that such terms can match anything at all.
THEMES = {
    "I_Power_Magic_Mysticism": [
        "magic", "sorcery", "prophecy", "rituals", "curses", 
        "alchemy", "occult", "relics"
    ],
    "II_Conflict_War_Betrayal": [
        "war", "betrayal", "duel", "rebellion", "vendetta", 
        "oath", "dynasty"
    ],
    "III_Legacy_Lineage_Civilization": [
        "legacy", "lineage", "kingdom", "royalty", "law", 
        "civilization", "tradition"
    ],
    "IV_Inner_Worlds_Philosophy_Tragedy": [
        "dream", "madness", "fate", "grief", "identity", 
        "sacrifice", "philosophy"
    ],
    "V_Death_Afterlife_Beyond": [
        "death", "afterlife", "reincarnation", "ghost", "soul", 
        "immortality", "underworld"
    ],
    "VI_Mythical_Creatures_Beings": [
        "dragon", "demon", "monster", "deity", "chimera", 
        "eldritch", "spirit"
    ],
    "VII_Cosmic_Esoteric": [
        "cosmic", "apocalypse", "timeline", "multiverse", "creation", 
        "zodiac", "astral"
    ],
    "VIII_Erotic_Taboo_Forbidden": [
        "erotic", "seduction", "taboo", "tantra", "temptation", 
        "androgyny", "blasphemy"
    ],
    "IX_Storytelling_Meta": [
        "bard", "epic", "narrator", "chant", "scroll", 
        "wisdom", "testimony"
    ],
    "X_Harbingers_Plague_Decay": [
        "plague", "pestilence", "miasma", "bubonic", "plague‑doctor",
        "contagion", "quarantine", "danse_pest", "corpsefire", "rot‑gospel"
    ],
    "XI_Arctic_Frost_Myths": [
        "frost_giant", "jotunheim", "permafrost", "icebound", "snow_wraith",
        "whiteout", "aurora_omen", "glacial_throne", "tundra_spirit", "frostbite_prophecy"
    ],
    "XII_Desert_Djinn_Nomad_Legends": [
        "djinn", "ifrit", "ghul‑sand", "caravansary", "oasis_oracle",
        "sandstorm_divination", "desert_glass", "crimson_dune",
        "date_palm_scroll", "mirage_warrior"
    ],
    "XIII_Clockwork_Alchemy_Steelmage": [
        "automaton", "brass_gear", "clockwork_heart", "aludel", "voltaic_arc",
        "steam_sigil", "gearfolk", "mercury_rune", "cogsong", "chrono‑anvil"
    ],
    "XIV_Bloodlines_Vampiric_Courts": [
        "crimson_pact", "blood_tithe", "night_throne", "sanguine_rite", "coffin_liege",
        "dusk_sovereign", "hematophage", "thrall_pact", "crimson_heir", "bloodborne_oath"
    ],
    "XV_Deep_Sea_Abyssal_Terrors": [
        "leviathan", "angler_demon", "trench_cathedral", "abyssal_siren", "kraken_call",
        "black_wave", "pressure_doom", "coral_tomb", "drowned_kingdom", "abyss_echo"
    ],
    "XVI_Forest_Primeval_Green_Dread": [
        "eldgrove", "vine_witch", "rootbound", "barkskin_curse", "sylvan_ruin",
        "ironwood_heart", "pollen_fever", "thorn_maiden", "moss_oracle", "canopy_reaper"
    ],
    "XVII_Pagan_Fire_Sun_Cults": [
        "solar_disciple", "firewalk_rite", "sun_flayer", "ember_orison", "ash_relic",
        "blaze_sacrament", "helios_chant", "pyre_judgement", "dawn_martyr", "coalskin"
    ],
    "XVIII_Mask_Carnival_Doppelganger": [
        "masquerade_plague", "faceless_ball", "mirror_mask", "persona_thief", "carnival_hex",
        "identity_rift", "harlequin_shade", "skinborrow", "twin_curse", "phantasm_mask"
    ],
    "XIX_Geomancy_Earthblood_Obsidian": [
        "stone_shaper", "obsidian_tears", "quake_prophet", "basalt_throne", "magma_vow",
        "crystal_seer", "cavern_anthem", "faultline_omen", "tectonic_rite", "geode_heart"
    ],
    "XX_Storm_Lords_Heavens_Wrath": [
        "tempest_king", "thunder_rite", "skyforge", "lightning_sigil", "cyclone_oracle",
        "stormward", "cloudpiercer", "rain_fury", "monsoon_court", "electro_shaman"
    ],
    "XXI_Dream_Eater_Surreal_Horror": [
        "night_hag", "dream_eater", "sleep_paralysis_demon", "REM_maze", "nightmare_ink",
        "mind_palace_fall", "lucid_captor", "figment_slayer", "oneiro_shard", "slumber_void"
    ],
    "XXII_Heroic_Light_Epics": [
        "heroic_quest", "grail", "paladin", "sunblade", "dragonslayer",
        "virtue_trial", "valor", "honor_duel", "chivalry", "knighthood"
    ],
    "XXIII_Celestial_Angelic_Legends": [
        "seraphim", "cherubim", "archangel", "heavenly_host", "silver_trumpet",
        "halo", "emanation", "beatitude", "divine_chorus", "lambent_aura"
    ],
    "XXIV_Pastoral_Faerie_Wonder": [
        "faerie_ring", "willow_wisp", "moonlit_meadow", "forest_sprite", "dawn_chorus",
        "spring_equinoct", "harvest_festival", "cornucopia", "blossom_spirit", "brook_nymph"
    ],
    "XXV_Renewal_Hope_Redemption": [
        "phoenix_ash", "rebirth_ritual", "guiding_light", "forgiven_oath", "cleansing_rain",
        "reconciliation", "mercy", "healing_song", "peace_treaty", "dawn_break"
    ],
    "XXVI_Comic_Trickster_Folklore": [
        "anansi", "coyote_trickster", "hare_fable", "story_trick", "prankster_god",
        "wise_fool", "jester_lore", "folly_tale", "merry_bard", "laughing_spirit"
    ],
    "XXVII_Nihil_Despair_Doom": [
        "void_worship", "entropy_cult", "heat_death", "black_sun", "nihilist_sermon",
        "world_eater", "sorrow_scripture", "endless_night", "hopeless_dirge", "soul_ash"
    ],
    "XXVIII_Body_Horror_Vivisection": [
        "vivisect", "meat_puppet", "bone_bloom", "chitin_seraph", "skull_throne",
        "flesh_alembic", "gore_gospel", "skin_tapestry", "marrow_scrawl", "sinew_forge"
    ],
    "XXIX_Cannibal_Totem_Prime": [
        "cannibal_rite", "blood_stew", "manflesh_feast", "bone_marrow_clan", "totem_hunger",
        "scarification_tribe", "headhunter", "skull_totem", "feral_banquet", "sinew_song"
    ],
    "XXX_Unhinged_Madness_Esoterica": [
        "eldritch_mania", "fractured_psalm", "lunacy_ritual", "insanity_chant", "madness_scroll",
        "void_whisper", "gibbering_tongue", "shattered_mind", "chaos_gnosis", "schizophrenic_prophecy"
    ],
    "XXXI_Necropolis_Undead_Empire": [
        "necropolis_emperor", "bone_legion", "corpse_citadel", "crypt_king", "grave_savant",
        "deathmonger", "ghastly_parade", "lich_monarchy", "hollow_regent", "sepulcher_court"
    ],
    "XXXII_Infernal_Torture_Pits": [
        "soul_furnace", "hellforge", "pain_artist", "agonic_script", "screaming_pit",
        "iron_maiden", "torment_pact", "blood_iron", "rackmaster", "anguish_litany"
    ]
}
# === HELPERS ===
def sanitize_filename(name):
    """Return *name* reduced to something usable as part of a file name.

    Each run of control characters (CR, LF, tab, ...) becomes a single space,
    the characters that Windows forbids in file names (``\\ / * ? : " < > |``)
    are dropped, and the result is cut to at most 100 characters.

    Args:
        name: Arbitrary title text.

    Returns:
        The cleaned text; may be empty if nothing usable remains.
    """
    # Titles may carry embedded line breaks; left in place they make the path
    # invalid, so flatten them before removing the reserved punctuation.
    name = re.sub(r"[\x00-\x1f\x7f]+", " ", name)
    return re.sub(r'[\\/*?:"<>|]', "", name)[:100]

def get_gutendex_books(keyword, page=1):
    """Fetch one page of English-language search results from gutendex.com.

    The request is tried up to ``RETRIES`` times, pausing ``DELAY * attempt``
    seconds between attempts.

    Args:
        keyword: Search term; it is URL-quoted before being placed in the query.
        page: 1-based result page number.

    Returns:
        The decoded JSON response, or ``{"results": [], "next": None}`` when
        every attempt failed.
    """
    url = f"https://gutendex.com/books/?search={quote(keyword)}&languages=en&page={page}"
    for attempt in range(1, RETRIES + 1):
        try:
            r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
            r.raise_for_status()
            return r.json()
        # Network/HTTP failures and undecodable bodies are retried; anything
        # else is a programming error and should surface.
        except (requests.RequestException, ValueError) as e:
            print(f"[RETRY {attempt}/{RETRIES}] Keyword '{keyword}' failed | {e}")
            # No point in waiting once there is no further attempt to wait for.
            if attempt < RETRIES:
                time.sleep(DELAY * attempt)
    return {"results": [], "next": None}

def get_text_url(formats):
    """Pick the plain-text download URL out of a book's ``formats`` mapping.

    Preference order:
      1. the exact keys ``text/plain; charset=utf-8`` then ``text/plain``;
      2. any other ``text/plain`` variant (e.g. a different charset);
      3. any URL that looks like a text file (ends in ``.txt`` or contains
         ``.txt.``).

    Args:
        formats: Mapping of MIME type to URL; may be empty or ``None``.

    Returns:
        The chosen URL, or ``None`` when no plain-text candidate exists.
    """
    if not formats:
        return None

    for mime in ("text/plain; charset=utf-8", "text/plain"):
        if mime in formats:
            return formats[mime]

    # Other charsets are still plain text even when the URL has no .txt in it.
    for mime, url in formats.items():
        if url and mime.lower().startswith("text/plain"):
            return url

    for url in formats.values():
        # Guard against null/non-string values before calling str methods.
        if isinstance(url, str) and (url.endswith(".txt") or ".txt." in url):
            return url
    return None

def file_md5(filepath):
    """Return the hexadecimal MD5 digest of the bytes stored at *filepath*."""
    digest = hashlib.md5()
    with open(filepath, "rb") as handle:
        # Feed the hash block by block; the digest equals that of the whole file.
        for block in iter(lambda: handle.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()

def download_book(book_id, title, url):
    """Download one book into ``BASE_DIR`` unless it is already there.

    The target file is ``<book_id>_<sanitized title>.txt``. When that file
    exists it is treated as a cache hit and nothing is fetched.

    Args:
        book_id: Identifier used as the file-name prefix.
        title: Book title; sanitized before being used in the file name and
            printed verbatim in the failure message.
        url: Address of the plain-text file.

    Returns:
        ``(size_in_bytes, md5_hex)`` for the file on disk, or ``(0, None)`` when
        the download or the write failed.
    """
    clean_title = sanitize_filename(title)
    filepath = BASE_DIR / f"{book_id}_{clean_title}.txt"

    if filepath.exists():
        return filepath.stat().st_size, file_md5(filepath)

    # Write next to the target and rename afterwards, so an interrupted write
    # can never leave a truncated file that later passes for a cached book.
    partial = filepath.with_name(filepath.name + ".part")
    try:
        r = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=TIMEOUT)
        r.raise_for_status()
        payload = r.content
        partial.write_bytes(payload)
        partial.replace(filepath)
        # Hash the bytes already in memory instead of reading the file back.
        return len(payload), hashlib.md5(payload).hexdigest()
    except (requests.RequestException, OSError) as e:
        print(f"[FAIL] {title} | {e}")
        try:
            partial.unlink()
        except OSError:
            pass  # nothing was written, or it cannot be removed; either way give up quietly
        return 0, None

# === MAIN SCRIPT ===
# Walks every (theme, keyword) pair once, in random order, fetching up to three
# result pages per keyword and downloading each new book, until TARGET_SIZE bytes
# or MAX_BOOKS books have been accepted or the keyword list is exhausted.
if __name__ == "__main__":
    total_downloaded = 0
    book_log = []
    book_hashes = set()
    downloaded_books = set()
    # Books that are on disk but were rejected (too small / duplicate content);
    # remembering them avoids re-checking the same file under every keyword.
    rejected_books = set()
    file_counter = 0
    book_counter = 0

    print(f"\n📦 Target: {TARGET_SIZE / (1024**3):.2f} GB or {MAX_BOOKS} books max")

    # Re-drawing a keyword only re-fetches the same three pages, so a single
    # shuffled pass reaches the same books and is guaranteed to terminate.
    search_queue = [
        (theme, keyword)
        for theme, keywords in THEMES.items()
        for keyword in keywords
    ]
    random.shuffle(search_queue)

    # True when the most recent request was a search that has not been followed
    # by a pause yet; keeps back-to-back searches from hammering the API.
    pause_before_search = False

    try:
        for theme, keyword in search_queue:
            if total_downloaded >= TARGET_SIZE or len(downloaded_books) >= MAX_BOOKS:
                break

            print(f"\n🔍 Searching: [{theme}] → '{keyword}'")

            for page in range(1, 4):
                if pause_before_search:
                    time.sleep(DELAY)
                data = get_gutendex_books(keyword, page)
                pause_before_search = True
                results = data.get("results", [])
                print(f"📖 Page {page}: Found {len(results)} results")

                for book in results:
                    book_id = book["id"]
                    if book_id in downloaded_books or book_id in rejected_books:
                        continue

                    url = get_text_url(book.get("formats") or {})
                    if not url:
                        continue

                    # Also covers a title that is present but null/empty.
                    title = book.get("title") or f"Book_{book_id}"
                    book_counter += 1
                    print(f"📘 [{book_counter}] Attempting: {title}")

                    size, md5 = download_book(book_id, title, url)

                    # Accept only files above 10 KiB whose content is new.
                    if size > 10 * 1024 and md5 not in book_hashes:
                        downloaded_books.add(book_id)
                        book_hashes.add(md5)
                        total_downloaded += size
                        file_counter += 1
                        book_log.append({
                            "id": book_id,
                            "title": title,
                            "theme": theme,
                            "url": url,
                            "size_kb": round(size / 1024, 2),
                            "md5": md5
                        })
                        print(f"✅ Downloaded [{file_counter}/{MAX_BOOKS}]: {title} | {size/1024:.1f} KB | Total: {total_downloaded / (1024**3):.2f} GB")
                    else:
                        # size == 0 means the download failed; leave such books
                        # eligible for another try under a later keyword.
                        if size:
                            rejected_books.add(book_id)
                        print(f"❌ Skipped: {title} | Size: {size / 1024:.1f} KB")

                    if total_downloaded >= TARGET_SIZE or len(downloaded_books) >= MAX_BOOKS:
                        break
                    time.sleep(DELAY)
                    pause_before_search = False

                if total_downloaded >= TARGET_SIZE or len(downloaded_books) >= MAX_BOOKS or not data.get("next"):
                    break
    finally:
        # Runs on normal completion and on interruption alike, so hours of
        # progress are not lost when the run is stopped by hand.
        df = pd.DataFrame(book_log)
        df.to_csv(CSV_LOG, index=False)
        print(f"\n📊 Log saved to {CSV_LOG}")
        print(f"🎉 Downloaded {file_counter} unique books ({total_downloaded / (1024**3):.2f} GB) total.")


📦 Target: 10.00 GB or 15000 books max

🔍 Searching: [VIII_Erotic_Taboo_Forbidden] → 'temptation'
📖 Page 1: Found 17 results
📘 [1] Attempting: Reuben Roy's temptations
✅ Downloaded [1/15000]: Reuben Roy's temptations | 145.2 KB | Total: 0.00 GB
📘 [2] Attempting: The Temptation of St. Anthony
✅ Downloaded [2/15000]: The Temptation of St. Anthony | 323.8 KB | Total: 0.00 GB
📘 [3] Attempting: A Terrible Temptation: A Story of To-Day
✅ Downloaded [3/15000]: A Terrible Temptation: A Story of To-Day | 767.4 KB | Total: 0.00 GB
📘 [4] Attempting: The Temptation of St. Antony; Or, A Revelation of the Soul
✅ Downloaded [4/15000]: The Temptation of St. Antony; Or, A Revelation of the Soul | 271.1 KB | Total: 0.00 GB
📘 [5] Attempting: Danger! A True History of a Great City's Wiles and Temptations: The Veil Lifted, and Light Thrown on Crime and its Causes, and Criminals and their Haunts. Facts and Disclosures.
✅ Downloaded [5/15000]: Danger! A True History of a Great City's Wiles and Temptations: T

In [None]:
9+0

In [None]:
import os
import re
import hashlib
import shutil
import logging
from pathlib import Path
from chardet import detect
import pandas as pd

# ========== CONFIGURATION ==========
BASE_DIR = Path(r"C:\Users\ayush\OneDrive\Desktop\Stories")
# Input tree: every *.txt below this directory is processed (searched recursively).
# NOTE(review): the downloader cell earlier in this notebook saves books directly
# under BASE_DIR, not under "raw_files" - confirm the files are moved here first.
RAW_DIR = BASE_DIR / "raw_files"
# Output directory for the cleaned texts.
CLEANED_DIR = BASE_DIR / "cleaned_data"
# Text file listing every input skipped as a byte-identical duplicate.
DUP_LOG = BASE_DIR / "duplicates_log.txt"
# Per-file report written at the end of the run.
# NOTE(review): the downloader cell uses the same name CSV_LOG for a different
# file; running this cell rebinds it.
CSV_LOG = BASE_DIR / "cleaning_report.csv"

# Only the output directory is created here; RAW_DIR is expected to exist already.
CLEANED_DIR.mkdir(parents=True, exist_ok=True)

# ========== LOGGING SETUP ==========
# Messages go both to a log file (opened with mode 'w', i.e. truncated) and to the
# console stream.
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, which is presumably the case when this cell is
# re-run in the same session - verify if the log file looks stale.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(BASE_DIR / "phase1_cleaning.log", mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

# ========== TRACKING STATS ==========
report_data = []  # one dict per input file; becomes the CSV report
seen_hashes = {}  # SHA-256 hex digest -> name of the first file seen with that content
total_files = 0  # number of input files found
processed_files = 0  # number of cleaned files written
duplicates_found = 0  # number of inputs skipped as duplicates

# ========== UTILITY FUNCTIONS ==========
def compute_sha256(file_path):
    """Return the hexadecimal SHA-256 digest of the file at *file_path*."""
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        hasher.update(stream.read())
    return hasher.hexdigest()

def detect_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read and handed to chardet's ``detect``.

    Returns:
        The detected encoding name, or ``'utf-8'`` when the detector reports
        none.
    """
    with open(file_path, 'rb') as stream:
        raw_bytes = stream.read()
    guess = detect(raw_bytes)['encoding']
    if not guess:
        return 'utf-8'
    return guess

def normalize_text(text):
    """Strip editorial clutter from a book text and normalise its spacing.

    Steps, in order:
      * remove ``[Illustration ...]`` tags (with or without a caption, also when
        the caption spans several lines) and ``Page N`` markers;
      * replace ``*** ... START ... ***`` / ``*** ... END ... ***`` marker lines
        with ``[DOC_START]`` / ``[DOC_END]``;
      * re-join words hyphenated across a line break, collapse runs of spaces,
        add the missing space after a comma or a sentence-ending period, and
        turn ``--`` into an em-dash;
      * transliterate ``ae``/``oe`` ligatures, sharp s and the one-half sign.

    Args:
        text: Raw book text.

    Returns:
        The normalised text.
    """
    # Remove illustrations and page numbers
    text = re.sub(r"\[Illustration[^\]]*\]", "", text)
    text = re.sub(r"-?\[?Page \d+\]?", "", text)

    # The keyword must be a whole word on the same line as the opening asterisks
    # and the match may not run across other asterisks. Otherwise a plain "***"
    # scene break followed by "friend"/"ended" somewhere later is taken for a
    # marker and the text in between is deleted.
    text = re.sub(r"\*\*\*[^*\n]*?\bSTART\b[^*]*?\*\*\*", "[DOC_START]", text, flags=re.IGNORECASE)
    text = re.sub(r"\*\*\*[^*\n]*?\bEND\b[^*]*?\*\*\*", "[DOC_END]", text, flags=re.IGNORECASE)

    # Fix line breaks, spacing, em-dashes
    # Only a hyphen between two word characters is a split word; a line ending in
    # "--" is a dash and must survive until the em-dash step below.
    text = re.sub(r"(?<=\w)-\n(?=\w)", "", text)
    text = re.sub(r" +", " ", text)
    # Space after a comma glued between two tokens, except inside numbers
    # ("1,000"). Lookarounds keep neighbouring commas independent ("a,b,c").
    text = re.sub(r"(?<=\w)(?<!\d),(?=\w)|(?<=\d),(?=[^\W\d])", ", ", text)
    # Space after a period only where a sentence boundary was glued together
    # ("end.The"); decimals, abbreviations and host names stay intact.
    text = re.sub(r"(?<=[a-z0-9])\.(?=[A-Z])", ". ", text)
    text = text.replace("--", "—")

    # Special char conversions
    text = text.replace("\u00e6", "ae").replace("\u0153", "oe").replace("\u00df", "ss")
    text = text.replace("\u00bd", "1/2")

    return text

def extract_metadata(text):
    """Pull the Gutenberg ID, title and author out of a book's header text.

    The ID is taken from a ``PG``/``PGID`` tag followed by digits or, failing
    that, from a bracketed ``[EBook #1234]`` / ``[Etext #1234]`` tag. Title and
    author are read from the first line that starts with ``Title:`` /
    ``Author:`` (an optional leading ``#`` is allowed).

    Args:
        text: Book text, header included.

    Returns:
        Dict with the keys ``"PGID"``, ``"Title"`` and ``"Author"``; a field that
        cannot be found, or is empty, is ``"Unknown"``.
    """
    def header_field(label):
        # Anchored to a line start and confined to that line, so neither a
        # mid-sentence "Title" nor the line after an empty "Title:" is captured.
        found = re.search(rf'^[ \t#]*{label}:[ \t]*(.*)$', text, re.MULTILINE)
        value = found.group(1).strip() if found else ""
        return value or "Unknown"

    pgid = re.search(r'PG(?:ID)?:?\s*(\d+)', text)
    if pgid is None:
        pgid = re.search(r'\[(?:e-?book|etext)\s*#\s*(\d+)\]', text, re.IGNORECASE)

    return {
        "PGID": pgid.group(1) if pgid else "Unknown",
        "Title": header_field("Title"),
        "Author": header_field("Author")
    }

# ========== MAIN CLEANING LOOP ==========
# For every *.txt below RAW_DIR: skip byte-identical duplicates, decode with the
# detected encoding, extract header metadata, normalise the text and write it to
# CLEANED_DIR wrapped in exactly one [DOC_START] / [DOC_END] pair.
logging.info("🚀 Starting Phase 1 Cleaning...\n")
all_files = list(RAW_DIR.rglob("*.txt"))
total_files = len(all_files)
logging.info(f"📂 Total files detected: {total_files}\n")

# Output stems used so far in this run; two inputs must never share a file.
used_output_stems = set()

# The context manager closes the duplicates log even when the loop is interrupted.
with open(DUP_LOG, 'w', encoding='utf-8') as dup_log:
    for idx, file_path in enumerate(all_files, 1):
        logging.info(f"[{idx}/{total_files}] Processing: {file_path.name}")

        try:
            sha256 = compute_sha256(file_path)
            if sha256 in seen_hashes:
                dup_log.write(f"REMOVED: {file_path.name} (duplicate of {seen_hashes[sha256]})\n")
                duplicates_found += 1
                continue
            seen_hashes[sha256] = file_path.name

            encoding = detect_encoding(file_path)
            with open(file_path, 'r', encoding=encoding, errors='replace') as f:
                content = f.read()

            metadata = extract_metadata(content)
            content = normalize_text(content)

            # Insert structural markers
            lines = content.strip().splitlines()
            if not lines:
                raise ValueError("no text left after normalization")
            # A marker already sitting on the first/last line is peeled off (any
            # other text on that line is kept) because the wrapper below adds
            # its own pair; this keeps the output from ending in two [DOC_END].
            if lines[0].startswith("[DOC_START]"):
                lines[0] = lines[0][len("[DOC_START]"):]
            if lines[-1].endswith("[DOC_END]"):
                lines[-1] = lines[-1][:-len("[DOC_END]")]
            body = '\n'.join(lines).strip()

            cleaned_text = f"[DOC_START]\n# PGID: {metadata['PGID']}\n# Title: {metadata['Title']}\n# Author: {metadata['Author']}\n\n" + body + "\n[DOC_END]"

            # Titles routinely contain ':' or '?', which are not valid in Windows
            # file names (a ':' silently creates an alternate data stream).
            safe_title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", metadata['Title']).replace(' ', '_')[:80]
            out_stem = f"{metadata['PGID']}_{safe_title}"
            # Without an ID, or on a name clash, add the source file's own name so
            # that one book cannot overwrite another.
            if metadata['PGID'] == "Unknown" or out_stem in used_output_stems:
                out_stem = f"{out_stem}_{file_path.stem[:80]}"
            used_output_stems.add(out_stem)

            output_file = CLEANED_DIR / f"{out_stem}_clean.txt"
            with open(output_file, 'w', encoding='utf-8-sig') as out:
                out.write(cleaned_text)

            processed_files += 1
            report_data.append({
                "Original Filename": file_path.name,
                "SHA256": sha256,
                "PGID": metadata['PGID'],
                "Title": metadata['Title'],
                "Author": metadata['Author'],
                "Encoding": encoding,
                "Original Size (KB)": round(file_path.stat().st_size / 1024, 2),
                "Status": "Processed"
            })

        except Exception as e:
            # Per-file boundary: record the failure and carry on with the next file.
            logging.error(f"❌ Error processing {file_path.name}: {str(e)}")
            try:
                size_kb = round(file_path.stat().st_size / 1024, 2)
            except OSError:
                # The file may be the very thing that is unreadable or gone.
                size_kb = "N/A"
            report_data.append({
                "Original Filename": file_path.name,
                "SHA256": "N/A",
                "PGID": "N/A",
                "Title": "N/A",
                "Author": "N/A",
                "Encoding": "Unknown",
                "Original Size (KB)": size_kb,
                "Status": f"Error: {str(e)}"
            })

# ========== FINALIZATION ==========
pd.DataFrame(report_data).to_csv(CSV_LOG, index=False)

logging.info("\n✅ Phase 1 Cleaning Complete")
logging.info(f"Total Files Scanned: {total_files}")
logging.info(f"✅ Cleaned Files Saved: {processed_files}")
logging.info(f"🚫 Duplicates Skipped: {duplicates_found}")
logging.info(f"📄 Detailed report saved to: {CSV_LOG}")
logging.info(f"📓 Duplicates log saved to: {DUP_LOG}\n")
    