# 

In [2]:
# üîç crc_verify.py
import zlib
from pathlib import Path

# ==== CONFIG ====
BIN_PATH = Path(r"C:\Kishan\training_data\final_bin\train.bin")  # <- Update if needed
EXPECTED_CRC = 0x9A184523      # <- From log
DTYPE = "uint16"

def compute_crc(file_path: Path) -> int:
    print(f"üîç Reading: {file_path}")
    with open(file_path, "rb") as f:
        crc = 0
        while chunk := f.read(8192):
            crc = zlib.crc32(chunk, crc)
    return crc & 0xFFFFFFFF

if __name__ == "__main__":
    actual_crc = compute_crc(BIN_PATH)
    print(f"‚úÖ Computed CRC : {hex(actual_crc)}")
    print(f"üîê Expected CRC : {hex(EXPECTED_CRC)}")
    if actual_crc == EXPECTED_CRC:
        print("üéØ CRC MATCH ‚Äî File is good.")
    else:
        print("‚ùå CRC MISMATCH ‚Äî File may be corrupted or incomplete.")


üîç Reading: C:\Kishan	raining_datainal_bin	rain.bin


OSError: [Errno 22] Invalid argument: 'C:\\Kishan\training_data\x0cinal_bin\train.bin'

In [5]:
# ‚ö° Ultra-Optimized BIN Verifier & Token Density Analyzer
# By Cypher | For RTX 3060 + i7 + 16GB RAM | StorytellerGPT

import zlib, os, time
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd

# === CONFIG ===
BIN_DIR = Path(r"C:\Kishan\training_data\final_bin")
FILES = {
    "train.bin": 0x9A184523,  # Replace with actual CRC if needed
    "val.bin":   0x00000000,  # Replace with val CRC if known
}
DTYPE = np.uint16  # Adjust if you're using uint8/uint32
CUSTOM_TOKENS = [
    '#chapter_start', '#chapter_end', '#dialogue', '#quote', '#poem', '#story_within',
    '#character', '#title', '#hero', '#villain', '#mentor', '#outsider', '#noble_house',
    '#place', '#weapon', '#artifact', '#creature', '#god', '#prophecy', '#magic',
    '#curse', '#curse_break', '#ritual', '#vision', '#transformation', '#battle',
    '#trial', '#betrayal', '#rebellion', '#alliance', '#oath', '#lineage', '#legacy',
    '#festival', '#law', '#death', '#funeral', '#editor_note', '#annotation', '#translation_note'
]

# === CRC CHECK ===
def compute_crc(path: Path) -> int:
    print(f"\nüîç Scanning CRC for {path.name}...")
    crc = 0
    with open(path, "rb") as f:
        while chunk := f.read(8192):
            crc = zlib.crc32(chunk, crc)
    return crc & 0xFFFFFFFF

# === TOKEN STATS ===
def analyze_bin(path: Path) -> dict:
    print(f"üî¢ Loading tokens from {path.name} [{path.stat().st_size // 1024} KB]...", flush=True)
    arr = np.fromfile(path, dtype=DTYPE)
    total_tokens = len(arr)
    unique_tokens, counts = np.unique(arr, return_counts=True)
    token_freq = dict(zip(unique_tokens, counts))
    return {
        "total_tokens": total_tokens,
        "unique_tokens": len(unique_tokens),
        "token_freq": token_freq,
        "np_array": arr
    }

# === CUSTOM TOKEN HIT CHECK ===
def load_vocab_txt(vocab_path: Path):
    vocab = {}
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            token = line.strip().split('\t')[0]
            vocab[token] = i
    return vocab

def custom_token_stats(vocab_txt: Path, token_freq: dict):
    vocab = load_vocab_txt(vocab_txt)
    data = []
    for tok in CUSTOM_TOKENS:
        idx = vocab.get(tok)
        count = token_freq.get(idx, 0) if idx is not None else 0
        data.append((tok, idx, count))
    return pd.DataFrame(data, columns=["Token", "ID", "Count"]).sort_values("Count", ascending=False)

# === MAIN ===
if __name__ == "__main__":
    print("üöÄ Starting Full BIN Integrity + Token Scan")
    start = time.time()

    for file, expected_crc in FILES.items():
        bin_path = BIN_DIR / file
        if not bin_path.exists():
            print(f"‚ùå File not found: {file}")
            continue

        crc = compute_crc(bin_path)
        print(f"   ‚Üí Expected CRC: {hex(expected_crc)}")
        print(f"   ‚Üí Actual CRC  : {hex(crc)}")
        if crc == expected_crc:
            print(f"   ‚úÖ CRC MATCH for {file}")
        else:
            print(f"   ‚ùå CRC MISMATCH for {file}")

        stats = analyze_bin(bin_path)
        print(f"   üìä Token Count        : {stats['total_tokens']:,}")
        print(f"   üî£ Unique Token IDs   : {stats['unique_tokens']:,}")

        # Save token frequency distribution (top 50)
        top_tokens = sorted(stats['token_freq'].items(), key=lambda x: x[1], reverse=True)[:50]
        df_top = pd.DataFrame(top_tokens, columns=['TokenID', 'Frequency'])
        df_top.to_csv(BIN_DIR / f"{file}_top_tokens.csv", index=False)
        print(f"   üìÑ Saved top token stats ‚Üí {file}_top_tokens.csv")

        # Check for special token usage
        vocab_txt_path = BIN_DIR.parent / "tokenizer" / "tokenizer_exp.vocab"
        if vocab_txt_path.exists():
            print(f"   üîç Analyzing CUSTOM TAG frequency in vocab...")
            df_custom = custom_token_stats(vocab_txt_path, stats['token_freq'])
            df_custom.to_csv(BIN_DIR / f"{file}_custom_token_hits.csv", index=False)
            print(f"   ‚úÖ Saved ‚Üí {file}_custom_token_hits.csv")
        else:
            print("   ‚ö†Ô∏è  tokenizer_exp.vocab not found. Skipping custom tag scan.")

    print(f"\n‚úÖ Done in {time.time() - start:.2f} seconds")


üöÄ Starting Full BIN Integrity + Token Scan

üîç Scanning CRC for train.bin...
   ‚Üí Expected CRC: 0x9a184523
   ‚Üí Actual CRC  : 0x6520e4c3
   ‚ùå CRC MISMATCH for train.bin
üî¢ Loading tokens from train.bin [1672541 KB]...
   üìä Token Count        : 856,341,217
   üî£ Unique Token IDs   : 49,449
   üìÑ Saved top token stats ‚Üí train.bin_top_tokens.csv
   üîç Analyzing CUSTOM TAG frequency in vocab...
   ‚úÖ Saved ‚Üí train.bin_custom_token_hits.csv

üîç Scanning CRC for val.bin...
   ‚Üí Expected CRC: 0x0
   ‚Üí Actual CRC  : 0x61009985
   ‚ùå CRC MISMATCH for val.bin
üî¢ Loading tokens from val.bin [153970 KB]...
   üìä Token Count        : 78,832,917
   üî£ Unique Token IDs   : 46,912
   üìÑ Saved top token stats ‚Üí val.bin_top_tokens.csv
   üîç Analyzing CUSTOM TAG frequency in vocab...
   ‚úÖ Saved ‚Üí val.bin_custom_token_hits.csv

‚úÖ Done in 41.36 seconds
