diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java index 8c054b0ef75..c1f78cebb68 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java @@ -50,8 +50,8 @@ public TextQualityComparison(String winner, float delta, } /** - * Returns {@code "A"} if candidate A is cleaner, {@code "B"} otherwise. - * Check {@link #delta()} to gauge confidence. + * Returns the label of the cleaner candidate ({@link #labelA()} or + * {@link #labelB()}). Check {@link #delta()} to gauge confidence. */ public String winner() { return winner; @@ -88,8 +88,7 @@ public String labelB() { @Override public String toString() { return String.format(java.util.Locale.ROOT, - "TextQualityComparison[winner=%s(%s) delta=%.3f A=%s B=%s]", - winner, winner.equals("A") ? labelA : labelB, - delta, scoreA, scoreB); + "TextQualityComparison[winner=%s delta=%.3f A=%s(%s) B=%s(%s)]", + winner, delta, labelA, scoreA, labelB, scoreB); } } diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java index d832b5a169d..b91315e7272 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java @@ -37,7 +37,7 @@ * // Arbitrate between two charset decodings * TextQualityComparison cmp = detector.compare("cp1252", decodedAsCp1252, * "cp1251", decodedAsCp1251); - * String winner = cmp.winner(); // "A" or "B" + * String winner = cmp.winner(); // returns the chosen label, e.g. 
"cp1251" * } */ public interface TextQualityDetector { diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index 1719043f408..5635f6f168d 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -41,26 +41,35 @@ * Language-agnostic text quality scorer. Discriminates clean UTF-8 text from * mojibake, reversed text, wrong-codec decodings, and other corruption forms. * - *

Scoring combines up to three features, depending on the model version: + *

Scoring combines four features: *

    - *
  1. Byte-bigram log-probability — 256×256 table of log P(b|a) over - * consecutive byte pairs in the UTF-8 encoding.
  2. - *
  3. Unicode named-block transition log-probability (version 2+) — - * N×N table of log P(block_b | block_a) where block IDs are the named - * {@link Character.UnicodeBlock} values (BASIC_LATIN, ARABIC, - * CJK_UNIFIED_IDEOGRAPHS, etc.).
  4. - *
  5. Control-byte fraction (version 2+) — fraction of bytes in control + *
  6. Codepoint-bigram log-probability (F1) — global hashed table + * indexed by FNV-1a(cp_a, cp_b, seed) into {@code bigramBuckets} cells. + * A Bloom filter records seen pairs; unseen pairs fall back to a + * hashed-unigram independence-assumption score + * {@code α * (log P(cp_a) + log P(cp_b))}.
  7. + *
  8. Unicode named-block transition log-probability (F2) — + * per-script N×N table over {@link Character.UnicodeBlock} values.
  9. + *
  10. Control-byte fraction (F3) — fraction of bytes in control * ranges [0x01–0x08, 0x0B, 0x0C, 0x0E–0x1F, 0x7F].
  11. + *
  12. Global script-transition log-probability (F4) — single + * transition table over raw {@link Character.UnicodeScript} values, + * capturing document-level cross-script anomalies.
  13. *
* - *

All features are calibrated (mu/sigma) on held-out dev text so their z-scores - * are on a common scale. + *

All features are calibrated per-script (mu/sigma) on held-out dev text + * so their z-scores are on a common scale. z-scores are combined by a + * per-script linear classifier: + * {@code logit = w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are + * fit on clean vs. corrupted dev windows. Natural junk threshold is 0 + * (positive logit = clean); use negative thresholds for conservative + * detection.

* - *

Features are combined by a per-script logistic regression classifier: - * {@code w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are fit on - * clean vs. corrupted dev windows. The natural junk threshold is 0 (positive - * logit = clean); use a negative threshold for conservative detection - * (e.g., {@code score < -1}).

+ *

Model file format: a single binary spec (see {@link #load(InputStream)} + * javadoc). No backwards-compat fallback to older formats — the loader + * rejects mismatched version bytes with a clear error. This is + * intentional: keeping parallel scoring paths is a known source of silent + * miscalibration bugs. * *

Instances are immutable and thread-safe after construction. * @@ -72,7 +81,7 @@ * * // Arbitrate between two charset decodings * TextQualityComparison result = detector.compare("cp1252", ascp1252, "cp1251", ascp1251); - * String winner = result.winner(); // "A" or "B" + * String winner = result.winner(); // returns "cp1252" or "cp1251" * } */ public final class JunkDetector implements TextQualityDetector { @@ -82,68 +91,54 @@ public final class JunkDetector implements TextQualityDetector { "org/apache/tika/ml/junkdetect/junkdetect.bin"; static final String MAGIC = "JUNKDET1"; + /** Sole supported file-format version. Mismatch is a hard error. */ + static final int VERSION = 7; - private final int modelVersion; + // Feature 1 — per-script open-addressed codepoint-bigram tables. + // No global Bloom: empty-slot is the membership oracle. + private final Map f1TablesByScript; - // Feature 1: byte bigrams (all versions) - private final Map tables; // script → float[65536] log-prob + /** Per-script F1 calibration on the codepoint-hash mean log-prob. */ private final Map calibrations; // script → float[2] {mu, sigma} - // Feature 2: named-block transitions (version 2+); null for v1 models - private final Map blockTables; // script → float[blockN*blockN] - private final Map blockCalibrations; // script → float[2] {mu, sigma} - private final int blockN; // block table dimension (0 for v1) + // Feature 2 — per-script block transition. Block bucketing uses the + // JVM-independent {@link UnicodeBlockRanges} static table; table size + // per script is {@code bucketCount()²} floats. 
+ private final Map blockTables; + private final Map blockCalibrations; - // Feature 3: control-byte fraction (version 2+); null for v1 models - private final Map controlCalibrations; // script → float[2] {mu, sigma} + // Feature 3 — per-script control-byte fraction calibration + private final Map controlCalibrations; - // Feature combination: per-script linear classifier (version 3+); null for v1/v2 models - // float[numFeatures+1] = {w1, ..., wN, bias}; positive logit = clean - private final Map classifierWeights; - - // Feature 4: global script-transition (version 4+); null for v1/v2/v3 models - // One global table: float[numScriptBuckets * numScriptBuckets] log P(script_b | script_a) - // Uses raw UnicodeScript names (not SCRIPT_MODEL_FALLBACK) to distinguish HIRAGANA/KATAKANA/HAN. + // Feature 4 — single global script-transition table private final float[] scriptTransitionTable; - private final float[] scriptTransitionCalibration; // float[2] = {mu, sigma} - private final Map scriptBucketIndex; // raw UnicodeScript name → bucket ID - private final int numScriptBuckets; // 0 for v1/v2/v3 + private final float[] scriptTransitionCalibration; + private final Map scriptBucketIndex; + private final int numScriptBuckets; - // Shared block index for v2+ models: UnicodeBlock → index [0, blockN-1) - // Index blockN-1 is the "unassigned" bucket (null UnicodeBlock). - private final Map blockIndex; + // Per-script linear classifier: float[numFeatures+1] = {w1, ..., wN, bias}. 
+ private final Map classifierWeights; - private JunkDetector(int modelVersion, - Map tables, - Map calibrations, + private JunkDetector(Map calibrations, Map blockTables, Map blockCalibrations, - int blockN, Map controlCalibrations, Map classifierWeights, - Map blockIndex, float[] scriptTransitionTable, float[] scriptTransitionCalibration, Map scriptBucketIndex, - int numScriptBuckets) { - this.modelVersion = modelVersion; - this.tables = Collections.unmodifiableMap(tables); + int numScriptBuckets, + Map f1TablesByScript) { this.calibrations = Collections.unmodifiableMap(calibrations); - this.blockTables = blockTables != null - ? Collections.unmodifiableMap(blockTables) : null; - this.blockCalibrations = blockCalibrations != null - ? Collections.unmodifiableMap(blockCalibrations) : null; - this.blockN = blockN; - this.controlCalibrations = controlCalibrations != null - ? Collections.unmodifiableMap(controlCalibrations) : null; - this.classifierWeights = classifierWeights != null - ? Collections.unmodifiableMap(classifierWeights) : null; - this.blockIndex = blockIndex; + this.blockTables = Collections.unmodifiableMap(blockTables); + this.blockCalibrations = Collections.unmodifiableMap(blockCalibrations); + this.controlCalibrations = Collections.unmodifiableMap(controlCalibrations); + this.classifierWeights = Collections.unmodifiableMap(classifierWeights); this.scriptTransitionTable = scriptTransitionTable; this.scriptTransitionCalibration = scriptTransitionCalibration; - this.scriptBucketIndex = scriptBucketIndex != null - ? 
Collections.unmodifiableMap(scriptBucketIndex) : null; + this.scriptBucketIndex = Collections.unmodifiableMap(scriptBucketIndex); this.numScriptBuckets = numScriptBuckets; + this.f1TablesByScript = Collections.unmodifiableMap(f1TablesByScript); } // ----------------------------------------------------------------------- @@ -196,7 +191,53 @@ public static JunkDetector loadFromPath(Path path) throws IOException { /** * Loads a model from an {@link InputStream}. Gzip-detection is automatic. - * Supports model versions 1 through 5. + * Strictly requires the current file-format version ({@value #VERSION}) — + * older formats are rejected with a clear error rather than supported + * via a fallback path. + * + *

File-format layout (gzipped): + *

+     *   [8 bytes]    magic "JUNKDET1" (ASCII)
+     *   [1 byte]     version (= 7)
+     *   [4 bytes]    num_scripts (int BE)
+     *   [1 byte]     block_scheme_version  (must equal
+     *                {@link UnicodeBlockRanges#SCHEME_VERSION})
+     *   [1 byte]     num_script_buckets
+     *   for each bucket:
+     *     [2 bytes]      name length (ushort BE)
+     *     [name bytes]   bucket name (UTF-8)
+     *   [num_script_buckets² × 4 bytes]  script-transition log-prob table (F4)
+     *   [4 bytes]    mu4 (float32 BE)
+     *   [4 bytes]    sigma4 (float32 BE)
+     *   for each script (sorted by name):
+     *     [2 bytes]      name length
+     *     [name bytes]   script name (UTF-8)
+     *     [4 bytes]      mu1 (F1 calibration, codepoint-bigram mean log-prob)
+     *     [4 bytes]      sigma1
+     *     // V7 F1 tables for this script — see {@link V7Tables#writeTo}
+     *     [4 bytes]      backoff_alpha (float32 BE)
+     *     [4 bytes]      codepoint_count
+     *     [codepoint_count × 4 bytes]  codepoint index (sorted, ascending)
+     *     [4 bytes]      bigram_slots (power of 2)
+     *     [4 bytes]      bigram_quant_min (float32 BE)
+     *     [4 bytes]      bigram_quant_max (float32 BE)
+     *     [bigram_slots × 4 bytes]  bigram open-addressing keys
+     *                                ((idxA<<16)|idxB, or {@link V7Tables#EMPTY_KEY})
+     *     [bigram_slots bytes]      bigram values (8-bit quantized log-probs)
+     *     [4 bytes]      unigram_quant_min (float32 BE)
+     *     [4 bytes]      unigram_quant_max (float32 BE)
+     *     [4 bytes]      unigram_fallback_log_prob (float32 BE; used for
+     *                                                codepoints not in index)
+     *     [codepoint_count bytes]   unigram values (8-bit quantized log-probs)
+     *     // F2/F3/classifier (unchanged from v6 layout)
+     *     [4 bytes]      mu2 (F2 calibration)
+     *     [4 bytes]      sigma2
+     *     [block_N² × 4 bytes]  block-transition log-prob table (F2)
+     *     [4 bytes]      mu3 (F3 calibration)
+     *     [4 bytes]      sigma3
+     *     [1 byte]       num_features
+     *     [(num_features+1) × 4 bytes]  classifier weights w1..wN and bias
+     * 
*/ public static JunkDetector load(InputStream rawIs) throws IOException { byte[] peek = rawIs.readNBytes(2); @@ -215,21 +256,22 @@ public static JunkDetector load(InputStream rawIs) throws IOException { throw new IOException("Not a JunkDetector model file (bad magic)"); } int version = dis.readUnsignedByte(); - if (version != 5) { - throw new IOException("Unsupported model version: " + version - + ". Only version 5 is supported. Retrain the model with TrainJunkModel."); + if (version != VERSION) { + throw new IOException("Unsupported model format version: " + version + + ". This build expects version " + VERSION + + ". Retrain the model with the current TrainJunkModel."); } int numScripts = dis.readInt(); - // Block names (v5): stored in model for JVM-independence - int blockN = dis.readUnsignedShort(); - String[] blockNames = new String[blockN - 1]; - for (int i = 0; i < blockN - 1; i++) { - int nameLen = dis.readUnsignedShort(); - blockNames[i] = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8); + int blockSchemeVersion = dis.readUnsignedByte(); + if (blockSchemeVersion != UnicodeBlockRanges.SCHEME_VERSION) { + throw new IOException("Unsupported block-scheme version: " + + blockSchemeVersion + ". This build expects " + + UnicodeBlockRanges.SCHEME_VERSION + + ". 
Retrain with the current TrainJunkModel."); } - Map blockIndex = buildBlockIndexFromNames(blockNames); + int blockN = UnicodeBlockRanges.bucketCount(); // Global script-transition section int numScriptBuckets = dis.readUnsignedByte(); @@ -242,42 +284,39 @@ public static JunkDetector load(InputStream rawIs) throws IOException { float[] scriptTransitionTable = readFloatTable(dis, numScriptBuckets * numScriptBuckets); float[] scriptTransitionCalibration = new float[]{dis.readFloat(), dis.readFloat()}; - Map tables = new HashMap<>(numScripts * 2); - Map calibrations = new HashMap<>(numScripts * 2); - Map blockTables = new HashMap<>(numScripts * 2); - Map blockCalibrations = new HashMap<>(numScripts * 2); - Map controlCalibrations = new HashMap<>(numScripts * 2); - Map classifierWeights = new HashMap<>(numScripts * 2); + Map f1TablesByScript = new HashMap<>(numScripts * 2); + Map calibrations = new HashMap<>(numScripts * 2); + Map blockTables = new HashMap<>(numScripts * 2); + Map blockCalibrations = new HashMap<>(numScripts * 2); + Map controlCalibrations = new HashMap<>(numScripts * 2); + Map classifierWeights = new HashMap<>(numScripts * 2); for (int s = 0; s < numScripts; s++) { int nameLen = dis.readUnsignedShort(); String script = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8); - // Feature 1: byte bigrams calibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); - tables.put(script, readFloatTable(dis, 65536)); - // Feature 2: named-block transitions + // Per-script V7 F1 tables. 
+ f1TablesByScript.put(script, V7Tables.readFrom(dis)); + blockCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); blockTables.put(script, readFloatTable(dis, blockN * blockN)); - - // Feature 3: control-byte fraction controlCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); - // Classifier weights: num_features (1 byte) + num_features floats + 1 bias int numFeatures = dis.readUnsignedByte(); - float[] weights = new float[numFeatures + 1]; // last = bias + float[] weights = new float[numFeatures + 1]; for (int j = 0; j <= numFeatures; j++) { weights[j] = dis.readFloat(); } classifierWeights.put(script, weights); } - return new JunkDetector(version, tables, calibrations, - blockTables, blockCalibrations, blockN, - controlCalibrations, classifierWeights, blockIndex, + return new JunkDetector(calibrations, + blockTables, blockCalibrations, + controlCalibrations, classifierWeights, scriptTransitionTable, scriptTransitionCalibration, - scriptBucketIndex, numScriptBuckets); + scriptBucketIndex, numScriptBuckets, f1TablesByScript); } } @@ -289,44 +328,6 @@ private static float[] readFloatTable(DataInputStream dis, int size) throws IOEx return table; } - /** - * Builds the stable ordered mapping from {@link Character.UnicodeBlock} to index. - * This must produce the same ordering as {@link TrainJunkModel#buildBlockIndex()}. - * Used for v2/v3/v4 models only; v5+ models store block names in the file. - */ - static Map buildBlockIndex() { - LinkedHashMap index = new LinkedHashMap<>(); - for (int cp = 0; cp <= 0x10FFFF; cp++) { - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - if (b != null) index.putIfAbsent(b, index.size()); - } - return Collections.unmodifiableMap(index); - } - - /** - * Builds a block index from an ordered array of block names stored in a v5+ model. - * Resolves each name via {@link Character.UnicodeBlock#forName(String)}. 
- * Throws {@link IOException} if any name is not recognised by the current JVM — - * this means the model was trained on a newer JVM; retrain on the minimum - * supported JVM (Java 17) to produce a compatible model. - * - * @param blockNames ordered array of block names (index = position in block table) - * @return unmodifiable map from UnicodeBlock to table index - */ - static Map buildBlockIndexFromNames(String[] blockNames) - throws IOException { - Map index = new HashMap<>(blockNames.length * 2); - for (int i = 0; i < blockNames.length; i++) { - try { - Character.UnicodeBlock b = Character.UnicodeBlock.forName(blockNames[i]); - index.put(b, i); - } catch (IllegalArgumentException e) { - throw new IOException("Unicode block not known to this JVM: " + blockNames[i] - + ". Model was trained on a newer JVM; retrain on Java 17.", e); - } - } - return Collections.unmodifiableMap(index); - } // ----------------------------------------------------------------------- // TextQualityDetector implementation @@ -373,7 +374,7 @@ public TextQualityComparison compare(String labelA, String candidateA, float zA = scoreA.isUnknown() ? 0f : scoreA.getZScore(); float zB = scoreB.isUnknown() ? 0f : scoreB.getZScore(); - String winner = zA >= zB ? "A" : "B"; + String winner = zA >= zB ? labelA : labelB; float delta = Math.abs(zA - zB); return new TextQualityComparison(winner, delta, scoreA, scoreB, labelA, labelB); @@ -381,12 +382,12 @@ public TextQualityComparison compare(String labelA, String candidateA, /** Returns the set of script names this model knows about. */ public Set knownScripts() { - return tables.keySet(); + return calibrations.keySet(); } - /** Returns the version of the loaded model (1, 2, or 3). */ + /** Returns the file-format version of the loaded model. 
*/ public int getModelVersion() { - return modelVersion; + return VERSION; } // ----------------------------------------------------------------------- @@ -409,12 +410,16 @@ private TextQualityScore scoreText(String text) { float[] dominantCal1 = null; for (ScriptRun run : runs) { - if (!tables.containsKey(run.script)) { + if (!calibrations.containsKey(run.script)) { continue; // skip scripts not in model; treat as neutral, not junk } byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); - if (runUtf8.length < 2) { - continue; // too short to score + // Skip if too short to form a bigram by either metric. A single + // CJK char is 3 UTF-8 bytes (passes the byte filter) but 1 UTF-16 + // unit, and computeF1MeanLogP filters by text.length() < 2 and + // returns NaN — which would poison the weighted sum here. + if (runUtf8.length < 2 || run.text.length() < 2) { + continue; } float logit = scoreChunk(runUtf8, run.text, run.script, z4); int n = runUtf8.length; @@ -444,88 +449,226 @@ private TextQualityScore scoreText(String text) { return new TextQualityScore(zScore, pClean, ciLow, ciHigh, dominantScript); } + /** + * Diagnostic — exposes per-feature z-scores and classifier weights. Same + * chunking and aggregation as {@link #score(String)}, but returns the + * intermediate signals individually for analysis or for hybrid models + * that want to substitute one feature with an externally-computed value. + * + *

Aggregation: per-chunk z1/z2/z3 and per-chunk logit are byte-count- + * weighted across script-homogeneous chunks. z4 is a global signal + * (already document-level). {@code dominantScript} and + * {@code classifierWeights} refer to the script run with the most bytes. + */ + public FeatureComponents scoreWithFeatureComponents(String text) { + if (text == null || text.isEmpty()) { + return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN, + Float.NaN, Float.NaN, "UNKNOWN", null, 0); + } + List runs = buildScriptRuns(text); + float z4 = computeScriptTransitionZ(text); + + float totalBytes = 0; + float weightedZ1 = 0; + float weightedZ2 = 0; + float weightedZ3 = 0; + float weightedLogit = 0; + String dominantScript = null; + int maxBytes = 0; + + for (ScriptRun run : runs) { + if (!calibrations.containsKey(run.script)) { + continue; + } + byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); + if (runUtf8.length < 2 || run.text.length() < 2) { + continue; // see scoreText: paired filter avoids NaN poisoning + } + float[] zs = computeChunkZs(runUtf8, run.text, run.script); + float chunkLogit = combineLogit(zs[0], zs[1], zs[2], z4, run.script); + int n = runUtf8.length; + weightedZ1 += zs[0] * n; + weightedZ2 += zs[1] * n; + weightedZ3 += zs[2] * n; + weightedLogit += chunkLogit * n; + totalBytes += n; + if (n > maxBytes) { + maxBytes = n; + dominantScript = run.script; + } + } + + if (totalBytes == 0 || dominantScript == null) { + return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN, z4, + Float.NaN, runs.isEmpty() ? "UNKNOWN" : runs.get(0).script, + null, 0); + } + + float[] cw = classifierWeights.get(dominantScript); + return new FeatureComponents( + weightedZ1 / totalBytes, + weightedZ2 / totalBytes, + weightedZ3 / totalBytes, + z4, + weightedLogit / totalBytes, + dominantScript, + cw, + (int) totalBytes); + } + + /** + * Per-feature z-score breakdown returned by + * {@link #scoreWithFeatureComponents(String)}. 
All z-scores are + * byte-count-weighted aggregates across script-homogeneous chunks + * except {@code z4}, which is a single document-level value. + * + *

{@code classifierWeights} is the per-script linear classifier + * weight vector {@code {w1, w2, w3, w4, bias}} for the dominant + * script — useful for hybrid models that recompute the logit after + * substituting one z-score with an externally-computed value. + */ + public static final class FeatureComponents { + public final float z1; + public final float z2; + public final float z3; + public final float z4; + public final float logit; + public final String dominantScript; + public final float[] classifierWeights; + public final int totalBytes; + + FeatureComponents(float z1, float z2, float z3, float z4, + float logit, String dominantScript, + float[] classifierWeights, int totalBytes) { + this.z1 = z1; + this.z2 = z2; + this.z3 = z3; + this.z4 = z4; + this.logit = logit; + this.dominantScript = dominantScript; + this.classifierWeights = classifierWeights; + this.totalBytes = totalBytes; + } + } + /** * Scores a single script-homogeneous chunk and returns its logit. * Positive = clean, negative = junk. Returns 0 (neutral) if the chunk * has no model or is too short. */ private float scoreChunk(byte[] utf8, String text, String script, float z4) { - float[] bigramTable = tables.get(script); - if (bigramTable == null || utf8.length < 2) { + if (utf8.length < 2 || !calibrations.containsKey(script)) { return 0f; } + float[] zs = computeChunkZs(utf8, text, script); + return combineLogit(zs[0], zs[1], zs[2], z4, script); + } - // Feature 1: byte-bigram mean log-prob - double bigramSum = 0; - int bigramCount = 0; - for (int i = 0; i + 1 < utf8.length; i++) { - bigramSum += bigramTable[((utf8[i] & 0xFF) << 8) | (utf8[i + 1] & 0xFF)]; - bigramCount++; - } - float meanBigramLogProb = (float) (bigramSum / bigramCount); + /** + * Computes per-feature z-scores {z1, z2, z3} for a single script- + * homogeneous chunk. 
Shared between {@link #scoreChunk} and + * {@link #scoreWithFeatureComponents}, and used at training time + * via the public {@code computeZ2/3/4...} static helpers so + * training and inference share the same math. + */ + private float[] computeChunkZs(byte[] utf8, String text, String script) { + // Feature 1: per-script codepoint-bigram, calibrated per-script + V7Tables tables = f1TablesByScript.get(script); + float meanF1LogProb = computeCodepointF1MeanLogP(text, tables); float[] cal1 = calibrations.get(script); - float z1 = (meanBigramLogProb - cal1[0]) / cal1[1]; - - // Feature 2: named-block transition mean log-prob - float z2 = 0f; - float[] blockTable = blockTables.get(script); - if (blockTable != null) { - int nullId = blockN - 1; - int prev = -1; - double blockSum = 0; - int blockCount = 0; - for (int i = 0; i < text.length(); ) { - int cp = text.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; - if (prev >= 0) { - blockSum += blockTable[prev * blockN + blockId]; - blockCount++; - } - prev = blockId; - i += Character.charCount(cp); - } - if (blockCount > 0) { - float meanBlockLogProb = (float) (blockSum / blockCount); - float[] cal2 = blockCalibrations.get(script); - z2 = cal2 != null ? (meanBlockLogProb - cal2[0]) / cal2[1] : 0f; + float z1 = (meanF1LogProb - cal1[0]) / cal1[1]; + + float z2 = computeZ2BlockTransition(text, + blockTables.get(script), blockCalibrations.get(script)); + float z3 = computeZ3ControlByte(utf8, controlCalibrations.get(script)); + return new float[]{z1, z2, z3}; + } + + private static float computeCodepointF1MeanLogP(String text, V7Tables tables) { + if (tables == null) return Float.NaN; + double v = computeF1MeanLogP(text, tables); + return Double.isNaN(v) ? Float.NaN : (float) v; + } + + /** + * Feature 2 — calibrated z-score for block-transition mean log-prob on + * one text window. 
Returns 0 if the window has fewer than two + * codepoints or if {@code blockTable} / {@code blockCal} are null. + * + *

Block bucketing is via the JVM-independent + * {@link UnicodeBlockRanges}. Public so the trainer's classifier + * feature extractor calls into the exact same math used at inference + * time — single source of truth, no train/infer drift. + * + * @param blockTable {@code (blockN)² × float} log-prob table where + * {@code blockN = UnicodeBlockRanges.bucketCount()} + */ + public static float computeZ2BlockTransition(String text, + float[] blockTable, float[] blockCal) { + if (blockTable == null || blockCal == null || text.length() < 2) { + return 0f; + } + int blockN = UnicodeBlockRanges.bucketCount(); + int prev = -1; + double sum = 0; + int count = 0; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + int blockId = UnicodeBlockRanges.bucketOf(cp); + if (prev >= 0) { + sum += blockTable[prev * blockN + blockId]; + count++; } + prev = blockId; + i += Character.charCount(cp); + } + if (count == 0) { + return 0f; } + return ((float) (sum / count) - blockCal[0]) / blockCal[1]; + } - // Feature 3: control-byte fraction (stored as −fraction, so higher = cleaner) + /** + * Feature 3 — calibrated z-score for control-byte fraction on the UTF-8 + * byte sequence of one text window. Stored score is {@code -fraction} + * so higher = cleaner (matching the direction convention of the other + * z-features). + * + *

Public for train/infer math-sharing. + */ + public static float computeZ3ControlByte(byte[] utf8, float[] controlCal) { + if (utf8.length == 0 || controlCal == null) { + return 0f; + } long controlCount = 0; for (byte b : utf8) { - if (isControlByte(b & 0xFF)) controlCount++; - } - float controlScore = -(float) controlCount / utf8.length; - float[] cal3 = controlCalibrations.get(script); - float z3 = cal3 != null ? (controlScore - cal3[0]) / cal3[1] : 0f; - - // Per-script linear classifier: w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias - float[] cw = classifierWeights.get(script); - if (cw != null) { - int nFeat = cw.length - 1; // bias is last - float logit = cw[nFeat]; // bias - if (nFeat >= 1) logit += cw[0] * z1; - if (nFeat >= 2) logit += cw[1] * z2; - if (nFeat >= 3) logit += cw[2] * z3; - if (nFeat >= 4) logit += cw[3] * z4; - return logit; + if (isControlByte(b & 0xFF)) { + controlCount++; + } } - return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight + float score = -(float) controlCount / utf8.length; + return (score - controlCal[0]) / controlCal[1]; } /** - * Computes the global script-transition z-score for the whole input string. - * Uses raw {@link Character.UnicodeScript} values — NOT {@link #SCRIPT_MODEL_FALLBACK} — - * so that HIRAGANA, KATAKANA, and HAN remain distinct, preserving the - * characteristic script-mixing pattern of Japanese text. + * Feature 4 — calibrated z-score for global script-transition mean + * log-prob on one text window. Uses raw {@link Character.UnicodeScript} + * values (no model fallback) so HIRAGANA / KATAKANA / HAN remain + * distinct. Returns 0 if the window has fewer than two non-neutral + * codepoints or if the script-transition data isn't supplied. * - *

Returns 0 if the string has fewer than two non-neutral codepoints. + *

Public for train/infer math-sharing. Note: inference computes + * z4 once per document via {@link #computeScriptTransitionZ} (which + * uses the instance's loaded tables); this helper takes them as + * arguments so training can compute z4 before the model is finalised. */ - private float computeScriptTransitionZ(String text) { - if (scriptTransitionTable == null || scriptBucketIndex == null - || scriptTransitionCalibration == null || numScriptBuckets == 0) { + public static float computeZ4ScriptTransition(String text, + float[] scriptTransTable, + float[] scriptTransCal, + Map scriptBucketIndex, + int numScriptBuckets) { + if (scriptTransTable == null || scriptTransCal == null + || scriptBucketIndex == null || numScriptBuckets == 0) { return 0f; } int otherBucket = numScriptBuckets - 1; @@ -543,7 +686,7 @@ private float computeScriptTransitionZ(String text) { } int bucket = scriptBucketIndex.getOrDefault(s.name(), otherBucket); if (prev >= 0) { - sum += scriptTransitionTable[prev * numScriptBuckets + bucket]; + sum += scriptTransTable[prev * numScriptBuckets + bucket]; count++; } prev = bucket; @@ -551,8 +694,184 @@ private float computeScriptTransitionZ(String text) { if (count == 0) { return 0f; } - float mean = (float) (sum / count); - return (mean - scriptTransitionCalibration[0]) / scriptTransitionCalibration[1]; + return ((float) (sum / count) - scriptTransCal[0]) / scriptTransCal[1]; + } + + /** + * Combines per-feature z-scores via the per-script linear classifier. + * Fallback (when no classifier weights stored): equal-weight average. 
+ */ + private float combineLogit(float z1, float z2, float z3, float z4, String script) { + float[] cw = classifierWeights.get(script); + if (cw != null) { + int nFeat = cw.length - 1; // bias is last + float logit = cw[nFeat]; // bias + if (nFeat >= 1) logit += cw[0] * z1; + if (nFeat >= 2) logit += cw[1] * z2; + if (nFeat >= 3) logit += cw[2] * z3; + if (nFeat >= 4) logit += cw[3] * z4; + return logit; + } + return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight + } + + // ----------------------------------------------------------------------- + // Feature 1: per-script open-addressing codepoint-bigram lookup + // ----------------------------------------------------------------------- + + /** + * Mean log-prob over the codepoint pairs in {@code text} using the given + * script's V7 F1 tables. + * + *

For each adjacent codepoint pair {@code (a, b)}: + *

    + *
  1. Binary-search both codepoints in the script's codepoint index. + * If either is absent, the pair was never seen in training; emit + * {@code α * (logP(a) + logP(b))} using each codepoint's unigram + * value (or {@link V7Tables#unigramFallbackLogProb} if the + * codepoint isn't even in the unigram index).
  2. + *
  3. Otherwise, look up the packed {@code (idxA<<16)|idxB} key in + * the open-addressing bigram table. Empty slot → unseen pair → + * unigram backoff (same formula). Match → dequantize the stored + * value.
  4. + *
+ * + *

This is the single authoritative implementation of the V7 F1 + * scoring math, shared by inference and training. Keeping one + * implementation eliminates the risk of train/infer drift in the F1 + * feature. + * + * @return mean log-prob, or {@link Double#NaN} if {@code text} has fewer + * than two codepoints or {@code tables} is null + */ + public static double computeF1MeanLogP(String text, V7Tables tables) { + if (text == null || text.length() < 2 || tables == null) { + return Double.NaN; + } + double sum = 0; + int n = 0; + int prevCp = -1; + int prevIdx = -1; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + int curIdx = codepointToIndex(tables, cp); + if (prevCp >= 0) { + sum += scorePairF1V7(prevCp, prevIdx, cp, curIdx, tables); + n++; + } + prevCp = cp; + prevIdx = curIdx; + } + return n == 0 ? Double.NaN : sum / n; + } + + /** + * Binary-search a codepoint in the script's index. + * + * @return the dense index (≥ 0) if found, or -1 if the codepoint + * doesn't appear in any kept bigram for this script + */ + public static int codepointToIndex(V7Tables tables, int cp) { + return java.util.Arrays.binarySearch(tables.codepointIndex, cp); + } + + /** + * Mixing function used to scatter packed (idxA, idxB) keys across + * the open-addressing table. A simple integer finalizer (splitmix32 + * style) gives good distribution for sequential index values. + * + *

Public so the trainer's open-addressing insertion routine uses + * the same probe order as inference — drift here would silently + * corrupt every lookup. + */ + public static int mixIndexKey(int packedKey) { + int x = packedKey; + x = (x ^ (x >>> 16)) * 0x7feb352d; + x = (x ^ (x >>> 15)) * 0x846ca68b; + x = x ^ (x >>> 16); + return x; + } + + /** + * Packed bigram key for indices {@code (a, b)} where each index fits in + * {@link JunkDetectorTrainingConfig#KEY_INDEX_BITS} bits. Asserts that + * indices are non-negative; that's the caller's contract. + */ + public static int packBigramKey(int idxA, int idxB) { + return (idxA << 16) | (idxB & 0xFFFF); + } + + /** + * Looks up a (cpA, cpB) bigram in the script's V7 tables and returns + * its dequantized log-prob. Falls back to unigram backoff on miss. + * + *

{@code idxA}/{@code idxB} are the pre-computed codepoint indices + * (from {@link #codepointToIndex}); {@code -1} means the codepoint is + * not in this script's index. The caller is expected to compute them + * once when scanning the text (avoiding a redundant binary search per + * codepoint). + */ + private static double scorePairF1V7(int cpA, int idxA, int cpB, int idxB, + V7Tables tables) { + if (idxA >= 0 && idxB >= 0) { + int slot = lookupBigramSlot(tables, idxA, idxB); + if (slot >= 0) { + return dequantize(tables.bigramValues[slot], + tables.bigramQuantMin, tables.bigramQuantMax); + } + } + // Unigram backoff for unseen pair or for codepoints absent from the + // per-script index. α=1.0 = plain independence; prototype-validated. + double ua = unigramLogProb(tables, idxA); + double ub = unigramLogProb(tables, idxB); + return tables.backoffAlpha * (ua + ub); + } + + /** + * Open-addressing lookup: returns the slot index that contains the key + * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an + * empty slot first). + * + *

Linear probing with the same mix-hash used at training time — + * required for the table to be readable, not just writable. + */ + static int lookupBigramSlot(V7Tables tables, int idxA, int idxB) { + int packedKey = packBigramKey(idxA, idxB); + int[] keys = tables.bigramKeys; + int mask = keys.length - 1; + int h = mixIndexKey(packedKey) & mask; + while (true) { + int k = keys[h]; + if (k == V7Tables.EMPTY_KEY) return -1; + if (k == packedKey) return h; + h = (h + 1) & mask; + } + } + + private static double unigramLogProb(V7Tables tables, int idx) { + if (idx < 0) { + return tables.unigramFallbackLogProb; + } + return dequantize(tables.unigramTable[idx], + tables.unigramQuantMin, tables.unigramQuantMax); + } + + private static float dequantize(byte b, float min, float max) { + int u = b & 0xFF; + return min + (u / 255.0f) * (max - min); + } + + /** + * Computes the global script-transition z-score for the whole input + * string against this model's loaded tables. Thin wrapper around the + * public static {@link #computeZ4ScriptTransition} helper — same math, + * just preloaded with this instance's parameters. 
+ */ + private float computeScriptTransitionZ(String text) { + return computeZ4ScriptTransition(text, + scriptTransitionTable, scriptTransitionCalibration, + scriptBucketIndex, numScriptBuckets); } /** diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index f1de37d989a..72e51e8094f 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -25,6 +25,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +41,6 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.quality.TextQualityComparison; import org.apache.tika.quality.TextQualityDetector; -import org.apache.tika.quality.TextQualityScore; /** * A {@link MetaEncodingDetector} that arbitrates charset candidates by @@ -76,34 +77,6 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { * default read limit used by the charset base detectors. */ private static final int DEFAULT_READ_LIMIT = 16384; - // --------------------------------------------------------------------- - // TACTICAL: declarative-override gate constants. - // - // These exist to compensate for known per-script calibration unevenness - // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/ - // BENGALI floors too strict). They produce wrong tournaments when an - // honest in-document declaration (`` / XML decl) decodes - // to sparse non-Latin content that scores junky-but-correct, while a - // statistical pick decodes to dense mojibake-Han that scores decent- - // but-wrong. 
See `analyses/2026-04-26-tika-eval-charset-and-other.md` - // and the indic-collapse + Korean+Hanja fixtures. - // - // REMOVE when the quality scorer is recalibrated per-script — the - // tournament should then be reliable on its own. - // --------------------------------------------------------------------- - - /** Maximum delta in z-score units we tolerate before honoring the - * in-document declaration over the tournament winner. Tuned so that - * small same-script-different-codepage deltas (windows-1252 vs - * windows-1257 ≈ 1-2 units) don't trigger override when scripts - * match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */ - private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f; - - /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared - * decoder's output. Above this, the declared charset clearly cannot - * decode the bytes and we should not honor the declaration. */ - private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01; - /** Cached quality detector. {@code null} if none is on the classpath. */ private final TextQualityDetector qualityDetector; @@ -187,10 +160,21 @@ public List detect(TikaInputStream tis, Metadata metadata, // Decode probe under each candidate, preserving insertion order so // tournament seeding is deterministic. + // + // Each decoded string is then run through HTML entity expansion. + // For entity-encoded HTML (numeric refs like ്), this is + // load-bearing: entity refs are ASCII bytes that decode identically + // under every candidate charset, so they don't differentiate. + // After expansion they become real codepoints — and crucially, in + // the *wrong* decoding (e.g. mojibake-as-HAN), they introduce + // cross-script transitions (HAN ↔ MALAYALAM mid-document) that the + // quality detector's script-transition feature correctly penalises. + // See `20260512-junkdetector-codepoint-hash-plan.md` (AIT5 case). 
Map candidates = new LinkedHashMap<>(); for (Charset cs : uniqueCharsets) { String decoded = safeDecode(forDecode, cs); if (decoded != null && !decoded.isEmpty()) { + decoded = expandHtmlEntities(decoded); candidates.put(cs, decoded); if (LOG.isTraceEnabled()) { int sampleLen = Math.min(400, decoded.length()); @@ -246,148 +230,17 @@ public List detect(TikaInputStream tis, Metadata metadata, champion.getKey().name(), challenger.getKey().name(), cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", cmp.delta()), cmp.scoreA(), cmp.scoreB()); - if ("B".equals(cmp.winner())) { + if (challenger.getKey().name().equals(cmp.winner())) { champion = challenger; } } LOG.trace("junk-filter -> {} (tournament champion)", champion.getKey().name()); - // TACTICAL: declarative override. See class-level comment block. - // REMOVE when quality scorer is recalibrated per-script. - Charset declarativeOverride = applyInDocumentDeclarativeOverride( - context, candidates, champion.getKey()); - if (declarativeOverride != null) { - float conf = context.getTopConfidenceFor(declarativeOverride); - context.setArbitrationInfo("junk-filter-declarative-override"); - LOG.trace("junk-filter -> {} (declarative override of tournament winner {})", - declarativeOverride.name(), champion.getKey().name()); - return List.of(new EncodingResult(declarativeOverride, conf)); - } - float confidence = context.getTopConfidenceFor(champion.getKey()); context.setArbitrationInfo("junk-filter-selected"); return List.of(new EncodingResult(champion.getKey(), confidence)); } - /** - * Tactical fix: honor an in-document {@code } or XML - * declaration when the quality scorer's per-script calibration unevenness - * would otherwise mis-rank candidates of different scripts. - * - *

Returns the in-document declared charset to use, or {@code null} to - * leave the tournament winner intact.

- * - *

Gates (all must hold to override):

- *
    - *
  1. (a) Decode is mostly clean: declared decoder produces - * fewer than {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per char.
  2. - *
  3. (b) Both decoded: declared and tournament winner are - * both in the candidate map (already guaranteed by upstream code).
  4. - *
  5. (c) Quality gap small: tournament winner's z-score - * is not vastly higher than the declared's; specifically - * {@code winner.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.
  6. - *
  7. (d) Different scripts: declared and winner classify - * as different scripts. Same-script Latin-cousin lies (e.g. windows-1252 - * declared on a windows-1257 file) fall through to the tournament, - * which correctly handles them via byte-distribution scoring.
  8. - *
- * - *

"In-document" means {@code HtmlEncodingDetector} or any future XML-decl - * source — explicitly NOT {@code MetadataCharsetDetector} (outer Content-Type - * header), which is more often wrong.

- */ - private Charset applyInDocumentDeclarativeOverride( - EncodingDetectorContext context, - Map candidates, - Charset champion) { - Charset declared = findInDocumentDeclarative(context); - if (declared == null) { - return null; - } - if (declared.equals(champion)) { - return null; // already winning - } - // Per HTML5 spec, cannot validly declare UTF-16 / UTF-32: - // the meta tag itself is bytes that have to be parsed before its - // declaration is known, and UTF-16/32 require a BOM. If the - // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs - // first in the chain), we treat the declaration as invalid and let - // the tournament winner stand. This catches govdocs1-style "utf-16 - // declared on a Latin file" lies that would otherwise look like a - // legitimate script-mismatch override. - String declaredName = declared.name(); - if (declaredName.startsWith("UTF-16") || declaredName.startsWith("UTF-32")) { - LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in (HTML5 invalid)"); - return null; - } - String championText = candidates.get(champion); - String declaredText = candidates.get(declared); - if (declaredText == null || championText == null) { - return null; // failed to decode - } - // (a) decode mostly clean - double fffdRate = replacementCharRate(declaredText); - if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) { - LOG.trace("junk-filter declarative-override skipped: U+FFFD rate {} > {}", - fffdRate, DECLARATIVE_MAX_FFFD_RATE); - return null; - } - TextQualityScore declaredScore = qualityDetector.score(declaredText); - TextQualityScore championScore = qualityDetector.score(championText); - // (c) winner not vastly higher - float delta = championScore.getZScore() - declaredScore.getZScore(); - if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) { - LOG.trace("junk-filter declarative-override skipped: delta {} > {}", - delta, DECLARATIVE_OVERRIDE_MAX_DELTA); - return null; - } - // (d) different scripts - String declaredScript = 
declaredScore.getDominantScript(); - String championScript = championScore.getDominantScript(); - if (declaredScript == null || declaredScript.equals(championScript)) { - LOG.trace("junk-filter declarative-override skipped: same script {}", - declaredScript); - return null; - } - LOG.trace("junk-filter declarative-override fires: declared={} (script={}, z={}) vs winner={} (script={}, z={}) delta={}", - declared.name(), declaredScript, declaredScore.getZScore(), - champion.name(), championScript, championScore.getZScore(), delta); - return declared; - } - - /** - * Find the first in-document DECLARATIVE candidate (from - * {@code HtmlEncodingDetector} / XML declaration), or {@code null}. - * Outer Content-Type metadata ({@code MetadataCharsetDetector}) is - * intentionally excluded — those headers lie too often. - */ - private static Charset findInDocumentDeclarative(EncodingDetectorContext context) { - for (EncodingDetectorContext.Result r : context.getResults()) { - String name = r.getDetectorName(); - if (("HtmlEncodingDetector".equals(name) - || "StandardHtmlEncodingDetector".equals(name)) - && r.getResultType() == EncodingResult.ResultType.DECLARATIVE) { - return r.getCharset(); - } - } - return null; - } - - /** Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) in the decoded String — - * a proxy for "this charset cannot decode these bytes". */ - private static double replacementCharRate(String s) { - if (s.isEmpty()) { - return 0.0; - } - long count = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '�') { - count++; - } - } - return (double) count / s.length(); - } - /** * Return the first DECLARATIVE charset whose decoded output equals at * least one other candidate's, or {@code null}. 
@@ -459,6 +312,69 @@ private static String safeDecode(byte[] bytes, Charset charset) { } } + // ----------------------------------------------------------------------- + // HTML entity expansion + // + // Applied to every decoded candidate before quality scoring. Resolves + // numeric character refs (&#NNNN; / &#xHHHH;) to their target codepoints + // and a small set of common named entities. Malformed entities pass + // through as literal text. Sufficient for the AIT5-class failure + // mode where blogspot/news pages use numeric Malayalam/Bengali entities + // intermixed with raw UTF-8 codepoints. + // ----------------------------------------------------------------------- + + private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern ENTITY_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern ENTITY_NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + /** + * Expands HTML numeric and a small set of named entity references in + * {@code s}. Malformed or out-of-range entities pass through unchanged. + * The named-entity set is intentionally small — only the universally- + * declared HTML5 entities that don't depend on a DOCTYPE. Anything more + * exotic stays as a literal entity reference (which scores as ASCII noise, + * the same as it would have before). 
+ */ + static String expandHtmlEntities(String s) { + s = ENTITY_DEC.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — fall through, leave entity literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — fall through, leave entity literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } + /** * Strip a leading byte-order mark, if any. UTF-32 signatures are * checked before UTF-16 because the UTF-32 LE BOM ({@code FF FE 00 00}) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java new file mode 100644 index 00000000000..ab7e1b00b7e --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java @@ -0,0 +1,445 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +/** + * Static codepoint-range → bucket-index lookup table used by Feature 2 + * (block-transition log-probability). Replaces + * {@link Character.UnicodeBlock#of(int)} so that the model's block + * semantics are fully decoupled from the JVM's Unicode-data release — + * training on one JDK and serving on another produces identical scores + * by construction. + * + *

The 338 named blocks are a snapshot from JDK 25's + * {@link Character.UnicodeBlock} (Unicode 16.x). Codepoints in gaps + * between named blocks resolve to the {@link #UNASSIGNED} bucket + * ({@value #UNASSIGNED}). The total bucket count is + * {@link #bucketCount()} = 339. + * + *

If the block list is ever updated, bump {@link #SCHEME_VERSION} — + * the model file's {@code block_scheme_version} byte must match. This + * forces a clean retrain rather than silent re-mapping. + * + *

Lookup cost: O(log N) binary search. Thread-safe, immutable. + */ +public final class UnicodeBlockRanges { + + /** + * Bumped whenever the static range table below changes. A model + * trained against scheme version X cannot be served by code at + * version Y ≠ X — the loader rejects the mismatch. + */ + public static final int SCHEME_VERSION = 1; + + /** Bucket index returned for codepoints in no named block. */ + public static final int UNASSIGNED = 338; + + /** + * Sorted by {@code start_cp}. Each row: {@code {start, end_inclusive, bucket_id}}. + * Bucket ids are 0..337 — the {@link #UNASSIGNED} bucket has id 338 + * and is implicit (returned when binary search finds no matching range). + * + *

Generated from JDK 25 {@code Character.UnicodeBlock.of(cp)} for + * every codepoint in [0, 0x10FFFF]. + */ + private static final int[][] RANGES = { + {0x0000, 0x007F, 0}, // BASIC_LATIN + {0x0080, 0x00FF, 1}, // LATIN_1_SUPPLEMENT + {0x0100, 0x017F, 2}, // LATIN_EXTENDED_A + {0x0180, 0x024F, 3}, // LATIN_EXTENDED_B + {0x0250, 0x02AF, 4}, // IPA_EXTENSIONS + {0x02B0, 0x02FF, 5}, // SPACING_MODIFIER_LETTERS + {0x0300, 0x036F, 6}, // COMBINING_DIACRITICAL_MARKS + {0x0370, 0x03FF, 7}, // GREEK + {0x0400, 0x04FF, 8}, // CYRILLIC + {0x0500, 0x052F, 9}, // CYRILLIC_SUPPLEMENTARY + {0x0530, 0x058F, 10}, // ARMENIAN + {0x0590, 0x05FF, 11}, // HEBREW + {0x0600, 0x06FF, 12}, // ARABIC + {0x0700, 0x074F, 13}, // SYRIAC + {0x0750, 0x077F, 14}, // ARABIC_SUPPLEMENT + {0x0780, 0x07BF, 15}, // THAANA + {0x07C0, 0x07FF, 16}, // NKO + {0x0800, 0x083F, 17}, // SAMARITAN + {0x0840, 0x085F, 18}, // MANDAIC + {0x0860, 0x086F, 19}, // SYRIAC_SUPPLEMENT + {0x0870, 0x089F, 20}, // ARABIC_EXTENDED_B + {0x08A0, 0x08FF, 21}, // ARABIC_EXTENDED_A + {0x0900, 0x097F, 22}, // DEVANAGARI + {0x0980, 0x09FF, 23}, // BENGALI + {0x0A00, 0x0A7F, 24}, // GURMUKHI + {0x0A80, 0x0AFF, 25}, // GUJARATI + {0x0B00, 0x0B7F, 26}, // ORIYA + {0x0B80, 0x0BFF, 27}, // TAMIL + {0x0C00, 0x0C7F, 28}, // TELUGU + {0x0C80, 0x0CFF, 29}, // KANNADA + {0x0D00, 0x0D7F, 30}, // MALAYALAM + {0x0D80, 0x0DFF, 31}, // SINHALA + {0x0E00, 0x0E7F, 32}, // THAI + {0x0E80, 0x0EFF, 33}, // LAO + {0x0F00, 0x0FFF, 34}, // TIBETAN + {0x1000, 0x109F, 35}, // MYANMAR + {0x10A0, 0x10FF, 36}, // GEORGIAN + {0x1100, 0x11FF, 37}, // HANGUL_JAMO + {0x1200, 0x137F, 38}, // ETHIOPIC + {0x1380, 0x139F, 39}, // ETHIOPIC_SUPPLEMENT + {0x13A0, 0x13FF, 40}, // CHEROKEE + {0x1400, 0x167F, 41}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS + {0x1680, 0x169F, 42}, // OGHAM + {0x16A0, 0x16FF, 43}, // RUNIC + {0x1700, 0x171F, 44}, // TAGALOG + {0x1720, 0x173F, 45}, // HANUNOO + {0x1740, 0x175F, 46}, // BUHID + {0x1760, 0x177F, 47}, // TAGBANWA + {0x1780, 
0x17FF, 48}, // KHMER + {0x1800, 0x18AF, 49}, // MONGOLIAN + {0x18B0, 0x18FF, 50}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED + {0x1900, 0x194F, 51}, // LIMBU + {0x1950, 0x197F, 52}, // TAI_LE + {0x1980, 0x19DF, 53}, // NEW_TAI_LUE + {0x19E0, 0x19FF, 54}, // KHMER_SYMBOLS + {0x1A00, 0x1A1F, 55}, // BUGINESE + {0x1A20, 0x1AAF, 56}, // TAI_THAM + {0x1AB0, 0x1AFF, 57}, // COMBINING_DIACRITICAL_MARKS_EXTENDED + {0x1B00, 0x1B7F, 58}, // BALINESE + {0x1B80, 0x1BBF, 59}, // SUNDANESE + {0x1BC0, 0x1BFF, 60}, // BATAK + {0x1C00, 0x1C4F, 61}, // LEPCHA + {0x1C50, 0x1C7F, 62}, // OL_CHIKI + {0x1C80, 0x1C8F, 63}, // CYRILLIC_EXTENDED_C + {0x1C90, 0x1CBF, 64}, // GEORGIAN_EXTENDED + {0x1CC0, 0x1CCF, 65}, // SUNDANESE_SUPPLEMENT + {0x1CD0, 0x1CFF, 66}, // VEDIC_EXTENSIONS + {0x1D00, 0x1D7F, 67}, // PHONETIC_EXTENSIONS + {0x1D80, 0x1DBF, 68}, // PHONETIC_EXTENSIONS_SUPPLEMENT + {0x1DC0, 0x1DFF, 69}, // COMBINING_DIACRITICAL_MARKS_SUPPLEMENT + {0x1E00, 0x1EFF, 70}, // LATIN_EXTENDED_ADDITIONAL + {0x1F00, 0x1FFF, 71}, // GREEK_EXTENDED + {0x2000, 0x206F, 72}, // GENERAL_PUNCTUATION + {0x2070, 0x209F, 73}, // SUPERSCRIPTS_AND_SUBSCRIPTS + {0x20A0, 0x20CF, 74}, // CURRENCY_SYMBOLS + {0x20D0, 0x20FF, 75}, // COMBINING_MARKS_FOR_SYMBOLS + {0x2100, 0x214F, 76}, // LETTERLIKE_SYMBOLS + {0x2150, 0x218F, 77}, // NUMBER_FORMS + {0x2190, 0x21FF, 78}, // ARROWS + {0x2200, 0x22FF, 79}, // MATHEMATICAL_OPERATORS + {0x2300, 0x23FF, 80}, // MISCELLANEOUS_TECHNICAL + {0x2400, 0x243F, 81}, // CONTROL_PICTURES + {0x2440, 0x245F, 82}, // OPTICAL_CHARACTER_RECOGNITION + {0x2460, 0x24FF, 83}, // ENCLOSED_ALPHANUMERICS + {0x2500, 0x257F, 84}, // BOX_DRAWING + {0x2580, 0x259F, 85}, // BLOCK_ELEMENTS + {0x25A0, 0x25FF, 86}, // GEOMETRIC_SHAPES + {0x2600, 0x26FF, 87}, // MISCELLANEOUS_SYMBOLS + {0x2700, 0x27BF, 88}, // DINGBATS + {0x27C0, 0x27EF, 89}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A + {0x27F0, 0x27FF, 90}, // SUPPLEMENTAL_ARROWS_A + {0x2800, 0x28FF, 91}, // BRAILLE_PATTERNS + {0x2900, 
0x297F, 92}, // SUPPLEMENTAL_ARROWS_B + {0x2980, 0x29FF, 93}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B + {0x2A00, 0x2AFF, 94}, // SUPPLEMENTAL_MATHEMATICAL_OPERATORS + {0x2B00, 0x2BFF, 95}, // MISCELLANEOUS_SYMBOLS_AND_ARROWS + {0x2C00, 0x2C5F, 96}, // GLAGOLITIC + {0x2C60, 0x2C7F, 97}, // LATIN_EXTENDED_C + {0x2C80, 0x2CFF, 98}, // COPTIC + {0x2D00, 0x2D2F, 99}, // GEORGIAN_SUPPLEMENT + {0x2D30, 0x2D7F, 100}, // TIFINAGH + {0x2D80, 0x2DDF, 101}, // ETHIOPIC_EXTENDED + {0x2DE0, 0x2DFF, 102}, // CYRILLIC_EXTENDED_A + {0x2E00, 0x2E7F, 103}, // SUPPLEMENTAL_PUNCTUATION + {0x2E80, 0x2EFF, 104}, // CJK_RADICALS_SUPPLEMENT + {0x2F00, 0x2FDF, 105}, // KANGXI_RADICALS + {0x2FF0, 0x2FFF, 106}, // IDEOGRAPHIC_DESCRIPTION_CHARACTERS + {0x3000, 0x303F, 107}, // CJK_SYMBOLS_AND_PUNCTUATION + {0x3040, 0x309F, 108}, // HIRAGANA + {0x30A0, 0x30FF, 109}, // KATAKANA + {0x3100, 0x312F, 110}, // BOPOMOFO + {0x3130, 0x318F, 111}, // HANGUL_COMPATIBILITY_JAMO + {0x3190, 0x319F, 112}, // KANBUN + {0x31A0, 0x31BF, 113}, // BOPOMOFO_EXTENDED + {0x31C0, 0x31EF, 114}, // CJK_STROKES + {0x31F0, 0x31FF, 115}, // KATAKANA_PHONETIC_EXTENSIONS + {0x3200, 0x32FF, 116}, // ENCLOSED_CJK_LETTERS_AND_MONTHS + {0x3300, 0x33FF, 117}, // CJK_COMPATIBILITY + {0x3400, 0x4DBF, 118}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + {0x4DC0, 0x4DFF, 119}, // YIJING_HEXAGRAM_SYMBOLS + {0x4E00, 0x9FFF, 120}, // CJK_UNIFIED_IDEOGRAPHS + {0xA000, 0xA48F, 121}, // YI_SYLLABLES + {0xA490, 0xA4CF, 122}, // YI_RADICALS + {0xA4D0, 0xA4FF, 123}, // LISU + {0xA500, 0xA63F, 124}, // VAI + {0xA640, 0xA69F, 125}, // CYRILLIC_EXTENDED_B + {0xA6A0, 0xA6FF, 126}, // BAMUM + {0xA700, 0xA71F, 127}, // MODIFIER_TONE_LETTERS + {0xA720, 0xA7FF, 128}, // LATIN_EXTENDED_D + {0xA800, 0xA82F, 129}, // SYLOTI_NAGRI + {0xA830, 0xA83F, 130}, // COMMON_INDIC_NUMBER_FORMS + {0xA840, 0xA87F, 131}, // PHAGS_PA + {0xA880, 0xA8DF, 132}, // SAURASHTRA + {0xA8E0, 0xA8FF, 133}, // DEVANAGARI_EXTENDED + {0xA900, 0xA92F, 134}, // KAYAH_LI + {0xA930, 0xA95F, 
135}, // REJANG + {0xA960, 0xA97F, 136}, // HANGUL_JAMO_EXTENDED_A + {0xA980, 0xA9DF, 137}, // JAVANESE + {0xA9E0, 0xA9FF, 138}, // MYANMAR_EXTENDED_B + {0xAA00, 0xAA5F, 139}, // CHAM + {0xAA60, 0xAA7F, 140}, // MYANMAR_EXTENDED_A + {0xAA80, 0xAADF, 141}, // TAI_VIET + {0xAAE0, 0xAAFF, 142}, // MEETEI_MAYEK_EXTENSIONS + {0xAB00, 0xAB2F, 143}, // ETHIOPIC_EXTENDED_A + {0xAB30, 0xAB6F, 144}, // LATIN_EXTENDED_E + {0xAB70, 0xABBF, 145}, // CHEROKEE_SUPPLEMENT + {0xABC0, 0xABFF, 146}, // MEETEI_MAYEK + {0xAC00, 0xD7AF, 147}, // HANGUL_SYLLABLES + {0xD7B0, 0xD7FF, 148}, // HANGUL_JAMO_EXTENDED_B + {0xD800, 0xDB7F, 149}, // HIGH_SURROGATES + {0xDB80, 0xDBFF, 150}, // HIGH_PRIVATE_USE_SURROGATES + {0xDC00, 0xDFFF, 151}, // LOW_SURROGATES + {0xE000, 0xF8FF, 152}, // PRIVATE_USE_AREA + {0xF900, 0xFAFF, 153}, // CJK_COMPATIBILITY_IDEOGRAPHS + {0xFB00, 0xFB4F, 154}, // ALPHABETIC_PRESENTATION_FORMS + {0xFB50, 0xFDFF, 155}, // ARABIC_PRESENTATION_FORMS_A + {0xFE00, 0xFE0F, 156}, // VARIATION_SELECTORS + {0xFE10, 0xFE1F, 157}, // VERTICAL_FORMS + {0xFE20, 0xFE2F, 158}, // COMBINING_HALF_MARKS + {0xFE30, 0xFE4F, 159}, // CJK_COMPATIBILITY_FORMS + {0xFE50, 0xFE6F, 160}, // SMALL_FORM_VARIANTS + {0xFE70, 0xFEFF, 161}, // ARABIC_PRESENTATION_FORMS_B + {0xFF00, 0xFFEF, 162}, // HALFWIDTH_AND_FULLWIDTH_FORMS + {0xFFF0, 0xFFFF, 163}, // SPECIALS + {0x10000, 0x1007F, 164}, // LINEAR_B_SYLLABARY + {0x10080, 0x100FF, 165}, // LINEAR_B_IDEOGRAMS + {0x10100, 0x1013F, 166}, // AEGEAN_NUMBERS + {0x10140, 0x1018F, 167}, // ANCIENT_GREEK_NUMBERS + {0x10190, 0x101CF, 168}, // ANCIENT_SYMBOLS + {0x101D0, 0x101FF, 169}, // PHAISTOS_DISC + {0x10280, 0x1029F, 170}, // LYCIAN + {0x102A0, 0x102DF, 171}, // CARIAN + {0x102E0, 0x102FF, 172}, // COPTIC_EPACT_NUMBERS + {0x10300, 0x1032F, 173}, // OLD_ITALIC + {0x10330, 0x1034F, 174}, // GOTHIC + {0x10350, 0x1037F, 175}, // OLD_PERMIC + {0x10380, 0x1039F, 176}, // UGARITIC + {0x103A0, 0x103DF, 177}, // OLD_PERSIAN + {0x10400, 0x1044F, 178}, // DESERET + 
{0x10450, 0x1047F, 179}, // SHAVIAN + {0x10480, 0x104AF, 180}, // OSMANYA + {0x104B0, 0x104FF, 181}, // OSAGE + {0x10500, 0x1052F, 182}, // ELBASAN + {0x10530, 0x1056F, 183}, // CAUCASIAN_ALBANIAN + {0x10570, 0x105BF, 184}, // VITHKUQI + {0x105C0, 0x105FF, 185}, // TODHRI + {0x10600, 0x1077F, 186}, // LINEAR_A + {0x10780, 0x107BF, 187}, // LATIN_EXTENDED_F + {0x10800, 0x1083F, 188}, // CYPRIOT_SYLLABARY + {0x10840, 0x1085F, 189}, // IMPERIAL_ARAMAIC + {0x10860, 0x1087F, 190}, // PALMYRENE + {0x10880, 0x108AF, 191}, // NABATAEAN + {0x108E0, 0x108FF, 192}, // HATRAN + {0x10900, 0x1091F, 193}, // PHOENICIAN + {0x10920, 0x1093F, 194}, // LYDIAN + {0x10980, 0x1099F, 195}, // MEROITIC_HIEROGLYPHS + {0x109A0, 0x109FF, 196}, // MEROITIC_CURSIVE + {0x10A00, 0x10A5F, 197}, // KHAROSHTHI + {0x10A60, 0x10A7F, 198}, // OLD_SOUTH_ARABIAN + {0x10A80, 0x10A9F, 199}, // OLD_NORTH_ARABIAN + {0x10AC0, 0x10AFF, 200}, // MANICHAEAN + {0x10B00, 0x10B3F, 201}, // AVESTAN + {0x10B40, 0x10B5F, 202}, // INSCRIPTIONAL_PARTHIAN + {0x10B60, 0x10B7F, 203}, // INSCRIPTIONAL_PAHLAVI + {0x10B80, 0x10BAF, 204}, // PSALTER_PAHLAVI + {0x10C00, 0x10C4F, 205}, // OLD_TURKIC + {0x10C80, 0x10CFF, 206}, // OLD_HUNGARIAN + {0x10D00, 0x10D3F, 207}, // HANIFI_ROHINGYA + {0x10D40, 0x10D8F, 208}, // GARAY + {0x10E60, 0x10E7F, 209}, // RUMI_NUMERAL_SYMBOLS + {0x10E80, 0x10EBF, 210}, // YEZIDI + {0x10EC0, 0x10EFF, 211}, // ARABIC_EXTENDED_C + {0x10F00, 0x10F2F, 212}, // OLD_SOGDIAN + {0x10F30, 0x10F6F, 213}, // SOGDIAN + {0x10F70, 0x10FAF, 214}, // OLD_UYGHUR + {0x10FB0, 0x10FDF, 215}, // CHORASMIAN + {0x10FE0, 0x10FFF, 216}, // ELYMAIC + {0x11000, 0x1107F, 217}, // BRAHMI + {0x11080, 0x110CF, 218}, // KAITHI + {0x110D0, 0x110FF, 219}, // SORA_SOMPENG + {0x11100, 0x1114F, 220}, // CHAKMA + {0x11150, 0x1117F, 221}, // MAHAJANI + {0x11180, 0x111DF, 222}, // SHARADA + {0x111E0, 0x111FF, 223}, // SINHALA_ARCHAIC_NUMBERS + {0x11200, 0x1124F, 224}, // KHOJKI + {0x11280, 0x112AF, 225}, // MULTANI + {0x112B0, 0x112FF, 
226}, // KHUDAWADI + {0x11300, 0x1137F, 227}, // GRANTHA + {0x11380, 0x113FF, 228}, // TULU_TIGALARI + {0x11400, 0x1147F, 229}, // NEWA + {0x11480, 0x114DF, 230}, // TIRHUTA + {0x11580, 0x115FF, 231}, // SIDDHAM + {0x11600, 0x1165F, 232}, // MODI + {0x11660, 0x1167F, 233}, // MONGOLIAN_SUPPLEMENT + {0x11680, 0x116CF, 234}, // TAKRI + {0x116D0, 0x116FF, 235}, // MYANMAR_EXTENDED_C + {0x11700, 0x1174F, 236}, // AHOM + {0x11800, 0x1184F, 237}, // DOGRA + {0x118A0, 0x118FF, 238}, // WARANG_CITI + {0x11900, 0x1195F, 239}, // DIVES_AKURU + {0x119A0, 0x119FF, 240}, // NANDINAGARI + {0x11A00, 0x11A4F, 241}, // ZANABAZAR_SQUARE + {0x11A50, 0x11AAF, 242}, // SOYOMBO + {0x11AB0, 0x11ABF, 243}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A + {0x11AC0, 0x11AFF, 244}, // PAU_CIN_HAU + {0x11B00, 0x11B5F, 245}, // DEVANAGARI_EXTENDED_A + {0x11BC0, 0x11BFF, 246}, // SUNUWAR + {0x11C00, 0x11C6F, 247}, // BHAIKSUKI + {0x11C70, 0x11CBF, 248}, // MARCHEN + {0x11D00, 0x11D5F, 249}, // MASARAM_GONDI + {0x11D60, 0x11DAF, 250}, // GUNJALA_GONDI + {0x11EE0, 0x11EFF, 251}, // MAKASAR + {0x11F00, 0x11F5F, 252}, // KAWI + {0x11FB0, 0x11FBF, 253}, // LISU_SUPPLEMENT + {0x11FC0, 0x11FFF, 254}, // TAMIL_SUPPLEMENT + {0x12000, 0x123FF, 255}, // CUNEIFORM + {0x12400, 0x1247F, 256}, // CUNEIFORM_NUMBERS_AND_PUNCTUATION + {0x12480, 0x1254F, 257}, // EARLY_DYNASTIC_CUNEIFORM + {0x12F90, 0x12FFF, 258}, // CYPRO_MINOAN + {0x13000, 0x1342F, 259}, // EGYPTIAN_HIEROGLYPHS + {0x13430, 0x1345F, 260}, // EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS + {0x13460, 0x143FF, 261}, // EGYPTIAN_HIEROGLYPHS_EXTENDED_A + {0x14400, 0x1467F, 262}, // ANATOLIAN_HIEROGLYPHS + {0x16100, 0x1613F, 263}, // GURUNG_KHEMA + {0x16800, 0x16A3F, 264}, // BAMUM_SUPPLEMENT + {0x16A40, 0x16A6F, 265}, // MRO + {0x16A70, 0x16ACF, 266}, // TANGSA + {0x16AD0, 0x16AFF, 267}, // BASSA_VAH + {0x16B00, 0x16B8F, 268}, // PAHAWH_HMONG + {0x16D40, 0x16D7F, 269}, // KIRAT_RAI + {0x16E40, 0x16E9F, 270}, // MEDEFAIDRIN + {0x16F00, 0x16F9F, 271}, // 
MIAO + {0x16FE0, 0x16FFF, 272}, // IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION + {0x17000, 0x187FF, 273}, // TANGUT + {0x18800, 0x18AFF, 274}, // TANGUT_COMPONENTS + {0x18B00, 0x18CFF, 275}, // KHITAN_SMALL_SCRIPT + {0x18D00, 0x18D7F, 276}, // TANGUT_SUPPLEMENT + {0x1AFF0, 0x1AFFF, 277}, // KANA_EXTENDED_B + {0x1B000, 0x1B0FF, 278}, // KANA_SUPPLEMENT + {0x1B100, 0x1B12F, 279}, // KANA_EXTENDED_A + {0x1B130, 0x1B16F, 280}, // SMALL_KANA_EXTENSION + {0x1B170, 0x1B2FF, 281}, // NUSHU + {0x1BC00, 0x1BC9F, 282}, // DUPLOYAN + {0x1BCA0, 0x1BCAF, 283}, // SHORTHAND_FORMAT_CONTROLS + {0x1CC00, 0x1CEBF, 284}, // SYMBOLS_FOR_LEGACY_COMPUTING_SUPPLEMENT + {0x1CF00, 0x1CFCF, 285}, // ZNAMENNY_MUSICAL_NOTATION + {0x1D000, 0x1D0FF, 286}, // BYZANTINE_MUSICAL_SYMBOLS + {0x1D100, 0x1D1FF, 287}, // MUSICAL_SYMBOLS + {0x1D200, 0x1D24F, 288}, // ANCIENT_GREEK_MUSICAL_NOTATION + {0x1D2C0, 0x1D2DF, 289}, // KAKTOVIK_NUMERALS + {0x1D2E0, 0x1D2FF, 290}, // MAYAN_NUMERALS + {0x1D300, 0x1D35F, 291}, // TAI_XUAN_JING_SYMBOLS + {0x1D360, 0x1D37F, 292}, // COUNTING_ROD_NUMERALS + {0x1D400, 0x1D7FF, 293}, // MATHEMATICAL_ALPHANUMERIC_SYMBOLS + {0x1D800, 0x1DAAF, 294}, // SUTTON_SIGNWRITING + {0x1DF00, 0x1DFFF, 295}, // LATIN_EXTENDED_G + {0x1E000, 0x1E02F, 296}, // GLAGOLITIC_SUPPLEMENT + {0x1E030, 0x1E08F, 297}, // CYRILLIC_EXTENDED_D + {0x1E100, 0x1E14F, 298}, // NYIAKENG_PUACHUE_HMONG + {0x1E290, 0x1E2BF, 299}, // TOTO + {0x1E2C0, 0x1E2FF, 300}, // WANCHO + {0x1E4D0, 0x1E4FF, 301}, // NAG_MUNDARI + {0x1E5D0, 0x1E5FF, 302}, // OL_ONAL + {0x1E7E0, 0x1E7FF, 303}, // ETHIOPIC_EXTENDED_B + {0x1E800, 0x1E8DF, 304}, // MENDE_KIKAKUI + {0x1E900, 0x1E95F, 305}, // ADLAM + {0x1EC70, 0x1ECBF, 306}, // INDIC_SIYAQ_NUMBERS + {0x1ED00, 0x1ED4F, 307}, // OTTOMAN_SIYAQ_NUMBERS + {0x1EE00, 0x1EEFF, 308}, // ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS + {0x1F000, 0x1F02F, 309}, // MAHJONG_TILES + {0x1F030, 0x1F09F, 310}, // DOMINO_TILES + {0x1F0A0, 0x1F0FF, 311}, // PLAYING_CARDS + {0x1F100, 0x1F1FF, 312}, // 
ENCLOSED_ALPHANUMERIC_SUPPLEMENT + {0x1F200, 0x1F2FF, 313}, // ENCLOSED_IDEOGRAPHIC_SUPPLEMENT + {0x1F300, 0x1F5FF, 314}, // MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS + {0x1F600, 0x1F64F, 315}, // EMOTICONS + {0x1F650, 0x1F67F, 316}, // ORNAMENTAL_DINGBATS + {0x1F680, 0x1F6FF, 317}, // TRANSPORT_AND_MAP_SYMBOLS + {0x1F700, 0x1F77F, 318}, // ALCHEMICAL_SYMBOLS + {0x1F780, 0x1F7FF, 319}, // GEOMETRIC_SHAPES_EXTENDED + {0x1F800, 0x1F8FF, 320}, // SUPPLEMENTAL_ARROWS_C + {0x1F900, 0x1F9FF, 321}, // SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS + {0x1FA00, 0x1FA6F, 322}, // CHESS_SYMBOLS + {0x1FA70, 0x1FAFF, 323}, // SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A + {0x1FB00, 0x1FBFF, 324}, // SYMBOLS_FOR_LEGACY_COMPUTING + {0x20000, 0x2A6DF, 325}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B + {0x2A700, 0x2B73F, 326}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C + {0x2B740, 0x2B81F, 327}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D + {0x2B820, 0x2CEAF, 328}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E + {0x2CEB0, 0x2EBEF, 329}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F + {0x2EBF0, 0x2EE5F, 330}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I + {0x2F800, 0x2FA1F, 331}, // CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT + {0x30000, 0x3134F, 332}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G + {0x31350, 0x323AF, 333}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H + {0xE0000, 0xE007F, 334}, // TAGS + {0xE0100, 0xE01EF, 335}, // VARIATION_SELECTORS_SUPPLEMENT + {0xF0000, 0xFFFFF, 336}, // SUPPLEMENTARY_PRIVATE_USE_AREA_A + {0x100000, 0x10FFFF, 337}, // SUPPLEMENTARY_PRIVATE_USE_AREA_B + }; + + /** Cached start_cp array for binary search. */ + private static final int[] STARTS; + static { + STARTS = new int[RANGES.length]; + for (int i = 0; i < RANGES.length; i++) { + STARTS[i] = RANGES[i][0]; + } + } + + private UnicodeBlockRanges() { + // utility class + } + + /** Total number of buckets (named blocks + 1 unassigned). 
*/ + public static int bucketCount() { + return RANGES.length + 1; + } + + /** + * Returns the bucket id for the given codepoint, or {@link #UNASSIGNED} + * if the codepoint falls outside every named block range. + * + *

Binary search over the sorted-by-{@code start_cp} range list: + * O(log N) where N = {@value #UNASSIGNED} (the number of named blocks). + */ + public static int bucketOf(int cp) { + // Binary search: find largest STARTS[i] <= cp + int lo = 0; + int hi = STARTS.length - 1; + int found = -1; + while (lo <= hi) { + int mid = (lo + hi) >>> 1; + if (STARTS[mid] <= cp) { + found = mid; + lo = mid + 1; + } else { + hi = mid - 1; + } + } + if (found < 0) { + return UNASSIGNED; + } + // RANGES[found] is the candidate. Confirm cp is within end_inclusive. + return cp <= RANGES[found][1] ? RANGES[found][2] : UNASSIGNED; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java new file mode 100644 index 00000000000..93a82640caa --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Carrier for one script's v7 F1 tables. + * + *

The v6 design used a single global codepoint-bigram hash + Bloom + * filter shared across all scripts. We measured that this ceiling + * limits accuracy: enlarging one script's training data (e.g. HAN) hurts + * the other scripts' z-scores because they share the global hash. v7 + * gives each script its own pair of tables. + * + *

Per-script layout: + * + *

    + *
  • {@code codepointIndex} — sorted, ascending {@code int[]} of every + * codepoint that appears as either side of a kept bigram for this + * script. Codepoint → dense index is a binary search; index → + * codepoint is direct array access. Typical sizes: ~7K-15K for HAN, + * ~200-500 for most other scripts. + *
  • {@code bigramKeys} / {@code bigramValues} — parallel arrays + * implementing an open-addressed hash table with linear probing. + * Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code + * -1} means "empty slot." Indices are bounded at 16 bits (65535), + * which is comfortably above the largest per-script codepoint count + * we observe. + *
  • {@code unigramTable} — {@code byte[numCodepoints]}, quantized + * unigram log-probabilities indexed by the same codepoint→index map. + *
  • {@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} — + * per-quantization ranges; dequantize by + * {@code min + (b/255) * (max - min)}. + *
  • {@code unigramFallbackLogProb} — log-prob assigned when a + * codepoint is not in {@code codepointIndex} at all. Set to the + * script's most-pessimistic unigram value (its quantization min) so + * absent codepoints don't accidentally score above legitimately-rare + * ones. + *
  • {@code backoffAlpha} — multiplier on the unigram-backoff + * independence sum, copied from v6. + *
+ * + *

Membership semantics: no Bloom filter. The empty-slot sentinel is + * the membership oracle — a pair is "seen" iff binary-search finds both + * codepoints in the index AND a probe sequence hits a matching key before + * an empty slot. Lookups are therefore exact; there is no false-positive + * backoff path as there is in v6. + * + *

Fields are package-private so the + * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can + * construct instances directly without going through accessors. + */ +public final class V7Tables { + + /** Reserved value in {@link #bigramKeys} marking an unoccupied slot. */ + public static final int EMPTY_KEY = -1; + + final int[] codepointIndex; + final int[] bigramKeys; + final byte[] bigramValues; + final byte[] unigramTable; + final float bigramQuantMin; + final float bigramQuantMax; + final float unigramQuantMin; + final float unigramQuantMax; + final float unigramFallbackLogProb; + final float backoffAlpha; + + public V7Tables(int[] codepointIndex, + int[] bigramKeys, byte[] bigramValues, + byte[] unigramTable, + float bigramQuantMin, float bigramQuantMax, + float unigramQuantMin, float unigramQuantMax, + float unigramFallbackLogProb, + float backoffAlpha) { + if (bigramKeys.length != bigramValues.length) { + throw new IllegalArgumentException( + "bigramKeys and bigramValues must have equal length: " + + bigramKeys.length + " vs " + bigramValues.length); + } + if (unigramTable.length != codepointIndex.length) { + throw new IllegalArgumentException( + "unigramTable.length must equal codepointIndex.length: " + + unigramTable.length + " vs " + codepointIndex.length); + } + this.codepointIndex = codepointIndex; + this.bigramKeys = bigramKeys; + this.bigramValues = bigramValues; + this.unigramTable = unigramTable; + this.bigramQuantMin = bigramQuantMin; + this.bigramQuantMax = bigramQuantMax; + this.unigramQuantMin = unigramQuantMin; + this.unigramQuantMax = unigramQuantMax; + this.unigramFallbackLogProb = unigramFallbackLogProb; + this.backoffAlpha = backoffAlpha; + } + + /** + * Serialises this script's F1 tables. Read back via + * {@link #readFrom(DataInputStream)}. + */ + public void writeTo(DataOutputStream dos) throws IOException { + dos.writeFloat(backoffAlpha); + + // Codepoint index. 
+ dos.writeInt(codepointIndex.length); + ByteBuffer cpBuf = ByteBuffer.allocate(codepointIndex.length * 4) + .order(ByteOrder.BIG_ENDIAN); + cpBuf.asIntBuffer().put(codepointIndex); + dos.write(cpBuf.array()); + + // Bigram open-addressing table (keys + values). + dos.writeInt(bigramKeys.length); + dos.writeFloat(bigramQuantMin); + dos.writeFloat(bigramQuantMax); + ByteBuffer keyBuf = ByteBuffer.allocate(bigramKeys.length * 4) + .order(ByteOrder.BIG_ENDIAN); + keyBuf.asIntBuffer().put(bigramKeys); + dos.write(keyBuf.array()); + dos.write(bigramValues); + + // Unigram table. + dos.writeFloat(unigramQuantMin); + dos.writeFloat(unigramQuantMax); + dos.writeFloat(unigramFallbackLogProb); + dos.write(unigramTable); + } + + /** Inverse of {@link #writeTo(DataOutputStream)}. */ + public static V7Tables readFrom(DataInputStream dis) throws IOException { + float backoffAlpha = dis.readFloat(); + + int cpCount = dis.readInt(); + byte[] cpBytes = dis.readNBytes(cpCount * 4); + int[] codepoints = new int[cpCount]; + ByteBuffer.wrap(cpBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(codepoints); + + int slots = dis.readInt(); + float bMin = dis.readFloat(); + float bMax = dis.readFloat(); + byte[] keyBytes = dis.readNBytes(slots * 4); + int[] keys = new int[slots]; + ByteBuffer.wrap(keyBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(keys); + byte[] values = dis.readNBytes(slots); + + float uMin = dis.readFloat(); + float uMax = dis.readFloat(); + float uFallback = dis.readFloat(); + byte[] unigramTable = dis.readNBytes(cpCount); + + return new V7Tables(codepoints, keys, values, unigramTable, + bMin, bMax, uMin, uMax, uFallback, backoffAlpha); + } + + /** + * Returns a one-line summary for trainer progress output. 
+ */ + public String statsString() { + return String.format( + " cp_index=%d, bigram_slots=%d (load≈%.2f), " + + "bigram_range=[%.3f, %.3f], unigram_range=[%.3f, %.3f]", + codepointIndex.length, bigramKeys.length, + occupiedSlots() / (double) Math.max(1, bigramKeys.length), + bigramQuantMin, bigramQuantMax, + unigramQuantMin, unigramQuantMax); + } + + private int occupiedSlots() { + int n = 0; + for (int k : bigramKeys) { + if (k != EMPTY_KEY) n++; + } + return n; + } + + /** Number of codepoints in this script's index. Diagnostic. */ + public int codepointCount() { + return codepointIndex.length; + } + + /** Number of bigram-table slots (capacity). Diagnostic. */ + public int bigramSlots() { + return bigramKeys.length; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java new file mode 100644 index 00000000000..08b2aa4eb57 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * Diagnostic tool: bucket every bigram in {@code han.train.gz} (or any + * specified file) by the {@link Character.UnicodeBlock} of each codepoint, + * and report the distribution. + * + *

Goal: determine whether HAN's 224K distinct pairs split cleanly along + * block boundaries — e.g. CJK Unified Ideographs vs. Hiragana vs. Katakana — + * which would justify routing HAN windows to language-specific sub-models in + * the v7 design. + * + *

Usage: + *

+ *   java ... AnalyzeHanByBlock /path/to/junkdetect/han.train.gz
+ * 
+ */
+public final class AnalyzeHanByBlock {
+
+    private AnalyzeHanByBlock() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: AnalyzeHanByBlock <train-file.gz>");
+            System.exit(1);
+        }
+        Path file = Paths.get(args[0]);
+
+        // (blockA, blockB) -> [totalBigrams, distinctSet via HashMap]
+        // We use Maps of Maps to keep code simple; HAN is the only file
+        // big enough to matter and fits in heap.
+        Map<String, Map<Long, long[]>> byBlockPair = new HashMap<>();
+        Map<String, long[]> blockPairTotals = new HashMap<>();
+        long totalN = 0;
+
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(file)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                int prevCp = -1;
+                String prevBlock = null;
+                for (int i = 0; i < line.length(); ) {
+                    int cp = line.codePointAt(i);
+                    i += Character.charCount(cp);
+                    String block = blockShortName(cp);
+                    if (prevCp >= 0) {
+                        String key = prevBlock + "|" + block;
+                        Map<Long, long[]> set = byBlockPair.computeIfAbsent(
+                                key, k -> new HashMap<>(256));
+                        long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+                        long[] c = set.get(packed);
+                        if (c == null) {
+                            set.put(packed, new long[]{1L});
+                        } else {
+                            c[0]++;
+                        }
+                        blockPairTotals.computeIfAbsent(key, k -> new long[1])[0]++;
+                        totalN++;
+                    }
+                    prevCp = cp;
+                    prevBlock = block;
+                }
+            }
+        }
+
+        System.out.printf("File: %s%n", file);
+        System.out.printf("Total bigram occurrences: %,d%n%n", totalN);
+
+        // Sort block-pair keys by total occurrences (descending).
+        List<Map.Entry<String, long[]>> sorted = new ArrayList<>(blockPairTotals.entrySet());
+        sorted.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> -e.getValue()[0]));
+
+        System.out.printf("%-50s %14s %14s %12s %8s%n",
+                "block_pair", "occurrences", "distinct", "singletons", "%total");
+        System.out.println(repeat('-', 105));
+
+        long distinctTotal = 0;
+        long singletonsTotal = 0;
+        for (Map.Entry<String, long[]> e : sorted) {
+            String pair = e.getKey();
+            long n = e.getValue()[0];
+            Map<Long, long[]> set = byBlockPair.get(pair);
+            int distinct = set.size();
+            int singletons = 0;
+            for (long[] c : set.values()) {
+                if (c[0] == 1) singletons++;
+            }
+            distinctTotal += distinct;
+            singletonsTotal += singletons;
+            double pct = 100.0 * n / totalN;
+            if (pct < 0.1 && n < 1000) {
+                continue; // skip tail noise rows
+            }
+            System.out.printf("%-50s %,14d %,14d %,12d %7.2f%%%n",
+                    pair, n, distinct, singletons, pct);
+        }
+        System.out.println(repeat('-', 105));
+        System.out.printf("Total distinct pairs (incl. tail): %,d%n", distinctTotal);
+        System.out.printf("Total singletons (incl. tail): %,d%n", singletonsTotal);
+
+        // Roll up by individual block (left side only) to see per-block distinct counts.
+        System.out.println();
+        System.out.println("=== Per-leading-block roll-up ===");
+        Map<String, Long> distinctByLeadingBlock = new HashMap<>();
+        Map<String, Long> occByLeadingBlock = new HashMap<>();
+        for (Map.Entry<String, Map<Long, long[]>> e : byBlockPair.entrySet()) {
+            String leading = e.getKey().substring(0, e.getKey().indexOf('|'));
+            distinctByLeadingBlock.merge(leading, (long) e.getValue().size(), Long::sum);
+            long sum = 0;
+            for (long[] c : e.getValue().values()) sum += c[0];
+            occByLeadingBlock.merge(leading, sum, Long::sum);
+        }
+        List<Map.Entry<String, Long>> rollup = new ArrayList<>(occByLeadingBlock.entrySet());
+        rollup.sort(Comparator.comparingLong(
+                (Map.Entry<String, Long> e) -> -e.getValue()));
+        System.out.printf("%-35s %14s %14s%n",
+                "leading_block", "occurrences", "distinct(rough)");
+        System.out.println(repeat('-', 70));
+        for (Map.Entry<String, Long> e : rollup) {
+            System.out.printf("%-35s %,14d %,14d%n",
+                    e.getKey(), e.getValue(),
+                    distinctByLeadingBlock.get(e.getKey()));
+        }
+    }
+
+    /**
+     * Short-name for the Unicode block containing {@code cp}. Compresses the
+     * many CJK-related blocks into a handful of human-readable labels.
+     *
+     *

Splits ASCII into ASCII_DIGIT / ASCII_LETTER / ASCII_PUNCT so we can + * distinguish numerals (which are content-bearing across all scripts) from + * English-letter contamination and punctuation. + */ + private static String blockShortName(int cp) { + Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); + if (b == null) return "UNK"; + + String name = b.toString(); + if (name.equals("BASIC_LATIN")) { + if (cp >= '0' && cp <= '9') return "ASCII_DIGIT"; + if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) return "ASCII_LETTER"; + return "ASCII_PUNCT"; + } + // Compress noisy block names for the report. + if (name.startsWith("CJK_UNIFIED_IDEOGRAPHS_EXTENSION")) { + return "CJK_EXT_" + name.substring(name.lastIndexOf('_') + 1); + } + if (name.equals("CJK_UNIFIED_IDEOGRAPHS")) return "CJK_UNIFIED"; + if (name.equals("CJK_SYMBOLS_AND_PUNCTUATION")) return "CJK_PUNCT"; + if (name.equals("CJK_COMPATIBILITY_IDEOGRAPHS")) return "CJK_COMPAT"; + if (name.equals("CJK_COMPATIBILITY_FORMS")) return "CJK_COMPAT_FORMS"; + if (name.equals("HALFWIDTH_AND_FULLWIDTH_FORMS")) return "HALF_FULL"; + if (name.equals("HIRAGANA")) return "HIRAGANA"; + if (name.equals("KATAKANA")) return "KATAKANA"; + if (name.equals("KATAKANA_PHONETIC_EXTENSIONS")) return "KATAKANA_EXT"; + if (name.equals("HANGUL_SYLLABLES")) return "HANGUL"; + if (name.equals("HANGUL_JAMO")) return "HANGUL_JAMO"; + if (name.equals("HANGUL_COMPATIBILITY_JAMO")) return "HANGUL_JAMO_C"; + if (name.equals("LATIN_1_SUPPLEMENT")) return "LATIN1"; + return name; + } + + private static String repeat(char c, int n) { + char[] buf = new char[n]; + java.util.Arrays.fill(buf, c); + return new String(buf); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java new file mode 100644 index 00000000000..f64986b8dd8 --- /dev/null +++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +/** + * For each {@code *.train.gz} file, classify every adjacent codepoint pair + * by its relation to the target script S (= file's script). Categories: + * + *

    + *
  • IN_S_INTERIOR — both codepoints are in S or in COMMON/INHERITED + *
  • S_BOUNDARY — exactly one codepoint is in S-or-COMMON, the other + * is a non-S script + *
  • FOREIGN_INTERIOR — both codepoints are in some non-S script + * (possibly different scripts). Under the proposed generalized + * boundary rule, these are the bigrams to drop from S's training. + *
  • ASCII_LETTER_RUN — special subcategory of foreign interior where + * both cps are ASCII A–Z/a–z; this is the English-run case. + *
+ * + *

Reports occurrence counts, distinct-pair counts, and singleton counts
+ * for each category, plus the implied model-size impact of dropping
+ * FOREIGN_INTERIOR (or just ASCII_LETTER_RUN) under {@code min_count>=1}
+ * and {@code min_count>=3}.
+ */
+public final class BoundaryBigramAudit {
+
+    private BoundaryBigramAudit() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: BoundaryBigramAudit <data-dir>");
+            System.exit(1);
+        }
+        Path dataDir = Paths.get(args[0]);
+        Path[] files;
+        try (Stream<Path> s = Files.list(dataDir)) {
+            files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz"))
+                    .sorted().toArray(Path[]::new);
+        }
+
+        System.out.printf("%-22s %14s %14s %14s %14s %12s | %14s %14s%n",
+                "script", "in_S_occ", "boundary_occ", "foreign_occ",
+                "ascii_run_occ", "total_occ",
+                "drop_foreign_dist", "drop_asciirun_dist");
+        System.out.println(repeat('-', 165));
+
+        for (Path file : files) {
+            String fname = file.getFileName().toString();
+            String name = fname.substring(0, fname.length() - ".train.gz".length())
+                    .toUpperCase();
+            Character.UnicodeScript target;
+            try {
+                target = Character.UnicodeScript.valueOf(name);
+            } catch (IllegalArgumentException e) {
+                continue;
+            }
+
+            long inS = 0, boundary = 0, foreign = 0, asciiRun = 0;
+            HashMap<Long, long[]> distinctAll = new HashMap<>(1 << 16);
+            HashMap<Long, long[]> distinctKeptUnderForeignDrop = new HashMap<>(1 << 16);
+            HashMap<Long, long[]> distinctKeptUnderAsciiDrop = new HashMap<>(1 << 16);
+
+            try (BufferedReader r = new BufferedReader(
+                    new InputStreamReader(
+                            new GZIPInputStream(Files.newInputStream(file)),
+                            StandardCharsets.UTF_8))) {
+                String line;
+                while ((line = r.readLine()) != null) {
+                    int prevCp = -1;
+                    for (int i = 0; i < line.length(); ) {
+                        int cp = line.codePointAt(i);
+                        i += Character.charCount(cp);
+                        if (prevCp >= 0) {
+                            boolean aInS = inScriptOrCommon(prevCp, target);
+                            boolean bInS = inScriptOrCommon(cp, target);
+                            boolean aLetter = isLatinLetter(prevCp);
+
boolean bLetter = isLatinLetter(cp); + + long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL); + increment(distinctAll, packed); + + if (aInS && bInS) { + inS++; + increment(distinctKeptUnderForeignDrop, packed); + increment(distinctKeptUnderAsciiDrop, packed); + } else if (aInS != bInS) { + boundary++; + increment(distinctKeptUnderForeignDrop, packed); + increment(distinctKeptUnderAsciiDrop, packed); + } else { + // both foreign (neither in S nor COMMON) + foreign++; + if (aLetter && bLetter) { + asciiRun++; + } else { + // foreign interior but not pure ASCII letters: + // we'd keep this under the "ASCII-letter only" rule. + increment(distinctKeptUnderAsciiDrop, packed); + } + } + } + prevCp = cp; + } + } + } + + long total = inS + boundary + foreign; + int distAll = distinctAll.size(); + int distForeignDrop = distinctKeptUnderForeignDrop.size(); + int distAsciiDrop = distinctKeptUnderAsciiDrop.size(); + + System.out.printf("%-22s %,14d %,14d %,14d %,14d %,12d | %,14d %,14d%n", + name.toLowerCase(), inS, boundary, foreign, asciiRun, total, + distAll - distForeignDrop, distAll - distAsciiDrop); + } + } + + private static boolean inScriptOrCommon(int cp, Character.UnicodeScript target) { + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + return s == target + || s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED; + } + + private static boolean isLatinLetter(int cp) { + return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z') + || (cp >= 0xFF21 && cp <= 0xFF3A) // fullwidth A-Z + || (cp >= 0xFF41 && cp <= 0xFF5A); // fullwidth a-z + } + + private static void increment(HashMap map, long key) { + long[] c = map.get(key); + if (c == null) { + map.put(key, new long[]{1L}); + } else { + c[0]++; + } + } + + private static String repeat(char c, int n) { + char[] b = new char[n]; + java.util.Arrays.fill(b, c); + return new String(b); + } +} diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java index 27a5436d5e4..a80fafbd6b4 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java @@ -82,45 +82,18 @@ public class BuildJunkTrainingData { // ----------------------------------------------------------------------- - // Defaults + // Split ratios — fixed, part of the model identity (changing them would + // invalidate downstream eval comparisons). // ----------------------------------------------------------------------- - /** Lines read per language to determine dominant script. */ - private static final int DEFAULT_SCRIPT_SAMPLE_LINES = 2_000; - - /** - * UTF-8 bytes loaded per script group for entropy estimation. - * Budget is spread evenly across languages in the group. - * 200KB is enough to observe the bigram distribution reliably. - */ - private static final long ENTROPY_SAMPLE_BYTES = 200_000L; - - /** - * Total UTF-8 byte budget across all script groups. Divided proportionally - * by bigram entropy after the sampling phase. 50MB gives ~1–3MB per script - * on average across 34 groups; scale up for production runs. - */ - private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L; - - /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ - private static final int DEFAULT_MIN_BYTES = 50; - - /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */ - private static final double DEFAULT_MAX_PUNC_FRAC = 0.30; - /** Fraction of sentences written to each split (train / dev / test = 80/10/10). 
*/ private static final double TRAIN_FRAC = 0.80; private static final double DEV_FRAC = 0.10; // remaining (1 - TRAIN_FRAC - DEV_FRAC) goes to the test split - /** - * Minimum number of sentences that must land in the dev split for a script to be - * included in the model. Scripts below this floor have too few samples to reliably - * estimate calibration statistics (mu/sigma), which produces noisy z-scores and - * inflated false positive rates. With DEV_FRAC=0.10 the effective minimum total - * sentence count is minDevSentences / DEV_FRAC (default: 5,000 total sentences). - */ - private static final int DEFAULT_MIN_DEV_SENTENCES = 500; + // All other durable parameters live in JunkDetectorTrainingConfig. This + // tool deliberately does not accept CLI overrides for those values; see + // the rejection logic in main() below. // ----------------------------------------------------------------------- // Entry point @@ -129,13 +102,22 @@ public class BuildJunkTrainingData { public static void main(String[] args) throws IOException { Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "data"); Path outputDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect"); - int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES; - long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES; - int minBytes = DEFAULT_MIN_BYTES; - double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC; - int seed = 42; boolean dryRun = false; - int minDevSentences = DEFAULT_MIN_DEV_SENTENCES; + + // Bind config-controlled values into local variables. These are + // read-only from this point on; any attempt to override them via CLI + // is rejected below. 
+ long totalBudgetBytes = JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES; + long perLanguageCapBytes = JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES; + int minBytes = JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE; + double maxPuncFrac = JunkDetectorTrainingConfig.MAX_PUNC_FRAC; + double minTargetScriptFrac = JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC; + int minDevSentences = JunkDetectorTrainingConfig.MIN_DEV_SENTENCES; + int scriptSampleLines = JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES; + int seed = JunkDetectorTrainingConfig.SEED; + java.util.Set dropScripts = JunkDetectorTrainingConfig.DROP_SCRIPTS; + Map scriptBudgetOverrides = + JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES; for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -145,26 +127,25 @@ public static void main(String[] args) throws IOException { case "--output-dir": outputDir = Paths.get(args[++i]); break; - case "--script-sample-lines": - scriptSampleLines = Integer.parseInt(args[++i]); + case "--dry-run": + dryRun = true; break; + // Durable parameters are config-controlled. Refuse any CLI + // override so that a model file's identity always matches the + // committed config. + case "--script-sample-lines": case "--total-budget-bytes": - totalBudgetBytes = Long.parseLong(args[++i]); - break; + case "--per-language-cap-bytes": case "--min-bytes": - minBytes = Integer.parseInt(args[++i]); - break; case "--max-punc-frac": - maxPuncFrac = Double.parseDouble(args[++i]); - break; + case "--min-target-script-frac": case "--seed": - seed = Integer.parseInt(args[++i]); - break; case "--min-dev-sentences": - minDevSentences = Integer.parseInt(args[++i]); - break; - case "--dry-run": - dryRun = true; + case "--drop-scripts": + case "--script-budget-override": + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." 
+ + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); break; default: System.err.println("Unknown argument: " + args[i]); @@ -174,15 +155,26 @@ public static void main(String[] args) throws IOException { } System.out.println("=== BuildJunkTrainingData ==="); - System.out.println(" data-dir: " + dataDir); - System.out.println(" output-dir: " + outputDir); - System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", + System.out.println(" data-dir: " + dataDir); + System.out.println(" output-dir: " + outputDir); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", totalBudgetBytes, totalBudgetBytes / 1_000_000.0); - System.out.printf( " min-bytes: %d%n", minBytes); - System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); - System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", + System.out.printf( " per-language-cap: %,d (%.1f MB)%n", + perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0); + System.out.printf( " min-bytes: %d%n", minBytes); + System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); + System.out.printf( " min-target-script-frac: %.2f%n", minTargetScriptFrac); + System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", minDevSentences, (int)(minDevSentences / DEV_FRAC)); - System.out.println(" dry-run: " + dryRun); + System.out.printf( " seed: %d%n", seed); + if (!dropScripts.isEmpty()) { + System.out.println(" drop-scripts: " + dropScripts); + } + if (!scriptBudgetOverrides.isEmpty()) { + System.out.println(" script-budget-override: " + scriptBudgetOverrides); + } + System.out.println(" dry-run: " + dryRun); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); @@ -208,6 +200,15 @@ public static void main(String[] args) throws IOException { System.out.printf(" %-12s → %s%n", lang, script); } } + + if (!dropScripts.isEmpty()) { + for (String s : dropScripts) { + if 
(scriptGroups.remove(s) != null) { + System.out.printf(" DROP script: %s%n", s); + } + } + } + System.out.printf("%n → %d languages, %d script groups%n", langToScript.size(), scriptGroups.size()); @@ -222,7 +223,8 @@ public static void main(String[] args) throws IOException { String script = entry.getKey(); List langDirs = entry.getValue(); - long perLangSampleBytes = Math.max(ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); + long perLangSampleBytes = Math.max( + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); List sample = new ArrayList<>(); for (Path langDir : langDirs) { loadSentences(langDir, perLangSampleBytes, minBytes, maxPuncFrac, sample); @@ -246,9 +248,25 @@ public static void main(String[] args) throws IOException { Map scriptBudget = new TreeMap<>(); for (Map.Entry e : scriptEntropy.entrySet()) { long budget = (long) (totalBudgetBytes * e.getValue() / totalEntropy); + Long override = scriptBudgetOverrides.get(e.getKey()); + if (override != null) { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)" + + " [OVERRIDE: was %,d (%.1f MB)]%n", + e.getKey(), e.getValue(), override, override / 1_000_000.0, + budget, budget / 1_000_000.0); + budget = override; + } else { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n", + e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } scriptBudget.put(e.getKey(), budget); - System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n", - e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } + // Warn about overrides for scripts that aren't in the bucket set. 
+ for (String k : scriptBudgetOverrides.keySet()) { + if (!scriptBudget.containsKey(k)) { + System.err.printf("WARNING: --script-budget-override for %s ignored" + + " (script not in bucket set)%n", k); + } } if (dryRun) { @@ -273,8 +291,16 @@ public static void main(String[] args) throws IOException { String script = budgetEntry.getKey(); long budget = budgetEntry.getValue(); List langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(budget / langDirs.size(), 1L); + // Apply per-language cap on top of the even split, but only for + // multi-language buckets. For single-language scripts (e.g. KHMER, + // HANGUL), the cap would needlessly limit a bucket that has only + // one source; let it consume its full budget instead. + long capPerLang = langDirs.size() > 1 + ? Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -282,8 +308,10 @@ public static void main(String[] args) throws IOException { long remaining = budget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; if (langBytes > 0) { System.out.printf(" %-12s %-20s +%,d bytes%n", @@ -327,7 +355,11 @@ public static void main(String[] args) throws IOException { long newBudget = budget + extra; List langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(newBudget / langDirs.size(), 1L); + long capPerLang = langDirs.size() > 1 + ? 
Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -335,8 +367,10 @@ public static void main(String[] args) throws IOException { long remaining = newBudget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; } if (!sentences.isEmpty()) { @@ -415,6 +449,21 @@ public static void main(String[] args) throws IOException { System.out.println("Done."); } + /** + * Parses a script-bucket name (e.g. {@code "HAN"}) into a + * {@link Character.UnicodeScript}, or returns {@code null} if the name + * does not correspond to a real script (e.g. {@code "COMMON"} or any + * future synthetic bucket). Used by the corpus builder to look up the + * target script for the {@code min-target-script-frac} filter. + */ + static Character.UnicodeScript parseUnicodeScript(String name) { + try { + return Character.UnicodeScript.valueOf(name); + } catch (IllegalArgumentException e) { + return null; + } + } + // ----------------------------------------------------------------------- // Script detection // ----------------------------------------------------------------------- @@ -531,6 +580,22 @@ static double computeBigramEntropy(List sentences) { */ static long loadSentences(Path langDir, long maxBytes, int minBytes, double maxPuncFrac, List result) { + // Backwards-compatible overload: no target-script filter. + return loadSentences(langDir, maxBytes, minBytes, maxPuncFrac, + null, 0.0, result); + } + + /** + * Same as the 5-arg overload, but additionally drops sentences whose + * fraction of {@code targetScript} codepoints (relative to all non- + * COMMON/INHERITED codepoints) is below {@code minTargetScriptFrac}. 
+ * Passing {@code targetScript == null} disables the target-script filter. + */ + static long loadSentences(Path langDir, long maxBytes, int minBytes, + double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac, + List result) { long bytesLoaded = 0; for (String filename : new String[]{"sentences_wikipedia.txt", "sentences_madlad.txt"}) { if (bytesLoaded >= maxBytes) { @@ -553,7 +618,8 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes, if (text.isEmpty()) { continue; } - String filtered = filterSentence(text, minBytes, maxPuncFrac); + String filtered = filterSentence(text, minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac); if (filtered != null) { int sentBytes = filtered.getBytes(StandardCharsets.UTF_8).length; result.add(filtered); @@ -577,6 +643,18 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes, * @return the normalised sentence, or {@code null} if it should be discarded */ static String filterSentence(String text, int minBytes, double maxPuncFrac) { + return filterSentence(text, minBytes, maxPuncFrac, null, 0.0); + } + + /** + * Same as the 3-arg overload, but additionally rejects sentences whose + * fraction of {@code targetScript} codepoints (over non-COMMON/INHERITED + * codepoints) is below {@code minTargetScriptFrac}. If {@code + * targetScript == null} the target-script filter is skipped. 
+ */ + static String filterSentence(String text, int minBytes, double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac) { if (text.indexOf('\uFFFD') >= 0) { return null; } @@ -586,17 +664,34 @@ static String filterSentence(String text, int minBytes, double maxPuncFrac) { } int cpCount = 0; int puncCount = 0; + int scriptCpTotal = 0; + int scriptCpMatching = 0; for (int i = 0; i < text.length(); ) { int cp = text.codePointAt(i); cpCount++; if (cp >= 0x21 && cp <= 0x7E && !Character.isLetter(cp)) { puncCount++; } + if (targetScript != null) { + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + scriptCpTotal++; + if (s == targetScript) { + scriptCpMatching++; + } + } + } i += Character.charCount(cp); } if (cpCount > 0 && (double) puncCount / cpCount > maxPuncFrac) { return null; } + if (targetScript != null && scriptCpTotal > 0 + && (double) scriptCpMatching / scriptCpTotal < minTargetScriptFrac) { + return null; + } return text; } @@ -624,23 +719,15 @@ private static void writeGzipped(Path path, List lines) throws IOExcepti private static void printUsage() { System.err.println("Usage: BuildJunkTrainingData [options]"); - System.err.println(" --data-dir MADLAD data root" + System.err.println(" --data-dir MADLAD data root" + " (default: ~/datasets/madlad/data)"); - System.err.println(" --output-dir Output directory" + System.err.println(" --output-dir Output directory" + " (default: ~/datasets/madlad/junkdetect)"); - System.err.println(" --script-sample-lines N Lines per language for script" - + " detection (default: 2000)"); - System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all" - + " scripts (default: 50000000)"); - System.err.println(" --min-bytes N Min UTF-8 bytes per sentence" - + " (default: 50)"); - System.err.println(" --max-punc-frac F Max ASCII punct fraction" - + " 
(default: 0.30)"); - System.err.println(" --min-dev-sentences N Min sentences in dev split for a" - + " script to be included (default: 500). Scripts below this floor" - + " have unreliable calibration and inflated FPR."); - System.err.println(" --seed N Random seed (default: 42)"); - System.err.println(" --dry-run Detect scripts + show budget," - + " skip file writing"); + System.err.println(" --dry-run Detect scripts + show budget," + + " skip file writing."); + System.err.println(); + System.err.println("All other training/build parameters (budgets, filters, dropped" + + " scripts, seed, etc.) are fixed in JunkDetectorTrainingConfig and tracked" + + " in git. Edit that file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java new file mode 100644 index 00000000000..b287012ddc0 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +/** + * Diagnostic tool for sizing a per-script F1 bigram store (v7 design). + * + *

Walks every {@code *.train.gz} in {@code dataDir}, treating each file as + * one script's corpus. Counts (cpA, cpB) codepoint-pair frequencies and + * reports, per script: + * + *

    + *
  • total bigram occurrences (N) + *
  • distinct pair count (U) + *
  • singletons — pairs seen exactly once (these are usually the + * worst candidates to keep; they often reflect OCR noise / rare + * proper nouns and inflate U without helping discrimination) + *
  • "effective" pair count = pairs seen at least {@code MIN_COUNT} times + *
  • coverage curve: how many of the top-N most-frequent pairs are needed + * to cover {x = 50, 75, 90, 95, 99, 99.9}% of all bigram occurrences + *
  • estimated v7 model size for several candidate cutoffs, assuming + * 2.25 bytes/pair (MPHF + 8-bit fingerprint + 8-bit value) + * and 1.3 bytes/pair (MPHF + 8-bit value, no fingerprint) + *
+ * + *

Usage: + *

+ *   mvn -pl tika-ml/tika-ml-junkdetect exec:java \
+ *       -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.CountPerScriptBigrams \
+ *       -Dexec.args="/path/to/junkdetect"
+ * 
+ * + *

No model output; this is read-only telemetry to inform the v7 sizing + * decision (see {@code 20260514-junk-retrain-v6.md}). + */ +public final class CountPerScriptBigrams { + + private static final int[] COVERAGE_PCT = {50, 75, 90, 95, 99}; + private static final double[] COVERAGE_FRAC_HI = {0.999}; + + /** Cutoffs reported in the size-estimate table. */ + private static final int[] MIN_COUNT_CUTOFFS = {1, 2, 3, 5, 10}; + + /** Bytes per retained pair for each candidate storage scheme. */ + private static final double[] BYTES_PER_PAIR_SCHEMES = {1.3, 2.25, 6.25}; + private static final String[] SCHEME_NAMES = { + "MPHF+val(1.3B)", "MPHF+fp+val(2.25B)", "open-addr+key(6.25B)"}; + + private CountPerScriptBigrams() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println( + "Usage: CountPerScriptBigrams [topK-per-script]"); + System.exit(1); + } + Path dataDir = Paths.get(args[0]); + int topK = args.length >= 2 ? Integer.parseInt(args[1]) : 0; + + List trainFiles = new ArrayList<>(); + try (Stream s = Files.list(dataDir)) { + s.filter(p -> p.getFileName().toString().endsWith(".train.gz")) + .sorted() + .forEach(trainFiles::add); + } + if (trainFiles.isEmpty()) { + System.err.println("ERROR: no *.train.gz files in " + dataDir); + System.exit(1); + } + + System.out.printf("Found %d *.train.gz files in %s%n%n", + trainFiles.size(), dataDir); + System.out.printf( + "%-22s %12s %12s %12s %12s | %s%n", + "script", "total_N", "distinct_U", "singletons", + "U(>=10)", "coverage: pairs needed for [50,75,90,95,99,99.9]%"); + System.out.println(repeat('-', 140)); + + long grandTotalN = 0; + long grandTotalU = 0; + long grandTotalUge2 = 0; + long grandTotalUge10 = 0; + + // Per-script size accumulators for the global-size summary at the end. 
+ Map perScriptStats = new HashMap<>(); + + for (Path trainFile : trainFiles) { + String fname = trainFile.getFileName().toString(); + String script = fname.substring(0, fname.length() - ".train.gz".length()) + .toUpperCase(); + + HashMap pairCounts = new HashMap<>(1 << 16); + long totalN = 0; + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(trainFile)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + int prevCp = -1; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + if (prevCp >= 0) { + long key = packPair(prevCp, cp); + long[] c = pairCounts.get(key); + if (c == null) { + pairCounts.put(key, new long[]{1L}); + } else { + c[0]++; + } + totalN++; + } + prevCp = cp; + } + } + } + + int distinctU = pairCounts.size(); + + long[] counts = new long[distinctU]; + int idx = 0; + for (long[] c : pairCounts.values()) { + counts[idx++] = c[0]; + } + // Sort descending for coverage curve. + java.util.Arrays.sort(counts); + // Reverse in place. + for (int i = 0, j = counts.length - 1; i < j; i++, j--) { + long t = counts[i]; + counts[i] = counts[j]; + counts[j] = t; + } + + int singletons = 0; + int uGe2 = 0; + int uGe10 = 0; + for (long c : counts) { + if (c == 1) singletons++; + if (c >= 2) uGe2++; + if (c >= 10) uGe10++; + } + + // Coverage thresholds: minimum k such that sum(counts[0..k-1]) / N >= t. 
+ int[] coveragePairs = new int[COVERAGE_PCT.length + COVERAGE_FRAC_HI.length]; + double[] thresholds = new double[coveragePairs.length]; + for (int i = 0; i < COVERAGE_PCT.length; i++) { + thresholds[i] = COVERAGE_PCT[i] / 100.0; + } + for (int i = 0; i < COVERAGE_FRAC_HI.length; i++) { + thresholds[COVERAGE_PCT.length + i] = COVERAGE_FRAC_HI[i]; + } + long running = 0; + int tIdx = 0; + for (int k = 0; k < counts.length && tIdx < thresholds.length; k++) { + running += counts[k]; + while (tIdx < thresholds.length + && (double) running / totalN >= thresholds[tIdx]) { + coveragePairs[tIdx++] = k + 1; + } + } + // Fill any unreached thresholds with U (means: never reached, took all). + for (; tIdx < thresholds.length; tIdx++) { + coveragePairs[tIdx] = distinctU; + } + + StringBuilder cov = new StringBuilder(); + for (int i = 0; i < coveragePairs.length; i++) { + if (i > 0) cov.append(", "); + cov.append(String.format("%,d", coveragePairs[i])); + } + + System.out.printf("%-22s %,12d %,12d %,12d %,12d | %s%n", + script.toLowerCase(), + totalN, distinctU, singletons, uGe10, + cov.toString()); + + // Per-script size table. + if (topK > 0 || true) { + long[] sizeStats = new long[ + 2 + MIN_COUNT_CUTOFFS.length + BYTES_PER_PAIR_SCHEMES.length]; + sizeStats[0] = totalN; + sizeStats[1] = distinctU; + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + int minC = MIN_COUNT_CUTOFFS[i]; + int kept = 0; + for (long c : counts) { + if (c >= minC) kept++; + else break; + } + sizeStats[2 + i] = kept; + } + perScriptStats.put(script.toLowerCase(), sizeStats); + } + + // Per-script top-K dump if requested. 
+ if (topK > 0) { + System.out.printf(" top %d pairs in %s:%n", topK, script.toLowerCase()); + List> sorted = new ArrayList<>(pairCounts.entrySet()); + sorted.sort((a, b) -> Long.compare(b.getValue()[0], a.getValue()[0])); + for (int i = 0; i < Math.min(topK, sorted.size()); i++) { + Map.Entry e = sorted.get(i); + long k = e.getKey(); + int cpA = (int) (k >>> 24); + int cpB = (int) (k & 0xFFFFFFL); + System.out.printf(" U+%04X U+%04X (%c %c) %,d%n", + cpA, cpB, + safePrint(cpA), safePrint(cpB), + e.getValue()[0]); + } + } + + grandTotalN += totalN; + grandTotalU += distinctU; + grandTotalUge2 += uGe2; + grandTotalUge10 += uGe10; + } + + System.out.println(repeat('-', 140)); + System.out.printf("%-22s %,12d %,12d %12s %,12d%n%n", + "TOTAL", grandTotalN, grandTotalU, + "-", grandTotalUge10); + + // ------------------------------------------------------------------ + // Cutoff vs. model-size summary + // ------------------------------------------------------------------ + System.out.println("=== Model-size estimates by min-count cutoff and storage scheme ==="); + System.out.println("(sum of retained pairs across all scripts × bytes-per-pair)"); + System.out.println(); + System.out.printf("%-12s", "cutoff"); + for (String name : SCHEME_NAMES) { + System.out.printf(" %20s", name); + } + System.out.printf(" %20s%n", "retained_pairs"); + System.out.println(repeat('-', 12 + (SCHEME_NAMES.length + 1) * 21)); + + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + long retained = 0; + for (long[] stats : perScriptStats.values()) { + retained += stats[2 + i]; + } + System.out.printf("min_count>=%-2d", MIN_COUNT_CUTOFFS[i]); + for (double bpp : BYTES_PER_PAIR_SCHEMES) { + double bytes = retained * bpp; + System.out.printf(" %18s ", humanBytes(bytes)); + } + System.out.printf(" %,20d%n", retained); + } + + System.out.println(); + System.out.println("Per-script pair counts retained at each cutoff:"); + System.out.printf("%-22s", "script"); + for (int c : MIN_COUNT_CUTOFFS) { 
+ System.out.printf(" %12s", ">=" + c); + } + System.out.println(); + List> sortedScripts = + new ArrayList<>(perScriptStats.entrySet()); + sortedScripts.sort(Comparator.comparingLong( + (Map.Entry e) -> -e.getValue()[1])); + for (Map.Entry e : sortedScripts) { + System.out.printf("%-22s", e.getKey()); + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + System.out.printf(" %,12d", e.getValue()[2 + i]); + } + System.out.println(); + } + } + + /** Pack two codepoints (each up to 21 bits) into a single long. */ + private static long packPair(int cpA, int cpB) { + return ((long) cpA << 24) | (cpB & 0xFFFFFFL); + } + + private static char safePrint(int cp) { + if (cp < 0x20 || cp == 0x7F || !Character.isDefined(cp)) { + return '.'; + } + if (Character.charCount(cp) != 1) { + return '?'; + } + return (char) cp; + } + + private static String repeat(char c, int n) { + char[] buf = new char[n]; + java.util.Arrays.fill(buf, c); + return new String(buf); + } + + private static String humanBytes(double bytes) { + if (bytes < 1024) return String.format("%.0f B", bytes); + if (bytes < 1024 * 1024) return String.format("%.1f KB", bytes / 1024.0); + if (bytes < 1024L * 1024 * 1024) return String.format("%.2f MB", bytes / (1024.0 * 1024)); + return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024)); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java new file mode 100644 index 00000000000..36f3a897a01 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.ml.chardetect.HtmlByteStripper; +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Diagnostic: replicate JunkDetector.buildScriptRuns exactly on a fixture + * and print every run. Helps explain why score() returns UNKNOWN. + * + *

Usage: + *

+ *   ./mvnw exec:java -pl tika-ml/tika-ml-junkdetect \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.DebugScriptRuns \
+ *     -Dexec.args="--file ~/data/regression/.../AIT5... --charset GB18030 --bytes 1024"
+ * 
+ */ +public class DebugScriptRuns { + + // Mirror of JunkDetector.SCRIPT_MODEL_FALLBACK — keep in sync if production changes. + private static final Map SCRIPT_MODEL_FALLBACK = Map.of( + "HIRAGANA", "HAN", + "KATAKANA", "HAN"); + + public static void main(String[] args) throws IOException { + Path file = null; + String charset = "GB18030"; + int probeBytes = 1024; + boolean strip = true; + boolean expand = true; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--file": + file = Paths.get(expandHome(args[++i])); + break; + case "--charset": + charset = args[++i]; + break; + case "--bytes": + probeBytes = Integer.parseInt(args[++i]); + break; + case "--no-strip": + strip = false; + break; + case "--no-expand": + expand = false; + break; + default: + System.err.println("unknown: " + args[i]); + System.exit(1); + } + } + if (file == null) { + System.err.println("Required: --file "); + System.exit(1); + } + byte[] raw = Files.readAllBytes(file); + byte[] forDecode = raw; + if (strip) { + byte[] dst = new byte[raw.length]; + HtmlByteStripper.Result r = HtmlByteStripper.strip(raw, 0, raw.length, dst, 0); + if (r.tagCount > 0 && r.length > 0) { + forDecode = Arrays.copyOf(dst, r.length); + } + System.err.println("After strip: " + forDecode.length + " bytes (was " + raw.length + ")"); + } + if (forDecode.length > probeBytes) { + forDecode = Arrays.copyOf(forDecode, probeBytes); + } + System.err.println("Probe: " + forDecode.length + " bytes decoded as " + charset); + + String decoded = new String(forDecode, Charset.forName(charset)); + if (expand) { + decoded = expandEntities(decoded); + } + System.err.println("Decoded codepoints: " + decoded.codePointCount(0, decoded.length())); + + List runs = buildScriptRuns(decoded); + System.err.println("Built " + runs.size() + " script runs."); + + // Mirror JunkDetector.scoreText filter and report what would be scored. 
+ JunkDetector detector = JunkDetector.loadFromClasspath(); + java.util.Set modeled = detector.knownScripts(); + + TreeMap totals = new TreeMap<>(); // script -> {chars, bytes, runs, modeled?} + int totalScored = 0; + int totalSkippedShort = 0; + int totalSkippedUnmodeled = 0; + long totalBytesScored = 0; + + for (Run r : runs) { + byte[] runUtf8 = r.text.getBytes(StandardCharsets.UTF_8); + boolean isModeled = modeled.contains(r.script); + boolean longEnough = runUtf8.length >= 2; + totals.merge(r.script, new int[]{r.text.codePointCount(0, r.text.length()), + runUtf8.length, 1, isModeled ? 1 : 0}, + (a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3]}); + if (!isModeled) { + totalSkippedUnmodeled++; + } else if (!longEnough) { + totalSkippedShort++; + } else { + totalScored++; + totalBytesScored += runUtf8.length; + } + } + + System.out.println("Script roll-up (script: cps, utf8_bytes, runs, modeled):"); + for (Map.Entry e : totals.entrySet()) { + int[] v = e.getValue(); + System.out.printf(" %-15s cps=%-5d bytes=%-6d runs=%-4d modeled=%s%n", + e.getKey(), v[0], v[1], v[2], v[3] == 1 ? "Y" : "N"); + } + System.out.println(); + System.out.println("Scoring filter outcome:"); + System.out.println(" runs scored: " + totalScored); + System.out.println(" runs skipped (short): " + totalSkippedShort); + System.out.println(" runs skipped (unmod): " + totalSkippedUnmodeled); + System.out.println(" total bytes scored: " + totalBytesScored); + + // The bug: computeF1MeanLogP returns NaN when String.length() < 2. + // String.length() counts UTF-16 code units, but the outer filter uses + // UTF-8 bytes. A single CJK char = 1 UTF-16 unit but 3 UTF-8 bytes, + // so it passes the outer filter and produces NaN inside. 
+ int nanCausing = 0; + for (Run r : runs) { + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + if (u.length >= 2 && r.text.length() < 2 && modeled.contains(r.script)) { + nanCausing++; + } + } + System.out.println(); + System.out.println("NaN-causing runs (utf8≥2 but utf16<2, modeled): " + nanCausing); + + TextQualityScore score = detector.score(decoded); + System.out.println(" detector.score() z: " + + (score.isUnknown() ? "UNKNOWN(" + score.getDominantScript() + ")" + : String.format("%.3f (script=%s)", score.getZScore(), score.getDominantScript()))); + + // Print the longest 10 runs so we can see what's actually in there. + System.out.println(); + System.out.println("Longest 10 runs:"); + runs.sort((a, b) -> Integer.compare(b.text.length(), a.text.length())); + for (int i = 0; i < Math.min(10, runs.size()); i++) { + Run r = runs.get(i); + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + String preview = r.text.length() > 30 + ? r.text.substring(0, 30) + "…" : r.text; + preview = preview.replace("\n", "\\n").replace("\r", "\\r"); + System.out.printf(" %-15s cps=%-4d bytes=%-4d preview=%s%n", + r.script, r.text.codePointCount(0, r.text.length()), u.length, preview); + } + } + + // Exact mirror of JunkDetector.buildScriptRuns (private, copied here for diagnosis). 
+ private static List buildScriptRuns(String text) { + List runs = new ArrayList<>(); + String currentScript = null; + StringBuilder currentText = new StringBuilder(); + StringBuilder leadingCommon = new StringBuilder(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + if (currentScript != null) { + currentText.appendCodePoint(cp); + } else { + leadingCommon.appendCodePoint(cp); + } + continue; + } + String scriptName = SCRIPT_MODEL_FALLBACK.getOrDefault(s.name(), s.name()); + if (!scriptName.equals(currentScript)) { + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + currentScript = scriptName; + currentText = new StringBuilder(); + if (leadingCommon.length() > 0) { + currentText.append(leadingCommon); + leadingCommon.setLength(0); + } + } + currentText.appendCodePoint(cp); + } + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + return runs; + } + + private static final class Run { + final String script; + final String text; + Run(String s, String t) { + this.script = s; + this.text = t; + } + } + + private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private static String expandEntities(String in) { + String s = NUM_DEC.matcher(in).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // leave 
unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NUM_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // leave unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } + + private static String expandHome(String s) { + return s.startsWith("~/") ? System.getProperty("user.home") + s.substring(1) : s; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java index 6b6057fc34f..e0b4bc0ae10 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java @@ -470,7 +470,7 @@ private static void writeCompareEval(JunkDetector detector, sourceCodec, asSource, wrongCodec, asWrong); deltas.add(result.delta()); - if ("A".equals(result.winner())) nCorrect++; + if (sourceCodec.equals(result.winner())) nCorrect++; } if (deltas.isEmpty()) continue; diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java new file mode 100644 index 00000000000..30d175a4b12 --- /dev/null +++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java @@ -0,0 +1,688 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.DataInputStream; +import java.io.EOFException; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Eval harness: for each labeled charset in 
{@code ~/data/charsets/devtest/}, + * decode under its true charset (clean) and under a curated set of wrong + * charsets (mojibake), score with {@link JunkDetector}, report margin + * statistics per (labeled_charset × wrong_charset × source-byte-length). + * + *

Devtest file format: gzip → repeated {@code [u16 big-endian length, + * length bytes]} records, where the bytes are real text encoded in the + * labeled charset. Same format the charset trainer consumes. + * + *

Output (TSVs): + *

+ * <ul>
+ *   <li><b>detail.tsv</b>: one row per (labeled_cs, script, wrong_cs, length).
+ *       Columns: n, mean_clean_z, mean_mojibake_z, cohens_d, mean_margin,
+ *       p5_margin, p50_margin, fpr, tpr.</li>
+ *   <li><b>summary.tsv</b>: macro-averaged across wrong charsets, per
+ *       (script, length). The headline "is this script in trouble?" view.</li>
+ *   <li><b>script_pivot.tsv</b>: per-script rollup across all lengths +
+ *       wrong charsets. Single-number-per-script view for spot inversion.</li>
+ * </ul>
+ * + *

"Margin" is the per-record paired difference {@code clean_z - + * mojibake_z}. Mean margin and 5th-percentile margin are the + * margin-maximization metrics the v6 retrain is optimizing for. Cohen's d + * is the independent-distribution analog (kept for compatibility with the + * existing {@link EvalJunkDetector} schema). + * + *

Usage: + *

+ *   ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.EvalJunkOnCharsetDevtest \
+ *     -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v5-baseline"
+ * 
+ */ +public class EvalJunkOnCharsetDevtest { + + /** + * Curated set of wrong charsets to cross-decode every labeled charset + * against. Chosen to span the common real-world mojibake families: + * Western Latin (cp1252, ISO-8859-1, MacRoman), CJK over-claim (GB18030, + * Big5-HKSCS, Shift_JIS), Cyrillic (KOI8-R, cp1251), Arabic (cp1256), + * EBCDIC over-claim (IBM424), DOS Latin (IBM850), and UTF-8 (catches + * non-UTF8 bytes as replacement-character garbage). + */ + private static final List DEFAULT_WRONG_CHARSETS = List.of( + "windows-1252", "ISO-8859-1", "x-MacRoman", + "GB18030", "Big5-HKSCS", "Shift_JIS", + "KOI8-R", "windows-1251", + "windows-1256", "IBM424", + "IBM850", "UTF-8" + ); + + /** Source-byte length buckets to slice records into. */ + private static final int[] DEFAULT_LENGTHS = {20, 50, 100, 200, 500, 1000}; + + /** Cap on records loaded per labeled-charset file. */ + private static final int DEFAULT_MAX_RECORDS = 2000; + + /** Threshold for FPR/TPR reporting; matches EvalJunkDetector default. */ + private static final float DEFAULT_THRESHOLD = -2.0f; + + /** Minimum number of paired (clean, mojibake) samples per cell to emit a row. 
*/ + private static final int MIN_SAMPLES_PER_CELL = 30; + + public static void main(String[] args) throws IOException { + Path devtestDir = Paths.get(System.getProperty("user.home"), + "data", "charsets", "devtest"); + Path outputDir = Paths.get("/tmp/junkdetect-eval"); + Path modelPath = null; + int maxRecords = DEFAULT_MAX_RECORDS; + int[] lengths = DEFAULT_LENGTHS; + float threshold = DEFAULT_THRESHOLD; + List wrongCharsets = DEFAULT_WRONG_CHARSETS; + List labeledFilter = null; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--devtest-dir": + devtestDir = Paths.get(args[++i]); + break; + case "--output-dir": + outputDir = Paths.get(args[++i]); + break; + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--max-records": + maxRecords = Integer.parseInt(args[++i]); + break; + case "--threshold": + threshold = Float.parseFloat(args[++i]); + break; + case "--lengths": + lengths = Arrays.stream(args[++i].split(",")) + .mapToInt(Integer::parseInt).toArray(); + break; + case "--wrong-charsets": + wrongCharsets = Arrays.asList(args[++i].split(",")); + break; + case "--only": + labeledFilter = Arrays.asList(args[++i].split(",")); + break; + default: + System.err.println("Unknown arg: " + args[i]); + printUsage(); + System.exit(1); + } + } + + if (!Files.isDirectory(devtestDir)) { + System.err.println("ERROR: devtest-dir not found: " + devtestDir); + System.exit(1); + } + Files.createDirectories(outputDir); + + JunkDetector detector = modelPath != null + ? JunkDetector.loadFromPath(modelPath) + : JunkDetector.loadFromClasspath(); + + System.err.println("=== EvalJunkOnCharsetDevtest ==="); + System.err.println(" devtest-dir: " + devtestDir); + System.err.println(" output-dir: " + outputDir); + System.err.println(" model: " + (modelPath != null ? 
modelPath : "classpath default")); + System.err.println(" model version: " + detector.getModelVersion()); + System.err.println(" max-records: " + maxRecords); + System.err.println(" lengths: " + Arrays.toString(lengths)); + System.err.println(" threshold: " + threshold); + System.err.println(" wrong-cs: " + wrongCharsets); + + // Resolve wrong charsets (skip any the JVM doesn't have) + Map resolvedWrong = new LinkedHashMap<>(); + for (String name : wrongCharsets) { + Charset cs = tryGetCharset(name); + if (cs == null) { + System.err.println(" WARN: wrong-charset unavailable: " + name); + continue; + } + resolvedWrong.put(name, cs); + } + + List files; + try (Stream stream = Files.list(devtestDir)) { + files = stream + .filter(p -> p.getFileName().toString().endsWith(".bin.gz")) + .sorted() + .toList(); + } + if (files.isEmpty()) { + System.err.println("ERROR: no *.bin.gz files in " + devtestDir); + System.exit(1); + } + + Path detailPath = outputDir.resolve("detail.tsv"); + Path summaryPath = outputDir.resolve("summary.tsv"); + Path pivotPath = outputDir.resolve("script_pivot.tsv"); + + List allRows = new ArrayList<>(); + + try (PrintWriter detail = new PrintWriter( + Files.newBufferedWriter(detailPath, StandardCharsets.UTF_8))) { + + detail.println("labeled_cs\tscript\twrong_cs\tlength\tn" + + "\tmean_clean_z\tmean_mojibake_z\tcohens_d" + + "\tmean_margin\tp5_margin\tp50_margin" + + "\tfpr\ttpr"); + + for (Path file : files) { + String labeledName = filenameToCharsetName(file); + if (labeledFilter != null && !labeledFilter.contains(labeledName)) { + continue; + } + Charset labeled = tryGetCharset(labeledName); + if (labeled == null) { + System.err.println(" SKIP: labeled charset unavailable: " + labeledName); + continue; + } + + List records = readRecords(file, maxRecords); + if (records.size() < MIN_SAMPLES_PER_CELL) { + System.err.printf(" SKIP %s: only %d records%n", + labeledName, records.size()); + continue; + } + + System.err.printf("%n--- %s (%d records) 
---%n", + labeledName, records.size()); + + for (int len : lengths) { + List slices = sliceToLength(records, len); + if (slices.size() < MIN_SAMPLES_PER_CELL) { + continue; + } + + // Decode all slices under labeled (clean) once + List cleanTexts = decodeAll(slices, labeled); + List cleanZs = scoreAll(detector, cleanTexts); + if (cleanZs.size() < MIN_SAMPLES_PER_CELL) { + continue; + } + + // Detect script from a sample of the clean decoded text + String script = detectDominantScript( + cleanTexts.get(cleanTexts.size() / 2)); + + for (Map.Entry entry : resolvedWrong.entrySet()) { + String wrongName = entry.getKey(); + Charset wrongCs = entry.getValue(); + if (equalCharset(labeled, wrongCs)) { + continue; // can't be its own mojibake + } + + List mojiTexts = decodeAll(slices, wrongCs); + // Pair cleanTexts[i] with mojiTexts[i] by source record + Row row = scorePairs(detector, script, labeledName, + wrongName, len, cleanTexts, mojiTexts, + cleanZs, threshold); + if (row == null) { + continue; + } + allRows.add(row); + detail.println(row.toTsv()); + } + detail.flush(); + System.err.printf(" len=%4d n_clean=%d cells=%d%n", + len, cleanZs.size(), + allRows.stream() + .filter(r -> r.labeledCs.equals(labeledName) + && r.length == len) + .count()); + } + } + } + + writeSummary(summaryPath, allRows, lengths); + writeScriptPivot(pivotPath, allRows); + + System.err.println("\nWrote " + detailPath); + System.err.println("Wrote " + summaryPath); + System.err.println("Wrote " + pivotPath); + System.err.println("Done."); + } + + // ----------------------------------------------------------------------- + // Per-cell scoring (one labeled × wrong × length cell) + // ----------------------------------------------------------------------- + + private static Row scorePairs(JunkDetector detector, + String script, + String labeledName, String wrongName, + int length, + List cleanTexts, + List mojiTexts, + List cleanZsPre, + float threshold) { + // cleanZsPre is the already-scored clean 
text (avoid re-scoring per wrong cs). + // We re-score only the mojibake side here. + int n = Math.min(cleanTexts.size(), mojiTexts.size()); + List cleanZs = new ArrayList<>(n); + List mojiZs = new ArrayList<>(n); + List margins = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + float cz = cleanZsPre.get(i); + TextQualityScore ms = detector.score(mojiTexts.get(i)); + if (ms.isUnknown()) { + continue; + } + float mz = ms.getZScore(); + cleanZs.add(cz); + mojiZs.add(mz); + margins.add(cz - mz); + } + if (margins.size() < MIN_SAMPLES_PER_CELL) { + return null; + } + return new Row(labeledName, script, wrongName, length, + cleanZs, mojiZs, margins, threshold); + } + + // ----------------------------------------------------------------------- + // I/O: read the gzipped length-prefixed record format + // ----------------------------------------------------------------------- + + private static List readRecords(Path file, int maxRecords) throws IOException { + List records = new ArrayList<>(); + try (FileInputStream fis = new FileInputStream(file.toFile()); + GZIPInputStream gis = new GZIPInputStream(fis); + DataInputStream dis = new DataInputStream(gis)) { + while (records.size() < maxRecords) { + int len; + try { + len = dis.readUnsignedShort(); + } catch (EOFException eof) { + break; + } + byte[] rec = new byte[len]; + dis.readFully(rec); + records.add(rec); + } + } + return records; + } + + private static List sliceToLength(List records, int len) { + List slices = new ArrayList<>(); + for (byte[] r : records) { + if (r.length >= len) { + slices.add(Arrays.copyOf(r, len)); + } + } + return slices; + } + + private static List decodeAll(List slices, Charset cs) { + List texts = new ArrayList<>(slices.size()); + for (byte[] s : slices) { + texts.add(decode(s, cs)); + } + return texts; + } + + private static String decode(byte[] bytes, Charset cs) { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + 
.onUnmappableCharacter(CodingErrorAction.REPLACE); + try { + return dec.decode(ByteBuffer.wrap(bytes)).toString(); + } catch (CharacterCodingException e) { + return new String(bytes, cs); // fallback; shouldn't happen with REPLACE + } + } + + private static List scoreAll(JunkDetector detector, List texts) { + List zs = new ArrayList<>(texts.size()); + for (String t : texts) { + TextQualityScore s = detector.score(t); + if (!s.isUnknown()) { + zs.add(s.getZScore()); + } else { + zs.add(Float.NaN); + } + } + return zs; + } + + // ----------------------------------------------------------------------- + // Aggregation: summary.tsv (macro across wrong charsets, per script×length) + // ----------------------------------------------------------------------- + + private static void writeSummary(Path summaryPath, List rows, + int[] lengths) throws IOException { + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(summaryPath, StandardCharsets.UTF_8))) { + out.println("script\tlength\tn_cells" + + "\tmacro_cohens_d\tmacro_mean_margin\tmacro_p5_margin" + + "\tmacro_fpr\tmacro_tpr"); + + // Group by (script, length) + Map>> bucketed = new HashMap<>(); + for (Row r : rows) { + bucketed + .computeIfAbsent(r.script, k -> new HashMap<>()) + .computeIfAbsent(r.length, k -> new ArrayList<>()) + .add(r); + } + + List scripts = new ArrayList<>(bucketed.keySet()); + Collections.sort(scripts); + for (String script : scripts) { + for (int len : lengths) { + List cell = bucketed.get(script).get(len); + if (cell == null || cell.isEmpty()) { + continue; + } + double macroD = cell.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .mapToDouble(r -> r.cohensD) + .average().orElse(Double.NaN); + double macroMargin = cell.stream() + .mapToDouble(r -> r.meanMargin) + .average().orElse(Double.NaN); + double macroP5 = cell.stream() + .mapToDouble(r -> r.p5Margin) + .average().orElse(Double.NaN); + double macroFpr = cell.stream() + .mapToDouble(r -> r.fpr) + 
.average().orElse(Double.NaN); + double macroTpr = cell.stream() + .mapToDouble(r -> r.tpr) + .average().orElse(Double.NaN); + out.printf("%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f%n", + script, len, cell.size(), + macroD, macroMargin, macroP5, macroFpr, macroTpr); + } + } + } + } + + // ----------------------------------------------------------------------- + // Aggregation: script_pivot.tsv (single line per script — quick triage) + // ----------------------------------------------------------------------- + + private static void writeScriptPivot(Path path, List rows) throws IOException { + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(path, StandardCharsets.UTF_8))) { + out.println("script\tn_cells" + + "\tmean_d\tmean_margin\tmean_p5_margin" + + "\tmin_d_cell\tmin_margin_cell"); + + Map> byScript = new HashMap<>(); + for (Row r : rows) { + byScript.computeIfAbsent(r.script, k -> new ArrayList<>()).add(r); + } + List scripts = new ArrayList<>(byScript.keySet()); + Collections.sort(scripts); + for (String script : scripts) { + List cells = byScript.get(script); + double meanD = cells.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .mapToDouble(r -> r.cohensD) + .average().orElse(Double.NaN); + double meanMargin = cells.stream() + .mapToDouble(r -> r.meanMargin) + .average().orElse(Double.NaN); + double meanP5 = cells.stream() + .mapToDouble(r -> r.p5Margin) + .average().orElse(Double.NaN); + Row minDCell = cells.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .min((a, b) -> Double.compare(a.cohensD, b.cohensD)) + .orElse(null); + Row minMarginCell = cells.stream() + .min((a, b) -> Double.compare(a.meanMargin, b.meanMargin)) + .orElse(null); + out.printf("%s\t%d\t%.3f\t%.3f\t%.3f\t%s\t%s%n", + script, cells.size(), + meanD, meanMargin, meanP5, + minDCell != null ? cellLabel(minDCell) : "-", + minMarginCell != null ? 
cellLabel(minMarginCell) : "-"); + } + } + } + + private static String cellLabel(Row r) { + return String.format("[%s→%s@%d]", r.labeledCs, r.wrongCs, r.length); + } + + // ----------------------------------------------------------------------- + // Charset utilities + // ----------------------------------------------------------------------- + + private static String filenameToCharsetName(Path file) { + String name = file.getFileName().toString(); + if (name.endsWith(".bin.gz")) { + name = name.substring(0, name.length() - ".bin.gz".length()); + } + return name; + } + + private static Charset tryGetCharset(String name) { + try { + return Charset.forName(name); + } catch (UnsupportedCharsetException | IllegalCharsetNameException e) { + return null; + } + } + + private static boolean equalCharset(Charset a, Charset b) { + return a.name().equalsIgnoreCase(b.name()) + || a.aliases().contains(b.name()) + || b.aliases().contains(a.name()); + } + + // ----------------------------------------------------------------------- + // Script detection (parallels JunkDetector.detectDominantScript, which is + // package-private; small enough to inline) + // ----------------------------------------------------------------------- + + private static final Map SCRIPT_FALLBACK = Map.of( + "HIRAGANA", "HAN", + "KATAKANA", "HAN" + ); + + private static String detectDominantScript(String text) { + if (text == null || text.isEmpty()) { + return "LATIN"; + } + Map counts = new HashMap<>(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + counts.merge(s, 1, Integer::sum); + } + i += Character.charCount(cp); + } + if (counts.isEmpty()) { + return "LATIN"; + } + String name = counts.entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(e -> e.getKey().name()) + 
.orElse("LATIN"); + return SCRIPT_FALLBACK.getOrDefault(name, name); + } + + // ----------------------------------------------------------------------- + // Row + // ----------------------------------------------------------------------- + + private static final class Row { + final String labeledCs; + final String script; + final String wrongCs; + final int length; + final int n; + final double meanCleanZ; + final double meanMojiZ; + final double cohensD; + final double meanMargin; + final double p5Margin; + final double p50Margin; + final double fpr; + final double tpr; + + Row(String labeledCs, String script, String wrongCs, int length, + List cleanZs, List mojiZs, List margins, + float threshold) { + this.labeledCs = labeledCs; + this.script = script; + this.wrongCs = wrongCs; + this.length = length; + this.n = margins.size(); + this.meanCleanZ = mean(cleanZs); + this.meanMojiZ = mean(mojiZs); + this.cohensD = computeCohensD(cleanZs, mojiZs); + this.meanMargin = mean(margins); + this.p5Margin = percentile(margins, 0.05); + this.p50Margin = percentile(margins, 0.50); + this.fpr = fractionBelow(cleanZs, threshold); + this.tpr = fractionBelow(mojiZs, threshold); + } + + String toTsv() { + return String.format( + "%s\t%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f", + labeledCs, script, wrongCs, length, n, + meanCleanZ, meanMojiZ, cohensD, + meanMargin, p5Margin, p50Margin, + fpr, tpr); + } + } + + // ----------------------------------------------------------------------- + // Statistics + // ----------------------------------------------------------------------- + + private static double computeCohensD(List a, List b) { + if (a.size() < 2 || b.size() < 2) { + return Double.NaN; + } + double ma = mean(a); + double mb = mean(b); + double va = variance(a, ma); + double vb = variance(b, mb); + double pooled = Math.sqrt((va + vb) / 2.0); + if (pooled < 1e-9) { + return Double.NaN; + } + return (ma - mb) / pooled; + } + + private static double mean(List 
xs) { + double s = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + s += f; + n++; + } + } + return n == 0 ? Double.NaN : s / n; + } + + private static double variance(List xs, double m) { + if (xs.size() < 2) { + return 0; + } + double s = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + double d = f - m; + s += d * d; + n++; + } + } + return n < 2 ? 0 : s / (n - 1); + } + + private static double percentile(List xs, double p) { + List sorted = new ArrayList<>(xs); + sorted.removeIf(f -> Float.isNaN(f)); + if (sorted.isEmpty()) { + return Double.NaN; + } + Collections.sort(sorted); + int idx = (int) Math.floor(p * (sorted.size() - 1)); + return sorted.get(idx); + } + + private static double fractionBelow(List xs, float threshold) { + int below = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + if (f < threshold) { + below++; + } + n++; + } + } + return n == 0 ? Double.NaN : (double) below / n; + } + + // ----------------------------------------------------------------------- + + private static void printUsage() { + System.err.println("Usage:"); + System.err.println(" EvalJunkOnCharsetDevtest"); + System.err.println(" [--devtest-dir ] (default ~/data/charsets/devtest)"); + System.err.println(" [--output-dir ] (default /tmp/junkdetect-eval)"); + System.err.println(" [--model ] (default classpath junkdetect.bin)"); + System.err.println(" [--max-records N] (default 2000)"); + System.err.println(" [--threshold F] (default -2.0)"); + System.err.println(" [--lengths 20,50,...]"); + System.err.println(" [--wrong-charsets a,b,...]"); + System.err.println(" [--only labeledCs,...] 
(filter for spot runs)"); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java new file mode 100644 index 00000000000..aa3761ef79f --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** + * Frozen set of training-time choices that together define a junk-detector + * model's identity. Any change to these values produces a meaningfully + * different model and must be reviewed in git. + * + *

Two principles drove making this a class rather than CLI flags: + * + *

+ * <ol>
+ *   <li><b>Reproducibility.</b> When we look back at a model file six
+ *       months later we want a single commit hash that says exactly what
+ *       knobs produced it, not a half-remembered shell history.</li>
+ *   <li><b>Drift prevention.</b> CLI flags with defaults allow accidental
+ *       deviation between developers ("did you remember to pass
+ *       {@code --min-target-script-frac 0.05}?"). Constants in a tracked
+ *       file remove that failure mode.</li>
+ * </ol>
+ * + *

{@link BuildJunkTrainingData} and {@link TrainJunkModel} read the + * values here; both tools refuse to start if any CLI argument + * attempts to override a config-controlled parameter, surfacing the + * mistake at launch time rather than silently producing a non-canonical + * model. + * + *

The constants below reflect the choices that produced the current + * shipping model and are recorded in the corresponding training notes + * ({@code 20260514-junk-retrain-v6.md}). Update them by editing this + * file and committing the change together with the new model output. + * + *

The class has no instance state; all values are exposed as + * {@code public static final}. This keeps callsites short and avoids + * the temptation of passing a runtime-mutable config around. + * + *

This is not part of the public model-loading API. The {@link + * org.apache.tika.ml.junkdetect.JunkDetector} runtime is configuration- + * free; once a model file is built, all of its baked-in choices travel + * with the file's binary format. + */ +public final class JunkDetectorTrainingConfig { + + // ======================================================================= + // Corpus build (BuildJunkTrainingData) + // ======================================================================= + + /** + * Total UTF-8 byte budget across all script groups. Divided + * proportionally by per-script bigram entropy after the sampling phase. + */ + public static final long TOTAL_BUDGET_BYTES = 500_000_000L; + + /** + * Maximum UTF-8 bytes a single language may contribute to a + * multi-language script bucket. Prevents one large source (e.g. {@code + * zho} with 8 GB of MADLAD) from dominating a multi-language script + * model. Buckets with only one language ignore this cap and may consume + * their full budget. See {@link BuildJunkTrainingData} Phase 4. + */ + public static final long PER_LANGUAGE_CAP_BYTES = 5_000_000L; + + /** + * Sentence-level filter: minimum fraction of non-COMMON/INHERITED + * codepoints that must belong to the script bucket's target script for a + * sentence to be accepted. Set low so legitimate mixed-script content + * (Japanese kanji + kana, Korean with hanja annotations, Chinese with + * English citations, etc.) is preserved, but enough to reject lines that + * are essentially off-target (e.g. an English article about Gothic in + * the GOTHIC bucket). + */ + public static final double MIN_TARGET_SCRIPT_FRAC = 0.05; + + /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ + public static final int MIN_BYTES_PER_SENTENCE = 50; + + /** Maximum fraction of codepoints that may be ASCII punctuation/digits. 
*/ + public static final double MAX_PUNC_FRAC = 0.30; + + /** + * Minimum number of sentences that must land in the dev split for a + * script to be included in the model. Scripts below this floor have + * insufficient data to reliably estimate calibration statistics, which + * inflates FPR. With {@code DEV_FRAC = 0.10} this corresponds to a + * total-sentence floor of {@code 500 / 0.10 = 5000} per script. + */ + public static final int MIN_DEV_SENTENCES = 500; + + /** Lines read per language to determine the language's dominant script. */ + public static final int SCRIPT_SAMPLE_LINES = 2_000; + + /** + * UTF-8 bytes loaded per script group for bigram entropy estimation, + * driving the entropy-proportional budget allocation. 200 KB is + * sufficient to characterise the bigram distribution of any single + * script. + */ + public static final long ENTROPY_SAMPLE_BYTES = 200_000L; + + /** Random seed for sentence shuffling and other corpus-build randomness. */ + public static final int SEED = 42; + + /** + * Script bucket names whose source data is too thin or too off-target + * to produce reliable per-script F1 calibration. Excluded from the + * model entirely; the {@link + * org.apache.tika.ml.junkdetect.JunkDetector#score(String)} routing + * falls back to "unknown script" behavior for these scripts. + * + *

The current selection is based on a corpus audit that found these + * scripts either had thin native source data (e.g. THAANA: 216 train + * sentences from Maldivian), or had sources dominated by off-target + * content (e.g. GOTHIC: 40% of lines are {@literal <}5% Gothic — the + * Wikipedia "gothic" directory is English text about Gothic). + * + *

Three further scripts (CANADIAN_ABORIGINAL, CHEROKEE, TIFINAGH) + * are not listed here because the {@link #MIN_TARGET_SCRIPT_FRAC} + * filter implicitly removes them — their MADLAD sources contain + * effectively no native-script content at the 5% threshold. Listing + * them here is unnecessary and would obscure the data-quality finding. + */ + public static final Set DROP_SCRIPTS = + Collections.unmodifiableSet(new java.util.TreeSet<>(Set.of("GOTHIC", "THAANA"))); + + /** + * Per-script byte-budget overrides applied on top of the entropy- + * proportional allocation. Empty in the current configuration. + * + *

Under v6 the {@code HAN=60MB} experiment worsened every + * non-HAN script (the global F1 hash table was the bottleneck). Under + * v7's per-script tables, the same experiment correctly leaves other + * scripts untouched, but the HAN gain itself was negligible (Cohen's d + * moved 7.26 → 7.35) — the per-script HAN model is already near its + * data-saturation point with ~18 MB of training data. Override left + * empty until a more decisive HAN-coverage experiment is designed. + */ + public static final Map SCRIPT_BUDGET_OVERRIDES = + Collections.emptyMap(); + + // ======================================================================= + // Model train (TrainJunkModel) + // ======================================================================= + + /** + * Drop per-script F1 bigrams whose per-pair occurrence count (within + * that script's training data) is below this threshold. Set to 3 on + * evidence that singleton and doubleton pairs are overwhelmingly OCR + * artifacts and proper-noun noise that inflate the clean-side score + * distribution tail without contributing signal. + * + *

Set to 1 to disable the filter (every observed pair retained). + */ + public static final int MIN_BIGRAM_COUNT = 3; + + /** + * Target load factor for the per-script open-addressing F1 hash + * table. Table capacity is sized as the smallest power of two + * larger than {@code keptPairs / loadFactor}, giving an average of + * 1 / (1 - loadFactor) probes per lookup. 0.5 → ~2 probes; modestly + * wasteful in space but very cheap to probe. + */ + public static final double OA_LOAD_FACTOR = 0.5; + + /** + * Bit width of each codepoint's dense index within a script's F1 + * table. Each bigram is packed as {@code (idxA << KEY_INDEX_BITS) | + * idxB}, so each side must fit in this many bits. 16 bits supports + * up to 65535 distinct codepoints per script, which is comfortably + * above the largest per-script count we have measured (HAN is the + * worst case at ~15K kept codepoints). + */ + public static final int KEY_INDEX_BITS = 16; + + private JunkDetectorTrainingConfig() { + // No instances. + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java new file mode 100644 index 00000000000..bcda57c9f7c --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.zip.GZIPInputStream; + +/** + * For each {@code *.train.gz} file in a directory, compute per-line statistics + * of "target-script fraction" — i.e. the fraction of codepoints in each line + * that belong to the script the file is supposed to represent. + * + *

Reports a histogram across the buckets + * [0, 5, 10, 20, 30, 50, 70, 90, 100]% so we can pick a per-script keep + * threshold (e.g. "drop lines with <20% HAN codepoints"). Also reports + * what fraction of total bytes / lines would be dropped at each threshold. + * + *

Each {@code {script}.train.gz} maps to a {@link Character.UnicodeScript}; + * the file basename is uppercased. Special-case handling routes a few + * project-internal script names (e.g. HAN includes HALF_FULL ideographic + * forms) when desired. + * + *

Usage: + *

+ *   java LineScriptFractions <dataDir> [thresholds]
+ * 
+ */ +public final class LineScriptFractions { + + private static final int[] BUCKETS = {0, 5, 10, 20, 30, 50, 70, 90, 100}; + + private LineScriptFractions() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println("Usage: LineScriptFractions "); + System.exit(1); + } + Path dataDir = Paths.get(args[0]); + Path[] files; + try (var s = Files.list(dataDir)) { + files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz")) + .sorted().toArray(Path[]::new); + } + if (files.length == 0) { + System.err.println("No *.train.gz files in " + dataDir); + System.exit(1); + } + + System.out.printf("%-20s %10s %10s | %s%n", + "script", "lines", "<5%", + "lines at target-frac threshold (cumulative dropped %)"); + System.out.println(" " + + " <10% <20% <30% <50% <70% <90% <100%"); + System.out.println(repeat('-', 110)); + + for (Path file : files) { + String fname = file.getFileName().toString(); + String name = fname.substring(0, fname.length() - ".train.gz".length()) + .toUpperCase(); + Character.UnicodeScript target = mapScript(name); + if (target == null) { + System.out.printf("%-20s (no UnicodeScript mapping for '%s')%n", name, name); + continue; + } + + long lines = 0; + long[] bucketCounts = new long[BUCKETS.length]; + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(file)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + lines++; + int total = 0; + int matching = 0; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + // Don't count toward denominator: punctuation, + // spaces, diacritics are script-neutral. 
+ continue; + } + total++; + if (s == target) matching++; + } + double pct = total == 0 ? 0.0 : 100.0 * matching / total; + int b = 0; + while (b < BUCKETS.length - 1 && pct >= BUCKETS[b + 1]) b++; + bucketCounts[b]++; + } + } + + // Convert bucket counts to "cumulative fraction dropped at threshold = BUCKETS[i]". + StringBuilder sb = new StringBuilder(); + long cum = 0; + // bucketCounts[i] holds lines with pct in [BUCKETS[i], BUCKETS[i+1]). + // Drop-if-pctGoal: prove the codepoint-bigram-hash approach opens the + * UTF-8→GB18030 mojibake margin meaningfully above v5's ~1 z-unit + * baseline BEFORE committing to a multi-day production retrain. + * + *

Training corpus: decode {@code ~/data/charsets/devtest/GB18030.bin.gz} + * (Chinese) + first 80% of {@code UTF-8.bin.gz} (multi-language Wikipedia) + * under their labeled charsets, iterate codepoints, count bigrams and unigrams, + * hash into N buckets, build Bloom filter of seen pairs. Held-out: last 20% + * of UTF-8 records. + * + *

Eval: for each held-out UTF-8 record, slice to length buckets + * {20, 50, 100, 200, 500, 1000} source bytes. Decode each slice under + * UTF-8 (clean) and GB18030 (mojibake-as-HAN). Score both with the + * prototype model. Margin = clean_score - mojibake_score. Report + * mean and 5th-percentile margin per length. + * + *

Sweep: {bigramBuckets, alpha} grid. Pick the configuration that + * maximises margin. Compare to v5 baseline (mean margin ~1 z-unit + * across all lengths in the same cohort). + * + *

Outputs: + *

    + *
 *   <li>{@code prototype-sweep.tsv}: one row per
 *       (bigram_buckets, alpha, length). Columns: n, mean_clean,
 *       mean_moji, mean_margin, std_margin, p5_margin, p50_margin,
 *       margin_in_clean_stds (effective z-units).</li>
+ * + *

Usage: + *

+ *   ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.PrototypeCodepointHash \
+ *     -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v6-prototype"
+ * 
+ */ +public class PrototypeCodepointHash { + + // --- Hyperparameter sweep grid --- + private static final int[] BIGRAM_BUCKETS = {4096, 8192, 16384, 32768}; + private static final double[] ALPHAS = {1.0, 0.4}; + private static final int UNIGRAM_BUCKETS = 8192; + private static final int BLOOM_BITS = 4 * 1024 * 1024; // 512 KB + private static final int BLOOM_K = 7; + + // --- Smoothing --- + private static final double ADD_ALPHA = 0.01; + + // --- Eval --- + private static final int[] LENGTHS = {20, 50, 100, 200, 500, 1000}; + private static final int MAX_RECORDS_PER_FILE = 5000; + private static final double HOLDOUT_FRACTION = 0.20; + private static final int MIN_SCORE_CODEPOINTS = 3; + + public static void main(String[] args) throws IOException { + Path devtestDir = Paths.get(System.getProperty("user.home"), + "data", "charsets", "devtest"); + Path outputDir = Paths.get("/tmp/v6-prototype"); + int maxRecords = MAX_RECORDS_PER_FILE; + List fixturesDirs = new ArrayList<>(); + String wrongCharsetName = "GB18030"; + boolean singleModel = false; + List candidates = List.of( + "UTF-8", "GB18030", "windows-1252", "windows-1251", "windows-1257", + "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE"); + List forceCandidates = null; // when set, skip base detectors + String expected = "UTF-8"; + int[] probeSizes = null; // when set, sweep these probe sizes per fixture + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--devtest-dir": + devtestDir = Paths.get(args[++i]); + break; + case "--output-dir": + outputDir = Paths.get(args[++i]); + break; + case "--max-records": + maxRecords = Integer.parseInt(args[++i]); + break; + case "--fixtures-dir": + fixturesDirs.add(Paths.get(args[++i])); + break; + case "--wrong-charset": + wrongCharsetName = args[++i]; + break; + case "--single-model": + // Skip prototype training; run N-way fixture eval on bundled JunkDetector only. 
+ singleModel = true; + break; + case "--candidates": + candidates = Arrays.asList(args[++i].split(",")); + break; + case "--force-candidates": + // Bypass base detectors; pairwise tournament directly on these. + forceCandidates = Arrays.asList(args[++i].split(",")); + break; + case "--expected": + expected = args[++i]; + break; + case "--probe-sizes": + // Comma-separated probe sizes (bytes). Each fixture + // gets one row per size, so you can see how length + // affects UNKNOWN vs scored. + String[] sizes = args[++i].split(","); + probeSizes = new int[sizes.length]; + for (int k = 0; k < sizes.length; k++) { + probeSizes[k] = Integer.parseInt(sizes[k].trim()); + } + break; + default: + System.err.println("Unknown arg: " + args[i]); + System.exit(1); + } + } + Files.createDirectories(outputDir); + + // --single-model bypasses the v5/v6-prototype comparison apparatus. + // Requires --force-candidates to specify the charsets to compare; + // the base-detector-driven path was removed to keep tika-ml-junkdetect + // free of heavy encoding-detector deps. + if (singleModel) { + if (fixturesDirs.isEmpty()) { + System.err.println("--single-model requires --fixtures-dir"); + System.exit(1); + } + if (forceCandidates == null || forceCandidates.isEmpty()) { + System.err.println("--single-model requires --force-candidates " + + "(e.g. 
--force-candidates UTF-8,GB18030)"); + System.exit(1); + } + evalFixturesSingleModel(fixturesDirs, forceCandidates, expected, + probeSizes, outputDir); + return; + } + + System.err.println("=== PrototypeCodepointHash ==="); + System.err.println(" devtest-dir: " + devtestDir); + System.err.println(" output-dir: " + outputDir); + System.err.println(" max-records: " + maxRecords); + System.err.println(" bigram_buckets sweep: " + Arrays.toString(BIGRAM_BUCKETS)); + System.err.println(" alpha sweep: " + Arrays.toString(ALPHAS)); + System.err.println(" unigram_buckets: " + UNIGRAM_BUCKETS); + System.err.println(" bloom_bits: " + BLOOM_BITS + + " (" + (BLOOM_BITS / 8 / 1024) + " KB, k=" + BLOOM_K + ")"); + + // -------- Load corpus -------- + + Charset utf8 = StandardCharsets.UTF_8; + Charset gb18030 = Charset.forName("GB18030"); + + System.err.println("\n--- Loading corpus ---"); + List utf8Records = readRecords( + devtestDir.resolve("UTF-8.bin.gz"), maxRecords); + List gbRecords = readRecords( + devtestDir.resolve("GB18030.bin.gz"), maxRecords); + System.err.printf(" UTF-8.bin.gz: %d records%n", utf8Records.size()); + System.err.printf(" GB18030.bin.gz: %d records%n", gbRecords.size()); + + // Train/eval split on UTF-8 records. GB18030 records all go to training. 
+ int holdoutCount = (int) (utf8Records.size() * HOLDOUT_FRACTION); + int utf8TrainSize = utf8Records.size() - holdoutCount; + List utf8TrainBytes = utf8Records.subList(0, utf8TrainSize); + List utf8EvalBytes = utf8Records.subList(utf8TrainSize, utf8Records.size()); + System.err.printf(" UTF-8 train: %d eval: %d%n", + utf8TrainBytes.size(), utf8EvalBytes.size()); + + // Decode training corpus to codepoint streams + System.err.println("\n--- Decoding training corpus ---"); + List trainStreams = new ArrayList<>(); + long totalTrainCp = 0; + for (byte[] r : utf8TrainBytes) { + int[] cps = toCodepoints(decode(r, utf8)); + if (cps.length >= 2) trainStreams.add(cps); + totalTrainCp += cps.length; + } + for (byte[] r : gbRecords) { + int[] cps = toCodepoints(decode(r, gb18030)); + if (cps.length >= 2) trainStreams.add(cps); + totalTrainCp += cps.length; + } + System.err.printf(" total training codepoints: %,d across %d records%n", + totalTrainCp, trainStreams.size()); + + // Count unique pairs (for Bloom sizing sanity) + Set uniquePairs = new HashSet<>(); + for (int[] cps : trainStreams) { + for (int i = 0; i + 1 < cps.length; i++) { + uniquePairs.add(packPair(cps[i], cps[i + 1])); + if (uniquePairs.size() >= 2_000_000) break; + } + if (uniquePairs.size() >= 2_000_000) break; + } + System.err.printf(" unique codepoint-pairs in training: ~%,d%n", + uniquePairs.size()); + + // -------- Hyperparameter sweep -------- + + Path sweepPath = outputDir.resolve("prototype-sweep.tsv"); + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(sweepPath, StandardCharsets.UTF_8))) { + out.println("bigram_buckets\talpha\tlength\tn" + + "\tmean_clean\tstd_clean\tmean_moji" + + "\tmean_margin\tstd_margin\tp5_margin\tp50_margin" + + "\tmargin_in_clean_stds\tbloom_seen_frac_clean\tbloom_seen_frac_moji"); + + for (int buckets : BIGRAM_BUCKETS) { + for (double alpha : ALPHAS) { + System.err.printf("%n--- Config: bigram_buckets=%d alpha=%.1f ---%n", + buckets, alpha); + + Model m = 
train(trainStreams, buckets, UNIGRAM_BUCKETS, + BLOOM_BITS, BLOOM_K, ADD_ALPHA, alpha); + + // Calibrate on a sample of training streams (for the + // "margin_in_clean_stds" effective-z normalization) + double[] muSigma = calibrate(m, trainStreams); + System.err.printf(" train mu=%.3f sigma=%.3f%n", muSigma[0], muSigma[1]); + + // Eval on held-out UTF-8 records + for (int len : LENGTHS) { + EvalCell cell = evalAtLength(m, utf8EvalBytes, len, utf8, gb18030); + if (cell == null) continue; + double effZ = cell.meanMargin / Math.max(muSigma[1], 1e-6); + out.printf("%d\t%.2f\t%d\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.3f\t%.3f\t%.3f%n", + buckets, alpha, len, cell.n, + cell.meanClean, cell.stdClean, cell.meanMoji, + cell.meanMargin, cell.stdMargin, + cell.p5Margin, cell.p50Margin, + effZ, cell.bloomSeenFracClean, cell.bloomSeenFracMoji); + System.err.printf(" len=%4d n=%-5d mean_margin=%6.3f p5=%6.3f" + + " eff_z=%5.2f bloom_clean=%.2f bloom_moji=%.2f%n", + len, cell.n, cell.meanMargin, cell.p5Margin, effZ, + cell.bloomSeenFracClean, cell.bloomSeenFracMoji); + out.flush(); + } + } + } + } + System.err.println("\nWrote " + sweepPath); + + // -------- Fixture eval (AIT5-class HTML files) -------- + + if (!fixturesDirs.isEmpty()) { + evalFixtures(trainStreams, fixturesDirs, wrongCharsetName, outputDir); + } + + System.err.println("Done."); + } + + // ----------------------------------------------------------------------- + // Real-life fixture eval: runs the production base detectors (BOM + + // HtmlEncodingDetector + UniversalEncodingDetector) and asks the + // JunkDetector to pick among their candidates via pairwise compare. + // Mirrors the production charset-detection arbitration. 
+ // ----------------------------------------------------------------------- + + private static void evalFixturesSingleModel(List fixturesDirs, + List forceCandidates, + String expected, + int[] probeSizes, + Path outputDir) throws IOException { + System.err.println("\n--- Forced-candidates fixture eval ---"); + System.err.println(" candidates: " + forceCandidates); + JunkDetector detector = JunkDetector.loadFromClasspath(); + System.err.println(" model version: " + detector.getModelVersion()); + System.err.println(" expected: " + expected); + + List forced = new ArrayList<>(); + for (String n : forceCandidates) { + try { + forced.add(Charset.forName(n)); + } catch (Exception e) { + System.err.println(" skip unsupported charset: " + n); + } + } + + Path out = outputDir.resolve("fixtures-real-life.tsv"); + try (PrintWriter pw = new PrintWriter( + Files.newBufferedWriter(out, StandardCharsets.UTF_8))) { + pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs" + + "\tcandidates\twinner\tmargin\tstatus\tnotes"); + int pass = 0, fail = 0, skip = 0, agree = 0; + double passMarginSum = 0.0; + List failingLines = new ArrayList<>(); + + for (Path dir : fixturesDirs) { + if (!Files.isDirectory(dir)) { + System.err.println(" WARN: not a directory: " + dir); + continue; + } + try (Stream stream = Files.walk(dir)) { + List files = new ArrayList<>(); + stream.filter(Files::isRegularFile).forEach(files::add); + Collections.sort(files); + int[] sizes = probeSizes != null ? 
probeSizes : new int[]{16_384}; + for (Path f : files) { + for (int sz : sizes) { + FixtureResult r = + evalOneForced(f, expected, detector, forced, sz); + pw.println(r.toTsvLine()); + switch (r.status) { + case "PASS": + pass++; + passMarginSum += r.margin; + break; + case "FAIL": + fail++; + failingLines.add(r.dir + "/" + r.shortName + + "@" + sz + " -> " + r.winner + + " (expected " + r.expected + ")"); + break; + case "AGREE": + agree++; + break; + default: + skip++; + } + } + } + } + } + int n = pass + fail; + System.err.println(); + System.err.println("=== Summary ==="); + System.err.printf("Pass: %d / %d (%.1f%%) — JunkDetector picked the expected charset%n", + pass, n, n == 0 ? 0.0 : 100.0 * pass / n); + System.err.printf("Fail: %d%n", fail); + System.err.printf("Agree: %d (all detectors agreed; no arbitration needed)%n", agree); + System.err.printf("Skip: %d%n", skip); + if (pass > 0) { + System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass); + } + if (!failingLines.isEmpty()) { + System.err.println("Failing:"); + Collections.sort(failingLines); + for (String line : failingLines) { + System.err.println(" " + line); + } + } + } + System.err.println("Wrote " + out); + } + + private static FixtureResult evalOneForced(Path file, String expected, + JunkDetector detector, + List forced, + int probeBytes) throws IOException { + byte[] raw = Files.readAllBytes(file); + FixtureResult r = new FixtureResult(); + r.dir = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname; + r.bytes = raw.length; + r.probeSize = probeBytes; + r.expected = expected; + + if (isBinaryMagic(raw)) { + r.status = "SKIP_BIN"; + return r; + } + // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes + // from the stripped content. Otherwise a small probe slice can land + // entirely inside // boilerplate and leave + // nothing to score after strip. 
+ byte[] strippedFull = stripHtmlBytes(raw); + byte[] forDecode = strippedFull.length > probeBytes + ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull; + r.candidatesStr = forced.stream().map(Charset::name) + .reduce((a, b) -> a + "," + b).orElse("-"); + + // Always log every candidate in notes — even those JunkDetector + // rejects as unknown — so the failure mode is visible. An + // "unknown" score itself is meaningful information when the other + // candidate scored fine. + String winner = null; + String runner = null; + float winnerZ = Float.NEGATIVE_INFINITY; + float runnerZ = Float.NEGATIVE_INFINITY; + StringBuilder notes = new StringBuilder(); + int decoded_scored = 0; + for (Charset cs : forced) { + String decoded = applyEntityVariant(new String(forDecode, cs), "expanded"); + int cps = toCodepoints(decoded).length; + if (cps < 3) { + notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") "); + continue; + } + TextQualityScore s = detector.score(decoded); + if (s.isUnknown()) { + // Diagnose: is this script-not-in-model (neutral case) or + // all-runs-fragmented-too-short (a real mojibake signal)? + String why = diagnoseUnknown(decoded, detector); + notes.append(cs.name()).append("=UNK[").append(why).append("] "); + continue; + } + float z = s.getZScore(); + notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" "); + decoded_scored++; + if (z > winnerZ) { + runner = winner; + runnerZ = winnerZ; + winner = cs.name(); + winnerZ = z; + } else if (z > runnerZ) { + runner = cs.name(); + runnerZ = z; + } + } + if (winner == null) { + r.status = "NO_DECODE"; + r.notes = notes.toString().trim(); + return r; + } + r.winner = winner; + if (decoded_scored < 2) { + // Only one candidate scored; no real arbitration happened. + r.margin = Float.NaN; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) + ? 
"ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED"; + } else { + r.margin = winnerZ - runnerZ; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL"; + } + r.notes = notes.toString().trim(); + return r; + } + + /** + * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks + * the same script-run logic, then classifies the failure mode: + *
    + *
 *   <li>{@code EMPTY} — input had no characters.</li>
 *   <li>{@code NO_MODELED_SCRIPT} — all runs are in scripts the model
 *       doesn't know (a legitimate reason to be neutral).</li>
 *   <li>{@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts
 *       but every one is &lt;2 UTF-8 bytes. Strong mojibake signal —
 *       text is a salad of single codepoints from many scripts.</li>
 *   <li>{@code MIXED} — some runs were modeled-but-too-short and
 *       some were unmodeled.</li>
+ */ + private static String diagnoseUnknown(String text, JunkDetector detector) { + if (text == null || text.isEmpty()) { + return "EMPTY"; + } + Set modeled = detector.knownScripts(); + // Walk codepoints, splitting on script boundaries — same as + // JunkDetector.buildScriptRuns conceptually. Track per-script: + // longest UTF-8-byte run length, plus a separate "unmodeled" tally. + java.util.Map longestModeled = new java.util.HashMap<>(); + int unmodeledRuns = 0; + int modeledTooShortRuns = 0; + int currentBytes = 0; + String currentScript = null; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + int charCount = Character.charCount(cp); + String script = Character.UnicodeScript.of(cp).name(); + // COMMON / INHERITED / UNKNOWN attach to preceding run, but for + // diagnosis we don't need to be that precise — treat them as a + // continuation. + if ("COMMON".equals(script) || "INHERITED".equals(script) + || "UNKNOWN".equals(script)) { + if (currentScript != null) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + } else if (script.equals(currentScript)) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } else { + // close out previous run + tallyRun(currentScript, currentBytes, modeled, longestModeled); + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } + } + currentScript = script; + currentBytes = new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + i += charCount; + } + // close final run + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } else { + longestModeled.merge(currentScript, currentBytes, Math::max); + } + } + boolean anyModeledLong = !longestModeled.isEmpty(); + if (anyModeledLong) { + // Some 
modeled run is ≥2 bytes — shouldn't have hit UNKNOWN. + // (Possible discrepancy with the production logic; reported as MIXED.) + return "MIXED(modeled_long=" + longestModeled.size() + ")"; + } + if (modeledTooShortRuns > 0 && unmodeledRuns > 0) { + return "MIXED(short=" + modeledTooShortRuns + + ",unmodeled=" + unmodeledRuns + ")"; + } + if (modeledTooShortRuns > 0) { + return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")"; + } + if (unmodeledRuns > 0) { + return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")"; + } + return "OTHER"; + } + + private static void tallyRun(String script, int bytes, Set modeled, + java.util.Map longestModeled) { + if (script == null) { + return; + } + if (modeled.contains(script) && bytes >= 2) { + longestModeled.merge(script, bytes, Math::max); + } + } + + /** + * Run HtmlByteStripper over the entire input; return the stripped + * content bytes (or the input verbatim if no tags found). + */ + private static byte[] stripHtmlBytes(byte[] raw) { + byte[] dst = new byte[raw.length]; + HtmlByteStripper.Result r = + HtmlByteStripper.strip(raw, 0, raw.length, dst, 0); + if (r.tagCount > 0 && r.length > 0) { + return Arrays.copyOf(dst, r.length); + } + return raw; + } + + private static boolean isBinaryMagic(byte[] b) { + if (b.length < 4) { + return false; + } + if (b[0] == 0x50 && b[1] == 0x4B + && (b[2] == 0x03 || b[2] == 0x05 || b[2] == 0x07)) { + return true; // ZIP / JAR / APK / docx + } + if ((b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) { + return true; // gzip + } + if (b[0] == '%' && b[1] == 'P' && b[2] == 'D' && b[3] == 'F') { + return true; // PDF + } + if ((b[0] & 0xFF) == 0xD0 && (b[1] & 0xFF) == 0xCF) { + return true; // OLE2 + } + return false; + } + + private static String safeCanonical(String charset) { + if (charset == null) { + return ""; + } + try { + return Charset.forName(charset).name(); + } catch (Exception e) { + return charset.toUpperCase(); + } + } + + private static final class FixtureResult { + String dir; + 
String shortName; + int bytes; + int probeSize; + String expected; + String bomCs; + String htmlCs; + String universalCs; + String candidatesStr = "-"; + String winner = "-"; + float margin = Float.NaN; + String status = ""; + String notes = ""; + + String toTsvLine() { + return String.format("%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + dir, shortName, bytes, probeSize, expected, + str(bomCs), str(htmlCs), str(universalCs), + candidatesStr, str(winner), + Float.isNaN(margin) ? "-" : String.format("%.3f", margin), + status, notes.isEmpty() ? "-" : notes); + } + + private static String str(String s) { + return s == null ? "-" : s; + } + } + + // ----------------------------------------------------------------------- + // Fixture eval: score real-world AIT5-class HTML files under v5 and v6 + // prototype, with byte-level HTML stripping and entity-variant comparison. + // ----------------------------------------------------------------------- + + private static void evalFixtures(List trainStreams, + List fixturesDirs, + String wrongCharsetName, + Path outputDir) throws IOException { + System.err.println("\n--- Fixture eval (best config: 4096 buckets, alpha=1.0) ---"); + Model v6 = train(trainStreams, 4096, UNIGRAM_BUCKETS, + BLOOM_BITS, BLOOM_K, ADD_ALPHA, 1.0); + double[] muSigma = calibrate(v6, trainStreams); + float mu = (float) muSigma[0]; + float sigma = (float) Math.max(muSigma[1], 1e-6); + System.err.printf(" v6 train mu=%.3f sigma=%.3f%n", mu, sigma); + + JunkDetector v5 = JunkDetector.loadFromClasspath(); + Charset cleanCs = StandardCharsets.UTF_8; + Charset wrongCs = Charset.forName(wrongCharsetName); + System.err.println(" v5 model version: " + v5.getModelVersion()); + System.err.println(" clean charset: " + cleanCs.name()); + System.err.println(" mojibake charset: " + wrongCs.name()); + + Path fixturesPath = outputDir.resolve("fixtures.tsv"); + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(fixturesPath, 
StandardCharsets.UTF_8))) { + out.println("cluster\tfile\tentity_variant\tn_clean_cp\tn_moji_cp" + + "\tv5_clean_z\tv5_moji_z\tv5_margin" + + "\tv6_F1_clean\tv6_F1_moji\tv6_F1_margin" + + "\tv6_combo_clean\tv6_combo_moji\tv6_combo_margin" + + "\tdominant_script" + + "\tv5_winner\tv6_F1_winner\tv6_combo_winner"); + + for (Path dir : fixturesDirs) { + if (!Files.isDirectory(dir)) { + System.err.println(" WARN: not a directory: " + dir); + continue; + } + try (java.util.stream.Stream files = Files.walk(dir)) { + List sorted = new ArrayList<>(); + files.filter(Files::isRegularFile).forEach(sorted::add); + Collections.sort(sorted); + for (Path f : sorted) { + evalOneFixture(f, v6, mu, sigma, v5, cleanCs, wrongCs, out); + } + } + } + } + System.err.println("Wrote " + fixturesPath); + } + + private static void evalOneFixture(Path file, Model v6, float v6Mu, float v6Sigma, + JunkDetector v5, + Charset cleanCs, Charset wrongCs, + PrintWriter out) throws IOException { + byte[] rawBytes = Files.readAllBytes(file); + if (rawBytes.length > 16384) { + rawBytes = Arrays.copyOf(rawBytes, 16384); + } + // Byte-level HTML strip (matches JunkFilterEncodingDetector production pipeline) + byte[] stripDst = new byte[rawBytes.length]; + HtmlByteStripper.Result strip = + HtmlByteStripper.strip(rawBytes, 0, rawBytes.length, stripDst, 0); + byte[] forDecode = rawBytes; + if (strip.tagCount > 0 && strip.length > 0) { + forDecode = new byte[strip.length]; + System.arraycopy(stripDst, 0, forDecode, 0, strip.length); + } + + String cluster = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + // shorten long content-hash names for readability in output + String shortName = fname.length() > 12 ? 
fname.substring(0, 12) : fname; + + String cleanRaw = decode(forDecode, cleanCs); + String mojiRaw = decode(forDecode, wrongCs); + + for (String variant : List.of("raw", "expanded", "removed")) { + String clean = applyEntityVariant(cleanRaw, variant); + String moji = applyEntityVariant(mojiRaw, variant); + int[] cleanCps = toCodepoints(clean); + int[] mojiCps = toCodepoints(moji); + if (cleanCps.length < 3 || mojiCps.length < 3) continue; + + // --- v5 full pipeline (existing) --- + TextQualityScore v5cs = v5.score(clean); + TextQualityScore v5ms = v5.score(moji); + float v5cleanZ = v5cs.isUnknown() ? Float.NaN : v5cs.getZScore(); + float v5mojiZ = v5ms.isUnknown() ? Float.NaN : v5ms.getZScore(); + float v5Margin = v5cleanZ - v5mojiZ; + + // --- v6 Feature 1 alone (codepoint-bigram-hash + Bloom + unigram backoff) --- + ScoreResult v6c = score(v6, cleanCps); + ScoreResult v6m = score(v6, mojiCps); + double v6Margin = v6c.meanLogP - v6m.meanLogP; + + // --- v6 combined: substitute v6's F1 z-score into v5's classifier --- + JunkDetector.FeatureComponents cleanFc = v5.scoreWithFeatureComponents(clean); + JunkDetector.FeatureComponents mojiFc = v5.scoreWithFeatureComponents(moji); + float v6F1zClean = (float) (v6c.meanLogP - v6Mu) / v6Sigma; + float v6F1zMoji = (float) (v6m.meanLogP - v6Mu) / v6Sigma; + float comboClean = recombineLogit(v6F1zClean, cleanFc); + float comboMoji = recombineLogit(v6F1zMoji, mojiFc); + float comboMargin = comboClean - comboMoji; + String dominantScript = cleanFc != null ? cleanFc.dominantScript : "?"; + + String v5Winner = Float.isNaN(v5Margin) ? "?" : (v5Margin > 0 ? "CLEAN" : "MOJI"); + String v6F1Winner = Double.isNaN(v6Margin) ? "?" : (v6Margin > 0 ? "CLEAN" : "MOJI"); + String v6cWinner = Float.isNaN(comboMargin) ? "?" : (comboMargin > 0 ? 
"CLEAN" : "MOJI"); + + out.printf("%s\t%s\t%s\t%d\t%d" + + "\t%.3f\t%.3f\t%.3f" + + "\t%.4f\t%.4f\t%.4f" + + "\t%.3f\t%.3f\t%.3f" + + "\t%s\t%s\t%s\t%s%n", + cluster, shortName, variant, + cleanCps.length, mojiCps.length, + v5cleanZ, v5mojiZ, v5Margin, + v6c.meanLogP, v6m.meanLogP, v6Margin, + comboClean, comboMoji, comboMargin, + dominantScript, + v5Winner, v6F1Winner, v6cWinner); + out.flush(); + System.err.printf(" [%s/%s %-8s] v5: Δ%+6.2f %s v6F1: Δ%+6.3f %s v6combo: Δ%+6.2f %s script=%s%n", + cluster, shortName, variant, + v5Margin, v5Winner, + v6Margin, v6F1Winner, + comboMargin, v6cWinner, + dominantScript); + } + } + + /** + * Recomputes v5's per-script classifier logit with v6's F1 z-score + * substituted for v5's z1. Approximation: keeps v5's classifier weights + * (w1..w4, bias) which were trained on the OLD F1 distribution. A true + * v6 retrain would re-fit w1 on the new F1 distribution; this version + * gives a directional estimate of "what if we just swap F1?" + */ + private static float recombineLogit(float v6F1z, JunkDetector.FeatureComponents fc) { + if (fc == null || fc.classifierWeights == null) { + return Float.NaN; + } + float[] cw = fc.classifierWeights; + int nFeat = cw.length - 1; + float logit = cw[nFeat]; // bias + if (nFeat >= 1) logit += cw[0] * v6F1z; + if (nFeat >= 2) logit += cw[1] * fc.z2; + if (nFeat >= 3) logit += cw[2] * fc.z3; + if (nFeat >= 4) logit += cw[3] * fc.z4; + return logit; + } + + // ----------------------------------------------------------------------- + // HTML entity expansion / removal (regex-based, sufficient for fixtures) + // ----------------------------------------------------------------------- + + private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private static String applyEntityVariant(String s, 
String variant) {
        // Dispatch on the three entity-handling variants used by the
        // fixture eval: leave entities as-is, expand them, or delete them.
        switch (variant) {
            case "raw": return s;
            case "expanded": return expandEntities(s);
            case "removed": return removeEntities(s);
            default: throw new IllegalArgumentException(variant);
        }
    }

    /**
     * Expands the numeric (decimal and hex) character references and the
     * small named-entity subset matched by {@link #NUM_DEC}, {@link #NUM_HEX}
     * and {@link #NAMED} into their literal characters. References that are
     * malformed or outside the Unicode code-point range are left unchanged.
     *
     * @param in text possibly containing HTML entities
     * @return text with the recognized entities replaced
     */
    private static String expandEntities(String in) {
        String s = in;
        s = NUM_DEC.matcher(s).replaceAll(mr -> {
            try {
                int cp = Integer.parseInt(mr.group(1));
                if (cp >= 0 && cp <= 0x10FFFF) {
                    return Matcher.quoteReplacement(new String(Character.toChars(cp)));
                }
            } catch (NumberFormatException ignored) {
                // fall through, leave unchanged
            }
            return Matcher.quoteReplacement(mr.group());
        });
        s = NUM_HEX.matcher(s).replaceAll(mr -> {
            try {
                int cp = Integer.parseInt(mr.group(1), 16);
                if (cp >= 0 && cp <= 0x10FFFF) {
                    return Matcher.quoteReplacement(new String(Character.toChars(cp)));
                }
            } catch (NumberFormatException ignored) {
                // fall through, leave unchanged
            }
            return Matcher.quoteReplacement(mr.group());
        });
        s = NAMED.matcher(s).replaceAll(mr -> {
            switch (mr.group(1)) {
                case "amp": return "&";
                case "lt": return "<";
                case "gt": return ">";
                case "quot": return "\"";
                case "apos": return "'";
                case "nbsp": return " ";
                case "copy": return "©";
                case "reg": return "®";
                default: return Matcher.quoteReplacement(mr.group());
            }
        });
        return s;
    }

    /**
     * Deletes (rather than expands) every entity matched by the same three
     * patterns used in {@link #expandEntities(String)}.
     */
    private static String removeEntities(String s) {
        s = NUM_DEC.matcher(s).replaceAll("");
        s = NUM_HEX.matcher(s).replaceAll("");
        s = NAMED.matcher(s).replaceAll("");
        return s;
    }

    // -----------------------------------------------------------------------
    // Training
    // -----------------------------------------------------------------------

    /**
     * Trains the prototype model: hashed codepoint-bigram and codepoint-unigram
     * log-probability tables (add-alpha smoothed) plus a Bloom filter of the
     * codepoint pairs actually seen in training.
     *
     * @param streams        decoded training records as codepoint arrays
     * @param bigramBuckets  number of bigram hash buckets; must be a power of
     *                       two so bucket selection can use a bit mask
     * @param unigramBuckets number of unigram hash buckets; power of two
     * @param bloomBits      size of the Bloom filter in bits
     * @param bloomK         number of Bloom hash functions
     * @param addAlpha       add-alpha smoothing constant for both tables
     * @param backoffAlpha   weight applied to the unigram backoff at scoring time
     * @return the trained {@link Model}
     * @throws IllegalArgumentException if a bucket count is not a power of two
     */
    private static Model train(List<int[]> streams,
                               int bigramBuckets, int unigramBuckets,
                               int bloomBits, int bloomK,
                               double addAlpha, double backoffAlpha) {
        if (Integer.bitCount(bigramBuckets) != 1 || Integer.bitCount(unigramBuckets) != 1) {
            throw new IllegalArgumentException("Bucket counts must be powers of 2");
        }
        long[] bigramCounts = new long[bigramBuckets];
        long[] unigramCounts = new long[unigramBuckets];
        long bigramTotal = 0;
        long unigramTotal = 0;
        // One long word holds 64 filter bits; round up.
        long[] bloomBitArr = new long[(bloomBits + 63) / 64];

        for (int[] cps : streams) {
            for (int i = 0; i < cps.length; i++) {
                int cp = cps[i];
                // Power-of-two bucket counts let us mask instead of mod.
                int uBucket = (int) (fnv1aUnigram(cp) & (unigramBuckets - 1));
                unigramCounts[uBucket]++;
                unigramTotal++;
                if (i + 1 < cps.length) {
                    int cpNext = cps[i + 1];
                    int bBucket = (int) (fnv1aBigram(cp, cpNext) & (bigramBuckets - 1));
                    bigramCounts[bBucket]++;
                    bigramTotal++;
                    bloomAdd(bloomBitArr, bloomBits, bloomK, cp, cpNext);
                }
            }
        }

        // Convert to log-probabilities with add-alpha smoothing
        float[] bigramLogP = new float[bigramBuckets];
        double bigramDenom = bigramTotal + addAlpha * bigramBuckets;
        for (int i = 0; i < bigramBuckets; i++) {
            double p = (bigramCounts[i] + addAlpha) / bigramDenom;
            bigramLogP[i] = (float) Math.log(p);
        }
        float[] unigramLogP = new float[unigramBuckets];
        double unigramDenom = unigramTotal + addAlpha * unigramBuckets;
        for (int i = 0; i < unigramBuckets; i++) {
            double p = (unigramCounts[i] + addAlpha) / unigramDenom;
            unigramLogP[i] = (float) Math.log(p);
        }

        return new Model(bigramBuckets, unigramBuckets, bigramLogP, unigramLogP,
                bloomBitArr, bloomBits, bloomK, backoffAlpha);
    }

    /**
     * Estimates the mean and standard deviation of the model's mean-log-prob
     * score over a strided sample of the training streams; callers use the
     * pair as a z-score normalization baseline.
     *
     * @return {@code {mu, sigma}}; {@code {0, 1}} if nothing was scorable
     */
    private static double[] calibrate(Model m, List<int[]> streams) {
        double s = 0;
        double s2 = 0;
        int n = 0;
        // Use a stride to avoid scoring every single train record
        int stride = Math.max(1, streams.size() / 1000);
        for (int i = 0; i < streams.size(); i += stride) {
            int[] cps = streams.get(i);
            if (cps.length < MIN_SCORE_CODEPOINTS) continue;
            ScoreResult r = score(m, cps);
            s += r.meanLogP;
            s2 += r.meanLogP * r.meanLogP;
            n++;
        }
        if (n == 0) return new double[]{0, 1};
        double mu = s / n;
        // Var = E[x^2] - mu^2; clamp at 0 against floating-point cancellation.
        double var = Math.max(0, s2 / n - mu * mu);
        double sigma = Math.sqrt(var);
        return new double[]{mu, sigma};
    }
+ + // ----------------------------------------------------------------------- + // Scoring + // ----------------------------------------------------------------------- + + private static ScoreResult score(Model m, int[] cps) { + if (cps.length < 2) return new ScoreResult(Double.NaN, 0, 0); + double sum = 0; + int n = 0; + int seen = 0; + for (int i = 0; i + 1 < cps.length; i++) { + int cp1 = cps[i]; + int cp2 = cps[i + 1]; + double logP; + if (bloomContains(m.bloomBits, m.bloomBitCount, m.bloomK, cp1, cp2)) { + int b = (int) (fnv1aBigram(cp1, cp2) & (m.bigramBuckets - 1)); + logP = m.bigramLogP[b]; + seen++; + } else { + int u1 = (int) (fnv1aUnigram(cp1) & (m.unigramBuckets - 1)); + int u2 = (int) (fnv1aUnigram(cp2) & (m.unigramBuckets - 1)); + logP = m.backoffAlpha * (m.unigramLogP[u1] + m.unigramLogP[u2]); + } + sum += logP; + n++; + } + return new ScoreResult(sum / n, n, seen); + } + + private static final class ScoreResult { + final double meanLogP; + final int nPairs; + final int seenPairs; + ScoreResult(double m, int n, int s) { + this.meanLogP = m; + this.nPairs = n; + this.seenPairs = s; + } + } + + // ----------------------------------------------------------------------- + // Eval at one length bucket + // ----------------------------------------------------------------------- + + private static EvalCell evalAtLength(Model m, List evalBytes, int length, + Charset cleanCs, Charset wrongCs) { + List cleans = new ArrayList<>(); + List mojis = new ArrayList<>(); + List margins = new ArrayList<>(); + double seenSumClean = 0, seenSumMoji = 0; + int nSeenObs = 0; + for (byte[] rec : evalBytes) { + if (rec.length < length) continue; + byte[] slice = Arrays.copyOf(rec, length); + int[] cleanCps = toCodepoints(decode(slice, cleanCs)); + int[] mojiCps = toCodepoints(decode(slice, wrongCs)); + if (cleanCps.length < MIN_SCORE_CODEPOINTS + || mojiCps.length < MIN_SCORE_CODEPOINTS) continue; + ScoreResult sc = score(m, cleanCps); + ScoreResult sm = score(m, mojiCps); + 
if (Double.isNaN(sc.meanLogP) || Double.isNaN(sm.meanLogP)) continue; + cleans.add(sc.meanLogP); + mojis.add(sm.meanLogP); + margins.add(sc.meanLogP - sm.meanLogP); + if (sc.nPairs > 0) seenSumClean += (double) sc.seenPairs / sc.nPairs; + if (sm.nPairs > 0) seenSumMoji += (double) sm.seenPairs / sm.nPairs; + nSeenObs++; + } + if (margins.size() < 30) return null; + EvalCell cell = new EvalCell(); + cell.n = margins.size(); + cell.meanClean = mean(cleans); + cell.stdClean = std(cleans, cell.meanClean); + cell.meanMoji = mean(mojis); + cell.meanMargin = mean(margins); + cell.stdMargin = std(margins, cell.meanMargin); + cell.p5Margin = percentile(margins, 0.05); + cell.p50Margin = percentile(margins, 0.50); + cell.bloomSeenFracClean = nSeenObs > 0 ? seenSumClean / nSeenObs : Double.NaN; + cell.bloomSeenFracMoji = nSeenObs > 0 ? seenSumMoji / nSeenObs : Double.NaN; + return cell; + } + + private static final class EvalCell { + int n; + double meanClean, stdClean; + double meanMoji; + double meanMargin, stdMargin; + double p5Margin, p50Margin; + double bloomSeenFracClean, bloomSeenFracMoji; + } + + // ----------------------------------------------------------------------- + // FNV-1a hashing for codepoint bigram / unigram + Bloom filter + // ----------------------------------------------------------------------- + + private static final long FNV_OFFSET = 0xcbf29ce484222325L; + private static final long FNV_PRIME = 0x100000001b3L; + + private static long fnv1aBigram(int cp1, int cp2) { + long h = FNV_OFFSET; + h = (h ^ ((cp1 >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp1 >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp1 >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp1 & 0xFF)) * FNV_PRIME; + h = (h ^ 0xFF) * FNV_PRIME; // separator + h = (h ^ ((cp2 >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp2 >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp2 >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp2 & 0xFF)) * FNV_PRIME; + return h; + } + + private static long fnv1aUnigram(int cp) { + long h 
= FNV_OFFSET; + h = (h ^ ((cp >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp & 0xFF)) * FNV_PRIME; + return h; + } + + private static long secondaryHash(int cp1, int cp2) { + // Independent secondary hash for Bloom double-hashing. Just shuffle + // the inputs differently. + long h = 0xff51afd7ed558ccdL; + h = (h ^ Integer.reverse(cp1)) * 0xc4ceb9fe1a85ec53L; + h = (h ^ Integer.reverse(cp2)) * 0xc4ceb9fe1a85ec53L; + h ^= h >>> 33; + return h; + } + + private static void bloomAdd(long[] bits, int bitCount, int k, int cp1, int cp2) { + long h1 = fnv1aBigram(cp1, cp2); + long h2 = secondaryHash(cp1, cp2); + for (int i = 0; i < k; i++) { + long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount; + bits[(int) (pos >>> 6)] |= 1L << (pos & 63); + } + } + + private static boolean bloomContains(long[] bits, int bitCount, int k, + int cp1, int cp2) { + long h1 = fnv1aBigram(cp1, cp2); + long h2 = secondaryHash(cp1, cp2); + for (int i = 0; i < k; i++) { + long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount; + if ((bits[(int) (pos >>> 6)] & (1L << (pos & 63))) == 0) return false; + } + return true; + } + + private static long packPair(int cp1, int cp2) { + return ((long) cp1 << 32) | (cp2 & 0xFFFFFFFFL); + } + + // ----------------------------------------------------------------------- + // I/O and decode utilities (copied from EvalJunkOnCharsetDevtest) + // ----------------------------------------------------------------------- + + private static List readRecords(Path file, int maxRecords) throws IOException { + List records = new ArrayList<>(); + try (FileInputStream fis = new FileInputStream(file.toFile()); + GZIPInputStream gis = new GZIPInputStream(fis); + DataInputStream dis = new DataInputStream(gis)) { + while (records.size() < maxRecords) { + int len; + try { + len = dis.readUnsignedShort(); + } catch (EOFException eof) { + break; + } + byte[] rec = new 
byte[len]; + dis.readFully(rec); + records.add(rec); + } + } + return records; + } + + private static String decode(byte[] bytes, Charset cs) { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + try { + return dec.decode(ByteBuffer.wrap(bytes)).toString(); + } catch (CharacterCodingException e) { + return new String(bytes, cs); + } + } + + private static int[] toCodepoints(String s) { + int[] cps = new int[s.length()]; + int n = 0; + for (int i = 0; i < s.length(); ) { + int cp = s.codePointAt(i); + cps[n++] = cp; + i += Character.charCount(cp); + } + return Arrays.copyOf(cps, n); + } + + // ----------------------------------------------------------------------- + // Stats + // ----------------------------------------------------------------------- + + private static double mean(List xs) { + double s = 0; + int n = 0; + for (double v : xs) { + if (!Double.isNaN(v)) { + s += v; + n++; + } + } + return n == 0 ? Double.NaN : s / n; + } + + private static double std(List xs, double mu) { + if (xs.size() < 2) return 0; + double s = 0; + int n = 0; + for (double v : xs) { + if (!Double.isNaN(v)) { + s += (v - mu) * (v - mu); + n++; + } + } + return n < 2 ? 
0 : Math.sqrt(s / (n - 1)); + } + + private static double percentile(List xs, double p) { + List sorted = new ArrayList<>(xs); + sorted.removeIf(v -> Double.isNaN(v)); + if (sorted.isEmpty()) return Double.NaN; + Collections.sort(sorted); + int idx = (int) Math.floor(p * (sorted.size() - 1)); + return sorted.get(idx); + } + + // ----------------------------------------------------------------------- + // Model + // ----------------------------------------------------------------------- + + private static final class Model { + final int bigramBuckets; + final int unigramBuckets; + final float[] bigramLogP; + final float[] unigramLogP; + final long[] bloomBits; + final int bloomBitCount; + final int bloomK; + final double backoffAlpha; + Model(int bb, int ub, float[] blp, float[] ulp, + long[] bloom, int bbc, int bk, double a) { + this.bigramBuckets = bb; + this.unigramBuckets = ub; + this.bigramLogP = blp; + this.unigramLogP = ulp; + this.bloomBits = bloom; + this.bloomBitCount = bbc; + this.bloomK = bk; + this.backoffAlpha = a; + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java new file mode 100644 index 00000000000..b384d5f4c51 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * Codepoint-level script census of one or more text files. For each input + * file, reports the percentage of codepoints in each {@link + * Character.UnicodeScript}, optionally per-line script-mix histograms. + * + *

<p>Useful to verify whether {@code BuildJunkTrainingData} is bucketing + * languages correctly: e.g. Japanese is usually a mix of HIRAGANA, KATAKANA + * and HAN; if {@code jpn} ends up in {@code han.train.gz} we want to know + * what fraction of its codepoints are actually Han ideographs vs. kana. + * + *

Usage: + *

+ * <pre>{@code
+ *   java ScriptCensus <file> [file ...]   # supports .gz and plain text
+ * }</pre>
+ */ +public final class ScriptCensus { + + /** Max lines to sample per file (set high for full pass). */ + private static final int MAX_LINES = 200_000; + + private ScriptCensus() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println("Usage: ScriptCensus [file ...]"); + System.exit(1); + } + for (String arg : args) { + Path f = Paths.get(arg); + if (!Files.isRegularFile(f)) { + System.err.println("Skipping non-file: " + f); + continue; + } + reportOne(f); + System.out.println(); + } + } + + private static void reportOne(Path file) throws IOException { + Map scriptCounts = new HashMap<>(); + // Per-line dominant-script histogram. + Map dominantHistogram = new HashMap<>(); + long total = 0; + long lines = 0; + long sampledBytes = 0; + + try (BufferedReader r = open(file)) { + String line; + while ((line = r.readLine()) != null && lines < MAX_LINES) { + lines++; + sampledBytes += line.length(); + // For MADLAD/Wikipedia files the format is "lineNum TAB text"; + // strip the prefix if present. + int tab = line.indexOf('\t'); + String text = tab >= 0 ? line.substring(tab + 1) : line; + + Map perLine = new HashMap<>(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + continue; + } + String name = s.name(); + scriptCounts.computeIfAbsent(name, k -> new long[1])[0]++; + perLine.merge(name, 1L, Long::sum); + total++; + } + // Identify the dominant script for this line. 
+ String dom = null; + long best = -1; + for (Map.Entry e : perLine.entrySet()) { + if (e.getValue() > best) { + best = e.getValue(); + dom = e.getKey(); + } + } + if (dom != null) { + dominantHistogram.computeIfAbsent(dom, k -> new long[1])[0]++; + } + } + } + + System.out.printf("File: %s%n", file); + System.out.printf(" lines sampled: %,d total codepoints (excl. COMMON/INHERITED): %,d%n%n", + lines, total); + + if (total == 0) { + System.out.println(" (empty / no scripted codepoints)"); + return; + } + + System.out.println(" Codepoint distribution by script:"); + List> sorted = new ArrayList<>(scriptCounts.entrySet()); + sorted.sort(Comparator.comparingLong((Map.Entry e) -> -e.getValue()[0])); + long cumulative = 0; + for (Map.Entry e : sorted) { + long c = e.getValue()[0]; + cumulative += c; + double pct = 100.0 * c / total; + double cumPct = 100.0 * cumulative / total; + if (pct < 0.01 && c < 100) continue; + System.out.printf(" %-22s %,14d %6.2f%% (cum %6.2f%%)%n", + e.getKey(), c, pct, cumPct); + } + + System.out.println(); + System.out.println(" Per-line dominant-script histogram:"); + List> dom = new ArrayList<>(dominantHistogram.entrySet()); + dom.sort(Comparator.comparingLong((Map.Entry e) -> -e.getValue()[0])); + long domTotal = 0; + for (long[] v : dominantHistogram.values()) domTotal += v[0]; + for (Map.Entry e : dom) { + long c = e.getValue()[0]; + double pct = 100.0 * c / domTotal; + if (pct < 0.05) continue; + System.out.printf(" %-22s %,12d %6.2f%% of lines%n", + e.getKey(), c, pct); + } + } + + private static BufferedReader open(Path path) throws IOException { + if (path.getFileName().toString().endsWith(".gz")) { + return new BufferedReader(new InputStreamReader( + new GZIPInputStream(Files.newInputStream(path)), + StandardCharsets.UTF_8)); + } + return Files.newBufferedReader(path, StandardCharsets.UTF_8); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index fe99f3214e3..cf52a9eedfc 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -41,6 +41,9 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.ml.junkdetect.V7Tables; + /** * Trains the junk detector model from per-script corpus files produced by * {@link BuildJunkTrainingData}. @@ -124,7 +127,17 @@ public class TrainJunkModel { static final String MAGIC = "JUNKDET1"; - static final byte VERSION = 5; + /** Sole supported file-format version. Matches JunkDetector.VERSION. */ + static final byte VERSION = 7; + + // ----------------------------------------------------------------------- + // v7 model constants (per-script open-addressing codepoint-bigram tables) + // ----------------------------------------------------------------------- + + /** Unigram backoff multiplier. α=1.0 = plain independence; prototype validated. */ + static final float V7_BACKOFF_ALPHA = 1.0f; + /** Additive smoothing constant for log-prob computation. */ + static final double V7_ADD_ALPHA = 0.01; /** Number of clean (and corrupted) windows used to train the per-script classifier. */ static final int NUM_CLASSIFIER_SAMPLES = 500; @@ -179,6 +192,25 @@ public static void main(String[] args) throws IOException { "datasets", "madlad", "junkdetect"); Path output = dataDir.resolve("junkdetect.bin"); + // Durable training parameters live in JunkDetectorTrainingConfig; this + // tool deliberately refuses CLI overrides so a built model file's + // identity always matches a committed config. 
+ int minBigramCount = JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT; + double loadFactor = JunkDetectorTrainingConfig.OA_LOAD_FACTOR; + int keyIndexBits = JunkDetectorTrainingConfig.KEY_INDEX_BITS; + if (minBigramCount < 1) { + System.err.println("ERROR: MIN_BIGRAM_COUNT must be >= 1"); + System.exit(1); + } + if (loadFactor <= 0 || loadFactor >= 1) { + System.err.println("ERROR: OA_LOAD_FACTOR must be in (0, 1), got " + loadFactor); + System.exit(1); + } + if (keyIndexBits < 1 || keyIndexBits > 16) { + System.err.println("ERROR: KEY_INDEX_BITS must be in [1, 16], got " + keyIndexBits); + System.exit(1); + } + for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--data-dir": @@ -187,6 +219,12 @@ public static void main(String[] args) throws IOException { case "--output": output = Paths.get(args[++i]); break; + case "--bloom-bits": + case "--min-bigram-count": + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." + + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); + break; default: System.err.println("Unknown argument: " + args[i]); printUsage(); @@ -194,31 +232,34 @@ public static void main(String[] args) throws IOException { } } - System.out.println("=== TrainJunkModel (v5) ==="); - System.out.println(" data-dir: " + dataDir); - System.out.println(" output: " + output); + System.out.println("=== TrainJunkModel ==="); + System.out.println(" data-dir: " + dataDir); + System.out.println(" output: " + output); + System.out.println(" --- v7 format constants (TrainJunkModel) ---"); + System.out.printf( " backoff_alpha: %.2f%n", V7_BACKOFF_ALPHA); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " min_bigram_count: %d%n", minBigramCount); + System.out.printf( " oa_load_factor: %.2f%n", loadFactor); + System.out.printf( " key_index_bits: %d%n", keyIndexBits); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); 
System.exit(1); } - System.out.print("Building Unicode named-block index... "); + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); + System.out.printf("Block bucketing: %d named blocks + 1 unassigned " + + "(scheme version %d, JVM-independent)%n", + blockN - 1, org.apache.tika.ml.junkdetect.UnicodeBlockRanges.SCHEME_VERSION); long t0 = System.currentTimeMillis(); - Map blockIndex = buildBlockIndex(); - int blockN = blockIndex.size() + 1; - System.out.printf("%d named blocks → table size %d×%d (%dms)%n", - blockIndex.size(), blockN, blockN, System.currentTimeMillis() - t0); - - TreeMap bigramTables = new TreeMap<>(); - TreeMap bigramCalibrations = new TreeMap<>(); - TreeMap blockTables = new TreeMap<>(); - TreeMap blockCalibrations = new TreeMap<>(); + + TreeMap f1Calibrations = new TreeMap<>(); + TreeMap blockTables = new TreeMap<>(); + TreeMap blockCalibrations = new TreeMap<>(); TreeMap controlCalibrations = new TreeMap<>(); - TreeMap classifierWeights = new TreeMap<>(); - TreeMap devFilePaths = new TreeMap<>(); - List allTrainFiles = new ArrayList<>(); - List allDevFiles = new ArrayList<>(); + TreeMap classifierWeights = new TreeMap<>(); + TreeMap trainFilePaths = new TreeMap<>(); + List allTrainFiles = new ArrayList<>(); List trainFiles; try (var stream = Files.list(dataDir)) { @@ -234,69 +275,62 @@ public static void main(String[] args) throws IOException { } // ----------------------------------------------------------------------- - // Phase 1 — per-script bigram tables, block tables, calibrations + // Phase 1 — per-script F1 tables (V7), F1 calibration, F2 block tables, + // F3 control-byte calibration // ----------------------------------------------------------------------- - System.out.println("\n--- Phase 1: per-script tables and calibrations ---"); + TreeMap f1TablesByScript = new TreeMap<>(); + System.out.println("\n--- Phase 1: per-script F1 tables + calibrations ---"); for (Path trainFile : trainFiles) { String filename = 
trainFile.getFileName().toString(); String script = filename.substring(0, filename.length() - ".train.gz".length()) .toUpperCase(); - Path devFile = trainFile.getParent().resolve( - filename.replace(".train.gz", ".dev.gz")); System.out.printf("%n [%s]%n", script); allTrainFiles.add(trainFile); t0 = System.currentTimeMillis(); - System.out.print(" Training byte-bigram table... "); - float[] bigramTable = trainBigramTable(trainFile); - System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0); + System.out.print(" Training V7 F1 tables (cp index + OA).."); + V7Tables v7 = trainV7TablesForScript(trainFile, minBigramCount, + loadFactor, keyIndexBits); + System.out.printf(" done (%dms)%n", System.currentTimeMillis() - t0); + System.out.println(v7.statsString()); + f1TablesByScript.put(script, v7); t0 = System.currentTimeMillis(); - System.out.print(" Training named-block table... "); - float[] blockTable = trainBlockTable(trainFile, blockIndex, blockN); + System.out.print(" Training named-block table... "); + float[] blockTable = trainBlockTable(trainFile); System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0); - float[] bigramCal = new float[]{0f, 1f}; - float[] blockCal = new float[]{0f, 1f}; - float[] controlCal = new float[]{0f, 1f}; - - if (Files.exists(devFile)) { - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating byte bigrams on dev... "); - bigramCal = computeBigramCalibration(devFile, bigramTable); - System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", - bigramCal[0], bigramCal[1], System.currentTimeMillis() - t0); - - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating named blocks on dev... 
"); - blockCal = computeBlockCalibration(devFile, blockTable, blockIndex, blockN); - System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", - blockCal[0], blockCal[1], System.currentTimeMillis() - t0); - - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating control bytes on dev..."); - controlCal = computeControlByteCalibration(devFile); - System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n", - controlCal[0], controlCal[1], System.currentTimeMillis() - t0); - - devFilePaths.put(script, devFile); - allDevFiles.add(devFile); - } else { - System.out.println(" WARNING: no dev file found, using uncalibrated defaults"); - } + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating F1 (cp-hash) on train.. "); + float[] f1Cal = calibrateF1PerScript(trainFile, v7); + System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", + f1Cal[0], f1Cal[1], System.currentTimeMillis() - t0); + + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating named blocks on train..."); + float[] blockCal = computeBlockCalibration(trainFile, blockTable); + System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", + blockCal[0], blockCal[1], System.currentTimeMillis() - t0); - bigramTables.put(script, bigramTable); - bigramCalibrations.put(script, bigramCal); + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating control bytes on train.."); + float[] controlCal = computeControlByteCalibration(trainFile); + System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n", + controlCal[0], controlCal[1], System.currentTimeMillis() - t0); + + trainFilePaths.put(script, trainFile); + + f1Calibrations.put(script, f1Cal); blockTables.put(script, blockTable); blockCalibrations.put(script, blockCal); controlCalibrations.put(script, controlCal); - // Placeholder — set in phase 3 + // Placeholder — set in Phase 3 classifierWeights.put(script, new float[]{1f / 4, 1f / 4, 1f / 4, 1f / 4, 0f}); } // ----------------------------------------------------------------------- - 
// Phase 2 — global script-transition table + // Phase 2 — global script-transition table + supporting pools // ----------------------------------------------------------------------- System.out.println("\n--- Phase 2: global script-transition table ---"); List scriptBuckets = buildScriptBuckets(); @@ -314,7 +348,7 @@ public static void main(String[] args) throws IOException { t0 = System.currentTimeMillis(); System.out.print(" Calibrating script transitions... "); - float[] scriptTransCal = calibrateScriptTransitions(allDevFiles, scriptTransTable, + float[] scriptTransCal = calibrateScriptTransitions(allTrainFiles, scriptTransTable, scriptBucketMap, numScriptBuckets); System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", scriptTransCal[0], scriptTransCal[1], System.currentTimeMillis() - t0); @@ -334,21 +368,21 @@ public static void main(String[] args) throws IOException { System.out.printf("%d tables built%n", remapTables.size()); // ----------------------------------------------------------------------- - // Phase 3 — per-script linear classifiers (now with z4) + // Phase 3 — per-script linear classifiers using v6 features // ----------------------------------------------------------------------- System.out.println("\n--- Phase 3: per-script linear classifiers (z1,z2,z3,z4) ---"); - for (String script : bigramTables.keySet()) { - Path devFile = devFilePaths.get(script); - if (devFile == null) { - System.out.printf(" [%s] WARNING: no dev file, keeping equal-weight defaults%n", script); + for (String script : f1Calibrations.keySet()) { + Path trainFile = trainFilePaths.get(script); + if (trainFile == null) { + System.out.printf(" [%s] WARNING: no train file, keeping equal-weight defaults%n", script); continue; } t0 = System.currentTimeMillis(); System.out.printf(" [%s] training classifier... 
", script); - float[] weights = trainClassifier(devFile, - bigramTables.get(script), bigramCalibrations.get(script), + float[] weights = trainClassifierV7(trainFile, + f1TablesByScript.get(script), f1Calibrations.get(script), blockTables.get(script), blockCalibrations.get(script), - controlCalibrations.get(script), blockIndex, blockN, + controlCalibrations.get(script), scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets, scriptCodepoints, remapTables); classifierWeights.put(script, weights); @@ -358,82 +392,31 @@ public static void main(String[] args) throws IOException { } System.out.printf("%nWriting model (%d scripts, blockN=%d, scriptBuckets=%d) → %s%n", - bigramTables.size(), blockN, numScriptBuckets, output); - saveModel(bigramTables, bigramCalibrations, + f1Calibrations.size(), blockN, numScriptBuckets, output); + saveModelV7(f1TablesByScript, f1Calibrations, blockTables, blockCalibrations, controlCalibrations, classifierWeights, - blockIndex, blockN, scriptBuckets, scriptTransTable, scriptTransCal, output); - System.out.printf("Model size: %,d bytes (%.1f MB)%n", - Files.size(output), Files.size(output) / 1_000_000.0); + scriptBuckets, scriptTransTable, scriptTransCal, + output); + System.out.printf("Model size: %,d bytes (%.1f KB)%n", + Files.size(output), Files.size(output) / 1024.0); System.out.println("Done."); } - // ----------------------------------------------------------------------- - // Block index - // ----------------------------------------------------------------------- - - /** - * Builds a stable ordered mapping from {@link Character.UnicodeBlock} to integer index - * by scanning all valid Unicode codepoints in order (U+0000 to U+10FFFF) and - * recording each block's first occurrence. - * - *

The resulting map has {@code size()} entries (one per named block). - * Callers should reserve index {@code size()} as the "unassigned" bucket - * (for codepoints where {@code UnicodeBlock.of(cp)} returns null). - * - * @return immutable ordered map: UnicodeBlock → integer index [0, size) - */ - static Map buildBlockIndex() { - LinkedHashMap index = new LinkedHashMap<>(); - for (int cp = 0; cp <= 0x10FFFF; cp++) { - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - if (b != null) index.putIfAbsent(b, index.size()); - } - return Collections.unmodifiableMap(index); - } - // ----------------------------------------------------------------------- // Training // ----------------------------------------------------------------------- /** - * Trains a 256×256 byte-bigram log-probability table from a gzipped sentence file. - * - * @return float[65536] where index {@code a*256+b} = log P(b|a) - */ - static float[] trainBigramTable(Path trainGz) throws IOException { - long[] counts = new long[65536]; - long totalBigrams = 0; - long sentences = 0; - - try (BufferedReader r = openGzipped(trainGz)) { - String line; - while ((line = r.readLine()) != null) { - byte[] bytes = line.getBytes(StandardCharsets.UTF_8); - for (int i = 0; i + 1 < bytes.length; i++) { - counts[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)]++; - totalBigrams++; - } - sentences++; - } - } - - System.out.printf(" %,d sentences, %,d byte bigrams%n", sentences, totalBigrams); - return laplaceSmoothLogProb(counts, 256); - } - - /** - * Trains a {@code blockN×blockN} named-Unicode-block transition log-probability table. + * Trains a {@code N × N} block-transition log-probability table where + * {@code N = UnicodeBlockRanges.bucketCount()}. Block bucketing uses + * the JVM-independent {@link UnicodeBlockRanges} table. 
* - * @param blockIndex ordered mapping from UnicodeBlock to index [0, blockIndex.size()) - * @param blockN blockIndex.size() + 1 (includes the null bucket) - * @return float[blockN*blockN] where index {@code a*blockN+b} = log P(block_b | block_a) + * @return float[N*N] where index {@code a*N+b} = log P(block_b | block_a) */ - static float[] trainBlockTable(Path trainGz, - Map blockIndex, - int blockN) throws IOException { + static float[] trainBlockTable(Path trainGz) throws IOException { + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); long[] counts = new long[blockN * blockN]; - int nullId = blockN - 1; long totalBigrams = 0; long sentences = 0; @@ -443,8 +426,7 @@ static float[] trainBlockTable(Path trainGz, int prev = -1; for (int i = 0; i < line.length(); ) { int cp = line.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; + int blockId = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp); if (prev >= 0) { counts[prev * blockN + blockId]++; totalBigrams++; @@ -533,37 +515,17 @@ static List sampleSubstrings(Path devGz, int nSamples, return result; } - /** @return float[2] = {mu, sigma} of byte-bigram mean log-prob on dev windows */ - static float[] computeBigramCalibration(Path devGz, float[] bigramTable) throws IOException { - List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 42); - List scores = new ArrayList<>(windows.size()); - for (String window : windows) { - byte[] bytes = window.getBytes(StandardCharsets.UTF_8); - if (bytes.length < 2) continue; - double sum = 0; - for (int i = 0; i + 1 < bytes.length; i++) { - sum += bigramTable[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)]; - } - scores.add(sum / (bytes.length - 1)); - } - System.out.printf(" %,d dev windows%n", scores.size()); - return muSigma(scores); - } - /** @return float[2] = {mu, sigma} of block-transition mean log-prob on dev windows 
*/ - static float[] computeBlockCalibration(Path devGz, float[] blockTable, - Map blockIndex, - int blockN) throws IOException { + static float[] computeBlockCalibration(Path devGz, float[] blockTable) throws IOException { + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 43); List scores = new ArrayList<>(windows.size()); - int nullId = blockN - 1; for (String window : windows) { int[] ids = new int[window.length()]; int len = 0; for (int i = 0; i < window.length(); ) { int cp = window.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - ids[len++] = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; + ids[len++] = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp); i += Character.charCount(cp); } if (len < 2) continue; @@ -623,166 +585,15 @@ static float[] computeControlByteCalibration(Path devGz) throws IOException { * @param remapTables list of pre-built wrong-codec remap tables from {@link #buildRemapTable} * @return float[5] = {w1, w2, w3, w4, bias} — classifier weights; positive logit = clean */ - static float[] trainClassifier(Path devGz, - float[] bigramTable, float[] bigramCal, - float[] blockTable, float[] blockCal, - float[] controlCal, - Map blockIndex, - int blockN, - float[] scriptTransTable, float[] scriptTransCal, - Map scriptBucketMap, int numScriptBuckets, - Map> scriptCodepoints, - List> remapTables) - throws IOException { - int nEach = NUM_CLASSIFIER_SAMPLES; - // Clean windows - List cleanWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 100); - - // Corrupted windows: sample base windows (seed 101), then distort - // Four-way rotation: inject / shuffle / cross-script / wrong-codec remap - List baseWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 101); - Random rng = new Random(102); - List corruptedWindows = new ArrayList<>(nEach); - for (int i = 0; i < baseWindows.size(); i++) { - String w = 
baseWindows.get(i); - switch (i % 4) { - case 0: - corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); - break; - case 1: - corruptedWindows.add(shuffleChars(w, rng)); - break; - case 2: - corruptedWindows.add(injectCrossScriptChars(w, CLASSIFIER_INJECT_RATE, rng, - scriptCodepoints)); - break; - default: - if (!remapTables.isEmpty()) { - Map table = - remapTables.get(rng.nextInt(remapTables.size())); - corruptedWindows.add(wrongCodecRemap(w, table, CLASSIFIER_INJECT_RATE, rng)); - } else { - corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); - } - break; - } - } - - // Build (z1, z2, z3, z4) feature matrix - List features = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); - List labels = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); - - for (String w : cleanWindows) { - features.add(extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); - labels.add(1); // clean - } - for (String w : corruptedWindows) { - features.add(extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); - labels.add(0); // corrupted - } - - float[] weights = fitLogisticRegression(features, labels, 4); - - // Calibrate bias using only short (len=15) windows so that FPR ≤ 2.5% - // even at the worst-case (shortest) window length. 
- List shortWindows = sampleSubstrings(devGz, nEach, new int[]{15}, 200); - List shortLogits = new ArrayList<>(shortWindows.size()); - int nFeat = weights.length - 1; - for (String w : shortWindows) { - float[] x = extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets); - float logit = weights[nFeat]; - for (int j = 0; j < nFeat; j++) logit += weights[j] * x[j]; - shortLogits.add(logit); - } - if (!shortLogits.isEmpty()) { - Collections.sort(shortLogits); - int pIdx = (int) (0.025 * shortLogits.size()); - float p025 = shortLogits.get(Math.max(0, pIdx)); - weights[nFeat] -= p025; - } - - return weights; - } - - /** - * Extracts calibrated z-scores (z1, z2, z3, z4) for a single text window. - * - * @return float[4] = {z1_bigram, z2_block, z3_control, z4_scriptTrans} - */ - static float[] extractFeatures(String window, - float[] bigramTable, float[] bigramCal, - float[] blockTable, float[] blockCal, - int blockN, float[] controlCal, - Map blockIndex, - float[] scriptTransTable, float[] scriptTransCal, - Map scriptBucketMap, int numScriptBuckets) { - byte[] utf8 = window.getBytes(StandardCharsets.UTF_8); - - // z1: byte-bigram mean log-prob - float z1 = 0f; - if (utf8.length >= 2) { - double sum = 0; - int count = 0; - for (int i = 0; i + 1 < utf8.length; i++) { - sum += bigramTable[((utf8[i] & 0xFF) << 8) | (utf8[i + 1] & 0xFF)]; - count++; - } - z1 = ((float) (sum / count) - bigramCal[0]) / bigramCal[1]; - } - - // z2: block-transition mean log-prob - float z2 = 0f; - if (blockTable != null && window.length() >= 2) { - int nullId = blockN - 1; - int prev = -1; - double sum = 0; - int count = 0; - for (int i = 0; i < window.length(); ) { - int cp = window.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? 
blockIndex.getOrDefault(b, nullId) : nullId; - if (prev >= 0) { - sum += blockTable[prev * blockN + blockId]; - count++; - } - prev = blockId; - i += Character.charCount(cp); - } - if (count > 0) { - z2 = ((float) (sum / count) - blockCal[0]) / blockCal[1]; - } - } - - // z3: control-byte fraction (stored as −fraction, so higher = cleaner) - float z3 = 0f; - if (utf8.length > 0 && controlCal != null) { - long controlCount = 0; - for (byte b : utf8) { - if (isControlByte(b & 0xFF)) controlCount++; - } - float score = -(float) controlCount / utf8.length; - z3 = (score - controlCal[0]) / controlCal[1]; - } - - // z4: script-transition mean log-prob (raw UnicodeScript, no model fallback) - float z4 = 0f; - if (scriptTransTable != null && scriptTransCal != null) { - double raw = rawScriptTransitionLogProb(window, scriptTransTable, - scriptBucketMap, numScriptBuckets, numScriptBuckets - 1); - if (!Double.isNaN(raw)) { - z4 = ((float) raw - scriptTransCal[0]) / scriptTransCal[1]; - } - } - - return new float[]{z1, z2, z3, z4}; - } + // Per-feature z-score helpers (z2, z3, z4) for the classifier-training + // path live on JunkDetector as public static methods so they are the + // SOLE implementation — inference and training share the exact same + // math by construction. See {@link JunkDetector#computeZ2BlockTransition}, + // {@link JunkDetector#computeZ3ControlByte}, + // {@link JunkDetector#computeZ4ScriptTransition}. z1 (codepoint-hash) + // is computed against the in-progress hash tables during training and + // against the loaded model at inference. /** * Replaces a random fraction of characters with Unicode control characters. 
@@ -900,50 +711,394 @@ static float[] fitLogisticRegression(List features, List label // Model serialisation // ----------------------------------------------------------------------- + private static byte[] toBytes(float[] table) { + ByteBuffer buf = ByteBuffer.allocate(table.length * 4).order(ByteOrder.BIG_ENDIAN); + for (float v : table) buf.putFloat(v); + return buf.array(); + } + + // ----------------------------------------------------------------------- + // v7 Phase 1: per-script open-addressing F1 table training + // ----------------------------------------------------------------------- + + /** + * Builds the {@link V7Tables} F1 carrier for one script's training data. + * + *

Two-pass: + *

    + *
  1. Pass 1. Count every (cpA, cpB) pair occurrence and every + * cp unigram occurrence in the script's {@code *.train.gz} file. + * Pairs with count {@code < minBigramCount} are dropped at this + * step — they're typically OCR artifacts and proper-noun noise.
  2. + *
  3. Pass 2. Collect every codepoint that appears in any + * kept pair (as either side), sort, assign each a dense small + * index. Build a power-of-two open-addressing hash table sized + * for {@code keptPairs / loadFactor}; pack each retained + * {@code (idxA, idxB)} into a 32-bit key and insert via linear + * probing. Quantize both bigram log-probs and unigram log-probs + * to 8-bit.
  4. + *
+ * + *

Returned {@link V7Tables} are ready to hand to + * {@link #saveModelV7}. + * + * @param trainFile the per-script {@code *.train.gz} + * @param minBigramCount drop pairs whose count is below this + * @param loadFactor target OA table load factor (e.g. 0.5) + * @param keyIndexBits bit-width per index in the packed key + * (each side of the pair must fit) + */ + public static V7Tables trainV7TablesForScript(Path trainFile, + int minBigramCount, + double loadFactor, + int keyIndexBits) throws IOException { + // --- Pass 1: tally pair and unigram counts. --- + HashMap pairCounts = new HashMap<>(1 << 14); + HashMap unigramCounts = new HashMap<>(1 << 12); + long bigramTotal = 0; + long unigramTotal = 0; + + try (BufferedReader r = openGzipped(trainFile)) { + String line; + while ((line = r.readLine()) != null) { + int prevCp = -1; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + long[] uc = unigramCounts.get(cp); + if (uc == null) { + unigramCounts.put(cp, new long[]{1L}); + } else { + uc[0]++; + } + unigramTotal++; + if (prevCp >= 0) { + long packed = ((long) prevCp << 32) | (cp & 0xFFFFFFFFL); + long[] bc = pairCounts.get(packed); + if (bc == null) { + pairCounts.put(packed, new long[]{1L}); + } else { + bc[0]++; + } + bigramTotal++; + } + prevCp = cp; + } + } + } + + // --- Filter pairs by count, collect kept-codepoint set. --- + int totalDistinct = pairCounts.size(); + int keptPairs = 0; + long keptBigramTotal = 0; + java.util.TreeSet keptCodepoints = new java.util.TreeSet<>(); + for (Map.Entry e : pairCounts.entrySet()) { + if (e.getValue()[0] < minBigramCount) continue; + keptPairs++; + keptBigramTotal += e.getValue()[0]; + long packed = e.getKey(); + int cpA = (int) (packed >>> 32); + int cpB = (int) (packed & 0xFFFFFFFFL); + keptCodepoints.add(cpA); + keptCodepoints.add(cpB); + } + int dropped = totalDistinct - keptPairs; + + // --- Build sorted codepoint index. 
--- + int[] cpIndex = new int[keptCodepoints.size()]; + int idx = 0; + for (int cp : keptCodepoints) { + cpIndex[idx++] = cp; + } + // Enforce the indexable-bits contract. + int maxIndex = (1 << keyIndexBits) - 1; + if (cpIndex.length > maxIndex + 1) { + throw new IllegalStateException("Per-script codepoint count " + + cpIndex.length + " exceeds 2^KEY_INDEX_BITS (= " + + (maxIndex + 1) + "). Increase KEY_INDEX_BITS or apply" + + " a tighter pair-count filter for " + + trainFile.getFileName()); + } + + // --- Compute per-pair log-prob (add-α smoothed over kept pairs). --- + // Denominator: kept-bigram total + α × keptPairs (only pairs we store). + double bigramDenom = keptBigramTotal + V7_ADD_ALPHA * keptPairs; + // Unigram log-probs. We keep one entry per indexed codepoint; the + // denominator uses ALL unigram observations (kept pairs only would + // bias the backoff toward common pairs). + double unigramDenom = unigramTotal + V7_ADD_ALPHA * unigramCounts.size(); + float[] unigramLogP = new float[cpIndex.length]; + for (int i = 0; i < cpIndex.length; i++) { + long[] uc = unigramCounts.get(cpIndex[i]); + long count = uc != null ? uc[0] : 0L; + double p = (count + V7_ADD_ALPHA) / unigramDenom; + unigramLogP[i] = (float) Math.log(p); + } + // Per-script "absent codepoint" fallback: the lowest unigram log-prob + // we'd assign to a codepoint observed exactly once. A codepoint + // *not* in our index has count 0, so: + double fallbackP = V7_ADD_ALPHA / unigramDenom; + float unigramFallbackLogP = (float) Math.log(fallbackP); + + // Quantize unigram log-probs. + QuantizedFloats qUnigram = quantizeFloats(unigramLogP); + + // --- Build the open-addressing bigram table. --- + int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs / loadFactor))); + int[] keys = new int[slots]; + java.util.Arrays.fill(keys, V7Tables.EMPTY_KEY); + // Compute log-probs first, quantize once, then write into the table + // alongside its key. 
+ float[] keptLogP = new float[keptPairs]; + int[] keptKeys = new int[keptPairs]; + int writeIdx = 0; + // codepoint -> index lookup helper (small map keyed by Integer) + HashMap cpToIdx = new HashMap<>(cpIndex.length * 2); + for (int i = 0; i < cpIndex.length; i++) { + cpToIdx.put(cpIndex[i], i); + } + for (Map.Entry e : pairCounts.entrySet()) { + long count = e.getValue()[0]; + if (count < minBigramCount) continue; + long packed = e.getKey(); + int cpA = (int) (packed >>> 32); + int cpB = (int) (packed & 0xFFFFFFFFL); + int idxA = cpToIdx.get(cpA); + int idxB = cpToIdx.get(cpB); + int packedKey = JunkDetector.packBigramKey(idxA, idxB); + double p = (count + V7_ADD_ALPHA) / bigramDenom; + keptKeys[writeIdx] = packedKey; + keptLogP[writeIdx] = (float) Math.log(p); + writeIdx++; + } + // Quantize all kept log-probs together so they share min/max. + QuantizedFloats qBigram = quantizeFloats(keptLogP); + byte[] values = new byte[slots]; + for (int i = 0; i < keptPairs; i++) { + insertOA(keys, values, keptKeys[i], qBigram.bytes[i]); + } + + System.out.printf( + " pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d " + + "cp_index=%,d slots=%,d (load=%.2f)%n", + totalDistinct, keptPairs, minBigramCount, dropped, + cpIndex.length, slots, keptPairs / (double) slots); + + return new V7Tables(cpIndex, keys, values, qUnigram.bytes, + qBigram.min, qBigram.max, + qUnigram.min, qUnigram.max, + unigramFallbackLogP, V7_BACKOFF_ALPHA); + } + + /** + * Inserts a {@code (packedKey, value)} pair into the open-addressing + * table. The caller is responsible for sizing the table large enough + * to avoid an infinite probe (any load < 1.0 is safe). 
+ */ + private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { + int mask = keys.length - 1; + int h = JunkDetector.mixIndexKey(packedKey) & mask; + while (keys[h] != V7Tables.EMPTY_KEY) { + if (keys[h] == packedKey) { + // Same key twice — shouldn't happen with our dedup, but be + // defensive and overwrite rather than corrupt. + values[h] = value; + return; + } + h = (h + 1) & mask; + } + keys[h] = packedKey; + values[h] = value; + } + + private static int nextPowerOfTwo(int n) { + if (n < 1) return 1; + int p = Integer.highestOneBit(n - 1) << 1; + return Math.max(1, p); + } + + /** + * Computes per-script F1 calibration ({mu, sigma}) by scoring each + * window in the dev file against the trained per-script codepoint + * tables. Delegates to + * {@link org.apache.tika.ml.junkdetect.JunkDetector#computeF1MeanLogP} + * — the single authoritative F1 implementation shared between training + * and inference. + */ + public static float[] calibrateF1PerScript(Path devGz, V7Tables tables) throws IOException { + List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 42); + List scores = new ArrayList<>(windows.size()); + for (String window : windows) { + double score = JunkDetector.computeF1MeanLogP(window, tables); + if (!Double.isNaN(score)) { + scores.add(score); + } + } + System.out.printf(" %,d dev windows%n", scores.size()); + return muSigma(scores); + } + + // ----------------------------------------------------------------------- + // v7 Phase 3: classifier feature extractor + orchestrator + // ----------------------------------------------------------------------- + + /** + * Extracts a 4-dim calibrated z-score vector for one training window + * using the v7 per-script tables. z2/z3/z4 delegate to the public + * helpers on {@link JunkDetector} — same math used at inference, no + * trainer/inference drift possible. 
+ * + * @return float[4] = {z1_cpHash, z2_block, z3_control, z4_scriptTrans} + */ + static float[] extractFeaturesV7(String window, + V7Tables tables, float[] f1Cal, + float[] blockTable, float[] blockCal, + float[] controlCal, + float[] scriptTransTable, float[] scriptTransCal, + Map scriptBucketMap, + int numScriptBuckets) { + byte[] utf8 = window.getBytes(StandardCharsets.UTF_8); + + // z1: per-script codepoint-bigram mean log-prob + float z1 = 0f; + double rawF1 = JunkDetector.computeF1MeanLogP(window, tables); + if (!Double.isNaN(rawF1) && f1Cal != null && f1Cal[1] > 0) { + z1 = ((float) rawF1 - f1Cal[0]) / f1Cal[1]; + } + + float z2 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ2BlockTransition(window, blockTable, blockCal); + float z3 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ3ControlByte(utf8, controlCal); + float z4 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ4ScriptTransition(window, scriptTransTable, scriptTransCal, + scriptBucketMap, numScriptBuckets); + + return new float[]{z1, z2, z3, z4}; + } + + /** + * Trains a per-script binary logistic regression classifier on + * (z1_cpHash, z2, z3, z4). Same scaffolding as the v6 trainer + * (sample windows, corrupt half, fit LR, bias-calibrate on short + * windows) but uses v7 per-script F1 tables. 
+ */ + static float[] trainClassifierV7(Path devGz, + V7Tables tables, float[] f1Cal, + float[] blockTable, float[] blockCal, + float[] controlCal, + float[] scriptTransTable, float[] scriptTransCal, + Map scriptBucketMap, int numScriptBuckets, + Map> scriptCodepoints, + List> remapTables) + throws IOException { + int nEach = NUM_CLASSIFIER_SAMPLES; + + List cleanWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 100); + + List baseWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 101); + Random rng = new Random(102); + List corruptedWindows = new ArrayList<>(nEach); + for (int i = 0; i < baseWindows.size(); i++) { + String w = baseWindows.get(i); + switch (i % 4) { + case 0: + corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); + break; + case 1: + corruptedWindows.add(shuffleChars(w, rng)); + break; + case 2: + corruptedWindows.add(injectCrossScriptChars(w, CLASSIFIER_INJECT_RATE, rng, + scriptCodepoints)); + break; + default: + if (!remapTables.isEmpty()) { + Map table = + remapTables.get(rng.nextInt(remapTables.size())); + corruptedWindows.add(wrongCodecRemap(w, table, CLASSIFIER_INJECT_RATE, rng)); + } else { + corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); + } + break; + } + } + + List features = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); + List labels = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); + + for (String w : cleanWindows) { + features.add(extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); + labels.add(1); + } + for (String w : corruptedWindows) { + features.add(extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); + labels.add(0); + } + + float[] weights = fitLogisticRegression(features, labels, 4); + + // Bias calibration on short windows so FPR ≤ 2.5% at worst-case 
length. + List shortWindows = sampleSubstrings(devGz, nEach, new int[]{15}, 200); + List shortLogits = new ArrayList<>(shortWindows.size()); + int nFeat = weights.length - 1; + for (String w : shortWindows) { + float[] x = extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets); + float logit = weights[nFeat]; + for (int j = 0; j < nFeat; j++) logit += weights[j] * x[j]; + shortLogits.add(logit); + } + if (!shortLogits.isEmpty()) { + Collections.sort(shortLogits); + int pIdx = (int) (0.025 * shortLogits.size()); + float p025 = shortLogits.get(Math.max(0, pIdx)); + weights[nFeat] -= p025; + } + + return weights; + } + /** - * Writes the trained model (version 4) to a gzipped binary file. + * Writes a v7 model file (JUNKDET1 version=7 gzipped binary). * - *

Format documented in the class Javadoc. All multi-byte integers are - * big-endian; floats are IEEE 754 big-endian. + *

Layout vs. v6: no global F1+Bloom section. Each per-script + * section embeds that script's {@link V7Tables} (codepoint index, + * open-addressing bigram keys+values, unigram table) directly after + * its F1 calibration, before F2. See {@link JunkDetector#load} for + * the full layout spec. * - * @param classifierWeights per-script float[5] = {w1, w2, w3, w4, bias} - * @param blockN the block table dimension (blockIndex.size() + 1) - * @param scriptBuckets ordered list of script bucket names (last = "OTHER") - * @param scriptTransTable global script-transition log-prob table - * @param scriptTransCal float[2] = {mu, sigma} for script-transition feature + *

F2 (block transition), F3 (control byte), F4 (script transition) + * sections are unchanged from v6. */ - static void saveModel(TreeMap bigramTables, - TreeMap bigramCalibrations, - TreeMap blockTables, - TreeMap blockCalibrations, - TreeMap controlCalibrations, - TreeMap classifierWeights, - Map blockIndex, - int blockN, - List scriptBuckets, - float[] scriptTransTable, - float[] scriptTransCal, - Path output) throws IOException { + public static void saveModelV7(TreeMap f1Tables, + TreeMap f1Calibrations, + TreeMap blockTables, + TreeMap blockCalibrations, + TreeMap controlCalibrations, + TreeMap classifierWeights, + List scriptBuckets, + float[] scriptTransTable, + float[] scriptTransCal, + Path output) throws IOException { try (DataOutputStream dos = new DataOutputStream( new GZIPOutputStream(Files.newOutputStream(output)))) { dos.write(MAGIC.getBytes(StandardCharsets.UTF_8)); dos.writeByte(VERSION); - dos.writeInt(bigramTables.size()); - dos.writeShort(blockN); + dos.writeInt(f1Calibrations.size()); - // Block names section (v5+): write ordered block names for JVM-independence - String[] blockNames = new String[blockN - 1]; - for (Map.Entry e : blockIndex.entrySet()) { - blockNames[e.getValue()] = e.getKey().toString(); - } - for (String name : blockNames) { - byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); - dos.writeShort(nameBytes.length); - dos.write(nameBytes); - } + // Block-scheme version byte — bound to the JVM-independent + // UnicodeBlockRanges static table. Mismatch at load time is a + // hard error (no silent re-mapping). 
+ dos.writeByte(org.apache.tika.ml.junkdetect.UnicodeBlockRanges.SCHEME_VERSION); - // Global script-transition section (v4+) + // Global script-transition section int numBuckets = scriptBuckets.size(); dos.writeByte(numBuckets); for (String bucketName : scriptBuckets) { @@ -952,45 +1107,99 @@ static void saveModel(TreeMap bigramTables, dos.write(nameBytes); } dos.write(toBytes(scriptTransTable)); - dos.writeFloat(scriptTransCal[0]); // mu - dos.writeFloat(scriptTransCal[1]); // sigma + dos.writeFloat(scriptTransCal[0]); + dos.writeFloat(scriptTransCal[1]); - for (var entry : bigramTables.entrySet()) { + // Per-script sections. V7 embeds the F1 tables inline. + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); + for (var entry : f1Calibrations.entrySet()) { String script = entry.getKey(); - float[] bigramTable = entry.getValue(); - float[] bigramCal = bigramCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] blockTable = blockTables.getOrDefault(script, new float[blockN * blockN]); - float[] blockCal = blockCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] controlCal = controlCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] weights = classifierWeights.getOrDefault(script, + float[] f1Cal = entry.getValue(); + V7Tables tables = f1Tables.get(script); + if (tables == null) { + throw new IllegalStateException("No V7Tables for script " + script); + } + float[] blockTable = blockTables.getOrDefault(script, new float[blockN * blockN]); + float[] blockCal = blockCalibrations.getOrDefault(script, new float[]{0f, 1f}); + float[] controlCal = controlCalibrations.getOrDefault(script, new float[]{0f, 1f}); + float[] weights = classifierWeights.getOrDefault(script, new float[]{1f / 4, 1f / 4, 1f / 4, 1f / 4, 0f}); byte[] nameBytes = script.getBytes(StandardCharsets.UTF_8); dos.writeShort(nameBytes.length); dos.write(nameBytes); - dos.writeFloat(bigramCal[0]); - dos.writeFloat(bigramCal[1]); - 
dos.write(toBytes(bigramTable)); + // F1 calibration + dos.writeFloat(f1Cal[0]); + dos.writeFloat(f1Cal[1]); + + // F1 per-script tables + tables.writeTo(dos); + // F2 — block transitions dos.writeFloat(blockCal[0]); dos.writeFloat(blockCal[1]); dos.write(toBytes(blockTable)); + // F3 — control-byte calibration dos.writeFloat(controlCal[0]); dos.writeFloat(controlCal[1]); + // Classifier weights int numFeatures = weights.length - 1; dos.writeByte(numFeatures); - for (float v : weights) dos.writeFloat(v); + for (float v : weights) { + dos.writeFloat(v); + } } } } - private static byte[] toBytes(float[] table) { - ByteBuffer buf = ByteBuffer.allocate(table.length * 4).order(ByteOrder.BIG_ENDIAN); - for (float v : table) buf.putFloat(v); - return buf.array(); + /** + * Quantizes a float array to 8-bit unsigned by linearly mapping + * {@code [min, max] → [0, 255]}. Returns the byte array; {@code min} + * and {@code max} are computed from the input. + * + *

Stored in v6 model files as 8-bit log-prob tables; reader + * dequantizes via {@code min + (b/255) * (max - min)}. + * + * @return three-element record: byte[] quantized, float min, float max + */ + public static QuantizedFloats quantizeFloats(float[] in) { + float min = Float.POSITIVE_INFINITY; + float max = Float.NEGATIVE_INFINITY; + for (float v : in) { + if (Float.isFinite(v)) { + if (v < min) min = v; + if (v > max) max = v; + } + } + if (!Float.isFinite(min) || !Float.isFinite(max) || max == min) { + // Degenerate input — emit zeros, store dummy range. + return new QuantizedFloats(new byte[in.length], 0f, 1f); + } + byte[] out = new byte[in.length]; + float range = max - min; + for (int i = 0; i < in.length; i++) { + float v = Float.isFinite(in[i]) ? in[i] : min; + int q = Math.round(((v - min) / range) * 255.0f); + if (q < 0) q = 0; + else if (q > 255) q = 255; + out[i] = (byte) q; + } + return new QuantizedFloats(out, min, max); + } + + /** Return type of {@link #quantizeFloats(float[])}. */ + public static final class QuantizedFloats { + public final byte[] bytes; + public final float min; + public final float max; + public QuantizedFloats(byte[] bytes, float min, float max) { + this.bytes = bytes; + this.min = min; + this.max = max; + } } // ----------------------------------------------------------------------- @@ -1307,5 +1516,9 @@ private static void printUsage() { System.err.println(" (default: ~/datasets/madlad/junkdetect)"); System.err.println(" --output Output model file"); System.err.println(" (default: {data-dir}/junkdetect.bin)"); + System.err.println(); + System.err.println("All other training parameters (Bloom filter size, min bigram count, etc.)"); + System.err.println("are fixed in JunkDetectorTrainingConfig and tracked in git. 
Edit that"); + System.err.println("file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index feb9da112e7..644d46bad05 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java index 88a5a8c16fa..e670f9e1639 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java @@ -17,6 +17,7 @@ package org.apache.tika.ml.junkdetect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.StandardCharsets; @@ -31,6 +32,7 @@ /** * Smoke tests verifying the bundled model meets minimum quality thresholds. * Failures indicate the model needs more data or feature extraction is wrong. + * */ public class JunkDetectorSmokeTest { @@ -110,7 +112,7 @@ void cp1252VsCp1257OnBalticText() throws Exception { System.out.println("Baltic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1257", result.winner(), "cp1257 should be identified as the correct encoding for Lithuanian text"); // Delta is weak (pooled LATIN model dilutes Baltic-specific bigrams). // Production threshold is delta > 1.0; PoC floor is 0.1. 
@@ -142,7 +144,7 @@ void cp1252VsCp1251OnRussianText() throws Exception { System.out.println("Russian Cyrillic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1251", result.winner(), "cp1251 should be identified as the correct encoding for Russian text"); assertTrue(result.delta() > 1.0, "Cyrillic codec separation should be strong: delta=" + result.delta()); @@ -195,10 +197,50 @@ void shiftJisZipEntryNameVsUtf8() throws Exception { System.out.println("Shift-JIS zip entry: " + result); - assertEquals("A", result.winner(), + assertEquals("Shift-JIS", result.winner(), "Shift-JIS decode should beat garbled UTF-8 for short Japanese filename"); } + /** + * Regression: a single CJK codepoint sandwiched between modeled-script + * runs used to NaN-poison the entire score, because the byte-length + * filter ({@code runUtf8.length >= 2}) and the UTF-16 char-length + * filter inside {@code computeF1MeanLogP} ({@code text.length() >= 2}) + * disagreed. A single CJK char is 3 UTF-8 bytes (1 UTF-16 unit), so + * it passed the outer filter, computed NaN inside, and poisoned the + * weighted aggregate — surfacing as UNKNOWN to callers. This was the + * root cause of the AIT5-class regressions (UTF-8 Malayalam decoded as + * GB18030 returns lots of single-Han-char runs). + */ + @Test + void singleCjkCharDoesNotNaNPoisonScore() { + // Latin sentence with a stray CJK char dropped in — exactly the + // shape of a GB18030-mojibake-of-UTF-8 decode at the run-boundary + // level. The CJK char forms a single-codepoint HAN run. + String text = "The quick brown 中 fox jumps over the lazy dog. " + + "Pack 中 my box with five dozen liquor jugs."; + TextQualityScore score = detector.score(text); + assertFalse(score.isUnknown(), + "score should not be UNKNOWN — single-CJK run should be skipped, " + + "not poison the aggregate. 
Got: " + score); + } + + /** + * Sibling regression: the same NaN-poisoning case caused by a single + * supplementary-plane (4-byte UTF-8, 2-UTF-16-unit) codepoint. Less + * load-bearing than the BMP-CJK case — supplementary chars decode to + * {@code text.length() == 2} so they pass the inner filter — but + * worth pinning the behaviour. + */ + @Test + void supplementaryPlaneCharSurvivesScoring() { + // U+1F600 (😀) is a 2-UTF-16-unit supplementary char with script COMMON, + // so it attaches to a preceding modeled run rather than forming its own. + String text = "Hello world 😀 this is some plain English text."; + TextQualityScore score = detector.score(text); + assertFalse(score.isUnknown(), "supplementary char should not break scoring: " + score); + } + // ----------------------------------------------------------------------- /** diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java new file mode 100644 index 00000000000..b846064c52f --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.TreeMap; +import java.util.zip.GZIPOutputStream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.ml.junkdetect.tools.JunkDetectorTrainingConfig; +import org.apache.tika.ml.junkdetect.tools.TrainJunkModel; +import org.apache.tika.quality.TextQualityScore; + +/** + * Validates the v7 model file format end-to-end: a synthetic small model is + * constructed in-memory with known table values, saved via + * {@link TrainJunkModel#saveModelV7}, loaded via {@link JunkDetector#load}, + * scored against known input, and the output verified against hand-computed + * expected values. + * + *

This is the architectural-decision validation: it confirms that the v7 + * file format spec, the trainer's save path, the loader, and the scoring + * path (per-script open-addressing codepoint-bigram + unigram backoff) all + * agree on the semantics. Does not require the production training corpus. + */ +public class JunkDetectorV7Test { + + @Test + void v7RoundTripSeenPairAndUnigramBackoff(@TempDir Path tmp) throws IOException { + // ----------------------------------------------------------------- + // Build a tiny synthetic v7 model for LATIN. + // + // codepointIndex = ['A', 'B'] (indices 0, 1) + // Pair (A, B) stored with log-prob -1.0 + // (B, A) is *not* in the bigram table — falls back to unigram. + // Unigram log-prob = -2.0 for both 'A' and 'B'. + // backoffAlpha = 1.0 → backoff sum = -4.0 + // + // Expected mean log-prob over "ABAB": + // (A,B) seen: -1.0 + // (B,A) backoff: 1.0 * (-2 + -2) = -4.0 + // (A,B) seen: -1.0 + // mean = -2.0 + // f1Cal mu=-5, sigma=1 → z1 = (-2 - -5) / 1 = +3.0 + // Classifier w1=1, rest 0, bias=0 → logit = +3.0 + // ----------------------------------------------------------------- + V7Tables tables = buildLatinTablesAB(); + + Path modelFile = tmp.resolve("v7-test.bin"); + saveMinimalV7Model(tables, modelFile); + + // Verify the file roundtrips through the loader. + JunkDetector detector = JunkDetector.loadFromPath(modelFile); + assertEquals(7, detector.getModelVersion(), "Loaded model should be v7"); + + TextQualityScore score = detector.score("ABAB"); + assertEquals("LATIN", score.getDominantScript(), "Dominant script should be LATIN"); + // Quantization of [-4, -1] to 8 bits introduces ~0.012 nat / level. + // Net z-error over 3 pairs bounded ~0.05; allow 0.3 to be safe. 
+ assertEquals(3.0f, score.getZScore(), 0.3f, + "Expected z ≈ +3.0 for 'ABAB' (seen-pair + backoff mix)"); + } + + @Test + void v7RoundTripAllSeenPairsScoreHigher(@TempDir Path tmp) throws IOException { + // Same shape as the first test but with BOTH (A,B) and (B,A) in the + // bigram table. mean log-prob = -1.0, z1 = +4.0, logit = +4.0. + int[] cpIndex = new int[]{'A', 'B'}; + int[] keys = new int[4]; + Arrays.fill(keys, V7Tables.EMPTY_KEY); + byte[] values = new byte[4]; + float bMin = -10.0f; + float bMax = -1.0f; + byte b = quantizeOne(-1.0f, bMin, bMax); + insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); + insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b); + + float uMin = -5.0f; + float uMax = -2.0f; + byte[] unigramBytes = new byte[]{ + quantizeOne(-2.0f, uMin, uMax), + quantizeOne(-2.0f, uMin, uMax), + }; + + V7Tables tables = new V7Tables(cpIndex, keys, values, unigramBytes, + bMin, bMax, uMin, uMax, + -10.0f, 1.0f); + + Path modelFile = tmp.resolve("v7-test-allseen.bin"); + saveMinimalV7Model(tables, modelFile); + JunkDetector detector = JunkDetector.loadFromPath(modelFile); + + TextQualityScore score = detector.score("ABAB"); + // mean = -1.0, z1 = (-1 - -5) / 1 = +4.0 + assertEquals(4.0f, score.getZScore(), 0.3f, + "All-seen 'ABAB' should score z ≈ +4"); + } + + /** + * End-to-end trainer integration: drives {@link + * TrainJunkModel#trainV7TablesForScript} on a tiny synthetic corpus, + * calibrates F1, saves a model, loads it, and scores text. Catches + * drift between trainer F1 math and inference F1 math — the FNV + * mix-hash, packed-key layout, and codepoint-pair iteration order all + * have to agree exactly, or scoring produces nonsense. + * + *

F2/F3/F4 are zeroed out (placeholder data) — the test isolates + * F1's trainer↔inference round-trip. + */ + @Test + void trainerRoundTripIntegration(@TempDir Path tmp) throws IOException { + // --- 1. Build a tiny LATIN corpus on disk --- + Path trainFile = tmp.resolve("LATIN.train.gz"); + writeGzippedLines(trainFile, + "the quick brown fox jumps over the lazy dog", + "pack my box with five dozen liquor jugs", + "how vexingly quick daft zebras jump", + "the five boxing wizards jump quickly", + "sphinx of black quartz judge my vow"); + Path devFile = tmp.resolve("LATIN.dev.gz"); + writeGzippedLines(devFile, + "the rain in spain falls mainly on the plain", + "a stitch in time saves nine", + "all that glitters is not gold"); + + // --- 2. Phase 1: train V7 F1 tables for this script --- + // Tiny corpus → min_count=1 so all pairs survive. + V7Tables tables = TrainJunkModel.trainV7TablesForScript(trainFile, + 1, JunkDetectorTrainingConfig.OA_LOAD_FACTOR, + JunkDetectorTrainingConfig.KEY_INDEX_BITS); + + // Sanity: 'h' should be in the codepoint index (appears in "the"). + assertTrue(Arrays.binarySearch(tables.codepointIndex, (int) 'h') >= 0, + "'h' should be in codepoint index — it appears in training"); + assertTrue(Arrays.binarySearch(tables.codepointIndex, (int) 'x') >= 0, + "'x' should be in codepoint index — appears in 'box', 'fox'"); + + // The pair (t, h) is in training; the OA lookup should find it. + int idxT = Arrays.binarySearch(tables.codepointIndex, (int) 't'); + int idxH = Arrays.binarySearch(tables.codepointIndex, (int) 'h'); + assertTrue(idxT >= 0 && idxH >= 0); + int slot = JunkDetector.lookupBigramSlot(tables, idxT, idxH); + assertTrue(slot >= 0, "OA lookup should find seen pair (t, h)"); + + // --- 3. 
F1 raw scoring sanity --- + double meanLogP = JunkDetector.computeF1MeanLogP("the quick brown fox", tables); + assertTrue(Double.isFinite(meanLogP), + "Mean log-prob on training text should be finite, got " + meanLogP); + assertTrue(meanLogP > -15 && meanLogP < 0, + "Score on training text should be sensible, got " + meanLogP); + + // --- 4. Phase 1.5: F1 calibration on dev --- + float[] f1CalLatin = TrainJunkModel.calibrateF1PerScript(devFile, tables); + assertTrue(Float.isFinite(f1CalLatin[0]), "mu1 should be finite"); + assertTrue(Float.isFinite(f1CalLatin[1]) && f1CalLatin[1] > 0, + "sigma1 should be positive finite"); + + // --- 5. Assemble + save a minimal v7 model --- + int blockN = UnicodeBlockRanges.bucketCount(); + TreeMap f1Tables = new TreeMap<>(); + f1Tables.put("LATIN", tables); + TreeMap blockTables = new TreeMap<>(); + blockTables.put("LATIN", new float[blockN * blockN]); + TreeMap blockCal = new TreeMap<>(); + blockCal.put("LATIN", new float[]{0f, 1f}); + TreeMap controlCal = new TreeMap<>(); + controlCal.put("LATIN", new float[]{0f, 1f}); + TreeMap f1CalMap = new TreeMap<>(); + f1CalMap.put("LATIN", f1CalLatin); + TreeMap classifierWeights = new TreeMap<>(); + classifierWeights.put("LATIN", new float[]{1f, 0f, 0f, 0f, 0f}); + + List scriptBuckets = List.of("LATIN", "OTHER"); + float[] scriptTransTable = new float[scriptBuckets.size() * scriptBuckets.size()]; + float[] scriptTransCal = new float[]{0f, 1f}; + + Path modelPath = tmp.resolve("junkdetect.bin"); + TrainJunkModel.saveModelV7( + f1Tables, f1CalMap, blockTables, blockCal, controlCal, + classifierWeights, scriptBuckets, scriptTransTable, + scriptTransCal, modelPath); + + // --- 6. 
Load via JunkDetector and score --- + JunkDetector detector = JunkDetector.loadFromPath(modelPath); + assertEquals(7, detector.getModelVersion(), + "Loaded model should be v7"); + assertTrue(detector.knownScripts().contains("LATIN"), + "Loaded model should know LATIN"); + + TextQualityScore score = detector.score("the quick brown fox jumps"); + assertEquals("LATIN", score.getDominantScript()); + assertTrue(Float.isFinite(score.getZScore()), + "Score on in-distribution text should be finite, got " + score); + + // --- 7. Train/infer consistency check --- + // The inference path should compute the same raw F1 score as + // JunkDetector.computeF1MeanLogP on the same text — if these + // two ever disagree, the model's calibration is silently wrong. + String probe = "pack my box with five dozen liquor jugs"; + double trainerRawMean = JunkDetector.computeF1MeanLogP(probe, tables); + float expectedZ1 = (float) ((trainerRawMean - f1CalLatin[0]) / f1CalLatin[1]); + TextQualityScore probeScore = detector.score(probe); + // logit = w1 * z1 + 0 + 0 + 0 + 0 = z1 in this test configuration. + assertEquals(expectedZ1, probeScore.getZScore(), 0.001f, + "Inference z1 must match trainer-computed z1 " + + "(train/infer F1 math drift)"); + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + /** + * Builds a V7Tables with codepoint index ['A', 'B'], where (A,B) has a + * stored log-prob of -1.0 but (B,A) is absent (forces unigram backoff). + * Unigram log-prob = -2.0 for both A and B. + * + *

Bigram quant range is set explicitly to {@code [-10, -1]} so that + * the single stored value at -1.0 maps to byte 255 (avoids the + * degenerate {@code min == max} branch in + * {@link TrainJunkModel#quantizeFloats}). Same idea for the unigram + * range {@code [-5, -2]} so the (-2.0, -2.0) values map to byte 255. + */ + private static V7Tables buildLatinTablesAB() { + int[] cpIndex = new int[]{'A', 'B'}; + + // 4 slots ≈ 25% load for 1 pair. Open-addressing with linear probe. + int[] keys = new int[4]; + Arrays.fill(keys, V7Tables.EMPTY_KEY); + byte[] values = new byte[4]; + + // Manual quantization with a chosen range so we don't hit the + // degenerate single-element case. range=[-10, -1] → -1.0 → byte 255. + float bMin = -10.0f; + float bMax = -1.0f; + byte b = quantizeOne(-1.0f, bMin, bMax); + insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); + + float uMin = -5.0f; + float uMax = -2.0f; + byte[] unigramBytes = new byte[]{ + quantizeOne(-2.0f, uMin, uMax), + quantizeOne(-2.0f, uMin, uMax), + }; + + return new V7Tables(cpIndex, keys, values, unigramBytes, + bMin, bMax, + uMin, uMax, + -10.0f, 1.0f); + } + + /** Quantize a single float to 8-bit unsigned using the explicit range. */ + private static byte quantizeOne(float v, float min, float max) { + float range = max - min; + int q = Math.round(((v - min) / range) * 255.0f); + if (q < 0) q = 0; + else if (q > 255) q = 255; + return (byte) q; + } + + /** + * Replica of {@code TrainJunkModel.insertOA} (package-private) for the + * test's hand-constructed tables. Uses the same mix-hash as the + * production code path. 
+ */ + private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { + int mask = keys.length - 1; + int h = JunkDetector.mixIndexKey(packedKey) & mask; + while (keys[h] != V7Tables.EMPTY_KEY) { + if (keys[h] == packedKey) { + values[h] = value; + return; + } + h = (h + 1) & mask; + } + keys[h] = packedKey; + values[h] = value; + } + + /** + * Saves a minimal v7 model containing only LATIN, with F2/F3/F4 zeroed + * out and pure-F1 classifier weights (w1=1, rest 0, bias 0). Scoring + * a window thus reduces to z1 directly. F1 calibration: mu=-5, sigma=1. + */ + private static void saveMinimalV7Model(V7Tables tables, Path modelFile) throws IOException { + TreeMap f1Tables = new TreeMap<>(); + f1Tables.put("LATIN", tables); + + TreeMap f1Cal = new TreeMap<>(); + f1Cal.put("LATIN", new float[]{-5.0f, 1.0f}); + + int blockN = UnicodeBlockRanges.bucketCount(); + + TreeMap blockTables = new TreeMap<>(); + blockTables.put("LATIN", new float[blockN * blockN]); + TreeMap blockCal = new TreeMap<>(); + blockCal.put("LATIN", new float[]{0f, 1f}); + + TreeMap controlCal = new TreeMap<>(); + controlCal.put("LATIN", new float[]{0f, 1f}); + + List scriptBuckets = List.of("LATIN", "OTHER"); + float[] scriptTransTable = new float[scriptBuckets.size() * scriptBuckets.size()]; + float[] scriptTransCal = new float[]{0f, 1f}; + + TreeMap classifierWeights = new TreeMap<>(); + classifierWeights.put("LATIN", new float[]{1.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + TrainJunkModel.saveModelV7( + f1Tables, f1Cal, blockTables, blockCal, controlCal, + classifierWeights, scriptBuckets, scriptTransTable, + scriptTransCal, modelFile); + } + + private static void writeGzippedLines(Path path, String... 
lines) throws IOException { + try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter( + new GZIPOutputStream(Files.newOutputStream(path)), + StandardCharsets.UTF_8))) { + for (String line : lines) { + w.write(line); + w.write('\n'); + } + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java index 0b97a9a0bdb..eac556f139b 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java @@ -161,4 +161,54 @@ public void noopWhenAllDecodingsIdentical() throws Exception { // constructor — ServiceLoader cannot instantiate it. Wiring JunkDetector // up as a proper SPI provider is tracked as follow-up work for TIKA-4720; // at that point this test can be added to exercise the real SPI path. 
+ + @Test + void expandHtmlEntities_numericDecimalResolvesToCodepoint() { + // U+0D4D = Malayalam Sign Virama + assertEquals("്", + JunkFilterEncodingDetector.expandHtmlEntities("്")); + // Surrounding ASCII preserved + assertEquals("a്b", + JunkFilterEncodingDetector.expandHtmlEntities("a്b")); + } + + @Test + void expandHtmlEntities_numericHexResolvesToCodepoint() { + // U+4E2D = 中 (Han ideograph "middle") + assertEquals("中", + JunkFilterEncodingDetector.expandHtmlEntities("中")); + assertEquals("中", + JunkFilterEncodingDetector.expandHtmlEntities("中")); + } + + @Test + void expandHtmlEntities_namedReferences() { + assertEquals("&", JunkFilterEncodingDetector.expandHtmlEntities("&")); + assertEquals("<", JunkFilterEncodingDetector.expandHtmlEntities("<")); + assertEquals(">", JunkFilterEncodingDetector.expandHtmlEntities(">")); + assertEquals("\"", JunkFilterEncodingDetector.expandHtmlEntities(""")); + assertEquals("a & b < c", JunkFilterEncodingDetector.expandHtmlEntities("a & b < c")); + } + + @Test + void expandHtmlEntities_malformedPassesThrough() { + // No semicolon → not matched, left as literal + assertEquals("്", JunkFilterEncodingDetector.expandHtmlEntities("്")); + // Unknown named entity → left as literal + assertEquals("&unknown;", + JunkFilterEncodingDetector.expandHtmlEntities("&unknown;")); + // Out-of-range numeric → left as literal (passes overflow guard) + assertEquals("�", + JunkFilterEncodingDetector.expandHtmlEntities("�")); + } + + @Test + void expandHtmlEntities_mixedEntityAndRawCodepoints() { + // Simulates an AIT5-style document: mix of raw Malayalam codepoints + // and numeric entity references encoding more Malayalam codepoints. 
+ // ത = ത ് = ് (virama) + String input = "ത്ര"; + String expected = "ത്ര"; + assertEquals(expected, JunkFilterEncodingDetector.expandHtmlEntities(input)); + } } diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java new file mode 100644 index 00000000000..e25cff9204e --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +/** + * Sanity checks for the static {@link UnicodeBlockRanges} lookup table. + * + *

The table is the single source of truth for F2 block bucketing across + * trainer and inference, so any silent drift here would silently corrupt + * the block-transition feature for the entire model. These tests assert + * a handful of known-codepoint → known-bucket facts plus the table's + * internal invariants (sorted, non-overlapping, contiguous bucket ids). + */ +public class UnicodeBlockRangesTest { + + @Test + void bucketCountIs339() { + // 338 named ranges in the static table + 1 unassigned = 339 total. + // If this ever fails, the static table has changed — check that + // SCHEME_VERSION was bumped and downstream models retrained. + assertEquals(339, UnicodeBlockRanges.bucketCount()); + assertEquals(338, UnicodeBlockRanges.UNASSIGNED); + } + + @Test + void wellKnownCodepointsMapToExpectedBuckets() { + // 'A' (U+0041) → BASIC_LATIN bucket 0 + assertEquals(0, UnicodeBlockRanges.bucketOf('A')); + // 'a' (U+0061) → BASIC_LATIN + assertEquals(0, UnicodeBlockRanges.bucketOf('a')); + // U+00FF (ÿ) → LATIN_1_SUPPLEMENT bucket 1 (last codepoint in range) + assertEquals(1, UnicodeBlockRanges.bucketOf(0x00FF)); + // U+0100 (Ā) → LATIN_EXTENDED_A bucket 2 (first codepoint in next range) + assertEquals(2, UnicodeBlockRanges.bucketOf(0x0100)); + // 中 (U+4E2D) → CJK_UNIFIED_IDEOGRAPHS bucket 120 + assertEquals(120, UnicodeBlockRanges.bucketOf(0x4E2D)); + // 国 (U+56FD) → CJK_UNIFIED_IDEOGRAPHS bucket 120 + assertEquals(120, UnicodeBlockRanges.bucketOf(0x56FD)); + // U+0D24 (ത, Malayalam letter ta) → MALAYALAM bucket 30 + assertEquals(30, UnicodeBlockRanges.bucketOf(0x0D24)); + // Hangul syllables - U+AC00 → bucket 147 + assertEquals(147, UnicodeBlockRanges.bucketOf(0xAC00)); + // Cyrillic А (U+0410) → CYRILLIC bucket 8 + assertEquals(8, UnicodeBlockRanges.bucketOf(0x0410)); + } + + @Test + void codepointsInGapsBetweenBlocksReturnUnassigned() { + // The Unicode standard leaves gaps where no block is assigned. 
+ // Examples (verified by enumeration on JDK 25): + // U+10200 falls between PHAISTOS_DISC (U+101D0..U+101FF) and + // LYCIAN (U+10280..U+1029F). + assertEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x10200)); + // U+0860 changed in Unicode 10 — verify it's in some block (SYRIAC_SUPPLEMENT). + assertNotEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x0860)); + } + + @Test + void codepointsBeyondSupplementaryReturnUnassigned() { + // Negative codepoints, supplementary range edges, and beyond U+10FFFF + // are not valid input but the lookup must not crash; UNASSIGNED is fine. + assertEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(-1)); + // U+10FFFF is the last codepoint and is in SUPPLEMENTARY_PRIVATE_USE_AREA_B. + assertNotEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x10FFFF)); + } + + @Test + void schemeVersionIsBumpedOnAnyTableChange() { + // If the static table is ever modified, SCHEME_VERSION MUST be bumped + // — otherwise loaded models silently re-map to the new bucketing. + // This test enforces awareness: anyone changing the table will see + // this assertion fail and be forced to think about the consequence. + // Update the expected value here and bump SCHEME_VERSION together. + assertEquals(1, UnicodeBlockRanges.SCHEME_VERSION); + } + + @Test + void bucketIdsCoverContiguousRange() { + // Every named block id 0..337 must be reachable. Hits a representative + // codepoint in each range and asserts all 338 ids are produced (plus + // UNASSIGNED for the gaps). 
+ boolean[] seen = new boolean[UnicodeBlockRanges.bucketCount()]; + for (int cp = 0; cp <= 0x10FFFF; cp++) { + int bucket = UnicodeBlockRanges.bucketOf(cp); + assertTrue(bucket >= 0 && bucket < UnicodeBlockRanges.bucketCount(), + "Bucket out of range at cp=U+" + Integer.toHexString(cp) + + ": " + bucket); + seen[bucket] = true; + } + for (int b = 0; b < UnicodeBlockRanges.bucketCount(); b++) { + assertTrue(seen[b], "Bucket id " + b + " is never produced by any codepoint"); + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java new file mode 100644 index 00000000000..55398307191 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +/** + * Pin-test for {@link JunkDetectorTrainingConfig}. + * + *

The values exercised here are the durable choices that define the + * shipping junk-detector model's identity. This test exists so that any + * change to those values requires updating an assertion in the same + * commit, surfacing the change in code review rather than letting it + * slip silently. + * + *

If you are intentionally tuning a parameter, update both the + * constant and the matching assertion below in the same change. Do not + * "fix" a failing assertion in isolation. + */ +class JunkDetectorTrainingConfigTest { + + @Test + void corpusBuildValues() { + assertEquals(500_000_000L, + JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES); + assertEquals(5_000_000L, + JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES); + assertEquals(0.05, + JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC, 1e-9); + assertEquals(50, + JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE); + assertEquals(0.30, + JunkDetectorTrainingConfig.MAX_PUNC_FRAC, 1e-9); + assertEquals(500, + JunkDetectorTrainingConfig.MIN_DEV_SENTENCES); + assertEquals(2_000, + JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES); + assertEquals(200_000L, + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES); + assertEquals(42, + JunkDetectorTrainingConfig.SEED); + } + + @Test + void droppedScripts() { + Set drop = JunkDetectorTrainingConfig.DROP_SCRIPTS; + assertEquals(Set.of("GOTHIC", "THAANA"), drop); + // Must be immutable: any caller that tries to mutate the set + // should fail loudly rather than corrupting the shared config. + assertThrows(UnsupportedOperationException.class, + () -> drop.add("FAKE")); + } + + @Test + void scriptBudgetOverridesEmpty() { + // v7 hypothesis test (HAN=60MB) ran but gave only marginal gains. + // Override map is intentionally empty pending a more decisive + // experiment. 
+ assertTrue(JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES.isEmpty()); + } + + @Test + void modelTrainValues() { + assertEquals(3, JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT); + assertEquals(0.5, JunkDetectorTrainingConfig.OA_LOAD_FACTOR, 1e-9); + assertEquals(16, JunkDetectorTrainingConfig.KEY_INDEX_BITS); + assertTrue(JunkDetectorTrainingConfig.KEY_INDEX_BITS <= 16, + "KEY_INDEX_BITS must be <= 16 to fit packed key in an int"); + } + + @Test + void notInstantiable() { + // The class is a frozen configuration container; making it + // instantiable would invite per-call mutation. + java.lang.reflect.Constructor[] ctors = + JunkDetectorTrainingConfig.class.getDeclaredConstructors(); + assertEquals(1, ctors.length, "expected exactly one constructor"); + assertFalse(java.lang.reflect.Modifier.isPublic(ctors[0].getModifiers()), + "constructor should not be public"); + } +}