diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
index 8c054b0ef75..c1f78cebb68 100644
--- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
+++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java
@@ -50,8 +50,8 @@ public TextQualityComparison(String winner, float delta,
}
/**
- * Returns {@code "A"} if candidate A is cleaner, {@code "B"} otherwise.
- * Check {@link #delta()} to gauge confidence.
+ * Returns the label of the cleaner candidate ({@link #labelA()} or
+ * {@link #labelB()}). Check {@link #delta()} to gauge confidence.
*/
public String winner() {
return winner;
@@ -88,8 +88,7 @@ public String labelB() {
@Override
public String toString() {
return String.format(java.util.Locale.ROOT,
- "TextQualityComparison[winner=%s(%s) delta=%.3f A=%s B=%s]",
- winner, winner.equals("A") ? labelA : labelB,
- delta, scoreA, scoreB);
+ "TextQualityComparison[winner=%s delta=%.3f A=%s(%s) B=%s(%s)]",
+ winner, delta, labelA, scoreA, labelB, scoreB);
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
index d832b5a169d..b91315e7272 100644
--- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java
@@ -37,7 +37,7 @@
* // Arbitrate between two charset decodings
* TextQualityComparison cmp = detector.compare("cp1252", decodedAsCp1252,
* "cp1251", decodedAsCp1251);
- * String winner = cmp.winner(); // "A" or "B"
+ * String winner = cmp.winner(); // returns the chosen label, e.g. "cp1251"
* }
*/
public interface TextQualityDetector {
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 1719043f408..5635f6f168d 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -41,26 +41,35 @@
* Language-agnostic text quality scorer. Discriminates clean UTF-8 text from
* mojibake, reversed text, wrong-codec decodings, and other corruption forms.
*
- *
- * Scoring combines up to three features, depending on the model version:
+ *
+ * Scoring combines four features:
*
- * - Byte-bigram log-probability — 256×256 table of log P(b|a) over
- * consecutive byte pairs in the UTF-8 encoding.
- * - Unicode named-block transition log-probability (version 2+) —
- * N×N table of log P(block_b | block_a) where block IDs are the named
- * {@link Character.UnicodeBlock} values (BASIC_LATIN, ARABIC,
- * CJK_UNIFIED_IDEOGRAPHS, etc.).
- * - Control-byte fraction (version 2+) — fraction of bytes in control
+ *
+ * - Codepoint-bigram log-probability (F1) — global hashed table
+ * indexed by FNV-1a(cp_a, cp_b, seed) into {@code bigramBuckets} cells.
+ * A Bloom filter records seen pairs; unseen pairs fall back to a
+ * hashed-unigram independence-assumption score
+ * {@code α * (log P(cp_a) + log P(cp_b))}.
+ * - Unicode named-block transition log-probability (F2) —
+ * per-script N×N table over {@link Character.UnicodeBlock} values.
+ * - Control-byte fraction (F3) — fraction of bytes in control
* ranges [0x01–0x08, 0x0B, 0x0C, 0x0E–0x1F, 0x7F].
+ * - Global script-transition log-probability (F4) — single
+ * transition table over raw {@link Character.UnicodeScript} values,
+ * capturing document-level cross-script anomalies.
*
*
- * All features are calibrated (mu/sigma) on held-out dev text so their z-scores
- * are on a common scale.
+ *
+ * All features are calibrated per-script (mu/sigma) on held-out dev text
+ * so their z-scores are on a common scale. z-scores are combined by a
+ * per-script linear classifier:
+ * {@code logit = w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are
+ * fit on clean vs. corrupted dev windows. Natural junk threshold is 0
+ * (positive logit = clean); use negative thresholds for conservative
+ * detection.
*
- * Features are combined by a per-script logistic regression classifier:
- * {@code w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are fit on
- * clean vs. corrupted dev windows. The natural junk threshold is 0 (positive
- * logit = clean); use a negative threshold for conservative detection
- * (e.g., {@code score < -1}).
+ * Model file format: a single binary spec (see {@link #load(InputStream)}
+ * javadoc). No backwards-compat fallback to older formats — the loader
+ * rejects mismatched version bytes with a clear error. This is
+ * intentional: keeping parallel scoring paths is a known source of silent
+ * miscalibration bugs.
*
*
* Instances are immutable and thread-safe after construction.
*
@@ -72,7 +81,7 @@
*
* // Arbitrate between two charset decodings
* TextQualityComparison result = detector.compare("cp1252", ascp1252, "cp1251", ascp1251);
- * String winner = result.winner(); // "A" or "B"
+ * String winner = result.winner(); // returns "cp1252" or "cp1251"
* }
*/
public final class JunkDetector implements TextQualityDetector {
@@ -82,68 +91,54 @@ public final class JunkDetector implements TextQualityDetector {
"org/apache/tika/ml/junkdetect/junkdetect.bin";
static final String MAGIC = "JUNKDET1";
+ /** Sole supported file-format version. Mismatch is a hard error. */
+ static final int VERSION = 7;
- private final int modelVersion;
+ // Feature 1 — per-script open-addressed codepoint-bigram tables.
+ // No global Bloom: empty-slot is the membership oracle.
+ private final Map<String, V7Tables> f1TablesByScript;
- // Feature 1: byte bigrams (all versions)
- private final Map tables; // script → float[65536] log-prob
+ /** Per-script F1 calibration on the codepoint-hash mean log-prob. */
private final Map<String, float[]> calibrations; // script → float[2] {mu, sigma}
- // Feature 2: named-block transitions (version 2+); null for v1 models
- private final Map blockTables; // script → float[blockN*blockN]
- private final Map blockCalibrations; // script → float[2] {mu, sigma}
- private final int blockN; // block table dimension (0 for v1)
+ // Feature 2 — per-script block transition. Block bucketing uses the
+ // JVM-independent {@link UnicodeBlockRanges} static table; table size
+ // per script is {@code bucketCount()²} floats.
+ private final Map<String, float[]> blockTables;
+ private final Map<String, float[]> blockCalibrations;
- // Feature 3: control-byte fraction (version 2+); null for v1 models
- private final Map controlCalibrations; // script → float[2] {mu, sigma}
+ // Feature 3 — per-script control-byte fraction calibration
+ private final Map<String, float[]> controlCalibrations;
- // Feature combination: per-script linear classifier (version 3+); null for v1/v2 models
- // float[numFeatures+1] = {w1, ..., wN, bias}; positive logit = clean
- private final Map classifierWeights;
-
- // Feature 4: global script-transition (version 4+); null for v1/v2/v3 models
- // One global table: float[numScriptBuckets * numScriptBuckets] log P(script_b | script_a)
- // Uses raw UnicodeScript names (not SCRIPT_MODEL_FALLBACK) to distinguish HIRAGANA/KATAKANA/HAN.
+ // Feature 4 — single global script-transition table
private final float[] scriptTransitionTable;
- private final float[] scriptTransitionCalibration; // float[2] = {mu, sigma}
- private final Map scriptBucketIndex; // raw UnicodeScript name → bucket ID
- private final int numScriptBuckets; // 0 for v1/v2/v3
+ private final float[] scriptTransitionCalibration;
+ private final Map<String, Integer> scriptBucketIndex;
+ private final int numScriptBuckets;
- // Shared block index for v2+ models: UnicodeBlock → index [0, blockN-1)
- // Index blockN-1 is the "unassigned" bucket (null UnicodeBlock).
- private final Map blockIndex;
+ // Per-script linear classifier: float[numFeatures+1] = {w1, ..., wN, bias}.
+ private final Map<String, float[]> classifierWeights;
- private JunkDetector(int modelVersion,
- Map<String, float[]> tables,
- Map<String, float[]> calibrations,
+ private JunkDetector(Map<String, float[]> calibrations,
Map<String, float[]> blockTables,
Map<String, float[]> blockCalibrations,
- int blockN,
Map<String, float[]> controlCalibrations,
Map<String, float[]> classifierWeights,
- Map<Character.UnicodeBlock, Integer> blockIndex,
float[] scriptTransitionTable,
float[] scriptTransitionCalibration,
Map<String, Integer> scriptBucketIndex,
- int numScriptBuckets) {
- this.modelVersion = modelVersion;
- this.tables = Collections.unmodifiableMap(tables);
+ int numScriptBuckets,
+ Map<String, V7Tables> f1TablesByScript) {
this.calibrations = Collections.unmodifiableMap(calibrations);
- this.blockTables = blockTables != null
- ? Collections.unmodifiableMap(blockTables) : null;
- this.blockCalibrations = blockCalibrations != null
- ? Collections.unmodifiableMap(blockCalibrations) : null;
- this.blockN = blockN;
- this.controlCalibrations = controlCalibrations != null
- ? Collections.unmodifiableMap(controlCalibrations) : null;
- this.classifierWeights = classifierWeights != null
- ? Collections.unmodifiableMap(classifierWeights) : null;
- this.blockIndex = blockIndex;
+ this.blockTables = Collections.unmodifiableMap(blockTables);
+ this.blockCalibrations = Collections.unmodifiableMap(blockCalibrations);
+ this.controlCalibrations = Collections.unmodifiableMap(controlCalibrations);
+ this.classifierWeights = Collections.unmodifiableMap(classifierWeights);
this.scriptTransitionTable = scriptTransitionTable;
this.scriptTransitionCalibration = scriptTransitionCalibration;
- this.scriptBucketIndex = scriptBucketIndex != null
- ? Collections.unmodifiableMap(scriptBucketIndex) : null;
+ this.scriptBucketIndex = Collections.unmodifiableMap(scriptBucketIndex);
this.numScriptBuckets = numScriptBuckets;
+ this.f1TablesByScript = Collections.unmodifiableMap(f1TablesByScript);
}
// -----------------------------------------------------------------------
@@ -196,7 +191,53 @@ public static JunkDetector loadFromPath(Path path) throws IOException {
/**
* Loads a model from an {@link InputStream}. Gzip-detection is automatic.
- * Supports model versions 1 through 5.
+ * Strictly requires the current file-format version ({@value #VERSION}) —
+ * older formats are rejected with a clear error rather than supported
+ * via a fallback path.
+ *
+ * File-format layout (gzipped):
+ *
+ * [8 bytes] magic "JUNKDET1" (ASCII)
+ * [1 byte] version (= 7)
+ * [4 bytes] num_scripts (int BE)
+ * [1 byte] block_scheme_version (must equal
+ * {@link UnicodeBlockRanges#SCHEME_VERSION})
+ * [1 byte] num_script_buckets
+ * for each bucket:
+ * [2 bytes] name length (ushort BE)
+ * [name bytes] bucket name (UTF-8)
+ * [num_script_buckets² × 4 bytes] script-transition log-prob table (F4)
+ * [4 bytes] mu4 (float32 BE)
+ * [4 bytes] sigma4 (float32 BE)
+ * for each script (sorted by name):
+ * [2 bytes] name length
+ * [name bytes] script name (UTF-8)
+ * [4 bytes] mu1 (F1 calibration, codepoint-bigram mean log-prob)
+ * [4 bytes] sigma1
+ * // V7 F1 tables for this script — see {@link V7Tables#writeTo}
+ * [4 bytes] backoff_alpha (float32 BE)
+ * [4 bytes] codepoint_count
+ * [codepoint_count × 4 bytes] codepoint index (sorted, ascending)
+ * [4 bytes] bigram_slots (power of 2)
+ * [4 bytes] bigram_quant_min (float32 BE)
+ * [4 bytes] bigram_quant_max (float32 BE)
+ * [bigram_slots × 4 bytes] bigram open-addressing keys
+ * ((idxA<<16)|idxB, or {@link V7Tables#EMPTY_KEY})
+ * [bigram_slots bytes] bigram values (8-bit quantized log-probs)
+ * [4 bytes] unigram_quant_min (float32 BE)
+ * [4 bytes] unigram_quant_max (float32 BE)
+ * [4 bytes] unigram_fallback_log_prob (float32 BE; used for
+ * codepoints not in index)
+ * [codepoint_count bytes] unigram values (8-bit quantized log-probs)
+ * // F2/F3/classifier (unchanged from v6 layout)
+ * [4 bytes] mu2 (F2 calibration)
+ * [4 bytes] sigma2
+ * [block_N² × 4 bytes] block-transition log-prob table (F2)
+ * [4 bytes] mu3 (F3 calibration)
+ * [4 bytes] sigma3
+ * [1 byte] num_features
+ * [(num_features+1) × 4 bytes] classifier weights w1..wN and bias
+ *
*/
public static JunkDetector load(InputStream rawIs) throws IOException {
byte[] peek = rawIs.readNBytes(2);
@@ -215,21 +256,22 @@ public static JunkDetector load(InputStream rawIs) throws IOException {
throw new IOException("Not a JunkDetector model file (bad magic)");
}
int version = dis.readUnsignedByte();
- if (version != 5) {
- throw new IOException("Unsupported model version: " + version
- + ". Only version 5 is supported. Retrain the model with TrainJunkModel.");
+ if (version != VERSION) {
+ throw new IOException("Unsupported model format version: " + version
+ + ". This build expects version " + VERSION
+ + ". Retrain the model with the current TrainJunkModel.");
}
int numScripts = dis.readInt();
- // Block names (v5): stored in model for JVM-independence
- int blockN = dis.readUnsignedShort();
- String[] blockNames = new String[blockN - 1];
- for (int i = 0; i < blockN - 1; i++) {
- int nameLen = dis.readUnsignedShort();
- blockNames[i] = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8);
+ int blockSchemeVersion = dis.readUnsignedByte();
+ if (blockSchemeVersion != UnicodeBlockRanges.SCHEME_VERSION) {
+ throw new IOException("Unsupported block-scheme version: "
+ + blockSchemeVersion + ". This build expects "
+ + UnicodeBlockRanges.SCHEME_VERSION
+ + ". Retrain with the current TrainJunkModel.");
}
- Map<Character.UnicodeBlock, Integer> blockIndex = buildBlockIndexFromNames(blockNames);
+ int blockN = UnicodeBlockRanges.bucketCount();
// Global script-transition section
int numScriptBuckets = dis.readUnsignedByte();
@@ -242,42 +284,39 @@ public static JunkDetector load(InputStream rawIs) throws IOException {
float[] scriptTransitionTable = readFloatTable(dis, numScriptBuckets * numScriptBuckets);
float[] scriptTransitionCalibration = new float[]{dis.readFloat(), dis.readFloat()};
- Map<String, float[]> tables = new HashMap<>(numScripts * 2);
- Map<String, float[]> calibrations = new HashMap<>(numScripts * 2);
- Map<String, float[]> blockTables = new HashMap<>(numScripts * 2);
- Map<String, float[]> blockCalibrations = new HashMap<>(numScripts * 2);
- Map<String, float[]> controlCalibrations = new HashMap<>(numScripts * 2);
- Map<String, float[]> classifierWeights = new HashMap<>(numScripts * 2);
+ Map<String, V7Tables> f1TablesByScript = new HashMap<>(numScripts * 2);
+ Map<String, float[]> calibrations = new HashMap<>(numScripts * 2);
+ Map<String, float[]> blockTables = new HashMap<>(numScripts * 2);
+ Map<String, float[]> blockCalibrations = new HashMap<>(numScripts * 2);
+ Map<String, float[]> controlCalibrations = new HashMap<>(numScripts * 2);
+ Map<String, float[]> classifierWeights = new HashMap<>(numScripts * 2);
for (int s = 0; s < numScripts; s++) {
int nameLen = dis.readUnsignedShort();
String script = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8);
- // Feature 1: byte bigrams
calibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()});
- tables.put(script, readFloatTable(dis, 65536));
- // Feature 2: named-block transitions
+ // Per-script V7 F1 tables.
+ f1TablesByScript.put(script, V7Tables.readFrom(dis));
+
blockCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()});
blockTables.put(script, readFloatTable(dis, blockN * blockN));
-
- // Feature 3: control-byte fraction
controlCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()});
- // Classifier weights: num_features (1 byte) + num_features floats + 1 bias
int numFeatures = dis.readUnsignedByte();
- float[] weights = new float[numFeatures + 1]; // last = bias
+ float[] weights = new float[numFeatures + 1];
for (int j = 0; j <= numFeatures; j++) {
weights[j] = dis.readFloat();
}
classifierWeights.put(script, weights);
}
- return new JunkDetector(version, tables, calibrations,
- blockTables, blockCalibrations, blockN,
- controlCalibrations, classifierWeights, blockIndex,
+ return new JunkDetector(calibrations,
+ blockTables, blockCalibrations,
+ controlCalibrations, classifierWeights,
scriptTransitionTable, scriptTransitionCalibration,
- scriptBucketIndex, numScriptBuckets);
+ scriptBucketIndex, numScriptBuckets, f1TablesByScript);
}
}
@@ -289,44 +328,6 @@ private static float[] readFloatTable(DataInputStream dis, int size) throws IOEx
return table;
}
- /**
- * Builds the stable ordered mapping from {@link Character.UnicodeBlock} to index.
- * This must produce the same ordering as {@link TrainJunkModel#buildBlockIndex()}.
- * Used for v2/v3/v4 models only; v5+ models store block names in the file.
- */
- static Map buildBlockIndex() {
- LinkedHashMap index = new LinkedHashMap<>();
- for (int cp = 0; cp <= 0x10FFFF; cp++) {
- Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
- if (b != null) index.putIfAbsent(b, index.size());
- }
- return Collections.unmodifiableMap(index);
- }
-
- /**
- * Builds a block index from an ordered array of block names stored in a v5+ model.
- * Resolves each name via {@link Character.UnicodeBlock#forName(String)}.
- * Throws {@link IOException} if any name is not recognised by the current JVM —
- * this means the model was trained on a newer JVM; retrain on the minimum
- * supported JVM (Java 17) to produce a compatible model.
- *
- * @param blockNames ordered array of block names (index = position in block table)
- * @return unmodifiable map from UnicodeBlock to table index
- */
- static Map buildBlockIndexFromNames(String[] blockNames)
- throws IOException {
- Map index = new HashMap<>(blockNames.length * 2);
- for (int i = 0; i < blockNames.length; i++) {
- try {
- Character.UnicodeBlock b = Character.UnicodeBlock.forName(blockNames[i]);
- index.put(b, i);
- } catch (IllegalArgumentException e) {
- throw new IOException("Unicode block not known to this JVM: " + blockNames[i]
- + ". Model was trained on a newer JVM; retrain on Java 17.", e);
- }
- }
- return Collections.unmodifiableMap(index);
- }
// -----------------------------------------------------------------------
// TextQualityDetector implementation
@@ -373,7 +374,7 @@ public TextQualityComparison compare(String labelA, String candidateA,
float zA = scoreA.isUnknown() ? 0f : scoreA.getZScore();
float zB = scoreB.isUnknown() ? 0f : scoreB.getZScore();
- String winner = zA >= zB ? "A" : "B";
+ String winner = zA >= zB ? labelA : labelB;
float delta = Math.abs(zA - zB);
return new TextQualityComparison(winner, delta, scoreA, scoreB, labelA, labelB);
@@ -381,12 +382,12 @@ public TextQualityComparison compare(String labelA, String candidateA,
/** Returns the set of script names this model knows about. */
public Set<String> knownScripts() {
- return tables.keySet();
+ return calibrations.keySet();
}
- /** Returns the version of the loaded model (1, 2, or 3). */
+ /** Returns the file-format version of the loaded model. */
public int getModelVersion() {
- return modelVersion;
+ return VERSION;
}
// -----------------------------------------------------------------------
@@ -409,12 +410,16 @@ private TextQualityScore scoreText(String text) {
float[] dominantCal1 = null;
for (ScriptRun run : runs) {
- if (!tables.containsKey(run.script)) {
+ if (!calibrations.containsKey(run.script)) {
continue; // skip scripts not in model; treat as neutral, not junk
}
byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8);
- if (runUtf8.length < 2) {
- continue; // too short to score
+ // Skip if too short to form a bigram by either metric. A single
+ // CJK char is 3 UTF-8 bytes (passes the byte filter) but 1 UTF-16
+ // unit, and computeF1MeanLogP filters by text.length() < 2 and
+ // returns NaN — which would poison the weighted sum here.
+ if (runUtf8.length < 2 || run.text.length() < 2) {
+ continue;
}
float logit = scoreChunk(runUtf8, run.text, run.script, z4);
int n = runUtf8.length;
@@ -444,88 +449,226 @@ private TextQualityScore scoreText(String text) {
return new TextQualityScore(zScore, pClean, ciLow, ciHigh, dominantScript);
}
+ /**
+ * Diagnostic — exposes per-feature z-scores and classifier weights. Same
+ * chunking and aggregation as {@link #score(String)}, but returns the
+ * intermediate signals individually for analysis or for hybrid models
+ * that want to substitute one feature with an externally-computed value.
+ *
+ * Aggregation: per-chunk z1/z2/z3 and per-chunk logit are byte-count-
+ * weighted across script-homogeneous chunks. z4 is a global signal
+ * (already document-level). {@code dominantScript} and
+ * {@code classifierWeights} refer to the script run with the most bytes.
+ */
+ public FeatureComponents scoreWithFeatureComponents(String text) {
+ if (text == null || text.isEmpty()) {
+ return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN,
+ Float.NaN, Float.NaN, "UNKNOWN", null, 0);
+ }
+ List<ScriptRun> runs = buildScriptRuns(text);
+ float z4 = computeScriptTransitionZ(text);
+
+ float totalBytes = 0;
+ float weightedZ1 = 0;
+ float weightedZ2 = 0;
+ float weightedZ3 = 0;
+ float weightedLogit = 0;
+ String dominantScript = null;
+ int maxBytes = 0;
+
+ for (ScriptRun run : runs) {
+ if (!calibrations.containsKey(run.script)) {
+ continue;
+ }
+ byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8);
+ if (runUtf8.length < 2 || run.text.length() < 2) {
+ continue; // see scoreText: paired filter avoids NaN poisoning
+ }
+ float[] zs = computeChunkZs(runUtf8, run.text, run.script);
+ float chunkLogit = combineLogit(zs[0], zs[1], zs[2], z4, run.script);
+ int n = runUtf8.length;
+ weightedZ1 += zs[0] * n;
+ weightedZ2 += zs[1] * n;
+ weightedZ3 += zs[2] * n;
+ weightedLogit += chunkLogit * n;
+ totalBytes += n;
+ if (n > maxBytes) {
+ maxBytes = n;
+ dominantScript = run.script;
+ }
+ }
+
+ if (totalBytes == 0 || dominantScript == null) {
+ return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN, z4,
+ Float.NaN, runs.isEmpty() ? "UNKNOWN" : runs.get(0).script,
+ null, 0);
+ }
+
+ float[] cw = classifierWeights.get(dominantScript);
+ return new FeatureComponents(
+ weightedZ1 / totalBytes,
+ weightedZ2 / totalBytes,
+ weightedZ3 / totalBytes,
+ z4,
+ weightedLogit / totalBytes,
+ dominantScript,
+ cw,
+ (int) totalBytes);
+ }
+
+ /**
+ * Per-feature z-score breakdown returned by
+ * {@link #scoreWithFeatureComponents(String)}. All z-scores are
+ * byte-count-weighted aggregates across script-homogeneous chunks
+ * except {@code z4}, which is a single document-level value.
+ *
+ * {@code classifierWeights} is the per-script linear classifier
+ * weight vector {@code {w1, w2, w3, w4, bias}} for the dominant
+ * script — useful for hybrid models that recompute the logit after
+ * substituting one z-score with an externally-computed value.
+ */
+ public static final class FeatureComponents {
+ public final float z1;
+ public final float z2;
+ public final float z3;
+ public final float z4;
+ public final float logit;
+ public final String dominantScript;
+ public final float[] classifierWeights;
+ public final int totalBytes;
+
+ FeatureComponents(float z1, float z2, float z3, float z4,
+ float logit, String dominantScript,
+ float[] classifierWeights, int totalBytes) {
+ this.z1 = z1;
+ this.z2 = z2;
+ this.z3 = z3;
+ this.z4 = z4;
+ this.logit = logit;
+ this.dominantScript = dominantScript;
+ this.classifierWeights = classifierWeights;
+ this.totalBytes = totalBytes;
+ }
+ }
+
/**
* Scores a single script-homogeneous chunk and returns its logit.
* Positive = clean, negative = junk. Returns 0 (neutral) if the chunk
* has no model or is too short.
*/
private float scoreChunk(byte[] utf8, String text, String script, float z4) {
- float[] bigramTable = tables.get(script);
- if (bigramTable == null || utf8.length < 2) {
+ if (utf8.length < 2 || !calibrations.containsKey(script)) {
return 0f;
}
+ float[] zs = computeChunkZs(utf8, text, script);
+ return combineLogit(zs[0], zs[1], zs[2], z4, script);
+ }
- // Feature 1: byte-bigram mean log-prob
- double bigramSum = 0;
- int bigramCount = 0;
- for (int i = 0; i + 1 < utf8.length; i++) {
- bigramSum += bigramTable[((utf8[i] & 0xFF) << 8) | (utf8[i + 1] & 0xFF)];
- bigramCount++;
- }
- float meanBigramLogProb = (float) (bigramSum / bigramCount);
+ /**
+ * Computes per-feature z-scores {z1, z2, z3} for a single script-
+ * homogeneous chunk. Shared between {@link #scoreChunk} and
+ * {@link #scoreWithFeatureComponents}, and used at training time
+ * via the public {@code computeZ2/3/4...} static helpers so
+ * training and inference share the same math.
+ */
+ private float[] computeChunkZs(byte[] utf8, String text, String script) {
+ // Feature 1: per-script codepoint-bigram, calibrated per-script
+ V7Tables tables = f1TablesByScript.get(script);
+ float meanF1LogProb = computeCodepointF1MeanLogP(text, tables);
float[] cal1 = calibrations.get(script);
- float z1 = (meanBigramLogProb - cal1[0]) / cal1[1];
-
- // Feature 2: named-block transition mean log-prob
- float z2 = 0f;
- float[] blockTable = blockTables.get(script);
- if (blockTable != null) {
- int nullId = blockN - 1;
- int prev = -1;
- double blockSum = 0;
- int blockCount = 0;
- for (int i = 0; i < text.length(); ) {
- int cp = text.codePointAt(i);
- Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
- int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId;
- if (prev >= 0) {
- blockSum += blockTable[prev * blockN + blockId];
- blockCount++;
- }
- prev = blockId;
- i += Character.charCount(cp);
- }
- if (blockCount > 0) {
- float meanBlockLogProb = (float) (blockSum / blockCount);
- float[] cal2 = blockCalibrations.get(script);
- z2 = cal2 != null ? (meanBlockLogProb - cal2[0]) / cal2[1] : 0f;
+ float z1 = (meanF1LogProb - cal1[0]) / cal1[1];
+
+ float z2 = computeZ2BlockTransition(text,
+ blockTables.get(script), blockCalibrations.get(script));
+ float z3 = computeZ3ControlByte(utf8, controlCalibrations.get(script));
+ return new float[]{z1, z2, z3};
+ }
+
+ private static float computeCodepointF1MeanLogP(String text, V7Tables tables) {
+ if (tables == null) return Float.NaN;
+ double v = computeF1MeanLogP(text, tables);
+ return Double.isNaN(v) ? Float.NaN : (float) v;
+ }
+
+ /**
+ * Feature 2 — calibrated z-score for block-transition mean log-prob on
+ * one text window. Returns 0 if the window has fewer than two
+ * codepoints or if {@code blockTable} / {@code blockCal} are null.
+ *
+ *
+ * Block bucketing is via the JVM-independent
+ * {@link UnicodeBlockRanges}. Public so the trainer's classifier
+ * feature extractor calls into the exact same math used at inference
+ * time — single source of truth, no train/infer drift.
+ *
+ * @param blockTable {@code (blockN)² × float} log-prob table where
+ * {@code blockN = UnicodeBlockRanges.bucketCount()}
+ */
+ public static float computeZ2BlockTransition(String text,
+ float[] blockTable, float[] blockCal) {
+ if (blockTable == null || blockCal == null || text.length() < 2) {
+ return 0f;
+ }
+ int blockN = UnicodeBlockRanges.bucketCount();
+ int prev = -1;
+ double sum = 0;
+ int count = 0;
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ int blockId = UnicodeBlockRanges.bucketOf(cp);
+ if (prev >= 0) {
+ sum += blockTable[prev * blockN + blockId];
+ count++;
}
+ prev = blockId;
+ i += Character.charCount(cp);
+ }
+ if (count == 0) {
+ return 0f;
}
+ return ((float) (sum / count) - blockCal[0]) / blockCal[1];
+ }
- // Feature 3: control-byte fraction (stored as −fraction, so higher = cleaner)
+ /**
+ * Feature 3 — calibrated z-score for control-byte fraction on the UTF-8
+ * byte sequence of one text window. Stored score is {@code -fraction}
+ * so higher = cleaner (matching the direction convention of the other
+ * z-features).
+ *
+ *
+ * Public for train/infer math-sharing.
+ */
+ public static float computeZ3ControlByte(byte[] utf8, float[] controlCal) {
+ if (utf8.length == 0 || controlCal == null) {
+ return 0f;
+ }
long controlCount = 0;
for (byte b : utf8) {
- if (isControlByte(b & 0xFF)) controlCount++;
- }
- float controlScore = -(float) controlCount / utf8.length;
- float[] cal3 = controlCalibrations.get(script);
- float z3 = cal3 != null ? (controlScore - cal3[0]) / cal3[1] : 0f;
-
- // Per-script linear classifier: w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias
- float[] cw = classifierWeights.get(script);
- if (cw != null) {
- int nFeat = cw.length - 1; // bias is last
- float logit = cw[nFeat]; // bias
- if (nFeat >= 1) logit += cw[0] * z1;
- if (nFeat >= 2) logit += cw[1] * z2;
- if (nFeat >= 3) logit += cw[2] * z3;
- if (nFeat >= 4) logit += cw[3] * z4;
- return logit;
+ if (isControlByte(b & 0xFF)) {
+ controlCount++;
+ }
}
- return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight
+ float score = -(float) controlCount / utf8.length;
+ return (score - controlCal[0]) / controlCal[1];
}
/**
- * Computes the global script-transition z-score for the whole input string.
- * Uses raw {@link Character.UnicodeScript} values — NOT {@link #SCRIPT_MODEL_FALLBACK} —
- * so that HIRAGANA, KATAKANA, and HAN remain distinct, preserving the
- * characteristic script-mixing pattern of Japanese text.
+ * Feature 4 — calibrated z-score for global script-transition mean
+ * log-prob on one text window. Uses raw {@link Character.UnicodeScript}
+ * values (no model fallback) so HIRAGANA / KATAKANA / HAN remain
+ * distinct. Returns 0 if the window has fewer than two non-neutral
+ * codepoints or if the script-transition data isn't supplied.
*
- *
- * Returns 0 if the string has fewer than two non-neutral codepoints.
+ *
+ * Public for train/infer math-sharing. Note: inference computes
+ * z4 once per document via {@link #computeScriptTransitionZ} (which
+ * uses the instance's loaded tables); this helper takes them as
+ * arguments so training can compute z4 before the model is finalised.
*/
- private float computeScriptTransitionZ(String text) {
- if (scriptTransitionTable == null || scriptBucketIndex == null
- || scriptTransitionCalibration == null || numScriptBuckets == 0) {
+ public static float computeZ4ScriptTransition(String text,
+ float[] scriptTransTable,
+ float[] scriptTransCal,
+ Map<String, Integer> scriptBucketIndex,
+ int numScriptBuckets) {
+ if (scriptTransTable == null || scriptTransCal == null
+ || scriptBucketIndex == null || numScriptBuckets == 0) {
return 0f;
}
int otherBucket = numScriptBuckets - 1;
@@ -543,7 +686,7 @@ private float computeScriptTransitionZ(String text) {
}
int bucket = scriptBucketIndex.getOrDefault(s.name(), otherBucket);
if (prev >= 0) {
- sum += scriptTransitionTable[prev * numScriptBuckets + bucket];
+ sum += scriptTransTable[prev * numScriptBuckets + bucket];
count++;
}
prev = bucket;
@@ -551,8 +694,184 @@ private float computeScriptTransitionZ(String text) {
if (count == 0) {
return 0f;
}
- float mean = (float) (sum / count);
- return (mean - scriptTransitionCalibration[0]) / scriptTransitionCalibration[1];
+ return ((float) (sum / count) - scriptTransCal[0]) / scriptTransCal[1];
+ }
+
+ /**
+ * Combines per-feature z-scores via the per-script linear classifier.
+ * Fallback (when no classifier weights stored): equal-weight average.
+ */
+ private float combineLogit(float z1, float z2, float z3, float z4, String script) {
+ float[] cw = classifierWeights.get(script);
+ if (cw != null) {
+ int nFeat = cw.length - 1; // bias is last
+ float logit = cw[nFeat]; // bias
+ if (nFeat >= 1) logit += cw[0] * z1;
+ if (nFeat >= 2) logit += cw[1] * z2;
+ if (nFeat >= 3) logit += cw[2] * z3;
+ if (nFeat >= 4) logit += cw[3] * z4;
+ return logit;
+ }
+ return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight
+ }
+
+ // -----------------------------------------------------------------------
+ // Feature 1: per-script open-addressing codepoint-bigram lookup
+ // -----------------------------------------------------------------------
+
+ /**
+ * Mean log-prob over the codepoint pairs in {@code text} using the given
+ * script's V7 F1 tables.
+ *
+ * For each adjacent codepoint pair {@code (a, b)}:
+ *
+ * - Binary-search both codepoints in the script's codepoint index.
+ * If either is absent, the pair was never seen in training; emit
+ * {@code α * (logP(a) + logP(b))} using each codepoint's unigram
+ * value (or {@link V7Tables#unigramFallbackLogProb} if the
+ * codepoint isn't even in the unigram index).
+ * - Otherwise, look up the packed {@code (idxA<<16)|idxB} key in
+ * the open-addressing bigram table. Empty slot → unseen pair →
+ * unigram backoff (same formula). Match → dequantize the stored
+ * value.
+ *
+ *
+ * This is the single authoritative implementation of the V7 F1
+ * scoring math, shared by inference and training. Keeping one
+ * implementation eliminates the risk of train/infer drift in the F1
+ * feature.
+ *
+ * @return mean log-prob, or {@link Double#NaN} if {@code text} has fewer
+ * than two codepoints or {@code tables} is null
+ */
+ public static double computeF1MeanLogP(String text, V7Tables tables) {
+ if (text == null || text.length() < 2 || tables == null) {
+ return Double.NaN;
+ }
+ double sum = 0;
+ int n = 0;
+ int prevCp = -1;
+ int prevIdx = -1;
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ int curIdx = codepointToIndex(tables, cp);
+ if (prevCp >= 0) {
+ sum += scorePairF1V7(prevCp, prevIdx, cp, curIdx, tables);
+ n++;
+ }
+ prevCp = cp;
+ prevIdx = curIdx;
+ }
+ return n == 0 ? Double.NaN : sum / n;
+ }
+
+ /**
+ * Binary-search a codepoint in the script's index.
+ *
+ * @return the dense index (≥ 0) if found, or -1 if the codepoint
+ * doesn't appear in any kept bigram for this script
+ */
+ public static int codepointToIndex(V7Tables tables, int cp) {
+ return java.util.Arrays.binarySearch(tables.codepointIndex, cp);
+ }
+
+ /**
+ * Mixing function used to scatter packed (idxA, idxB) keys across
+ * the open-addressing table. A simple integer finalizer (splitmix32
+ * style) gives good distribution for sequential index values.
+ *
+ * <p>Public so the trainer's open-addressing insertion routine uses
+ * the same probe order as inference — drift here would silently
+ * corrupt every lookup.
+ */
+ public static int mixIndexKey(int packedKey) {
+ int x = packedKey;
+ x = (x ^ (x >>> 16)) * 0x7feb352d;
+ x = (x ^ (x >>> 15)) * 0x846ca68b;
+ x = x ^ (x >>> 16);
+ return x;
+ }
+
+ /**
+ * Packed bigram key for indices {@code (a, b)} where each index fits in
+ * {@link JunkDetectorTrainingConfig#KEY_INDEX_BITS} bits. Non-negative
+ * indices are the caller's contract; no runtime check is performed.
+ */
+ public static int packBigramKey(int idxA, int idxB) {
+ return (idxA << 16) | (idxB & 0xFFFF);
+ }
+
+ /**
+ * Looks up a (cpA, cpB) bigram in the script's V7 tables and returns
+ * its dequantized log-prob. Falls back to unigram backoff on miss.
+ *
+ * <p>{@code idxA}/{@code idxB} are the pre-computed codepoint indices
+ * (from {@link #codepointToIndex}); {@code -1} means the codepoint is
+ * not in this script's index. The caller is expected to compute them
+ * once when scanning the text (avoiding a redundant binary search per
+ * codepoint).
+ */
+ private static double scorePairF1V7(int cpA, int idxA, int cpB, int idxB,
+ V7Tables tables) {
+ if (idxA >= 0 && idxB >= 0) {
+ int slot = lookupBigramSlot(tables, idxA, idxB);
+ if (slot >= 0) {
+ return dequantize(tables.bigramValues[slot],
+ tables.bigramQuantMin, tables.bigramQuantMax);
+ }
+ }
+ // Unigram backoff for unseen pair or for codepoints absent from the
+ // per-script index. α=1.0 = plain independence; prototype-validated.
+ double ua = unigramLogProb(tables, idxA);
+ double ub = unigramLogProb(tables, idxB);
+ return tables.backoffAlpha * (ua + ub);
+ }
+
+ /**
+ * Open-addressing lookup: returns the slot index that contains the key
+ * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an
+ * empty slot first).
+ *
+ * <p>Linear probing with the same mix-hash used at training time —
+ * required for the table to be readable, not just writable.
+ */
+ static int lookupBigramSlot(V7Tables tables, int idxA, int idxB) {
+ int packedKey = packBigramKey(idxA, idxB);
+ int[] keys = tables.bigramKeys;
+ int mask = keys.length - 1;
+ int h = mixIndexKey(packedKey) & mask;
+ while (true) {
+ int k = keys[h];
+ if (k == V7Tables.EMPTY_KEY) return -1;
+ if (k == packedKey) return h;
+ h = (h + 1) & mask;
+ }
+ }
+
+ private static double unigramLogProb(V7Tables tables, int idx) {
+ if (idx < 0) {
+ return tables.unigramFallbackLogProb;
+ }
+ return dequantize(tables.unigramTable[idx],
+ tables.unigramQuantMin, tables.unigramQuantMax);
+ }
+
+ private static float dequantize(byte b, float min, float max) {
+ int u = b & 0xFF;
+ return min + (u / 255.0f) * (max - min);
+ }
+
+ /**
+ * Computes the global script-transition z-score for the whole input
+ * string against this model's loaded tables. Thin wrapper around the
+ * public static {@link #computeZ4ScriptTransition} helper — same math,
+ * just preloaded with this instance's parameters.
+ */
+ private float computeScriptTransitionZ(String text) {
+ return computeZ4ScriptTransition(text,
+ scriptTransitionTable, scriptTransitionCalibration,
+ scriptBucketIndex, numScriptBuckets);
}
/**
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index f1de37d989a..72e51e8094f 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -25,6 +25,8 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,7 +41,6 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.quality.TextQualityComparison;
import org.apache.tika.quality.TextQualityDetector;
-import org.apache.tika.quality.TextQualityScore;
/**
* A {@link MetaEncodingDetector} that arbitrates charset candidates by
@@ -76,34 +77,6 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector {
* default read limit used by the charset base detectors. */
private static final int DEFAULT_READ_LIMIT = 16384;
- // ---------------------------------------------------------------------
- // TACTICAL: declarative-override gate constants.
- //
- // These exist to compensate for known per-script calibration unevenness
- // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/
- // BENGALI floors too strict). They produce wrong tournaments when an
- // honest in-document declaration (`` / XML decl) decodes
- // to sparse non-Latin content that scores junky-but-correct, while a
- // statistical pick decodes to dense mojibake-Han that scores decent-
- // but-wrong. See `analyses/2026-04-26-tika-eval-charset-and-other.md`
- // and the indic-collapse + Korean+Hanja fixtures.
- //
- // REMOVE when the quality scorer is recalibrated per-script — the
- // tournament should then be reliable on its own.
- // ---------------------------------------------------------------------
-
- /** Maximum delta in z-score units we tolerate before honoring the
- * in-document declaration over the tournament winner. Tuned so that
- * small same-script-different-codepage deltas (windows-1252 vs
- * windows-1257 ≈ 1-2 units) don't trigger override when scripts
- * match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */
- private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f;
-
- /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared
- * decoder's output. Above this, the declared charset clearly cannot
- * decode the bytes and we should not honor the declaration. */
- private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01;
-
/** Cached quality detector. {@code null} if none is on the classpath. */
private final TextQualityDetector qualityDetector;
@@ -187,10 +160,21 @@ public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
// Decode probe under each candidate, preserving insertion order so
// tournament seeding is deterministic.
+ //
+ // Each decoded string is then run through HTML entity expansion.
+ // For entity-encoded HTML (numeric refs like &#3405;), this is
+ // load-bearing: entity refs are ASCII bytes that decode identically
+ // under every candidate charset, so they don't differentiate.
+ // After expansion they become real codepoints — and crucially, in
+ // the *wrong* decoding (e.g. mojibake-as-HAN), they introduce
+ // cross-script transitions (HAN ↔ MALAYALAM mid-document) that the
+ // quality detector's script-transition feature correctly penalises.
+ // See `20260512-junkdetector-codepoint-hash-plan.md` (AIT5 case).
Map<Charset, String> candidates = new LinkedHashMap<>();
for (Charset cs : uniqueCharsets) {
String decoded = safeDecode(forDecode, cs);
if (decoded != null && !decoded.isEmpty()) {
+ decoded = expandHtmlEntities(decoded);
candidates.put(cs, decoded);
if (LOG.isTraceEnabled()) {
int sampleLen = Math.min(400, decoded.length());
@@ -246,148 +230,17 @@ public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
champion.getKey().name(), challenger.getKey().name(),
cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", cmp.delta()),
cmp.scoreA(), cmp.scoreB());
- if ("B".equals(cmp.winner())) {
+ if (challenger.getKey().name().equals(cmp.winner())) {
champion = challenger;
}
}
LOG.trace("junk-filter -> {} (tournament champion)", champion.getKey().name());
- // TACTICAL: declarative override. See class-level comment block.
- // REMOVE when quality scorer is recalibrated per-script.
- Charset declarativeOverride = applyInDocumentDeclarativeOverride(
- context, candidates, champion.getKey());
- if (declarativeOverride != null) {
- float conf = context.getTopConfidenceFor(declarativeOverride);
- context.setArbitrationInfo("junk-filter-declarative-override");
- LOG.trace("junk-filter -> {} (declarative override of tournament winner {})",
- declarativeOverride.name(), champion.getKey().name());
- return List.of(new EncodingResult(declarativeOverride, conf));
- }
-
float confidence = context.getTopConfidenceFor(champion.getKey());
context.setArbitrationInfo("junk-filter-selected");
return List.of(new EncodingResult(champion.getKey(), confidence));
}
- /**
- * Tactical fix: honor an in-document {@code <meta charset>} or XML
- * declaration when the quality scorer's per-script calibration unevenness
- * would otherwise mis-rank candidates of different scripts.
- *
- * Returns the in-document declared charset to use, or {@code null} to
- * leave the tournament winner intact.
- *
- * Gates (all must hold to override):
- *
- * - (a) Decode is mostly clean: declared decoder produces
- * fewer than {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per char.
- * - (b) Both decoded: declared and tournament winner are
- * both in the candidate map (already guaranteed by upstream code).
- * - (c) Quality gap small: tournament winner's z-score
- * is not vastly higher than the declared's; specifically
- * {@code winner.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.
- * - (d) Different scripts: declared and winner classify
- * as different scripts. Same-script Latin-cousin lies (e.g. windows-1252
- * declared on a windows-1257 file) fall through to the tournament,
- * which correctly handles them via byte-distribution scoring.
- *
- *
- * "In-document" means {@code HtmlEncodingDetector} or any future XML-decl
- * source — explicitly NOT {@code MetadataCharsetDetector} (outer Content-Type
- * header), which is more often wrong.
- */
- private Charset applyInDocumentDeclarativeOverride(
- EncodingDetectorContext context,
- Map candidates,
- Charset champion) {
- Charset declared = findInDocumentDeclarative(context);
- if (declared == null) {
- return null;
- }
- if (declared.equals(champion)) {
- return null; // already winning
- }
- // Per HTML5 spec, <meta> cannot validly declare UTF-16 / UTF-32:
- // the meta tag itself is bytes that have to be parsed before its
- // declaration is known, and UTF-16/32 require a BOM. If the
- // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs
- // first in the chain), we treat the declaration as invalid and let
- // the tournament winner stand. This catches govdocs1-style "utf-16
- // declared on a Latin file" lies that would otherwise look like a
- // legitimate script-mismatch override.
- String declaredName = declared.name();
- if (declaredName.startsWith("UTF-16") || declaredName.startsWith("UTF-32")) {
- LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in <meta> (HTML5 invalid)");
- return null;
- }
- String championText = candidates.get(champion);
- String declaredText = candidates.get(declared);
- if (declaredText == null || championText == null) {
- return null; // failed to decode
- }
- // (a) decode mostly clean
- double fffdRate = replacementCharRate(declaredText);
- if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) {
- LOG.trace("junk-filter declarative-override skipped: U+FFFD rate {} > {}",
- fffdRate, DECLARATIVE_MAX_FFFD_RATE);
- return null;
- }
- TextQualityScore declaredScore = qualityDetector.score(declaredText);
- TextQualityScore championScore = qualityDetector.score(championText);
- // (c) winner not vastly higher
- float delta = championScore.getZScore() - declaredScore.getZScore();
- if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) {
- LOG.trace("junk-filter declarative-override skipped: delta {} > {}",
- delta, DECLARATIVE_OVERRIDE_MAX_DELTA);
- return null;
- }
- // (d) different scripts
- String declaredScript = declaredScore.getDominantScript();
- String championScript = championScore.getDominantScript();
- if (declaredScript == null || declaredScript.equals(championScript)) {
- LOG.trace("junk-filter declarative-override skipped: same script {}",
- declaredScript);
- return null;
- }
- LOG.trace("junk-filter declarative-override fires: declared={} (script={}, z={}) vs winner={} (script={}, z={}) delta={}",
- declared.name(), declaredScript, declaredScore.getZScore(),
- champion.name(), championScript, championScore.getZScore(), delta);
- return declared;
- }
-
- /**
- * Find the first in-document DECLARATIVE candidate (from
- * {@code HtmlEncodingDetector} / XML declaration), or {@code null}.
- * Outer Content-Type metadata ({@code MetadataCharsetDetector}) is
- * intentionally excluded — those headers lie too often.
- */
- private static Charset findInDocumentDeclarative(EncodingDetectorContext context) {
- for (EncodingDetectorContext.Result r : context.getResults()) {
- String name = r.getDetectorName();
- if (("HtmlEncodingDetector".equals(name)
- || "StandardHtmlEncodingDetector".equals(name))
- && r.getResultType() == EncodingResult.ResultType.DECLARATIVE) {
- return r.getCharset();
- }
- }
- return null;
- }
-
- /** Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) in the decoded String —
- * a proxy for "this charset cannot decode these bytes". */
- private static double replacementCharRate(String s) {
- if (s.isEmpty()) {
- return 0.0;
- }
- long count = 0;
- for (int i = 0; i < s.length(); i++) {
- if (s.charAt(i) == '�') {
- count++;
- }
- }
- return (double) count / s.length();
- }
-
/**
* Return the first DECLARATIVE charset whose decoded output equals at
* least one other candidate's, or {@code null}.
@@ -459,6 +312,69 @@ private static String safeDecode(byte[] bytes, Charset charset) {
}
}
+ // -----------------------------------------------------------------------
+ // HTML entity expansion
+ //
+ // Applied to every decoded candidate before quality scoring. Resolves
+ // numeric character refs (NNNN; / HHHH;) to their target codepoints
+ // and a small set of common named entities. Malformed entities pass
+ // through as literal text. Sufficient for the AIT5-class failure
+ // mode where blogspot/news pages use numeric Malayalam/Bengali entities
+ // intermixed with raw UTF-8 codepoints.
+ // -----------------------------------------------------------------------
+
+ private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});");
+ private static final Pattern ENTITY_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});");
+ private static final Pattern ENTITY_NAMED =
+ Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
+
+ /**
+ * Expands HTML numeric and a small set of named entity references in
+ * {@code s}. Malformed or out-of-range entities pass through unchanged.
+ * The named-entity set is intentionally small — only the universally-
+ * declared HTML5 entities that don't depend on a DOCTYPE. Anything more
+ * exotic stays as a literal entity reference (which scores as ASCII noise,
+ * the same as it would have before).
+ */
+ static String expandHtmlEntities(String s) {
+ s = ENTITY_DEC.matcher(s).replaceAll(mr -> {
+ try {
+ int cp = Integer.parseInt(mr.group(1));
+ if (cp >= 0 && cp <= 0x10FFFF) {
+ return Matcher.quoteReplacement(new String(Character.toChars(cp)));
+ }
+ } catch (NumberFormatException ignored) {
+ // overflow — fall through, leave entity literal
+ }
+ return Matcher.quoteReplacement(mr.group());
+ });
+ s = ENTITY_HEX.matcher(s).replaceAll(mr -> {
+ try {
+ int cp = Integer.parseInt(mr.group(1), 16);
+ if (cp >= 0 && cp <= 0x10FFFF) {
+ return Matcher.quoteReplacement(new String(Character.toChars(cp)));
+ }
+ } catch (NumberFormatException ignored) {
+ // overflow — fall through, leave entity literal
+ }
+ return Matcher.quoteReplacement(mr.group());
+ });
+ s = ENTITY_NAMED.matcher(s).replaceAll(mr -> {
+ switch (mr.group(1)) {
+ case "amp": return "&";
+ case "lt": return "<";
+ case "gt": return ">";
+ case "quot": return "\"";
+ case "apos": return "'";
+ case "nbsp": return " ";
+ case "copy": return "©";
+ case "reg": return "®";
+ default: return Matcher.quoteReplacement(mr.group());
+ }
+ });
+ return s;
+ }
+
/**
* Strip a leading byte-order mark, if any. UTF-32 signatures are
* checked before UTF-16 because the UTF-32 LE BOM ({@code FF FE 00 00})
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java
new file mode 100644
index 00000000000..ab7e1b00b7e
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java
@@ -0,0 +1,445 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+/**
+ * Static codepoint-range → bucket-index lookup table used by Feature 2
+ * (block-transition log-probability). Replaces
+ * {@link Character.UnicodeBlock#of(int)} so that the model's block
+ * semantics are fully decoupled from the JVM's Unicode-data release —
+ * training on one JDK and serving on another produces identical scores
+ * by construction.
+ *
+ * The 338 named blocks are a snapshot from JDK 25's
+ * {@link Character.UnicodeBlock} (Unicode 16.x). Codepoints in gaps
+ * between named blocks resolve to the {@link #UNASSIGNED} bucket
+ * ({@value #UNASSIGNED}). The total bucket count is
+ * {@link #bucketCount()} = 339.
+ *
+ * <p>If the block list is ever updated, bump {@link #SCHEME_VERSION} —
+ * the model file's {@code block_scheme_version} byte must match. This
+ * forces a clean retrain rather than silent re-mapping.
+ *
+ * <p>Lookup cost: O(log N) binary search. Thread-safe, immutable.
+ */
+public final class UnicodeBlockRanges {
+
+ /**
+ * Bumped whenever the static range table below changes. A model
+ * trained against scheme version X cannot be served by code at
+ * version Y ≠ X — the loader rejects the mismatch.
+ */
+ public static final int SCHEME_VERSION = 1;
+
+ /** Bucket index returned for codepoints in no named block. */
+ public static final int UNASSIGNED = 338;
+
+ /**
+ * Sorted by {@code start_cp}. Each row: {@code {start, end_inclusive, bucket_id}}.
+ * Bucket ids are 0..337 — the {@link #UNASSIGNED} bucket has id 338
+ * and is implicit (returned when binary search finds no matching range).
+ *
+ * <p>Generated from JDK 25 {@code Character.UnicodeBlock.of(cp)} for
+ * every codepoint in [0, 0x10FFFF].
+ */
+ private static final int[][] RANGES = {
+ {0x0000, 0x007F, 0}, // BASIC_LATIN
+ {0x0080, 0x00FF, 1}, // LATIN_1_SUPPLEMENT
+ {0x0100, 0x017F, 2}, // LATIN_EXTENDED_A
+ {0x0180, 0x024F, 3}, // LATIN_EXTENDED_B
+ {0x0250, 0x02AF, 4}, // IPA_EXTENSIONS
+ {0x02B0, 0x02FF, 5}, // SPACING_MODIFIER_LETTERS
+ {0x0300, 0x036F, 6}, // COMBINING_DIACRITICAL_MARKS
+ {0x0370, 0x03FF, 7}, // GREEK
+ {0x0400, 0x04FF, 8}, // CYRILLIC
+ {0x0500, 0x052F, 9}, // CYRILLIC_SUPPLEMENTARY
+ {0x0530, 0x058F, 10}, // ARMENIAN
+ {0x0590, 0x05FF, 11}, // HEBREW
+ {0x0600, 0x06FF, 12}, // ARABIC
+ {0x0700, 0x074F, 13}, // SYRIAC
+ {0x0750, 0x077F, 14}, // ARABIC_SUPPLEMENT
+ {0x0780, 0x07BF, 15}, // THAANA
+ {0x07C0, 0x07FF, 16}, // NKO
+ {0x0800, 0x083F, 17}, // SAMARITAN
+ {0x0840, 0x085F, 18}, // MANDAIC
+ {0x0860, 0x086F, 19}, // SYRIAC_SUPPLEMENT
+ {0x0870, 0x089F, 20}, // ARABIC_EXTENDED_B
+ {0x08A0, 0x08FF, 21}, // ARABIC_EXTENDED_A
+ {0x0900, 0x097F, 22}, // DEVANAGARI
+ {0x0980, 0x09FF, 23}, // BENGALI
+ {0x0A00, 0x0A7F, 24}, // GURMUKHI
+ {0x0A80, 0x0AFF, 25}, // GUJARATI
+ {0x0B00, 0x0B7F, 26}, // ORIYA
+ {0x0B80, 0x0BFF, 27}, // TAMIL
+ {0x0C00, 0x0C7F, 28}, // TELUGU
+ {0x0C80, 0x0CFF, 29}, // KANNADA
+ {0x0D00, 0x0D7F, 30}, // MALAYALAM
+ {0x0D80, 0x0DFF, 31}, // SINHALA
+ {0x0E00, 0x0E7F, 32}, // THAI
+ {0x0E80, 0x0EFF, 33}, // LAO
+ {0x0F00, 0x0FFF, 34}, // TIBETAN
+ {0x1000, 0x109F, 35}, // MYANMAR
+ {0x10A0, 0x10FF, 36}, // GEORGIAN
+ {0x1100, 0x11FF, 37}, // HANGUL_JAMO
+ {0x1200, 0x137F, 38}, // ETHIOPIC
+ {0x1380, 0x139F, 39}, // ETHIOPIC_SUPPLEMENT
+ {0x13A0, 0x13FF, 40}, // CHEROKEE
+ {0x1400, 0x167F, 41}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
+ {0x1680, 0x169F, 42}, // OGHAM
+ {0x16A0, 0x16FF, 43}, // RUNIC
+ {0x1700, 0x171F, 44}, // TAGALOG
+ {0x1720, 0x173F, 45}, // HANUNOO
+ {0x1740, 0x175F, 46}, // BUHID
+ {0x1760, 0x177F, 47}, // TAGBANWA
+ {0x1780, 0x17FF, 48}, // KHMER
+ {0x1800, 0x18AF, 49}, // MONGOLIAN
+ {0x18B0, 0x18FF, 50}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED
+ {0x1900, 0x194F, 51}, // LIMBU
+ {0x1950, 0x197F, 52}, // TAI_LE
+ {0x1980, 0x19DF, 53}, // NEW_TAI_LUE
+ {0x19E0, 0x19FF, 54}, // KHMER_SYMBOLS
+ {0x1A00, 0x1A1F, 55}, // BUGINESE
+ {0x1A20, 0x1AAF, 56}, // TAI_THAM
+ {0x1AB0, 0x1AFF, 57}, // COMBINING_DIACRITICAL_MARKS_EXTENDED
+ {0x1B00, 0x1B7F, 58}, // BALINESE
+ {0x1B80, 0x1BBF, 59}, // SUNDANESE
+ {0x1BC0, 0x1BFF, 60}, // BATAK
+ {0x1C00, 0x1C4F, 61}, // LEPCHA
+ {0x1C50, 0x1C7F, 62}, // OL_CHIKI
+ {0x1C80, 0x1C8F, 63}, // CYRILLIC_EXTENDED_C
+ {0x1C90, 0x1CBF, 64}, // GEORGIAN_EXTENDED
+ {0x1CC0, 0x1CCF, 65}, // SUNDANESE_SUPPLEMENT
+ {0x1CD0, 0x1CFF, 66}, // VEDIC_EXTENSIONS
+ {0x1D00, 0x1D7F, 67}, // PHONETIC_EXTENSIONS
+ {0x1D80, 0x1DBF, 68}, // PHONETIC_EXTENSIONS_SUPPLEMENT
+ {0x1DC0, 0x1DFF, 69}, // COMBINING_DIACRITICAL_MARKS_SUPPLEMENT
+ {0x1E00, 0x1EFF, 70}, // LATIN_EXTENDED_ADDITIONAL
+ {0x1F00, 0x1FFF, 71}, // GREEK_EXTENDED
+ {0x2000, 0x206F, 72}, // GENERAL_PUNCTUATION
+ {0x2070, 0x209F, 73}, // SUPERSCRIPTS_AND_SUBSCRIPTS
+ {0x20A0, 0x20CF, 74}, // CURRENCY_SYMBOLS
+ {0x20D0, 0x20FF, 75}, // COMBINING_MARKS_FOR_SYMBOLS
+ {0x2100, 0x214F, 76}, // LETTERLIKE_SYMBOLS
+ {0x2150, 0x218F, 77}, // NUMBER_FORMS
+ {0x2190, 0x21FF, 78}, // ARROWS
+ {0x2200, 0x22FF, 79}, // MATHEMATICAL_OPERATORS
+ {0x2300, 0x23FF, 80}, // MISCELLANEOUS_TECHNICAL
+ {0x2400, 0x243F, 81}, // CONTROL_PICTURES
+ {0x2440, 0x245F, 82}, // OPTICAL_CHARACTER_RECOGNITION
+ {0x2460, 0x24FF, 83}, // ENCLOSED_ALPHANUMERICS
+ {0x2500, 0x257F, 84}, // BOX_DRAWING
+ {0x2580, 0x259F, 85}, // BLOCK_ELEMENTS
+ {0x25A0, 0x25FF, 86}, // GEOMETRIC_SHAPES
+ {0x2600, 0x26FF, 87}, // MISCELLANEOUS_SYMBOLS
+ {0x2700, 0x27BF, 88}, // DINGBATS
+ {0x27C0, 0x27EF, 89}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
+ {0x27F0, 0x27FF, 90}, // SUPPLEMENTAL_ARROWS_A
+ {0x2800, 0x28FF, 91}, // BRAILLE_PATTERNS
+ {0x2900, 0x297F, 92}, // SUPPLEMENTAL_ARROWS_B
+ {0x2980, 0x29FF, 93}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
+ {0x2A00, 0x2AFF, 94}, // SUPPLEMENTAL_MATHEMATICAL_OPERATORS
+ {0x2B00, 0x2BFF, 95}, // MISCELLANEOUS_SYMBOLS_AND_ARROWS
+ {0x2C00, 0x2C5F, 96}, // GLAGOLITIC
+ {0x2C60, 0x2C7F, 97}, // LATIN_EXTENDED_C
+ {0x2C80, 0x2CFF, 98}, // COPTIC
+ {0x2D00, 0x2D2F, 99}, // GEORGIAN_SUPPLEMENT
+ {0x2D30, 0x2D7F, 100}, // TIFINAGH
+ {0x2D80, 0x2DDF, 101}, // ETHIOPIC_EXTENDED
+ {0x2DE0, 0x2DFF, 102}, // CYRILLIC_EXTENDED_A
+ {0x2E00, 0x2E7F, 103}, // SUPPLEMENTAL_PUNCTUATION
+ {0x2E80, 0x2EFF, 104}, // CJK_RADICALS_SUPPLEMENT
+ {0x2F00, 0x2FDF, 105}, // KANGXI_RADICALS
+ {0x2FF0, 0x2FFF, 106}, // IDEOGRAPHIC_DESCRIPTION_CHARACTERS
+ {0x3000, 0x303F, 107}, // CJK_SYMBOLS_AND_PUNCTUATION
+ {0x3040, 0x309F, 108}, // HIRAGANA
+ {0x30A0, 0x30FF, 109}, // KATAKANA
+ {0x3100, 0x312F, 110}, // BOPOMOFO
+ {0x3130, 0x318F, 111}, // HANGUL_COMPATIBILITY_JAMO
+ {0x3190, 0x319F, 112}, // KANBUN
+ {0x31A0, 0x31BF, 113}, // BOPOMOFO_EXTENDED
+ {0x31C0, 0x31EF, 114}, // CJK_STROKES
+ {0x31F0, 0x31FF, 115}, // KATAKANA_PHONETIC_EXTENSIONS
+ {0x3200, 0x32FF, 116}, // ENCLOSED_CJK_LETTERS_AND_MONTHS
+ {0x3300, 0x33FF, 117}, // CJK_COMPATIBILITY
+ {0x3400, 0x4DBF, 118}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+ {0x4DC0, 0x4DFF, 119}, // YIJING_HEXAGRAM_SYMBOLS
+ {0x4E00, 0x9FFF, 120}, // CJK_UNIFIED_IDEOGRAPHS
+ {0xA000, 0xA48F, 121}, // YI_SYLLABLES
+ {0xA490, 0xA4CF, 122}, // YI_RADICALS
+ {0xA4D0, 0xA4FF, 123}, // LISU
+ {0xA500, 0xA63F, 124}, // VAI
+ {0xA640, 0xA69F, 125}, // CYRILLIC_EXTENDED_B
+ {0xA6A0, 0xA6FF, 126}, // BAMUM
+ {0xA700, 0xA71F, 127}, // MODIFIER_TONE_LETTERS
+ {0xA720, 0xA7FF, 128}, // LATIN_EXTENDED_D
+ {0xA800, 0xA82F, 129}, // SYLOTI_NAGRI
+ {0xA830, 0xA83F, 130}, // COMMON_INDIC_NUMBER_FORMS
+ {0xA840, 0xA87F, 131}, // PHAGS_PA
+ {0xA880, 0xA8DF, 132}, // SAURASHTRA
+ {0xA8E0, 0xA8FF, 133}, // DEVANAGARI_EXTENDED
+ {0xA900, 0xA92F, 134}, // KAYAH_LI
+ {0xA930, 0xA95F, 135}, // REJANG
+ {0xA960, 0xA97F, 136}, // HANGUL_JAMO_EXTENDED_A
+ {0xA980, 0xA9DF, 137}, // JAVANESE
+ {0xA9E0, 0xA9FF, 138}, // MYANMAR_EXTENDED_B
+ {0xAA00, 0xAA5F, 139}, // CHAM
+ {0xAA60, 0xAA7F, 140}, // MYANMAR_EXTENDED_A
+ {0xAA80, 0xAADF, 141}, // TAI_VIET
+ {0xAAE0, 0xAAFF, 142}, // MEETEI_MAYEK_EXTENSIONS
+ {0xAB00, 0xAB2F, 143}, // ETHIOPIC_EXTENDED_A
+ {0xAB30, 0xAB6F, 144}, // LATIN_EXTENDED_E
+ {0xAB70, 0xABBF, 145}, // CHEROKEE_SUPPLEMENT
+ {0xABC0, 0xABFF, 146}, // MEETEI_MAYEK
+ {0xAC00, 0xD7AF, 147}, // HANGUL_SYLLABLES
+ {0xD7B0, 0xD7FF, 148}, // HANGUL_JAMO_EXTENDED_B
+ {0xD800, 0xDB7F, 149}, // HIGH_SURROGATES
+ {0xDB80, 0xDBFF, 150}, // HIGH_PRIVATE_USE_SURROGATES
+ {0xDC00, 0xDFFF, 151}, // LOW_SURROGATES
+ {0xE000, 0xF8FF, 152}, // PRIVATE_USE_AREA
+ {0xF900, 0xFAFF, 153}, // CJK_COMPATIBILITY_IDEOGRAPHS
+ {0xFB00, 0xFB4F, 154}, // ALPHABETIC_PRESENTATION_FORMS
+ {0xFB50, 0xFDFF, 155}, // ARABIC_PRESENTATION_FORMS_A
+ {0xFE00, 0xFE0F, 156}, // VARIATION_SELECTORS
+ {0xFE10, 0xFE1F, 157}, // VERTICAL_FORMS
+ {0xFE20, 0xFE2F, 158}, // COMBINING_HALF_MARKS
+ {0xFE30, 0xFE4F, 159}, // CJK_COMPATIBILITY_FORMS
+ {0xFE50, 0xFE6F, 160}, // SMALL_FORM_VARIANTS
+ {0xFE70, 0xFEFF, 161}, // ARABIC_PRESENTATION_FORMS_B
+ {0xFF00, 0xFFEF, 162}, // HALFWIDTH_AND_FULLWIDTH_FORMS
+ {0xFFF0, 0xFFFF, 163}, // SPECIALS
+ {0x10000, 0x1007F, 164}, // LINEAR_B_SYLLABARY
+ {0x10080, 0x100FF, 165}, // LINEAR_B_IDEOGRAMS
+ {0x10100, 0x1013F, 166}, // AEGEAN_NUMBERS
+ {0x10140, 0x1018F, 167}, // ANCIENT_GREEK_NUMBERS
+ {0x10190, 0x101CF, 168}, // ANCIENT_SYMBOLS
+ {0x101D0, 0x101FF, 169}, // PHAISTOS_DISC
+ {0x10280, 0x1029F, 170}, // LYCIAN
+ {0x102A0, 0x102DF, 171}, // CARIAN
+ {0x102E0, 0x102FF, 172}, // COPTIC_EPACT_NUMBERS
+ {0x10300, 0x1032F, 173}, // OLD_ITALIC
+ {0x10330, 0x1034F, 174}, // GOTHIC
+ {0x10350, 0x1037F, 175}, // OLD_PERMIC
+ {0x10380, 0x1039F, 176}, // UGARITIC
+ {0x103A0, 0x103DF, 177}, // OLD_PERSIAN
+ {0x10400, 0x1044F, 178}, // DESERET
+ {0x10450, 0x1047F, 179}, // SHAVIAN
+ {0x10480, 0x104AF, 180}, // OSMANYA
+ {0x104B0, 0x104FF, 181}, // OSAGE
+ {0x10500, 0x1052F, 182}, // ELBASAN
+ {0x10530, 0x1056F, 183}, // CAUCASIAN_ALBANIAN
+ {0x10570, 0x105BF, 184}, // VITHKUQI
+ {0x105C0, 0x105FF, 185}, // TODHRI
+ {0x10600, 0x1077F, 186}, // LINEAR_A
+ {0x10780, 0x107BF, 187}, // LATIN_EXTENDED_F
+ {0x10800, 0x1083F, 188}, // CYPRIOT_SYLLABARY
+ {0x10840, 0x1085F, 189}, // IMPERIAL_ARAMAIC
+ {0x10860, 0x1087F, 190}, // PALMYRENE
+ {0x10880, 0x108AF, 191}, // NABATAEAN
+ {0x108E0, 0x108FF, 192}, // HATRAN
+ {0x10900, 0x1091F, 193}, // PHOENICIAN
+ {0x10920, 0x1093F, 194}, // LYDIAN
+ {0x10980, 0x1099F, 195}, // MEROITIC_HIEROGLYPHS
+ {0x109A0, 0x109FF, 196}, // MEROITIC_CURSIVE
+ {0x10A00, 0x10A5F, 197}, // KHAROSHTHI
+ {0x10A60, 0x10A7F, 198}, // OLD_SOUTH_ARABIAN
+ {0x10A80, 0x10A9F, 199}, // OLD_NORTH_ARABIAN
+ {0x10AC0, 0x10AFF, 200}, // MANICHAEAN
+ {0x10B00, 0x10B3F, 201}, // AVESTAN
+ {0x10B40, 0x10B5F, 202}, // INSCRIPTIONAL_PARTHIAN
+ {0x10B60, 0x10B7F, 203}, // INSCRIPTIONAL_PAHLAVI
+ {0x10B80, 0x10BAF, 204}, // PSALTER_PAHLAVI
+ {0x10C00, 0x10C4F, 205}, // OLD_TURKIC
+ {0x10C80, 0x10CFF, 206}, // OLD_HUNGARIAN
+ {0x10D00, 0x10D3F, 207}, // HANIFI_ROHINGYA
+ {0x10D40, 0x10D8F, 208}, // GARAY
+ {0x10E60, 0x10E7F, 209}, // RUMI_NUMERAL_SYMBOLS
+ {0x10E80, 0x10EBF, 210}, // YEZIDI
+ {0x10EC0, 0x10EFF, 211}, // ARABIC_EXTENDED_C
+ {0x10F00, 0x10F2F, 212}, // OLD_SOGDIAN
+ {0x10F30, 0x10F6F, 213}, // SOGDIAN
+ {0x10F70, 0x10FAF, 214}, // OLD_UYGHUR
+ {0x10FB0, 0x10FDF, 215}, // CHORASMIAN
+ {0x10FE0, 0x10FFF, 216}, // ELYMAIC
+ {0x11000, 0x1107F, 217}, // BRAHMI
+ {0x11080, 0x110CF, 218}, // KAITHI
+ {0x110D0, 0x110FF, 219}, // SORA_SOMPENG
+ {0x11100, 0x1114F, 220}, // CHAKMA
+ {0x11150, 0x1117F, 221}, // MAHAJANI
+ {0x11180, 0x111DF, 222}, // SHARADA
+ {0x111E0, 0x111FF, 223}, // SINHALA_ARCHAIC_NUMBERS
+ {0x11200, 0x1124F, 224}, // KHOJKI
+ {0x11280, 0x112AF, 225}, // MULTANI
+ {0x112B0, 0x112FF, 226}, // KHUDAWADI
+ {0x11300, 0x1137F, 227}, // GRANTHA
+ {0x11380, 0x113FF, 228}, // TULU_TIGALARI
+ {0x11400, 0x1147F, 229}, // NEWA
+ {0x11480, 0x114DF, 230}, // TIRHUTA
+ {0x11580, 0x115FF, 231}, // SIDDHAM
+ {0x11600, 0x1165F, 232}, // MODI
+ {0x11660, 0x1167F, 233}, // MONGOLIAN_SUPPLEMENT
+ {0x11680, 0x116CF, 234}, // TAKRI
+ {0x116D0, 0x116FF, 235}, // MYANMAR_EXTENDED_C
+ {0x11700, 0x1174F, 236}, // AHOM
+ {0x11800, 0x1184F, 237}, // DOGRA
+ {0x118A0, 0x118FF, 238}, // WARANG_CITI
+ {0x11900, 0x1195F, 239}, // DIVES_AKURU
+ {0x119A0, 0x119FF, 240}, // NANDINAGARI
+ {0x11A00, 0x11A4F, 241}, // ZANABAZAR_SQUARE
+ {0x11A50, 0x11AAF, 242}, // SOYOMBO
+ {0x11AB0, 0x11ABF, 243}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A
+ {0x11AC0, 0x11AFF, 244}, // PAU_CIN_HAU
+ {0x11B00, 0x11B5F, 245}, // DEVANAGARI_EXTENDED_A
+ {0x11BC0, 0x11BFF, 246}, // SUNUWAR
+ {0x11C00, 0x11C6F, 247}, // BHAIKSUKI
+ {0x11C70, 0x11CBF, 248}, // MARCHEN
+ {0x11D00, 0x11D5F, 249}, // MASARAM_GONDI
+ {0x11D60, 0x11DAF, 250}, // GUNJALA_GONDI
+ {0x11EE0, 0x11EFF, 251}, // MAKASAR
+ {0x11F00, 0x11F5F, 252}, // KAWI
+ {0x11FB0, 0x11FBF, 253}, // LISU_SUPPLEMENT
+ {0x11FC0, 0x11FFF, 254}, // TAMIL_SUPPLEMENT
+ {0x12000, 0x123FF, 255}, // CUNEIFORM
+ {0x12400, 0x1247F, 256}, // CUNEIFORM_NUMBERS_AND_PUNCTUATION
+ {0x12480, 0x1254F, 257}, // EARLY_DYNASTIC_CUNEIFORM
+ {0x12F90, 0x12FFF, 258}, // CYPRO_MINOAN
+ {0x13000, 0x1342F, 259}, // EGYPTIAN_HIEROGLYPHS
+ {0x13430, 0x1345F, 260}, // EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS
+ {0x13460, 0x143FF, 261}, // EGYPTIAN_HIEROGLYPHS_EXTENDED_A
+ {0x14400, 0x1467F, 262}, // ANATOLIAN_HIEROGLYPHS
+ {0x16100, 0x1613F, 263}, // GURUNG_KHEMA
+ {0x16800, 0x16A3F, 264}, // BAMUM_SUPPLEMENT
+ {0x16A40, 0x16A6F, 265}, // MRO
+ {0x16A70, 0x16ACF, 266}, // TANGSA
+ {0x16AD0, 0x16AFF, 267}, // BASSA_VAH
+ {0x16B00, 0x16B8F, 268}, // PAHAWH_HMONG
+ {0x16D40, 0x16D7F, 269}, // KIRAT_RAI
+ {0x16E40, 0x16E9F, 270}, // MEDEFAIDRIN
+ {0x16F00, 0x16F9F, 271}, // MIAO
+ {0x16FE0, 0x16FFF, 272}, // IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION
+ {0x17000, 0x187FF, 273}, // TANGUT
+ {0x18800, 0x18AFF, 274}, // TANGUT_COMPONENTS
+ {0x18B00, 0x18CFF, 275}, // KHITAN_SMALL_SCRIPT
+ {0x18D00, 0x18D7F, 276}, // TANGUT_SUPPLEMENT
+ {0x1AFF0, 0x1AFFF, 277}, // KANA_EXTENDED_B
+ {0x1B000, 0x1B0FF, 278}, // KANA_SUPPLEMENT
+ {0x1B100, 0x1B12F, 279}, // KANA_EXTENDED_A
+ {0x1B130, 0x1B16F, 280}, // SMALL_KANA_EXTENSION
+ {0x1B170, 0x1B2FF, 281}, // NUSHU
+ {0x1BC00, 0x1BC9F, 282}, // DUPLOYAN
+ {0x1BCA0, 0x1BCAF, 283}, // SHORTHAND_FORMAT_CONTROLS
+ {0x1CC00, 0x1CEBF, 284}, // SYMBOLS_FOR_LEGACY_COMPUTING_SUPPLEMENT
+ {0x1CF00, 0x1CFCF, 285}, // ZNAMENNY_MUSICAL_NOTATION
+ {0x1D000, 0x1D0FF, 286}, // BYZANTINE_MUSICAL_SYMBOLS
+ {0x1D100, 0x1D1FF, 287}, // MUSICAL_SYMBOLS
+ {0x1D200, 0x1D24F, 288}, // ANCIENT_GREEK_MUSICAL_NOTATION
+ {0x1D2C0, 0x1D2DF, 289}, // KAKTOVIK_NUMERALS
+ {0x1D2E0, 0x1D2FF, 290}, // MAYAN_NUMERALS
+ {0x1D300, 0x1D35F, 291}, // TAI_XUAN_JING_SYMBOLS
+ {0x1D360, 0x1D37F, 292}, // COUNTING_ROD_NUMERALS
+ {0x1D400, 0x1D7FF, 293}, // MATHEMATICAL_ALPHANUMERIC_SYMBOLS
+ {0x1D800, 0x1DAAF, 294}, // SUTTON_SIGNWRITING
+ {0x1DF00, 0x1DFFF, 295}, // LATIN_EXTENDED_G
+ {0x1E000, 0x1E02F, 296}, // GLAGOLITIC_SUPPLEMENT
+ {0x1E030, 0x1E08F, 297}, // CYRILLIC_EXTENDED_D
+ {0x1E100, 0x1E14F, 298}, // NYIAKENG_PUACHUE_HMONG
+ {0x1E290, 0x1E2BF, 299}, // TOTO
+ {0x1E2C0, 0x1E2FF, 300}, // WANCHO
+ {0x1E4D0, 0x1E4FF, 301}, // NAG_MUNDARI
+ {0x1E5D0, 0x1E5FF, 302}, // OL_ONAL
+ {0x1E7E0, 0x1E7FF, 303}, // ETHIOPIC_EXTENDED_B
+ {0x1E800, 0x1E8DF, 304}, // MENDE_KIKAKUI
+ {0x1E900, 0x1E95F, 305}, // ADLAM
+ {0x1EC70, 0x1ECBF, 306}, // INDIC_SIYAQ_NUMBERS
+ {0x1ED00, 0x1ED4F, 307}, // OTTOMAN_SIYAQ_NUMBERS
+ {0x1EE00, 0x1EEFF, 308}, // ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS
+ {0x1F000, 0x1F02F, 309}, // MAHJONG_TILES
+ {0x1F030, 0x1F09F, 310}, // DOMINO_TILES
+ {0x1F0A0, 0x1F0FF, 311}, // PLAYING_CARDS
+ {0x1F100, 0x1F1FF, 312}, // ENCLOSED_ALPHANUMERIC_SUPPLEMENT
+ {0x1F200, 0x1F2FF, 313}, // ENCLOSED_IDEOGRAPHIC_SUPPLEMENT
+ {0x1F300, 0x1F5FF, 314}, // MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS
+ {0x1F600, 0x1F64F, 315}, // EMOTICONS
+ {0x1F650, 0x1F67F, 316}, // ORNAMENTAL_DINGBATS
+ {0x1F680, 0x1F6FF, 317}, // TRANSPORT_AND_MAP_SYMBOLS
+ {0x1F700, 0x1F77F, 318}, // ALCHEMICAL_SYMBOLS
+ {0x1F780, 0x1F7FF, 319}, // GEOMETRIC_SHAPES_EXTENDED
+ {0x1F800, 0x1F8FF, 320}, // SUPPLEMENTAL_ARROWS_C
+ {0x1F900, 0x1F9FF, 321}, // SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS
+ {0x1FA00, 0x1FA6F, 322}, // CHESS_SYMBOLS
+ {0x1FA70, 0x1FAFF, 323}, // SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A
+ {0x1FB00, 0x1FBFF, 324}, // SYMBOLS_FOR_LEGACY_COMPUTING
+ {0x20000, 0x2A6DF, 325}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
+ {0x2A700, 0x2B73F, 326}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
+ {0x2B740, 0x2B81F, 327}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D
+ {0x2B820, 0x2CEAF, 328}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E
+ {0x2CEB0, 0x2EBEF, 329}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F
+ {0x2EBF0, 0x2EE5F, 330}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I
+ {0x2F800, 0x2FA1F, 331}, // CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
+ {0x30000, 0x3134F, 332}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G
+ {0x31350, 0x323AF, 333}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H
+ {0xE0000, 0xE007F, 334}, // TAGS
+ {0xE0100, 0xE01EF, 335}, // VARIATION_SELECTORS_SUPPLEMENT
+ {0xF0000, 0xFFFFF, 336}, // SUPPLEMENTARY_PRIVATE_USE_AREA_A
+ {0x100000, 0x10FFFF, 337}, // SUPPLEMENTARY_PRIVATE_USE_AREA_B
+ };
+
+ /** Cached start_cp array for binary search. */
+ private static final int[] STARTS;
+ static {
+ STARTS = new int[RANGES.length];
+ for (int i = 0; i < RANGES.length; i++) {
+ STARTS[i] = RANGES[i][0];
+ }
+ }
+
+ private UnicodeBlockRanges() {
+ // utility class
+ }
+
+ /** Total number of buckets (named blocks + 1 unassigned). */
+ public static int bucketCount() {
+ return RANGES.length + 1;
+ }
+
+ /**
+ * Returns the bucket id for the given codepoint, or {@link #UNASSIGNED}
+ * if the codepoint falls outside every named block range.
+ *
+ *
+     * <p>Binary search over the sorted-by-{@code start_cp} range list:
+ * O(log N) where N = {@value #UNASSIGNED} (the number of named blocks).
+ */
+ public static int bucketOf(int cp) {
+ // Binary search: find largest STARTS[i] <= cp
+ int lo = 0;
+ int hi = STARTS.length - 1;
+ int found = -1;
+ while (lo <= hi) {
+ int mid = (lo + hi) >>> 1;
+ if (STARTS[mid] <= cp) {
+ found = mid;
+ lo = mid + 1;
+ } else {
+ hi = mid - 1;
+ }
+ }
+ if (found < 0) {
+ return UNASSIGNED;
+ }
+ // RANGES[found] is the candidate. Confirm cp is within end_inclusive.
+ return cp <= RANGES[found][1] ? RANGES[found][2] : UNASSIGNED;
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java
new file mode 100644
index 00000000000..93a82640caa
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+/**
+ * Carrier for one script's v7 F1 tables.
+ *
+ *
+ * <p>The v6 design used a single global codepoint-bigram hash + Bloom
+ * filter shared across all scripts. We measured that this ceiling
+ * limits accuracy: enlarging one script's training data (e.g. HAN) hurts
+ * the other scripts' z-scores because they share the global hash. v7
+ * gives each script its own pair of tables.
+ *
+ *
+ * <p>Per-script layout:
+ *
+ *
+ * - {@code codepointIndex} — sorted, ascending {@code int[]} of every
+ * codepoint that appears as either side of a kept bigram for this
+ * script. Codepoint → dense index is a binary search; index →
+ * codepoint is direct array access. Typical sizes: ~7K-15K for HAN,
+ * ~200-500 for most other scripts.
+ *
+ * - {@code bigramKeys} / {@code bigramValues} — parallel arrays
+ * implementing an open-addressed hash table with linear probing.
+ * Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code
+ * -1} means "empty slot." Indices are bounded at 16 bits (65535),
+ * which is comfortably above the largest per-script codepoint count
+ * we observe.
+ *
+ * - {@code unigramTable} — {@code byte[numCodepoints]}, quantized
+ * unigram log-probabilities indexed by the same codepoint→index map.
+ *
+ * - {@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} —
+ * per-quantization ranges; dequantize by
+ * {@code min + (b/255) * (max - min)}.
+ *
+ * - {@code unigramFallbackLogProb} — log-prob assigned when a
+ * codepoint is not in {@code codepointIndex} at all. Set to the
+ * script's most-pessimistic unigram value (its quantization min) so
+ * absent codepoints don't accidentally score above legitimately-rare
+ * ones.
+ *
+ * - {@code backoffAlpha} — multiplier on the unigram-backoff
+ * independence sum, copied from v6.
+ *
+ *
+ * Membership semantics: no Bloom filter. The empty-slot sentinel is
+ * the membership oracle — a pair is "seen" iff binary-search finds both
+ * codepoints in the index AND a probe sequence hits a matching key before
+ * an empty slot. Lookups are therefore exact; there is no false-positive
+ * backoff path as there is in v6.
+ *
+ *
+ * <p>Fields are package-private so the
+ * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can
+ * construct instances directly without going through accessors.
+ */
+public final class V7Tables {
+
+ /** Reserved value in {@link #bigramKeys} marking an unoccupied slot. */
+ public static final int EMPTY_KEY = -1;
+
+ final int[] codepointIndex;
+ final int[] bigramKeys;
+ final byte[] bigramValues;
+ final byte[] unigramTable;
+ final float bigramQuantMin;
+ final float bigramQuantMax;
+ final float unigramQuantMin;
+ final float unigramQuantMax;
+ final float unigramFallbackLogProb;
+ final float backoffAlpha;
+
+ public V7Tables(int[] codepointIndex,
+ int[] bigramKeys, byte[] bigramValues,
+ byte[] unigramTable,
+ float bigramQuantMin, float bigramQuantMax,
+ float unigramQuantMin, float unigramQuantMax,
+ float unigramFallbackLogProb,
+ float backoffAlpha) {
+ if (bigramKeys.length != bigramValues.length) {
+ throw new IllegalArgumentException(
+ "bigramKeys and bigramValues must have equal length: "
+ + bigramKeys.length + " vs " + bigramValues.length);
+ }
+ if (unigramTable.length != codepointIndex.length) {
+ throw new IllegalArgumentException(
+ "unigramTable.length must equal codepointIndex.length: "
+ + unigramTable.length + " vs " + codepointIndex.length);
+ }
+ this.codepointIndex = codepointIndex;
+ this.bigramKeys = bigramKeys;
+ this.bigramValues = bigramValues;
+ this.unigramTable = unigramTable;
+ this.bigramQuantMin = bigramQuantMin;
+ this.bigramQuantMax = bigramQuantMax;
+ this.unigramQuantMin = unigramQuantMin;
+ this.unigramQuantMax = unigramQuantMax;
+ this.unigramFallbackLogProb = unigramFallbackLogProb;
+ this.backoffAlpha = backoffAlpha;
+ }
+
+ /**
+ * Serialises this script's F1 tables. Read back via
+ * {@link #readFrom(DataInputStream)}.
+ */
+ public void writeTo(DataOutputStream dos) throws IOException {
+ dos.writeFloat(backoffAlpha);
+
+ // Codepoint index.
+ dos.writeInt(codepointIndex.length);
+ ByteBuffer cpBuf = ByteBuffer.allocate(codepointIndex.length * 4)
+ .order(ByteOrder.BIG_ENDIAN);
+ cpBuf.asIntBuffer().put(codepointIndex);
+ dos.write(cpBuf.array());
+
+ // Bigram open-addressing table (keys + values).
+ dos.writeInt(bigramKeys.length);
+ dos.writeFloat(bigramQuantMin);
+ dos.writeFloat(bigramQuantMax);
+ ByteBuffer keyBuf = ByteBuffer.allocate(bigramKeys.length * 4)
+ .order(ByteOrder.BIG_ENDIAN);
+ keyBuf.asIntBuffer().put(bigramKeys);
+ dos.write(keyBuf.array());
+ dos.write(bigramValues);
+
+ // Unigram table.
+ dos.writeFloat(unigramQuantMin);
+ dos.writeFloat(unigramQuantMax);
+ dos.writeFloat(unigramFallbackLogProb);
+ dos.write(unigramTable);
+ }
+
+ /** Inverse of {@link #writeTo(DataOutputStream)}. */
+ public static V7Tables readFrom(DataInputStream dis) throws IOException {
+ float backoffAlpha = dis.readFloat();
+
+ int cpCount = dis.readInt();
+ byte[] cpBytes = dis.readNBytes(cpCount * 4);
+ int[] codepoints = new int[cpCount];
+ ByteBuffer.wrap(cpBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(codepoints);
+
+ int slots = dis.readInt();
+ float bMin = dis.readFloat();
+ float bMax = dis.readFloat();
+ byte[] keyBytes = dis.readNBytes(slots * 4);
+ int[] keys = new int[slots];
+ ByteBuffer.wrap(keyBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(keys);
+ byte[] values = dis.readNBytes(slots);
+
+ float uMin = dis.readFloat();
+ float uMax = dis.readFloat();
+ float uFallback = dis.readFloat();
+ byte[] unigramTable = dis.readNBytes(cpCount);
+
+ return new V7Tables(codepoints, keys, values, unigramTable,
+ bMin, bMax, uMin, uMax, uFallback, backoffAlpha);
+ }
+
+ /**
+ * Returns a one-line summary for trainer progress output.
+ */
+ public String statsString() {
+ return String.format(
+ " cp_index=%d, bigram_slots=%d (load≈%.2f), "
+ + "bigram_range=[%.3f, %.3f], unigram_range=[%.3f, %.3f]",
+ codepointIndex.length, bigramKeys.length,
+ occupiedSlots() / (double) Math.max(1, bigramKeys.length),
+ bigramQuantMin, bigramQuantMax,
+ unigramQuantMin, unigramQuantMax);
+ }
+
+ private int occupiedSlots() {
+ int n = 0;
+ for (int k : bigramKeys) {
+ if (k != EMPTY_KEY) n++;
+ }
+ return n;
+ }
+
+ /** Number of codepoints in this script's index. Diagnostic. */
+ public int codepointCount() {
+ return codepointIndex.length;
+ }
+
+ /** Number of bigram-table slots (capacity). Diagnostic. */
+ public int bigramSlots() {
+ return bigramKeys.length;
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
new file mode 100644
index 00000000000..08b2aa4eb57
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Diagnostic tool: bucket every bigram in {@code han.train.gz} (or any
+ * specified file) by the {@link Character.UnicodeBlock} of each codepoint,
+ * and report the distribution.
+ *
+ *
+ * <p>Goal: determine whether HAN's 224K distinct pairs split cleanly along
+ * block boundaries — e.g. CJK Unified Ideographs vs. Hiragana vs. Katakana —
+ * which would justify routing HAN windows to language-specific sub-models in
+ * the v7 design.
+ *
+ *
+ * <p>Usage:
+ *
+ * java ... AnalyzeHanByBlock /path/to/junkdetect/han.train.gz
+ *
+ */
+public final class AnalyzeHanByBlock {
+
+ private AnalyzeHanByBlock() {}
+
+ public static void main(String[] args) throws IOException {
+ if (args.length < 1) {
+ System.err.println("Usage: AnalyzeHanByBlock <han.train.gz>");
+ System.exit(1);
+ }
+ Path file = Paths.get(args[0]);
+
+ // (blockA, blockB) -> [totalBigrams, distinctSet via HashMap]
+ // We use Maps of Maps to keep code simple; HAN is the only file
+ // big enough to matter and fits in heap.
+ Map<String, Map<Long, long[]>> byBlockPair = new HashMap<>();
+ Map<String, long[]> blockPairTotals = new HashMap<>();
+ long totalN = 0;
+
+ try (BufferedReader r = new BufferedReader(
+ new InputStreamReader(
+ new GZIPInputStream(Files.newInputStream(file)),
+ StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = r.readLine()) != null) {
+ int prevCp = -1;
+ String prevBlock = null;
+ for (int i = 0; i < line.length(); ) {
+ int cp = line.codePointAt(i);
+ i += Character.charCount(cp);
+ String block = blockShortName(cp);
+ if (prevCp >= 0) {
+ String key = prevBlock + "|" + block;
+ Map<Long, long[]> set = byBlockPair.computeIfAbsent(
+ key, k -> new HashMap<>(256));
+ long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+ long[] c = set.get(packed);
+ if (c == null) {
+ set.put(packed, new long[]{1L});
+ } else {
+ c[0]++;
+ }
+ blockPairTotals.computeIfAbsent(key, k -> new long[1])[0]++;
+ totalN++;
+ }
+ prevCp = cp;
+ prevBlock = block;
+ }
+ }
+ }
+
+ System.out.printf("File: %s%n", file);
+ System.out.printf("Total bigram occurrences: %,d%n%n", totalN);
+
+ // Sort block-pair keys by total occurrences (descending).
+ List<Map.Entry<String, long[]>> sorted = new ArrayList<>(blockPairTotals.entrySet());
+ sorted.sort(Comparator.comparingLong(
+ (Map.Entry<String, long[]> e) -> -e.getValue()[0]));
+
+ System.out.printf("%-50s %14s %14s %12s %8s%n",
+ "block_pair", "occurrences", "distinct", "singletons", "%total");
+ System.out.println(repeat('-', 105));
+
+ long distinctTotal = 0;
+ long singletonsTotal = 0;
+ for (Map.Entry<String, long[]> e : sorted) {
+ String pair = e.getKey();
+ long n = e.getValue()[0];
+ Map<Long, long[]> set = byBlockPair.get(pair);
+ int distinct = set.size();
+ int singletons = 0;
+ for (long[] c : set.values()) {
+ if (c[0] == 1) singletons++;
+ }
+ distinctTotal += distinct;
+ singletonsTotal += singletons;
+ double pct = 100.0 * n / totalN;
+ if (pct < 0.1 && n < 1000) {
+ continue; // skip tail noise rows
+ }
+ System.out.printf("%-50s %,14d %,14d %,12d %7.2f%%%n",
+ pair, n, distinct, singletons, pct);
+ }
+ System.out.println(repeat('-', 105));
+ System.out.printf("Total distinct pairs (incl. tail): %,d%n", distinctTotal);
+ System.out.printf("Total singletons (incl. tail): %,d%n", singletonsTotal);
+
+ // Roll up by individual block (left side only) to see per-block distinct counts.
+ System.out.println();
+ System.out.println("=== Per-leading-block roll-up ===");
+ Map<String, Long> distinctByLeadingBlock = new HashMap<>();
+ Map<String, Long> occByLeadingBlock = new HashMap<>();
+ for (Map.Entry<String, Map<Long, long[]>> e : byBlockPair.entrySet()) {
+ String leading = e.getKey().substring(0, e.getKey().indexOf('|'));
+ distinctByLeadingBlock.merge(leading, (long) e.getValue().size(), Long::sum);
+ long sum = 0;
+ for (long[] c : e.getValue().values()) sum += c[0];
+ occByLeadingBlock.merge(leading, sum, Long::sum);
+ }
+ List<Map.Entry<String, Long>> rollup = new ArrayList<>(occByLeadingBlock.entrySet());
+ rollup.sort(Comparator.comparingLong(
+ (Map.Entry<String, Long> e) -> -e.getValue()));
+ System.out.printf("%-35s %14s %14s%n",
+ "leading_block", "occurrences", "distinct(rough)");
+ System.out.println(repeat('-', 70));
+ for (Map.Entry<String, Long> e : rollup) {
+ System.out.printf("%-35s %,14d %,14d%n",
+ e.getKey(), e.getValue(),
+ distinctByLeadingBlock.get(e.getKey()));
+ }
+ }
+
+ /**
+ * Short-name for the Unicode block containing {@code cp}. Compresses the
+ * many CJK-related blocks into a handful of human-readable labels.
+ *
+ * Splits ASCII into ASCII_DIGIT / ASCII_LETTER / ASCII_PUNCT so we can
+ * distinguish numerals (which are content-bearing across all scripts) from
+ * English-letter contamination and punctuation.
+ */
+ private static String blockShortName(int cp) {
+ Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
+ if (b == null) return "UNK";
+
+ String name = b.toString();
+ if (name.equals("BASIC_LATIN")) {
+ if (cp >= '0' && cp <= '9') return "ASCII_DIGIT";
+ if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) return "ASCII_LETTER";
+ return "ASCII_PUNCT";
+ }
+ // Compress noisy block names for the report.
+ if (name.startsWith("CJK_UNIFIED_IDEOGRAPHS_EXTENSION")) {
+ return "CJK_EXT_" + name.substring(name.lastIndexOf('_') + 1);
+ }
+ if (name.equals("CJK_UNIFIED_IDEOGRAPHS")) return "CJK_UNIFIED";
+ if (name.equals("CJK_SYMBOLS_AND_PUNCTUATION")) return "CJK_PUNCT";
+ if (name.equals("CJK_COMPATIBILITY_IDEOGRAPHS")) return "CJK_COMPAT";
+ if (name.equals("CJK_COMPATIBILITY_FORMS")) return "CJK_COMPAT_FORMS";
+ if (name.equals("HALFWIDTH_AND_FULLWIDTH_FORMS")) return "HALF_FULL";
+ if (name.equals("HIRAGANA")) return "HIRAGANA";
+ if (name.equals("KATAKANA")) return "KATAKANA";
+ if (name.equals("KATAKANA_PHONETIC_EXTENSIONS")) return "KATAKANA_EXT";
+ if (name.equals("HANGUL_SYLLABLES")) return "HANGUL";
+ if (name.equals("HANGUL_JAMO")) return "HANGUL_JAMO";
+ if (name.equals("HANGUL_COMPATIBILITY_JAMO")) return "HANGUL_JAMO_C";
+ if (name.equals("LATIN_1_SUPPLEMENT")) return "LATIN1";
+ return name;
+ }
+
+ private static String repeat(char c, int n) {
+ char[] buf = new char[n];
+ java.util.Arrays.fill(buf, c);
+ return new String(buf);
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
new file mode 100644
index 00000000000..f64986b8dd8
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * For each {@code *.train.gz} file, classify every adjacent codepoint pair
+ * by its relation to the target script S (= file's script). Categories:
+ *
+ *
+ * - IN_S_INTERIOR — both codepoints are in S or in COMMON/INHERITED
+ *
+ * - S_BOUNDARY — exactly one codepoint is in S-or-COMMON, the other
+ * is a non-S script
+ *
+ * - FOREIGN_INTERIOR — both codepoints are in some non-S script
+ * (possibly different scripts). Under the proposed generalized
+ * boundary rule, these are the bigrams to drop from S's training.
+ *
+ * - ASCII_LETTER_RUN — special subcategory of foreign interior where
+ * both cps are ASCII A–Z/a–z; this is the English-run case.
+ *
+ *
+ * Reports occurrence counts, distinct-pair counts, and singleton counts
+ * for each category, plus the implied model-size impact of dropping
+ * FOREIGN_INTERIOR (or just ASCII_LETTER_RUN) under {@code min_count>=1}
+ * and {@code min_count>=3}.
+ */
+public final class BoundaryBigramAudit {
+
+ private BoundaryBigramAudit() {}
+
+ public static void main(String[] args) throws IOException {
+ if (args.length < 1) {
+ System.err.println("Usage: BoundaryBigramAudit <data-dir>");
+ System.exit(1);
+ }
+ Path dataDir = Paths.get(args[0]);
+ Path[] files;
+ try (Stream<Path> s = Files.list(dataDir)) {
+ files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz"))
+ .sorted().toArray(Path[]::new);
+ }
+
+ System.out.printf("%-22s %14s %14s %14s %14s %12s | %14s %14s%n",
+ "script", "in_S_occ", "boundary_occ", "foreign_occ",
+ "ascii_run_occ", "total_occ",
+ "drop_foreign_dist", "drop_asciirun_dist");
+ System.out.println(repeat('-', 165));
+
+ for (Path file : files) {
+ String fname = file.getFileName().toString();
+ String name = fname.substring(0, fname.length() - ".train.gz".length())
+ .toUpperCase();
+ Character.UnicodeScript target;
+ try {
+ target = Character.UnicodeScript.valueOf(name);
+ } catch (IllegalArgumentException e) {
+ continue;
+ }
+
+ long inS = 0, boundary = 0, foreign = 0, asciiRun = 0;
+ HashMap<Long, long[]> distinctAll = new HashMap<>(1 << 16);
+ HashMap<Long, long[]> distinctKeptUnderForeignDrop = new HashMap<>(1 << 16);
+ HashMap<Long, long[]> distinctKeptUnderAsciiDrop = new HashMap<>(1 << 16);
+
+ try (BufferedReader r = new BufferedReader(
+ new InputStreamReader(
+ new GZIPInputStream(Files.newInputStream(file)),
+ StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = r.readLine()) != null) {
+ int prevCp = -1;
+ for (int i = 0; i < line.length(); ) {
+ int cp = line.codePointAt(i);
+ i += Character.charCount(cp);
+ if (prevCp >= 0) {
+ boolean aInS = inScriptOrCommon(prevCp, target);
+ boolean bInS = inScriptOrCommon(cp, target);
+ boolean aLetter = isLatinLetter(prevCp);
+ boolean bLetter = isLatinLetter(cp);
+
+ long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+ increment(distinctAll, packed);
+
+ if (aInS && bInS) {
+ inS++;
+ increment(distinctKeptUnderForeignDrop, packed);
+ increment(distinctKeptUnderAsciiDrop, packed);
+ } else if (aInS != bInS) {
+ boundary++;
+ increment(distinctKeptUnderForeignDrop, packed);
+ increment(distinctKeptUnderAsciiDrop, packed);
+ } else {
+ // both foreign (neither in S nor COMMON)
+ foreign++;
+ if (aLetter && bLetter) {
+ asciiRun++;
+ } else {
+ // foreign interior but not pure ASCII letters:
+ // we'd keep this under the "ASCII-letter only" rule.
+ increment(distinctKeptUnderAsciiDrop, packed);
+ }
+ }
+ }
+ prevCp = cp;
+ }
+ }
+ }
+
+ long total = inS + boundary + foreign;
+ int distAll = distinctAll.size();
+ int distForeignDrop = distinctKeptUnderForeignDrop.size();
+ int distAsciiDrop = distinctKeptUnderAsciiDrop.size();
+
+ System.out.printf("%-22s %,14d %,14d %,14d %,14d %,12d | %,14d %,14d%n",
+ name.toLowerCase(), inS, boundary, foreign, asciiRun, total,
+ distAll - distForeignDrop, distAll - distAsciiDrop);
+ }
+ }
+
+ private static boolean inScriptOrCommon(int cp, Character.UnicodeScript target) {
+ Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ return s == target
+ || s == Character.UnicodeScript.COMMON
+ || s == Character.UnicodeScript.INHERITED;
+ }
+
+ private static boolean isLatinLetter(int cp) {
+ return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')
+ || (cp >= 0xFF21 && cp <= 0xFF3A) // fullwidth A-Z
+ || (cp >= 0xFF41 && cp <= 0xFF5A); // fullwidth a-z
+ }
+
+ private static void increment(HashMap<Long, long[]> map, long key) {
+ long[] c = map.get(key);
+ if (c == null) {
+ map.put(key, new long[]{1L});
+ } else {
+ c[0]++;
+ }
+ }
+
+ private static String repeat(char c, int n) {
+ char[] b = new char[n];
+ java.util.Arrays.fill(b, c);
+ return new String(b);
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index 27a5436d5e4..a80fafbd6b4 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -82,45 +82,18 @@
public class BuildJunkTrainingData {
// -----------------------------------------------------------------------
- // Defaults
+ // Split ratios — fixed, part of the model identity (changing them would
+ // invalidate downstream eval comparisons).
// -----------------------------------------------------------------------
- /** Lines read per language to determine dominant script. */
- private static final int DEFAULT_SCRIPT_SAMPLE_LINES = 2_000;
-
- /**
- * UTF-8 bytes loaded per script group for entropy estimation.
- * Budget is spread evenly across languages in the group.
- * 200KB is enough to observe the bigram distribution reliably.
- */
- private static final long ENTROPY_SAMPLE_BYTES = 200_000L;
-
- /**
- * Total UTF-8 byte budget across all script groups. Divided proportionally
- * by bigram entropy after the sampling phase. 50MB gives ~1–3MB per script
- * on average across 34 groups; scale up for production runs.
- */
- private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L;
-
- /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */
- private static final int DEFAULT_MIN_BYTES = 50;
-
- /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */
- private static final double DEFAULT_MAX_PUNC_FRAC = 0.30;
-
/** Fraction of sentences written to each split (train / dev / test = 80/10/10). */
private static final double TRAIN_FRAC = 0.80;
private static final double DEV_FRAC = 0.10;
// remaining (1 - TRAIN_FRAC - DEV_FRAC) goes to the test split
- /**
- * Minimum number of sentences that must land in the dev split for a script to be
- * included in the model. Scripts below this floor have too few samples to reliably
- * estimate calibration statistics (mu/sigma), which produces noisy z-scores and
- * inflated false positive rates. With DEV_FRAC=0.10 the effective minimum total
- * sentence count is minDevSentences / DEV_FRAC (default: 5,000 total sentences).
- */
- private static final int DEFAULT_MIN_DEV_SENTENCES = 500;
+ // All other durable parameters live in JunkDetectorTrainingConfig. This
+ // tool deliberately does not accept CLI overrides for those values; see
+ // the rejection logic in main() below.
// -----------------------------------------------------------------------
// Entry point
@@ -129,13 +102,22 @@ public class BuildJunkTrainingData {
public static void main(String[] args) throws IOException {
Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "data");
Path outputDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect");
- int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES;
- long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES;
- int minBytes = DEFAULT_MIN_BYTES;
- double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC;
- int seed = 42;
boolean dryRun = false;
- int minDevSentences = DEFAULT_MIN_DEV_SENTENCES;
+
+ // Bind config-controlled values into local variables. These are
+ // read-only from this point on; any attempt to override them via CLI
+ // is rejected below.
+ long totalBudgetBytes = JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES;
+ long perLanguageCapBytes = JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES;
+ int minBytes = JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE;
+ double maxPuncFrac = JunkDetectorTrainingConfig.MAX_PUNC_FRAC;
+ double minTargetScriptFrac = JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC;
+ int minDevSentences = JunkDetectorTrainingConfig.MIN_DEV_SENTENCES;
+ int scriptSampleLines = JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES;
+ int seed = JunkDetectorTrainingConfig.SEED;
+ java.util.Set<String> dropScripts = JunkDetectorTrainingConfig.DROP_SCRIPTS;
+ Map<String, Long> scriptBudgetOverrides =
+ JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES;
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
@@ -145,26 +127,25 @@ public static void main(String[] args) throws IOException {
case "--output-dir":
outputDir = Paths.get(args[++i]);
break;
- case "--script-sample-lines":
- scriptSampleLines = Integer.parseInt(args[++i]);
+ case "--dry-run":
+ dryRun = true;
break;
+ // Durable parameters are config-controlled. Refuse any CLI
+ // override so that a model file's identity always matches the
+ // committed config.
+ case "--script-sample-lines":
case "--total-budget-bytes":
- totalBudgetBytes = Long.parseLong(args[++i]);
- break;
+ case "--per-language-cap-bytes":
case "--min-bytes":
- minBytes = Integer.parseInt(args[++i]);
- break;
case "--max-punc-frac":
- maxPuncFrac = Double.parseDouble(args[++i]);
- break;
+ case "--min-target-script-frac":
case "--seed":
- seed = Integer.parseInt(args[++i]);
- break;
case "--min-dev-sentences":
- minDevSentences = Integer.parseInt(args[++i]);
- break;
- case "--dry-run":
- dryRun = true;
+ case "--drop-scripts":
+ case "--script-budget-override":
+ System.err.println("ERROR: " + args[i] + " is no longer a CLI option."
+ + " Edit JunkDetectorTrainingConfig and commit the change instead.");
+ System.exit(1);
break;
default:
System.err.println("Unknown argument: " + args[i]);
@@ -174,15 +155,26 @@ public static void main(String[] args) throws IOException {
}
System.out.println("=== BuildJunkTrainingData ===");
- System.out.println(" data-dir: " + dataDir);
- System.out.println(" output-dir: " + outputDir);
- System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n",
+ System.out.println(" data-dir: " + dataDir);
+ System.out.println(" output-dir: " + outputDir);
+ System.out.println(" --- config (JunkDetectorTrainingConfig) ---");
+ System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n",
totalBudgetBytes, totalBudgetBytes / 1_000_000.0);
- System.out.printf( " min-bytes: %d%n", minBytes);
- System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac);
- System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n",
+ System.out.printf( " per-language-cap: %,d (%.1f MB)%n",
+ perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0);
+ System.out.printf( " min-bytes: %d%n", minBytes);
+ System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac);
+ System.out.printf( " min-target-script-frac: %.2f%n", minTargetScriptFrac);
+ System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n",
minDevSentences, (int)(minDevSentences / DEV_FRAC));
- System.out.println(" dry-run: " + dryRun);
+ System.out.printf( " seed: %d%n", seed);
+ if (!dropScripts.isEmpty()) {
+ System.out.println(" drop-scripts: " + dropScripts);
+ }
+ if (!scriptBudgetOverrides.isEmpty()) {
+ System.out.println(" script-budget-override: " + scriptBudgetOverrides);
+ }
+ System.out.println(" dry-run: " + dryRun);
if (!Files.isDirectory(dataDir)) {
System.err.println("ERROR: data-dir not found: " + dataDir);
@@ -208,6 +200,15 @@ public static void main(String[] args) throws IOException {
System.out.printf(" %-12s → %s%n", lang, script);
}
}
+
+ if (!dropScripts.isEmpty()) {
+ for (String s : dropScripts) {
+ if (scriptGroups.remove(s) != null) {
+ System.out.printf(" DROP script: %s%n", s);
+ }
+ }
+ }
+
System.out.printf("%n → %d languages, %d script groups%n",
langToScript.size(), scriptGroups.size());
@@ -222,7 +223,8 @@ public static void main(String[] args) throws IOException {
String script = entry.getKey();
List<Path> langDirs = entry.getValue();
- long perLangSampleBytes = Math.max(ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L);
+ long perLangSampleBytes = Math.max(
+ JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L);
List<String> sample = new ArrayList<>();
for (Path langDir : langDirs) {
loadSentences(langDir, perLangSampleBytes, minBytes, maxPuncFrac, sample);
@@ -246,9 +248,25 @@ public static void main(String[] args) throws IOException {
Map<String, Long> scriptBudget = new TreeMap<>();
for (Map.Entry<String, Double> e : scriptEntropy.entrySet()) {
long budget = (long) (totalBudgetBytes * e.getValue() / totalEntropy);
+ Long override = scriptBudgetOverrides.get(e.getKey());
+ if (override != null) {
+ System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)"
+ + " [OVERRIDE: was %,d (%.1f MB)]%n",
+ e.getKey(), e.getValue(), override, override / 1_000_000.0,
+ budget, budget / 1_000_000.0);
+ budget = override;
+ } else {
+ System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n",
+ e.getKey(), e.getValue(), budget, budget / 1_000_000.0);
+ }
scriptBudget.put(e.getKey(), budget);
- System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n",
- e.getKey(), e.getValue(), budget, budget / 1_000_000.0);
+ }
+ // Warn about overrides for scripts that aren't in the bucket set.
+ for (String k : scriptBudgetOverrides.keySet()) {
+ if (!scriptBudget.containsKey(k)) {
+ System.err.printf("WARNING: --script-budget-override for %s ignored"
+ + " (script not in bucket set)%n", k);
+ }
}
if (dryRun) {
@@ -273,8 +291,16 @@ public static void main(String[] args) throws IOException {
String script = budgetEntry.getKey();
long budget = budgetEntry.getValue();
List<Path> langDirs = scriptGroups.get(script);
+ Character.UnicodeScript targetScript = parseUnicodeScript(script);
long perLangBytes = Math.max(budget / langDirs.size(), 1L);
+ // Apply per-language cap on top of the even split, but only for
+ // multi-language buckets. For single-language scripts (e.g. KHMER,
+ // HANGUL), the cap would needlessly limit a bucket that has only
+ // one source; let it consume its full budget instead.
+ long capPerLang = langDirs.size() > 1
+ ? Math.min(perLangBytes, perLanguageCapBytes)
+ : perLangBytes;
List<String> sentences = new ArrayList<>();
long totalBytesLoaded = 0;
@@ -282,8 +308,10 @@ public static void main(String[] args) throws IOException {
long remaining = budget - totalBytesLoaded;
if (remaining <= 0) break;
long langBytes = loadSentences(langDir,
- Math.min(perLangBytes, remaining),
- minBytes, maxPuncFrac, sentences);
+ Math.min(capPerLang, remaining),
+ minBytes, maxPuncFrac,
+ targetScript, minTargetScriptFrac,
+ sentences);
totalBytesLoaded += langBytes;
if (langBytes > 0) {
System.out.printf(" %-12s %-20s +%,d bytes%n",
@@ -327,7 +355,11 @@ public static void main(String[] args) throws IOException {
long newBudget = budget + extra;
List<Path> langDirs = scriptGroups.get(script);
+ Character.UnicodeScript targetScript = parseUnicodeScript(script);
long perLangBytes = Math.max(newBudget / langDirs.size(), 1L);
+ long capPerLang = langDirs.size() > 1
+ ? Math.min(perLangBytes, perLanguageCapBytes)
+ : perLangBytes;
List<String> sentences = new ArrayList<>();
long totalBytesLoaded = 0;
@@ -335,8 +367,10 @@ public static void main(String[] args) throws IOException {
long remaining = newBudget - totalBytesLoaded;
if (remaining <= 0) break;
long langBytes = loadSentences(langDir,
- Math.min(perLangBytes, remaining),
- minBytes, maxPuncFrac, sentences);
+ Math.min(capPerLang, remaining),
+ minBytes, maxPuncFrac,
+ targetScript, minTargetScriptFrac,
+ sentences);
totalBytesLoaded += langBytes;
}
if (!sentences.isEmpty()) {
@@ -415,6 +449,21 @@ public static void main(String[] args) throws IOException {
System.out.println("Done.");
}
+ /**
+ * Parses a script-bucket name (e.g. {@code "HAN"}) into a
+ * {@link Character.UnicodeScript}, or returns {@code null} if the name
+ * does not correspond to a real script (e.g. {@code "COMMON"} or any
+ * future synthetic bucket). Used by the corpus builder to look up the
+ * target script for the {@code min-target-script-frac} filter.
+ */
+ static Character.UnicodeScript parseUnicodeScript(String name) {
+ try {
+ return Character.UnicodeScript.valueOf(name);
+ } catch (IllegalArgumentException e) {
+ return null;
+ }
+ }
+
// -----------------------------------------------------------------------
// Script detection
// -----------------------------------------------------------------------
@@ -531,6 +580,22 @@ static double computeBigramEntropy(List sentences) {
*/
static long loadSentences(Path langDir, long maxBytes, int minBytes,
double maxPuncFrac, List<String> result) {
+ // Backwards-compatible overload: no target-script filter.
+ return loadSentences(langDir, maxBytes, minBytes, maxPuncFrac,
+ null, 0.0, result);
+ }
+
+ /**
+ * Same as the 5-arg overload, but additionally drops sentences whose
+ * fraction of {@code targetScript} codepoints (relative to all non-
+ * COMMON/INHERITED codepoints) is below {@code minTargetScriptFrac}.
+ * Passing {@code targetScript == null} disables the target-script filter.
+ */
+ static long loadSentences(Path langDir, long maxBytes, int minBytes,
+ double maxPuncFrac,
+ Character.UnicodeScript targetScript,
+ double minTargetScriptFrac,
+ List<String> result) {
long bytesLoaded = 0;
for (String filename : new String[]{"sentences_wikipedia.txt", "sentences_madlad.txt"}) {
if (bytesLoaded >= maxBytes) {
@@ -553,7 +618,8 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes,
if (text.isEmpty()) {
continue;
}
- String filtered = filterSentence(text, minBytes, maxPuncFrac);
+ String filtered = filterSentence(text, minBytes, maxPuncFrac,
+ targetScript, minTargetScriptFrac);
if (filtered != null) {
int sentBytes = filtered.getBytes(StandardCharsets.UTF_8).length;
result.add(filtered);
@@ -577,6 +643,18 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes,
* @return the normalised sentence, or {@code null} if it should be discarded
*/
static String filterSentence(String text, int minBytes, double maxPuncFrac) {
+ return filterSentence(text, minBytes, maxPuncFrac, null, 0.0);
+ }
+
+ /**
+ * Same as the 3-arg overload, but additionally rejects sentences whose
+ * fraction of {@code targetScript} codepoints (over non-COMMON/INHERITED
+ * codepoints) is below {@code minTargetScriptFrac}. If {@code
+ * targetScript == null} the target-script filter is skipped.
+ */
+ static String filterSentence(String text, int minBytes, double maxPuncFrac,
+ Character.UnicodeScript targetScript,
+ double minTargetScriptFrac) {
if (text.indexOf('\uFFFD') >= 0) {
return null;
}
@@ -586,17 +664,34 @@ static String filterSentence(String text, int minBytes, double maxPuncFrac) {
}
int cpCount = 0;
int puncCount = 0;
+ int scriptCpTotal = 0;
+ int scriptCpMatching = 0;
for (int i = 0; i < text.length(); ) {
int cp = text.codePointAt(i);
cpCount++;
if (cp >= 0x21 && cp <= 0x7E && !Character.isLetter(cp)) {
puncCount++;
}
+ if (targetScript != null) {
+ Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ if (s != Character.UnicodeScript.COMMON
+ && s != Character.UnicodeScript.INHERITED
+ && s != Character.UnicodeScript.UNKNOWN) {
+ scriptCpTotal++;
+ if (s == targetScript) {
+ scriptCpMatching++;
+ }
+ }
+ }
i += Character.charCount(cp);
}
if (cpCount > 0 && (double) puncCount / cpCount > maxPuncFrac) {
return null;
}
+ if (targetScript != null && scriptCpTotal > 0
+ && (double) scriptCpMatching / scriptCpTotal < minTargetScriptFrac) {
+ return null;
+ }
return text;
}
@@ -624,23 +719,15 @@ private static void writeGzipped(Path path, List lines) throws IOExcepti
private static void printUsage() {
System.err.println("Usage: BuildJunkTrainingData [options]");
- System.err.println(" --data-dir MADLAD data root"
+ System.err.println(" --data-dir MADLAD data root"
+ " (default: ~/datasets/madlad/data)");
- System.err.println(" --output-dir Output directory"
+ System.err.println(" --output-dir Output directory"
+ " (default: ~/datasets/madlad/junkdetect)");
- System.err.println(" --script-sample-lines N Lines per language for script"
- + " detection (default: 2000)");
- System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all"
- + " scripts (default: 50000000)");
- System.err.println(" --min-bytes N Min UTF-8 bytes per sentence"
- + " (default: 50)");
- System.err.println(" --max-punc-frac F Max ASCII punct fraction"
- + " (default: 0.30)");
- System.err.println(" --min-dev-sentences N Min sentences in dev split for a"
- + " script to be included (default: 500). Scripts below this floor"
- + " have unreliable calibration and inflated FPR.");
- System.err.println(" --seed N Random seed (default: 42)");
- System.err.println(" --dry-run Detect scripts + show budget,"
- + " skip file writing");
+ System.err.println(" --dry-run Detect scripts + show budget,"
+ + " skip file writing.");
+ System.err.println();
+ System.err.println("All other training/build parameters (budgets, filters, dropped"
+ + " scripts, seed, etc.) are fixed in JunkDetectorTrainingConfig and tracked"
+ + " in git. Edit that file and commit to change them.");
}
}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
new file mode 100644
index 00000000000..b287012ddc0
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Diagnostic tool for sizing a per-script F1 bigram store (v7 design).
+ *
+ * Walks every {@code *.train.gz} in {@code dataDir}, treating each file as
+ * one script's corpus. Counts (cpA, cpB) codepoint-pair frequencies and
+ * reports, per script:
+ *
+ *
+ * - total bigram occurrences (N)
+ *
+ * - distinct pair count (U)
+ *
+ * - singletons — pairs seen exactly once (these are usually the
+ * worst candidates to keep; they often reflect OCR noise / rare
+ * proper nouns and inflate U without helping discrimination)
+ *
+ * - "effective" pair count = pairs seen at least {@code MIN_COUNT} times
+ *
+ * - coverage curve: how many of the top-N most-frequent pairs are needed
+ * to cover {x = 50, 75, 90, 95, 99, 99.9}% of all bigram occurrences
+ *
+ * - estimated v7 model size for several candidate cutoffs, assuming
+ * 2.25 bytes/pair (MPHF + 8-bit fingerprint + 8-bit value)
+ * and 1.3 bytes/pair (MPHF + 8-bit value, no fingerprint)
+ *
+ *
+ * Usage:
+ *
+ * mvn -pl tika-ml/tika-ml-junkdetect exec:java \
+ * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.CountPerScriptBigrams \
+ * -Dexec.args="/path/to/junkdetect"
+ *
+ *
+ * No model output; this is read-only telemetry to inform the v7 sizing
+ * decision (see {@code 20260514-junk-retrain-v6.md}).
+ */
+public final class CountPerScriptBigrams {
+
+ private static final int[] COVERAGE_PCT = {50, 75, 90, 95, 99};
+ private static final double[] COVERAGE_FRAC_HI = {0.999};
+
+ /** Cutoffs reported in the size-estimate table. */
+ private static final int[] MIN_COUNT_CUTOFFS = {1, 2, 3, 5, 10};
+
+ /** Bytes per retained pair for each candidate storage scheme. */
+ private static final double[] BYTES_PER_PAIR_SCHEMES = {1.3, 2.25, 6.25};
+ private static final String[] SCHEME_NAMES = {
+ "MPHF+val(1.3B)", "MPHF+fp+val(2.25B)", "open-addr+key(6.25B)"};
+
+ private CountPerScriptBigrams() {}
+
+ public static void main(String[] args) throws IOException {
+ if (args.length < 1) {
+ System.err.println(
+ "Usage: CountPerScriptBigrams <data-dir> [topK-per-script]");
+ System.exit(1);
+ }
+ Path dataDir = Paths.get(args[0]);
+ int topK = args.length >= 2 ? Integer.parseInt(args[1]) : 0;
+
+ List<Path> trainFiles = new ArrayList<>();
+ try (Stream<Path> s = Files.list(dataDir)) {
+ s.filter(p -> p.getFileName().toString().endsWith(".train.gz"))
+ .sorted()
+ .forEach(trainFiles::add);
+ }
+ if (trainFiles.isEmpty()) {
+ System.err.println("ERROR: no *.train.gz files in " + dataDir);
+ System.exit(1);
+ }
+
+ System.out.printf("Found %d *.train.gz files in %s%n%n",
+ trainFiles.size(), dataDir);
+ System.out.printf(
+ "%-22s %12s %12s %12s %12s | %s%n",
+ "script", "total_N", "distinct_U", "singletons",
+ "U(>=10)", "coverage: pairs needed for [50,75,90,95,99,99.9]%");
+ System.out.println(repeat('-', 140));
+
+ long grandTotalN = 0;
+ long grandTotalU = 0;
+ long grandTotalUge2 = 0;
+ long grandTotalUge10 = 0;
+
+ // Per-script size accumulators for the global-size summary at the end.
+ Map<String, long[]> perScriptStats = new HashMap<>();
+
+ for (Path trainFile : trainFiles) {
+ String fname = trainFile.getFileName().toString();
+ String script = fname.substring(0, fname.length() - ".train.gz".length())
+ .toUpperCase();
+
+ HashMap<Long, long[]> pairCounts = new HashMap<>(1 << 16);
+ long totalN = 0;
+ try (BufferedReader r = new BufferedReader(
+ new InputStreamReader(
+ new GZIPInputStream(Files.newInputStream(trainFile)),
+ StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = r.readLine()) != null) {
+ int prevCp = -1;
+ for (int i = 0; i < line.length(); ) {
+ int cp = line.codePointAt(i);
+ i += Character.charCount(cp);
+ if (prevCp >= 0) {
+ long key = packPair(prevCp, cp);
+ long[] c = pairCounts.get(key);
+ if (c == null) {
+ pairCounts.put(key, new long[]{1L});
+ } else {
+ c[0]++;
+ }
+ totalN++;
+ }
+ prevCp = cp;
+ }
+ }
+ }
+
+ int distinctU = pairCounts.size();
+
+ long[] counts = new long[distinctU];
+ int idx = 0;
+ for (long[] c : pairCounts.values()) {
+ counts[idx++] = c[0];
+ }
+ // Sort descending for coverage curve.
+ java.util.Arrays.sort(counts);
+ // Reverse in place.
+ for (int i = 0, j = counts.length - 1; i < j; i++, j--) {
+ long t = counts[i];
+ counts[i] = counts[j];
+ counts[j] = t;
+ }
+
+ int singletons = 0;
+ int uGe2 = 0;
+ int uGe10 = 0;
+ for (long c : counts) {
+ if (c == 1) singletons++;
+ if (c >= 2) uGe2++;
+ if (c >= 10) uGe10++;
+ }
+
+ // Coverage thresholds: minimum k such that sum(counts[0..k-1]) / N >= t.
+ int[] coveragePairs = new int[COVERAGE_PCT.length + COVERAGE_FRAC_HI.length];
+ double[] thresholds = new double[coveragePairs.length];
+ for (int i = 0; i < COVERAGE_PCT.length; i++) {
+ thresholds[i] = COVERAGE_PCT[i] / 100.0;
+ }
+ for (int i = 0; i < COVERAGE_FRAC_HI.length; i++) {
+ thresholds[COVERAGE_PCT.length + i] = COVERAGE_FRAC_HI[i];
+ }
+ long running = 0;
+ int tIdx = 0;
+ for (int k = 0; k < counts.length && tIdx < thresholds.length; k++) {
+ running += counts[k];
+ while (tIdx < thresholds.length
+ && (double) running / totalN >= thresholds[tIdx]) {
+ coveragePairs[tIdx++] = k + 1;
+ }
+ }
+ // Fill any unreached thresholds with U (means: never reached, took all).
+ for (; tIdx < thresholds.length; tIdx++) {
+ coveragePairs[tIdx] = distinctU;
+ }
+
+ StringBuilder cov = new StringBuilder();
+ for (int i = 0; i < coveragePairs.length; i++) {
+ if (i > 0) cov.append(", ");
+ cov.append(String.format("%,d", coveragePairs[i]));
+ }
+
+ System.out.printf("%-22s %,12d %,12d %,12d %,12d | %s%n",
+ script.toLowerCase(),
+ totalN, distinctU, singletons, uGe10,
+ cov.toString());
+
+ // Per-script size table.
+ if (topK > 0 || true) {
+ long[] sizeStats = new long[
+ 2 + MIN_COUNT_CUTOFFS.length + BYTES_PER_PAIR_SCHEMES.length];
+ sizeStats[0] = totalN;
+ sizeStats[1] = distinctU;
+ for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+ int minC = MIN_COUNT_CUTOFFS[i];
+ int kept = 0;
+ for (long c : counts) {
+ if (c >= minC) kept++;
+ else break;
+ }
+ sizeStats[2 + i] = kept;
+ }
+ perScriptStats.put(script.toLowerCase(), sizeStats);
+ }
+
+ // Per-script top-K dump if requested.
+ if (topK > 0) {
+ System.out.printf(" top %d pairs in %s:%n", topK, script.toLowerCase());
+ List<Map.Entry<Long, long[]>> sorted = new ArrayList<>(pairCounts.entrySet());
+ sorted.sort((a, b) -> Long.compare(b.getValue()[0], a.getValue()[0]));
+ for (int i = 0; i < Math.min(topK, sorted.size()); i++) {
+ Map.Entry<Long, long[]> e = sorted.get(i);
+ long k = e.getKey();
+ int cpA = (int) (k >>> 24);
+ int cpB = (int) (k & 0xFFFFFFL);
+ System.out.printf(" U+%04X U+%04X (%c %c) %,d%n",
+ cpA, cpB,
+ safePrint(cpA), safePrint(cpB),
+ e.getValue()[0]);
+ }
+ }
+
+ grandTotalN += totalN;
+ grandTotalU += distinctU;
+ grandTotalUge2 += uGe2;
+ grandTotalUge10 += uGe10;
+ }
+
+ System.out.println(repeat('-', 140));
+ System.out.printf("%-22s %,12d %,12d %12s %,12d%n%n",
+ "TOTAL", grandTotalN, grandTotalU,
+ "-", grandTotalUge10);
+
+ // ------------------------------------------------------------------
+ // Cutoff vs. model-size summary
+ // ------------------------------------------------------------------
+ System.out.println("=== Model-size estimates by min-count cutoff and storage scheme ===");
+ System.out.println("(sum of retained pairs across all scripts × bytes-per-pair)");
+ System.out.println();
+ System.out.printf("%-12s", "cutoff");
+ for (String name : SCHEME_NAMES) {
+ System.out.printf(" %20s", name);
+ }
+ System.out.printf(" %20s%n", "retained_pairs");
+ System.out.println(repeat('-', 12 + (SCHEME_NAMES.length + 1) * 21));
+
+ for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+ long retained = 0;
+ for (long[] stats : perScriptStats.values()) {
+ retained += stats[2 + i];
+ }
+ System.out.printf("min_count>=%-2d", MIN_COUNT_CUTOFFS[i]);
+ for (double bpp : BYTES_PER_PAIR_SCHEMES) {
+ double bytes = retained * bpp;
+ System.out.printf(" %18s ", humanBytes(bytes));
+ }
+ System.out.printf(" %,20d%n", retained);
+ }
+
+ System.out.println();
+ System.out.println("Per-script pair counts retained at each cutoff:");
+ System.out.printf("%-22s", "script");
+ for (int c : MIN_COUNT_CUTOFFS) {
+ System.out.printf(" %12s", ">=" + c);
+ }
+ System.out.println();
+ List<Map.Entry<String, long[]>> sortedScripts =
+ new ArrayList<>(perScriptStats.entrySet());
+ sortedScripts.sort(Comparator.comparingLong(
+ (Map.Entry e) -> -e.getValue()[1]));
+ for (Map.Entry<String, long[]> e : sortedScripts) {
+ System.out.printf("%-22s", e.getKey());
+ for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+ System.out.printf(" %,12d", e.getValue()[2 + i]);
+ }
+ System.out.println();
+ }
+ }
+
+ /** Pack two codepoints (each up to 21 bits) into a single long. */
+ private static long packPair(int cpA, int cpB) {
+ return ((long) cpA << 24) | (cpB & 0xFFFFFFL);
+ }
+
+ private static char safePrint(int cp) {
+ if (cp < 0x20 || cp == 0x7F || !Character.isDefined(cp)) {
+ return '.';
+ }
+ if (Character.charCount(cp) != 1) {
+ return '?';
+ }
+ return (char) cp;
+ }
+
+ private static String repeat(char c, int n) {
+ char[] buf = new char[n];
+ java.util.Arrays.fill(buf, c);
+ return new String(buf);
+ }
+
+ private static String humanBytes(double bytes) {
+ if (bytes < 1024) return String.format("%.0f B", bytes);
+ if (bytes < 1024 * 1024) return String.format("%.1f KB", bytes / 1024.0);
+ if (bytes < 1024L * 1024 * 1024) return String.format("%.2f MB", bytes / (1024.0 * 1024));
+ return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024));
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
new file mode 100644
index 00000000000..36f3a897a01
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.ml.chardetect.HtmlByteStripper;
+import org.apache.tika.ml.junkdetect.JunkDetector;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * Diagnostic: replicate JunkDetector.buildScriptRuns exactly on a fixture
+ * and print every run. Helps explain why score() returns UNKNOWN.
+ *
+ * Usage:
+ *
+ * ./mvnw exec:java -pl tika-ml/tika-ml-junkdetect \
+ * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.DebugScriptRuns \
+ * -Dexec.args="--file ~/data/regression/.../AIT5... --charset GB18030 --bytes 1024"
+ *
+ */
+public class DebugScriptRuns {
+
+ // Mirror of JunkDetector.SCRIPT_MODEL_FALLBACK — keep in sync if production changes.
+ private static final Map<String, String> SCRIPT_MODEL_FALLBACK = Map.of(
+ "HIRAGANA", "HAN",
+ "KATAKANA", "HAN");
+
+ public static void main(String[] args) throws IOException {
+ Path file = null;
+ String charset = "GB18030";
+ int probeBytes = 1024;
+ boolean strip = true;
+ boolean expand = true;
+
+ for (int i = 0; i < args.length; i++) {
+ switch (args[i]) {
+ case "--file":
+ file = Paths.get(expandHome(args[++i]));
+ break;
+ case "--charset":
+ charset = args[++i];
+ break;
+ case "--bytes":
+ probeBytes = Integer.parseInt(args[++i]);
+ break;
+ case "--no-strip":
+ strip = false;
+ break;
+ case "--no-expand":
+ expand = false;
+ break;
+ default:
+ System.err.println("unknown: " + args[i]);
+ System.exit(1);
+ }
+ }
+ if (file == null) {
+ System.err.println("Required: --file <path>");
+ System.exit(1);
+ }
+ byte[] raw = Files.readAllBytes(file);
+ byte[] forDecode = raw;
+ if (strip) {
+ byte[] dst = new byte[raw.length];
+ HtmlByteStripper.Result r = HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+ if (r.tagCount > 0 && r.length > 0) {
+ forDecode = Arrays.copyOf(dst, r.length);
+ }
+ System.err.println("After strip: " + forDecode.length + " bytes (was " + raw.length + ")");
+ }
+ if (forDecode.length > probeBytes) {
+ forDecode = Arrays.copyOf(forDecode, probeBytes);
+ }
+ System.err.println("Probe: " + forDecode.length + " bytes decoded as " + charset);
+
+ String decoded = new String(forDecode, Charset.forName(charset));
+ if (expand) {
+ decoded = expandEntities(decoded);
+ }
+ System.err.println("Decoded codepoints: " + decoded.codePointCount(0, decoded.length()));
+
+ List<Run> runs = buildScriptRuns(decoded);
+ System.err.println("Built " + runs.size() + " script runs.");
+
+ // Mirror JunkDetector.scoreText filter and report what would be scored.
+ JunkDetector detector = JunkDetector.loadFromClasspath();
+ java.util.Set<String> modeled = detector.knownScripts();
+
+ TreeMap<String, int[]> totals = new TreeMap<>(); // script -> {chars, bytes, runs, modeled?}
+ int totalScored = 0;
+ int totalSkippedShort = 0;
+ int totalSkippedUnmodeled = 0;
+ long totalBytesScored = 0;
+
+ for (Run r : runs) {
+ byte[] runUtf8 = r.text.getBytes(StandardCharsets.UTF_8);
+ boolean isModeled = modeled.contains(r.script);
+ boolean longEnough = runUtf8.length >= 2;
+ totals.merge(r.script, new int[]{r.text.codePointCount(0, r.text.length()),
+ runUtf8.length, 1, isModeled ? 1 : 0},
+ (a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3]});
+ if (!isModeled) {
+ totalSkippedUnmodeled++;
+ } else if (!longEnough) {
+ totalSkippedShort++;
+ } else {
+ totalScored++;
+ totalBytesScored += runUtf8.length;
+ }
+ }
+
+ System.out.println("Script roll-up (script: cps, utf8_bytes, runs, modeled):");
+ for (Map.Entry<String, int[]> e : totals.entrySet()) {
+ int[] v = e.getValue();
+ System.out.printf(" %-15s cps=%-5d bytes=%-6d runs=%-4d modeled=%s%n",
+ e.getKey(), v[0], v[1], v[2], v[3] == 1 ? "Y" : "N");
+ }
+ System.out.println();
+ System.out.println("Scoring filter outcome:");
+ System.out.println(" runs scored: " + totalScored);
+ System.out.println(" runs skipped (short): " + totalSkippedShort);
+ System.out.println(" runs skipped (unmod): " + totalSkippedUnmodeled);
+ System.out.println(" total bytes scored: " + totalBytesScored);
+
+ // The bug: computeF1MeanLogP returns NaN when String.length() < 2.
+ // String.length() counts UTF-16 code units, but the outer filter uses
+ // UTF-8 bytes. A single CJK char = 1 UTF-16 unit but 3 UTF-8 bytes,
+ // so it passes the outer filter and produces NaN inside.
+ int nanCausing = 0;
+ for (Run r : runs) {
+ byte[] u = r.text.getBytes(StandardCharsets.UTF_8);
+ if (u.length >= 2 && r.text.length() < 2 && modeled.contains(r.script)) {
+ nanCausing++;
+ }
+ }
+ System.out.println();
+ System.out.println("NaN-causing runs (utf8≥2 but utf16<2, modeled): " + nanCausing);
+
+ TextQualityScore score = detector.score(decoded);
+ System.out.println(" detector.score() z: "
+ + (score.isUnknown() ? "UNKNOWN(" + score.getDominantScript() + ")"
+ : String.format("%.3f (script=%s)", score.getZScore(), score.getDominantScript())));
+
+ // Print the longest 10 runs so we can see what's actually in there.
+ System.out.println();
+ System.out.println("Longest 10 runs:");
+ runs.sort((a, b) -> Integer.compare(b.text.length(), a.text.length()));
+ for (int i = 0; i < Math.min(10, runs.size()); i++) {
+ Run r = runs.get(i);
+ byte[] u = r.text.getBytes(StandardCharsets.UTF_8);
+ String preview = r.text.length() > 30
+ ? r.text.substring(0, 30) + "…" : r.text;
+ preview = preview.replace("\n", "\\n").replace("\r", "\\r");
+ System.out.printf(" %-15s cps=%-4d bytes=%-4d preview=%s%n",
+ r.script, r.text.codePointCount(0, r.text.length()), u.length, preview);
+ }
+ }
+
+ // Exact mirror of JunkDetector.buildScriptRuns (private, copied here for diagnosis).
+ private static List<Run> buildScriptRuns(String text) {
+ List<Run> runs = new ArrayList<>();
+ String currentScript = null;
+ StringBuilder currentText = new StringBuilder();
+ StringBuilder leadingCommon = new StringBuilder();
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ i += Character.charCount(cp);
+ Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ if (s == Character.UnicodeScript.COMMON
+ || s == Character.UnicodeScript.INHERITED
+ || s == Character.UnicodeScript.UNKNOWN) {
+ if (currentScript != null) {
+ currentText.appendCodePoint(cp);
+ } else {
+ leadingCommon.appendCodePoint(cp);
+ }
+ continue;
+ }
+ String scriptName = SCRIPT_MODEL_FALLBACK.getOrDefault(s.name(), s.name());
+ if (!scriptName.equals(currentScript)) {
+ if (currentScript != null && currentText.length() > 0) {
+ runs.add(new Run(currentScript, currentText.toString()));
+ }
+ currentScript = scriptName;
+ currentText = new StringBuilder();
+ if (leadingCommon.length() > 0) {
+ currentText.append(leadingCommon);
+ leadingCommon.setLength(0);
+ }
+ }
+ currentText.appendCodePoint(cp);
+ }
+ if (currentScript != null && currentText.length() > 0) {
+ runs.add(new Run(currentScript, currentText.toString()));
+ }
+ return runs;
+ }
+
+ private static final class Run {
+ final String script;
+ final String text;
+ Run(String s, String t) {
+ this.script = s;
+ this.text = t;
+ }
+ }
+
+ private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});");
+ private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});");
+ private static final Pattern NAMED =
+ Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
+
+ private static String expandEntities(String in) {
+ String s = NUM_DEC.matcher(in).replaceAll(mr -> {
+ try {
+ int cp = Integer.parseInt(mr.group(1));
+ if (cp >= 0 && cp <= 0x10FFFF) {
+ return Matcher.quoteReplacement(new String(Character.toChars(cp)));
+ }
+ } catch (NumberFormatException ignored) {
+ // leave unchanged
+ }
+ return Matcher.quoteReplacement(mr.group());
+ });
+ s = NUM_HEX.matcher(s).replaceAll(mr -> {
+ try {
+ int cp = Integer.parseInt(mr.group(1), 16);
+ if (cp >= 0 && cp <= 0x10FFFF) {
+ return Matcher.quoteReplacement(new String(Character.toChars(cp)));
+ }
+ } catch (NumberFormatException ignored) {
+ // leave unchanged
+ }
+ return Matcher.quoteReplacement(mr.group());
+ });
+ s = NAMED.matcher(s).replaceAll(mr -> {
+ switch (mr.group(1)) {
+ case "amp": return "&";
+ case "lt": return "<";
+ case "gt": return ">";
+ case "quot": return "\"";
+ case "apos": return "'";
+ case "nbsp": return " ";
+ case "copy": return "©";
+ case "reg": return "®";
+ default: return Matcher.quoteReplacement(mr.group());
+ }
+ });
+ return s;
+ }
+
+ private static String expandHome(String s) {
+ return s.startsWith("~/") ? System.getProperty("user.home") + s.substring(1) : s;
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
index 6b6057fc34f..e0b4bc0ae10 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java
@@ -470,7 +470,7 @@ private static void writeCompareEval(JunkDetector detector,
sourceCodec, asSource, wrongCodec, asWrong);
deltas.add(result.delta());
- if ("A".equals(result.winner())) nCorrect++;
+ if (sourceCodec.equals(result.winner())) nCorrect++;
}
if (deltas.isEmpty()) continue;
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java
new file mode 100644
index 00000000000..30d175a4b12
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java
@@ -0,0 +1,688 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.tika.ml.junkdetect.JunkDetector;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * Eval harness: for each labeled charset in {@code ~/data/charsets/devtest/},
+ * decode under its true charset (clean) and under a curated set of wrong
+ * charsets (mojibake), score with {@link JunkDetector}, report margin
+ * statistics per (labeled_charset × wrong_charset × source-byte-length).
+ *
+ * Devtest file format: gzip → repeated {@code [u16 big-endian length,
+ * length bytes]} records, where the bytes are real text encoded in the
+ * labeled charset. Same format the charset trainer consumes.
+ *
+ *
+ * <p>Output (TSVs):
+ *
+ * <ul>
+ * <li>detail.tsv: one row per (labeled_cs, script, wrong_cs, length).
+ * Columns: n, mean_clean_z, mean_mojibake_z, cohens_d, mean_margin,
+ * p5_margin, p50_margin, fpr, tpr.</li>
+ * <li>summary.tsv: macro-averaged across wrong charsets, per
+ * (script, length). The headline "is this script in trouble?" view.</li>
+ * <li>script_pivot.tsv: per-script rollup across all lengths +
+ * wrong charsets. Single-number-per-script view for spot inversion.</li>
+ * </ul>
+ *
+ *
+ * <p>"Margin" is the per-record paired difference {@code clean_z -
+ * mojibake_z}. Mean margin and 5th-percentile margin are the
+ * margin-maximization metrics the v6 retrain is optimizing for. Cohen's d
+ * is the independent-distribution analog (kept for compatibility with the
+ * existing {@link EvalJunkDetector} schema).
+ *
+ *
+ * <p>Usage:
+ * <pre>
+ * ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.EvalJunkOnCharsetDevtest \
+ * -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v5-baseline"
+ * </pre>
+ */
+public class EvalJunkOnCharsetDevtest {
+
+ /**
+ * Curated set of wrong charsets to cross-decode every labeled charset
+ * against. Chosen to span the common real-world mojibake families:
+ * Western Latin (cp1252, ISO-8859-1, MacRoman), CJK over-claim (GB18030,
+ * Big5-HKSCS, Shift_JIS), Cyrillic (KOI8-R, cp1251), Arabic (cp1256),
+ * EBCDIC over-claim (IBM424), DOS Latin (IBM850), and UTF-8 (catches
+ * non-UTF8 bytes as replacement-character garbage).
+ */
+ private static final List<String> DEFAULT_WRONG_CHARSETS = List.of(
+ "windows-1252", "ISO-8859-1", "x-MacRoman",
+ "GB18030", "Big5-HKSCS", "Shift_JIS",
+ "KOI8-R", "windows-1251",
+ "windows-1256", "IBM424",
+ "IBM850", "UTF-8"
+ );
+
+ /** Source-byte length buckets to slice records into. */
+ private static final int[] DEFAULT_LENGTHS = {20, 50, 100, 200, 500, 1000};
+
+ /** Cap on records loaded per labeled-charset file. */
+ private static final int DEFAULT_MAX_RECORDS = 2000;
+
+ /** Threshold for FPR/TPR reporting; matches EvalJunkDetector default. */
+ private static final float DEFAULT_THRESHOLD = -2.0f;
+
+ /** Minimum number of paired (clean, mojibake) samples per cell to emit a row. */
+ private static final int MIN_SAMPLES_PER_CELL = 30;
+
+ /**
+ * Entry point: parses CLI flags, loads the detector, then for each
+ * labeled devtest file scores clean vs. mojibake decodings and writes
+ * detail.tsv, summary.tsv and script_pivot.tsv to the output directory.
+ */
+ public static void main(String[] args) throws IOException {
+ Path devtestDir = Paths.get(System.getProperty("user.home"),
+ "data", "charsets", "devtest");
+ Path outputDir = Paths.get("/tmp/junkdetect-eval");
+ Path modelPath = null;
+ int maxRecords = DEFAULT_MAX_RECORDS;
+ int[] lengths = DEFAULT_LENGTHS;
+ float threshold = DEFAULT_THRESHOLD;
+ List<String> wrongCharsets = DEFAULT_WRONG_CHARSETS;
+ List<String> labeledFilter = null;
+
+ for (int i = 0; i < args.length; i++) {
+ switch (args[i]) {
+ case "--devtest-dir":
+ devtestDir = Paths.get(args[++i]);
+ break;
+ case "--output-dir":
+ outputDir = Paths.get(args[++i]);
+ break;
+ case "--model":
+ modelPath = Paths.get(args[++i]);
+ break;
+ case "--max-records":
+ maxRecords = Integer.parseInt(args[++i]);
+ break;
+ case "--threshold":
+ threshold = Float.parseFloat(args[++i]);
+ break;
+ case "--lengths":
+ lengths = Arrays.stream(args[++i].split(","))
+ .mapToInt(Integer::parseInt).toArray();
+ break;
+ case "--wrong-charsets":
+ wrongCharsets = Arrays.asList(args[++i].split(","));
+ break;
+ case "--only":
+ labeledFilter = Arrays.asList(args[++i].split(","));
+ break;
+ default:
+ System.err.println("Unknown arg: " + args[i]);
+ printUsage();
+ System.exit(1);
+ }
+ }
+
+ if (!Files.isDirectory(devtestDir)) {
+ System.err.println("ERROR: devtest-dir not found: " + devtestDir);
+ System.exit(1);
+ }
+ Files.createDirectories(outputDir);
+
+ JunkDetector detector = modelPath != null
+ ? JunkDetector.loadFromPath(modelPath)
+ : JunkDetector.loadFromClasspath();
+
+ System.err.println("=== EvalJunkOnCharsetDevtest ===");
+ System.err.println(" devtest-dir: " + devtestDir);
+ System.err.println(" output-dir: " + outputDir);
+ System.err.println(" model: " + (modelPath != null ? modelPath : "classpath default"));
+ System.err.println(" model version: " + detector.getModelVersion());
+ System.err.println(" max-records: " + maxRecords);
+ System.err.println(" lengths: " + Arrays.toString(lengths));
+ System.err.println(" threshold: " + threshold);
+ System.err.println(" wrong-cs: " + wrongCharsets);
+
+ // Resolve wrong charsets (skip any the JVM doesn't have)
+ Map<String, Charset> resolvedWrong = new LinkedHashMap<>();
+ for (String name : wrongCharsets) {
+ Charset cs = tryGetCharset(name);
+ if (cs == null) {
+ System.err.println(" WARN: wrong-charset unavailable: " + name);
+ continue;
+ }
+ resolvedWrong.put(name, cs);
+ }
+
+ List<Path> files;
+ try (Stream<Path> stream = Files.list(devtestDir)) {
+ files = stream
+ .filter(p -> p.getFileName().toString().endsWith(".bin.gz"))
+ .sorted()
+ .toList();
+ }
+ if (files.isEmpty()) {
+ System.err.println("ERROR: no *.bin.gz files in " + devtestDir);
+ System.exit(1);
+ }
+
+ Path detailPath = outputDir.resolve("detail.tsv");
+ Path summaryPath = outputDir.resolve("summary.tsv");
+ Path pivotPath = outputDir.resolve("script_pivot.tsv");
+
+ List<Row> allRows = new ArrayList<>();
+
+ try (PrintWriter detail = new PrintWriter(
+ Files.newBufferedWriter(detailPath, StandardCharsets.UTF_8))) {
+
+ detail.println("labeled_cs\tscript\twrong_cs\tlength\tn"
+ + "\tmean_clean_z\tmean_mojibake_z\tcohens_d"
+ + "\tmean_margin\tp5_margin\tp50_margin"
+ + "\tfpr\ttpr");
+
+ for (Path file : files) {
+ String labeledName = filenameToCharsetName(file);
+ if (labeledFilter != null && !labeledFilter.contains(labeledName)) {
+ continue;
+ }
+ Charset labeled = tryGetCharset(labeledName);
+ if (labeled == null) {
+ System.err.println(" SKIP: labeled charset unavailable: " + labeledName);
+ continue;
+ }
+
+ List<byte[]> records = readRecords(file, maxRecords);
+ if (records.size() < MIN_SAMPLES_PER_CELL) {
+ System.err.printf(" SKIP %s: only %d records%n",
+ labeledName, records.size());
+ continue;
+ }
+
+ System.err.printf("%n--- %s (%d records) ---%n",
+ labeledName, records.size());
+
+ for (int len : lengths) {
+ List<byte[]> slices = sliceToLength(records, len);
+ if (slices.size() < MIN_SAMPLES_PER_CELL) {
+ continue;
+ }
+
+ // Decode all slices under labeled (clean) once
+ List<String> cleanTexts = decodeAll(slices, labeled);
+ List<Float> cleanZs = scoreAll(detector, cleanTexts);
+ if (cleanZs.size() < MIN_SAMPLES_PER_CELL) {
+ continue;
+ }
+
+ // Detect script from a sample of the clean decoded text
+ String script = detectDominantScript(
+ cleanTexts.get(cleanTexts.size() / 2));
+
+ for (Map.Entry<String, Charset> entry : resolvedWrong.entrySet()) {
+ String wrongName = entry.getKey();
+ Charset wrongCs = entry.getValue();
+ if (equalCharset(labeled, wrongCs)) {
+ continue; // can't be its own mojibake
+ }
+
+ List<String> mojiTexts = decodeAll(slices, wrongCs);
+ // Pair cleanTexts[i] with mojiTexts[i] by source record
+ Row row = scorePairs(detector, script, labeledName,
+ wrongName, len, cleanTexts, mojiTexts,
+ cleanZs, threshold);
+ if (row == null) {
+ continue;
+ }
+ allRows.add(row);
+ detail.println(row.toTsv());
+ }
+ detail.flush();
+ System.err.printf(" len=%4d n_clean=%d cells=%d%n",
+ len, cleanZs.size(),
+ allRows.stream()
+ .filter(r -> r.labeledCs.equals(labeledName)
+ && r.length == len)
+ .count());
+ }
+ }
+ }
+
+ writeSummary(summaryPath, allRows, lengths);
+ writeScriptPivot(pivotPath, allRows);
+
+ System.err.println("\nWrote " + detailPath);
+ System.err.println("Wrote " + summaryPath);
+ System.err.println("Wrote " + pivotPath);
+ System.err.println("Done.");
+ }
+
+ // -----------------------------------------------------------------------
+ // Per-cell scoring (one labeled × wrong × length cell)
+ // -----------------------------------------------------------------------
+
+ /**
+ * Scores one (labeled × wrong × length) cell. {@code cleanZsPre} holds
+ * the already-scored clean z-scores (computed once per length bucket to
+ * avoid re-scoring per wrong charset); only the mojibake side is scored
+ * here. Pairs whose mojibake score is "unknown" are dropped.
+ *
+ * @return the aggregated row, or {@code null} when fewer than
+ * {@link #MIN_SAMPLES_PER_CELL} pairs survive
+ */
+ private static Row scorePairs(JunkDetector detector,
+ String script,
+ String labeledName, String wrongName,
+ int length,
+ List<String> cleanTexts,
+ List<String> mojiTexts,
+ List<Float> cleanZsPre,
+ float threshold) {
+ int n = Math.min(cleanTexts.size(), mojiTexts.size());
+ List<Float> cleanZs = new ArrayList<>(n);
+ List<Float> mojiZs = new ArrayList<>(n);
+ List<Float> margins = new ArrayList<>(n);
+ for (int i = 0; i < n; i++) {
+ float cz = cleanZsPre.get(i);
+ TextQualityScore ms = detector.score(mojiTexts.get(i));
+ if (ms.isUnknown()) {
+ continue;
+ }
+ float mz = ms.getZScore();
+ cleanZs.add(cz);
+ mojiZs.add(mz);
+ margins.add(cz - mz);
+ }
+ if (margins.size() < MIN_SAMPLES_PER_CELL) {
+ return null;
+ }
+ return new Row(labeledName, script, wrongName, length,
+ cleanZs, mojiZs, margins, threshold);
+ }
+
+ // -----------------------------------------------------------------------
+ // I/O: read the gzipped length-prefixed record format
+ // -----------------------------------------------------------------------
+
+ /**
+ * Reads up to {@code maxRecords} records from a gzipped devtest file:
+ * repeated [u16 big-endian length, length bytes]. A truncated final
+ * record raises EOFException from readFully; a clean end-of-stream
+ * between records simply stops reading.
+ */
+ private static List<byte[]> readRecords(Path file, int maxRecords) throws IOException {
+ List<byte[]> records = new ArrayList<>();
+ try (DataInputStream dis = new DataInputStream(
+ new GZIPInputStream(Files.newInputStream(file)))) {
+ while (records.size() < maxRecords) {
+ int len;
+ try {
+ len = dis.readUnsignedShort();
+ } catch (EOFException eof) {
+ break; // normal end of stream between records
+ }
+ byte[] rec = new byte[len];
+ dis.readFully(rec);
+ records.add(rec);
+ }
+ }
+ return records;
+ }
+
+ /** Returns the len-byte prefix of every record at least len bytes long. */
+ private static List<byte[]> sliceToLength(List<byte[]> records, int len) {
+ List<byte[]> slices = new ArrayList<>();
+ for (byte[] r : records) {
+ if (r.length >= len) {
+ slices.add(Arrays.copyOf(r, len));
+ }
+ }
+ return slices;
+ }
+
+ /** Decodes every slice under cs (malformed/unmappable bytes → U+FFFD). */
+ private static List<String> decodeAll(List<byte[]> slices, Charset cs) {
+ List<String> texts = new ArrayList<>(slices.size());
+ for (byte[] s : slices) {
+ texts.add(decode(s, cs));
+ }
+ return texts;
+ }
+
+ /**
+ * Decodes bytes under cs, substituting U+FFFD for malformed input and
+ * unmappable characters instead of throwing.
+ */
+ private static String decode(byte[] bytes, Charset cs) {
+ CharsetDecoder decoder = cs.newDecoder();
+ decoder.onMalformedInput(CodingErrorAction.REPLACE);
+ decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ try {
+ return decoder.decode(ByteBuffer.wrap(bytes)).toString();
+ } catch (CharacterCodingException e) {
+ // Unreachable with REPLACE configured; decode via String as a last resort.
+ return new String(bytes, cs);
+ }
+ }
+
+ /**
+ * Scores each text; "unknown" scores are recorded as {@link Float#NaN}
+ * placeholders so indexes stay aligned with the input list.
+ */
+ private static List<Float> scoreAll(JunkDetector detector, List<String> texts) {
+ List<Float> zs = new ArrayList<>(texts.size());
+ for (String t : texts) {
+ TextQualityScore s = detector.score(t);
+ zs.add(s.isUnknown() ? Float.NaN : s.getZScore());
+ }
+ return zs;
+ }
+
+ // -----------------------------------------------------------------------
+ // Aggregation: summary.tsv (macro across wrong charsets, per script×length)
+ // -----------------------------------------------------------------------
+
+ /**
+ * summary.tsv: macro-average across wrong charsets, one row per
+ * (script, length). Cells with no surviving detail rows are skipped.
+ */
+ private static void writeSummary(Path summaryPath, List<Row> rows,
+ int[] lengths) throws IOException {
+ try (PrintWriter out = new PrintWriter(
+ Files.newBufferedWriter(summaryPath, StandardCharsets.UTF_8))) {
+ out.println("script\tlength\tn_cells"
+ + "\tmacro_cohens_d\tmacro_mean_margin\tmacro_p5_margin"
+ + "\tmacro_fpr\tmacro_tpr");
+
+ // Group by (script, length)
+ Map<String, Map<Integer, List<Row>>> bucketed = new HashMap<>();
+ for (Row r : rows) {
+ bucketed
+ .computeIfAbsent(r.script, k -> new HashMap<>())
+ .computeIfAbsent(r.length, k -> new ArrayList<>())
+ .add(r);
+ }
+
+ List<String> scripts = new ArrayList<>(bucketed.keySet());
+ Collections.sort(scripts);
+ for (String script : scripts) {
+ for (int len : lengths) {
+ List<Row> cell = bucketed.get(script).get(len);
+ if (cell == null || cell.isEmpty()) {
+ continue;
+ }
+ double macroD = cell.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .mapToDouble(r -> r.cohensD)
+ .average().orElse(Double.NaN);
+ double macroMargin = cell.stream()
+ .mapToDouble(r -> r.meanMargin)
+ .average().orElse(Double.NaN);
+ double macroP5 = cell.stream()
+ .mapToDouble(r -> r.p5Margin)
+ .average().orElse(Double.NaN);
+ double macroFpr = cell.stream()
+ .mapToDouble(r -> r.fpr)
+ .average().orElse(Double.NaN);
+ double macroTpr = cell.stream()
+ .mapToDouble(r -> r.tpr)
+ .average().orElse(Double.NaN);
+ // Locale.ROOT keeps '.' as the decimal separator regardless of
+ // the JVM default locale — the TSV must parse everywhere.
+ out.printf(java.util.Locale.ROOT,
+ "%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f%n",
+ script, len, cell.size(),
+ macroD, macroMargin, macroP5, macroFpr, macroTpr);
+ }
+ }
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // Aggregation: script_pivot.tsv (single line per script — quick triage)
+ // -----------------------------------------------------------------------
+
+ /**
+ * script_pivot.tsv: one line per script rolled up across all lengths and
+ * wrong charsets, plus the worst (minimum) cell by Cohen's d and by mean
+ * margin for quick triage.
+ */
+ private static void writeScriptPivot(Path path, List<Row> rows) throws IOException {
+ try (PrintWriter out = new PrintWriter(
+ Files.newBufferedWriter(path, StandardCharsets.UTF_8))) {
+ out.println("script\tn_cells"
+ + "\tmean_d\tmean_margin\tmean_p5_margin"
+ + "\tmin_d_cell\tmin_margin_cell");
+
+ Map<String, List<Row>> byScript = new HashMap<>();
+ for (Row r : rows) {
+ byScript.computeIfAbsent(r.script, k -> new ArrayList<>()).add(r);
+ }
+ List<String> scripts = new ArrayList<>(byScript.keySet());
+ Collections.sort(scripts);
+ for (String script : scripts) {
+ List<Row> cells = byScript.get(script);
+ double meanD = cells.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .mapToDouble(r -> r.cohensD)
+ .average().orElse(Double.NaN);
+ double meanMargin = cells.stream()
+ .mapToDouble(r -> r.meanMargin)
+ .average().orElse(Double.NaN);
+ double meanP5 = cells.stream()
+ .mapToDouble(r -> r.p5Margin)
+ .average().orElse(Double.NaN);
+ Row minDCell = cells.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .min((a, b) -> Double.compare(a.cohensD, b.cohensD))
+ .orElse(null);
+ Row minMarginCell = cells.stream()
+ .min((a, b) -> Double.compare(a.meanMargin, b.meanMargin))
+ .orElse(null);
+ // Locale.ROOT: stable '.' decimal separator for TSV consumers.
+ out.printf(java.util.Locale.ROOT,
+ "%s\t%d\t%.3f\t%.3f\t%.3f\t%s\t%s%n",
+ script, cells.size(),
+ meanD, meanMargin, meanP5,
+ minDCell != null ? cellLabel(minDCell) : "-",
+ minMarginCell != null ? cellLabel(minMarginCell) : "-");
+ }
+ }
+ }
+
+ /** Compact "[labeled→wrong@length]" tag identifying a detail cell. */
+ private static String cellLabel(Row r) {
+ // Locale.ROOT keeps ASCII digits whatever the JVM default locale is.
+ return String.format(java.util.Locale.ROOT,
+ "[%s→%s@%d]", r.labeledCs, r.wrongCs, r.length);
+ }
+
+ // -----------------------------------------------------------------------
+ // Charset utilities
+ // -----------------------------------------------------------------------
+
+ /** Maps "windows-1251.bin.gz" → "windows-1251" (strips the fixed suffix). */
+ private static String filenameToCharsetName(Path file) {
+ String base = file.getFileName().toString();
+ String suffix = ".bin.gz";
+ return base.endsWith(suffix)
+ ? base.substring(0, base.length() - suffix.length())
+ : base;
+ }
+
+ /** Charset lookup that returns null instead of throwing on bad names. */
+ private static Charset tryGetCharset(String name) {
+ Charset cs;
+ try {
+ cs = Charset.forName(name);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ cs = null;
+ }
+ return cs;
+ }
+
+ /**
+ * True when a and b share a canonical name (case-insensitive) or either
+ * one lists the other's canonical name among its aliases.
+ */
+ private static boolean equalCharset(Charset a, Charset b) {
+ if (a.name().equalsIgnoreCase(b.name())) {
+ return true;
+ }
+ return a.aliases().contains(b.name()) || b.aliases().contains(a.name());
+ }
+
+ // -----------------------------------------------------------------------
+ // Script detection (parallels JunkDetector.detectDominantScript, which is
+ // package-private; small enough to inline)
+ // -----------------------------------------------------------------------
+
+ /** Merge kana into HAN so Japanese text maps to the HAN bucket. */
+ private static final Map<String, String> SCRIPT_FALLBACK = Map.of(
+ "HIRAGANA", "HAN",
+ "KATAKANA", "HAN"
+ );
+
+ /**
+ * Returns the name of the most frequent Unicode script in {@code text},
+ * ignoring COMMON/INHERITED/UNKNOWN codepoints. Defaults to "LATIN" for
+ * null/empty input or input with no script-bearing codepoints.
+ */
+ private static String detectDominantScript(String text) {
+ if (text == null || text.isEmpty()) {
+ return "LATIN";
+ }
+ Map<Character.UnicodeScript, Integer> counts = new HashMap<>();
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ if (s != Character.UnicodeScript.COMMON
+ && s != Character.UnicodeScript.INHERITED
+ && s != Character.UnicodeScript.UNKNOWN) {
+ counts.merge(s, 1, Integer::sum);
+ }
+ i += Character.charCount(cp);
+ }
+ if (counts.isEmpty()) {
+ return "LATIN";
+ }
+ String name = counts.entrySet().stream()
+ .max(Map.Entry.comparingByValue())
+ .map(e -> e.getKey().name())
+ .orElse("LATIN");
+ return SCRIPT_FALLBACK.getOrDefault(name, name);
+ }
+
+ // -----------------------------------------------------------------------
+ // Row
+ // -----------------------------------------------------------------------
+
+ private static final class Row {
+ final String labeledCs;
+ final String script;
+ final String wrongCs;
+ final int length;
+ final int n;
+ final double meanCleanZ;
+ final double meanMojiZ;
+ final double cohensD;
+ final double meanMargin;
+ final double p5Margin;
+ final double p50Margin;
+ final double fpr;
+ final double tpr;
+
+ Row(String labeledCs, String script, String wrongCs, int length,
+ List cleanZs, List mojiZs, List margins,
+ float threshold) {
+ this.labeledCs = labeledCs;
+ this.script = script;
+ this.wrongCs = wrongCs;
+ this.length = length;
+ this.n = margins.size();
+ this.meanCleanZ = mean(cleanZs);
+ this.meanMojiZ = mean(mojiZs);
+ this.cohensD = computeCohensD(cleanZs, mojiZs);
+ this.meanMargin = mean(margins);
+ this.p5Margin = percentile(margins, 0.05);
+ this.p50Margin = percentile(margins, 0.50);
+ this.fpr = fractionBelow(cleanZs, threshold);
+ this.tpr = fractionBelow(mojiZs, threshold);
+ }
+
+ String toTsv() {
+ return String.format(
+ "%s\t%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f",
+ labeledCs, script, wrongCs, length, n,
+ meanCleanZ, meanMojiZ, cohensD,
+ meanMargin, p5Margin, p50Margin,
+ fpr, tpr);
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // Statistics
+ // -----------------------------------------------------------------------
+
+ /**
+ * Cohen's d between two z-score samples, using the average-variance
+ * denominator sqrt((va + vb) / 2). NaN when either sample has fewer than
+ * two values or the pooled spread is effectively zero.
+ */
+ private static double computeCohensD(List<Float> a, List<Float> b) {
+ if (a.size() < 2 || b.size() < 2) {
+ return Double.NaN;
+ }
+ double ma = mean(a);
+ double mb = mean(b);
+ double va = variance(a, ma);
+ double vb = variance(b, mb);
+ double pooled = Math.sqrt((va + vb) / 2.0);
+ if (pooled < 1e-9) {
+ return Double.NaN;
+ }
+ return (ma - mb) / pooled;
+ }
+
+ /** Mean of the non-NaN values; NaN when no values remain. */
+ private static double mean(List<Float> xs) {
+ double s = 0;
+ int n = 0;
+ for (float f : xs) {
+ if (!Float.isNaN(f)) {
+ s += f;
+ n++;
+ }
+ }
+ return n == 0 ? Double.NaN : s / n;
+ }
+
+ /** Sample variance (n-1 denominator) of the non-NaN values around m. */
+ private static double variance(List<Float> xs, double m) {
+ if (xs.size() < 2) {
+ return 0;
+ }
+ double s = 0;
+ int n = 0;
+ for (float f : xs) {
+ if (!Float.isNaN(f)) {
+ double d = f - m;
+ s += d * d;
+ n++;
+ }
+ }
+ return n < 2 ? 0 : s / (n - 1);
+ }
+
+ /**
+ * p-th percentile (p in 0..1) by floor indexing over the sorted non-NaN
+ * values; NaN when no values remain.
+ */
+ private static double percentile(List<Float> xs, double p) {
+ List<Float> sorted = new ArrayList<>(xs);
+ sorted.removeIf(f -> Float.isNaN(f));
+ if (sorted.isEmpty()) {
+ return Double.NaN;
+ }
+ Collections.sort(sorted);
+ int idx = (int) Math.floor(p * (sorted.size() - 1));
+ return sorted.get(idx);
+ }
+
+ /** Fraction of non-NaN values strictly below threshold; NaN when empty. */
+ private static double fractionBelow(List<Float> xs, float threshold) {
+ int below = 0;
+ int n = 0;
+ for (float f : xs) {
+ if (!Float.isNaN(f)) {
+ if (f < threshold) {
+ below++;
+ }
+ n++;
+ }
+ }
+ return n == 0 ? Double.NaN : (double) below / n;
+ }
+
+ // -----------------------------------------------------------------------
+
+ /** Prints CLI usage to stderr (placeholders restored for each value flag). */
+ private static void printUsage() {
+ System.err.println("Usage:");
+ System.err.println(" EvalJunkOnCharsetDevtest");
+ System.err.println(" [--devtest-dir <dir>] (default ~/data/charsets/devtest)");
+ System.err.println(" [--output-dir <dir>] (default /tmp/junkdetect-eval)");
+ System.err.println(" [--model <path>] (default classpath junkdetect.bin)");
+ System.err.println(" [--max-records N] (default 2000)");
+ System.err.println(" [--threshold F] (default -2.0)");
+ System.err.println(" [--lengths 20,50,...]");
+ System.err.println(" [--wrong-charsets a,b,...]");
+ System.err.println(" [--only labeledCs,...] (filter for spot runs)");
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
new file mode 100644
index 00000000000..aa3761ef79f
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Frozen set of training-time choices that together define a junk-detector
+ * model's identity. Any change to these values produces a meaningfully
+ * different model and must be reviewed in git.
+ *
+ * Two principles drove making this a class rather than CLI flags:
+ *
+ *
+ * - Reproducibility. When we look back at a model file six
+ * months later we want a single commit hash that says exactly what
+ * knobs produced it, not a half-remembered shell history.
+ *
+ * - Drift prevention. CLI flags with defaults allow accidental
+ * deviation between developers ("did you remember to pass
+ * {@code --min-target-script-frac 0.05}?"). Constants in a tracked
+ * file remove that failure mode.
+ *
+ *
+ * {@link BuildJunkTrainingData} and {@link TrainJunkModel} read the
+ * values here; both tools refuse to start if any CLI argument
+ * attempts to override a config-controlled parameter, surfacing the
+ * mistake at launch time rather than silently producing a non-canonical
+ * model.
+ *
+ *
+ * <p>The constants below reflect the choices that produced the current
+ * shipping model and are recorded in the corresponding training notes
+ * ({@code 20260514-junk-retrain-v6.md}). Update them by editing this
+ * file and committing the change together with the new model output.
+ *
+ *
+ * <p>The class has no instance state; all values are exposed as
+ * {@code public static final}. This keeps callsites short and avoids
+ * the temptation of passing a runtime-mutable config around.
+ *
+ *
+ * <p>This is not part of the public model-loading API. The {@link
+ * org.apache.tika.ml.junkdetect.JunkDetector} runtime is configuration-
+ * free; once a model file is built, all of its baked-in choices travel
+ * with the file's binary format.
+ */
+public final class JunkDetectorTrainingConfig {
+
+ // =======================================================================
+ // Corpus build (BuildJunkTrainingData)
+ // =======================================================================
+
+ /**
+ * Total UTF-8 byte budget across all script groups. Divided
+ * proportionally by per-script bigram entropy after the sampling phase.
+ */
+ public static final long TOTAL_BUDGET_BYTES = 500_000_000L;
+
+ /**
+ * Maximum UTF-8 bytes a single language may contribute to a
+ * multi-language script bucket. Prevents one large source (e.g. {@code
+ * zho} with 8 GB of MADLAD) from dominating a multi-language script
+ * model. Buckets with only one language ignore this cap and may consume
+ * their full budget. See {@link BuildJunkTrainingData} Phase 4.
+ */
+ public static final long PER_LANGUAGE_CAP_BYTES = 5_000_000L;
+
+ /**
+ * Sentence-level filter: minimum fraction of non-COMMON/INHERITED
+ * codepoints that must belong to the script bucket's target script for a
+ * sentence to be accepted. Set low so legitimate mixed-script content
+ * (Japanese kanji + kana, Korean with hanja annotations, Chinese with
+ * English citations, etc.) is preserved, but enough to reject lines that
+ * are essentially off-target (e.g. an English article about Gothic in
+ * the GOTHIC bucket).
+ */
+ public static final double MIN_TARGET_SCRIPT_FRAC = 0.05;
+
+ /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */
+ public static final int MIN_BYTES_PER_SENTENCE = 50;
+
+ /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */
+ public static final double MAX_PUNC_FRAC = 0.30;
+
+ /**
+ * Minimum number of sentences that must land in the dev split for a
+ * script to be included in the model. Scripts below this floor have
+ * insufficient data to reliably estimate calibration statistics, which
+ * inflates FPR. With {@code DEV_FRAC = 0.10} this corresponds to a
+ * total-sentence floor of {@code 500 / 0.10 = 5000} per script.
+ */
+ public static final int MIN_DEV_SENTENCES = 500;
+
+ /** Lines read per language to determine the language's dominant script. */
+ public static final int SCRIPT_SAMPLE_LINES = 2_000;
+
+ /**
+ * UTF-8 bytes loaded per script group for bigram entropy estimation,
+ * driving the entropy-proportional budget allocation. 200 KB is
+ * sufficient to characterise the bigram distribution of any single
+ * script.
+ */
+ public static final long ENTROPY_SAMPLE_BYTES = 200_000L;
+
+ /** Random seed for sentence shuffling and other corpus-build randomness. */
+ public static final int SEED = 42;
+
+ /**
+ * Script bucket names whose source data is too thin or too off-target
+ * to produce reliable per-script F1 calibration. Excluded from the
+ * model entirely; the {@link
+ * org.apache.tika.ml.junkdetect.JunkDetector#score(String)} routing
+ * falls back to "unknown script" behavior for these scripts.
+ *
+ * <p>The current selection is based on a corpus audit that found these
+ * scripts either had thin native source data (e.g. THAANA: 216 train
+ * sentences from Maldivian), or had sources dominated by off-target
+ * content (e.g. GOTHIC: 40% of lines are {@literal <}5% Gothic — the
+ * Wikipedia "gothic" directory is English text about Gothic).
+ *
+ * <p>Three further scripts (CANADIAN_ABORIGINAL, CHEROKEE, TIFINAGH)
+ * are not listed here because the {@link #MIN_TARGET_SCRIPT_FRAC}
+ * filter implicitly removes them — their MADLAD sources contain
+ * effectively no native-script content at the 5% threshold. Listing
+ * them here is unnecessary and would obscure the data-quality finding.
+ */
+ public static final Set<String> DROP_SCRIPTS =
+ Collections.unmodifiableSet(new java.util.TreeSet<>(Set.of("GOTHIC", "THAANA")));
+
+ /**
+ * Per-script byte-budget overrides applied on top of the entropy-
+ * proportional allocation. Empty in the current configuration.
+ * (Value type assumed to be byte counts as {@code Long}, matching the
+ * other *_BYTES constants — TODO confirm against BuildJunkTrainingData.)
+ *
+ * <p>Under v6 the {@code HAN=60MB} experiment worsened every
+ * non-HAN script (the global F1 hash table was the bottleneck). Under
+ * v7's per-script tables, the same experiment correctly leaves other
+ * scripts untouched, but the HAN gain itself was negligible (Cohen's d
+ * moved 7.26 → 7.35) — the per-script HAN model is already near its
+ * data-saturation point with ~18 MB of training data. Override left
+ * empty until a more decisive HAN-coverage experiment is designed.
+ */
+ public static final Map<String, Long> SCRIPT_BUDGET_OVERRIDES =
+ Collections.emptyMap();
+
+ // =======================================================================
+ // Model train (TrainJunkModel)
+ // =======================================================================
+
+ /**
+ * Drop per-script F1 bigrams whose per-pair occurrence count (within
+ * that script's training data) is below this threshold. Set to 3 on
+ * evidence that singleton and doubleton pairs are overwhelmingly OCR
+ * artifacts and proper-noun noise that inflate the clean-side score
+ * distribution tail without contributing signal.
+ *
+ * <p>Set to 1 to disable the filter (every observed pair retained).
+ */
+ public static final int MIN_BIGRAM_COUNT = 3;
+
+ /**
+ * Target load factor for the per-script open-addressing F1 hash
+ * table. Table capacity is sized as the smallest power of two
+ * larger than {@code keptPairs / loadFactor}, giving an average of
+ * 1 / (1 - loadFactor) probes per lookup. 0.5 → ~2 probes; modestly
+ * wasteful in space but very cheap to probe.
+ */
+ public static final double OA_LOAD_FACTOR = 0.5;
+
+ /**
+ * Bit width of each codepoint's dense index within a script's F1
+ * table. Each bigram is packed as {@code (idxA << KEY_INDEX_BITS) |
+ * idxB}, so each side must fit in this many bits. 16 bits supports
+ * up to 65535 distinct codepoints per script, which is comfortably
+ * above the largest per-script count we have measured (HAN is the
+ * worst case at ~15K kept codepoints).
+ */
+ public static final int KEY_INDEX_BITS = 16;
+
+ private JunkDetectorTrainingConfig() {
+ // No instances.
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
new file mode 100644
index 00000000000..bcda57c9f7c
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * For each {@code *.train.gz} file in a directory, compute per-line statistics
+ * of "target-script fraction" — i.e. the fraction of codepoints in each line
+ * that belong to the script the file is supposed to represent.
+ *
+ *
+ * <p>Reports a histogram across the buckets
+ * [0, 5, 10, 20, 30, 50, 70, 90, 100]% so we can pick a per-script keep
+ * threshold (e.g. "drop lines with <20% HAN codepoints"). Also reports
+ * what fraction of total bytes / lines would be dropped at each threshold.
+ *
+ *
+ * <p>Each {@code {script}.train.gz} maps to a {@link Character.UnicodeScript};
+ * the file basename is uppercased. Special-case handling routes a few
+ * project-internal script names (e.g. HAN includes HALF_FULL ideographic
+ * forms) when desired.
+ *
+ *
+ * <p>Usage:
+ * <pre>{@code
+ * java LineScriptFractions <dataDir> [thresholds]
+ * }</pre>
+ */
+public final class LineScriptFractions {
+
+ // Histogram bucket lower bounds, in percent of target-script codepoints
+ // per line; a line with pct in [BUCKETS[i], BUCKETS[i+1]) lands in bucket i.
+ private static final int[] BUCKETS = {0, 5, 10, 20, 30, 50, 70, 90, 100};
+
+ // Static-only command-line tool; no instances.
+ private LineScriptFractions() {}
+
+ public static void main(String[] args) throws IOException {
+ if (args.length < 1) {
+ System.err.println("Usage: LineScriptFractions ");
+ System.exit(1);
+ }
+ Path dataDir = Paths.get(args[0]);
+ Path[] files;
+ try (var s = Files.list(dataDir)) {
+ files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz"))
+ .sorted().toArray(Path[]::new);
+ }
+ if (files.length == 0) {
+ System.err.println("No *.train.gz files in " + dataDir);
+ System.exit(1);
+ }
+
+ System.out.printf("%-20s %10s %10s | %s%n",
+ "script", "lines", "<5%",
+ "lines at target-frac threshold (cumulative dropped %)");
+ System.out.println(" "
+ + " <10% <20% <30% <50% <70% <90% <100%");
+ System.out.println(repeat('-', 110));
+
+ for (Path file : files) {
+ String fname = file.getFileName().toString();
+ String name = fname.substring(0, fname.length() - ".train.gz".length())
+ .toUpperCase();
+ Character.UnicodeScript target = mapScript(name);
+ if (target == null) {
+ System.out.printf("%-20s (no UnicodeScript mapping for '%s')%n", name, name);
+ continue;
+ }
+
+ long lines = 0;
+ long[] bucketCounts = new long[BUCKETS.length];
+ try (BufferedReader r = new BufferedReader(
+ new InputStreamReader(
+ new GZIPInputStream(Files.newInputStream(file)),
+ StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = r.readLine()) != null) {
+ lines++;
+ int total = 0;
+ int matching = 0;
+ for (int i = 0; i < line.length(); ) {
+ int cp = line.codePointAt(i);
+ i += Character.charCount(cp);
+ Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+ if (s == Character.UnicodeScript.COMMON
+ || s == Character.UnicodeScript.INHERITED
+ || s == Character.UnicodeScript.UNKNOWN) {
+ // Don't count toward denominator: punctuation,
+ // spaces, diacritics are script-neutral.
+ continue;
+ }
+ total++;
+ if (s == target) matching++;
+ }
+ double pct = total == 0 ? 0.0 : 100.0 * matching / total;
+ int b = 0;
+ while (b < BUCKETS.length - 1 && pct >= BUCKETS[b + 1]) b++;
+ bucketCounts[b]++;
+ }
+ }
+
+ // Convert bucket counts to "cumulative fraction dropped at threshold = BUCKETS[i]".
+ StringBuilder sb = new StringBuilder();
+ long cum = 0;
+ // bucketCounts[i] holds lines with pct in [BUCKETS[i], BUCKETS[i+1]).
+ // Drop-if-pct
[NOTE(review): extraction gap — the remainder of LineScriptFractions.java,
the diff header for tools/PrototypeCodepointHash.java, its license header,
package/imports, and the top of its class javadoc are missing from this
patch. The next lines resume mid-javadoc of PrototypeCodepointHash:]
+ * Goal: prove the codepoint-bigram-hash approach opens the
+ * UTF-8→GB18030 mojibake margin meaningfully above v5's ~1 z-unit
+ * baseline BEFORE committing to a multi-day production retrain.
+ *
+ * Training corpus: decode {@code ~/data/charsets/devtest/GB18030.bin.gz}
+ * (Chinese) + first 80% of {@code UTF-8.bin.gz} (multi-language Wikipedia)
+ * under their labeled charsets, iterate codepoints, count bigrams and unigrams,
+ * hash into N buckets, build Bloom filter of seen pairs. Held-out: last 20%
+ * of UTF-8 records.
+ *
+ *
+ * <p>Eval: for each held-out UTF-8 record, slice to length buckets
+ * {20, 50, 100, 200, 500, 1000} source bytes. Decode each slice under
+ * UTF-8 (clean) and GB18030 (mojibake-as-HAN). Score both with the
+ * prototype model. Margin = clean_score - mojibake_score. Report
+ * mean and 5th-percentile margin per length.
+ *
+ *
+ * <p>Sweep: {bigramBuckets, alpha} grid. Pick the configuration that
+ * maximises margin. Compare to v5 baseline (mean margin ~1 z-unit
+ * across all lengths in the same cohort).
+ *
+ *
+ * <p>Outputs:
+ * <ul>
+ * <li>{@code prototype-sweep.tsv}: one row per
+ * (bigram_buckets, alpha, length). Columns: n, mean_clean,
+ * mean_moji, mean_margin, std_margin, p5_margin, p50_margin,
+ * margin_in_clean_stds (effective z-units).
+ * </li>
+ * </ul>
+ *
+ * <p>Usage:
+ * <pre>{@code
+ * ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.PrototypeCodepointHash \
+ * -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v6-prototype"
+ * }</pre>
+ */
+public class PrototypeCodepointHash {
+
+ // --- Hyperparameter sweep grid ---
+ // Candidate sizes for the bigram count table (powers of two, see train()).
+ private static final int[] BIGRAM_BUCKETS = {4096, 8192, 16384, 32768};
+ // Backoff weights swept alongside bucket counts — presumably the
+ // bigram/unigram interpolation weight passed to train(); confirm there.
+ private static final double[] ALPHAS = {1.0, 0.4};
+ private static final int UNIGRAM_BUCKETS = 8192;
+ // 4 Mi bits = 512 KB Bloom filter of seen pairs, k=7 hash functions.
+ private static final int BLOOM_BITS = 4 * 1024 * 1024; // 512 KB
+ private static final int BLOOM_K = 7;
+
+ // --- Smoothing ---
+ // Add-alpha (Laplace-style) smoothing constant for the count tables.
+ private static final double ADD_ALPHA = 0.01;
+
+ // --- Eval ---
+ // Slice lengths, in source bytes, used when scoring held-out records.
+ private static final int[] LENGTHS = {20, 50, 100, 200, 500, 1000};
+ private static final int MAX_RECORDS_PER_FILE = 5000;
+ // Last 20% of the UTF-8 records are held out for eval.
+ private static final double HOLDOUT_FRACTION = 0.20;
+ // Decoded candidates with fewer codepoints than this are not scored.
+ private static final int MIN_SCORE_CODEPOINTS = 3;
+
+ public static void main(String[] args) throws IOException {
+ Path devtestDir = Paths.get(System.getProperty("user.home"),
+ "data", "charsets", "devtest");
+ Path outputDir = Paths.get("/tmp/v6-prototype");
+ int maxRecords = MAX_RECORDS_PER_FILE;
+ List fixturesDirs = new ArrayList<>();
+ String wrongCharsetName = "GB18030";
+ boolean singleModel = false;
+ List candidates = List.of(
+ "UTF-8", "GB18030", "windows-1252", "windows-1251", "windows-1257",
+ "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE");
+ List forceCandidates = null; // when set, skip base detectors
+ String expected = "UTF-8";
+ int[] probeSizes = null; // when set, sweep these probe sizes per fixture
+
+ for (int i = 0; i < args.length; i++) {
+ switch (args[i]) {
+ case "--devtest-dir":
+ devtestDir = Paths.get(args[++i]);
+ break;
+ case "--output-dir":
+ outputDir = Paths.get(args[++i]);
+ break;
+ case "--max-records":
+ maxRecords = Integer.parseInt(args[++i]);
+ break;
+ case "--fixtures-dir":
+ fixturesDirs.add(Paths.get(args[++i]));
+ break;
+ case "--wrong-charset":
+ wrongCharsetName = args[++i];
+ break;
+ case "--single-model":
+ // Skip prototype training; run N-way fixture eval on bundled JunkDetector only.
+ singleModel = true;
+ break;
+ case "--candidates":
+ candidates = Arrays.asList(args[++i].split(","));
+ break;
+ case "--force-candidates":
+ // Bypass base detectors; pairwise tournament directly on these.
+ forceCandidates = Arrays.asList(args[++i].split(","));
+ break;
+ case "--expected":
+ expected = args[++i];
+ break;
+ case "--probe-sizes":
+ // Comma-separated probe sizes (bytes). Each fixture
+ // gets one row per size, so you can see how length
+ // affects UNKNOWN vs scored.
+ String[] sizes = args[++i].split(",");
+ probeSizes = new int[sizes.length];
+ for (int k = 0; k < sizes.length; k++) {
+ probeSizes[k] = Integer.parseInt(sizes[k].trim());
+ }
+ break;
+ default:
+ System.err.println("Unknown arg: " + args[i]);
+ System.exit(1);
+ }
+ }
+ Files.createDirectories(outputDir);
+
+ // --single-model bypasses the v5/v6-prototype comparison apparatus.
+ // Requires --force-candidates to specify the charsets to compare;
+ // the base-detector-driven path was removed to keep tika-ml-junkdetect
+ // free of heavy encoding-detector deps.
+ if (singleModel) {
+ if (fixturesDirs.isEmpty()) {
+ System.err.println("--single-model requires --fixtures-dir");
+ System.exit(1);
+ }
+ if (forceCandidates == null || forceCandidates.isEmpty()) {
+ System.err.println("--single-model requires --force-candidates "
+ + "(e.g. --force-candidates UTF-8,GB18030)");
+ System.exit(1);
+ }
+ evalFixturesSingleModel(fixturesDirs, forceCandidates, expected,
+ probeSizes, outputDir);
+ return;
+ }
+
+ System.err.println("=== PrototypeCodepointHash ===");
+ System.err.println(" devtest-dir: " + devtestDir);
+ System.err.println(" output-dir: " + outputDir);
+ System.err.println(" max-records: " + maxRecords);
+ System.err.println(" bigram_buckets sweep: " + Arrays.toString(BIGRAM_BUCKETS));
+ System.err.println(" alpha sweep: " + Arrays.toString(ALPHAS));
+ System.err.println(" unigram_buckets: " + UNIGRAM_BUCKETS);
+ System.err.println(" bloom_bits: " + BLOOM_BITS
+ + " (" + (BLOOM_BITS / 8 / 1024) + " KB, k=" + BLOOM_K + ")");
+
+ // -------- Load corpus --------
+
+ Charset utf8 = StandardCharsets.UTF_8;
+ Charset gb18030 = Charset.forName("GB18030");
+
+ System.err.println("\n--- Loading corpus ---");
+ List utf8Records = readRecords(
+ devtestDir.resolve("UTF-8.bin.gz"), maxRecords);
+ List gbRecords = readRecords(
+ devtestDir.resolve("GB18030.bin.gz"), maxRecords);
+ System.err.printf(" UTF-8.bin.gz: %d records%n", utf8Records.size());
+ System.err.printf(" GB18030.bin.gz: %d records%n", gbRecords.size());
+
+ // Train/eval split on UTF-8 records. GB18030 records all go to training.
+ int holdoutCount = (int) (utf8Records.size() * HOLDOUT_FRACTION);
+ int utf8TrainSize = utf8Records.size() - holdoutCount;
+ List utf8TrainBytes = utf8Records.subList(0, utf8TrainSize);
+ List utf8EvalBytes = utf8Records.subList(utf8TrainSize, utf8Records.size());
+ System.err.printf(" UTF-8 train: %d eval: %d%n",
+ utf8TrainBytes.size(), utf8EvalBytes.size());
+
+ // Decode training corpus to codepoint streams
+ System.err.println("\n--- Decoding training corpus ---");
+ List trainStreams = new ArrayList<>();
+ long totalTrainCp = 0;
+ for (byte[] r : utf8TrainBytes) {
+ int[] cps = toCodepoints(decode(r, utf8));
+ if (cps.length >= 2) trainStreams.add(cps);
+ totalTrainCp += cps.length;
+ }
+ for (byte[] r : gbRecords) {
+ int[] cps = toCodepoints(decode(r, gb18030));
+ if (cps.length >= 2) trainStreams.add(cps);
+ totalTrainCp += cps.length;
+ }
+ System.err.printf(" total training codepoints: %,d across %d records%n",
+ totalTrainCp, trainStreams.size());
+
+ // Count unique pairs (for Bloom sizing sanity)
+ Set uniquePairs = new HashSet<>();
+ for (int[] cps : trainStreams) {
+ for (int i = 0; i + 1 < cps.length; i++) {
+ uniquePairs.add(packPair(cps[i], cps[i + 1]));
+ if (uniquePairs.size() >= 2_000_000) break;
+ }
+ if (uniquePairs.size() >= 2_000_000) break;
+ }
+ System.err.printf(" unique codepoint-pairs in training: ~%,d%n",
+ uniquePairs.size());
+
+ // -------- Hyperparameter sweep --------
+
+ Path sweepPath = outputDir.resolve("prototype-sweep.tsv");
+ try (PrintWriter out = new PrintWriter(
+ Files.newBufferedWriter(sweepPath, StandardCharsets.UTF_8))) {
+ out.println("bigram_buckets\talpha\tlength\tn"
+ + "\tmean_clean\tstd_clean\tmean_moji"
+ + "\tmean_margin\tstd_margin\tp5_margin\tp50_margin"
+ + "\tmargin_in_clean_stds\tbloom_seen_frac_clean\tbloom_seen_frac_moji");
+
+ for (int buckets : BIGRAM_BUCKETS) {
+ for (double alpha : ALPHAS) {
+ System.err.printf("%n--- Config: bigram_buckets=%d alpha=%.1f ---%n",
+ buckets, alpha);
+
+ Model m = train(trainStreams, buckets, UNIGRAM_BUCKETS,
+ BLOOM_BITS, BLOOM_K, ADD_ALPHA, alpha);
+
+ // Calibrate on a sample of training streams (for the
+ // "margin_in_clean_stds" effective-z normalization)
+ double[] muSigma = calibrate(m, trainStreams);
+ System.err.printf(" train mu=%.3f sigma=%.3f%n", muSigma[0], muSigma[1]);
+
+ // Eval on held-out UTF-8 records
+ for (int len : LENGTHS) {
+ EvalCell cell = evalAtLength(m, utf8EvalBytes, len, utf8, gb18030);
+ if (cell == null) continue;
+ double effZ = cell.meanMargin / Math.max(muSigma[1], 1e-6);
+ out.printf("%d\t%.2f\t%d\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.3f\t%.3f\t%.3f%n",
+ buckets, alpha, len, cell.n,
+ cell.meanClean, cell.stdClean, cell.meanMoji,
+ cell.meanMargin, cell.stdMargin,
+ cell.p5Margin, cell.p50Margin,
+ effZ, cell.bloomSeenFracClean, cell.bloomSeenFracMoji);
+ System.err.printf(" len=%4d n=%-5d mean_margin=%6.3f p5=%6.3f"
+ + " eff_z=%5.2f bloom_clean=%.2f bloom_moji=%.2f%n",
+ len, cell.n, cell.meanMargin, cell.p5Margin, effZ,
+ cell.bloomSeenFracClean, cell.bloomSeenFracMoji);
+ out.flush();
+ }
+ }
+ }
+ }
+ System.err.println("\nWrote " + sweepPath);
+
+ // -------- Fixture eval (AIT5-class HTML files) --------
+
+ if (!fixturesDirs.isEmpty()) {
+ evalFixtures(trainStreams, fixturesDirs, wrongCharsetName, outputDir);
+ }
+
+ System.err.println("Done.");
+ }
+
+ // -----------------------------------------------------------------------
+ // Real-life fixture eval: runs the production base detectors (BOM +
+ // HtmlEncodingDetector + UniversalEncodingDetector) and asks the
+ // JunkDetector to pick among their candidates via pairwise compare.
+ // Mirrors the production charset-detection arbitration.
+ // -----------------------------------------------------------------------
+
+ private static void evalFixturesSingleModel(List fixturesDirs,
+ List forceCandidates,
+ String expected,
+ int[] probeSizes,
+ Path outputDir) throws IOException {
+ System.err.println("\n--- Forced-candidates fixture eval ---");
+ System.err.println(" candidates: " + forceCandidates);
+ JunkDetector detector = JunkDetector.loadFromClasspath();
+ System.err.println(" model version: " + detector.getModelVersion());
+ System.err.println(" expected: " + expected);
+
+ List forced = new ArrayList<>();
+ for (String n : forceCandidates) {
+ try {
+ forced.add(Charset.forName(n));
+ } catch (Exception e) {
+ System.err.println(" skip unsupported charset: " + n);
+ }
+ }
+
+ Path out = outputDir.resolve("fixtures-real-life.tsv");
+ try (PrintWriter pw = new PrintWriter(
+ Files.newBufferedWriter(out, StandardCharsets.UTF_8))) {
+ pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs"
+ + "\tcandidates\twinner\tmargin\tstatus\tnotes");
+ int pass = 0, fail = 0, skip = 0, agree = 0;
+ double passMarginSum = 0.0;
+ List failingLines = new ArrayList<>();
+
+ for (Path dir : fixturesDirs) {
+ if (!Files.isDirectory(dir)) {
+ System.err.println(" WARN: not a directory: " + dir);
+ continue;
+ }
+ try (Stream stream = Files.walk(dir)) {
+ List files = new ArrayList<>();
+ stream.filter(Files::isRegularFile).forEach(files::add);
+ Collections.sort(files);
+ int[] sizes = probeSizes != null ? probeSizes : new int[]{16_384};
+ for (Path f : files) {
+ for (int sz : sizes) {
+ FixtureResult r =
+ evalOneForced(f, expected, detector, forced, sz);
+ pw.println(r.toTsvLine());
+ switch (r.status) {
+ case "PASS":
+ pass++;
+ passMarginSum += r.margin;
+ break;
+ case "FAIL":
+ fail++;
+ failingLines.add(r.dir + "/" + r.shortName
+ + "@" + sz + " -> " + r.winner
+ + " (expected " + r.expected + ")");
+ break;
+ case "AGREE":
+ agree++;
+ break;
+ default:
+ skip++;
+ }
+ }
+ }
+ }
+ }
+ int n = pass + fail;
+ System.err.println();
+ System.err.println("=== Summary ===");
+ System.err.printf("Pass: %d / %d (%.1f%%) — JunkDetector picked the expected charset%n",
+ pass, n, n == 0 ? 0.0 : 100.0 * pass / n);
+ System.err.printf("Fail: %d%n", fail);
+ System.err.printf("Agree: %d (all detectors agreed; no arbitration needed)%n", agree);
+ System.err.printf("Skip: %d%n", skip);
+ if (pass > 0) {
+ System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass);
+ }
+ if (!failingLines.isEmpty()) {
+ System.err.println("Failing:");
+ Collections.sort(failingLines);
+ for (String line : failingLines) {
+ System.err.println(" " + line);
+ }
+ }
+ }
+ System.err.println("Wrote " + out);
+ }
+
+ private static FixtureResult evalOneForced(Path file, String expected,
+ JunkDetector detector,
+ List forced,
+ int probeBytes) throws IOException {
+ byte[] raw = Files.readAllBytes(file);
+ FixtureResult r = new FixtureResult();
+ r.dir = file.getParent().getFileName().toString();
+ String fname = file.getFileName().toString();
+ r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname;
+ r.bytes = raw.length;
+ r.probeSize = probeBytes;
+ r.expected = expected;
+
+ if (isBinaryMagic(raw)) {
+ r.status = "SKIP_BIN";
+ return r;
+ }
+ // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes
+ // from the stripped content. Otherwise a small probe slice can land
+ // entirely inside // boilerplate and leave
+ // nothing to score after strip.
+ byte[] strippedFull = stripHtmlBytes(raw);
+ byte[] forDecode = strippedFull.length > probeBytes
+ ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull;
+ r.candidatesStr = forced.stream().map(Charset::name)
+ .reduce((a, b) -> a + "," + b).orElse("-");
+
+ // Always log every candidate in notes — even those JunkDetector
+ // rejects as unknown — so the failure mode is visible. An
+ // "unknown" score itself is meaningful information when the other
+ // candidate scored fine.
+ String winner = null;
+ String runner = null;
+ float winnerZ = Float.NEGATIVE_INFINITY;
+ float runnerZ = Float.NEGATIVE_INFINITY;
+ StringBuilder notes = new StringBuilder();
+ int decoded_scored = 0;
+ for (Charset cs : forced) {
+ String decoded = applyEntityVariant(new String(forDecode, cs), "expanded");
+ int cps = toCodepoints(decoded).length;
+ if (cps < 3) {
+ notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") ");
+ continue;
+ }
+ TextQualityScore s = detector.score(decoded);
+ if (s.isUnknown()) {
+ // Diagnose: is this script-not-in-model (neutral case) or
+ // all-runs-fragmented-too-short (a real mojibake signal)?
+ String why = diagnoseUnknown(decoded, detector);
+ notes.append(cs.name()).append("=UNK[").append(why).append("] ");
+ continue;
+ }
+ float z = s.getZScore();
+ notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" ");
+ decoded_scored++;
+ if (z > winnerZ) {
+ runner = winner;
+ runnerZ = winnerZ;
+ winner = cs.name();
+ winnerZ = z;
+ } else if (z > runnerZ) {
+ runner = cs.name();
+ runnerZ = z;
+ }
+ }
+ if (winner == null) {
+ r.status = "NO_DECODE";
+ r.notes = notes.toString().trim();
+ return r;
+ }
+ r.winner = winner;
+ if (decoded_scored < 2) {
+ // Only one candidate scored; no real arbitration happened.
+ r.margin = Float.NaN;
+ r.status = safeCanonical(winner).equals(safeCanonical(expected))
+ ? "ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED";
+ } else {
+ r.margin = winnerZ - runnerZ;
+ r.status = safeCanonical(winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL";
+ }
+ r.notes = notes.toString().trim();
+ return r;
+ }
+
+ /**
+ * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks
+ * the same script-run logic, then classifies the failure mode:
+ *
+ * - {@code EMPTY} — input had no characters.
+ * - {@code NO_MODELED_SCRIPT} — all runs are in scripts the model
+ * doesn't know (legit reason to be neutral).
+ * - {@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts
+ * but every one is <2 UTF-8 bytes. Strong mojibake signal —
+ * text is a salad of single codepoints from many scripts.
+ * - {@code MIXED} — some runs were modeled-but-too-short and
+ * some were unmodeled.
+ *
+ */
+ private static String diagnoseUnknown(String text, JunkDetector detector) {
+ if (text == null || text.isEmpty()) {
+ return "EMPTY";
+ }
+ Set modeled = detector.knownScripts();
+ // Walk codepoints, splitting on script boundaries — same as
+ // JunkDetector.buildScriptRuns conceptually. Track per-script:
+ // longest UTF-8-byte run length, plus a separate "unmodeled" tally.
+ java.util.Map longestModeled = new java.util.HashMap<>();
+ int unmodeledRuns = 0;
+ int modeledTooShortRuns = 0;
+ int currentBytes = 0;
+ String currentScript = null;
+ for (int i = 0; i < text.length(); ) {
+ int cp = text.codePointAt(i);
+ int charCount = Character.charCount(cp);
+ String script = Character.UnicodeScript.of(cp).name();
+ // COMMON / INHERITED / UNKNOWN attach to preceding run, but for
+ // diagnosis we don't need to be that precise — treat them as a
+ // continuation.
+ if ("COMMON".equals(script) || "INHERITED".equals(script)
+ || "UNKNOWN".equals(script)) {
+ if (currentScript != null) {
+ currentBytes += new String(new int[]{cp}, 0, 1)
+ .getBytes(StandardCharsets.UTF_8).length;
+ }
+ } else if (script.equals(currentScript)) {
+ currentBytes += new String(new int[]{cp}, 0, 1)
+ .getBytes(StandardCharsets.UTF_8).length;
+ } else {
+ // close out previous run
+ tallyRun(currentScript, currentBytes, modeled, longestModeled);
+ if (currentScript != null) {
+ if (!modeled.contains(currentScript)) {
+ unmodeledRuns++;
+ } else if (currentBytes < 2) {
+ modeledTooShortRuns++;
+ }
+ }
+ currentScript = script;
+ currentBytes = new String(new int[]{cp}, 0, 1)
+ .getBytes(StandardCharsets.UTF_8).length;
+ }
+ i += charCount;
+ }
+ // close final run
+ if (currentScript != null) {
+ if (!modeled.contains(currentScript)) {
+ unmodeledRuns++;
+ } else if (currentBytes < 2) {
+ modeledTooShortRuns++;
+ } else {
+ longestModeled.merge(currentScript, currentBytes, Math::max);
+ }
+ }
+ boolean anyModeledLong = !longestModeled.isEmpty();
+ if (anyModeledLong) {
+ // Some modeled run is ≥2 bytes — shouldn't have hit UNKNOWN.
+ // (Possible discrepancy with the production logic; reported as MIXED.)
+ return "MIXED(modeled_long=" + longestModeled.size() + ")";
+ }
+ if (modeledTooShortRuns > 0 && unmodeledRuns > 0) {
+ return "MIXED(short=" + modeledTooShortRuns
+ + ",unmodeled=" + unmodeledRuns + ")";
+ }
+ if (modeledTooShortRuns > 0) {
+ return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")";
+ }
+ if (unmodeledRuns > 0) {
+ return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")";
+ }
+ return "OTHER";
+ }
+
+ private static void tallyRun(String script, int bytes, Set modeled,
+ java.util.Map longestModeled) {
+ if (script == null) {
+ return;
+ }
+ if (modeled.contains(script) && bytes >= 2) {
+ longestModeled.merge(script, bytes, Math::max);
+ }
+ }
+
+ /**
+ * Run HtmlByteStripper over the entire input; return the stripped
+ * content bytes (or the input verbatim if no tags found).
+ */
+ private static byte[] stripHtmlBytes(byte[] raw) {
+ byte[] dst = new byte[raw.length];
+ HtmlByteStripper.Result r =
+ HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+ if (r.tagCount > 0 && r.length > 0) {
+ return Arrays.copyOf(dst, r.length);
+ }
+ return raw;
+ }
+
+ private static boolean isBinaryMagic(byte[] b) {
+ if (b.length < 4) {
+ return false;
+ }
+ if (b[0] == 0x50 && b[1] == 0x4B
+ && (b[2] == 0x03 || b[2] == 0x05 || b[2] == 0x07)) {
+ return true; // ZIP / JAR / APK / docx
+ }
+ if ((b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) {
+ return true; // gzip
+ }
+ if (b[0] == '%' && b[1] == 'P' && b[2] == 'D' && b[3] == 'F') {
+ return true; // PDF
+ }
+ if ((b[0] & 0xFF) == 0xD0 && (b[1] & 0xFF) == 0xCF) {
+ return true; // OLE2
+ }
+ return false;
+ }
+
+ private static String safeCanonical(String charset) {
+ if (charset == null) {
+ return "";
+ }
+ try {
+ return Charset.forName(charset).name();
+ } catch (Exception e) {
+ return charset.toUpperCase();
+ }
+ }
+
+ private static final class FixtureResult {
+ String dir;
+ String shortName;
+ int bytes;
+ int probeSize;
+ String expected;
+ String bomCs;
+ String htmlCs;
+ String universalCs;
+ String candidatesStr = "-";
+ String winner = "-";
+ float margin = Float.NaN;
+ String status = "";
+ String notes = "";
+
+ String toTsvLine() {
+ return String.format("%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
+ dir, shortName, bytes, probeSize, expected,
+ str(bomCs), str(htmlCs), str(universalCs),
+ candidatesStr, str(winner),
+ Float.isNaN(margin) ? "-" : String.format("%.3f", margin),
+ status, notes.isEmpty() ? "-" : notes);
+ }
+
+ private static String str(String s) {
+ return s == null ? "-" : s;
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // Fixture eval: score real-world AIT5-class HTML files under v5 and v6
+ // prototype, with byte-level HTML stripping and entity-variant comparison.
+ // -----------------------------------------------------------------------
+
+ private static void evalFixtures(List trainStreams,
+ List fixturesDirs,
+ String wrongCharsetName,
+ Path outputDir) throws IOException {
+ System.err.println("\n--- Fixture eval (best config: 4096 buckets, alpha=1.0) ---");
+ Model v6 = train(trainStreams, 4096, UNIGRAM_BUCKETS,
+ BLOOM_BITS, BLOOM_K, ADD_ALPHA, 1.0);
+ double[] muSigma = calibrate(v6, trainStreams);
+ float mu = (float) muSigma[0];
+ float sigma = (float) Math.max(muSigma[1], 1e-6);
+ System.err.printf(" v6 train mu=%.3f sigma=%.3f%n", mu, sigma);
+
+ JunkDetector v5 = JunkDetector.loadFromClasspath();
+ Charset cleanCs = StandardCharsets.UTF_8;
+ Charset wrongCs = Charset.forName(wrongCharsetName);
+ System.err.println(" v5 model version: " + v5.getModelVersion());
+ System.err.println(" clean charset: " + cleanCs.name());
+ System.err.println(" mojibake charset: " + wrongCs.name());
+
+ Path fixturesPath = outputDir.resolve("fixtures.tsv");
+ try (PrintWriter out = new PrintWriter(
+ Files.newBufferedWriter(fixturesPath, StandardCharsets.UTF_8))) {
+ out.println("cluster\tfile\tentity_variant\tn_clean_cp\tn_moji_cp"
+ + "\tv5_clean_z\tv5_moji_z\tv5_margin"
+ + "\tv6_F1_clean\tv6_F1_moji\tv6_F1_margin"
+ + "\tv6_combo_clean\tv6_combo_moji\tv6_combo_margin"
+ + "\tdominant_script"
+ + "\tv5_winner\tv6_F1_winner\tv6_combo_winner");
+
+ for (Path dir : fixturesDirs) {
+ if (!Files.isDirectory(dir)) {
+ System.err.println(" WARN: not a directory: " + dir);
+ continue;
+ }
+ try (java.util.stream.Stream files = Files.walk(dir)) {
+ List sorted = new ArrayList<>();
+ files.filter(Files::isRegularFile).forEach(sorted::add);
+ Collections.sort(sorted);
+ for (Path f : sorted) {
+ evalOneFixture(f, v6, mu, sigma, v5, cleanCs, wrongCs, out);
+ }
+ }
+ }
+ }
+ System.err.println("Wrote " + fixturesPath);
+ }
+
+ private static void evalOneFixture(Path file, Model v6, float v6Mu, float v6Sigma,
+ JunkDetector v5,
+ Charset cleanCs, Charset wrongCs,
+ PrintWriter out) throws IOException {
+ byte[] rawBytes = Files.readAllBytes(file);
+ if (rawBytes.length > 16384) {
+ rawBytes = Arrays.copyOf(rawBytes, 16384);
+ }
+ // Byte-level HTML strip (matches JunkFilterEncodingDetector production pipeline)
+ byte[] stripDst = new byte[rawBytes.length];
+ HtmlByteStripper.Result strip =
+ HtmlByteStripper.strip(rawBytes, 0, rawBytes.length, stripDst, 0);
+ byte[] forDecode = rawBytes;
+ if (strip.tagCount > 0 && strip.length > 0) {
+ forDecode = new byte[strip.length];
+ System.arraycopy(stripDst, 0, forDecode, 0, strip.length);
+ }
+
+ String cluster = file.getParent().getFileName().toString();
+ String fname = file.getFileName().toString();
+ // shorten long content-hash names for readability in output
+ String shortName = fname.length() > 12 ? fname.substring(0, 12) : fname;
+
+ String cleanRaw = decode(forDecode, cleanCs);
+ String mojiRaw = decode(forDecode, wrongCs);
+
+ for (String variant : List.of("raw", "expanded", "removed")) {
+ String clean = applyEntityVariant(cleanRaw, variant);
+ String moji = applyEntityVariant(mojiRaw, variant);
+ int[] cleanCps = toCodepoints(clean);
+ int[] mojiCps = toCodepoints(moji);
+ if (cleanCps.length < 3 || mojiCps.length < 3) continue;
+
+ // --- v5 full pipeline (existing) ---
+ TextQualityScore v5cs = v5.score(clean);
+ TextQualityScore v5ms = v5.score(moji);
+ float v5cleanZ = v5cs.isUnknown() ? Float.NaN : v5cs.getZScore();
+ float v5mojiZ = v5ms.isUnknown() ? Float.NaN : v5ms.getZScore();
+ float v5Margin = v5cleanZ - v5mojiZ;
+
+ // --- v6 Feature 1 alone (codepoint-bigram-hash + Bloom + unigram backoff) ---
+ ScoreResult v6c = score(v6, cleanCps);
+ ScoreResult v6m = score(v6, mojiCps);
+ double v6Margin = v6c.meanLogP - v6m.meanLogP;
+
+ // --- v6 combined: substitute v6's F1 z-score into v5's classifier ---
+ JunkDetector.FeatureComponents cleanFc = v5.scoreWithFeatureComponents(clean);
+ JunkDetector.FeatureComponents mojiFc = v5.scoreWithFeatureComponents(moji);
+ float v6F1zClean = (float) (v6c.meanLogP - v6Mu) / v6Sigma;
+ float v6F1zMoji = (float) (v6m.meanLogP - v6Mu) / v6Sigma;
+ float comboClean = recombineLogit(v6F1zClean, cleanFc);
+ float comboMoji = recombineLogit(v6F1zMoji, mojiFc);
+ float comboMargin = comboClean - comboMoji;
+ String dominantScript = cleanFc != null ? cleanFc.dominantScript : "?";
+
+ String v5Winner = Float.isNaN(v5Margin) ? "?" : (v5Margin > 0 ? "CLEAN" : "MOJI");
+ String v6F1Winner = Double.isNaN(v6Margin) ? "?" : (v6Margin > 0 ? "CLEAN" : "MOJI");
+ String v6cWinner = Float.isNaN(comboMargin) ? "?" : (comboMargin > 0 ? "CLEAN" : "MOJI");
+
+ out.printf("%s\t%s\t%s\t%d\t%d"
+ + "\t%.3f\t%.3f\t%.3f"
+ + "\t%.4f\t%.4f\t%.4f"
+ + "\t%.3f\t%.3f\t%.3f"
+ + "\t%s\t%s\t%s\t%s%n",
+ cluster, shortName, variant,
+ cleanCps.length, mojiCps.length,
+ v5cleanZ, v5mojiZ, v5Margin,
+ v6c.meanLogP, v6m.meanLogP, v6Margin,
+ comboClean, comboMoji, comboMargin,
+ dominantScript,
+ v5Winner, v6F1Winner, v6cWinner);
+ out.flush();
+ System.err.printf(" [%s/%s %-8s] v5: Δ%+6.2f %s v6F1: Δ%+6.3f %s v6combo: Δ%+6.2f %s script=%s%n",
+ cluster, shortName, variant,
+ v5Margin, v5Winner,
+ v6Margin, v6F1Winner,
+ comboMargin, v6cWinner,
+ dominantScript);
+ }
+ }
+
+ /**
+ * Recomputes v5's per-script classifier logit with v6's F1 z-score
+ * substituted for v5's z1. Approximation: keeps v5's classifier weights
+ * (w1..w4, bias) which were trained on the OLD F1 distribution. A true
+ * v6 retrain would re-fit w1 on the new F1 distribution; this version
+ * gives a directional estimate of "what if we just swap F1?"
+ */
+ private static float recombineLogit(float v6F1z, JunkDetector.FeatureComponents fc) {
+ if (fc == null || fc.classifierWeights == null) {
+ return Float.NaN;
+ }
+ float[] cw = fc.classifierWeights;
+ int nFeat = cw.length - 1;
+ float logit = cw[nFeat]; // bias
+ if (nFeat >= 1) logit += cw[0] * v6F1z;
+ if (nFeat >= 2) logit += cw[1] * fc.z2;
+ if (nFeat >= 3) logit += cw[2] * fc.z3;
+ if (nFeat >= 4) logit += cw[3] * fc.z4;
+ return logit;
+ }
+
+ // -----------------------------------------------------------------------
+ // HTML entity expansion / removal (regex-based, sufficient for fixtures)
+ // -----------------------------------------------------------------------
+
+ private static final Pattern NUM_DEC = Pattern.compile("(\\d{1,7});");
+ private static final Pattern NUM_HEX = Pattern.compile("[xX]([0-9a-fA-F]{1,6});");
+ private static final Pattern NAMED =
+ Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
+
+    /** Applies one of the three entity-handling variants to {@code s}. */
+    private static String applyEntityVariant(String s, String variant) {
+        switch (variant) {
+            case "expanded":
+                return expandEntities(s);
+            case "removed":
+                return removeEntities(s);
+            case "raw":
+                return s;
+            default:
+                throw new IllegalArgumentException(variant);
+        }
+    }
+
+    /**
+     * Expands decimal and hex numeric character references plus the small
+     * NAMED set. Unrecognized or out-of-range references are left untouched.
+     */
+    private static String expandEntities(String in) {
+        String result = replaceNumericRefs(NUM_DEC, in, 10);
+        result = replaceNumericRefs(NUM_HEX, result, 16);
+        return NAMED.matcher(result).replaceAll(mr -> {
+            switch (mr.group(1)) {
+                case "amp":  return "&";
+                case "lt":   return "<";
+                case "gt":   return ">";
+                case "quot": return "\"";
+                case "apos": return "'";
+                case "nbsp": return " ";
+                case "copy": return "©";
+                case "reg":  return "®";
+                default:     return Matcher.quoteReplacement(mr.group());
+            }
+        });
+    }
+
+    /**
+     * Replaces each numeric reference matched by {@code p} with the codepoint
+     * parsed from group(1) in the given radix; invalid or out-of-range
+     * values keep the original matched text.
+     */
+    private static String replaceNumericRefs(Pattern p, String s, int radix) {
+        return p.matcher(s).replaceAll(mr -> {
+            try {
+                int cp = Integer.parseInt(mr.group(1), radix);
+                if (cp >= 0 && cp <= 0x10FFFF) {
+                    return Matcher.quoteReplacement(new String(Character.toChars(cp)));
+                }
+            } catch (NumberFormatException ignored) {
+                // malformed / overflowing digits — leave text unchanged
+            }
+            return Matcher.quoteReplacement(mr.group());
+        });
+    }
+
+    /** Strips every recognized entity reference outright. */
+    private static String removeEntities(String s) {
+        String stripped = NUM_DEC.matcher(s).replaceAll("");
+        stripped = NUM_HEX.matcher(stripped).replaceAll("");
+        return NAMED.matcher(stripped).replaceAll("");
+    }
+
+    // -----------------------------------------------------------------------
+    // Training
+    // -----------------------------------------------------------------------
+
+    /**
+     * Trains a hashed codepoint-bigram model with unigram backoff and a
+     * Bloom filter that records which bigrams were actually observed.
+     *
+     * @param streams        training records, one codepoint array each
+     * @param bigramBuckets  bigram table size; must be a power of 2
+     * @param unigramBuckets unigram table size; must be a power of 2
+     * @param bloomBits      Bloom filter size in bits
+     * @param bloomK         number of Bloom probes per bigram
+     * @param addAlpha       additive smoothing constant for log-probs
+     * @param backoffAlpha   multiplier on the unigram-backoff log-prob
+     */
+    private static Model train(List<int[]> streams,
+                               int bigramBuckets, int unigramBuckets,
+                               int bloomBits, int bloomK,
+                               double addAlpha, double backoffAlpha) {
+        if (Integer.bitCount(bigramBuckets) != 1 || Integer.bitCount(unigramBuckets) != 1) {
+            throw new IllegalArgumentException("Bucket counts must be powers of 2");
+        }
+        long[] bigramCounts = new long[bigramBuckets];
+        long[] unigramCounts = new long[unigramBuckets];
+        long bigramTotal = 0;
+        long unigramTotal = 0;
+        long[] bloomBitArr = new long[(bloomBits + 63) / 64];
+
+        for (int[] cps : streams) {
+            for (int i = 0; i < cps.length; i++) {
+                int cp = cps[i];
+                // Power-of-two bucket counts allow masking instead of mod.
+                int uBucket = (int) (fnv1aUnigram(cp) & (unigramBuckets - 1));
+                unigramCounts[uBucket]++;
+                unigramTotal++;
+                if (i + 1 < cps.length) {
+                    int cpNext = cps[i + 1];
+                    int bBucket = (int) (fnv1aBigram(cp, cpNext) & (bigramBuckets - 1));
+                    bigramCounts[bBucket]++;
+                    bigramTotal++;
+                    bloomAdd(bloomBitArr, bloomBits, bloomK, cp, cpNext);
+                }
+            }
+        }
+
+        // Convert to log-probabilities with add-alpha smoothing
+        float[] bigramLogP = new float[bigramBuckets];
+        double bigramDenom = bigramTotal + addAlpha * bigramBuckets;
+        for (int i = 0; i < bigramBuckets; i++) {
+            double p = (bigramCounts[i] + addAlpha) / bigramDenom;
+            bigramLogP[i] = (float) Math.log(p);
+        }
+        float[] unigramLogP = new float[unigramBuckets];
+        double unigramDenom = unigramTotal + addAlpha * unigramBuckets;
+        for (int i = 0; i < unigramBuckets; i++) {
+            double p = (unigramCounts[i] + addAlpha) / unigramDenom;
+            unigramLogP[i] = (float) Math.log(p);
+        }
+
+        return new Model(bigramBuckets, unigramBuckets, bigramLogP, unigramLogP,
+                bloomBitArr, bloomBits, bloomK, backoffAlpha);
+    }
+
+    /**
+     * Estimates the mean and sigma of {@link #score} over a strided sample
+     * of the training records, for z-normalising later scores.
+     *
+     * @return {mu, sigma}; {0, 1} if no record was long enough to score
+     */
+    private static double[] calibrate(Model m, List<int[]> streams) {
+        double s = 0;
+        double s2 = 0;
+        int n = 0;
+        // Use a stride to avoid scoring every single train record
+        int stride = Math.max(1, streams.size() / 1000);
+        for (int i = 0; i < streams.size(); i += stride) {
+            int[] cps = streams.get(i);
+            if (cps.length < MIN_SCORE_CODEPOINTS) continue;
+            ScoreResult r = score(m, cps);
+            s += r.meanLogP;
+            s2 += r.meanLogP * r.meanLogP;
+            n++;
+        }
+        if (n == 0) return new double[]{0, 1};
+        double mu = s / n;
+        // Population variance, clamped at 0 against FP rounding.
+        double var = Math.max(0, s2 / n - mu * mu);
+        double sigma = Math.sqrt(var);
+        return new double[]{mu, sigma};
+    }
+
+    // -----------------------------------------------------------------------
+    // Scoring
+    // -----------------------------------------------------------------------
+
+    /**
+     * Mean log-probability of the codepoint stream under the model: the
+     * bigram table is used when the Bloom filter says the pair was seen in
+     * training, otherwise a scaled unigram-product backoff.
+     */
+    private static ScoreResult score(Model m, int[] cps) {
+        if (cps.length < 2) {
+            return new ScoreResult(Double.NaN, 0, 0);
+        }
+        double total = 0;
+        int pairs = 0;
+        int bloomHits = 0;
+        for (int i = 1; i < cps.length; i++) {
+            int prev = cps[i - 1];
+            int cur = cps[i];
+            double logP;
+            if (bloomContains(m.bloomBits, m.bloomBitCount, m.bloomK, prev, cur)) {
+                // Pair (probably) observed in training: bigram bucket.
+                int bucket = (int) (fnv1aBigram(prev, cur) & (m.bigramBuckets - 1));
+                logP = m.bigramLogP[bucket];
+                bloomHits++;
+            } else {
+                // Backoff: independence approximation over the two unigrams.
+                int u1 = (int) (fnv1aUnigram(prev) & (m.unigramBuckets - 1));
+                int u2 = (int) (fnv1aUnigram(cur) & (m.unigramBuckets - 1));
+                logP = m.backoffAlpha * (m.unigramLogP[u1] + m.unigramLogP[u2]);
+            }
+            total += logP;
+            pairs++;
+        }
+        return new ScoreResult(total / pairs, pairs, bloomHits);
+    }
+
+    /** Immutable value object produced by {@link #score}. */
+    private static final class ScoreResult {
+        /** Mean log-prob per adjacent pair; NaN for streams shorter than 2. */
+        final double meanLogP;
+        /** Number of adjacent codepoint pairs scored. */
+        final int nPairs;
+        /** How many of those pairs hit the Bloom filter (bigram path). */
+        final int seenPairs;
+
+        ScoreResult(double meanLogP, int nPairs, int seenPairs) {
+            this.meanLogP = meanLogP;
+            this.nPairs = nPairs;
+            this.seenPairs = seenPairs;
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // Eval at one length bucket
+    // -----------------------------------------------------------------------
+
+    /**
+     * Scores every eval record truncated to {@code length} bytes under both
+     * the clean charset and the wrong (mojibake) charset and aggregates
+     * margin statistics.
+     *
+     * @return the populated cell, or null if fewer than 30 usable records
+     */
+    private static EvalCell evalAtLength(Model m, List<byte[]> evalBytes, int length,
+                                         Charset cleanCs, Charset wrongCs) {
+        List<Double> cleans = new ArrayList<>();
+        List<Double> mojis = new ArrayList<>();
+        List<Double> margins = new ArrayList<>();
+        double seenSumClean = 0, seenSumMoji = 0;
+        int nSeenObs = 0;
+        for (byte[] rec : evalBytes) {
+            if (rec.length < length) continue;
+            byte[] slice = Arrays.copyOf(rec, length);
+            int[] cleanCps = toCodepoints(decode(slice, cleanCs));
+            int[] mojiCps = toCodepoints(decode(slice, wrongCs));
+            if (cleanCps.length < MIN_SCORE_CODEPOINTS
+                    || mojiCps.length < MIN_SCORE_CODEPOINTS) continue;
+            ScoreResult sc = score(m, cleanCps);
+            ScoreResult sm = score(m, mojiCps);
+            if (Double.isNaN(sc.meanLogP) || Double.isNaN(sm.meanLogP)) continue;
+            cleans.add(sc.meanLogP);
+            mojis.add(sm.meanLogP);
+            margins.add(sc.meanLogP - sm.meanLogP);
+            // Per-record fraction of pairs that hit the Bloom filter.
+            if (sc.nPairs > 0) seenSumClean += (double) sc.seenPairs / sc.nPairs;
+            if (sm.nPairs > 0) seenSumMoji += (double) sm.seenPairs / sm.nPairs;
+            nSeenObs++;
+        }
+        // Require a minimal sample before reporting statistics for the cell.
+        if (margins.size() < 30) return null;
+        EvalCell cell = new EvalCell();
+        cell.n = margins.size();
+        cell.meanClean = mean(cleans);
+        cell.stdClean = std(cleans, cell.meanClean);
+        cell.meanMoji = mean(mojis);
+        cell.meanMargin = mean(margins);
+        cell.stdMargin = std(margins, cell.meanMargin);
+        cell.p5Margin = percentile(margins, 0.05);
+        cell.p50Margin = percentile(margins, 0.50);
+        cell.bloomSeenFracClean = nSeenObs > 0 ? seenSumClean / nSeenObs : Double.NaN;
+        cell.bloomSeenFracMoji = nSeenObs > 0 ? seenSumMoji / nSeenObs : Double.NaN;
+        return cell;
+    }
+
+ /** Aggregated statistics for one (model, length, charset-pair) eval cell. */
+ private static final class EvalCell {
+ // Number of eval records that contributed to this cell.
+ int n;
+ // Mean / sample-std of the clean-decode mean log-probs.
+ double meanClean, stdClean;
+ // Mean of the mojibake-decode mean log-probs.
+ double meanMoji;
+ // Mean / sample-std of the (clean - moji) margins.
+ double meanMargin, stdMargin;
+ // 5th and 50th percentiles of the margins.
+ double p5Margin, p50Margin;
+ // Mean per-record fraction of pairs that hit the Bloom filter.
+ double bloomSeenFracClean, bloomSeenFracMoji;
+ }
+
+ // -----------------------------------------------------------------------
+ // FNV-1a hashing for codepoint bigram / unigram + Bloom filter
+ // -----------------------------------------------------------------------
+
+ // Standard 64-bit FNV-1a offset basis and prime.
+ private static final long FNV_OFFSET = 0xcbf29ce484222325L;
+ private static final long FNV_PRIME = 0x100000001b3L;
+
+ // 64-bit FNV-1a over the 4 big-endian bytes of cp1, a 0xFF separator
+ // byte, then the 4 bytes of cp2. The separator keeps distinct (cp1,cp2)
+ // pairs from hashing like one shifted byte sequence.
+ private static long fnv1aBigram(int cp1, int cp2) {
+ long h = FNV_OFFSET;
+ h = (h ^ ((cp1 >>> 24) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp1 >>> 16) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp1 >>> 8) & 0xFF)) * FNV_PRIME;
+ h = (h ^ (cp1 & 0xFF)) * FNV_PRIME;
+ h = (h ^ 0xFF) * FNV_PRIME; // separator
+ h = (h ^ ((cp2 >>> 24) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp2 >>> 16) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp2 >>> 8) & 0xFF)) * FNV_PRIME;
+ h = (h ^ (cp2 & 0xFF)) * FNV_PRIME;
+ return h;
+ }
+
+ // 64-bit FNV-1a over the 4 big-endian bytes of a single codepoint.
+ private static long fnv1aUnigram(int cp) {
+ long h = FNV_OFFSET;
+ h = (h ^ ((cp >>> 24) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp >>> 16) & 0xFF)) * FNV_PRIME;
+ h = (h ^ ((cp >>> 8) & 0xFF)) * FNV_PRIME;
+ h = (h ^ (cp & 0xFF)) * FNV_PRIME;
+ return h;
+ }
+
+ // Second hash for Bloom double-hashing: bit-reverses both codepoints and
+ // mixes with odd constants (the MurmurHash3 fmix64 constants) so it
+ // decorrelates from fnv1aBigram's byte-serial structure.
+ private static long secondaryHash(int cp1, int cp2) {
+ // Independent secondary hash for Bloom double-hashing. Just shuffle
+ // the inputs differently.
+ long h = 0xff51afd7ed558ccdL;
+ h = (h ^ Integer.reverse(cp1)) * 0xc4ceb9fe1a85ec53L;
+ h = (h ^ Integer.reverse(cp2)) * 0xc4ceb9fe1a85ec53L;
+ h ^= h >>> 33;
+ return h;
+ }
+
+ // Inserts bigram (cp1,cp2): k probe positions are derived by double
+ // hashing, pos_i = (h1 + i*h2) mod bitCount.
+ private static void bloomAdd(long[] bits, int bitCount, int k, int cp1, int cp2) {
+ long h1 = fnv1aBigram(cp1, cp2);
+ long h2 = secondaryHash(cp1, cp2);
+ for (int i = 0; i < k; i++) {
+ // Mask to non-negative before mod so pos is always in [0, bitCount).
+ long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount;
+ bits[(int) (pos >>> 6)] |= 1L << (pos & 63);
+ }
+ }
+
+ // Membership test mirroring bloomAdd's probe sequence. A false return
+ // means the bigram was definitely never inserted; true may be a Bloom
+ // false positive.
+ private static boolean bloomContains(long[] bits, int bitCount, int k,
+ int cp1, int cp2) {
+ long h1 = fnv1aBigram(cp1, cp2);
+ long h2 = secondaryHash(cp1, cp2);
+ for (int i = 0; i < k; i++) {
+ long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount;
+ if ((bits[(int) (pos >>> 6)] & (1L << (pos & 63))) == 0) return false;
+ }
+ return true;
+ }
+
+    /** Packs an ordered codepoint pair into a single long key. */
+    private static long packPair(int cp1, int cp2) {
+        return ((long) cp1 << 32) | Integer.toUnsignedLong(cp2);
+    }
+
+    // -----------------------------------------------------------------------
+    // I/O and decode utilities (copied from EvalJunkOnCharsetDevtest)
+    // -----------------------------------------------------------------------
+
+    /**
+     * Reads length-prefixed (unsigned-short, big-endian) records from a
+     * gzipped file, up to {@code maxRecords}. EOF between records ends the
+     * read cleanly; truncation inside a record raises EOFException.
+     */
+    private static List<byte[]> readRecords(Path file, int maxRecords) throws IOException {
+        List<byte[]> records = new ArrayList<>();
+        try (FileInputStream fis = new FileInputStream(file.toFile());
+                GZIPInputStream gis = new GZIPInputStream(fis);
+                DataInputStream dis = new DataInputStream(gis)) {
+            while (records.size() < maxRecords) {
+                int len;
+                try {
+                    len = dis.readUnsignedShort();
+                } catch (EOFException eof) {
+                    break; // clean end of stream between records
+                }
+                byte[] rec = new byte[len];
+                dis.readFully(rec);
+                records.add(rec);
+            }
+        }
+        return records;
+    }
+
+    /** Decodes bytes with REPLACE semantics for malformed/unmappable input. */
+    private static String decode(byte[] bytes, Charset cs) {
+        CharsetDecoder decoder = cs.newDecoder();
+        decoder.onMalformedInput(CodingErrorAction.REPLACE);
+        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        try {
+            return decoder.decode(ByteBuffer.wrap(bytes)).toString();
+        } catch (CharacterCodingException e) {
+            // Should not happen with REPLACE; decode defensively anyway.
+            return new String(bytes, cs);
+        }
+    }
+
+    /** Converts a string to its Unicode codepoints, one int per codepoint. */
+    private static int[] toCodepoints(String s) {
+        // Equivalent to a manual codePointAt/charCount walk over the string.
+        return s.codePoints().toArray();
+    }
+
+    // -----------------------------------------------------------------------
+    // Stats
+    // -----------------------------------------------------------------------
+
+    /** NaN-ignoring arithmetic mean; NaN when no non-NaN values exist. */
+    private static double mean(List<Double> xs) {
+        double s = 0;
+        int n = 0;
+        for (double v : xs) {
+            if (!Double.isNaN(v)) {
+                s += v;
+                n++;
+            }
+        }
+        return n == 0 ? Double.NaN : s / n;
+    }
+
+    /**
+     * NaN-ignoring sample standard deviation (n-1 denominator) around the
+     * precomputed mean {@code mu}; 0 when fewer than two usable values.
+     */
+    private static double std(List<Double> xs, double mu) {
+        if (xs.size() < 2) return 0;
+        double ss = 0;
+        int n = 0;
+        for (double v : xs) {
+            if (!Double.isNaN(v)) {
+                ss += (v - mu) * (v - mu);
+                n++;
+            }
+        }
+        return n < 2 ? 0 : Math.sqrt(ss / (n - 1));
+    }
+
+    /**
+     * NaN-ignoring percentile using floor indexing on the sorted values
+     * (no interpolation); NaN if no usable values remain.
+     */
+    private static double percentile(List<Double> xs, double p) {
+        List<Double> sorted = new ArrayList<>(xs);
+        sorted.removeIf(v -> Double.isNaN(v));
+        if (sorted.isEmpty()) return Double.NaN;
+        Collections.sort(sorted);
+        int idx = (int) Math.floor(p * (sorted.size() - 1));
+        return sorted.get(idx);
+    }
+
+ // -----------------------------------------------------------------------
+ // Model
+ // -----------------------------------------------------------------------
+
+ /** Immutable trained model: hashed log-prob tables plus Bloom filter. */
+ private static final class Model {
+ // Hash-table sizes (powers of two; used as bit masks).
+ final int bigramBuckets;
+ final int unigramBuckets;
+ // Smoothed log P per hash bucket.
+ final float[] bigramLogP;
+ final float[] unigramLogP;
+ // Bloom filter over observed bigrams: bit array, size in bits, probes.
+ final long[] bloomBits;
+ final int bloomBitCount;
+ final int bloomK;
+ // Multiplier applied to the unigram backoff log-prob in score().
+ final double backoffAlpha;
+ Model(int bb, int ub, float[] blp, float[] ulp,
+ long[] bloom, int bbc, int bk, double a) {
+ this.bigramBuckets = bb;
+ this.unigramBuckets = ub;
+ this.bigramLogP = blp;
+ this.unigramLogP = ulp;
+ this.bloomBits = bloom;
+ this.bloomBitCount = bbc;
+ this.bloomK = bk;
+ this.backoffAlpha = a;
+ }
+ }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
new file mode 100644
index 00000000000..b384d5f4c51
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Codepoint-level script census of one or more text files. For each input
+ * file, reports the percentage of codepoints in each {@link
+ * Character.UnicodeScript}, optionally per-line script-mix histograms.
+ *
+ * Useful to verify whether {@code BuildJunkTrainingData} is bucketing
+ * languages correctly: e.g. Japanese is usually a mix of HIRAGANA, KATAKANA
+ * and HAN; if {@code jpn} ends up in {@code han.train.gz} we want to know
+ * what fraction of its codepoints are actually Han ideographs vs. kana.
+ *
+ *
+ * <p>Usage:
+ *
+ * java ScriptCensus <file> [file ...] # supports .gz and plain text
+ *
+ */
+public final class ScriptCensus {
+
+    /** Max lines to sample per file (set high for full pass). */
+    private static final int MAX_LINES = 200_000;
+
+    private ScriptCensus() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            // Matches the class javadoc: at least one file argument required.
+            System.err.println("Usage: ScriptCensus <file> [file ...]");
+            System.exit(1);
+        }
+        for (String arg : args) {
+            Path f = Paths.get(arg);
+            if (!Files.isRegularFile(f)) {
+                System.err.println("Skipping non-file: " + f);
+                continue;
+            }
+            reportOne(f);
+            System.out.println();
+        }
+    }
+
+    /** Prints the full script census (distribution + histogram) for one file. */
+    private static void reportOne(Path file) throws IOException {
+        // Total codepoint count per script name (long[1] as mutable counter).
+        Map<String, long[]> scriptCounts = new HashMap<>();
+        // Per-line dominant-script histogram.
+        Map<String, long[]> dominantHistogram = new HashMap<>();
+        long total = 0;
+        long lines = 0;
+        long sampledChars = 0; // chars (not bytes); informational only
+
+        try (BufferedReader r = open(file)) {
+            String line;
+            while ((line = r.readLine()) != null && lines < MAX_LINES) {
+                lines++;
+                sampledChars += line.length();
+                // For MADLAD/Wikipedia files the format is "lineNum TAB text";
+                // strip the prefix if present.
+                int tab = line.indexOf('\t');
+                String text = tab >= 0 ? line.substring(tab + 1) : line;
+
+                Map<String, Long> perLine = new HashMap<>();
+                for (int i = 0; i < text.length(); ) {
+                    int cp = text.codePointAt(i);
+                    i += Character.charCount(cp);
+                    Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+                    if (s == Character.UnicodeScript.COMMON
+                            || s == Character.UnicodeScript.INHERITED
+                            || s == Character.UnicodeScript.UNKNOWN) {
+                        continue; // shared punctuation/digits carry no signal
+                    }
+                    String name = s.name();
+                    scriptCounts.computeIfAbsent(name, k -> new long[1])[0]++;
+                    perLine.merge(name, 1L, Long::sum);
+                    total++;
+                }
+                // Identify the dominant script for this line.
+                String dom = null;
+                long best = -1;
+                for (Map.Entry<String, Long> e : perLine.entrySet()) {
+                    if (e.getValue() > best) {
+                        best = e.getValue();
+                        dom = e.getKey();
+                    }
+                }
+                if (dom != null) {
+                    dominantHistogram.computeIfAbsent(dom, k -> new long[1])[0]++;
+                }
+            }
+        }
+
+        System.out.printf("File: %s%n", file);
+        System.out.printf(" lines sampled: %,d total codepoints (excl. COMMON/INHERITED): %,d%n%n",
+                lines, total);
+
+        if (total == 0) {
+            System.out.println(" (empty / no scripted codepoints)");
+            return;
+        }
+
+        System.out.println(" Codepoint distribution by script:");
+        List<Map.Entry<String, long[]>> sorted = new ArrayList<>(scriptCounts.entrySet());
+        sorted.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> e.getValue()[0]).reversed());
+        long cumulative = 0;
+        for (Map.Entry<String, long[]> e : sorted) {
+            long c = e.getValue()[0];
+            cumulative += c;
+            double pct = 100.0 * c / total;
+            double cumPct = 100.0 * cumulative / total;
+            if (pct < 0.01 && c < 100) continue; // suppress trace noise
+            System.out.printf(" %-22s %,14d %6.2f%% (cum %6.2f%%)%n",
+                    e.getKey(), c, pct, cumPct);
+        }
+
+        System.out.println();
+        System.out.println(" Per-line dominant-script histogram:");
+        List<Map.Entry<String, long[]>> dom = new ArrayList<>(dominantHistogram.entrySet());
+        dom.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> e.getValue()[0]).reversed());
+        long domTotal = 0;
+        for (long[] v : dominantHistogram.values()) domTotal += v[0];
+        for (Map.Entry<String, long[]> e : dom) {
+            long c = e.getValue()[0];
+            double pct = 100.0 * c / domTotal;
+            if (pct < 0.05) continue;
+            System.out.printf(" %-22s %,12d %6.2f%% of lines%n",
+                    e.getKey(), c, pct);
+        }
+    }
+
+    /** Opens plain text or .gz transparently, always decoded as UTF-8. */
+    private static BufferedReader open(Path path) throws IOException {
+        if (path.getFileName().toString().endsWith(".gz")) {
+            return new BufferedReader(new InputStreamReader(
+                    new GZIPInputStream(Files.newInputStream(path)),
+                    StandardCharsets.UTF_8));
+        }
+        return Files.newBufferedReader(path, StandardCharsets.UTF_8);
+    }
+}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index fe99f3214e3..cf52a9eedfc 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -41,6 +41,9 @@
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
+import org.apache.tika.ml.junkdetect.JunkDetector;
+import org.apache.tika.ml.junkdetect.V7Tables;
+
/**
* Trains the junk detector model from per-script corpus files produced by
* {@link BuildJunkTrainingData}.
@@ -124,7 +127,17 @@
public class TrainJunkModel {
static final String MAGIC = "JUNKDET1";
- static final byte VERSION = 5;
+ /** Sole supported file-format version. Matches JunkDetector.VERSION. */
+ static final byte VERSION = 7;
+
+ // -----------------------------------------------------------------------
+ // v7 model constants (per-script open-addressing codepoint-bigram tables)
+ // -----------------------------------------------------------------------
+
+ /** Unigram backoff multiplier. α=1.0 = plain independence; prototype validated. */
+ static final float V7_BACKOFF_ALPHA = 1.0f;
+ /** Additive smoothing constant for log-prob computation. */
+ static final double V7_ADD_ALPHA = 0.01;
/** Number of clean (and corrupted) windows used to train the per-script classifier. */
static final int NUM_CLASSIFIER_SAMPLES = 500;
@@ -179,6 +192,25 @@ public static void main(String[] args) throws IOException {
"datasets", "madlad", "junkdetect");
Path output = dataDir.resolve("junkdetect.bin");
+ // Durable training parameters live in JunkDetectorTrainingConfig; this
+ // tool deliberately refuses CLI overrides so a built model file's
+ // identity always matches a committed config.
+ int minBigramCount = JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT;
+ double loadFactor = JunkDetectorTrainingConfig.OA_LOAD_FACTOR;
+ int keyIndexBits = JunkDetectorTrainingConfig.KEY_INDEX_BITS;
+ if (minBigramCount < 1) {
+ System.err.println("ERROR: MIN_BIGRAM_COUNT must be >= 1");
+ System.exit(1);
+ }
+ if (loadFactor <= 0 || loadFactor >= 1) {
+ System.err.println("ERROR: OA_LOAD_FACTOR must be in (0, 1), got " + loadFactor);
+ System.exit(1);
+ }
+ if (keyIndexBits < 1 || keyIndexBits > 16) {
+ System.err.println("ERROR: KEY_INDEX_BITS must be in [1, 16], got " + keyIndexBits);
+ System.exit(1);
+ }
+
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
case "--data-dir":
@@ -187,6 +219,12 @@ public static void main(String[] args) throws IOException {
case "--output":
output = Paths.get(args[++i]);
break;
+ case "--bloom-bits":
+ case "--min-bigram-count":
+ System.err.println("ERROR: " + args[i] + " is no longer a CLI option."
+ + " Edit JunkDetectorTrainingConfig and commit the change instead.");
+ System.exit(1);
+ break;
default:
System.err.println("Unknown argument: " + args[i]);
printUsage();
@@ -194,31 +232,34 @@ public static void main(String[] args) throws IOException {
}
}
- System.out.println("=== TrainJunkModel (v5) ===");
- System.out.println(" data-dir: " + dataDir);
- System.out.println(" output: " + output);
+ System.out.println("=== TrainJunkModel ===");
+ System.out.println(" data-dir: " + dataDir);
+ System.out.println(" output: " + output);
+ System.out.println(" --- v7 format constants (TrainJunkModel) ---");
+ System.out.printf( " backoff_alpha: %.2f%n", V7_BACKOFF_ALPHA);
+ System.out.println(" --- config (JunkDetectorTrainingConfig) ---");
+ System.out.printf( " min_bigram_count: %d%n", minBigramCount);
+ System.out.printf( " oa_load_factor: %.2f%n", loadFactor);
+ System.out.printf( " key_index_bits: %d%n", keyIndexBits);
if (!Files.isDirectory(dataDir)) {
System.err.println("ERROR: data-dir not found: " + dataDir);
System.exit(1);
}
- System.out.print("Building Unicode named-block index... ");
+ int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount();
+ System.out.printf("Block bucketing: %d named blocks + 1 unassigned "
+ + "(scheme version %d, JVM-independent)%n",
+ blockN - 1, org.apache.tika.ml.junkdetect.UnicodeBlockRanges.SCHEME_VERSION);
long t0 = System.currentTimeMillis();
- Map blockIndex = buildBlockIndex();
- int blockN = blockIndex.size() + 1;
- System.out.printf("%d named blocks → table size %d×%d (%dms)%n",
- blockIndex.size(), blockN, blockN, System.currentTimeMillis() - t0);
-
- TreeMap bigramTables = new TreeMap<>();
- TreeMap bigramCalibrations = new TreeMap<>();
- TreeMap blockTables = new TreeMap<>();
- TreeMap blockCalibrations = new TreeMap<>();
+
+ TreeMap f1Calibrations = new TreeMap<>();
+ TreeMap blockTables = new TreeMap<>();
+ TreeMap blockCalibrations = new TreeMap<>();
TreeMap controlCalibrations = new TreeMap<>();
- TreeMap classifierWeights = new TreeMap<>();
- TreeMap devFilePaths = new TreeMap<>();
- List allTrainFiles = new ArrayList<>();
- List allDevFiles = new ArrayList<>();
+ TreeMap classifierWeights = new TreeMap<>();
+ TreeMap trainFilePaths = new TreeMap<>();
+ List allTrainFiles = new ArrayList<>();
List trainFiles;
try (var stream = Files.list(dataDir)) {
@@ -234,69 +275,62 @@ public static void main(String[] args) throws IOException {
}
// -----------------------------------------------------------------------
- // Phase 1 — per-script bigram tables, block tables, calibrations
+ // Phase 1 — per-script F1 tables (V7), F1 calibration, F2 block tables,
+ // F3 control-byte calibration
// -----------------------------------------------------------------------
- System.out.println("\n--- Phase 1: per-script tables and calibrations ---");
+ TreeMap f1TablesByScript = new TreeMap<>();
+ System.out.println("\n--- Phase 1: per-script F1 tables + calibrations ---");
for (Path trainFile : trainFiles) {
String filename = trainFile.getFileName().toString();
String script = filename.substring(0, filename.length() - ".train.gz".length())
.toUpperCase();
- Path devFile = trainFile.getParent().resolve(
- filename.replace(".train.gz", ".dev.gz"));
System.out.printf("%n [%s]%n", script);
allTrainFiles.add(trainFile);
t0 = System.currentTimeMillis();
- System.out.print(" Training byte-bigram table... ");
- float[] bigramTable = trainBigramTable(trainFile);
- System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0);
+ System.out.print(" Training V7 F1 tables (cp index + OA)..");
+ V7Tables v7 = trainV7TablesForScript(trainFile, minBigramCount,
+ loadFactor, keyIndexBits);
+ System.out.printf(" done (%dms)%n", System.currentTimeMillis() - t0);
+ System.out.println(v7.statsString());
+ f1TablesByScript.put(script, v7);
t0 = System.currentTimeMillis();
- System.out.print(" Training named-block table... ");
- float[] blockTable = trainBlockTable(trainFile, blockIndex, blockN);
+ System.out.print(" Training named-block table... ");
+ float[] blockTable = trainBlockTable(trainFile);
System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0);
- float[] bigramCal = new float[]{0f, 1f};
- float[] blockCal = new float[]{0f, 1f};
- float[] controlCal = new float[]{0f, 1f};
-
- if (Files.exists(devFile)) {
- t0 = System.currentTimeMillis();
- System.out.print(" Calibrating byte bigrams on dev... ");
- bigramCal = computeBigramCalibration(devFile, bigramTable);
- System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
- bigramCal[0], bigramCal[1], System.currentTimeMillis() - t0);
-
- t0 = System.currentTimeMillis();
- System.out.print(" Calibrating named blocks on dev... ");
- blockCal = computeBlockCalibration(devFile, blockTable, blockIndex, blockN);
- System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
- blockCal[0], blockCal[1], System.currentTimeMillis() - t0);
-
- t0 = System.currentTimeMillis();
- System.out.print(" Calibrating control bytes on dev...");
- controlCal = computeControlByteCalibration(devFile);
- System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n",
- controlCal[0], controlCal[1], System.currentTimeMillis() - t0);
-
- devFilePaths.put(script, devFile);
- allDevFiles.add(devFile);
- } else {
- System.out.println(" WARNING: no dev file found, using uncalibrated defaults");
- }
+ t0 = System.currentTimeMillis();
+ System.out.print(" Calibrating F1 (cp-hash) on train.. ");
+ float[] f1Cal = calibrateF1PerScript(trainFile, v7);
+ System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
+ f1Cal[0], f1Cal[1], System.currentTimeMillis() - t0);
+
+ t0 = System.currentTimeMillis();
+ System.out.print(" Calibrating named blocks on train...");
+ float[] blockCal = computeBlockCalibration(trainFile, blockTable);
+ System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
+ blockCal[0], blockCal[1], System.currentTimeMillis() - t0);
- bigramTables.put(script, bigramTable);
- bigramCalibrations.put(script, bigramCal);
+ t0 = System.currentTimeMillis();
+ System.out.print(" Calibrating control bytes on train..");
+ float[] controlCal = computeControlByteCalibration(trainFile);
+ System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n",
+ controlCal[0], controlCal[1], System.currentTimeMillis() - t0);
+
+ trainFilePaths.put(script, trainFile);
+
+ f1Calibrations.put(script, f1Cal);
blockTables.put(script, blockTable);
blockCalibrations.put(script, blockCal);
controlCalibrations.put(script, controlCal);
- // Placeholder — set in phase 3
+ // Placeholder — set in Phase 3
classifierWeights.put(script, new float[]{1f / 4, 1f / 4, 1f / 4, 1f / 4, 0f});
}
// -----------------------------------------------------------------------
- // Phase 2 — global script-transition table
+ // Phase 2 — global script-transition table + supporting pools
// -----------------------------------------------------------------------
System.out.println("\n--- Phase 2: global script-transition table ---");
List scriptBuckets = buildScriptBuckets();
@@ -314,7 +348,7 @@ public static void main(String[] args) throws IOException {
t0 = System.currentTimeMillis();
System.out.print(" Calibrating script transitions... ");
- float[] scriptTransCal = calibrateScriptTransitions(allDevFiles, scriptTransTable,
+ float[] scriptTransCal = calibrateScriptTransitions(allTrainFiles, scriptTransTable,
scriptBucketMap, numScriptBuckets);
System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
scriptTransCal[0], scriptTransCal[1], System.currentTimeMillis() - t0);
@@ -334,21 +368,21 @@ public static void main(String[] args) throws IOException {
System.out.printf("%d tables built%n", remapTables.size());
// -----------------------------------------------------------------------
- // Phase 3 — per-script linear classifiers (now with z4)
+ // Phase 3 — per-script linear classifiers using v6 features
// -----------------------------------------------------------------------
System.out.println("\n--- Phase 3: per-script linear classifiers (z1,z2,z3,z4) ---");
- for (String script : bigramTables.keySet()) {
- Path devFile = devFilePaths.get(script);
- if (devFile == null) {
- System.out.printf(" [%s] WARNING: no dev file, keeping equal-weight defaults%n", script);
+ for (String script : f1Calibrations.keySet()) {
+ Path trainFile = trainFilePaths.get(script);
+ if (trainFile == null) {
+ System.out.printf(" [%s] WARNING: no train file, keeping equal-weight defaults%n", script);
continue;
}
t0 = System.currentTimeMillis();
System.out.printf(" [%s] training classifier... ", script);
- float[] weights = trainClassifier(devFile,
- bigramTables.get(script), bigramCalibrations.get(script),
+ float[] weights = trainClassifierV7(trainFile,
+ f1TablesByScript.get(script), f1Calibrations.get(script),
blockTables.get(script), blockCalibrations.get(script),
- controlCalibrations.get(script), blockIndex, blockN,
+ controlCalibrations.get(script),
scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets,
scriptCodepoints, remapTables);
classifierWeights.put(script, weights);
@@ -358,82 +392,31 @@ public static void main(String[] args) throws IOException {
}
System.out.printf("%nWriting model (%d scripts, blockN=%d, scriptBuckets=%d) → %s%n",
- bigramTables.size(), blockN, numScriptBuckets, output);
- saveModel(bigramTables, bigramCalibrations,
+ f1Calibrations.size(), blockN, numScriptBuckets, output);
+ saveModelV7(f1TablesByScript, f1Calibrations,
blockTables, blockCalibrations,
controlCalibrations, classifierWeights,
- blockIndex, blockN, scriptBuckets, scriptTransTable, scriptTransCal, output);
- System.out.printf("Model size: %,d bytes (%.1f MB)%n",
- Files.size(output), Files.size(output) / 1_000_000.0);
+ scriptBuckets, scriptTransTable, scriptTransCal,
+ output);
+ System.out.printf("Model size: %,d bytes (%.1f KB)%n",
+ Files.size(output), Files.size(output) / 1024.0);
System.out.println("Done.");
}
- // -----------------------------------------------------------------------
- // Block index
- // -----------------------------------------------------------------------
-
- /**
- * Builds a stable ordered mapping from {@link Character.UnicodeBlock} to integer index
- * by scanning all valid Unicode codepoints in order (U+0000 to U+10FFFF) and
- * recording each block's first occurrence.
- *
- * The resulting map has {@code size()} entries (one per named block).
- * Callers should reserve index {@code size()} as the "unassigned" bucket
- * (for codepoints where {@code UnicodeBlock.of(cp)} returns null).
- *
- * @return immutable ordered map: UnicodeBlock → integer index [0, size)
- */
- static Map buildBlockIndex() {
- LinkedHashMap index = new LinkedHashMap<>();
- for (int cp = 0; cp <= 0x10FFFF; cp++) {
- Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
- if (b != null) index.putIfAbsent(b, index.size());
- }
- return Collections.unmodifiableMap(index);
- }
-
// -----------------------------------------------------------------------
// Training
// -----------------------------------------------------------------------
/**
- * Trains a 256×256 byte-bigram log-probability table from a gzipped sentence file.
- *
- * @return float[65536] where index {@code a*256+b} = log P(b|a)
- */
- static float[] trainBigramTable(Path trainGz) throws IOException {
- long[] counts = new long[65536];
- long totalBigrams = 0;
- long sentences = 0;
-
- try (BufferedReader r = openGzipped(trainGz)) {
- String line;
- while ((line = r.readLine()) != null) {
- byte[] bytes = line.getBytes(StandardCharsets.UTF_8);
- for (int i = 0; i + 1 < bytes.length; i++) {
- counts[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)]++;
- totalBigrams++;
- }
- sentences++;
- }
- }
-
- System.out.printf(" %,d sentences, %,d byte bigrams%n", sentences, totalBigrams);
- return laplaceSmoothLogProb(counts, 256);
- }
-
- /**
- * Trains a {@code blockN×blockN} named-Unicode-block transition log-probability table.
+ * Trains a {@code N × N} block-transition log-probability table where
+ * {@code N = UnicodeBlockRanges.bucketCount()}. Block bucketing uses
+ * the JVM-independent {@link UnicodeBlockRanges} table.
*
- * @param blockIndex ordered mapping from UnicodeBlock to index [0, blockIndex.size())
- * @param blockN blockIndex.size() + 1 (includes the null bucket)
- * @return float[blockN*blockN] where index {@code a*blockN+b} = log P(block_b | block_a)
+ * @return float[N*N] where index {@code a*N+b} = log P(block_b | block_a)
*/
- static float[] trainBlockTable(Path trainGz,
- Map<Character.UnicodeBlock, Integer> blockIndex,
- int blockN) throws IOException {
+ static float[] trainBlockTable(Path trainGz) throws IOException {
+ int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount();
long[] counts = new long[blockN * blockN];
- int nullId = blockN - 1;
long totalBigrams = 0;
long sentences = 0;
@@ -443,8 +426,7 @@ static float[] trainBlockTable(Path trainGz,
int prev = -1;
for (int i = 0; i < line.length(); ) {
int cp = line.codePointAt(i);
- Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
- int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId;
+ int blockId = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp);
if (prev >= 0) {
counts[prev * blockN + blockId]++;
totalBigrams++;
@@ -533,37 +515,17 @@ static List<String> sampleSubstrings(Path devGz, int nSamples,
return result;
}
- /** @return float[2] = {mu, sigma} of byte-bigram mean log-prob on dev windows */
- static float[] computeBigramCalibration(Path devGz, float[] bigramTable) throws IOException {
- List<String> windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 42);
- List<Double> scores = new ArrayList<>(windows.size());
- for (String window : windows) {
- byte[] bytes = window.getBytes(StandardCharsets.UTF_8);
- if (bytes.length < 2) continue;
- double sum = 0;
- for (int i = 0; i + 1 < bytes.length; i++) {
- sum += bigramTable[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)];
- }
- scores.add(sum / (bytes.length - 1));
- }
- System.out.printf(" %,d dev windows%n", scores.size());
- return muSigma(scores);
- }
-
/** @return float[2] = {mu, sigma} of block-transition mean log-prob on dev windows */
- static float[] computeBlockCalibration(Path devGz, float[] blockTable,
- Map<Character.UnicodeBlock, Integer> blockIndex,
- int blockN) throws IOException {
+ static float[] computeBlockCalibration(Path devGz, float[] blockTable) throws IOException {
+ int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount();
List<String> windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 43);
List<Double> scores = new ArrayList<>(windows.size());
- int nullId = blockN - 1;
for (String window : windows) {
int[] ids = new int[window.length()];
int len = 0;
for (int i = 0; i < window.length(); ) {
int cp = window.codePointAt(i);
- Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
- ids[len++] = b != null ? blockIndex.getOrDefault(b, nullId) : nullId;
+ ids[len++] = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp);
i += Character.charCount(cp);
}
if (len < 2) continue;
@@ -623,166 +585,15 @@ static float[] computeControlByteCalibration(Path devGz) throws IOException {
* @param remapTables list of pre-built wrong-codec remap tables from {@link #buildRemapTable}
* @return float[5] = {w1, w2, w3, w4, bias} — classifier weights; positive logit = clean
*/
- static float[] trainClassifier(Path devGz,
- float[] bigramTable, float[] bigramCal,
- float[] blockTable, float[] blockCal,
- float[] controlCal,
- Map<Character.UnicodeBlock, Integer> blockIndex,
- int blockN,
- float[] scriptTransTable, float[] scriptTransCal,
- Map<String, Integer> scriptBucketMap, int numScriptBuckets,
- Map<String, List<Integer>> scriptCodepoints,
- List