diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java index 8c054b0ef75..c1f78cebb68 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityComparison.java @@ -50,8 +50,8 @@ public TextQualityComparison(String winner, float delta, } /** - * Returns {@code "A"} if candidate A is cleaner, {@code "B"} otherwise. - * Check {@link #delta()} to gauge confidence. + * Returns the label of the cleaner candidate ({@link #labelA()} or + * {@link #labelB()}). Check {@link #delta()} to gauge confidence. */ public String winner() { return winner; @@ -88,8 +88,7 @@ public String labelB() { @Override public String toString() { return String.format(java.util.Locale.ROOT, - "TextQualityComparison[winner=%s(%s) delta=%.3f A=%s B=%s]", - winner, winner.equals("A") ? labelA : labelB, - delta, scoreA, scoreB); + "TextQualityComparison[winner=%s delta=%.3f A=%s(%s) B=%s(%s)]", + winner, delta, labelA, scoreA, labelB, scoreB); } } diff --git a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java index d832b5a169d..b91315e7272 100644 --- a/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java +++ b/tika-core/src/main/java/org/apache/tika/quality/TextQualityDetector.java @@ -37,7 +37,7 @@ * // Arbitrate between two charset decodings * TextQualityComparison cmp = detector.compare("cp1252", decodedAsCp1252, * "cp1251", decodedAsCp1251); - * String winner = cmp.winner(); // "A" or "B" + * String winner = cmp.winner(); // returns the chosen label, e.g. 
"cp1251" * } */ public interface TextQualityDetector { diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index 1719043f408..5635f6f168d 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -41,26 +41,35 @@ * Language-agnostic text quality scorer. Discriminates clean UTF-8 text from * mojibake, reversed text, wrong-codec decodings, and other corruption forms. * - *

Scoring combines up to three features, depending on the model version: + *

Scoring combines four features: *

    - *
  1. Byte-bigram log-probability — 256×256 table of log P(b|a) over - * consecutive byte pairs in the UTF-8 encoding.
  2. - *
  3. Unicode named-block transition log-probability (version 2+) — - * N×N table of log P(block_b | block_a) where block IDs are the named - * {@link Character.UnicodeBlock} values (BASIC_LATIN, ARABIC, - * CJK_UNIFIED_IDEOGRAPHS, etc.).
  4. - *
  5. Control-byte fraction (version 2+) — fraction of bytes in control + *
  6. Codepoint-bigram log-probability (F1) — global hashed table + * indexed by FNV-1a(cp_a, cp_b, seed) into {@code bigramBuckets} cells. + * A Bloom filter records seen pairs; unseen pairs fall back to a + * hashed-unigram independence-assumption score + * {@code α * (log P(cp_a) + log P(cp_b))}.
  7. + *
  8. Unicode named-block transition log-probability (F2) — + * per-script N×N table over {@link Character.UnicodeBlock} values.
  9. + *
  10. Control-byte fraction (F3) — fraction of bytes in control * ranges [0x01–0x08, 0x0B, 0x0C, 0x0E–0x1F, 0x7F].
  11. + *
  12. Global script-transition log-probability (F4) — single + * transition table over raw {@link Character.UnicodeScript} values, + * capturing document-level cross-script anomalies.
  13. *
* - *

All features are calibrated (mu/sigma) on held-out dev text so their z-scores - * are on a common scale. + *

All features are calibrated per-script (mu/sigma) on held-out dev text + * so their z-scores are on a common scale. z-scores are combined by a + * per-script linear classifier: + * {@code logit = w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are + * fit on clean vs. corrupted dev windows. Natural junk threshold is 0 + * (positive logit = clean); use negative thresholds for conservative + * detection.

* - *

Features are combined by a per-script logistic regression classifier: - * {@code w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias}, where weights are fit on - * clean vs. corrupted dev windows. The natural junk threshold is 0 (positive - * logit = clean); use a negative threshold for conservative detection - * (e.g., {@code score < -1}).

+ *

Model file format: a single binary spec (see {@link #load(InputStream)} + * javadoc). No backwards-compat fallback to older formats — the loader + * rejects mismatched version bytes with a clear error. This is + * intentional: keeping parallel scoring paths is a known source of silent + * miscalibration bugs. * *

Instances are immutable and thread-safe after construction. * @@ -72,7 +81,7 @@ * * // Arbitrate between two charset decodings * TextQualityComparison result = detector.compare("cp1252", ascp1252, "cp1251", ascp1251); - * String winner = result.winner(); // "A" or "B" + * String winner = result.winner(); // returns "cp1252" or "cp1251" * } */ public final class JunkDetector implements TextQualityDetector { @@ -82,68 +91,54 @@ public final class JunkDetector implements TextQualityDetector { "org/apache/tika/ml/junkdetect/junkdetect.bin"; static final String MAGIC = "JUNKDET1"; + /** Sole supported file-format version. Mismatch is a hard error. */ + static final int VERSION = 7; - private final int modelVersion; + // Feature 1 — per-script open-addressed codepoint-bigram tables. + // No global Bloom: empty-slot is the membership oracle. + private final Map f1TablesByScript; - // Feature 1: byte bigrams (all versions) - private final Map tables; // script → float[65536] log-prob + /** Per-script F1 calibration on the codepoint-hash mean log-prob. */ private final Map calibrations; // script → float[2] {mu, sigma} - // Feature 2: named-block transitions (version 2+); null for v1 models - private final Map blockTables; // script → float[blockN*blockN] - private final Map blockCalibrations; // script → float[2] {mu, sigma} - private final int blockN; // block table dimension (0 for v1) + // Feature 2 — per-script block transition. Block bucketing uses the + // JVM-independent {@link UnicodeBlockRanges} static table; table size + // per script is {@code bucketCount()²} floats. 
+ private final Map blockTables; + private final Map blockCalibrations; - // Feature 3: control-byte fraction (version 2+); null for v1 models - private final Map controlCalibrations; // script → float[2] {mu, sigma} + // Feature 3 — per-script control-byte fraction calibration + private final Map controlCalibrations; - // Feature combination: per-script linear classifier (version 3+); null for v1/v2 models - // float[numFeatures+1] = {w1, ..., wN, bias}; positive logit = clean - private final Map classifierWeights; - - // Feature 4: global script-transition (version 4+); null for v1/v2/v3 models - // One global table: float[numScriptBuckets * numScriptBuckets] log P(script_b | script_a) - // Uses raw UnicodeScript names (not SCRIPT_MODEL_FALLBACK) to distinguish HIRAGANA/KATAKANA/HAN. + // Feature 4 — single global script-transition table private final float[] scriptTransitionTable; - private final float[] scriptTransitionCalibration; // float[2] = {mu, sigma} - private final Map scriptBucketIndex; // raw UnicodeScript name → bucket ID - private final int numScriptBuckets; // 0 for v1/v2/v3 + private final float[] scriptTransitionCalibration; + private final Map scriptBucketIndex; + private final int numScriptBuckets; - // Shared block index for v2+ models: UnicodeBlock → index [0, blockN-1) - // Index blockN-1 is the "unassigned" bucket (null UnicodeBlock). - private final Map blockIndex; + // Per-script linear classifier: float[numFeatures+1] = {w1, ..., wN, bias}. 
+ private final Map classifierWeights; - private JunkDetector(int modelVersion, - Map tables, - Map calibrations, + private JunkDetector(Map calibrations, Map blockTables, Map blockCalibrations, - int blockN, Map controlCalibrations, Map classifierWeights, - Map blockIndex, float[] scriptTransitionTable, float[] scriptTransitionCalibration, Map scriptBucketIndex, - int numScriptBuckets) { - this.modelVersion = modelVersion; - this.tables = Collections.unmodifiableMap(tables); + int numScriptBuckets, + Map f1TablesByScript) { this.calibrations = Collections.unmodifiableMap(calibrations); - this.blockTables = blockTables != null - ? Collections.unmodifiableMap(blockTables) : null; - this.blockCalibrations = blockCalibrations != null - ? Collections.unmodifiableMap(blockCalibrations) : null; - this.blockN = blockN; - this.controlCalibrations = controlCalibrations != null - ? Collections.unmodifiableMap(controlCalibrations) : null; - this.classifierWeights = classifierWeights != null - ? Collections.unmodifiableMap(classifierWeights) : null; - this.blockIndex = blockIndex; + this.blockTables = Collections.unmodifiableMap(blockTables); + this.blockCalibrations = Collections.unmodifiableMap(blockCalibrations); + this.controlCalibrations = Collections.unmodifiableMap(controlCalibrations); + this.classifierWeights = Collections.unmodifiableMap(classifierWeights); this.scriptTransitionTable = scriptTransitionTable; this.scriptTransitionCalibration = scriptTransitionCalibration; - this.scriptBucketIndex = scriptBucketIndex != null - ? 
Collections.unmodifiableMap(scriptBucketIndex) : null; + this.scriptBucketIndex = Collections.unmodifiableMap(scriptBucketIndex); this.numScriptBuckets = numScriptBuckets; + this.f1TablesByScript = Collections.unmodifiableMap(f1TablesByScript); } // ----------------------------------------------------------------------- @@ -196,7 +191,53 @@ public static JunkDetector loadFromPath(Path path) throws IOException { /** * Loads a model from an {@link InputStream}. Gzip-detection is automatic. - * Supports model versions 1 through 5. + * Strictly requires the current file-format version ({@value #VERSION}) — + * older formats are rejected with a clear error rather than supported + * via a fallback path. + * + *

File-format layout (gzipped): + *

+     *   [8 bytes]    magic "JUNKDET1" (ASCII)
+     *   [1 byte]     version (= 7)
+     *   [4 bytes]    num_scripts (int BE)
+     *   [1 byte]     block_scheme_version  (must equal
+     *                {@link UnicodeBlockRanges#SCHEME_VERSION})
+     *   [1 byte]     num_script_buckets
+     *   for each bucket:
+     *     [2 bytes]      name length (ushort BE)
+     *     [name bytes]   bucket name (UTF-8)
+     *   [num_script_buckets² × 4 bytes]  script-transition log-prob table (F4)
+     *   [4 bytes]    mu4 (float32 BE)
+     *   [4 bytes]    sigma4 (float32 BE)
+     *   for each script (sorted by name):
+     *     [2 bytes]      name length
+     *     [name bytes]   script name (UTF-8)
+     *     [4 bytes]      mu1 (F1 calibration, codepoint-bigram mean log-prob)
+     *     [4 bytes]      sigma1
+     *     // V7 F1 tables for this script — see {@link V7Tables#writeTo}
+     *     [4 bytes]      backoff_alpha (float32 BE)
+     *     [4 bytes]      codepoint_count
+     *     [codepoint_count × 4 bytes]  codepoint index (sorted, ascending)
+     *     [4 bytes]      bigram_slots (power of 2)
+     *     [4 bytes]      bigram_quant_min (float32 BE)
+     *     [4 bytes]      bigram_quant_max (float32 BE)
+     *     [bigram_slots × 4 bytes]  bigram open-addressing keys
+     *                                ((idxA<<16)|idxB, or {@link V7Tables#EMPTY_KEY})
+     *     [bigram_slots bytes]      bigram values (8-bit quantized log-probs)
+     *     [4 bytes]      unigram_quant_min (float32 BE)
+     *     [4 bytes]      unigram_quant_max (float32 BE)
+     *     [4 bytes]      unigram_fallback_log_prob (float32 BE; used for
+     *                                                codepoints not in index)
+     *     [codepoint_count bytes]   unigram values (8-bit quantized log-probs)
+     *     // F2/F3/classifier (unchanged from v6 layout)
+     *     [4 bytes]      mu2 (F2 calibration)
+     *     [4 bytes]      sigma2
+     *     [block_N² × 4 bytes]  block-transition log-prob table (F2)
+     *     [4 bytes]      mu3 (F3 calibration)
+     *     [4 bytes]      sigma3
+     *     [1 byte]       num_features
+     *     [(num_features+1) × 4 bytes]  classifier weights w1..wN and bias
+     * 
*/ public static JunkDetector load(InputStream rawIs) throws IOException { byte[] peek = rawIs.readNBytes(2); @@ -215,21 +256,22 @@ public static JunkDetector load(InputStream rawIs) throws IOException { throw new IOException("Not a JunkDetector model file (bad magic)"); } int version = dis.readUnsignedByte(); - if (version != 5) { - throw new IOException("Unsupported model version: " + version - + ". Only version 5 is supported. Retrain the model with TrainJunkModel."); + if (version != VERSION) { + throw new IOException("Unsupported model format version: " + version + + ". This build expects version " + VERSION + + ". Retrain the model with the current TrainJunkModel."); } int numScripts = dis.readInt(); - // Block names (v5): stored in model for JVM-independence - int blockN = dis.readUnsignedShort(); - String[] blockNames = new String[blockN - 1]; - for (int i = 0; i < blockN - 1; i++) { - int nameLen = dis.readUnsignedShort(); - blockNames[i] = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8); + int blockSchemeVersion = dis.readUnsignedByte(); + if (blockSchemeVersion != UnicodeBlockRanges.SCHEME_VERSION) { + throw new IOException("Unsupported block-scheme version: " + + blockSchemeVersion + ". This build expects " + + UnicodeBlockRanges.SCHEME_VERSION + + ". 
Retrain with the current TrainJunkModel."); } - Map blockIndex = buildBlockIndexFromNames(blockNames); + int blockN = UnicodeBlockRanges.bucketCount(); // Global script-transition section int numScriptBuckets = dis.readUnsignedByte(); @@ -242,42 +284,39 @@ public static JunkDetector load(InputStream rawIs) throws IOException { float[] scriptTransitionTable = readFloatTable(dis, numScriptBuckets * numScriptBuckets); float[] scriptTransitionCalibration = new float[]{dis.readFloat(), dis.readFloat()}; - Map tables = new HashMap<>(numScripts * 2); - Map calibrations = new HashMap<>(numScripts * 2); - Map blockTables = new HashMap<>(numScripts * 2); - Map blockCalibrations = new HashMap<>(numScripts * 2); - Map controlCalibrations = new HashMap<>(numScripts * 2); - Map classifierWeights = new HashMap<>(numScripts * 2); + Map f1TablesByScript = new HashMap<>(numScripts * 2); + Map calibrations = new HashMap<>(numScripts * 2); + Map blockTables = new HashMap<>(numScripts * 2); + Map blockCalibrations = new HashMap<>(numScripts * 2); + Map controlCalibrations = new HashMap<>(numScripts * 2); + Map classifierWeights = new HashMap<>(numScripts * 2); for (int s = 0; s < numScripts; s++) { int nameLen = dis.readUnsignedShort(); String script = new String(dis.readNBytes(nameLen), StandardCharsets.UTF_8); - // Feature 1: byte bigrams calibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); - tables.put(script, readFloatTable(dis, 65536)); - // Feature 2: named-block transitions + // Per-script V7 F1 tables. 
+ f1TablesByScript.put(script, V7Tables.readFrom(dis)); + blockCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); blockTables.put(script, readFloatTable(dis, blockN * blockN)); - - // Feature 3: control-byte fraction controlCalibrations.put(script, new float[]{dis.readFloat(), dis.readFloat()}); - // Classifier weights: num_features (1 byte) + num_features floats + 1 bias int numFeatures = dis.readUnsignedByte(); - float[] weights = new float[numFeatures + 1]; // last = bias + float[] weights = new float[numFeatures + 1]; for (int j = 0; j <= numFeatures; j++) { weights[j] = dis.readFloat(); } classifierWeights.put(script, weights); } - return new JunkDetector(version, tables, calibrations, - blockTables, blockCalibrations, blockN, - controlCalibrations, classifierWeights, blockIndex, + return new JunkDetector(calibrations, + blockTables, blockCalibrations, + controlCalibrations, classifierWeights, scriptTransitionTable, scriptTransitionCalibration, - scriptBucketIndex, numScriptBuckets); + scriptBucketIndex, numScriptBuckets, f1TablesByScript); } } @@ -289,44 +328,6 @@ private static float[] readFloatTable(DataInputStream dis, int size) throws IOEx return table; } - /** - * Builds the stable ordered mapping from {@link Character.UnicodeBlock} to index. - * This must produce the same ordering as {@link TrainJunkModel#buildBlockIndex()}. - * Used for v2/v3/v4 models only; v5+ models store block names in the file. - */ - static Map buildBlockIndex() { - LinkedHashMap index = new LinkedHashMap<>(); - for (int cp = 0; cp <= 0x10FFFF; cp++) { - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - if (b != null) index.putIfAbsent(b, index.size()); - } - return Collections.unmodifiableMap(index); - } - - /** - * Builds a block index from an ordered array of block names stored in a v5+ model. - * Resolves each name via {@link Character.UnicodeBlock#forName(String)}. 
- * Throws {@link IOException} if any name is not recognised by the current JVM — - * this means the model was trained on a newer JVM; retrain on the minimum - * supported JVM (Java 17) to produce a compatible model. - * - * @param blockNames ordered array of block names (index = position in block table) - * @return unmodifiable map from UnicodeBlock to table index - */ - static Map buildBlockIndexFromNames(String[] blockNames) - throws IOException { - Map index = new HashMap<>(blockNames.length * 2); - for (int i = 0; i < blockNames.length; i++) { - try { - Character.UnicodeBlock b = Character.UnicodeBlock.forName(blockNames[i]); - index.put(b, i); - } catch (IllegalArgumentException e) { - throw new IOException("Unicode block not known to this JVM: " + blockNames[i] - + ". Model was trained on a newer JVM; retrain on Java 17.", e); - } - } - return Collections.unmodifiableMap(index); - } // ----------------------------------------------------------------------- // TextQualityDetector implementation @@ -373,7 +374,7 @@ public TextQualityComparison compare(String labelA, String candidateA, float zA = scoreA.isUnknown() ? 0f : scoreA.getZScore(); float zB = scoreB.isUnknown() ? 0f : scoreB.getZScore(); - String winner = zA >= zB ? "A" : "B"; + String winner = zA >= zB ? labelA : labelB; float delta = Math.abs(zA - zB); return new TextQualityComparison(winner, delta, scoreA, scoreB, labelA, labelB); @@ -381,12 +382,12 @@ public TextQualityComparison compare(String labelA, String candidateA, /** Returns the set of script names this model knows about. */ public Set knownScripts() { - return tables.keySet(); + return calibrations.keySet(); } - /** Returns the version of the loaded model (1, 2, or 3). */ + /** Returns the file-format version of the loaded model. 
*/ public int getModelVersion() { - return modelVersion; + return VERSION; } // ----------------------------------------------------------------------- @@ -409,12 +410,16 @@ private TextQualityScore scoreText(String text) { float[] dominantCal1 = null; for (ScriptRun run : runs) { - if (!tables.containsKey(run.script)) { + if (!calibrations.containsKey(run.script)) { continue; // skip scripts not in model; treat as neutral, not junk } byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); - if (runUtf8.length < 2) { - continue; // too short to score + // Skip if too short to form a bigram by either metric. A single + // CJK char is 3 UTF-8 bytes (passes the byte filter) but 1 UTF-16 + // unit, and computeF1MeanLogP filters by text.length() < 2 and + // returns NaN — which would poison the weighted sum here. + if (runUtf8.length < 2 || run.text.length() < 2) { + continue; } float logit = scoreChunk(runUtf8, run.text, run.script, z4); int n = runUtf8.length; @@ -444,88 +449,226 @@ private TextQualityScore scoreText(String text) { return new TextQualityScore(zScore, pClean, ciLow, ciHigh, dominantScript); } + /** + * Diagnostic — exposes per-feature z-scores and classifier weights. Same + * chunking and aggregation as {@link #score(String)}, but returns the + * intermediate signals individually for analysis or for hybrid models + * that want to substitute one feature with an externally-computed value. + * + *

Aggregation: per-chunk z1/z2/z3 and per-chunk logit are byte-count- + * weighted across script-homogeneous chunks. z4 is a global signal + * (already document-level). {@code dominantScript} and + * {@code classifierWeights} refer to the script run with the most bytes. + */ + public FeatureComponents scoreWithFeatureComponents(String text) { + if (text == null || text.isEmpty()) { + return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN, + Float.NaN, Float.NaN, "UNKNOWN", null, 0); + } + List runs = buildScriptRuns(text); + float z4 = computeScriptTransitionZ(text); + + float totalBytes = 0; + float weightedZ1 = 0; + float weightedZ2 = 0; + float weightedZ3 = 0; + float weightedLogit = 0; + String dominantScript = null; + int maxBytes = 0; + + for (ScriptRun run : runs) { + if (!calibrations.containsKey(run.script)) { + continue; + } + byte[] runUtf8 = run.text.getBytes(StandardCharsets.UTF_8); + if (runUtf8.length < 2 || run.text.length() < 2) { + continue; // see scoreText: paired filter avoids NaN poisoning + } + float[] zs = computeChunkZs(runUtf8, run.text, run.script); + float chunkLogit = combineLogit(zs[0], zs[1], zs[2], z4, run.script); + int n = runUtf8.length; + weightedZ1 += zs[0] * n; + weightedZ2 += zs[1] * n; + weightedZ3 += zs[2] * n; + weightedLogit += chunkLogit * n; + totalBytes += n; + if (n > maxBytes) { + maxBytes = n; + dominantScript = run.script; + } + } + + if (totalBytes == 0 || dominantScript == null) { + return new FeatureComponents(Float.NaN, Float.NaN, Float.NaN, z4, + Float.NaN, runs.isEmpty() ? "UNKNOWN" : runs.get(0).script, + null, 0); + } + + float[] cw = classifierWeights.get(dominantScript); + return new FeatureComponents( + weightedZ1 / totalBytes, + weightedZ2 / totalBytes, + weightedZ3 / totalBytes, + z4, + weightedLogit / totalBytes, + dominantScript, + cw, + (int) totalBytes); + } + + /** + * Per-feature z-score breakdown returned by + * {@link #scoreWithFeatureComponents(String)}. 
All z-scores are + * byte-count-weighted aggregates across script-homogeneous chunks + * except {@code z4}, which is a single document-level value. + * + *

{@code classifierWeights} is the per-script linear classifier + * weight vector {@code {w1, w2, w3, w4, bias}} for the dominant + * script — useful for hybrid models that recompute the logit after + * substituting one z-score with an externally-computed value. + */ + public static final class FeatureComponents { + public final float z1; + public final float z2; + public final float z3; + public final float z4; + public final float logit; + public final String dominantScript; + public final float[] classifierWeights; + public final int totalBytes; + + FeatureComponents(float z1, float z2, float z3, float z4, + float logit, String dominantScript, + float[] classifierWeights, int totalBytes) { + this.z1 = z1; + this.z2 = z2; + this.z3 = z3; + this.z4 = z4; + this.logit = logit; + this.dominantScript = dominantScript; + this.classifierWeights = classifierWeights; + this.totalBytes = totalBytes; + } + } + /** * Scores a single script-homogeneous chunk and returns its logit. * Positive = clean, negative = junk. Returns 0 (neutral) if the chunk * has no model or is too short. */ private float scoreChunk(byte[] utf8, String text, String script, float z4) { - float[] bigramTable = tables.get(script); - if (bigramTable == null || utf8.length < 2) { + if (utf8.length < 2 || !calibrations.containsKey(script)) { return 0f; } + float[] zs = computeChunkZs(utf8, text, script); + return combineLogit(zs[0], zs[1], zs[2], z4, script); + } - // Feature 1: byte-bigram mean log-prob - double bigramSum = 0; - int bigramCount = 0; - for (int i = 0; i + 1 < utf8.length; i++) { - bigramSum += bigramTable[((utf8[i] & 0xFF) << 8) | (utf8[i + 1] & 0xFF)]; - bigramCount++; - } - float meanBigramLogProb = (float) (bigramSum / bigramCount); + /** + * Computes per-feature z-scores {z1, z2, z3} for a single script- + * homogeneous chunk. 
Shared between {@link #scoreChunk} and + * {@link #scoreWithFeatureComponents}, and used at training time + * via the public {@code computeZ2/3/4...} static helpers so + * training and inference share the same math. + */ + private float[] computeChunkZs(byte[] utf8, String text, String script) { + // Feature 1: per-script codepoint-bigram, calibrated per-script + V7Tables tables = f1TablesByScript.get(script); + float meanF1LogProb = computeCodepointF1MeanLogP(text, tables); float[] cal1 = calibrations.get(script); - float z1 = (meanBigramLogProb - cal1[0]) / cal1[1]; - - // Feature 2: named-block transition mean log-prob - float z2 = 0f; - float[] blockTable = blockTables.get(script); - if (blockTable != null) { - int nullId = blockN - 1; - int prev = -1; - double blockSum = 0; - int blockCount = 0; - for (int i = 0; i < text.length(); ) { - int cp = text.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; - if (prev >= 0) { - blockSum += blockTable[prev * blockN + blockId]; - blockCount++; - } - prev = blockId; - i += Character.charCount(cp); - } - if (blockCount > 0) { - float meanBlockLogProb = (float) (blockSum / blockCount); - float[] cal2 = blockCalibrations.get(script); - z2 = cal2 != null ? (meanBlockLogProb - cal2[0]) / cal2[1] : 0f; + float z1 = (meanF1LogProb - cal1[0]) / cal1[1]; + + float z2 = computeZ2BlockTransition(text, + blockTables.get(script), blockCalibrations.get(script)); + float z3 = computeZ3ControlByte(utf8, controlCalibrations.get(script)); + return new float[]{z1, z2, z3}; + } + + private static float computeCodepointF1MeanLogP(String text, V7Tables tables) { + if (tables == null) return Float.NaN; + double v = computeF1MeanLogP(text, tables); + return Double.isNaN(v) ? Float.NaN : (float) v; + } + + /** + * Feature 2 — calibrated z-score for block-transition mean log-prob on + * one text window. 
Returns 0 if the window has fewer than two + * codepoints or if {@code blockTable} / {@code blockCal} are null. + * + *

Block bucketing is via the JVM-independent + * {@link UnicodeBlockRanges}. Public so the trainer's classifier + * feature extractor calls into the exact same math used at inference + * time — single source of truth, no train/infer drift. + * + * @param blockTable {@code (blockN)² × float} log-prob table where + * {@code blockN = UnicodeBlockRanges.bucketCount()} + */ + public static float computeZ2BlockTransition(String text, + float[] blockTable, float[] blockCal) { + if (blockTable == null || blockCal == null || text.length() < 2) { + return 0f; + } + int blockN = UnicodeBlockRanges.bucketCount(); + int prev = -1; + double sum = 0; + int count = 0; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + int blockId = UnicodeBlockRanges.bucketOf(cp); + if (prev >= 0) { + sum += blockTable[prev * blockN + blockId]; + count++; } + prev = blockId; + i += Character.charCount(cp); + } + if (count == 0) { + return 0f; } + return ((float) (sum / count) - blockCal[0]) / blockCal[1]; + } - // Feature 3: control-byte fraction (stored as −fraction, so higher = cleaner) + /** + * Feature 3 — calibrated z-score for control-byte fraction on the UTF-8 + * byte sequence of one text window. Stored score is {@code -fraction} + * so higher = cleaner (matching the direction convention of the other + * z-features). + * + *

Public for train/infer math-sharing. + */ + public static float computeZ3ControlByte(byte[] utf8, float[] controlCal) { + if (utf8.length == 0 || controlCal == null) { + return 0f; + } long controlCount = 0; for (byte b : utf8) { - if (isControlByte(b & 0xFF)) controlCount++; - } - float controlScore = -(float) controlCount / utf8.length; - float[] cal3 = controlCalibrations.get(script); - float z3 = cal3 != null ? (controlScore - cal3[0]) / cal3[1] : 0f; - - // Per-script linear classifier: w1*z1 + w2*z2 + w3*z3 + w4*z4 + bias - float[] cw = classifierWeights.get(script); - if (cw != null) { - int nFeat = cw.length - 1; // bias is last - float logit = cw[nFeat]; // bias - if (nFeat >= 1) logit += cw[0] * z1; - if (nFeat >= 2) logit += cw[1] * z2; - if (nFeat >= 3) logit += cw[2] * z3; - if (nFeat >= 4) logit += cw[3] * z4; - return logit; + if (isControlByte(b & 0xFF)) { + controlCount++; + } } - return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight + float score = -(float) controlCount / utf8.length; + return (score - controlCal[0]) / controlCal[1]; } /** - * Computes the global script-transition z-score for the whole input string. - * Uses raw {@link Character.UnicodeScript} values — NOT {@link #SCRIPT_MODEL_FALLBACK} — - * so that HIRAGANA, KATAKANA, and HAN remain distinct, preserving the - * characteristic script-mixing pattern of Japanese text. + * Feature 4 — calibrated z-score for global script-transition mean + * log-prob on one text window. Uses raw {@link Character.UnicodeScript} + * values (no model fallback) so HIRAGANA / KATAKANA / HAN remain + * distinct. Returns 0 if the window has fewer than two non-neutral + * codepoints or if the script-transition data isn't supplied. * - *

Returns 0 if the string has fewer than two non-neutral codepoints. + *

Public for train/infer math-sharing. Note: inference computes + * z4 once per document via {@link #computeScriptTransitionZ} (which + * uses the instance's loaded tables); this helper takes them as + * arguments so training can compute z4 before the model is finalised. */ - private float computeScriptTransitionZ(String text) { - if (scriptTransitionTable == null || scriptBucketIndex == null - || scriptTransitionCalibration == null || numScriptBuckets == 0) { + public static float computeZ4ScriptTransition(String text, + float[] scriptTransTable, + float[] scriptTransCal, + Map scriptBucketIndex, + int numScriptBuckets) { + if (scriptTransTable == null || scriptTransCal == null + || scriptBucketIndex == null || numScriptBuckets == 0) { return 0f; } int otherBucket = numScriptBuckets - 1; @@ -543,7 +686,7 @@ private float computeScriptTransitionZ(String text) { } int bucket = scriptBucketIndex.getOrDefault(s.name(), otherBucket); if (prev >= 0) { - sum += scriptTransitionTable[prev * numScriptBuckets + bucket]; + sum += scriptTransTable[prev * numScriptBuckets + bucket]; count++; } prev = bucket; @@ -551,8 +694,184 @@ private float computeScriptTransitionZ(String text) { if (count == 0) { return 0f; } - float mean = (float) (sum / count); - return (mean - scriptTransitionCalibration[0]) / scriptTransitionCalibration[1]; + return ((float) (sum / count) - scriptTransCal[0]) / scriptTransCal[1]; + } + + /** + * Combines per-feature z-scores via the per-script linear classifier. + * Fallback (when no classifier weights stored): equal-weight average. 
+ */ + private float combineLogit(float z1, float z2, float z3, float z4, String script) { + float[] cw = classifierWeights.get(script); + if (cw != null) { + int nFeat = cw.length - 1; // bias is last + float logit = cw[nFeat]; // bias + if (nFeat >= 1) logit += cw[0] * z1; + if (nFeat >= 2) logit += cw[1] * z2; + if (nFeat >= 3) logit += cw[2] * z3; + if (nFeat >= 4) logit += cw[3] * z4; + return logit; + } + return (z1 + z2 + z3 + z4) / 4.0f; // fallback: equal weight + } + + // ----------------------------------------------------------------------- + // Feature 1: per-script open-addressing codepoint-bigram lookup + // ----------------------------------------------------------------------- + + /** + * Mean log-prob over the codepoint pairs in {@code text} using the given + * script's V7 F1 tables. + * + *

For each adjacent codepoint pair {@code (a, b)}: + *

    + *
  1. Binary-search both codepoints in the script's codepoint index. + * If either is absent, the pair was never seen in training; emit + * {@code α * (logP(a) + logP(b))} using each codepoint's unigram + * value (or {@link V7Tables#unigramFallbackLogProb} if the + * codepoint isn't even in the unigram index).
  2. + *
  3. Otherwise, look up the packed {@code (idxA<<16)|idxB} key in + * the open-addressing bigram table. Empty slot → unseen pair → + * unigram backoff (same formula). Match → dequantize the stored + * value.
  4. + *
+ * + *

This is the single authoritative implementation of the V7 F1 + * scoring math, shared by inference and training. Keeping one + * implementation eliminates the risk of train/infer drift in the F1 + * feature. + * + * @return mean log-prob, or {@link Double#NaN} if {@code text} has fewer + * than two codepoints or {@code tables} is null + */ + public static double computeF1MeanLogP(String text, V7Tables tables) { + if (text == null || text.length() < 2 || tables == null) { + return Double.NaN; + } + double sum = 0; + int n = 0; + int prevCp = -1; + int prevIdx = -1; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + int curIdx = codepointToIndex(tables, cp); + if (prevCp >= 0) { + sum += scorePairF1V7(prevCp, prevIdx, cp, curIdx, tables); + n++; + } + prevCp = cp; + prevIdx = curIdx; + } + return n == 0 ? Double.NaN : sum / n; + } + + /** + * Binary-search a codepoint in the script's index. + * + * @return the dense index (≥ 0) if found, or -1 if the codepoint + * doesn't appear in any kept bigram for this script + */ + public static int codepointToIndex(V7Tables tables, int cp) { + return java.util.Arrays.binarySearch(tables.codepointIndex, cp); + } + + /** + * Mixing function used to scatter packed (idxA, idxB) keys across + * the open-addressing table. A simple integer finalizer (splitmix32 + * style) gives good distribution for sequential index values. + * + *

Public so the trainer's open-addressing insertion routine uses + * the same probe order as inference — drift here would silently + * corrupt every lookup. + */ + public static int mixIndexKey(int packedKey) { + int x = packedKey; + x = (x ^ (x >>> 16)) * 0x7feb352d; + x = (x ^ (x >>> 15)) * 0x846ca68b; + x = x ^ (x >>> 16); + return x; + } + + /** + * Packed bigram key for indices {@code (a, b)} where each index fits in + * {@link JunkDetectorTrainingConfig#KEY_INDEX_BITS} bits. Asserts that + * indices are non-negative; that's the caller's contract. + */ + public static int packBigramKey(int idxA, int idxB) { + return (idxA << 16) | (idxB & 0xFFFF); + } + + /** + * Looks up a (cpA, cpB) bigram in the script's V7 tables and returns + * its dequantized log-prob. Falls back to unigram backoff on miss. + * + *

{@code idxA}/{@code idxB} are the pre-computed codepoint indices + * (from {@link #codepointToIndex}); {@code -1} means the codepoint is + * not in this script's index. The caller is expected to compute them + * once when scanning the text (avoiding a redundant binary search per + * codepoint). + */ + private static double scorePairF1V7(int cpA, int idxA, int cpB, int idxB, + V7Tables tables) { + if (idxA >= 0 && idxB >= 0) { + int slot = lookupBigramSlot(tables, idxA, idxB); + if (slot >= 0) { + return dequantize(tables.bigramValues[slot], + tables.bigramQuantMin, tables.bigramQuantMax); + } + } + // Unigram backoff for unseen pair or for codepoints absent from the + // per-script index. α=1.0 = plain independence; prototype-validated. + double ua = unigramLogProb(tables, idxA); + double ub = unigramLogProb(tables, idxB); + return tables.backoffAlpha * (ua + ub); + } + + /** + * Open-addressing lookup: returns the slot index that contains the key + * for {@code (idxA, idxB)}, or {@code -1} if not present (probe hit an + * empty slot first). + * + *

Linear probing with the same mix-hash used at training time — + * required for the table to be readable, not just writable. + */ + static int lookupBigramSlot(V7Tables tables, int idxA, int idxB) { + int packedKey = packBigramKey(idxA, idxB); + int[] keys = tables.bigramKeys; + int mask = keys.length - 1; + int h = mixIndexKey(packedKey) & mask; + while (true) { + int k = keys[h]; + if (k == V7Tables.EMPTY_KEY) return -1; + if (k == packedKey) return h; + h = (h + 1) & mask; + } + } + + private static double unigramLogProb(V7Tables tables, int idx) { + if (idx < 0) { + return tables.unigramFallbackLogProb; + } + return dequantize(tables.unigramTable[idx], + tables.unigramQuantMin, tables.unigramQuantMax); + } + + private static float dequantize(byte b, float min, float max) { + int u = b & 0xFF; + return min + (u / 255.0f) * (max - min); + } + + /** + * Computes the global script-transition z-score for the whole input + * string against this model's loaded tables. Thin wrapper around the + * public static {@link #computeZ4ScriptTransition} helper — same math, + * just preloaded with this instance's parameters. 
+ */ + private float computeScriptTransitionZ(String text) { + return computeZ4ScriptTransition(text, + scriptTransitionTable, scriptTransitionCalibration, + scriptBucketIndex, numScriptBuckets); } /** diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index f1de37d989a..72e51e8094f 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -25,6 +25,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +41,6 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.quality.TextQualityComparison; import org.apache.tika.quality.TextQualityDetector; -import org.apache.tika.quality.TextQualityScore; /** * A {@link MetaEncodingDetector} that arbitrates charset candidates by @@ -76,34 +77,6 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { * default read limit used by the charset base detectors. */ private static final int DEFAULT_READ_LIMIT = 16384; - // --------------------------------------------------------------------- - // TACTICAL: declarative-override gate constants. - // - // These exist to compensate for known per-script calibration unevenness - // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/ - // BENGALI floors too strict). They produce wrong tournaments when an - // honest in-document declaration (`` / XML decl) decodes - // to sparse non-Latin content that scores junky-but-correct, while a - // statistical pick decodes to dense mojibake-Han that scores decent- - // but-wrong. 
See `analyses/2026-04-26-tika-eval-charset-and-other.md` - // and the indic-collapse + Korean+Hanja fixtures. - // - // REMOVE when the quality scorer is recalibrated per-script — the - // tournament should then be reliable on its own. - // --------------------------------------------------------------------- - - /** Maximum delta in z-score units we tolerate before honoring the - * in-document declaration over the tournament winner. Tuned so that - * small same-script-different-codepage deltas (windows-1252 vs - * windows-1257 ≈ 1-2 units) don't trigger override when scripts - * match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */ - private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f; - - /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared - * decoder's output. Above this, the declared charset clearly cannot - * decode the bytes and we should not honor the declaration. */ - private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01; - /** Cached quality detector. {@code null} if none is on the classpath. */ private final TextQualityDetector qualityDetector; @@ -187,10 +160,21 @@ public List detect(TikaInputStream tis, Metadata metadata, // Decode probe under each candidate, preserving insertion order so // tournament seeding is deterministic. + // + // Each decoded string is then run through HTML entity expansion. + // For entity-encoded HTML (numeric refs like ്), this is + // load-bearing: entity refs are ASCII bytes that decode identically + // under every candidate charset, so they don't differentiate. + // After expansion they become real codepoints — and crucially, in + // the *wrong* decoding (e.g. mojibake-as-HAN), they introduce + // cross-script transitions (HAN ↔ MALAYALAM mid-document) that the + // quality detector's script-transition feature correctly penalises. + // See `20260512-junkdetector-codepoint-hash-plan.md` (AIT5 case). 
Map candidates = new LinkedHashMap<>(); for (Charset cs : uniqueCharsets) { String decoded = safeDecode(forDecode, cs); if (decoded != null && !decoded.isEmpty()) { + decoded = expandHtmlEntities(decoded); candidates.put(cs, decoded); if (LOG.isTraceEnabled()) { int sampleLen = Math.min(400, decoded.length()); @@ -246,148 +230,17 @@ public List detect(TikaInputStream tis, Metadata metadata, champion.getKey().name(), challenger.getKey().name(), cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", cmp.delta()), cmp.scoreA(), cmp.scoreB()); - if ("B".equals(cmp.winner())) { + if (challenger.getKey().name().equals(cmp.winner())) { champion = challenger; } } LOG.trace("junk-filter -> {} (tournament champion)", champion.getKey().name()); - // TACTICAL: declarative override. See class-level comment block. - // REMOVE when quality scorer is recalibrated per-script. - Charset declarativeOverride = applyInDocumentDeclarativeOverride( - context, candidates, champion.getKey()); - if (declarativeOverride != null) { - float conf = context.getTopConfidenceFor(declarativeOverride); - context.setArbitrationInfo("junk-filter-declarative-override"); - LOG.trace("junk-filter -> {} (declarative override of tournament winner {})", - declarativeOverride.name(), champion.getKey().name()); - return List.of(new EncodingResult(declarativeOverride, conf)); - } - float confidence = context.getTopConfidenceFor(champion.getKey()); context.setArbitrationInfo("junk-filter-selected"); return List.of(new EncodingResult(champion.getKey(), confidence)); } - /** - * Tactical fix: honor an in-document {@code } or XML - * declaration when the quality scorer's per-script calibration unevenness - * would otherwise mis-rank candidates of different scripts. - * - *

Returns the in-document declared charset to use, or {@code null} to - * leave the tournament winner intact.

- * - *

Gates (all must hold to override):

- *
    - *
  1. (a) Decode is mostly clean: declared decoder produces - * fewer than {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per char.
  2. - *
  3. (b) Both decoded: declared and tournament winner are - * both in the candidate map (already guaranteed by upstream code).
  4. - *
  5. (c) Quality gap small: tournament winner's z-score - * is not vastly higher than the declared's; specifically - * {@code winner.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.
  6. - *
  7. (d) Different scripts: declared and winner classify - * as different scripts. Same-script Latin-cousin lies (e.g. windows-1252 - * declared on a windows-1257 file) fall through to the tournament, - * which correctly handles them via byte-distribution scoring.
  8. - *
- * - *

"In-document" means {@code HtmlEncodingDetector} or any future XML-decl - * source — explicitly NOT {@code MetadataCharsetDetector} (outer Content-Type - * header), which is more often wrong.

- */ - private Charset applyInDocumentDeclarativeOverride( - EncodingDetectorContext context, - Map candidates, - Charset champion) { - Charset declared = findInDocumentDeclarative(context); - if (declared == null) { - return null; - } - if (declared.equals(champion)) { - return null; // already winning - } - // Per HTML5 spec, cannot validly declare UTF-16 / UTF-32: - // the meta tag itself is bytes that have to be parsed before its - // declaration is known, and UTF-16/32 require a BOM. If the - // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs - // first in the chain), we treat the declaration as invalid and let - // the tournament winner stand. This catches govdocs1-style "utf-16 - // declared on a Latin file" lies that would otherwise look like a - // legitimate script-mismatch override. - String declaredName = declared.name(); - if (declaredName.startsWith("UTF-16") || declaredName.startsWith("UTF-32")) { - LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in (HTML5 invalid)"); - return null; - } - String championText = candidates.get(champion); - String declaredText = candidates.get(declared); - if (declaredText == null || championText == null) { - return null; // failed to decode - } - // (a) decode mostly clean - double fffdRate = replacementCharRate(declaredText); - if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) { - LOG.trace("junk-filter declarative-override skipped: U+FFFD rate {} > {}", - fffdRate, DECLARATIVE_MAX_FFFD_RATE); - return null; - } - TextQualityScore declaredScore = qualityDetector.score(declaredText); - TextQualityScore championScore = qualityDetector.score(championText); - // (c) winner not vastly higher - float delta = championScore.getZScore() - declaredScore.getZScore(); - if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) { - LOG.trace("junk-filter declarative-override skipped: delta {} > {}", - delta, DECLARATIVE_OVERRIDE_MAX_DELTA); - return null; - } - // (d) different scripts - String declaredScript = 
declaredScore.getDominantScript(); - String championScript = championScore.getDominantScript(); - if (declaredScript == null || declaredScript.equals(championScript)) { - LOG.trace("junk-filter declarative-override skipped: same script {}", - declaredScript); - return null; - } - LOG.trace("junk-filter declarative-override fires: declared={} (script={}, z={}) vs winner={} (script={}, z={}) delta={}", - declared.name(), declaredScript, declaredScore.getZScore(), - champion.name(), championScript, championScore.getZScore(), delta); - return declared; - } - - /** - * Find the first in-document DECLARATIVE candidate (from - * {@code HtmlEncodingDetector} / XML declaration), or {@code null}. - * Outer Content-Type metadata ({@code MetadataCharsetDetector}) is - * intentionally excluded — those headers lie too often. - */ - private static Charset findInDocumentDeclarative(EncodingDetectorContext context) { - for (EncodingDetectorContext.Result r : context.getResults()) { - String name = r.getDetectorName(); - if (("HtmlEncodingDetector".equals(name) - || "StandardHtmlEncodingDetector".equals(name)) - && r.getResultType() == EncodingResult.ResultType.DECLARATIVE) { - return r.getCharset(); - } - } - return null; - } - - /** Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) in the decoded String — - * a proxy for "this charset cannot decode these bytes". */ - private static double replacementCharRate(String s) { - if (s.isEmpty()) { - return 0.0; - } - long count = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '�') { - count++; - } - } - return (double) count / s.length(); - } - /** * Return the first DECLARATIVE charset whose decoded output equals at * least one other candidate's, or {@code null}. 
@@ -459,6 +312,69 @@ private static String safeDecode(byte[] bytes, Charset charset) { } } + // ----------------------------------------------------------------------- + // HTML entity expansion + // + // Applied to every decoded candidate before quality scoring. Resolves + // numeric character refs (&#NNNN; / &#xHHHH;) to their target codepoints + // and a small set of common named entities. Malformed entities pass + // through as literal text. Sufficient for the AIT5-class failure + // mode where blogspot/news pages use numeric Malayalam/Bengali entities + // intermixed with raw UTF-8 codepoints. + // ----------------------------------------------------------------------- + + private static final Pattern ENTITY_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern ENTITY_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern ENTITY_NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + /** + * Expands HTML numeric and a small set of named entity references in + * {@code s}. Malformed or out-of-range entities pass through unchanged. + * The named-entity set is intentionally small — only the universally- + * declared HTML5 entities that don't depend on a DOCTYPE. Anything more + * exotic stays as a literal entity reference (which scores as ASCII noise, + * the same as it would have before). 
+ */ + static String expandHtmlEntities(String s) { + s = ENTITY_DEC.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — fall through, leave entity literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // overflow — fall through, leave entity literal + } + return Matcher.quoteReplacement(mr.group()); + }); + s = ENTITY_NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } + /** * Strip a leading byte-order mark, if any. UTF-32 signatures are * checked before UTF-16 because the UTF-32 LE BOM ({@code FF FE 00 00}) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java new file mode 100644 index 00000000000..ab7e1b00b7e --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/UnicodeBlockRanges.java @@ -0,0 +1,445 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +/** + * Static codepoint-range → bucket-index lookup table used by Feature 2 + * (block-transition log-probability). Replaces + * {@link Character.UnicodeBlock#of(int)} so that the model's block + * semantics are fully decoupled from the JVM's Unicode-data release — + * training on one JDK and serving on another produces identical scores + * by construction. + * + *

The 338 named blocks are a snapshot from JDK 25's + * {@link Character.UnicodeBlock} (Unicode 16.x). Codepoints in gaps + * between named blocks resolve to the {@link #UNASSIGNED} bucket + * ({@value #UNASSIGNED}). The total bucket count is + * {@link #bucketCount()} = 339. + * + *

If the block list is ever updated, bump {@link #SCHEME_VERSION} — + * the model file's {@code block_scheme_version} byte must match. This + * forces a clean retrain rather than silent re-mapping. + * + *

Lookup cost: O(log N) binary search. Thread-safe, immutable. + */ +public final class UnicodeBlockRanges { + + /** + * Bumped whenever the static range table below changes. A model + * trained against scheme version X cannot be served by code at + * version Y ≠ X — the loader rejects the mismatch. + */ + public static final int SCHEME_VERSION = 1; + + /** Bucket index returned for codepoints in no named block. */ + public static final int UNASSIGNED = 338; + + /** + * Sorted by {@code start_cp}. Each row: {@code {start, end_inclusive, bucket_id}}. + * Bucket ids are 0..337 — the {@link #UNASSIGNED} bucket has id 338 + * and is implicit (returned when binary search finds no matching range). + * + *

Generated from JDK 25 {@code Character.UnicodeBlock.of(cp)} for + * every codepoint in [0, 0x10FFFF]. + */ + private static final int[][] RANGES = { + {0x0000, 0x007F, 0}, // BASIC_LATIN + {0x0080, 0x00FF, 1}, // LATIN_1_SUPPLEMENT + {0x0100, 0x017F, 2}, // LATIN_EXTENDED_A + {0x0180, 0x024F, 3}, // LATIN_EXTENDED_B + {0x0250, 0x02AF, 4}, // IPA_EXTENSIONS + {0x02B0, 0x02FF, 5}, // SPACING_MODIFIER_LETTERS + {0x0300, 0x036F, 6}, // COMBINING_DIACRITICAL_MARKS + {0x0370, 0x03FF, 7}, // GREEK + {0x0400, 0x04FF, 8}, // CYRILLIC + {0x0500, 0x052F, 9}, // CYRILLIC_SUPPLEMENTARY + {0x0530, 0x058F, 10}, // ARMENIAN + {0x0590, 0x05FF, 11}, // HEBREW + {0x0600, 0x06FF, 12}, // ARABIC + {0x0700, 0x074F, 13}, // SYRIAC + {0x0750, 0x077F, 14}, // ARABIC_SUPPLEMENT + {0x0780, 0x07BF, 15}, // THAANA + {0x07C0, 0x07FF, 16}, // NKO + {0x0800, 0x083F, 17}, // SAMARITAN + {0x0840, 0x085F, 18}, // MANDAIC + {0x0860, 0x086F, 19}, // SYRIAC_SUPPLEMENT + {0x0870, 0x089F, 20}, // ARABIC_EXTENDED_B + {0x08A0, 0x08FF, 21}, // ARABIC_EXTENDED_A + {0x0900, 0x097F, 22}, // DEVANAGARI + {0x0980, 0x09FF, 23}, // BENGALI + {0x0A00, 0x0A7F, 24}, // GURMUKHI + {0x0A80, 0x0AFF, 25}, // GUJARATI + {0x0B00, 0x0B7F, 26}, // ORIYA + {0x0B80, 0x0BFF, 27}, // TAMIL + {0x0C00, 0x0C7F, 28}, // TELUGU + {0x0C80, 0x0CFF, 29}, // KANNADA + {0x0D00, 0x0D7F, 30}, // MALAYALAM + {0x0D80, 0x0DFF, 31}, // SINHALA + {0x0E00, 0x0E7F, 32}, // THAI + {0x0E80, 0x0EFF, 33}, // LAO + {0x0F00, 0x0FFF, 34}, // TIBETAN + {0x1000, 0x109F, 35}, // MYANMAR + {0x10A0, 0x10FF, 36}, // GEORGIAN + {0x1100, 0x11FF, 37}, // HANGUL_JAMO + {0x1200, 0x137F, 38}, // ETHIOPIC + {0x1380, 0x139F, 39}, // ETHIOPIC_SUPPLEMENT + {0x13A0, 0x13FF, 40}, // CHEROKEE + {0x1400, 0x167F, 41}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS + {0x1680, 0x169F, 42}, // OGHAM + {0x16A0, 0x16FF, 43}, // RUNIC + {0x1700, 0x171F, 44}, // TAGALOG + {0x1720, 0x173F, 45}, // HANUNOO + {0x1740, 0x175F, 46}, // BUHID + {0x1760, 0x177F, 47}, // TAGBANWA + {0x1780, 
0x17FF, 48}, // KHMER + {0x1800, 0x18AF, 49}, // MONGOLIAN + {0x18B0, 0x18FF, 50}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED + {0x1900, 0x194F, 51}, // LIMBU + {0x1950, 0x197F, 52}, // TAI_LE + {0x1980, 0x19DF, 53}, // NEW_TAI_LUE + {0x19E0, 0x19FF, 54}, // KHMER_SYMBOLS + {0x1A00, 0x1A1F, 55}, // BUGINESE + {0x1A20, 0x1AAF, 56}, // TAI_THAM + {0x1AB0, 0x1AFF, 57}, // COMBINING_DIACRITICAL_MARKS_EXTENDED + {0x1B00, 0x1B7F, 58}, // BALINESE + {0x1B80, 0x1BBF, 59}, // SUNDANESE + {0x1BC0, 0x1BFF, 60}, // BATAK + {0x1C00, 0x1C4F, 61}, // LEPCHA + {0x1C50, 0x1C7F, 62}, // OL_CHIKI + {0x1C80, 0x1C8F, 63}, // CYRILLIC_EXTENDED_C + {0x1C90, 0x1CBF, 64}, // GEORGIAN_EXTENDED + {0x1CC0, 0x1CCF, 65}, // SUNDANESE_SUPPLEMENT + {0x1CD0, 0x1CFF, 66}, // VEDIC_EXTENSIONS + {0x1D00, 0x1D7F, 67}, // PHONETIC_EXTENSIONS + {0x1D80, 0x1DBF, 68}, // PHONETIC_EXTENSIONS_SUPPLEMENT + {0x1DC0, 0x1DFF, 69}, // COMBINING_DIACRITICAL_MARKS_SUPPLEMENT + {0x1E00, 0x1EFF, 70}, // LATIN_EXTENDED_ADDITIONAL + {0x1F00, 0x1FFF, 71}, // GREEK_EXTENDED + {0x2000, 0x206F, 72}, // GENERAL_PUNCTUATION + {0x2070, 0x209F, 73}, // SUPERSCRIPTS_AND_SUBSCRIPTS + {0x20A0, 0x20CF, 74}, // CURRENCY_SYMBOLS + {0x20D0, 0x20FF, 75}, // COMBINING_MARKS_FOR_SYMBOLS + {0x2100, 0x214F, 76}, // LETTERLIKE_SYMBOLS + {0x2150, 0x218F, 77}, // NUMBER_FORMS + {0x2190, 0x21FF, 78}, // ARROWS + {0x2200, 0x22FF, 79}, // MATHEMATICAL_OPERATORS + {0x2300, 0x23FF, 80}, // MISCELLANEOUS_TECHNICAL + {0x2400, 0x243F, 81}, // CONTROL_PICTURES + {0x2440, 0x245F, 82}, // OPTICAL_CHARACTER_RECOGNITION + {0x2460, 0x24FF, 83}, // ENCLOSED_ALPHANUMERICS + {0x2500, 0x257F, 84}, // BOX_DRAWING + {0x2580, 0x259F, 85}, // BLOCK_ELEMENTS + {0x25A0, 0x25FF, 86}, // GEOMETRIC_SHAPES + {0x2600, 0x26FF, 87}, // MISCELLANEOUS_SYMBOLS + {0x2700, 0x27BF, 88}, // DINGBATS + {0x27C0, 0x27EF, 89}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A + {0x27F0, 0x27FF, 90}, // SUPPLEMENTAL_ARROWS_A + {0x2800, 0x28FF, 91}, // BRAILLE_PATTERNS + {0x2900, 
0x297F, 92}, // SUPPLEMENTAL_ARROWS_B + {0x2980, 0x29FF, 93}, // MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B + {0x2A00, 0x2AFF, 94}, // SUPPLEMENTAL_MATHEMATICAL_OPERATORS + {0x2B00, 0x2BFF, 95}, // MISCELLANEOUS_SYMBOLS_AND_ARROWS + {0x2C00, 0x2C5F, 96}, // GLAGOLITIC + {0x2C60, 0x2C7F, 97}, // LATIN_EXTENDED_C + {0x2C80, 0x2CFF, 98}, // COPTIC + {0x2D00, 0x2D2F, 99}, // GEORGIAN_SUPPLEMENT + {0x2D30, 0x2D7F, 100}, // TIFINAGH + {0x2D80, 0x2DDF, 101}, // ETHIOPIC_EXTENDED + {0x2DE0, 0x2DFF, 102}, // CYRILLIC_EXTENDED_A + {0x2E00, 0x2E7F, 103}, // SUPPLEMENTAL_PUNCTUATION + {0x2E80, 0x2EFF, 104}, // CJK_RADICALS_SUPPLEMENT + {0x2F00, 0x2FDF, 105}, // KANGXI_RADICALS + {0x2FF0, 0x2FFF, 106}, // IDEOGRAPHIC_DESCRIPTION_CHARACTERS + {0x3000, 0x303F, 107}, // CJK_SYMBOLS_AND_PUNCTUATION + {0x3040, 0x309F, 108}, // HIRAGANA + {0x30A0, 0x30FF, 109}, // KATAKANA + {0x3100, 0x312F, 110}, // BOPOMOFO + {0x3130, 0x318F, 111}, // HANGUL_COMPATIBILITY_JAMO + {0x3190, 0x319F, 112}, // KANBUN + {0x31A0, 0x31BF, 113}, // BOPOMOFO_EXTENDED + {0x31C0, 0x31EF, 114}, // CJK_STROKES + {0x31F0, 0x31FF, 115}, // KATAKANA_PHONETIC_EXTENSIONS + {0x3200, 0x32FF, 116}, // ENCLOSED_CJK_LETTERS_AND_MONTHS + {0x3300, 0x33FF, 117}, // CJK_COMPATIBILITY + {0x3400, 0x4DBF, 118}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A + {0x4DC0, 0x4DFF, 119}, // YIJING_HEXAGRAM_SYMBOLS + {0x4E00, 0x9FFF, 120}, // CJK_UNIFIED_IDEOGRAPHS + {0xA000, 0xA48F, 121}, // YI_SYLLABLES + {0xA490, 0xA4CF, 122}, // YI_RADICALS + {0xA4D0, 0xA4FF, 123}, // LISU + {0xA500, 0xA63F, 124}, // VAI + {0xA640, 0xA69F, 125}, // CYRILLIC_EXTENDED_B + {0xA6A0, 0xA6FF, 126}, // BAMUM + {0xA700, 0xA71F, 127}, // MODIFIER_TONE_LETTERS + {0xA720, 0xA7FF, 128}, // LATIN_EXTENDED_D + {0xA800, 0xA82F, 129}, // SYLOTI_NAGRI + {0xA830, 0xA83F, 130}, // COMMON_INDIC_NUMBER_FORMS + {0xA840, 0xA87F, 131}, // PHAGS_PA + {0xA880, 0xA8DF, 132}, // SAURASHTRA + {0xA8E0, 0xA8FF, 133}, // DEVANAGARI_EXTENDED + {0xA900, 0xA92F, 134}, // KAYAH_LI + {0xA930, 0xA95F, 
135}, // REJANG + {0xA960, 0xA97F, 136}, // HANGUL_JAMO_EXTENDED_A + {0xA980, 0xA9DF, 137}, // JAVANESE + {0xA9E0, 0xA9FF, 138}, // MYANMAR_EXTENDED_B + {0xAA00, 0xAA5F, 139}, // CHAM + {0xAA60, 0xAA7F, 140}, // MYANMAR_EXTENDED_A + {0xAA80, 0xAADF, 141}, // TAI_VIET + {0xAAE0, 0xAAFF, 142}, // MEETEI_MAYEK_EXTENSIONS + {0xAB00, 0xAB2F, 143}, // ETHIOPIC_EXTENDED_A + {0xAB30, 0xAB6F, 144}, // LATIN_EXTENDED_E + {0xAB70, 0xABBF, 145}, // CHEROKEE_SUPPLEMENT + {0xABC0, 0xABFF, 146}, // MEETEI_MAYEK + {0xAC00, 0xD7AF, 147}, // HANGUL_SYLLABLES + {0xD7B0, 0xD7FF, 148}, // HANGUL_JAMO_EXTENDED_B + {0xD800, 0xDB7F, 149}, // HIGH_SURROGATES + {0xDB80, 0xDBFF, 150}, // HIGH_PRIVATE_USE_SURROGATES + {0xDC00, 0xDFFF, 151}, // LOW_SURROGATES + {0xE000, 0xF8FF, 152}, // PRIVATE_USE_AREA + {0xF900, 0xFAFF, 153}, // CJK_COMPATIBILITY_IDEOGRAPHS + {0xFB00, 0xFB4F, 154}, // ALPHABETIC_PRESENTATION_FORMS + {0xFB50, 0xFDFF, 155}, // ARABIC_PRESENTATION_FORMS_A + {0xFE00, 0xFE0F, 156}, // VARIATION_SELECTORS + {0xFE10, 0xFE1F, 157}, // VERTICAL_FORMS + {0xFE20, 0xFE2F, 158}, // COMBINING_HALF_MARKS + {0xFE30, 0xFE4F, 159}, // CJK_COMPATIBILITY_FORMS + {0xFE50, 0xFE6F, 160}, // SMALL_FORM_VARIANTS + {0xFE70, 0xFEFF, 161}, // ARABIC_PRESENTATION_FORMS_B + {0xFF00, 0xFFEF, 162}, // HALFWIDTH_AND_FULLWIDTH_FORMS + {0xFFF0, 0xFFFF, 163}, // SPECIALS + {0x10000, 0x1007F, 164}, // LINEAR_B_SYLLABARY + {0x10080, 0x100FF, 165}, // LINEAR_B_IDEOGRAMS + {0x10100, 0x1013F, 166}, // AEGEAN_NUMBERS + {0x10140, 0x1018F, 167}, // ANCIENT_GREEK_NUMBERS + {0x10190, 0x101CF, 168}, // ANCIENT_SYMBOLS + {0x101D0, 0x101FF, 169}, // PHAISTOS_DISC + {0x10280, 0x1029F, 170}, // LYCIAN + {0x102A0, 0x102DF, 171}, // CARIAN + {0x102E0, 0x102FF, 172}, // COPTIC_EPACT_NUMBERS + {0x10300, 0x1032F, 173}, // OLD_ITALIC + {0x10330, 0x1034F, 174}, // GOTHIC + {0x10350, 0x1037F, 175}, // OLD_PERMIC + {0x10380, 0x1039F, 176}, // UGARITIC + {0x103A0, 0x103DF, 177}, // OLD_PERSIAN + {0x10400, 0x1044F, 178}, // DESERET + 
{0x10450, 0x1047F, 179}, // SHAVIAN + {0x10480, 0x104AF, 180}, // OSMANYA + {0x104B0, 0x104FF, 181}, // OSAGE + {0x10500, 0x1052F, 182}, // ELBASAN + {0x10530, 0x1056F, 183}, // CAUCASIAN_ALBANIAN + {0x10570, 0x105BF, 184}, // VITHKUQI + {0x105C0, 0x105FF, 185}, // TODHRI + {0x10600, 0x1077F, 186}, // LINEAR_A + {0x10780, 0x107BF, 187}, // LATIN_EXTENDED_F + {0x10800, 0x1083F, 188}, // CYPRIOT_SYLLABARY + {0x10840, 0x1085F, 189}, // IMPERIAL_ARAMAIC + {0x10860, 0x1087F, 190}, // PALMYRENE + {0x10880, 0x108AF, 191}, // NABATAEAN + {0x108E0, 0x108FF, 192}, // HATRAN + {0x10900, 0x1091F, 193}, // PHOENICIAN + {0x10920, 0x1093F, 194}, // LYDIAN + {0x10980, 0x1099F, 195}, // MEROITIC_HIEROGLYPHS + {0x109A0, 0x109FF, 196}, // MEROITIC_CURSIVE + {0x10A00, 0x10A5F, 197}, // KHAROSHTHI + {0x10A60, 0x10A7F, 198}, // OLD_SOUTH_ARABIAN + {0x10A80, 0x10A9F, 199}, // OLD_NORTH_ARABIAN + {0x10AC0, 0x10AFF, 200}, // MANICHAEAN + {0x10B00, 0x10B3F, 201}, // AVESTAN + {0x10B40, 0x10B5F, 202}, // INSCRIPTIONAL_PARTHIAN + {0x10B60, 0x10B7F, 203}, // INSCRIPTIONAL_PAHLAVI + {0x10B80, 0x10BAF, 204}, // PSALTER_PAHLAVI + {0x10C00, 0x10C4F, 205}, // OLD_TURKIC + {0x10C80, 0x10CFF, 206}, // OLD_HUNGARIAN + {0x10D00, 0x10D3F, 207}, // HANIFI_ROHINGYA + {0x10D40, 0x10D8F, 208}, // GARAY + {0x10E60, 0x10E7F, 209}, // RUMI_NUMERAL_SYMBOLS + {0x10E80, 0x10EBF, 210}, // YEZIDI + {0x10EC0, 0x10EFF, 211}, // ARABIC_EXTENDED_C + {0x10F00, 0x10F2F, 212}, // OLD_SOGDIAN + {0x10F30, 0x10F6F, 213}, // SOGDIAN + {0x10F70, 0x10FAF, 214}, // OLD_UYGHUR + {0x10FB0, 0x10FDF, 215}, // CHORASMIAN + {0x10FE0, 0x10FFF, 216}, // ELYMAIC + {0x11000, 0x1107F, 217}, // BRAHMI + {0x11080, 0x110CF, 218}, // KAITHI + {0x110D0, 0x110FF, 219}, // SORA_SOMPENG + {0x11100, 0x1114F, 220}, // CHAKMA + {0x11150, 0x1117F, 221}, // MAHAJANI + {0x11180, 0x111DF, 222}, // SHARADA + {0x111E0, 0x111FF, 223}, // SINHALA_ARCHAIC_NUMBERS + {0x11200, 0x1124F, 224}, // KHOJKI + {0x11280, 0x112AF, 225}, // MULTANI + {0x112B0, 0x112FF, 
226}, // KHUDAWADI + {0x11300, 0x1137F, 227}, // GRANTHA + {0x11380, 0x113FF, 228}, // TULU_TIGALARI + {0x11400, 0x1147F, 229}, // NEWA + {0x11480, 0x114DF, 230}, // TIRHUTA + {0x11580, 0x115FF, 231}, // SIDDHAM + {0x11600, 0x1165F, 232}, // MODI + {0x11660, 0x1167F, 233}, // MONGOLIAN_SUPPLEMENT + {0x11680, 0x116CF, 234}, // TAKRI + {0x116D0, 0x116FF, 235}, // MYANMAR_EXTENDED_C + {0x11700, 0x1174F, 236}, // AHOM + {0x11800, 0x1184F, 237}, // DOGRA + {0x118A0, 0x118FF, 238}, // WARANG_CITI + {0x11900, 0x1195F, 239}, // DIVES_AKURU + {0x119A0, 0x119FF, 240}, // NANDINAGARI + {0x11A00, 0x11A4F, 241}, // ZANABAZAR_SQUARE + {0x11A50, 0x11AAF, 242}, // SOYOMBO + {0x11AB0, 0x11ABF, 243}, // UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A + {0x11AC0, 0x11AFF, 244}, // PAU_CIN_HAU + {0x11B00, 0x11B5F, 245}, // DEVANAGARI_EXTENDED_A + {0x11BC0, 0x11BFF, 246}, // SUNUWAR + {0x11C00, 0x11C6F, 247}, // BHAIKSUKI + {0x11C70, 0x11CBF, 248}, // MARCHEN + {0x11D00, 0x11D5F, 249}, // MASARAM_GONDI + {0x11D60, 0x11DAF, 250}, // GUNJALA_GONDI + {0x11EE0, 0x11EFF, 251}, // MAKASAR + {0x11F00, 0x11F5F, 252}, // KAWI + {0x11FB0, 0x11FBF, 253}, // LISU_SUPPLEMENT + {0x11FC0, 0x11FFF, 254}, // TAMIL_SUPPLEMENT + {0x12000, 0x123FF, 255}, // CUNEIFORM + {0x12400, 0x1247F, 256}, // CUNEIFORM_NUMBERS_AND_PUNCTUATION + {0x12480, 0x1254F, 257}, // EARLY_DYNASTIC_CUNEIFORM + {0x12F90, 0x12FFF, 258}, // CYPRO_MINOAN + {0x13000, 0x1342F, 259}, // EGYPTIAN_HIEROGLYPHS + {0x13430, 0x1345F, 260}, // EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS + {0x13460, 0x143FF, 261}, // EGYPTIAN_HIEROGLYPHS_EXTENDED_A + {0x14400, 0x1467F, 262}, // ANATOLIAN_HIEROGLYPHS + {0x16100, 0x1613F, 263}, // GURUNG_KHEMA + {0x16800, 0x16A3F, 264}, // BAMUM_SUPPLEMENT + {0x16A40, 0x16A6F, 265}, // MRO + {0x16A70, 0x16ACF, 266}, // TANGSA + {0x16AD0, 0x16AFF, 267}, // BASSA_VAH + {0x16B00, 0x16B8F, 268}, // PAHAWH_HMONG + {0x16D40, 0x16D7F, 269}, // KIRAT_RAI + {0x16E40, 0x16E9F, 270}, // MEDEFAIDRIN + {0x16F00, 0x16F9F, 271}, // 
MIAO + {0x16FE0, 0x16FFF, 272}, // IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION + {0x17000, 0x187FF, 273}, // TANGUT + {0x18800, 0x18AFF, 274}, // TANGUT_COMPONENTS + {0x18B00, 0x18CFF, 275}, // KHITAN_SMALL_SCRIPT + {0x18D00, 0x18D7F, 276}, // TANGUT_SUPPLEMENT + {0x1AFF0, 0x1AFFF, 277}, // KANA_EXTENDED_B + {0x1B000, 0x1B0FF, 278}, // KANA_SUPPLEMENT + {0x1B100, 0x1B12F, 279}, // KANA_EXTENDED_A + {0x1B130, 0x1B16F, 280}, // SMALL_KANA_EXTENSION + {0x1B170, 0x1B2FF, 281}, // NUSHU + {0x1BC00, 0x1BC9F, 282}, // DUPLOYAN + {0x1BCA0, 0x1BCAF, 283}, // SHORTHAND_FORMAT_CONTROLS + {0x1CC00, 0x1CEBF, 284}, // SYMBOLS_FOR_LEGACY_COMPUTING_SUPPLEMENT + {0x1CF00, 0x1CFCF, 285}, // ZNAMENNY_MUSICAL_NOTATION + {0x1D000, 0x1D0FF, 286}, // BYZANTINE_MUSICAL_SYMBOLS + {0x1D100, 0x1D1FF, 287}, // MUSICAL_SYMBOLS + {0x1D200, 0x1D24F, 288}, // ANCIENT_GREEK_MUSICAL_NOTATION + {0x1D2C0, 0x1D2DF, 289}, // KAKTOVIK_NUMERALS + {0x1D2E0, 0x1D2FF, 290}, // MAYAN_NUMERALS + {0x1D300, 0x1D35F, 291}, // TAI_XUAN_JING_SYMBOLS + {0x1D360, 0x1D37F, 292}, // COUNTING_ROD_NUMERALS + {0x1D400, 0x1D7FF, 293}, // MATHEMATICAL_ALPHANUMERIC_SYMBOLS + {0x1D800, 0x1DAAF, 294}, // SUTTON_SIGNWRITING + {0x1DF00, 0x1DFFF, 295}, // LATIN_EXTENDED_G + {0x1E000, 0x1E02F, 296}, // GLAGOLITIC_SUPPLEMENT + {0x1E030, 0x1E08F, 297}, // CYRILLIC_EXTENDED_D + {0x1E100, 0x1E14F, 298}, // NYIAKENG_PUACHUE_HMONG + {0x1E290, 0x1E2BF, 299}, // TOTO + {0x1E2C0, 0x1E2FF, 300}, // WANCHO + {0x1E4D0, 0x1E4FF, 301}, // NAG_MUNDARI + {0x1E5D0, 0x1E5FF, 302}, // OL_ONAL + {0x1E7E0, 0x1E7FF, 303}, // ETHIOPIC_EXTENDED_B + {0x1E800, 0x1E8DF, 304}, // MENDE_KIKAKUI + {0x1E900, 0x1E95F, 305}, // ADLAM + {0x1EC70, 0x1ECBF, 306}, // INDIC_SIYAQ_NUMBERS + {0x1ED00, 0x1ED4F, 307}, // OTTOMAN_SIYAQ_NUMBERS + {0x1EE00, 0x1EEFF, 308}, // ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS + {0x1F000, 0x1F02F, 309}, // MAHJONG_TILES + {0x1F030, 0x1F09F, 310}, // DOMINO_TILES + {0x1F0A0, 0x1F0FF, 311}, // PLAYING_CARDS + {0x1F100, 0x1F1FF, 312}, // 
ENCLOSED_ALPHANUMERIC_SUPPLEMENT + {0x1F200, 0x1F2FF, 313}, // ENCLOSED_IDEOGRAPHIC_SUPPLEMENT + {0x1F300, 0x1F5FF, 314}, // MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS + {0x1F600, 0x1F64F, 315}, // EMOTICONS + {0x1F650, 0x1F67F, 316}, // ORNAMENTAL_DINGBATS + {0x1F680, 0x1F6FF, 317}, // TRANSPORT_AND_MAP_SYMBOLS + {0x1F700, 0x1F77F, 318}, // ALCHEMICAL_SYMBOLS + {0x1F780, 0x1F7FF, 319}, // GEOMETRIC_SHAPES_EXTENDED + {0x1F800, 0x1F8FF, 320}, // SUPPLEMENTAL_ARROWS_C + {0x1F900, 0x1F9FF, 321}, // SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS + {0x1FA00, 0x1FA6F, 322}, // CHESS_SYMBOLS + {0x1FA70, 0x1FAFF, 323}, // SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A + {0x1FB00, 0x1FBFF, 324}, // SYMBOLS_FOR_LEGACY_COMPUTING + {0x20000, 0x2A6DF, 325}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B + {0x2A700, 0x2B73F, 326}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C + {0x2B740, 0x2B81F, 327}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D + {0x2B820, 0x2CEAF, 328}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E + {0x2CEB0, 0x2EBEF, 329}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F + {0x2EBF0, 0x2EE5F, 330}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I + {0x2F800, 0x2FA1F, 331}, // CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT + {0x30000, 0x3134F, 332}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G + {0x31350, 0x323AF, 333}, // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H + {0xE0000, 0xE007F, 334}, // TAGS + {0xE0100, 0xE01EF, 335}, // VARIATION_SELECTORS_SUPPLEMENT + {0xF0000, 0xFFFFF, 336}, // SUPPLEMENTARY_PRIVATE_USE_AREA_A + {0x100000, 0x10FFFF, 337}, // SUPPLEMENTARY_PRIVATE_USE_AREA_B + }; + + /** Cached start_cp array for binary search. */ + private static final int[] STARTS; + static { + STARTS = new int[RANGES.length]; + for (int i = 0; i < RANGES.length; i++) { + STARTS[i] = RANGES[i][0]; + } + } + + private UnicodeBlockRanges() { + // utility class + } + + /** Total number of buckets (named blocks + 1 unassigned). 
*/ + public static int bucketCount() { + return RANGES.length + 1; + } + + /** + * Returns the bucket id for the given codepoint, or {@link #UNASSIGNED} + * if the codepoint falls outside every named block range. + * + *

Binary search over the sorted-by-{@code start_cp} range list: + * O(log N) where N = {@value #UNASSIGNED} (the number of named blocks). + */ + public static int bucketOf(int cp) { + // Binary search: find largest STARTS[i] <= cp + int lo = 0; + int hi = STARTS.length - 1; + int found = -1; + while (lo <= hi) { + int mid = (lo + hi) >>> 1; + if (STARTS[mid] <= cp) { + found = mid; + lo = mid + 1; + } else { + hi = mid - 1; + } + } + if (found < 0) { + return UNASSIGNED; + } + // RANGES[found] is the candidate. Confirm cp is within end_inclusive. + return cp <= RANGES[found][1] ? RANGES[found][2] : UNASSIGNED; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java new file mode 100644 index 00000000000..93a82640caa --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/V7Tables.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Carrier for one script's v7 F1 tables. + * + *

The v6 design used a single global codepoint-bigram hash + Bloom + * filter shared across all scripts. We measured that this ceiling + * limits accuracy: enlarging one script's training data (e.g. HAN) hurts + * the other scripts' z-scores because they share the global hash. v7 + * gives each script its own pair of tables. + * + *

Per-script layout: + * + *

    + *
  • {@code codepointIndex} — sorted, ascending {@code int[]} of every + * codepoint that appears as either side of a kept bigram for this + * script. Codepoint → dense index is a binary search; index → + * codepoint is direct array access. Typical sizes: ~7K-15K for HAN, + * ~200-500 for most other scripts. + *
  • {@code bigramKeys} / {@code bigramValues} — parallel arrays + * implementing an open-addressed hash table with linear probing. + * Each key is a 32-bit value {@code (idxA << 16) | idxB}; key {@code + * -1} means "empty slot." Indices are bounded at 16 bits (65535), + * which is comfortably above the largest per-script codepoint count + * we observe. + *
  • {@code unigramTable} — {@code byte[numCodepoints]}, quantized + * unigram log-probabilities indexed by the same codepoint→index map. + *
  • {@code bigramQuantMin/Max}, {@code unigramQuantMin/Max} — + * per-quantization ranges; dequantize by + * {@code min + (b/255) * (max - min)}. + *
  • {@code unigramFallbackLogProb} — log-prob assigned when a + * codepoint is not in {@code codepointIndex} at all. Set to the + * script's most-pessimistic unigram value (its quantization min) so + * absent codepoints don't accidentally score above legitimately-rare + * ones. + *
  • {@code backoffAlpha} — multiplier on the unigram-backoff + * independence sum, copied from v6. + *
+ * + *

Membership semantics: no Bloom filter. The empty-slot sentinel is + * the membership oracle — a pair is "seen" iff binary-search finds both + * codepoints in the index AND a probe sequence hits a matching key before + * an empty slot. Lookups are therefore exact; there is no false-positive + * backoff path as there is in v6. + * + *

Fields are package-private so the + * {@link org.apache.tika.ml.junkdetect.tools.TrainJunkModel} trainer can + * construct instances directly without going through accessors. + */ +public final class V7Tables { + + /** Reserved value in {@link #bigramKeys} marking an unoccupied slot. */ + public static final int EMPTY_KEY = -1; + + final int[] codepointIndex; + final int[] bigramKeys; + final byte[] bigramValues; + final byte[] unigramTable; + final float bigramQuantMin; + final float bigramQuantMax; + final float unigramQuantMin; + final float unigramQuantMax; + final float unigramFallbackLogProb; + final float backoffAlpha; + + public V7Tables(int[] codepointIndex, + int[] bigramKeys, byte[] bigramValues, + byte[] unigramTable, + float bigramQuantMin, float bigramQuantMax, + float unigramQuantMin, float unigramQuantMax, + float unigramFallbackLogProb, + float backoffAlpha) { + if (bigramKeys.length != bigramValues.length) { + throw new IllegalArgumentException( + "bigramKeys and bigramValues must have equal length: " + + bigramKeys.length + " vs " + bigramValues.length); + } + if (unigramTable.length != codepointIndex.length) { + throw new IllegalArgumentException( + "unigramTable.length must equal codepointIndex.length: " + + unigramTable.length + " vs " + codepointIndex.length); + } + this.codepointIndex = codepointIndex; + this.bigramKeys = bigramKeys; + this.bigramValues = bigramValues; + this.unigramTable = unigramTable; + this.bigramQuantMin = bigramQuantMin; + this.bigramQuantMax = bigramQuantMax; + this.unigramQuantMin = unigramQuantMin; + this.unigramQuantMax = unigramQuantMax; + this.unigramFallbackLogProb = unigramFallbackLogProb; + this.backoffAlpha = backoffAlpha; + } + + /** + * Serialises this script's F1 tables. Read back via + * {@link #readFrom(DataInputStream)}. + */ + public void writeTo(DataOutputStream dos) throws IOException { + dos.writeFloat(backoffAlpha); + + // Codepoint index. 
+ dos.writeInt(codepointIndex.length); + ByteBuffer cpBuf = ByteBuffer.allocate(codepointIndex.length * 4) + .order(ByteOrder.BIG_ENDIAN); + cpBuf.asIntBuffer().put(codepointIndex); + dos.write(cpBuf.array()); + + // Bigram open-addressing table (keys + values). + dos.writeInt(bigramKeys.length); + dos.writeFloat(bigramQuantMin); + dos.writeFloat(bigramQuantMax); + ByteBuffer keyBuf = ByteBuffer.allocate(bigramKeys.length * 4) + .order(ByteOrder.BIG_ENDIAN); + keyBuf.asIntBuffer().put(bigramKeys); + dos.write(keyBuf.array()); + dos.write(bigramValues); + + // Unigram table. + dos.writeFloat(unigramQuantMin); + dos.writeFloat(unigramQuantMax); + dos.writeFloat(unigramFallbackLogProb); + dos.write(unigramTable); + } + + /** Inverse of {@link #writeTo(DataOutputStream)}. */ + public static V7Tables readFrom(DataInputStream dis) throws IOException { + float backoffAlpha = dis.readFloat(); + + int cpCount = dis.readInt(); + byte[] cpBytes = dis.readNBytes(cpCount * 4); + int[] codepoints = new int[cpCount]; + ByteBuffer.wrap(cpBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(codepoints); + + int slots = dis.readInt(); + float bMin = dis.readFloat(); + float bMax = dis.readFloat(); + byte[] keyBytes = dis.readNBytes(slots * 4); + int[] keys = new int[slots]; + ByteBuffer.wrap(keyBytes).order(ByteOrder.BIG_ENDIAN).asIntBuffer().get(keys); + byte[] values = dis.readNBytes(slots); + + float uMin = dis.readFloat(); + float uMax = dis.readFloat(); + float uFallback = dis.readFloat(); + byte[] unigramTable = dis.readNBytes(cpCount); + + return new V7Tables(codepoints, keys, values, unigramTable, + bMin, bMax, uMin, uMax, uFallback, backoffAlpha); + } + + /** + * Returns a one-line summary for trainer progress output. 
+ */ + public String statsString() { + return String.format( + " cp_index=%d, bigram_slots=%d (load≈%.2f), " + + "bigram_range=[%.3f, %.3f], unigram_range=[%.3f, %.3f]", + codepointIndex.length, bigramKeys.length, + occupiedSlots() / (double) Math.max(1, bigramKeys.length), + bigramQuantMin, bigramQuantMax, + unigramQuantMin, unigramQuantMax); + } + + private int occupiedSlots() { + int n = 0; + for (int k : bigramKeys) { + if (k != EMPTY_KEY) n++; + } + return n; + } + + /** Number of codepoints in this script's index. Diagnostic. */ + public int codepointCount() { + return codepointIndex.length; + } + + /** Number of bigram-table slots (capacity). Diagnostic. */ + public int bigramSlots() { + return bigramKeys.length; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java new file mode 100644 index 00000000000..08b2aa4eb57 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * Diagnostic tool: bucket every bigram in {@code han.train.gz} (or any + * specified file) by the {@link Character.UnicodeBlock} of each codepoint, + * and report the distribution. + * + *

Goal: determine whether HAN's 224K distinct pairs split cleanly along + * block boundaries — e.g. CJK Unified Ideographs vs. Hiragana vs. Katakana — + * which would justify routing HAN windows to language-specific sub-models in + * the v7 design. + * + *

Usage: + *

+ *   java ... AnalyzeHanByBlock /path/to/junkdetect/han.train.gz
+ * 
+ */
+public final class AnalyzeHanByBlock {
+
+    private AnalyzeHanByBlock() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: AnalyzeHanByBlock <train-file.gz>");
+            System.exit(1);
+        }
+        Path file = Paths.get(args[0]);
+
+        // (blockA, blockB) -> [totalBigrams, distinctSet via HashMap]
+        // We use Maps of Maps to keep code simple; HAN is the only file
+        // big enough to matter and fits in heap.
+        Map<String, Map<Long, long[]>> byBlockPair = new HashMap<>();
+        Map<String, long[]> blockPairTotals = new HashMap<>();
+        long totalN = 0;
+
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(file)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                int prevCp = -1;
+                String prevBlock = null;
+                for (int i = 0; i < line.length(); ) {
+                    int cp = line.codePointAt(i);
+                    i += Character.charCount(cp);
+                    String block = blockShortName(cp);
+                    if (prevCp >= 0) {
+                        String key = prevBlock + "|" + block;
+                        Map<Long, long[]> set = byBlockPair.computeIfAbsent(
+                                key, k -> new HashMap<>(256));
+                        long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+                        long[] c = set.get(packed);
+                        if (c == null) {
+                            set.put(packed, new long[]{1L});
+                        } else {
+                            c[0]++;
+                        }
+                        blockPairTotals.computeIfAbsent(key, k -> new long[1])[0]++;
+                        totalN++;
+                    }
+                    prevCp = cp;
+                    prevBlock = block;
+                }
+            }
+        }
+
+        System.out.printf("File: %s%n", file);
+        System.out.printf("Total bigram occurrences: %,d%n%n", totalN);
+
+        // Sort block-pair keys by total occurrences (descending).
+        List<Map.Entry<String, long[]>> sorted = new ArrayList<>(blockPairTotals.entrySet());
+        sorted.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> -e.getValue()[0]));
+
+        System.out.printf("%-50s %14s %14s %12s %8s%n",
+                "block_pair", "occurrences", "distinct", "singletons", "%total");
+        System.out.println(repeat('-', 105));
+
+        long distinctTotal = 0;
+        long singletonsTotal = 0;
+        for (Map.Entry<String, long[]> e : sorted) {
+            String pair = e.getKey();
+            long n = e.getValue()[0];
+            Map<Long, long[]> set = byBlockPair.get(pair);
+            int distinct = set.size();
+            int singletons = 0;
+            for (long[] c : set.values()) {
+                if (c[0] == 1) singletons++;
+            }
+            distinctTotal += distinct;
+            singletonsTotal += singletons;
+            double pct = 100.0 * n / totalN;
+            if (pct < 0.1 && n < 1000) {
+                continue; // skip tail noise rows
+            }
+            System.out.printf("%-50s %,14d %,14d %,12d %7.2f%%%n",
+                    pair, n, distinct, singletons, pct);
+        }
+        System.out.println(repeat('-', 105));
+        System.out.printf("Total distinct pairs (incl. tail): %,d%n", distinctTotal);
+        System.out.printf("Total singletons (incl. tail): %,d%n", singletonsTotal);
+
+        // Roll up by individual block (left side only) to see per-block distinct counts.
+        System.out.println();
+        System.out.println("=== Per-leading-block roll-up ===");
+        Map<String, Long> distinctByLeadingBlock = new HashMap<>();
+        Map<String, Long> occByLeadingBlock = new HashMap<>();
+        for (Map.Entry<String, Map<Long, long[]>> e : byBlockPair.entrySet()) {
+            String leading = e.getKey().substring(0, e.getKey().indexOf('|'));
+            distinctByLeadingBlock.merge(leading, (long) e.getValue().size(), Long::sum);
+            long sum = 0;
+            for (long[] c : e.getValue().values()) sum += c[0];
+            occByLeadingBlock.merge(leading, sum, Long::sum);
+        }
+        List<Map.Entry<String, Long>> rollup = new ArrayList<>(occByLeadingBlock.entrySet());
+        rollup.sort(Comparator.comparingLong(
+                (Map.Entry<String, Long> e) -> -e.getValue()));
+        System.out.printf("%-35s %14s %14s%n",
+                "leading_block", "occurrences", "distinct(rough)");
+        System.out.println(repeat('-', 70));
+        for (Map.Entry<String, Long> e : rollup) {
+            System.out.printf("%-35s %,14d %,14d%n",
+                    e.getKey(), e.getValue(),
+                    distinctByLeadingBlock.get(e.getKey()));
+        }
+    }
+
+    /**
+     * Short-name for the Unicode block containing {@code cp}. Compresses the
+     * many CJK-related blocks into a handful of human-readable labels.
+     *
+     *

Splits ASCII into ASCII_DIGIT / ASCII_LETTER / ASCII_PUNCT so we can + * distinguish numerals (which are content-bearing across all scripts) from + * English-letter contamination and punctuation. + */ + private static String blockShortName(int cp) { + Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); + if (b == null) return "UNK"; + + String name = b.toString(); + if (name.equals("BASIC_LATIN")) { + if (cp >= '0' && cp <= '9') return "ASCII_DIGIT"; + if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) return "ASCII_LETTER"; + return "ASCII_PUNCT"; + } + // Compress noisy block names for the report. + if (name.startsWith("CJK_UNIFIED_IDEOGRAPHS_EXTENSION")) { + return "CJK_EXT_" + name.substring(name.lastIndexOf('_') + 1); + } + if (name.equals("CJK_UNIFIED_IDEOGRAPHS")) return "CJK_UNIFIED"; + if (name.equals("CJK_SYMBOLS_AND_PUNCTUATION")) return "CJK_PUNCT"; + if (name.equals("CJK_COMPATIBILITY_IDEOGRAPHS")) return "CJK_COMPAT"; + if (name.equals("CJK_COMPATIBILITY_FORMS")) return "CJK_COMPAT_FORMS"; + if (name.equals("HALFWIDTH_AND_FULLWIDTH_FORMS")) return "HALF_FULL"; + if (name.equals("HIRAGANA")) return "HIRAGANA"; + if (name.equals("KATAKANA")) return "KATAKANA"; + if (name.equals("KATAKANA_PHONETIC_EXTENSIONS")) return "KATAKANA_EXT"; + if (name.equals("HANGUL_SYLLABLES")) return "HANGUL"; + if (name.equals("HANGUL_JAMO")) return "HANGUL_JAMO"; + if (name.equals("HANGUL_COMPATIBILITY_JAMO")) return "HANGUL_JAMO_C"; + if (name.equals("LATIN_1_SUPPLEMENT")) return "LATIN1"; + return name; + } + + private static String repeat(char c, int n) { + char[] buf = new char[n]; + java.util.Arrays.fill(buf, c); + return new String(buf); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java new file mode 100644 index 00000000000..f64986b8dd8 --- /dev/null +++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +/** + * For each {@code *.train.gz} file, classify every adjacent codepoint pair + * by its relation to the target script S (= file's script). Categories: + * + *

    + *
  • IN_S_INTERIOR — both codepoints are in S or in COMMON/INHERITED + *
  • S_BOUNDARY — exactly one codepoint is in S-or-COMMON, the other + * is a non-S script + *
  • FOREIGN_INTERIOR — both codepoints are in some non-S script + * (possibly different scripts). Under the proposed generalized + * boundary rule, these are the bigrams to drop from S's training. + *
  • ASCII_LETTER_RUN — special subcategory of foreign interior where + * both cps are ASCII A–Z/a–z; this is the English-run case. + *
+ * + *

Reports occurrence counts, distinct-pair counts, and singleton counts
+ * for each category, plus the implied model-size impact of dropping
+ * FOREIGN_INTERIOR (or just ASCII_LETTER_RUN) under {@code min_count>=1}
+ * and {@code min_count>=3}.
+ */
+public final class BoundaryBigramAudit {
+
+    private BoundaryBigramAudit() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: BoundaryBigramAudit <data-dir>");
+            System.exit(1);
+        }
+        Path dataDir = Paths.get(args[0]);
+        Path[] files;
+        try (Stream<Path> s = Files.list(dataDir)) {
+            files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz"))
+                    .sorted().toArray(Path[]::new);
+        }
+
+        System.out.printf("%-22s %14s %14s %14s %14s %12s | %14s %14s%n",
+                "script", "in_S_occ", "boundary_occ", "foreign_occ",
+                "ascii_run_occ", "total_occ",
+                "drop_foreign_dist", "drop_asciirun_dist");
+        System.out.println(repeat('-', 165));
+
+        for (Path file : files) {
+            String fname = file.getFileName().toString();
+            String name = fname.substring(0, fname.length() - ".train.gz".length())
+                    .toUpperCase();
+            Character.UnicodeScript target;
+            try {
+                target = Character.UnicodeScript.valueOf(name);
+            } catch (IllegalArgumentException e) {
+                continue;
+            }
+
+            long inS = 0, boundary = 0, foreign = 0, asciiRun = 0;
+            HashMap<Long, long[]> distinctAll = new HashMap<>(1 << 16);
+            HashMap<Long, long[]> distinctKeptUnderForeignDrop = new HashMap<>(1 << 16);
+            HashMap<Long, long[]> distinctKeptUnderAsciiDrop = new HashMap<>(1 << 16);
+
+            try (BufferedReader r = new BufferedReader(
+                    new InputStreamReader(
+                            new GZIPInputStream(Files.newInputStream(file)),
+                            StandardCharsets.UTF_8))) {
+                String line;
+                while ((line = r.readLine()) != null) {
+                    int prevCp = -1;
+                    for (int i = 0; i < line.length(); ) {
+                        int cp = line.codePointAt(i);
+                        i += Character.charCount(cp);
+                        if (prevCp >= 0) {
+                            boolean aInS = inScriptOrCommon(prevCp, target);
+                            boolean bInS = inScriptOrCommon(cp, target);
+                            boolean aLetter = isLatinLetter(prevCp);
+
boolean bLetter = isLatinLetter(cp); + + long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL); + increment(distinctAll, packed); + + if (aInS && bInS) { + inS++; + increment(distinctKeptUnderForeignDrop, packed); + increment(distinctKeptUnderAsciiDrop, packed); + } else if (aInS != bInS) { + boundary++; + increment(distinctKeptUnderForeignDrop, packed); + increment(distinctKeptUnderAsciiDrop, packed); + } else { + // both foreign (neither in S nor COMMON) + foreign++; + if (aLetter && bLetter) { + asciiRun++; + } else { + // foreign interior but not pure ASCII letters: + // we'd keep this under the "ASCII-letter only" rule. + increment(distinctKeptUnderAsciiDrop, packed); + } + } + } + prevCp = cp; + } + } + } + + long total = inS + boundary + foreign; + int distAll = distinctAll.size(); + int distForeignDrop = distinctKeptUnderForeignDrop.size(); + int distAsciiDrop = distinctKeptUnderAsciiDrop.size(); + + System.out.printf("%-22s %,14d %,14d %,14d %,14d %,12d | %,14d %,14d%n", + name.toLowerCase(), inS, boundary, foreign, asciiRun, total, + distAll - distForeignDrop, distAll - distAsciiDrop); + } + } + + private static boolean inScriptOrCommon(int cp, Character.UnicodeScript target) { + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + return s == target + || s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED; + } + + private static boolean isLatinLetter(int cp) { + return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z') + || (cp >= 0xFF21 && cp <= 0xFF3A) // fullwidth A-Z + || (cp >= 0xFF41 && cp <= 0xFF5A); // fullwidth a-z + } + + private static void increment(HashMap map, long key) { + long[] c = map.get(key); + if (c == null) { + map.put(key, new long[]{1L}); + } else { + c[0]++; + } + } + + private static String repeat(char c, int n) { + char[] b = new char[n]; + java.util.Arrays.fill(b, c); + return new String(b); + } +} diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java index 27a5436d5e4..a80fafbd6b4 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java @@ -82,45 +82,18 @@ public class BuildJunkTrainingData { // ----------------------------------------------------------------------- - // Defaults + // Split ratios — fixed, part of the model identity (changing them would + // invalidate downstream eval comparisons). // ----------------------------------------------------------------------- - /** Lines read per language to determine dominant script. */ - private static final int DEFAULT_SCRIPT_SAMPLE_LINES = 2_000; - - /** - * UTF-8 bytes loaded per script group for entropy estimation. - * Budget is spread evenly across languages in the group. - * 200KB is enough to observe the bigram distribution reliably. - */ - private static final long ENTROPY_SAMPLE_BYTES = 200_000L; - - /** - * Total UTF-8 byte budget across all script groups. Divided proportionally - * by bigram entropy after the sampling phase. 50MB gives ~1–3MB per script - * on average across 34 groups; scale up for production runs. - */ - private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L; - - /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ - private static final int DEFAULT_MIN_BYTES = 50; - - /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */ - private static final double DEFAULT_MAX_PUNC_FRAC = 0.30; - /** Fraction of sentences written to each split (train / dev / test = 80/10/10). 
*/ private static final double TRAIN_FRAC = 0.80; private static final double DEV_FRAC = 0.10; // remaining (1 - TRAIN_FRAC - DEV_FRAC) goes to the test split - /** - * Minimum number of sentences that must land in the dev split for a script to be - * included in the model. Scripts below this floor have too few samples to reliably - * estimate calibration statistics (mu/sigma), which produces noisy z-scores and - * inflated false positive rates. With DEV_FRAC=0.10 the effective minimum total - * sentence count is minDevSentences / DEV_FRAC (default: 5,000 total sentences). - */ - private static final int DEFAULT_MIN_DEV_SENTENCES = 500; + // All other durable parameters live in JunkDetectorTrainingConfig. This + // tool deliberately does not accept CLI overrides for those values; see + // the rejection logic in main() below. // ----------------------------------------------------------------------- // Entry point @@ -129,13 +102,22 @@ public class BuildJunkTrainingData { public static void main(String[] args) throws IOException { Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "data"); Path outputDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect"); - int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES; - long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES; - int minBytes = DEFAULT_MIN_BYTES; - double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC; - int seed = 42; boolean dryRun = false; - int minDevSentences = DEFAULT_MIN_DEV_SENTENCES; + + // Bind config-controlled values into local variables. These are + // read-only from this point on; any attempt to override them via CLI + // is rejected below. 
+ long totalBudgetBytes = JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES; + long perLanguageCapBytes = JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES; + int minBytes = JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE; + double maxPuncFrac = JunkDetectorTrainingConfig.MAX_PUNC_FRAC; + double minTargetScriptFrac = JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC; + int minDevSentences = JunkDetectorTrainingConfig.MIN_DEV_SENTENCES; + int scriptSampleLines = JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES; + int seed = JunkDetectorTrainingConfig.SEED; + java.util.Set dropScripts = JunkDetectorTrainingConfig.DROP_SCRIPTS; + Map scriptBudgetOverrides = + JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES; for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -145,26 +127,25 @@ public static void main(String[] args) throws IOException { case "--output-dir": outputDir = Paths.get(args[++i]); break; - case "--script-sample-lines": - scriptSampleLines = Integer.parseInt(args[++i]); + case "--dry-run": + dryRun = true; break; + // Durable parameters are config-controlled. Refuse any CLI + // override so that a model file's identity always matches the + // committed config. + case "--script-sample-lines": case "--total-budget-bytes": - totalBudgetBytes = Long.parseLong(args[++i]); - break; + case "--per-language-cap-bytes": case "--min-bytes": - minBytes = Integer.parseInt(args[++i]); - break; case "--max-punc-frac": - maxPuncFrac = Double.parseDouble(args[++i]); - break; + case "--min-target-script-frac": case "--seed": - seed = Integer.parseInt(args[++i]); - break; case "--min-dev-sentences": - minDevSentences = Integer.parseInt(args[++i]); - break; - case "--dry-run": - dryRun = true; + case "--drop-scripts": + case "--script-budget-override": + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." 
+ + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); break; default: System.err.println("Unknown argument: " + args[i]); @@ -174,15 +155,26 @@ public static void main(String[] args) throws IOException { } System.out.println("=== BuildJunkTrainingData ==="); - System.out.println(" data-dir: " + dataDir); - System.out.println(" output-dir: " + outputDir); - System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", + System.out.println(" data-dir: " + dataDir); + System.out.println(" output-dir: " + outputDir); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", totalBudgetBytes, totalBudgetBytes / 1_000_000.0); - System.out.printf( " min-bytes: %d%n", minBytes); - System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); - System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", + System.out.printf( " per-language-cap: %,d (%.1f MB)%n", + perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0); + System.out.printf( " min-bytes: %d%n", minBytes); + System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); + System.out.printf( " min-target-script-frac: %.2f%n", minTargetScriptFrac); + System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", minDevSentences, (int)(minDevSentences / DEV_FRAC)); - System.out.println(" dry-run: " + dryRun); + System.out.printf( " seed: %d%n", seed); + if (!dropScripts.isEmpty()) { + System.out.println(" drop-scripts: " + dropScripts); + } + if (!scriptBudgetOverrides.isEmpty()) { + System.out.println(" script-budget-override: " + scriptBudgetOverrides); + } + System.out.println(" dry-run: " + dryRun); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); @@ -208,6 +200,15 @@ public static void main(String[] args) throws IOException { System.out.printf(" %-12s → %s%n", lang, script); } } + + if (!dropScripts.isEmpty()) { + for (String s : dropScripts) { + if 
(scriptGroups.remove(s) != null) { + System.out.printf(" DROP script: %s%n", s); + } + } + } + System.out.printf("%n → %d languages, %d script groups%n", langToScript.size(), scriptGroups.size()); @@ -222,7 +223,8 @@ public static void main(String[] args) throws IOException { String script = entry.getKey(); List langDirs = entry.getValue(); - long perLangSampleBytes = Math.max(ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); + long perLangSampleBytes = Math.max( + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); List sample = new ArrayList<>(); for (Path langDir : langDirs) { loadSentences(langDir, perLangSampleBytes, minBytes, maxPuncFrac, sample); @@ -246,9 +248,25 @@ public static void main(String[] args) throws IOException { Map scriptBudget = new TreeMap<>(); for (Map.Entry e : scriptEntropy.entrySet()) { long budget = (long) (totalBudgetBytes * e.getValue() / totalEntropy); + Long override = scriptBudgetOverrides.get(e.getKey()); + if (override != null) { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)" + + " [OVERRIDE: was %,d (%.1f MB)]%n", + e.getKey(), e.getValue(), override, override / 1_000_000.0, + budget, budget / 1_000_000.0); + budget = override; + } else { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n", + e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } scriptBudget.put(e.getKey(), budget); - System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n", - e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } + // Warn about overrides for scripts that aren't in the bucket set. 
+ for (String k : scriptBudgetOverrides.keySet()) { + if (!scriptBudget.containsKey(k)) { + System.err.printf("WARNING: --script-budget-override for %s ignored" + + " (script not in bucket set)%n", k); + } } if (dryRun) { @@ -273,8 +291,16 @@ public static void main(String[] args) throws IOException { String script = budgetEntry.getKey(); long budget = budgetEntry.getValue(); List langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(budget / langDirs.size(), 1L); + // Apply per-language cap on top of the even split, but only for + // multi-language buckets. For single-language scripts (e.g. KHMER, + // HANGUL), the cap would needlessly limit a bucket that has only + // one source; let it consume its full budget instead. + long capPerLang = langDirs.size() > 1 + ? Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -282,8 +308,10 @@ public static void main(String[] args) throws IOException { long remaining = budget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; if (langBytes > 0) { System.out.printf(" %-12s %-20s +%,d bytes%n", @@ -327,7 +355,11 @@ public static void main(String[] args) throws IOException { long newBudget = budget + extra; List langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(newBudget / langDirs.size(), 1L); + long capPerLang = langDirs.size() > 1 + ? 
Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -335,8 +367,10 @@ public static void main(String[] args) throws IOException { long remaining = newBudget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; } if (!sentences.isEmpty()) { @@ -415,6 +449,21 @@ public static void main(String[] args) throws IOException { System.out.println("Done."); } + /** + * Parses a script-bucket name (e.g. {@code "HAN"}) into a + * {@link Character.UnicodeScript}, or returns {@code null} if the name + * does not correspond to a real script (e.g. {@code "COMMON"} or any + * future synthetic bucket). Used by the corpus builder to look up the + * target script for the {@code min-target-script-frac} filter. + */ + static Character.UnicodeScript parseUnicodeScript(String name) { + try { + return Character.UnicodeScript.valueOf(name); + } catch (IllegalArgumentException e) { + return null; + } + } + // ----------------------------------------------------------------------- // Script detection // ----------------------------------------------------------------------- @@ -531,6 +580,22 @@ static double computeBigramEntropy(List sentences) { */ static long loadSentences(Path langDir, long maxBytes, int minBytes, double maxPuncFrac, List result) { + // Backwards-compatible overload: no target-script filter. + return loadSentences(langDir, maxBytes, minBytes, maxPuncFrac, + null, 0.0, result); + } + + /** + * Same as the 5-arg overload, but additionally drops sentences whose + * fraction of {@code targetScript} codepoints (relative to all non- + * COMMON/INHERITED codepoints) is below {@code minTargetScriptFrac}. 
+ * Passing {@code targetScript == null} disables the target-script filter. + */ + static long loadSentences(Path langDir, long maxBytes, int minBytes, + double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac, + List result) { long bytesLoaded = 0; for (String filename : new String[]{"sentences_wikipedia.txt", "sentences_madlad.txt"}) { if (bytesLoaded >= maxBytes) { @@ -553,7 +618,8 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes, if (text.isEmpty()) { continue; } - String filtered = filterSentence(text, minBytes, maxPuncFrac); + String filtered = filterSentence(text, minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac); if (filtered != null) { int sentBytes = filtered.getBytes(StandardCharsets.UTF_8).length; result.add(filtered); @@ -577,6 +643,18 @@ static long loadSentences(Path langDir, long maxBytes, int minBytes, * @return the normalised sentence, or {@code null} if it should be discarded */ static String filterSentence(String text, int minBytes, double maxPuncFrac) { + return filterSentence(text, minBytes, maxPuncFrac, null, 0.0); + } + + /** + * Same as the 3-arg overload, but additionally rejects sentences whose + * fraction of {@code targetScript} codepoints (over non-COMMON/INHERITED + * codepoints) is below {@code minTargetScriptFrac}. If {@code + * targetScript == null} the target-script filter is skipped. 
+ */ + static String filterSentence(String text, int minBytes, double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac) { if (text.indexOf('\uFFFD') >= 0) { return null; } @@ -586,17 +664,34 @@ static String filterSentence(String text, int minBytes, double maxPuncFrac) { } int cpCount = 0; int puncCount = 0; + int scriptCpTotal = 0; + int scriptCpMatching = 0; for (int i = 0; i < text.length(); ) { int cp = text.codePointAt(i); cpCount++; if (cp >= 0x21 && cp <= 0x7E && !Character.isLetter(cp)) { puncCount++; } + if (targetScript != null) { + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + scriptCpTotal++; + if (s == targetScript) { + scriptCpMatching++; + } + } + } i += Character.charCount(cp); } if (cpCount > 0 && (double) puncCount / cpCount > maxPuncFrac) { return null; } + if (targetScript != null && scriptCpTotal > 0 + && (double) scriptCpMatching / scriptCpTotal < minTargetScriptFrac) { + return null; + } return text; } @@ -624,23 +719,15 @@ private static void writeGzipped(Path path, List lines) throws IOExcepti private static void printUsage() { System.err.println("Usage: BuildJunkTrainingData [options]"); - System.err.println(" --data-dir MADLAD data root" + System.err.println(" --data-dir MADLAD data root" + " (default: ~/datasets/madlad/data)"); - System.err.println(" --output-dir Output directory" + System.err.println(" --output-dir Output directory" + " (default: ~/datasets/madlad/junkdetect)"); - System.err.println(" --script-sample-lines N Lines per language for script" - + " detection (default: 2000)"); - System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all" - + " scripts (default: 50000000)"); - System.err.println(" --min-bytes N Min UTF-8 bytes per sentence" - + " (default: 50)"); - System.err.println(" --max-punc-frac F Max ASCII punct fraction" - + " 
(default: 0.30)"); - System.err.println(" --min-dev-sentences N Min sentences in dev split for a" - + " script to be included (default: 500). Scripts below this floor" - + " have unreliable calibration and inflated FPR."); - System.err.println(" --seed N Random seed (default: 42)"); - System.err.println(" --dry-run Detect scripts + show budget," - + " skip file writing"); + System.err.println(" --dry-run Detect scripts + show budget," + + " skip file writing."); + System.err.println(); + System.err.println("All other training/build parameters (budgets, filters, dropped" + + " scripts, seed, etc.) are fixed in JunkDetectorTrainingConfig and tracked" + + " in git. Edit that file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java new file mode 100644 index 00000000000..b287012ddc0 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +/** + * Diagnostic tool for sizing a per-script F1 bigram store (v7 design). + * + *

Walks every {@code *.train.gz} in {@code dataDir}, treating each file as + * one script's corpus. Counts (cpA, cpB) codepoint-pair frequencies and + * reports, per script: + * + *

    + *
  • total bigram occurrences (N) + *
  • distinct pair count (U) + *
  • singletons — pairs seen exactly once (these are usually the + * worst candidates to keep; they often reflect OCR noise / rare + * proper nouns and inflate U without helping discrimination) + *
  • "effective" pair count = pairs seen at least {@code MIN_COUNT} times + *
  • coverage curve: how many of the top-N most-frequent pairs are needed + * to cover {x = 50, 75, 90, 95, 99, 99.9}% of all bigram occurrences + *
  • estimated v7 model size for several candidate cutoffs, assuming + * 2.25 bytes/pair (MPHF + 8-bit fingerprint + 8-bit value) + * and 1.3 bytes/pair (MPHF + 8-bit value, no fingerprint) + *
+ * + *

Usage: + *

+ *   mvn -pl tika-ml/tika-ml-junkdetect exec:java \
+ *       -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.CountPerScriptBigrams \
+ *       -Dexec.args="/path/to/junkdetect"
+ * 
+ * + *

No model output; this is read-only telemetry to inform the v7 sizing + * decision (see {@code 20260514-junk-retrain-v6.md}). + */ +public final class CountPerScriptBigrams { + + private static final int[] COVERAGE_PCT = {50, 75, 90, 95, 99}; + private static final double[] COVERAGE_FRAC_HI = {0.999}; + + /** Cutoffs reported in the size-estimate table. */ + private static final int[] MIN_COUNT_CUTOFFS = {1, 2, 3, 5, 10}; + + /** Bytes per retained pair for each candidate storage scheme. */ + private static final double[] BYTES_PER_PAIR_SCHEMES = {1.3, 2.25, 6.25}; + private static final String[] SCHEME_NAMES = { + "MPHF+val(1.3B)", "MPHF+fp+val(2.25B)", "open-addr+key(6.25B)"}; + + private CountPerScriptBigrams() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println( + "Usage: CountPerScriptBigrams [topK-per-script]"); + System.exit(1); + } + Path dataDir = Paths.get(args[0]); + int topK = args.length >= 2 ? Integer.parseInt(args[1]) : 0; + + List trainFiles = new ArrayList<>(); + try (Stream s = Files.list(dataDir)) { + s.filter(p -> p.getFileName().toString().endsWith(".train.gz")) + .sorted() + .forEach(trainFiles::add); + } + if (trainFiles.isEmpty()) { + System.err.println("ERROR: no *.train.gz files in " + dataDir); + System.exit(1); + } + + System.out.printf("Found %d *.train.gz files in %s%n%n", + trainFiles.size(), dataDir); + System.out.printf( + "%-22s %12s %12s %12s %12s | %s%n", + "script", "total_N", "distinct_U", "singletons", + "U(>=10)", "coverage: pairs needed for [50,75,90,95,99,99.9]%"); + System.out.println(repeat('-', 140)); + + long grandTotalN = 0; + long grandTotalU = 0; + long grandTotalUge2 = 0; + long grandTotalUge10 = 0; + + // Per-script size accumulators for the global-size summary at the end. 
+ Map perScriptStats = new HashMap<>(); + + for (Path trainFile : trainFiles) { + String fname = trainFile.getFileName().toString(); + String script = fname.substring(0, fname.length() - ".train.gz".length()) + .toUpperCase(); + + HashMap pairCounts = new HashMap<>(1 << 16); + long totalN = 0; + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(trainFile)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + int prevCp = -1; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + if (prevCp >= 0) { + long key = packPair(prevCp, cp); + long[] c = pairCounts.get(key); + if (c == null) { + pairCounts.put(key, new long[]{1L}); + } else { + c[0]++; + } + totalN++; + } + prevCp = cp; + } + } + } + + int distinctU = pairCounts.size(); + + long[] counts = new long[distinctU]; + int idx = 0; + for (long[] c : pairCounts.values()) { + counts[idx++] = c[0]; + } + // Sort descending for coverage curve. + java.util.Arrays.sort(counts); + // Reverse in place. + for (int i = 0, j = counts.length - 1; i < j; i++, j--) { + long t = counts[i]; + counts[i] = counts[j]; + counts[j] = t; + } + + int singletons = 0; + int uGe2 = 0; + int uGe10 = 0; + for (long c : counts) { + if (c == 1) singletons++; + if (c >= 2) uGe2++; + if (c >= 10) uGe10++; + } + + // Coverage thresholds: minimum k such that sum(counts[0..k-1]) / N >= t. 
+ int[] coveragePairs = new int[COVERAGE_PCT.length + COVERAGE_FRAC_HI.length]; + double[] thresholds = new double[coveragePairs.length]; + for (int i = 0; i < COVERAGE_PCT.length; i++) { + thresholds[i] = COVERAGE_PCT[i] / 100.0; + } + for (int i = 0; i < COVERAGE_FRAC_HI.length; i++) { + thresholds[COVERAGE_PCT.length + i] = COVERAGE_FRAC_HI[i]; + } + long running = 0; + int tIdx = 0; + for (int k = 0; k < counts.length && tIdx < thresholds.length; k++) { + running += counts[k]; + while (tIdx < thresholds.length + && (double) running / totalN >= thresholds[tIdx]) { + coveragePairs[tIdx++] = k + 1; + } + } + // Fill any unreached thresholds with U (means: never reached, took all). + for (; tIdx < thresholds.length; tIdx++) { + coveragePairs[tIdx] = distinctU; + } + + StringBuilder cov = new StringBuilder(); + for (int i = 0; i < coveragePairs.length; i++) { + if (i > 0) cov.append(", "); + cov.append(String.format("%,d", coveragePairs[i])); + } + + System.out.printf("%-22s %,12d %,12d %,12d %,12d | %s%n", + script.toLowerCase(), + totalN, distinctU, singletons, uGe10, + cov.toString()); + + // Per-script size table. + if (topK > 0 || true) { + long[] sizeStats = new long[ + 2 + MIN_COUNT_CUTOFFS.length + BYTES_PER_PAIR_SCHEMES.length]; + sizeStats[0] = totalN; + sizeStats[1] = distinctU; + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + int minC = MIN_COUNT_CUTOFFS[i]; + int kept = 0; + for (long c : counts) { + if (c >= minC) kept++; + else break; + } + sizeStats[2 + i] = kept; + } + perScriptStats.put(script.toLowerCase(), sizeStats); + } + + // Per-script top-K dump if requested. 
+ if (topK > 0) { + System.out.printf(" top %d pairs in %s:%n", topK, script.toLowerCase()); + List> sorted = new ArrayList<>(pairCounts.entrySet()); + sorted.sort((a, b) -> Long.compare(b.getValue()[0], a.getValue()[0])); + for (int i = 0; i < Math.min(topK, sorted.size()); i++) { + Map.Entry e = sorted.get(i); + long k = e.getKey(); + int cpA = (int) (k >>> 24); + int cpB = (int) (k & 0xFFFFFFL); + System.out.printf(" U+%04X U+%04X (%c %c) %,d%n", + cpA, cpB, + safePrint(cpA), safePrint(cpB), + e.getValue()[0]); + } + } + + grandTotalN += totalN; + grandTotalU += distinctU; + grandTotalUge2 += uGe2; + grandTotalUge10 += uGe10; + } + + System.out.println(repeat('-', 140)); + System.out.printf("%-22s %,12d %,12d %12s %,12d%n%n", + "TOTAL", grandTotalN, grandTotalU, + "-", grandTotalUge10); + + // ------------------------------------------------------------------ + // Cutoff vs. model-size summary + // ------------------------------------------------------------------ + System.out.println("=== Model-size estimates by min-count cutoff and storage scheme ==="); + System.out.println("(sum of retained pairs across all scripts × bytes-per-pair)"); + System.out.println(); + System.out.printf("%-12s", "cutoff"); + for (String name : SCHEME_NAMES) { + System.out.printf(" %20s", name); + } + System.out.printf(" %20s%n", "retained_pairs"); + System.out.println(repeat('-', 12 + (SCHEME_NAMES.length + 1) * 21)); + + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + long retained = 0; + for (long[] stats : perScriptStats.values()) { + retained += stats[2 + i]; + } + System.out.printf("min_count>=%-2d", MIN_COUNT_CUTOFFS[i]); + for (double bpp : BYTES_PER_PAIR_SCHEMES) { + double bytes = retained * bpp; + System.out.printf(" %18s ", humanBytes(bytes)); + } + System.out.printf(" %,20d%n", retained); + } + + System.out.println(); + System.out.println("Per-script pair counts retained at each cutoff:"); + System.out.printf("%-22s", "script"); + for (int c : MIN_COUNT_CUTOFFS) { 
+ System.out.printf(" %12s", ">=" + c); + } + System.out.println(); + List> sortedScripts = + new ArrayList<>(perScriptStats.entrySet()); + sortedScripts.sort(Comparator.comparingLong( + (Map.Entry e) -> -e.getValue()[1])); + for (Map.Entry e : sortedScripts) { + System.out.printf("%-22s", e.getKey()); + for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) { + System.out.printf(" %,12d", e.getValue()[2 + i]); + } + System.out.println(); + } + } + + /** Pack two codepoints (each up to 21 bits) into a single long. */ + private static long packPair(int cpA, int cpB) { + return ((long) cpA << 24) | (cpB & 0xFFFFFFL); + } + + private static char safePrint(int cp) { + if (cp < 0x20 || cp == 0x7F || !Character.isDefined(cp)) { + return '.'; + } + if (Character.charCount(cp) != 1) { + return '?'; + } + return (char) cp; + } + + private static String repeat(char c, int n) { + char[] buf = new char[n]; + java.util.Arrays.fill(buf, c); + return new String(buf); + } + + private static String humanBytes(double bytes) { + if (bytes < 1024) return String.format("%.0f B", bytes); + if (bytes < 1024 * 1024) return String.format("%.1f KB", bytes / 1024.0); + if (bytes < 1024L * 1024 * 1024) return String.format("%.2f MB", bytes / (1024.0 * 1024)); + return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024)); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java new file mode 100644 index 00000000000..36f3a897a01 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/DebugScriptRuns.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.ml.chardetect.HtmlByteStripper; +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Diagnostic: replicate JunkDetector.buildScriptRuns exactly on a fixture + * and print every run. Helps explain why score() returns UNKNOWN. + * + *

Usage: + *

+ *   ./mvnw exec:java -pl tika-ml/tika-ml-junkdetect \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.DebugScriptRuns \
+ *     -Dexec.args="--file ~/data/regression/.../AIT5... --charset GB18030 --bytes 1024"
+ * 
+ */ +public class DebugScriptRuns { + + // Mirror of JunkDetector.SCRIPT_MODEL_FALLBACK — keep in sync if production changes. + private static final Map SCRIPT_MODEL_FALLBACK = Map.of( + "HIRAGANA", "HAN", + "KATAKANA", "HAN"); + + public static void main(String[] args) throws IOException { + Path file = null; + String charset = "GB18030"; + int probeBytes = 1024; + boolean strip = true; + boolean expand = true; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--file": + file = Paths.get(expandHome(args[++i])); + break; + case "--charset": + charset = args[++i]; + break; + case "--bytes": + probeBytes = Integer.parseInt(args[++i]); + break; + case "--no-strip": + strip = false; + break; + case "--no-expand": + expand = false; + break; + default: + System.err.println("unknown: " + args[i]); + System.exit(1); + } + } + if (file == null) { + System.err.println("Required: --file "); + System.exit(1); + } + byte[] raw = Files.readAllBytes(file); + byte[] forDecode = raw; + if (strip) { + byte[] dst = new byte[raw.length]; + HtmlByteStripper.Result r = HtmlByteStripper.strip(raw, 0, raw.length, dst, 0); + if (r.tagCount > 0 && r.length > 0) { + forDecode = Arrays.copyOf(dst, r.length); + } + System.err.println("After strip: " + forDecode.length + " bytes (was " + raw.length + ")"); + } + if (forDecode.length > probeBytes) { + forDecode = Arrays.copyOf(forDecode, probeBytes); + } + System.err.println("Probe: " + forDecode.length + " bytes decoded as " + charset); + + String decoded = new String(forDecode, Charset.forName(charset)); + if (expand) { + decoded = expandEntities(decoded); + } + System.err.println("Decoded codepoints: " + decoded.codePointCount(0, decoded.length())); + + List runs = buildScriptRuns(decoded); + System.err.println("Built " + runs.size() + " script runs."); + + // Mirror JunkDetector.scoreText filter and report what would be scored. 
+ JunkDetector detector = JunkDetector.loadFromClasspath(); + java.util.Set modeled = detector.knownScripts(); + + TreeMap totals = new TreeMap<>(); // script -> {chars, bytes, runs, modeled?} + int totalScored = 0; + int totalSkippedShort = 0; + int totalSkippedUnmodeled = 0; + long totalBytesScored = 0; + + for (Run r : runs) { + byte[] runUtf8 = r.text.getBytes(StandardCharsets.UTF_8); + boolean isModeled = modeled.contains(r.script); + boolean longEnough = runUtf8.length >= 2; + totals.merge(r.script, new int[]{r.text.codePointCount(0, r.text.length()), + runUtf8.length, 1, isModeled ? 1 : 0}, + (a, b) -> new int[]{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3]}); + if (!isModeled) { + totalSkippedUnmodeled++; + } else if (!longEnough) { + totalSkippedShort++; + } else { + totalScored++; + totalBytesScored += runUtf8.length; + } + } + + System.out.println("Script roll-up (script: cps, utf8_bytes, runs, modeled):"); + for (Map.Entry e : totals.entrySet()) { + int[] v = e.getValue(); + System.out.printf(" %-15s cps=%-5d bytes=%-6d runs=%-4d modeled=%s%n", + e.getKey(), v[0], v[1], v[2], v[3] == 1 ? "Y" : "N"); + } + System.out.println(); + System.out.println("Scoring filter outcome:"); + System.out.println(" runs scored: " + totalScored); + System.out.println(" runs skipped (short): " + totalSkippedShort); + System.out.println(" runs skipped (unmod): " + totalSkippedUnmodeled); + System.out.println(" total bytes scored: " + totalBytesScored); + + // The bug: computeF1MeanLogP returns NaN when String.length() < 2. + // String.length() counts UTF-16 code units, but the outer filter uses + // UTF-8 bytes. A single CJK char = 1 UTF-16 unit but 3 UTF-8 bytes, + // so it passes the outer filter and produces NaN inside. 
+ int nanCausing = 0; + for (Run r : runs) { + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + if (u.length >= 2 && r.text.length() < 2 && modeled.contains(r.script)) { + nanCausing++; + } + } + System.out.println(); + System.out.println("NaN-causing runs (utf8≥2 but utf16<2, modeled): " + nanCausing); + + TextQualityScore score = detector.score(decoded); + System.out.println(" detector.score() z: " + + (score.isUnknown() ? "UNKNOWN(" + score.getDominantScript() + ")" + : String.format("%.3f (script=%s)", score.getZScore(), score.getDominantScript()))); + + // Print the longest 10 runs so we can see what's actually in there. + System.out.println(); + System.out.println("Longest 10 runs:"); + runs.sort((a, b) -> Integer.compare(b.text.length(), a.text.length())); + for (int i = 0; i < Math.min(10, runs.size()); i++) { + Run r = runs.get(i); + byte[] u = r.text.getBytes(StandardCharsets.UTF_8); + String preview = r.text.length() > 30 + ? r.text.substring(0, 30) + "…" : r.text; + preview = preview.replace("\n", "\\n").replace("\r", "\\r"); + System.out.printf(" %-15s cps=%-4d bytes=%-4d preview=%s%n", + r.script, r.text.codePointCount(0, r.text.length()), u.length, preview); + } + } + + // Exact mirror of JunkDetector.buildScriptRuns (private, copied here for diagnosis). 
+ private static List buildScriptRuns(String text) { + List runs = new ArrayList<>(); + String currentScript = null; + StringBuilder currentText = new StringBuilder(); + StringBuilder leadingCommon = new StringBuilder(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + if (currentScript != null) { + currentText.appendCodePoint(cp); + } else { + leadingCommon.appendCodePoint(cp); + } + continue; + } + String scriptName = SCRIPT_MODEL_FALLBACK.getOrDefault(s.name(), s.name()); + if (!scriptName.equals(currentScript)) { + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + currentScript = scriptName; + currentText = new StringBuilder(); + if (leadingCommon.length() > 0) { + currentText.append(leadingCommon); + leadingCommon.setLength(0); + } + } + currentText.appendCodePoint(cp); + } + if (currentScript != null && currentText.length() > 0) { + runs.add(new Run(currentScript, currentText.toString())); + } + return runs; + } + + private static final class Run { + final String script; + final String text; + Run(String s, String t) { + this.script = s; + this.text = t; + } + } + + private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private static String expandEntities(String in) { + String s = NUM_DEC.matcher(in).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1)); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // leave 
unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NUM_HEX.matcher(s).replaceAll(mr -> { + try { + int cp = Integer.parseInt(mr.group(1), 16); + if (cp >= 0 && cp <= 0x10FFFF) { + return Matcher.quoteReplacement(new String(Character.toChars(cp))); + } + } catch (NumberFormatException ignored) { + // leave unchanged + } + return Matcher.quoteReplacement(mr.group()); + }); + s = NAMED.matcher(s).replaceAll(mr -> { + switch (mr.group(1)) { + case "amp": return "&"; + case "lt": return "<"; + case "gt": return ">"; + case "quot": return "\""; + case "apos": return "'"; + case "nbsp": return " "; + case "copy": return "©"; + case "reg": return "®"; + default: return Matcher.quoteReplacement(mr.group()); + } + }); + return s; + } + + private static String expandHome(String s) { + return s.startsWith("~/") ? System.getProperty("user.home") + s.substring(1) : s; + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java index 6b6057fc34f..e0b4bc0ae10 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkDetector.java @@ -470,7 +470,7 @@ private static void writeCompareEval(JunkDetector detector, sourceCodec, asSource, wrongCodec, asWrong); deltas.add(result.delta()); - if ("A".equals(result.winner())) nCorrect++; + if (sourceCodec.equals(result.winner())) nCorrect++; } if (deltas.isEmpty()) continue; diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java new file mode 100644 index 00000000000..30d175a4b12 --- /dev/null +++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/EvalJunkOnCharsetDevtest.java @@ -0,0 +1,688 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.DataInputStream; +import java.io.EOFException; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Eval harness: for each labeled charset in 
{@code ~/data/charsets/devtest/}, + * decode under its true charset (clean) and under a curated set of wrong + * charsets (mojibake), score with {@link JunkDetector}, report margin + * statistics per (labeled_charset × wrong_charset × source-byte-length). + * + *

Devtest file format: gzip → repeated {@code [u16 big-endian length, + * length bytes]} records, where the bytes are real text encoded in the + * labeled charset. Same format the charset trainer consumes. + * + *

Output (TSVs): + *

+ * <ul>
+ *   <li><b>detail.tsv</b>: one row per (labeled_cs, script, wrong_cs, length).
+ *       Columns: n, mean_clean_z, mean_mojibake_z, cohens_d, mean_margin,
+ *       p5_margin, p50_margin, fpr, tpr.</li>
+ *   <li><b>summary.tsv</b>: macro-averaged across wrong charsets, per
+ *       (script, length). The headline "is this script in trouble?" view.</li>
+ *   <li><b>script_pivot.tsv</b>: per-script rollup across all lengths +
+ *       wrong charsets. Single-number-per-script view for spot inversion.</li>
+ * </ul>
+ * + *

"Margin" is the per-record paired difference {@code clean_z - + * mojibake_z}. Mean margin and 5th-percentile margin are the + * margin-maximization metrics the v6 retrain is optimizing for. Cohen's d + * is the independent-distribution analog (kept for compatibility with the + * existing {@link EvalJunkDetector} schema). + * + *

Usage: + *

+ *   ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.EvalJunkOnCharsetDevtest \
+ *     -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v5-baseline"
+ * 
+ */ +public class EvalJunkOnCharsetDevtest { + + /** + * Curated set of wrong charsets to cross-decode every labeled charset + * against. Chosen to span the common real-world mojibake families: + * Western Latin (cp1252, ISO-8859-1, MacRoman), CJK over-claim (GB18030, + * Big5-HKSCS, Shift_JIS), Cyrillic (KOI8-R, cp1251), Arabic (cp1256), + * EBCDIC over-claim (IBM424), DOS Latin (IBM850), and UTF-8 (catches + * non-UTF8 bytes as replacement-character garbage). + */ + private static final List DEFAULT_WRONG_CHARSETS = List.of( + "windows-1252", "ISO-8859-1", "x-MacRoman", + "GB18030", "Big5-HKSCS", "Shift_JIS", + "KOI8-R", "windows-1251", + "windows-1256", "IBM424", + "IBM850", "UTF-8" + ); + + /** Source-byte length buckets to slice records into. */ + private static final int[] DEFAULT_LENGTHS = {20, 50, 100, 200, 500, 1000}; + + /** Cap on records loaded per labeled-charset file. */ + private static final int DEFAULT_MAX_RECORDS = 2000; + + /** Threshold for FPR/TPR reporting; matches EvalJunkDetector default. */ + private static final float DEFAULT_THRESHOLD = -2.0f; + + /** Minimum number of paired (clean, mojibake) samples per cell to emit a row. 
*/ + private static final int MIN_SAMPLES_PER_CELL = 30; + + public static void main(String[] args) throws IOException { + Path devtestDir = Paths.get(System.getProperty("user.home"), + "data", "charsets", "devtest"); + Path outputDir = Paths.get("/tmp/junkdetect-eval"); + Path modelPath = null; + int maxRecords = DEFAULT_MAX_RECORDS; + int[] lengths = DEFAULT_LENGTHS; + float threshold = DEFAULT_THRESHOLD; + List wrongCharsets = DEFAULT_WRONG_CHARSETS; + List labeledFilter = null; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--devtest-dir": + devtestDir = Paths.get(args[++i]); + break; + case "--output-dir": + outputDir = Paths.get(args[++i]); + break; + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--max-records": + maxRecords = Integer.parseInt(args[++i]); + break; + case "--threshold": + threshold = Float.parseFloat(args[++i]); + break; + case "--lengths": + lengths = Arrays.stream(args[++i].split(",")) + .mapToInt(Integer::parseInt).toArray(); + break; + case "--wrong-charsets": + wrongCharsets = Arrays.asList(args[++i].split(",")); + break; + case "--only": + labeledFilter = Arrays.asList(args[++i].split(",")); + break; + default: + System.err.println("Unknown arg: " + args[i]); + printUsage(); + System.exit(1); + } + } + + if (!Files.isDirectory(devtestDir)) { + System.err.println("ERROR: devtest-dir not found: " + devtestDir); + System.exit(1); + } + Files.createDirectories(outputDir); + + JunkDetector detector = modelPath != null + ? JunkDetector.loadFromPath(modelPath) + : JunkDetector.loadFromClasspath(); + + System.err.println("=== EvalJunkOnCharsetDevtest ==="); + System.err.println(" devtest-dir: " + devtestDir); + System.err.println(" output-dir: " + outputDir); + System.err.println(" model: " + (modelPath != null ? 
modelPath : "classpath default")); + System.err.println(" model version: " + detector.getModelVersion()); + System.err.println(" max-records: " + maxRecords); + System.err.println(" lengths: " + Arrays.toString(lengths)); + System.err.println(" threshold: " + threshold); + System.err.println(" wrong-cs: " + wrongCharsets); + + // Resolve wrong charsets (skip any the JVM doesn't have) + Map resolvedWrong = new LinkedHashMap<>(); + for (String name : wrongCharsets) { + Charset cs = tryGetCharset(name); + if (cs == null) { + System.err.println(" WARN: wrong-charset unavailable: " + name); + continue; + } + resolvedWrong.put(name, cs); + } + + List files; + try (Stream stream = Files.list(devtestDir)) { + files = stream + .filter(p -> p.getFileName().toString().endsWith(".bin.gz")) + .sorted() + .toList(); + } + if (files.isEmpty()) { + System.err.println("ERROR: no *.bin.gz files in " + devtestDir); + System.exit(1); + } + + Path detailPath = outputDir.resolve("detail.tsv"); + Path summaryPath = outputDir.resolve("summary.tsv"); + Path pivotPath = outputDir.resolve("script_pivot.tsv"); + + List allRows = new ArrayList<>(); + + try (PrintWriter detail = new PrintWriter( + Files.newBufferedWriter(detailPath, StandardCharsets.UTF_8))) { + + detail.println("labeled_cs\tscript\twrong_cs\tlength\tn" + + "\tmean_clean_z\tmean_mojibake_z\tcohens_d" + + "\tmean_margin\tp5_margin\tp50_margin" + + "\tfpr\ttpr"); + + for (Path file : files) { + String labeledName = filenameToCharsetName(file); + if (labeledFilter != null && !labeledFilter.contains(labeledName)) { + continue; + } + Charset labeled = tryGetCharset(labeledName); + if (labeled == null) { + System.err.println(" SKIP: labeled charset unavailable: " + labeledName); + continue; + } + + List records = readRecords(file, maxRecords); + if (records.size() < MIN_SAMPLES_PER_CELL) { + System.err.printf(" SKIP %s: only %d records%n", + labeledName, records.size()); + continue; + } + + System.err.printf("%n--- %s (%d records) 
---%n", + labeledName, records.size()); + + for (int len : lengths) { + List slices = sliceToLength(records, len); + if (slices.size() < MIN_SAMPLES_PER_CELL) { + continue; + } + + // Decode all slices under labeled (clean) once + List cleanTexts = decodeAll(slices, labeled); + List cleanZs = scoreAll(detector, cleanTexts); + if (cleanZs.size() < MIN_SAMPLES_PER_CELL) { + continue; + } + + // Detect script from a sample of the clean decoded text + String script = detectDominantScript( + cleanTexts.get(cleanTexts.size() / 2)); + + for (Map.Entry entry : resolvedWrong.entrySet()) { + String wrongName = entry.getKey(); + Charset wrongCs = entry.getValue(); + if (equalCharset(labeled, wrongCs)) { + continue; // can't be its own mojibake + } + + List mojiTexts = decodeAll(slices, wrongCs); + // Pair cleanTexts[i] with mojiTexts[i] by source record + Row row = scorePairs(detector, script, labeledName, + wrongName, len, cleanTexts, mojiTexts, + cleanZs, threshold); + if (row == null) { + continue; + } + allRows.add(row); + detail.println(row.toTsv()); + } + detail.flush(); + System.err.printf(" len=%4d n_clean=%d cells=%d%n", + len, cleanZs.size(), + allRows.stream() + .filter(r -> r.labeledCs.equals(labeledName) + && r.length == len) + .count()); + } + } + } + + writeSummary(summaryPath, allRows, lengths); + writeScriptPivot(pivotPath, allRows); + + System.err.println("\nWrote " + detailPath); + System.err.println("Wrote " + summaryPath); + System.err.println("Wrote " + pivotPath); + System.err.println("Done."); + } + + // ----------------------------------------------------------------------- + // Per-cell scoring (one labeled × wrong × length cell) + // ----------------------------------------------------------------------- + + private static Row scorePairs(JunkDetector detector, + String script, + String labeledName, String wrongName, + int length, + List cleanTexts, + List mojiTexts, + List cleanZsPre, + float threshold) { + // cleanZsPre is the already-scored clean 
text (avoid re-scoring per wrong cs). + // We re-score only the mojibake side here. + int n = Math.min(cleanTexts.size(), mojiTexts.size()); + List cleanZs = new ArrayList<>(n); + List mojiZs = new ArrayList<>(n); + List margins = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + float cz = cleanZsPre.get(i); + TextQualityScore ms = detector.score(mojiTexts.get(i)); + if (ms.isUnknown()) { + continue; + } + float mz = ms.getZScore(); + cleanZs.add(cz); + mojiZs.add(mz); + margins.add(cz - mz); + } + if (margins.size() < MIN_SAMPLES_PER_CELL) { + return null; + } + return new Row(labeledName, script, wrongName, length, + cleanZs, mojiZs, margins, threshold); + } + + // ----------------------------------------------------------------------- + // I/O: read the gzipped length-prefixed record format + // ----------------------------------------------------------------------- + + private static List readRecords(Path file, int maxRecords) throws IOException { + List records = new ArrayList<>(); + try (FileInputStream fis = new FileInputStream(file.toFile()); + GZIPInputStream gis = new GZIPInputStream(fis); + DataInputStream dis = new DataInputStream(gis)) { + while (records.size() < maxRecords) { + int len; + try { + len = dis.readUnsignedShort(); + } catch (EOFException eof) { + break; + } + byte[] rec = new byte[len]; + dis.readFully(rec); + records.add(rec); + } + } + return records; + } + + private static List sliceToLength(List records, int len) { + List slices = new ArrayList<>(); + for (byte[] r : records) { + if (r.length >= len) { + slices.add(Arrays.copyOf(r, len)); + } + } + return slices; + } + + private static List decodeAll(List slices, Charset cs) { + List texts = new ArrayList<>(slices.size()); + for (byte[] s : slices) { + texts.add(decode(s, cs)); + } + return texts; + } + + private static String decode(byte[] bytes, Charset cs) { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + 
.onUnmappableCharacter(CodingErrorAction.REPLACE); + try { + return dec.decode(ByteBuffer.wrap(bytes)).toString(); + } catch (CharacterCodingException e) { + return new String(bytes, cs); // fallback; shouldn't happen with REPLACE + } + } + + private static List scoreAll(JunkDetector detector, List texts) { + List zs = new ArrayList<>(texts.size()); + for (String t : texts) { + TextQualityScore s = detector.score(t); + if (!s.isUnknown()) { + zs.add(s.getZScore()); + } else { + zs.add(Float.NaN); + } + } + return zs; + } + + // ----------------------------------------------------------------------- + // Aggregation: summary.tsv (macro across wrong charsets, per script×length) + // ----------------------------------------------------------------------- + + private static void writeSummary(Path summaryPath, List rows, + int[] lengths) throws IOException { + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(summaryPath, StandardCharsets.UTF_8))) { + out.println("script\tlength\tn_cells" + + "\tmacro_cohens_d\tmacro_mean_margin\tmacro_p5_margin" + + "\tmacro_fpr\tmacro_tpr"); + + // Group by (script, length) + Map>> bucketed = new HashMap<>(); + for (Row r : rows) { + bucketed + .computeIfAbsent(r.script, k -> new HashMap<>()) + .computeIfAbsent(r.length, k -> new ArrayList<>()) + .add(r); + } + + List scripts = new ArrayList<>(bucketed.keySet()); + Collections.sort(scripts); + for (String script : scripts) { + for (int len : lengths) { + List cell = bucketed.get(script).get(len); + if (cell == null || cell.isEmpty()) { + continue; + } + double macroD = cell.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .mapToDouble(r -> r.cohensD) + .average().orElse(Double.NaN); + double macroMargin = cell.stream() + .mapToDouble(r -> r.meanMargin) + .average().orElse(Double.NaN); + double macroP5 = cell.stream() + .mapToDouble(r -> r.p5Margin) + .average().orElse(Double.NaN); + double macroFpr = cell.stream() + .mapToDouble(r -> r.fpr) + 
.average().orElse(Double.NaN); + double macroTpr = cell.stream() + .mapToDouble(r -> r.tpr) + .average().orElse(Double.NaN); + out.printf("%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f%n", + script, len, cell.size(), + macroD, macroMargin, macroP5, macroFpr, macroTpr); + } + } + } + } + + // ----------------------------------------------------------------------- + // Aggregation: script_pivot.tsv (single line per script — quick triage) + // ----------------------------------------------------------------------- + + private static void writeScriptPivot(Path path, List rows) throws IOException { + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(path, StandardCharsets.UTF_8))) { + out.println("script\tn_cells" + + "\tmean_d\tmean_margin\tmean_p5_margin" + + "\tmin_d_cell\tmin_margin_cell"); + + Map> byScript = new HashMap<>(); + for (Row r : rows) { + byScript.computeIfAbsent(r.script, k -> new ArrayList<>()).add(r); + } + List scripts = new ArrayList<>(byScript.keySet()); + Collections.sort(scripts); + for (String script : scripts) { + List cells = byScript.get(script); + double meanD = cells.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .mapToDouble(r -> r.cohensD) + .average().orElse(Double.NaN); + double meanMargin = cells.stream() + .mapToDouble(r -> r.meanMargin) + .average().orElse(Double.NaN); + double meanP5 = cells.stream() + .mapToDouble(r -> r.p5Margin) + .average().orElse(Double.NaN); + Row minDCell = cells.stream() + .filter(r -> !Double.isNaN(r.cohensD)) + .min((a, b) -> Double.compare(a.cohensD, b.cohensD)) + .orElse(null); + Row minMarginCell = cells.stream() + .min((a, b) -> Double.compare(a.meanMargin, b.meanMargin)) + .orElse(null); + out.printf("%s\t%d\t%.3f\t%.3f\t%.3f\t%s\t%s%n", + script, cells.size(), + meanD, meanMargin, meanP5, + minDCell != null ? cellLabel(minDCell) : "-", + minMarginCell != null ? 
cellLabel(minMarginCell) : "-"); + } + } + } + + private static String cellLabel(Row r) { + return String.format("[%s→%s@%d]", r.labeledCs, r.wrongCs, r.length); + } + + // ----------------------------------------------------------------------- + // Charset utilities + // ----------------------------------------------------------------------- + + private static String filenameToCharsetName(Path file) { + String name = file.getFileName().toString(); + if (name.endsWith(".bin.gz")) { + name = name.substring(0, name.length() - ".bin.gz".length()); + } + return name; + } + + private static Charset tryGetCharset(String name) { + try { + return Charset.forName(name); + } catch (UnsupportedCharsetException | IllegalCharsetNameException e) { + return null; + } + } + + private static boolean equalCharset(Charset a, Charset b) { + return a.name().equalsIgnoreCase(b.name()) + || a.aliases().contains(b.name()) + || b.aliases().contains(a.name()); + } + + // ----------------------------------------------------------------------- + // Script detection (parallels JunkDetector.detectDominantScript, which is + // package-private; small enough to inline) + // ----------------------------------------------------------------------- + + private static final Map SCRIPT_FALLBACK = Map.of( + "HIRAGANA", "HAN", + "KATAKANA", "HAN" + ); + + private static String detectDominantScript(String text) { + if (text == null || text.isEmpty()) { + return "LATIN"; + } + Map counts = new HashMap<>(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + counts.merge(s, 1, Integer::sum); + } + i += Character.charCount(cp); + } + if (counts.isEmpty()) { + return "LATIN"; + } + String name = counts.entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(e -> e.getKey().name()) + 
.orElse("LATIN"); + return SCRIPT_FALLBACK.getOrDefault(name, name); + } + + // ----------------------------------------------------------------------- + // Row + // ----------------------------------------------------------------------- + + private static final class Row { + final String labeledCs; + final String script; + final String wrongCs; + final int length; + final int n; + final double meanCleanZ; + final double meanMojiZ; + final double cohensD; + final double meanMargin; + final double p5Margin; + final double p50Margin; + final double fpr; + final double tpr; + + Row(String labeledCs, String script, String wrongCs, int length, + List cleanZs, List mojiZs, List margins, + float threshold) { + this.labeledCs = labeledCs; + this.script = script; + this.wrongCs = wrongCs; + this.length = length; + this.n = margins.size(); + this.meanCleanZ = mean(cleanZs); + this.meanMojiZ = mean(mojiZs); + this.cohensD = computeCohensD(cleanZs, mojiZs); + this.meanMargin = mean(margins); + this.p5Margin = percentile(margins, 0.05); + this.p50Margin = percentile(margins, 0.50); + this.fpr = fractionBelow(cleanZs, threshold); + this.tpr = fractionBelow(mojiZs, threshold); + } + + String toTsv() { + return String.format( + "%s\t%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f", + labeledCs, script, wrongCs, length, n, + meanCleanZ, meanMojiZ, cohensD, + meanMargin, p5Margin, p50Margin, + fpr, tpr); + } + } + + // ----------------------------------------------------------------------- + // Statistics + // ----------------------------------------------------------------------- + + private static double computeCohensD(List a, List b) { + if (a.size() < 2 || b.size() < 2) { + return Double.NaN; + } + double ma = mean(a); + double mb = mean(b); + double va = variance(a, ma); + double vb = variance(b, mb); + double pooled = Math.sqrt((va + vb) / 2.0); + if (pooled < 1e-9) { + return Double.NaN; + } + return (ma - mb) / pooled; + } + + private static double mean(List 
xs) { + double s = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + s += f; + n++; + } + } + return n == 0 ? Double.NaN : s / n; + } + + private static double variance(List xs, double m) { + if (xs.size() < 2) { + return 0; + } + double s = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + double d = f - m; + s += d * d; + n++; + } + } + return n < 2 ? 0 : s / (n - 1); + } + + private static double percentile(List xs, double p) { + List sorted = new ArrayList<>(xs); + sorted.removeIf(f -> Float.isNaN(f)); + if (sorted.isEmpty()) { + return Double.NaN; + } + Collections.sort(sorted); + int idx = (int) Math.floor(p * (sorted.size() - 1)); + return sorted.get(idx); + } + + private static double fractionBelow(List xs, float threshold) { + int below = 0; + int n = 0; + for (float f : xs) { + if (!Float.isNaN(f)) { + if (f < threshold) { + below++; + } + n++; + } + } + return n == 0 ? Double.NaN : (double) below / n; + } + + // ----------------------------------------------------------------------- + + private static void printUsage() { + System.err.println("Usage:"); + System.err.println(" EvalJunkOnCharsetDevtest"); + System.err.println(" [--devtest-dir ] (default ~/data/charsets/devtest)"); + System.err.println(" [--output-dir ] (default /tmp/junkdetect-eval)"); + System.err.println(" [--model ] (default classpath junkdetect.bin)"); + System.err.println(" [--max-records N] (default 2000)"); + System.err.println(" [--threshold F] (default -2.0)"); + System.err.println(" [--lengths 20,50,...]"); + System.err.println(" [--wrong-charsets a,b,...]"); + System.err.println(" [--only labeledCs,...] 
(filter for spot runs)"); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java new file mode 100644 index 00000000000..aa3761ef79f --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** + * Frozen set of training-time choices that together define a junk-detector + * model's identity. Any change to these values produces a meaningfully + * different model and must be reviewed in git. + * + *

Two principles drove making this a class rather than CLI flags: + * + *

+ * <ol>
+ *   <li><b>Reproducibility.</b> When we look back at a model file six
+ *       months later we want a single commit hash that says exactly what
+ *       knobs produced it, not a half-remembered shell history.</li>
+ *   <li><b>Drift prevention.</b> CLI flags with defaults allow accidental
+ *       deviation between developers ("did you remember to pass
+ *       {@code --min-target-script-frac 0.05}?"). Constants in a tracked
+ *       file remove that failure mode.</li>
+ * </ol>
+ * + *

{@link BuildJunkTrainingData} and {@link TrainJunkModel} read the + * values here; both tools refuse to start if any CLI argument + * attempts to override a config-controlled parameter, surfacing the + * mistake at launch time rather than silently producing a non-canonical + * model. + * + *

The constants below reflect the choices that produced the current + * shipping model and are recorded in the corresponding training notes + * ({@code 20260514-junk-retrain-v6.md}). Update them by editing this + * file and committing the change together with the new model output. + * + *

The class has no instance state; all values are exposed as + * {@code public static final}. This keeps callsites short and avoids + * the temptation of passing a runtime-mutable config around. + * + *

This is not part of the public model-loading API. The {@link + * org.apache.tika.ml.junkdetect.JunkDetector} runtime is configuration- + * free; once a model file is built, all of its baked-in choices travel + * with the file's binary format. + */ +public final class JunkDetectorTrainingConfig { + + // ======================================================================= + // Corpus build (BuildJunkTrainingData) + // ======================================================================= + + /** + * Total UTF-8 byte budget across all script groups. Divided + * proportionally by per-script bigram entropy after the sampling phase. + */ + public static final long TOTAL_BUDGET_BYTES = 500_000_000L; + + /** + * Maximum UTF-8 bytes a single language may contribute to a + * multi-language script bucket. Prevents one large source (e.g. {@code + * zho} with 8 GB of MADLAD) from dominating a multi-language script + * model. Buckets with only one language ignore this cap and may consume + * their full budget. See {@link BuildJunkTrainingData} Phase 4. + */ + public static final long PER_LANGUAGE_CAP_BYTES = 5_000_000L; + + /** + * Sentence-level filter: minimum fraction of non-COMMON/INHERITED + * codepoints that must belong to the script bucket's target script for a + * sentence to be accepted. Set low so legitimate mixed-script content + * (Japanese kanji + kana, Korean with hanja annotations, Chinese with + * English citations, etc.) is preserved, but enough to reject lines that + * are essentially off-target (e.g. an English article about Gothic in + * the GOTHIC bucket). + */ + public static final double MIN_TARGET_SCRIPT_FRAC = 0.05; + + /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ + public static final int MIN_BYTES_PER_SENTENCE = 50; + + /** Maximum fraction of codepoints that may be ASCII punctuation/digits. 
*/ + public static final double MAX_PUNC_FRAC = 0.30; + + /** + * Minimum number of sentences that must land in the dev split for a + * script to be included in the model. Scripts below this floor have + * insufficient data to reliably estimate calibration statistics, which + * inflates FPR. With {@code DEV_FRAC = 0.10} this corresponds to a + * total-sentence floor of {@code 500 / 0.10 = 5000} per script. + */ + public static final int MIN_DEV_SENTENCES = 500; + + /** Lines read per language to determine the language's dominant script. */ + public static final int SCRIPT_SAMPLE_LINES = 2_000; + + /** + * UTF-8 bytes loaded per script group for bigram entropy estimation, + * driving the entropy-proportional budget allocation. 200 KB is + * sufficient to characterise the bigram distribution of any single + * script. + */ + public static final long ENTROPY_SAMPLE_BYTES = 200_000L; + + /** Random seed for sentence shuffling and other corpus-build randomness. */ + public static final int SEED = 42; + + /** + * Script bucket names whose source data is too thin or too off-target + * to produce reliable per-script F1 calibration. Excluded from the + * model entirely; the {@link + * org.apache.tika.ml.junkdetect.JunkDetector#score(String)} routing + * falls back to "unknown script" behavior for these scripts. + * + *

The current selection is based on a corpus audit that found these + * scripts either had thin native source data (e.g. THAANA: 216 train + * sentences from Maldivian), or had sources dominated by off-target + * content (e.g. GOTHIC: 40% of lines are {@literal <}5% Gothic — the + * Wikipedia "gothic" directory is English text about Gothic). + * + *

Three further scripts (CANADIAN_ABORIGINAL, CHEROKEE, TIFINAGH) + * are not listed here because the {@link #MIN_TARGET_SCRIPT_FRAC} + * filter implicitly removes them — their MADLAD sources contain + * effectively no native-script content at the 5% threshold. Listing + * them here is unnecessary and would obscure the data-quality finding. + */ + public static final Set DROP_SCRIPTS = + Collections.unmodifiableSet(new java.util.TreeSet<>(Set.of("GOTHIC", "THAANA"))); + + /** + * Per-script byte-budget overrides applied on top of the entropy- + * proportional allocation. Empty in the current configuration. + * + *

Under v6 the {@code HAN=60MB} experiment worsened every + * non-HAN script (the global F1 hash table was the bottleneck). Under + * v7's per-script tables, the same experiment correctly leaves other + * scripts untouched, but the HAN gain itself was negligible (Cohen's d + * moved 7.26 → 7.35) — the per-script HAN model is already near its + * data-saturation point with ~18 MB of training data. Override left + * empty until a more decisive HAN-coverage experiment is designed. + */ + public static final Map SCRIPT_BUDGET_OVERRIDES = + Collections.emptyMap(); + + // ======================================================================= + // Model train (TrainJunkModel) + // ======================================================================= + + /** + * Drop per-script F1 bigrams whose per-pair occurrence count (within + * that script's training data) is below this threshold. Set to 3 on + * evidence that singleton and doubleton pairs are overwhelmingly OCR + * artifacts and proper-noun noise that inflate the clean-side score + * distribution tail without contributing signal. + * + *

Set to 1 to disable the filter (every observed pair retained). + */ + public static final int MIN_BIGRAM_COUNT = 3; + + /** + * Target load factor for the per-script open-addressing F1 hash + * table. Table capacity is sized as the smallest power of two + * larger than {@code keptPairs / loadFactor}, giving an average of + * 1 / (1 - loadFactor) probes per lookup. 0.5 → ~2 probes; modestly + * wasteful in space but very cheap to probe. + */ + public static final double OA_LOAD_FACTOR = 0.5; + + /** + * Bit width of each codepoint's dense index within a script's F1 + * table. Each bigram is packed as {@code (idxA << KEY_INDEX_BITS) | + * idxB}, so each side must fit in this many bits. 16 bits supports + * up to 65535 distinct codepoints per script, which is comfortably + * above the largest per-script count we have measured (HAN is the + * worst case at ~15K kept codepoints). + */ + public static final int KEY_INDEX_BITS = 16; + + private JunkDetectorTrainingConfig() { + // No instances. + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java new file mode 100644 index 00000000000..bcda57c9f7c --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.zip.GZIPInputStream; + +/** + * For each {@code *.train.gz} file in a directory, compute per-line statistics + * of "target-script fraction" — i.e. the fraction of codepoints in each line + * that belong to the script the file is supposed to represent. + * + *

Reports a histogram across the buckets + * [0, 5, 10, 20, 30, 50, 70, 90, 100]% so we can pick a per-script keep + * threshold (e.g. "drop lines with <20% HAN codepoints"). Also reports + * what fraction of total bytes / lines would be dropped at each threshold. + * + *

Each {@code {script}.train.gz} maps to a {@link Character.UnicodeScript}; + * the file basename is uppercased. Special-case handling routes a few + * project-internal script names (e.g. HAN includes HALF_FULL ideographic + * forms) when desired. + * + *

Usage: + *

+ *   java LineScriptFractions <dataDir> [thresholds]
+ * 
+ */ +public final class LineScriptFractions { + + private static final int[] BUCKETS = {0, 5, 10, 20, 30, 50, 70, 90, 100}; + + private LineScriptFractions() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println("Usage: LineScriptFractions "); + System.exit(1); + } + Path dataDir = Paths.get(args[0]); + Path[] files; + try (var s = Files.list(dataDir)) { + files = s.filter(p -> p.getFileName().toString().endsWith(".train.gz")) + .sorted().toArray(Path[]::new); + } + if (files.length == 0) { + System.err.println("No *.train.gz files in " + dataDir); + System.exit(1); + } + + System.out.printf("%-20s %10s %10s | %s%n", + "script", "lines", "<5%", + "lines at target-frac threshold (cumulative dropped %)"); + System.out.println(" " + + " <10% <20% <30% <50% <70% <90% <100%"); + System.out.println(repeat('-', 110)); + + for (Path file : files) { + String fname = file.getFileName().toString(); + String name = fname.substring(0, fname.length() - ".train.gz".length()) + .toUpperCase(); + Character.UnicodeScript target = mapScript(name); + if (target == null) { + System.out.printf("%-20s (no UnicodeScript mapping for '%s')%n", name, name); + continue; + } + + long lines = 0; + long[] bucketCounts = new long[BUCKETS.length]; + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(file)), + StandardCharsets.UTF_8))) { + String line; + while ((line = r.readLine()) != null) { + lines++; + int total = 0; + int matching = 0; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + // Don't count toward denominator: punctuation, + // spaces, diacritics are script-neutral. 
+ continue; + } + total++; + if (s == target) matching++; + } + double pct = total == 0 ? 0.0 : 100.0 * matching / total; + int b = 0; + while (b < BUCKETS.length - 1 && pct >= BUCKETS[b + 1]) b++; + bucketCounts[b]++; + } + } + + // Convert bucket counts to "cumulative fraction dropped at threshold = BUCKETS[i]". + StringBuilder sb = new StringBuilder(); + long cum = 0; + // bucketCounts[i] holds lines with pct in [BUCKETS[i], BUCKETS[i+1]). + // Drop-if-pctGoal: prove the codepoint-bigram-hash approach opens the + * UTF-8→GB18030 mojibake margin meaningfully above v5's ~1 z-unit + * baseline BEFORE committing to a multi-day production retrain. + * + *

Training corpus: decode {@code ~/data/charsets/devtest/GB18030.bin.gz} + * (Chinese) + first 80% of {@code UTF-8.bin.gz} (multi-language Wikipedia) + * under their labeled charsets, iterate codepoints, count bigrams and unigrams, + * hash into N buckets, build Bloom filter of seen pairs. Held-out: last 20% + * of UTF-8 records. + * + *

Eval: for each held-out UTF-8 record, slice to length buckets + * {20, 50, 100, 200, 500, 1000} source bytes. Decode each slice under + * UTF-8 (clean) and GB18030 (mojibake-as-HAN). Score both with the + * prototype model. Margin = clean_score - mojibake_score. Report + * mean and 5th-percentile margin per length. + * + *

Sweep: {bigramBuckets, alpha} grid. Pick the configuration that + * maximises margin. Compare to v5 baseline (mean margin ~1 z-unit + * across all lengths in the same cohort). + * + *

Outputs: + *

    + *
 *   <li>{@code prototype-sweep.tsv}: one row per
 *       (bigram_buckets, alpha, length). Columns: n, mean_clean,
 *       mean_moji, mean_margin, std_margin, p5_margin, p50_margin,
 *       margin_in_clean_stds (effective z-units).</li>
+ * + *

Usage: + *

+ *   ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
+ *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.PrototypeCodepointHash \
+ *     -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v6-prototype"
+ * 
+ */ +public class PrototypeCodepointHash { + + // --- Hyperparameter sweep grid --- + private static final int[] BIGRAM_BUCKETS = {4096, 8192, 16384, 32768}; + private static final double[] ALPHAS = {1.0, 0.4}; + private static final int UNIGRAM_BUCKETS = 8192; + private static final int BLOOM_BITS = 4 * 1024 * 1024; // 512 KB + private static final int BLOOM_K = 7; + + // --- Smoothing --- + private static final double ADD_ALPHA = 0.01; + + // --- Eval --- + private static final int[] LENGTHS = {20, 50, 100, 200, 500, 1000}; + private static final int MAX_RECORDS_PER_FILE = 5000; + private static final double HOLDOUT_FRACTION = 0.20; + private static final int MIN_SCORE_CODEPOINTS = 3; + + public static void main(String[] args) throws IOException { + Path devtestDir = Paths.get(System.getProperty("user.home"), + "data", "charsets", "devtest"); + Path outputDir = Paths.get("/tmp/v6-prototype"); + int maxRecords = MAX_RECORDS_PER_FILE; + List fixturesDirs = new ArrayList<>(); + String wrongCharsetName = "GB18030"; + boolean singleModel = false; + List candidates = List.of( + "UTF-8", "GB18030", "windows-1252", "windows-1251", "windows-1257", + "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE"); + List forceCandidates = null; // when set, skip base detectors + String expected = "UTF-8"; + int[] probeSizes = null; // when set, sweep these probe sizes per fixture + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--devtest-dir": + devtestDir = Paths.get(args[++i]); + break; + case "--output-dir": + outputDir = Paths.get(args[++i]); + break; + case "--max-records": + maxRecords = Integer.parseInt(args[++i]); + break; + case "--fixtures-dir": + fixturesDirs.add(Paths.get(args[++i])); + break; + case "--wrong-charset": + wrongCharsetName = args[++i]; + break; + case "--single-model": + // Skip prototype training; run N-way fixture eval on bundled JunkDetector only. 
+ singleModel = true; + break; + case "--candidates": + candidates = Arrays.asList(args[++i].split(",")); + break; + case "--force-candidates": + // Bypass base detectors; pairwise tournament directly on these. + forceCandidates = Arrays.asList(args[++i].split(",")); + break; + case "--expected": + expected = args[++i]; + break; + case "--probe-sizes": + // Comma-separated probe sizes (bytes). Each fixture + // gets one row per size, so you can see how length + // affects UNKNOWN vs scored. + String[] sizes = args[++i].split(","); + probeSizes = new int[sizes.length]; + for (int k = 0; k < sizes.length; k++) { + probeSizes[k] = Integer.parseInt(sizes[k].trim()); + } + break; + default: + System.err.println("Unknown arg: " + args[i]); + System.exit(1); + } + } + Files.createDirectories(outputDir); + + // --single-model bypasses the v5/v6-prototype comparison apparatus. + // Requires --force-candidates to specify the charsets to compare; + // the base-detector-driven path was removed to keep tika-ml-junkdetect + // free of heavy encoding-detector deps. + if (singleModel) { + if (fixturesDirs.isEmpty()) { + System.err.println("--single-model requires --fixtures-dir"); + System.exit(1); + } + if (forceCandidates == null || forceCandidates.isEmpty()) { + System.err.println("--single-model requires --force-candidates " + + "(e.g. 
--force-candidates UTF-8,GB18030)"); + System.exit(1); + } + evalFixturesSingleModel(fixturesDirs, forceCandidates, expected, + probeSizes, outputDir); + return; + } + + System.err.println("=== PrototypeCodepointHash ==="); + System.err.println(" devtest-dir: " + devtestDir); + System.err.println(" output-dir: " + outputDir); + System.err.println(" max-records: " + maxRecords); + System.err.println(" bigram_buckets sweep: " + Arrays.toString(BIGRAM_BUCKETS)); + System.err.println(" alpha sweep: " + Arrays.toString(ALPHAS)); + System.err.println(" unigram_buckets: " + UNIGRAM_BUCKETS); + System.err.println(" bloom_bits: " + BLOOM_BITS + + " (" + (BLOOM_BITS / 8 / 1024) + " KB, k=" + BLOOM_K + ")"); + + // -------- Load corpus -------- + + Charset utf8 = StandardCharsets.UTF_8; + Charset gb18030 = Charset.forName("GB18030"); + + System.err.println("\n--- Loading corpus ---"); + List utf8Records = readRecords( + devtestDir.resolve("UTF-8.bin.gz"), maxRecords); + List gbRecords = readRecords( + devtestDir.resolve("GB18030.bin.gz"), maxRecords); + System.err.printf(" UTF-8.bin.gz: %d records%n", utf8Records.size()); + System.err.printf(" GB18030.bin.gz: %d records%n", gbRecords.size()); + + // Train/eval split on UTF-8 records. GB18030 records all go to training. 
+ int holdoutCount = (int) (utf8Records.size() * HOLDOUT_FRACTION); + int utf8TrainSize = utf8Records.size() - holdoutCount; + List utf8TrainBytes = utf8Records.subList(0, utf8TrainSize); + List utf8EvalBytes = utf8Records.subList(utf8TrainSize, utf8Records.size()); + System.err.printf(" UTF-8 train: %d eval: %d%n", + utf8TrainBytes.size(), utf8EvalBytes.size()); + + // Decode training corpus to codepoint streams + System.err.println("\n--- Decoding training corpus ---"); + List trainStreams = new ArrayList<>(); + long totalTrainCp = 0; + for (byte[] r : utf8TrainBytes) { + int[] cps = toCodepoints(decode(r, utf8)); + if (cps.length >= 2) trainStreams.add(cps); + totalTrainCp += cps.length; + } + for (byte[] r : gbRecords) { + int[] cps = toCodepoints(decode(r, gb18030)); + if (cps.length >= 2) trainStreams.add(cps); + totalTrainCp += cps.length; + } + System.err.printf(" total training codepoints: %,d across %d records%n", + totalTrainCp, trainStreams.size()); + + // Count unique pairs (for Bloom sizing sanity) + Set uniquePairs = new HashSet<>(); + for (int[] cps : trainStreams) { + for (int i = 0; i + 1 < cps.length; i++) { + uniquePairs.add(packPair(cps[i], cps[i + 1])); + if (uniquePairs.size() >= 2_000_000) break; + } + if (uniquePairs.size() >= 2_000_000) break; + } + System.err.printf(" unique codepoint-pairs in training: ~%,d%n", + uniquePairs.size()); + + // -------- Hyperparameter sweep -------- + + Path sweepPath = outputDir.resolve("prototype-sweep.tsv"); + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(sweepPath, StandardCharsets.UTF_8))) { + out.println("bigram_buckets\talpha\tlength\tn" + + "\tmean_clean\tstd_clean\tmean_moji" + + "\tmean_margin\tstd_margin\tp5_margin\tp50_margin" + + "\tmargin_in_clean_stds\tbloom_seen_frac_clean\tbloom_seen_frac_moji"); + + for (int buckets : BIGRAM_BUCKETS) { + for (double alpha : ALPHAS) { + System.err.printf("%n--- Config: bigram_buckets=%d alpha=%.1f ---%n", + buckets, alpha); + + Model m = 
train(trainStreams, buckets, UNIGRAM_BUCKETS, + BLOOM_BITS, BLOOM_K, ADD_ALPHA, alpha); + + // Calibrate on a sample of training streams (for the + // "margin_in_clean_stds" effective-z normalization) + double[] muSigma = calibrate(m, trainStreams); + System.err.printf(" train mu=%.3f sigma=%.3f%n", muSigma[0], muSigma[1]); + + // Eval on held-out UTF-8 records + for (int len : LENGTHS) { + EvalCell cell = evalAtLength(m, utf8EvalBytes, len, utf8, gb18030); + if (cell == null) continue; + double effZ = cell.meanMargin / Math.max(muSigma[1], 1e-6); + out.printf("%d\t%.2f\t%d\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.3f\t%.3f\t%.3f%n", + buckets, alpha, len, cell.n, + cell.meanClean, cell.stdClean, cell.meanMoji, + cell.meanMargin, cell.stdMargin, + cell.p5Margin, cell.p50Margin, + effZ, cell.bloomSeenFracClean, cell.bloomSeenFracMoji); + System.err.printf(" len=%4d n=%-5d mean_margin=%6.3f p5=%6.3f" + + " eff_z=%5.2f bloom_clean=%.2f bloom_moji=%.2f%n", + len, cell.n, cell.meanMargin, cell.p5Margin, effZ, + cell.bloomSeenFracClean, cell.bloomSeenFracMoji); + out.flush(); + } + } + } + } + System.err.println("\nWrote " + sweepPath); + + // -------- Fixture eval (AIT5-class HTML files) -------- + + if (!fixturesDirs.isEmpty()) { + evalFixtures(trainStreams, fixturesDirs, wrongCharsetName, outputDir); + } + + System.err.println("Done."); + } + + // ----------------------------------------------------------------------- + // Real-life fixture eval: runs the production base detectors (BOM + + // HtmlEncodingDetector + UniversalEncodingDetector) and asks the + // JunkDetector to pick among their candidates via pairwise compare. + // Mirrors the production charset-detection arbitration. 
+ // ----------------------------------------------------------------------- + + private static void evalFixturesSingleModel(List fixturesDirs, + List forceCandidates, + String expected, + int[] probeSizes, + Path outputDir) throws IOException { + System.err.println("\n--- Forced-candidates fixture eval ---"); + System.err.println(" candidates: " + forceCandidates); + JunkDetector detector = JunkDetector.loadFromClasspath(); + System.err.println(" model version: " + detector.getModelVersion()); + System.err.println(" expected: " + expected); + + List forced = new ArrayList<>(); + for (String n : forceCandidates) { + try { + forced.add(Charset.forName(n)); + } catch (Exception e) { + System.err.println(" skip unsupported charset: " + n); + } + } + + Path out = outputDir.resolve("fixtures-real-life.tsv"); + try (PrintWriter pw = new PrintWriter( + Files.newBufferedWriter(out, StandardCharsets.UTF_8))) { + pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs" + + "\tcandidates\twinner\tmargin\tstatus\tnotes"); + int pass = 0, fail = 0, skip = 0, agree = 0; + double passMarginSum = 0.0; + List failingLines = new ArrayList<>(); + + for (Path dir : fixturesDirs) { + if (!Files.isDirectory(dir)) { + System.err.println(" WARN: not a directory: " + dir); + continue; + } + try (Stream stream = Files.walk(dir)) { + List files = new ArrayList<>(); + stream.filter(Files::isRegularFile).forEach(files::add); + Collections.sort(files); + int[] sizes = probeSizes != null ? 
probeSizes : new int[]{16_384}; + for (Path f : files) { + for (int sz : sizes) { + FixtureResult r = + evalOneForced(f, expected, detector, forced, sz); + pw.println(r.toTsvLine()); + switch (r.status) { + case "PASS": + pass++; + passMarginSum += r.margin; + break; + case "FAIL": + fail++; + failingLines.add(r.dir + "/" + r.shortName + + "@" + sz + " -> " + r.winner + + " (expected " + r.expected + ")"); + break; + case "AGREE": + agree++; + break; + default: + skip++; + } + } + } + } + } + int n = pass + fail; + System.err.println(); + System.err.println("=== Summary ==="); + System.err.printf("Pass: %d / %d (%.1f%%) — JunkDetector picked the expected charset%n", + pass, n, n == 0 ? 0.0 : 100.0 * pass / n); + System.err.printf("Fail: %d%n", fail); + System.err.printf("Agree: %d (all detectors agreed; no arbitration needed)%n", agree); + System.err.printf("Skip: %d%n", skip); + if (pass > 0) { + System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass); + } + if (!failingLines.isEmpty()) { + System.err.println("Failing:"); + Collections.sort(failingLines); + for (String line : failingLines) { + System.err.println(" " + line); + } + } + } + System.err.println("Wrote " + out); + } + + private static FixtureResult evalOneForced(Path file, String expected, + JunkDetector detector, + List forced, + int probeBytes) throws IOException { + byte[] raw = Files.readAllBytes(file); + FixtureResult r = new FixtureResult(); + r.dir = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname; + r.bytes = raw.length; + r.probeSize = probeBytes; + r.expected = expected; + + if (isBinaryMagic(raw)) { + r.status = "SKIP_BIN"; + return r; + } + // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes + // from the stripped content. Otherwise a small probe slice can land + // entirely inside // boilerplate and leave + // nothing to score after strip. 
+ byte[] strippedFull = stripHtmlBytes(raw); + byte[] forDecode = strippedFull.length > probeBytes + ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull; + r.candidatesStr = forced.stream().map(Charset::name) + .reduce((a, b) -> a + "," + b).orElse("-"); + + // Always log every candidate in notes — even those JunkDetector + // rejects as unknown — so the failure mode is visible. An + // "unknown" score itself is meaningful information when the other + // candidate scored fine. + String winner = null; + String runner = null; + float winnerZ = Float.NEGATIVE_INFINITY; + float runnerZ = Float.NEGATIVE_INFINITY; + StringBuilder notes = new StringBuilder(); + int decoded_scored = 0; + for (Charset cs : forced) { + String decoded = applyEntityVariant(new String(forDecode, cs), "expanded"); + int cps = toCodepoints(decoded).length; + if (cps < 3) { + notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") "); + continue; + } + TextQualityScore s = detector.score(decoded); + if (s.isUnknown()) { + // Diagnose: is this script-not-in-model (neutral case) or + // all-runs-fragmented-too-short (a real mojibake signal)? + String why = diagnoseUnknown(decoded, detector); + notes.append(cs.name()).append("=UNK[").append(why).append("] "); + continue; + } + float z = s.getZScore(); + notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" "); + decoded_scored++; + if (z > winnerZ) { + runner = winner; + runnerZ = winnerZ; + winner = cs.name(); + winnerZ = z; + } else if (z > runnerZ) { + runner = cs.name(); + runnerZ = z; + } + } + if (winner == null) { + r.status = "NO_DECODE"; + r.notes = notes.toString().trim(); + return r; + } + r.winner = winner; + if (decoded_scored < 2) { + // Only one candidate scored; no real arbitration happened. + r.margin = Float.NaN; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) + ? 
"ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED"; + } else { + r.margin = winnerZ - runnerZ; + r.status = safeCanonical(winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL"; + } + r.notes = notes.toString().trim(); + return r; + } + + /** + * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks + * the same script-run logic, then classifies the failure mode: + *
    + *
 *   <li>{@code EMPTY} — input had no characters.</li>
 *   <li>{@code NO_MODELED_SCRIPT} — all runs are in scripts the model
 *       doesn't know (a legitimate reason to be neutral).</li>
 *   <li>{@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts
 *       but every one is &lt;2 UTF-8 bytes. Strong mojibake signal —
 *       text is a salad of single codepoints from many scripts.</li>
 *   <li>{@code MIXED} — some runs were modeled-but-too-short and
 *       some were unmodeled.</li>
+ */ + private static String diagnoseUnknown(String text, JunkDetector detector) { + if (text == null || text.isEmpty()) { + return "EMPTY"; + } + Set modeled = detector.knownScripts(); + // Walk codepoints, splitting on script boundaries — same as + // JunkDetector.buildScriptRuns conceptually. Track per-script: + // longest UTF-8-byte run length, plus a separate "unmodeled" tally. + java.util.Map longestModeled = new java.util.HashMap<>(); + int unmodeledRuns = 0; + int modeledTooShortRuns = 0; + int currentBytes = 0; + String currentScript = null; + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + int charCount = Character.charCount(cp); + String script = Character.UnicodeScript.of(cp).name(); + // COMMON / INHERITED / UNKNOWN attach to preceding run, but for + // diagnosis we don't need to be that precise — treat them as a + // continuation. + if ("COMMON".equals(script) || "INHERITED".equals(script) + || "UNKNOWN".equals(script)) { + if (currentScript != null) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + } else if (script.equals(currentScript)) { + currentBytes += new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } else { + // close out previous run + tallyRun(currentScript, currentBytes, modeled, longestModeled); + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } + } + currentScript = script; + currentBytes = new String(new int[]{cp}, 0, 1) + .getBytes(StandardCharsets.UTF_8).length; + } + i += charCount; + } + // close final run + if (currentScript != null) { + if (!modeled.contains(currentScript)) { + unmodeledRuns++; + } else if (currentBytes < 2) { + modeledTooShortRuns++; + } else { + longestModeled.merge(currentScript, currentBytes, Math::max); + } + } + boolean anyModeledLong = !longestModeled.isEmpty(); + if (anyModeledLong) { + // Some 
modeled run is ≥2 bytes — shouldn't have hit UNKNOWN. + // (Possible discrepancy with the production logic; reported as MIXED.) + return "MIXED(modeled_long=" + longestModeled.size() + ")"; + } + if (modeledTooShortRuns > 0 && unmodeledRuns > 0) { + return "MIXED(short=" + modeledTooShortRuns + + ",unmodeled=" + unmodeledRuns + ")"; + } + if (modeledTooShortRuns > 0) { + return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")"; + } + if (unmodeledRuns > 0) { + return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")"; + } + return "OTHER"; + } + + private static void tallyRun(String script, int bytes, Set modeled, + java.util.Map longestModeled) { + if (script == null) { + return; + } + if (modeled.contains(script) && bytes >= 2) { + longestModeled.merge(script, bytes, Math::max); + } + } + + /** + * Run HtmlByteStripper over the entire input; return the stripped + * content bytes (or the input verbatim if no tags found). + */ + private static byte[] stripHtmlBytes(byte[] raw) { + byte[] dst = new byte[raw.length]; + HtmlByteStripper.Result r = + HtmlByteStripper.strip(raw, 0, raw.length, dst, 0); + if (r.tagCount > 0 && r.length > 0) { + return Arrays.copyOf(dst, r.length); + } + return raw; + } + + private static boolean isBinaryMagic(byte[] b) { + if (b.length < 4) { + return false; + } + if (b[0] == 0x50 && b[1] == 0x4B + && (b[2] == 0x03 || b[2] == 0x05 || b[2] == 0x07)) { + return true; // ZIP / JAR / APK / docx + } + if ((b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) { + return true; // gzip + } + if (b[0] == '%' && b[1] == 'P' && b[2] == 'D' && b[3] == 'F') { + return true; // PDF + } + if ((b[0] & 0xFF) == 0xD0 && (b[1] & 0xFF) == 0xCF) { + return true; // OLE2 + } + return false; + } + + private static String safeCanonical(String charset) { + if (charset == null) { + return ""; + } + try { + return Charset.forName(charset).name(); + } catch (Exception e) { + return charset.toUpperCase(); + } + } + + private static final class FixtureResult { + String dir; + 
String shortName; + int bytes; + int probeSize; + String expected; + String bomCs; + String htmlCs; + String universalCs; + String candidatesStr = "-"; + String winner = "-"; + float margin = Float.NaN; + String status = ""; + String notes = ""; + + String toTsvLine() { + return String.format("%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + dir, shortName, bytes, probeSize, expected, + str(bomCs), str(htmlCs), str(universalCs), + candidatesStr, str(winner), + Float.isNaN(margin) ? "-" : String.format("%.3f", margin), + status, notes.isEmpty() ? "-" : notes); + } + + private static String str(String s) { + return s == null ? "-" : s; + } + } + + // ----------------------------------------------------------------------- + // Fixture eval: score real-world AIT5-class HTML files under v5 and v6 + // prototype, with byte-level HTML stripping and entity-variant comparison. + // ----------------------------------------------------------------------- + + private static void evalFixtures(List trainStreams, + List fixturesDirs, + String wrongCharsetName, + Path outputDir) throws IOException { + System.err.println("\n--- Fixture eval (best config: 4096 buckets, alpha=1.0) ---"); + Model v6 = train(trainStreams, 4096, UNIGRAM_BUCKETS, + BLOOM_BITS, BLOOM_K, ADD_ALPHA, 1.0); + double[] muSigma = calibrate(v6, trainStreams); + float mu = (float) muSigma[0]; + float sigma = (float) Math.max(muSigma[1], 1e-6); + System.err.printf(" v6 train mu=%.3f sigma=%.3f%n", mu, sigma); + + JunkDetector v5 = JunkDetector.loadFromClasspath(); + Charset cleanCs = StandardCharsets.UTF_8; + Charset wrongCs = Charset.forName(wrongCharsetName); + System.err.println(" v5 model version: " + v5.getModelVersion()); + System.err.println(" clean charset: " + cleanCs.name()); + System.err.println(" mojibake charset: " + wrongCs.name()); + + Path fixturesPath = outputDir.resolve("fixtures.tsv"); + try (PrintWriter out = new PrintWriter( + Files.newBufferedWriter(fixturesPath, 
StandardCharsets.UTF_8))) { + out.println("cluster\tfile\tentity_variant\tn_clean_cp\tn_moji_cp" + + "\tv5_clean_z\tv5_moji_z\tv5_margin" + + "\tv6_F1_clean\tv6_F1_moji\tv6_F1_margin" + + "\tv6_combo_clean\tv6_combo_moji\tv6_combo_margin" + + "\tdominant_script" + + "\tv5_winner\tv6_F1_winner\tv6_combo_winner"); + + for (Path dir : fixturesDirs) { + if (!Files.isDirectory(dir)) { + System.err.println(" WARN: not a directory: " + dir); + continue; + } + try (java.util.stream.Stream files = Files.walk(dir)) { + List sorted = new ArrayList<>(); + files.filter(Files::isRegularFile).forEach(sorted::add); + Collections.sort(sorted); + for (Path f : sorted) { + evalOneFixture(f, v6, mu, sigma, v5, cleanCs, wrongCs, out); + } + } + } + } + System.err.println("Wrote " + fixturesPath); + } + + private static void evalOneFixture(Path file, Model v6, float v6Mu, float v6Sigma, + JunkDetector v5, + Charset cleanCs, Charset wrongCs, + PrintWriter out) throws IOException { + byte[] rawBytes = Files.readAllBytes(file); + if (rawBytes.length > 16384) { + rawBytes = Arrays.copyOf(rawBytes, 16384); + } + // Byte-level HTML strip (matches JunkFilterEncodingDetector production pipeline) + byte[] stripDst = new byte[rawBytes.length]; + HtmlByteStripper.Result strip = + HtmlByteStripper.strip(rawBytes, 0, rawBytes.length, stripDst, 0); + byte[] forDecode = rawBytes; + if (strip.tagCount > 0 && strip.length > 0) { + forDecode = new byte[strip.length]; + System.arraycopy(stripDst, 0, forDecode, 0, strip.length); + } + + String cluster = file.getParent().getFileName().toString(); + String fname = file.getFileName().toString(); + // shorten long content-hash names for readability in output + String shortName = fname.length() > 12 ? 
fname.substring(0, 12) : fname; + + String cleanRaw = decode(forDecode, cleanCs); + String mojiRaw = decode(forDecode, wrongCs); + + for (String variant : List.of("raw", "expanded", "removed")) { + String clean = applyEntityVariant(cleanRaw, variant); + String moji = applyEntityVariant(mojiRaw, variant); + int[] cleanCps = toCodepoints(clean); + int[] mojiCps = toCodepoints(moji); + if (cleanCps.length < 3 || mojiCps.length < 3) continue; + + // --- v5 full pipeline (existing) --- + TextQualityScore v5cs = v5.score(clean); + TextQualityScore v5ms = v5.score(moji); + float v5cleanZ = v5cs.isUnknown() ? Float.NaN : v5cs.getZScore(); + float v5mojiZ = v5ms.isUnknown() ? Float.NaN : v5ms.getZScore(); + float v5Margin = v5cleanZ - v5mojiZ; + + // --- v6 Feature 1 alone (codepoint-bigram-hash + Bloom + unigram backoff) --- + ScoreResult v6c = score(v6, cleanCps); + ScoreResult v6m = score(v6, mojiCps); + double v6Margin = v6c.meanLogP - v6m.meanLogP; + + // --- v6 combined: substitute v6's F1 z-score into v5's classifier --- + JunkDetector.FeatureComponents cleanFc = v5.scoreWithFeatureComponents(clean); + JunkDetector.FeatureComponents mojiFc = v5.scoreWithFeatureComponents(moji); + float v6F1zClean = (float) (v6c.meanLogP - v6Mu) / v6Sigma; + float v6F1zMoji = (float) (v6m.meanLogP - v6Mu) / v6Sigma; + float comboClean = recombineLogit(v6F1zClean, cleanFc); + float comboMoji = recombineLogit(v6F1zMoji, mojiFc); + float comboMargin = comboClean - comboMoji; + String dominantScript = cleanFc != null ? cleanFc.dominantScript : "?"; + + String v5Winner = Float.isNaN(v5Margin) ? "?" : (v5Margin > 0 ? "CLEAN" : "MOJI"); + String v6F1Winner = Double.isNaN(v6Margin) ? "?" : (v6Margin > 0 ? "CLEAN" : "MOJI"); + String v6cWinner = Float.isNaN(comboMargin) ? "?" : (comboMargin > 0 ? 
"CLEAN" : "MOJI"); + + out.printf("%s\t%s\t%s\t%d\t%d" + + "\t%.3f\t%.3f\t%.3f" + + "\t%.4f\t%.4f\t%.4f" + + "\t%.3f\t%.3f\t%.3f" + + "\t%s\t%s\t%s\t%s%n", + cluster, shortName, variant, + cleanCps.length, mojiCps.length, + v5cleanZ, v5mojiZ, v5Margin, + v6c.meanLogP, v6m.meanLogP, v6Margin, + comboClean, comboMoji, comboMargin, + dominantScript, + v5Winner, v6F1Winner, v6cWinner); + out.flush(); + System.err.printf(" [%s/%s %-8s] v5: Δ%+6.2f %s v6F1: Δ%+6.3f %s v6combo: Δ%+6.2f %s script=%s%n", + cluster, shortName, variant, + v5Margin, v5Winner, + v6Margin, v6F1Winner, + comboMargin, v6cWinner, + dominantScript); + } + } + + /** + * Recomputes v5's per-script classifier logit with v6's F1 z-score + * substituted for v5's z1. Approximation: keeps v5's classifier weights + * (w1..w4, bias) which were trained on the OLD F1 distribution. A true + * v6 retrain would re-fit w1 on the new F1 distribution; this version + * gives a directional estimate of "what if we just swap F1?" + */ + private static float recombineLogit(float v6F1z, JunkDetector.FeatureComponents fc) { + if (fc == null || fc.classifierWeights == null) { + return Float.NaN; + } + float[] cw = fc.classifierWeights; + int nFeat = cw.length - 1; + float logit = cw[nFeat]; // bias + if (nFeat >= 1) logit += cw[0] * v6F1z; + if (nFeat >= 2) logit += cw[1] * fc.z2; + if (nFeat >= 3) logit += cw[2] * fc.z3; + if (nFeat >= 4) logit += cw[3] * fc.z4; + return logit; + } + + // ----------------------------------------------------------------------- + // HTML entity expansion / removal (regex-based, sufficient for fixtures) + // ----------------------------------------------------------------------- + + private static final Pattern NUM_DEC = Pattern.compile("&#(\\d{1,7});"); + private static final Pattern NUM_HEX = Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});"); + private static final Pattern NAMED = + Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);"); + + private static String applyEntityVariant(String s, 
String variant) {
        // Dispatch on the three entity-handling variants used by the
        // fixture eval: leave entities as-is, expand them, or delete them.
        switch (variant) {
            case "raw": return s;
            case "expanded": return expandEntities(s);
            case "removed": return removeEntities(s);
            default: throw new IllegalArgumentException(variant);
        }
    }

    /**
     * Expands the numeric (decimal and hex) character references and the
     * small named-entity subset matched by {@link #NUM_DEC}, {@link #NUM_HEX}
     * and {@link #NAMED} into their literal characters. References that are
     * malformed or outside the Unicode code-point range are left unchanged.
     *
     * @param in text possibly containing HTML entities
     * @return text with the recognized entities replaced
     */
    private static String expandEntities(String in) {
        String s = in;
        s = NUM_DEC.matcher(s).replaceAll(mr -> {
            try {
                int cp = Integer.parseInt(mr.group(1));
                if (cp >= 0 && cp <= 0x10FFFF) {
                    return Matcher.quoteReplacement(new String(Character.toChars(cp)));
                }
            } catch (NumberFormatException ignored) {
                // fall through, leave unchanged
            }
            return Matcher.quoteReplacement(mr.group());
        });
        s = NUM_HEX.matcher(s).replaceAll(mr -> {
            try {
                int cp = Integer.parseInt(mr.group(1), 16);
                if (cp >= 0 && cp <= 0x10FFFF) {
                    return Matcher.quoteReplacement(new String(Character.toChars(cp)));
                }
            } catch (NumberFormatException ignored) {
                // fall through, leave unchanged
            }
            return Matcher.quoteReplacement(mr.group());
        });
        s = NAMED.matcher(s).replaceAll(mr -> {
            switch (mr.group(1)) {
                case "amp": return "&";
                case "lt": return "<";
                case "gt": return ">";
                case "quot": return "\"";
                case "apos": return "'";
                case "nbsp": return " ";
                case "copy": return "©";
                case "reg": return "®";
                default: return Matcher.quoteReplacement(mr.group());
            }
        });
        return s;
    }

    /**
     * Deletes (rather than expands) every entity matched by the same three
     * patterns used in {@link #expandEntities(String)}.
     */
    private static String removeEntities(String s) {
        s = NUM_DEC.matcher(s).replaceAll("");
        s = NUM_HEX.matcher(s).replaceAll("");
        s = NAMED.matcher(s).replaceAll("");
        return s;
    }

    // -----------------------------------------------------------------------
    // Training
    // -----------------------------------------------------------------------

    /**
     * Trains the prototype model: hashed codepoint-bigram and codepoint-unigram
     * log-probability tables (add-alpha smoothed) plus a Bloom filter of the
     * codepoint pairs actually seen in training.
     *
     * @param streams        decoded training records as codepoint arrays
     * @param bigramBuckets  number of bigram hash buckets; must be a power of
     *                       two so bucket selection can use a bit mask
     * @param unigramBuckets number of unigram hash buckets; power of two
     * @param bloomBits      size of the Bloom filter in bits
     * @param bloomK         number of Bloom hash functions
     * @param addAlpha       add-alpha smoothing constant for both tables
     * @param backoffAlpha   weight applied to the unigram backoff at scoring time
     * @return the trained {@link Model}
     * @throws IllegalArgumentException if a bucket count is not a power of two
     */
    private static Model train(List<int[]> streams,
                               int bigramBuckets, int unigramBuckets,
                               int bloomBits, int bloomK,
                               double addAlpha, double backoffAlpha) {
        if (Integer.bitCount(bigramBuckets) != 1 || Integer.bitCount(unigramBuckets) != 1) {
            throw new IllegalArgumentException("Bucket counts must be powers of 2");
        }
        long[] bigramCounts = new long[bigramBuckets];
        long[] unigramCounts = new long[unigramBuckets];
        long bigramTotal = 0;
        long unigramTotal = 0;
        // One long word holds 64 filter bits; round up.
        long[] bloomBitArr = new long[(bloomBits + 63) / 64];

        for (int[] cps : streams) {
            for (int i = 0; i < cps.length; i++) {
                int cp = cps[i];
                // Power-of-two bucket counts let us mask instead of mod.
                int uBucket = (int) (fnv1aUnigram(cp) & (unigramBuckets - 1));
                unigramCounts[uBucket]++;
                unigramTotal++;
                if (i + 1 < cps.length) {
                    int cpNext = cps[i + 1];
                    int bBucket = (int) (fnv1aBigram(cp, cpNext) & (bigramBuckets - 1));
                    bigramCounts[bBucket]++;
                    bigramTotal++;
                    bloomAdd(bloomBitArr, bloomBits, bloomK, cp, cpNext);
                }
            }
        }

        // Convert to log-probabilities with add-alpha smoothing
        float[] bigramLogP = new float[bigramBuckets];
        double bigramDenom = bigramTotal + addAlpha * bigramBuckets;
        for (int i = 0; i < bigramBuckets; i++) {
            double p = (bigramCounts[i] + addAlpha) / bigramDenom;
            bigramLogP[i] = (float) Math.log(p);
        }
        float[] unigramLogP = new float[unigramBuckets];
        double unigramDenom = unigramTotal + addAlpha * unigramBuckets;
        for (int i = 0; i < unigramBuckets; i++) {
            double p = (unigramCounts[i] + addAlpha) / unigramDenom;
            unigramLogP[i] = (float) Math.log(p);
        }

        return new Model(bigramBuckets, unigramBuckets, bigramLogP, unigramLogP,
                bloomBitArr, bloomBits, bloomK, backoffAlpha);
    }

    /**
     * Estimates the mean and standard deviation of the model's mean-log-prob
     * score over a strided sample of the training streams; callers use the
     * pair as a z-score normalization baseline.
     *
     * @return {@code {mu, sigma}}; {@code {0, 1}} if nothing was scorable
     */
    private static double[] calibrate(Model m, List<int[]> streams) {
        double s = 0;
        double s2 = 0;
        int n = 0;
        // Use a stride to avoid scoring every single train record
        int stride = Math.max(1, streams.size() / 1000);
        for (int i = 0; i < streams.size(); i += stride) {
            int[] cps = streams.get(i);
            if (cps.length < MIN_SCORE_CODEPOINTS) continue;
            ScoreResult r = score(m, cps);
            s += r.meanLogP;
            s2 += r.meanLogP * r.meanLogP;
            n++;
        }
        if (n == 0) return new double[]{0, 1};
        double mu = s / n;
        // Var = E[x^2] - mu^2; clamp at 0 against floating-point cancellation.
        double var = Math.max(0, s2 / n - mu * mu);
        double sigma = Math.sqrt(var);
        return new double[]{mu, sigma};
    }
+ + // ----------------------------------------------------------------------- + // Scoring + // ----------------------------------------------------------------------- + + private static ScoreResult score(Model m, int[] cps) { + if (cps.length < 2) return new ScoreResult(Double.NaN, 0, 0); + double sum = 0; + int n = 0; + int seen = 0; + for (int i = 0; i + 1 < cps.length; i++) { + int cp1 = cps[i]; + int cp2 = cps[i + 1]; + double logP; + if (bloomContains(m.bloomBits, m.bloomBitCount, m.bloomK, cp1, cp2)) { + int b = (int) (fnv1aBigram(cp1, cp2) & (m.bigramBuckets - 1)); + logP = m.bigramLogP[b]; + seen++; + } else { + int u1 = (int) (fnv1aUnigram(cp1) & (m.unigramBuckets - 1)); + int u2 = (int) (fnv1aUnigram(cp2) & (m.unigramBuckets - 1)); + logP = m.backoffAlpha * (m.unigramLogP[u1] + m.unigramLogP[u2]); + } + sum += logP; + n++; + } + return new ScoreResult(sum / n, n, seen); + } + + private static final class ScoreResult { + final double meanLogP; + final int nPairs; + final int seenPairs; + ScoreResult(double m, int n, int s) { + this.meanLogP = m; + this.nPairs = n; + this.seenPairs = s; + } + } + + // ----------------------------------------------------------------------- + // Eval at one length bucket + // ----------------------------------------------------------------------- + + private static EvalCell evalAtLength(Model m, List evalBytes, int length, + Charset cleanCs, Charset wrongCs) { + List cleans = new ArrayList<>(); + List mojis = new ArrayList<>(); + List margins = new ArrayList<>(); + double seenSumClean = 0, seenSumMoji = 0; + int nSeenObs = 0; + for (byte[] rec : evalBytes) { + if (rec.length < length) continue; + byte[] slice = Arrays.copyOf(rec, length); + int[] cleanCps = toCodepoints(decode(slice, cleanCs)); + int[] mojiCps = toCodepoints(decode(slice, wrongCs)); + if (cleanCps.length < MIN_SCORE_CODEPOINTS + || mojiCps.length < MIN_SCORE_CODEPOINTS) continue; + ScoreResult sc = score(m, cleanCps); + ScoreResult sm = score(m, mojiCps); + 
if (Double.isNaN(sc.meanLogP) || Double.isNaN(sm.meanLogP)) continue; + cleans.add(sc.meanLogP); + mojis.add(sm.meanLogP); + margins.add(sc.meanLogP - sm.meanLogP); + if (sc.nPairs > 0) seenSumClean += (double) sc.seenPairs / sc.nPairs; + if (sm.nPairs > 0) seenSumMoji += (double) sm.seenPairs / sm.nPairs; + nSeenObs++; + } + if (margins.size() < 30) return null; + EvalCell cell = new EvalCell(); + cell.n = margins.size(); + cell.meanClean = mean(cleans); + cell.stdClean = std(cleans, cell.meanClean); + cell.meanMoji = mean(mojis); + cell.meanMargin = mean(margins); + cell.stdMargin = std(margins, cell.meanMargin); + cell.p5Margin = percentile(margins, 0.05); + cell.p50Margin = percentile(margins, 0.50); + cell.bloomSeenFracClean = nSeenObs > 0 ? seenSumClean / nSeenObs : Double.NaN; + cell.bloomSeenFracMoji = nSeenObs > 0 ? seenSumMoji / nSeenObs : Double.NaN; + return cell; + } + + private static final class EvalCell { + int n; + double meanClean, stdClean; + double meanMoji; + double meanMargin, stdMargin; + double p5Margin, p50Margin; + double bloomSeenFracClean, bloomSeenFracMoji; + } + + // ----------------------------------------------------------------------- + // FNV-1a hashing for codepoint bigram / unigram + Bloom filter + // ----------------------------------------------------------------------- + + private static final long FNV_OFFSET = 0xcbf29ce484222325L; + private static final long FNV_PRIME = 0x100000001b3L; + + private static long fnv1aBigram(int cp1, int cp2) { + long h = FNV_OFFSET; + h = (h ^ ((cp1 >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp1 >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp1 >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp1 & 0xFF)) * FNV_PRIME; + h = (h ^ 0xFF) * FNV_PRIME; // separator + h = (h ^ ((cp2 >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp2 >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp2 >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp2 & 0xFF)) * FNV_PRIME; + return h; + } + + private static long fnv1aUnigram(int cp) { + long h 
= FNV_OFFSET; + h = (h ^ ((cp >>> 24) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp >>> 16) & 0xFF)) * FNV_PRIME; + h = (h ^ ((cp >>> 8) & 0xFF)) * FNV_PRIME; + h = (h ^ (cp & 0xFF)) * FNV_PRIME; + return h; + } + + private static long secondaryHash(int cp1, int cp2) { + // Independent secondary hash for Bloom double-hashing. Just shuffle + // the inputs differently. + long h = 0xff51afd7ed558ccdL; + h = (h ^ Integer.reverse(cp1)) * 0xc4ceb9fe1a85ec53L; + h = (h ^ Integer.reverse(cp2)) * 0xc4ceb9fe1a85ec53L; + h ^= h >>> 33; + return h; + } + + private static void bloomAdd(long[] bits, int bitCount, int k, int cp1, int cp2) { + long h1 = fnv1aBigram(cp1, cp2); + long h2 = secondaryHash(cp1, cp2); + for (int i = 0; i < k; i++) { + long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount; + bits[(int) (pos >>> 6)] |= 1L << (pos & 63); + } + } + + private static boolean bloomContains(long[] bits, int bitCount, int k, + int cp1, int cp2) { + long h1 = fnv1aBigram(cp1, cp2); + long h2 = secondaryHash(cp1, cp2); + for (int i = 0; i < k; i++) { + long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount; + if ((bits[(int) (pos >>> 6)] & (1L << (pos & 63))) == 0) return false; + } + return true; + } + + private static long packPair(int cp1, int cp2) { + return ((long) cp1 << 32) | (cp2 & 0xFFFFFFFFL); + } + + // ----------------------------------------------------------------------- + // I/O and decode utilities (copied from EvalJunkOnCharsetDevtest) + // ----------------------------------------------------------------------- + + private static List readRecords(Path file, int maxRecords) throws IOException { + List records = new ArrayList<>(); + try (FileInputStream fis = new FileInputStream(file.toFile()); + GZIPInputStream gis = new GZIPInputStream(fis); + DataInputStream dis = new DataInputStream(gis)) { + while (records.size() < maxRecords) { + int len; + try { + len = dis.readUnsignedShort(); + } catch (EOFException eof) { + break; + } + byte[] rec = new 
byte[len]; + dis.readFully(rec); + records.add(rec); + } + } + return records; + } + + private static String decode(byte[] bytes, Charset cs) { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + try { + return dec.decode(ByteBuffer.wrap(bytes)).toString(); + } catch (CharacterCodingException e) { + return new String(bytes, cs); + } + } + + private static int[] toCodepoints(String s) { + int[] cps = new int[s.length()]; + int n = 0; + for (int i = 0; i < s.length(); ) { + int cp = s.codePointAt(i); + cps[n++] = cp; + i += Character.charCount(cp); + } + return Arrays.copyOf(cps, n); + } + + // ----------------------------------------------------------------------- + // Stats + // ----------------------------------------------------------------------- + + private static double mean(List xs) { + double s = 0; + int n = 0; + for (double v : xs) { + if (!Double.isNaN(v)) { + s += v; + n++; + } + } + return n == 0 ? Double.NaN : s / n; + } + + private static double std(List xs, double mu) { + if (xs.size() < 2) return 0; + double s = 0; + int n = 0; + for (double v : xs) { + if (!Double.isNaN(v)) { + s += (v - mu) * (v - mu); + n++; + } + } + return n < 2 ? 
0 : Math.sqrt(s / (n - 1)); + } + + private static double percentile(List xs, double p) { + List sorted = new ArrayList<>(xs); + sorted.removeIf(v -> Double.isNaN(v)); + if (sorted.isEmpty()) return Double.NaN; + Collections.sort(sorted); + int idx = (int) Math.floor(p * (sorted.size() - 1)); + return sorted.get(idx); + } + + // ----------------------------------------------------------------------- + // Model + // ----------------------------------------------------------------------- + + private static final class Model { + final int bigramBuckets; + final int unigramBuckets; + final float[] bigramLogP; + final float[] unigramLogP; + final long[] bloomBits; + final int bloomBitCount; + final int bloomK; + final double backoffAlpha; + Model(int bb, int ub, float[] blp, float[] ulp, + long[] bloom, int bbc, int bk, double a) { + this.bigramBuckets = bb; + this.unigramBuckets = ub; + this.bigramLogP = blp; + this.unigramLogP = ulp; + this.bloomBits = bloom; + this.bloomBitCount = bbc; + this.bloomK = bk; + this.backoffAlpha = a; + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java new file mode 100644 index 00000000000..b384d5f4c51 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * Codepoint-level script census of one or more text files. For each input + * file, reports the percentage of codepoints in each {@link + * Character.UnicodeScript}, optionally per-line script-mix histograms. + * + *

<p>Useful to verify whether {@code BuildJunkTrainingData} is bucketing + * languages correctly: e.g. Japanese is usually a mix of HIRAGANA, KATAKANA + * and HAN; if {@code jpn} ends up in {@code han.train.gz} we want to know + * what fraction of its codepoints are actually Han ideographs vs. kana. + * + *

Usage: + *

+ * <pre>{@code
+ *   java ScriptCensus <file> [file ...]   # supports .gz and plain text
+ * }</pre>
+ */ +public final class ScriptCensus { + + /** Max lines to sample per file (set high for full pass). */ + private static final int MAX_LINES = 200_000; + + private ScriptCensus() {} + + public static void main(String[] args) throws IOException { + if (args.length < 1) { + System.err.println("Usage: ScriptCensus [file ...]"); + System.exit(1); + } + for (String arg : args) { + Path f = Paths.get(arg); + if (!Files.isRegularFile(f)) { + System.err.println("Skipping non-file: " + f); + continue; + } + reportOne(f); + System.out.println(); + } + } + + private static void reportOne(Path file) throws IOException { + Map scriptCounts = new HashMap<>(); + // Per-line dominant-script histogram. + Map dominantHistogram = new HashMap<>(); + long total = 0; + long lines = 0; + long sampledBytes = 0; + + try (BufferedReader r = open(file)) { + String line; + while ((line = r.readLine()) != null && lines < MAX_LINES) { + lines++; + sampledBytes += line.length(); + // For MADLAD/Wikipedia files the format is "lineNum TAB text"; + // strip the prefix if present. + int tab = line.indexOf('\t'); + String text = tab >= 0 ? line.substring(tab + 1) : line; + + Map perLine = new HashMap<>(); + for (int i = 0; i < text.length(); ) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s == Character.UnicodeScript.COMMON + || s == Character.UnicodeScript.INHERITED + || s == Character.UnicodeScript.UNKNOWN) { + continue; + } + String name = s.name(); + scriptCounts.computeIfAbsent(name, k -> new long[1])[0]++; + perLine.merge(name, 1L, Long::sum); + total++; + } + // Identify the dominant script for this line. 
+ String dom = null; + long best = -1; + for (Map.Entry e : perLine.entrySet()) { + if (e.getValue() > best) { + best = e.getValue(); + dom = e.getKey(); + } + } + if (dom != null) { + dominantHistogram.computeIfAbsent(dom, k -> new long[1])[0]++; + } + } + } + + System.out.printf("File: %s%n", file); + System.out.printf(" lines sampled: %,d total codepoints (excl. COMMON/INHERITED): %,d%n%n", + lines, total); + + if (total == 0) { + System.out.println(" (empty / no scripted codepoints)"); + return; + } + + System.out.println(" Codepoint distribution by script:"); + List> sorted = new ArrayList<>(scriptCounts.entrySet()); + sorted.sort(Comparator.comparingLong((Map.Entry e) -> -e.getValue()[0])); + long cumulative = 0; + for (Map.Entry e : sorted) { + long c = e.getValue()[0]; + cumulative += c; + double pct = 100.0 * c / total; + double cumPct = 100.0 * cumulative / total; + if (pct < 0.01 && c < 100) continue; + System.out.printf(" %-22s %,14d %6.2f%% (cum %6.2f%%)%n", + e.getKey(), c, pct, cumPct); + } + + System.out.println(); + System.out.println(" Per-line dominant-script histogram:"); + List> dom = new ArrayList<>(dominantHistogram.entrySet()); + dom.sort(Comparator.comparingLong((Map.Entry e) -> -e.getValue()[0])); + long domTotal = 0; + for (long[] v : dominantHistogram.values()) domTotal += v[0]; + for (Map.Entry e : dom) { + long c = e.getValue()[0]; + double pct = 100.0 * c / domTotal; + if (pct < 0.05) continue; + System.out.printf(" %-22s %,12d %6.2f%% of lines%n", + e.getKey(), c, pct); + } + } + + private static BufferedReader open(Path path) throws IOException { + if (path.getFileName().toString().endsWith(".gz")) { + return new BufferedReader(new InputStreamReader( + new GZIPInputStream(Files.newInputStream(path)), + StandardCharsets.UTF_8)); + } + return Files.newBufferedReader(path, StandardCharsets.UTF_8); + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index fe99f3214e3..cf52a9eedfc 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -41,6 +41,9 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; +import org.apache.tika.ml.junkdetect.JunkDetector; +import org.apache.tika.ml.junkdetect.V7Tables; + /** * Trains the junk detector model from per-script corpus files produced by * {@link BuildJunkTrainingData}. @@ -124,7 +127,17 @@ public class TrainJunkModel { static final String MAGIC = "JUNKDET1"; - static final byte VERSION = 5; + /** Sole supported file-format version. Matches JunkDetector.VERSION. */ + static final byte VERSION = 7; + + // ----------------------------------------------------------------------- + // v7 model constants (per-script open-addressing codepoint-bigram tables) + // ----------------------------------------------------------------------- + + /** Unigram backoff multiplier. α=1.0 = plain independence; prototype validated. */ + static final float V7_BACKOFF_ALPHA = 1.0f; + /** Additive smoothing constant for log-prob computation. */ + static final double V7_ADD_ALPHA = 0.01; /** Number of clean (and corrupted) windows used to train the per-script classifier. */ static final int NUM_CLASSIFIER_SAMPLES = 500; @@ -179,6 +192,25 @@ public static void main(String[] args) throws IOException { "datasets", "madlad", "junkdetect"); Path output = dataDir.resolve("junkdetect.bin"); + // Durable training parameters live in JunkDetectorTrainingConfig; this + // tool deliberately refuses CLI overrides so a built model file's + // identity always matches a committed config. 
+ int minBigramCount = JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT; + double loadFactor = JunkDetectorTrainingConfig.OA_LOAD_FACTOR; + int keyIndexBits = JunkDetectorTrainingConfig.KEY_INDEX_BITS; + if (minBigramCount < 1) { + System.err.println("ERROR: MIN_BIGRAM_COUNT must be >= 1"); + System.exit(1); + } + if (loadFactor <= 0 || loadFactor >= 1) { + System.err.println("ERROR: OA_LOAD_FACTOR must be in (0, 1), got " + loadFactor); + System.exit(1); + } + if (keyIndexBits < 1 || keyIndexBits > 16) { + System.err.println("ERROR: KEY_INDEX_BITS must be in [1, 16], got " + keyIndexBits); + System.exit(1); + } + for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--data-dir": @@ -187,6 +219,12 @@ public static void main(String[] args) throws IOException { case "--output": output = Paths.get(args[++i]); break; + case "--bloom-bits": + case "--min-bigram-count": + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." + + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); + break; default: System.err.println("Unknown argument: " + args[i]); printUsage(); @@ -194,31 +232,34 @@ public static void main(String[] args) throws IOException { } } - System.out.println("=== TrainJunkModel (v5) ==="); - System.out.println(" data-dir: " + dataDir); - System.out.println(" output: " + output); + System.out.println("=== TrainJunkModel ==="); + System.out.println(" data-dir: " + dataDir); + System.out.println(" output: " + output); + System.out.println(" --- v7 format constants (TrainJunkModel) ---"); + System.out.printf( " backoff_alpha: %.2f%n", V7_BACKOFF_ALPHA); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " min_bigram_count: %d%n", minBigramCount); + System.out.printf( " oa_load_factor: %.2f%n", loadFactor); + System.out.printf( " key_index_bits: %d%n", keyIndexBits); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); 
System.exit(1); } - System.out.print("Building Unicode named-block index... "); + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); + System.out.printf("Block bucketing: %d named blocks + 1 unassigned " + + "(scheme version %d, JVM-independent)%n", + blockN - 1, org.apache.tika.ml.junkdetect.UnicodeBlockRanges.SCHEME_VERSION); long t0 = System.currentTimeMillis(); - Map blockIndex = buildBlockIndex(); - int blockN = blockIndex.size() + 1; - System.out.printf("%d named blocks → table size %d×%d (%dms)%n", - blockIndex.size(), blockN, blockN, System.currentTimeMillis() - t0); - - TreeMap bigramTables = new TreeMap<>(); - TreeMap bigramCalibrations = new TreeMap<>(); - TreeMap blockTables = new TreeMap<>(); - TreeMap blockCalibrations = new TreeMap<>(); + + TreeMap f1Calibrations = new TreeMap<>(); + TreeMap blockTables = new TreeMap<>(); + TreeMap blockCalibrations = new TreeMap<>(); TreeMap controlCalibrations = new TreeMap<>(); - TreeMap classifierWeights = new TreeMap<>(); - TreeMap devFilePaths = new TreeMap<>(); - List allTrainFiles = new ArrayList<>(); - List allDevFiles = new ArrayList<>(); + TreeMap classifierWeights = new TreeMap<>(); + TreeMap trainFilePaths = new TreeMap<>(); + List allTrainFiles = new ArrayList<>(); List trainFiles; try (var stream = Files.list(dataDir)) { @@ -234,69 +275,62 @@ public static void main(String[] args) throws IOException { } // ----------------------------------------------------------------------- - // Phase 1 — per-script bigram tables, block tables, calibrations + // Phase 1 — per-script F1 tables (V7), F1 calibration, F2 block tables, + // F3 control-byte calibration // ----------------------------------------------------------------------- - System.out.println("\n--- Phase 1: per-script tables and calibrations ---"); + TreeMap f1TablesByScript = new TreeMap<>(); + System.out.println("\n--- Phase 1: per-script F1 tables + calibrations ---"); for (Path trainFile : trainFiles) { String filename = 
trainFile.getFileName().toString(); String script = filename.substring(0, filename.length() - ".train.gz".length()) .toUpperCase(); - Path devFile = trainFile.getParent().resolve( - filename.replace(".train.gz", ".dev.gz")); System.out.printf("%n [%s]%n", script); allTrainFiles.add(trainFile); t0 = System.currentTimeMillis(); - System.out.print(" Training byte-bigram table... "); - float[] bigramTable = trainBigramTable(trainFile); - System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0); + System.out.print(" Training V7 F1 tables (cp index + OA).."); + V7Tables v7 = trainV7TablesForScript(trainFile, minBigramCount, + loadFactor, keyIndexBits); + System.out.printf(" done (%dms)%n", System.currentTimeMillis() - t0); + System.out.println(v7.statsString()); + f1TablesByScript.put(script, v7); t0 = System.currentTimeMillis(); - System.out.print(" Training named-block table... "); - float[] blockTable = trainBlockTable(trainFile, blockIndex, blockN); + System.out.print(" Training named-block table... "); + float[] blockTable = trainBlockTable(trainFile); System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0); - float[] bigramCal = new float[]{0f, 1f}; - float[] blockCal = new float[]{0f, 1f}; - float[] controlCal = new float[]{0f, 1f}; - - if (Files.exists(devFile)) { - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating byte bigrams on dev... "); - bigramCal = computeBigramCalibration(devFile, bigramTable); - System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", - bigramCal[0], bigramCal[1], System.currentTimeMillis() - t0); - - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating named blocks on dev... 
"); - blockCal = computeBlockCalibration(devFile, blockTable, blockIndex, blockN); - System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", - blockCal[0], blockCal[1], System.currentTimeMillis() - t0); - - t0 = System.currentTimeMillis(); - System.out.print(" Calibrating control bytes on dev..."); - controlCal = computeControlByteCalibration(devFile); - System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n", - controlCal[0], controlCal[1], System.currentTimeMillis() - t0); - - devFilePaths.put(script, devFile); - allDevFiles.add(devFile); - } else { - System.out.println(" WARNING: no dev file found, using uncalibrated defaults"); - } + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating F1 (cp-hash) on train.. "); + float[] f1Cal = calibrateF1PerScript(trainFile, v7); + System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", + f1Cal[0], f1Cal[1], System.currentTimeMillis() - t0); + + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating named blocks on train..."); + float[] blockCal = computeBlockCalibration(trainFile, blockTable); + System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", + blockCal[0], blockCal[1], System.currentTimeMillis() - t0); - bigramTables.put(script, bigramTable); - bigramCalibrations.put(script, bigramCal); + t0 = System.currentTimeMillis(); + System.out.print(" Calibrating control bytes on train.."); + float[] controlCal = computeControlByteCalibration(trainFile); + System.out.printf("done — mu=%.6f sigma=%.6f (%dms)%n", + controlCal[0], controlCal[1], System.currentTimeMillis() - t0); + + trainFilePaths.put(script, trainFile); + + f1Calibrations.put(script, f1Cal); blockTables.put(script, blockTable); blockCalibrations.put(script, blockCal); controlCalibrations.put(script, controlCal); - // Placeholder — set in phase 3 + // Placeholder — set in Phase 3 classifierWeights.put(script, new float[]{1f / 4, 1f / 4, 1f / 4, 1f / 4, 0f}); } // ----------------------------------------------------------------------- - 
// Phase 2 — global script-transition table + // Phase 2 — global script-transition table + supporting pools // ----------------------------------------------------------------------- System.out.println("\n--- Phase 2: global script-transition table ---"); List scriptBuckets = buildScriptBuckets(); @@ -314,7 +348,7 @@ public static void main(String[] args) throws IOException { t0 = System.currentTimeMillis(); System.out.print(" Calibrating script transitions... "); - float[] scriptTransCal = calibrateScriptTransitions(allDevFiles, scriptTransTable, + float[] scriptTransCal = calibrateScriptTransitions(allTrainFiles, scriptTransTable, scriptBucketMap, numScriptBuckets); System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n", scriptTransCal[0], scriptTransCal[1], System.currentTimeMillis() - t0); @@ -334,21 +368,21 @@ public static void main(String[] args) throws IOException { System.out.printf("%d tables built%n", remapTables.size()); // ----------------------------------------------------------------------- - // Phase 3 — per-script linear classifiers (now with z4) + // Phase 3 — per-script linear classifiers using v6 features // ----------------------------------------------------------------------- System.out.println("\n--- Phase 3: per-script linear classifiers (z1,z2,z3,z4) ---"); - for (String script : bigramTables.keySet()) { - Path devFile = devFilePaths.get(script); - if (devFile == null) { - System.out.printf(" [%s] WARNING: no dev file, keeping equal-weight defaults%n", script); + for (String script : f1Calibrations.keySet()) { + Path trainFile = trainFilePaths.get(script); + if (trainFile == null) { + System.out.printf(" [%s] WARNING: no train file, keeping equal-weight defaults%n", script); continue; } t0 = System.currentTimeMillis(); System.out.printf(" [%s] training classifier... 
", script); - float[] weights = trainClassifier(devFile, - bigramTables.get(script), bigramCalibrations.get(script), + float[] weights = trainClassifierV7(trainFile, + f1TablesByScript.get(script), f1Calibrations.get(script), blockTables.get(script), blockCalibrations.get(script), - controlCalibrations.get(script), blockIndex, blockN, + controlCalibrations.get(script), scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets, scriptCodepoints, remapTables); classifierWeights.put(script, weights); @@ -358,82 +392,31 @@ public static void main(String[] args) throws IOException { } System.out.printf("%nWriting model (%d scripts, blockN=%d, scriptBuckets=%d) → %s%n", - bigramTables.size(), blockN, numScriptBuckets, output); - saveModel(bigramTables, bigramCalibrations, + f1Calibrations.size(), blockN, numScriptBuckets, output); + saveModelV7(f1TablesByScript, f1Calibrations, blockTables, blockCalibrations, controlCalibrations, classifierWeights, - blockIndex, blockN, scriptBuckets, scriptTransTable, scriptTransCal, output); - System.out.printf("Model size: %,d bytes (%.1f MB)%n", - Files.size(output), Files.size(output) / 1_000_000.0); + scriptBuckets, scriptTransTable, scriptTransCal, + output); + System.out.printf("Model size: %,d bytes (%.1f KB)%n", + Files.size(output), Files.size(output) / 1024.0); System.out.println("Done."); } - // ----------------------------------------------------------------------- - // Block index - // ----------------------------------------------------------------------- - - /** - * Builds a stable ordered mapping from {@link Character.UnicodeBlock} to integer index - * by scanning all valid Unicode codepoints in order (U+0000 to U+10FFFF) and - * recording each block's first occurrence. - * - *

The resulting map has {@code size()} entries (one per named block). - * Callers should reserve index {@code size()} as the "unassigned" bucket - * (for codepoints where {@code UnicodeBlock.of(cp)} returns null). - * - * @return immutable ordered map: UnicodeBlock → integer index [0, size) - */ - static Map buildBlockIndex() { - LinkedHashMap index = new LinkedHashMap<>(); - for (int cp = 0; cp <= 0x10FFFF; cp++) { - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - if (b != null) index.putIfAbsent(b, index.size()); - } - return Collections.unmodifiableMap(index); - } - // ----------------------------------------------------------------------- // Training // ----------------------------------------------------------------------- /** - * Trains a 256×256 byte-bigram log-probability table from a gzipped sentence file. - * - * @return float[65536] where index {@code a*256+b} = log P(b|a) - */ - static float[] trainBigramTable(Path trainGz) throws IOException { - long[] counts = new long[65536]; - long totalBigrams = 0; - long sentences = 0; - - try (BufferedReader r = openGzipped(trainGz)) { - String line; - while ((line = r.readLine()) != null) { - byte[] bytes = line.getBytes(StandardCharsets.UTF_8); - for (int i = 0; i + 1 < bytes.length; i++) { - counts[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)]++; - totalBigrams++; - } - sentences++; - } - } - - System.out.printf(" %,d sentences, %,d byte bigrams%n", sentences, totalBigrams); - return laplaceSmoothLogProb(counts, 256); - } - - /** - * Trains a {@code blockN×blockN} named-Unicode-block transition log-probability table. + * Trains a {@code N × N} block-transition log-probability table where + * {@code N = UnicodeBlockRanges.bucketCount()}. Block bucketing uses + * the JVM-independent {@link UnicodeBlockRanges} table. 
* - * @param blockIndex ordered mapping from UnicodeBlock to index [0, blockIndex.size()) - * @param blockN blockIndex.size() + 1 (includes the null bucket) - * @return float[blockN*blockN] where index {@code a*blockN+b} = log P(block_b | block_a) + * @return float[N*N] where index {@code a*N+b} = log P(block_b | block_a) */ - static float[] trainBlockTable(Path trainGz, - Map blockIndex, - int blockN) throws IOException { + static float[] trainBlockTable(Path trainGz) throws IOException { + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); long[] counts = new long[blockN * blockN]; - int nullId = blockN - 1; long totalBigrams = 0; long sentences = 0; @@ -443,8 +426,7 @@ static float[] trainBlockTable(Path trainGz, int prev = -1; for (int i = 0; i < line.length(); ) { int cp = line.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; + int blockId = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp); if (prev >= 0) { counts[prev * blockN + blockId]++; totalBigrams++; @@ -533,37 +515,17 @@ static List sampleSubstrings(Path devGz, int nSamples, return result; } - /** @return float[2] = {mu, sigma} of byte-bigram mean log-prob on dev windows */ - static float[] computeBigramCalibration(Path devGz, float[] bigramTable) throws IOException { - List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 42); - List scores = new ArrayList<>(windows.size()); - for (String window : windows) { - byte[] bytes = window.getBytes(StandardCharsets.UTF_8); - if (bytes.length < 2) continue; - double sum = 0; - for (int i = 0; i + 1 < bytes.length; i++) { - sum += bigramTable[((bytes[i] & 0xFF) << 8) | (bytes[i + 1] & 0xFF)]; - } - scores.add(sum / (bytes.length - 1)); - } - System.out.printf(" %,d dev windows%n", scores.size()); - return muSigma(scores); - } - /** @return float[2] = {mu, sigma} of block-transition mean log-prob on dev windows 
*/ - static float[] computeBlockCalibration(Path devGz, float[] blockTable, - Map blockIndex, - int blockN) throws IOException { + static float[] computeBlockCalibration(Path devGz, float[] blockTable) throws IOException { + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 43); List scores = new ArrayList<>(windows.size()); - int nullId = blockN - 1; for (String window : windows) { int[] ids = new int[window.length()]; int len = 0; for (int i = 0; i < window.length(); ) { int cp = window.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - ids[len++] = b != null ? blockIndex.getOrDefault(b, nullId) : nullId; + ids[len++] = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketOf(cp); i += Character.charCount(cp); } if (len < 2) continue; @@ -623,166 +585,15 @@ static float[] computeControlByteCalibration(Path devGz) throws IOException { * @param remapTables list of pre-built wrong-codec remap tables from {@link #buildRemapTable} * @return float[5] = {w1, w2, w3, w4, bias} — classifier weights; positive logit = clean */ - static float[] trainClassifier(Path devGz, - float[] bigramTable, float[] bigramCal, - float[] blockTable, float[] blockCal, - float[] controlCal, - Map blockIndex, - int blockN, - float[] scriptTransTable, float[] scriptTransCal, - Map scriptBucketMap, int numScriptBuckets, - Map> scriptCodepoints, - List> remapTables) - throws IOException { - int nEach = NUM_CLASSIFIER_SAMPLES; - // Clean windows - List cleanWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 100); - - // Corrupted windows: sample base windows (seed 101), then distort - // Four-way rotation: inject / shuffle / cross-script / wrong-codec remap - List baseWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 101); - Random rng = new Random(102); - List corruptedWindows = new ArrayList<>(nEach); - for (int i = 0; i < baseWindows.size(); i++) { - String w = 
baseWindows.get(i); - switch (i % 4) { - case 0: - corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); - break; - case 1: - corruptedWindows.add(shuffleChars(w, rng)); - break; - case 2: - corruptedWindows.add(injectCrossScriptChars(w, CLASSIFIER_INJECT_RATE, rng, - scriptCodepoints)); - break; - default: - if (!remapTables.isEmpty()) { - Map table = - remapTables.get(rng.nextInt(remapTables.size())); - corruptedWindows.add(wrongCodecRemap(w, table, CLASSIFIER_INJECT_RATE, rng)); - } else { - corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); - } - break; - } - } - - // Build (z1, z2, z3, z4) feature matrix - List features = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); - List labels = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); - - for (String w : cleanWindows) { - features.add(extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); - labels.add(1); // clean - } - for (String w : corruptedWindows) { - features.add(extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); - labels.add(0); // corrupted - } - - float[] weights = fitLogisticRegression(features, labels, 4); - - // Calibrate bias using only short (len=15) windows so that FPR ≤ 2.5% - // even at the worst-case (shortest) window length. 
- List shortWindows = sampleSubstrings(devGz, nEach, new int[]{15}, 200); - List shortLogits = new ArrayList<>(shortWindows.size()); - int nFeat = weights.length - 1; - for (String w : shortWindows) { - float[] x = extractFeatures(w, bigramTable, bigramCal, - blockTable, blockCal, blockN, controlCal, blockIndex, - scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets); - float logit = weights[nFeat]; - for (int j = 0; j < nFeat; j++) logit += weights[j] * x[j]; - shortLogits.add(logit); - } - if (!shortLogits.isEmpty()) { - Collections.sort(shortLogits); - int pIdx = (int) (0.025 * shortLogits.size()); - float p025 = shortLogits.get(Math.max(0, pIdx)); - weights[nFeat] -= p025; - } - - return weights; - } - - /** - * Extracts calibrated z-scores (z1, z2, z3, z4) for a single text window. - * - * @return float[4] = {z1_bigram, z2_block, z3_control, z4_scriptTrans} - */ - static float[] extractFeatures(String window, - float[] bigramTable, float[] bigramCal, - float[] blockTable, float[] blockCal, - int blockN, float[] controlCal, - Map blockIndex, - float[] scriptTransTable, float[] scriptTransCal, - Map scriptBucketMap, int numScriptBuckets) { - byte[] utf8 = window.getBytes(StandardCharsets.UTF_8); - - // z1: byte-bigram mean log-prob - float z1 = 0f; - if (utf8.length >= 2) { - double sum = 0; - int count = 0; - for (int i = 0; i + 1 < utf8.length; i++) { - sum += bigramTable[((utf8[i] & 0xFF) << 8) | (utf8[i + 1] & 0xFF)]; - count++; - } - z1 = ((float) (sum / count) - bigramCal[0]) / bigramCal[1]; - } - - // z2: block-transition mean log-prob - float z2 = 0f; - if (blockTable != null && window.length() >= 2) { - int nullId = blockN - 1; - int prev = -1; - double sum = 0; - int count = 0; - for (int i = 0; i < window.length(); ) { - int cp = window.codePointAt(i); - Character.UnicodeBlock b = Character.UnicodeBlock.of(cp); - int blockId = b != null ? 
blockIndex.getOrDefault(b, nullId) : nullId; - if (prev >= 0) { - sum += blockTable[prev * blockN + blockId]; - count++; - } - prev = blockId; - i += Character.charCount(cp); - } - if (count > 0) { - z2 = ((float) (sum / count) - blockCal[0]) / blockCal[1]; - } - } - - // z3: control-byte fraction (stored as −fraction, so higher = cleaner) - float z3 = 0f; - if (utf8.length > 0 && controlCal != null) { - long controlCount = 0; - for (byte b : utf8) { - if (isControlByte(b & 0xFF)) controlCount++; - } - float score = -(float) controlCount / utf8.length; - z3 = (score - controlCal[0]) / controlCal[1]; - } - - // z4: script-transition mean log-prob (raw UnicodeScript, no model fallback) - float z4 = 0f; - if (scriptTransTable != null && scriptTransCal != null) { - double raw = rawScriptTransitionLogProb(window, scriptTransTable, - scriptBucketMap, numScriptBuckets, numScriptBuckets - 1); - if (!Double.isNaN(raw)) { - z4 = ((float) raw - scriptTransCal[0]) / scriptTransCal[1]; - } - } - - return new float[]{z1, z2, z3, z4}; - } + // Per-feature z-score helpers (z2, z3, z4) for the classifier-training + // path live on JunkDetector as public static methods so they are the + // SOLE implementation — inference and training share the exact same + // math by construction. See {@link JunkDetector#computeZ2BlockTransition}, + // {@link JunkDetector#computeZ3ControlByte}, + // {@link JunkDetector#computeZ4ScriptTransition}. z1 (codepoint-hash) + // is computed against the in-progress hash tables during training and + // against the loaded model at inference. /** * Replaces a random fraction of characters with Unicode control characters. 
@@ -900,50 +711,394 @@ static float[] fitLogisticRegression(List features, List label // Model serialisation // ----------------------------------------------------------------------- + private static byte[] toBytes(float[] table) { + ByteBuffer buf = ByteBuffer.allocate(table.length * 4).order(ByteOrder.BIG_ENDIAN); + for (float v : table) buf.putFloat(v); + return buf.array(); + } + + // ----------------------------------------------------------------------- + // v7 Phase 1: per-script open-addressing F1 table training + // ----------------------------------------------------------------------- + + /** + * Builds the {@link V7Tables} F1 carrier for one script's training data. + * + *

Two-pass: + *

    + *
  1. Pass 1. Count every (cpA, cpB) pair occurrence and every + * cp unigram occurrence in the script's {@code *.train.gz} file. + * Pairs with count {@code < minBigramCount} are dropped at this + * step — they're typically OCR artifacts and proper-noun noise.
  2. + *
  3. Pass 2. Collect every codepoint that appears in any + * kept pair (as either side), sort, assign each a dense small + * index. Build a power-of-two open-addressing hash table sized + * for {@code keptPairs / loadFactor}; pack each retained + * {@code (idxA, idxB)} into a 32-bit key and insert via linear + * probing. Quantize both bigram log-probs and unigram log-probs + * to 8-bit.
  4. + *
+ * + *

Returned {@link V7Tables} are ready to hand to + * {@link #saveModelV7}. + * + * @param trainFile the per-script {@code *.train.gz} + * @param minBigramCount drop pairs whose count is below this + * @param loadFactor target OA table load factor (e.g. 0.5) + * @param keyIndexBits bit-width per index in the packed key + * (each side of the pair must fit) + */ + public static V7Tables trainV7TablesForScript(Path trainFile, + int minBigramCount, + double loadFactor, + int keyIndexBits) throws IOException { + // --- Pass 1: tally pair and unigram counts. --- + HashMap pairCounts = new HashMap<>(1 << 14); + HashMap unigramCounts = new HashMap<>(1 << 12); + long bigramTotal = 0; + long unigramTotal = 0; + + try (BufferedReader r = openGzipped(trainFile)) { + String line; + while ((line = r.readLine()) != null) { + int prevCp = -1; + for (int i = 0; i < line.length(); ) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + long[] uc = unigramCounts.get(cp); + if (uc == null) { + unigramCounts.put(cp, new long[]{1L}); + } else { + uc[0]++; + } + unigramTotal++; + if (prevCp >= 0) { + long packed = ((long) prevCp << 32) | (cp & 0xFFFFFFFFL); + long[] bc = pairCounts.get(packed); + if (bc == null) { + pairCounts.put(packed, new long[]{1L}); + } else { + bc[0]++; + } + bigramTotal++; + } + prevCp = cp; + } + } + } + + // --- Filter pairs by count, collect kept-codepoint set. --- + int totalDistinct = pairCounts.size(); + int keptPairs = 0; + long keptBigramTotal = 0; + java.util.TreeSet keptCodepoints = new java.util.TreeSet<>(); + for (Map.Entry e : pairCounts.entrySet()) { + if (e.getValue()[0] < minBigramCount) continue; + keptPairs++; + keptBigramTotal += e.getValue()[0]; + long packed = e.getKey(); + int cpA = (int) (packed >>> 32); + int cpB = (int) (packed & 0xFFFFFFFFL); + keptCodepoints.add(cpA); + keptCodepoints.add(cpB); + } + int dropped = totalDistinct - keptPairs; + + // --- Build sorted codepoint index. 
--- + int[] cpIndex = new int[keptCodepoints.size()]; + int idx = 0; + for (int cp : keptCodepoints) { + cpIndex[idx++] = cp; + } + // Enforce the indexable-bits contract. + int maxIndex = (1 << keyIndexBits) - 1; + if (cpIndex.length > maxIndex + 1) { + throw new IllegalStateException("Per-script codepoint count " + + cpIndex.length + " exceeds 2^KEY_INDEX_BITS (= " + + (maxIndex + 1) + "). Increase KEY_INDEX_BITS or apply" + + " a tighter pair-count filter for " + + trainFile.getFileName()); + } + + // --- Compute per-pair log-prob (add-α smoothed over kept pairs). --- + // Denominator: kept-bigram total + α × keptPairs (only pairs we store). + double bigramDenom = keptBigramTotal + V7_ADD_ALPHA * keptPairs; + // Unigram log-probs. We keep one entry per indexed codepoint; the + // denominator uses ALL unigram observations (kept pairs only would + // bias the backoff toward common pairs). + double unigramDenom = unigramTotal + V7_ADD_ALPHA * unigramCounts.size(); + float[] unigramLogP = new float[cpIndex.length]; + for (int i = 0; i < cpIndex.length; i++) { + long[] uc = unigramCounts.get(cpIndex[i]); + long count = uc != null ? uc[0] : 0L; + double p = (count + V7_ADD_ALPHA) / unigramDenom; + unigramLogP[i] = (float) Math.log(p); + } + // Per-script "absent codepoint" fallback: the lowest unigram log-prob + // we'd assign to a codepoint observed exactly once. A codepoint + // *not* in our index has count 0, so: + double fallbackP = V7_ADD_ALPHA / unigramDenom; + float unigramFallbackLogP = (float) Math.log(fallbackP); + + // Quantize unigram log-probs. + QuantizedFloats qUnigram = quantizeFloats(unigramLogP); + + // --- Build the open-addressing bigram table. --- + int slots = nextPowerOfTwo((int) Math.max(2, Math.ceil(keptPairs / loadFactor))); + int[] keys = new int[slots]; + java.util.Arrays.fill(keys, V7Tables.EMPTY_KEY); + // Compute log-probs first, quantize once, then write into the table + // alongside its key. 
+ float[] keptLogP = new float[keptPairs]; + int[] keptKeys = new int[keptPairs]; + int writeIdx = 0; + // codepoint -> index lookup helper (small map keyed by Integer) + HashMap cpToIdx = new HashMap<>(cpIndex.length * 2); + for (int i = 0; i < cpIndex.length; i++) { + cpToIdx.put(cpIndex[i], i); + } + for (Map.Entry e : pairCounts.entrySet()) { + long count = e.getValue()[0]; + if (count < minBigramCount) continue; + long packed = e.getKey(); + int cpA = (int) (packed >>> 32); + int cpB = (int) (packed & 0xFFFFFFFFL); + int idxA = cpToIdx.get(cpA); + int idxB = cpToIdx.get(cpB); + int packedKey = JunkDetector.packBigramKey(idxA, idxB); + double p = (count + V7_ADD_ALPHA) / bigramDenom; + keptKeys[writeIdx] = packedKey; + keptLogP[writeIdx] = (float) Math.log(p); + writeIdx++; + } + // Quantize all kept log-probs together so they share min/max. + QuantizedFloats qBigram = quantizeFloats(keptLogP); + byte[] values = new byte[slots]; + for (int i = 0; i < keptPairs; i++) { + insertOA(keys, values, keptKeys[i], qBigram.bytes[i]); + } + + System.out.printf( + " pair_counts: distinct=%,d, kept=%,d (>=%d), dropped=%,d " + + "cp_index=%,d slots=%,d (load=%.2f)%n", + totalDistinct, keptPairs, minBigramCount, dropped, + cpIndex.length, slots, keptPairs / (double) slots); + + return new V7Tables(cpIndex, keys, values, qUnigram.bytes, + qBigram.min, qBigram.max, + qUnigram.min, qUnigram.max, + unigramFallbackLogP, V7_BACKOFF_ALPHA); + } + + /** + * Inserts a {@code (packedKey, value)} pair into the open-addressing + * table. The caller is responsible for sizing the table large enough + * to avoid an infinite probe (any load < 1.0 is safe). 
+ */ + private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { + int mask = keys.length - 1; + int h = JunkDetector.mixIndexKey(packedKey) & mask; + while (keys[h] != V7Tables.EMPTY_KEY) { + if (keys[h] == packedKey) { + // Same key twice — shouldn't happen with our dedup, but be + // defensive and overwrite rather than corrupt. + values[h] = value; + return; + } + h = (h + 1) & mask; + } + keys[h] = packedKey; + values[h] = value; + } + + private static int nextPowerOfTwo(int n) { + if (n < 1) return 1; + int p = Integer.highestOneBit(n - 1) << 1; + return Math.max(1, p); + } + + /** + * Computes per-script F1 calibration ({mu, sigma}) by scoring each + * window in the dev file against the trained per-script codepoint + * tables. Delegates to + * {@link org.apache.tika.ml.junkdetect.JunkDetector#computeF1MeanLogP} + * — the single authoritative F1 implementation shared between training + * and inference. + */ + public static float[] calibrateF1PerScript(Path devGz, V7Tables tables) throws IOException { + List windows = sampleSubstrings(devGz, CALIB_SAMPLES, CALIB_LENGTHS, 42); + List scores = new ArrayList<>(windows.size()); + for (String window : windows) { + double score = JunkDetector.computeF1MeanLogP(window, tables); + if (!Double.isNaN(score)) { + scores.add(score); + } + } + System.out.printf(" %,d dev windows%n", scores.size()); + return muSigma(scores); + } + + // ----------------------------------------------------------------------- + // v7 Phase 3: classifier feature extractor + orchestrator + // ----------------------------------------------------------------------- + + /** + * Extracts a 4-dim calibrated z-score vector for one training window + * using the v7 per-script tables. z2/z3/z4 delegate to the public + * helpers on {@link JunkDetector} — same math used at inference, no + * trainer/inference drift possible. 
+ * + * @return float[4] = {z1_cpHash, z2_block, z3_control, z4_scriptTrans} + */ + static float[] extractFeaturesV7(String window, + V7Tables tables, float[] f1Cal, + float[] blockTable, float[] blockCal, + float[] controlCal, + float[] scriptTransTable, float[] scriptTransCal, + Map scriptBucketMap, + int numScriptBuckets) { + byte[] utf8 = window.getBytes(StandardCharsets.UTF_8); + + // z1: per-script codepoint-bigram mean log-prob + float z1 = 0f; + double rawF1 = JunkDetector.computeF1MeanLogP(window, tables); + if (!Double.isNaN(rawF1) && f1Cal != null && f1Cal[1] > 0) { + z1 = ((float) rawF1 - f1Cal[0]) / f1Cal[1]; + } + + float z2 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ2BlockTransition(window, blockTable, blockCal); + float z3 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ3ControlByte(utf8, controlCal); + float z4 = org.apache.tika.ml.junkdetect.JunkDetector + .computeZ4ScriptTransition(window, scriptTransTable, scriptTransCal, + scriptBucketMap, numScriptBuckets); + + return new float[]{z1, z2, z3, z4}; + } + + /** + * Trains a per-script binary logistic regression classifier on + * (z1_cpHash, z2, z3, z4). Same scaffolding as the v6 trainer + * (sample windows, corrupt half, fit LR, bias-calibrate on short + * windows) but uses v7 per-script F1 tables. 
+ */ + static float[] trainClassifierV7(Path devGz, + V7Tables tables, float[] f1Cal, + float[] blockTable, float[] blockCal, + float[] controlCal, + float[] scriptTransTable, float[] scriptTransCal, + Map scriptBucketMap, int numScriptBuckets, + Map> scriptCodepoints, + List> remapTables) + throws IOException { + int nEach = NUM_CLASSIFIER_SAMPLES; + + List cleanWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 100); + + List baseWindows = sampleSubstrings(devGz, nEach, CALIB_LENGTHS, 101); + Random rng = new Random(102); + List corruptedWindows = new ArrayList<>(nEach); + for (int i = 0; i < baseWindows.size(); i++) { + String w = baseWindows.get(i); + switch (i % 4) { + case 0: + corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); + break; + case 1: + corruptedWindows.add(shuffleChars(w, rng)); + break; + case 2: + corruptedWindows.add(injectCrossScriptChars(w, CLASSIFIER_INJECT_RATE, rng, + scriptCodepoints)); + break; + default: + if (!remapTables.isEmpty()) { + Map table = + remapTables.get(rng.nextInt(remapTables.size())); + corruptedWindows.add(wrongCodecRemap(w, table, CLASSIFIER_INJECT_RATE, rng)); + } else { + corruptedWindows.add(injectControlChars(w, CLASSIFIER_INJECT_RATE, rng)); + } + break; + } + } + + List features = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); + List labels = new ArrayList<>(cleanWindows.size() + corruptedWindows.size()); + + for (String w : cleanWindows) { + features.add(extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); + labels.add(1); + } + for (String w : corruptedWindows) { + features.add(extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets)); + labels.add(0); + } + + float[] weights = fitLogisticRegression(features, labels, 4); + + // Bias calibration on short windows so FPR ≤ 2.5% at worst-case 
length. + List shortWindows = sampleSubstrings(devGz, nEach, new int[]{15}, 200); + List shortLogits = new ArrayList<>(shortWindows.size()); + int nFeat = weights.length - 1; + for (String w : shortWindows) { + float[] x = extractFeaturesV7(w, tables, f1Cal, + blockTable, blockCal, controlCal, + scriptTransTable, scriptTransCal, scriptBucketMap, numScriptBuckets); + float logit = weights[nFeat]; + for (int j = 0; j < nFeat; j++) logit += weights[j] * x[j]; + shortLogits.add(logit); + } + if (!shortLogits.isEmpty()) { + Collections.sort(shortLogits); + int pIdx = (int) (0.025 * shortLogits.size()); + float p025 = shortLogits.get(Math.max(0, pIdx)); + weights[nFeat] -= p025; + } + + return weights; + } + /** - * Writes the trained model (version 4) to a gzipped binary file. + * Writes a v7 model file (JUNKDET1 version=7 gzipped binary). * - *

Format documented in the class Javadoc. All multi-byte integers are - * big-endian; floats are IEEE 754 big-endian. + *

Layout vs. v6: no global F1+Bloom section. Each per-script + * section embeds that script's {@link V7Tables} (codepoint index, + * open-addressing bigram keys+values, unigram table) directly after + * its F1 calibration, before F2. See {@link JunkDetector#load} for + * the full layout spec. * - * @param classifierWeights per-script float[5] = {w1, w2, w3, w4, bias} - * @param blockN the block table dimension (blockIndex.size() + 1) - * @param scriptBuckets ordered list of script bucket names (last = "OTHER") - * @param scriptTransTable global script-transition log-prob table - * @param scriptTransCal float[2] = {mu, sigma} for script-transition feature + *

F2 (block transition), F3 (control byte), F4 (script transition) + * sections are unchanged from v6. */ - static void saveModel(TreeMap bigramTables, - TreeMap bigramCalibrations, - TreeMap blockTables, - TreeMap blockCalibrations, - TreeMap controlCalibrations, - TreeMap classifierWeights, - Map blockIndex, - int blockN, - List scriptBuckets, - float[] scriptTransTable, - float[] scriptTransCal, - Path output) throws IOException { + public static void saveModelV7(TreeMap f1Tables, + TreeMap f1Calibrations, + TreeMap blockTables, + TreeMap blockCalibrations, + TreeMap controlCalibrations, + TreeMap classifierWeights, + List scriptBuckets, + float[] scriptTransTable, + float[] scriptTransCal, + Path output) throws IOException { try (DataOutputStream dos = new DataOutputStream( new GZIPOutputStream(Files.newOutputStream(output)))) { dos.write(MAGIC.getBytes(StandardCharsets.UTF_8)); dos.writeByte(VERSION); - dos.writeInt(bigramTables.size()); - dos.writeShort(blockN); + dos.writeInt(f1Calibrations.size()); - // Block names section (v5+): write ordered block names for JVM-independence - String[] blockNames = new String[blockN - 1]; - for (Map.Entry e : blockIndex.entrySet()) { - blockNames[e.getValue()] = e.getKey().toString(); - } - for (String name : blockNames) { - byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8); - dos.writeShort(nameBytes.length); - dos.write(nameBytes); - } + // Block-scheme version byte — bound to the JVM-independent + // UnicodeBlockRanges static table. Mismatch at load time is a + // hard error (no silent re-mapping). 
+ dos.writeByte(org.apache.tika.ml.junkdetect.UnicodeBlockRanges.SCHEME_VERSION); - // Global script-transition section (v4+) + // Global script-transition section int numBuckets = scriptBuckets.size(); dos.writeByte(numBuckets); for (String bucketName : scriptBuckets) { @@ -952,45 +1107,99 @@ static void saveModel(TreeMap bigramTables, dos.write(nameBytes); } dos.write(toBytes(scriptTransTable)); - dos.writeFloat(scriptTransCal[0]); // mu - dos.writeFloat(scriptTransCal[1]); // sigma + dos.writeFloat(scriptTransCal[0]); + dos.writeFloat(scriptTransCal[1]); - for (var entry : bigramTables.entrySet()) { + // Per-script sections. V7 embeds the F1 tables inline. + int blockN = org.apache.tika.ml.junkdetect.UnicodeBlockRanges.bucketCount(); + for (var entry : f1Calibrations.entrySet()) { String script = entry.getKey(); - float[] bigramTable = entry.getValue(); - float[] bigramCal = bigramCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] blockTable = blockTables.getOrDefault(script, new float[blockN * blockN]); - float[] blockCal = blockCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] controlCal = controlCalibrations.getOrDefault(script, new float[]{0f, 1f}); - float[] weights = classifierWeights.getOrDefault(script, + float[] f1Cal = entry.getValue(); + V7Tables tables = f1Tables.get(script); + if (tables == null) { + throw new IllegalStateException("No V7Tables for script " + script); + } + float[] blockTable = blockTables.getOrDefault(script, new float[blockN * blockN]); + float[] blockCal = blockCalibrations.getOrDefault(script, new float[]{0f, 1f}); + float[] controlCal = controlCalibrations.getOrDefault(script, new float[]{0f, 1f}); + float[] weights = classifierWeights.getOrDefault(script, new float[]{1f / 4, 1f / 4, 1f / 4, 1f / 4, 0f}); byte[] nameBytes = script.getBytes(StandardCharsets.UTF_8); dos.writeShort(nameBytes.length); dos.write(nameBytes); - dos.writeFloat(bigramCal[0]); - dos.writeFloat(bigramCal[1]); - 
dos.write(toBytes(bigramTable)); + // F1 calibration + dos.writeFloat(f1Cal[0]); + dos.writeFloat(f1Cal[1]); + + // F1 per-script tables + tables.writeTo(dos); + // F2 — block transitions dos.writeFloat(blockCal[0]); dos.writeFloat(blockCal[1]); dos.write(toBytes(blockTable)); + // F3 — control-byte calibration dos.writeFloat(controlCal[0]); dos.writeFloat(controlCal[1]); + // Classifier weights int numFeatures = weights.length - 1; dos.writeByte(numFeatures); - for (float v : weights) dos.writeFloat(v); + for (float v : weights) { + dos.writeFloat(v); + } } } } - private static byte[] toBytes(float[] table) { - ByteBuffer buf = ByteBuffer.allocate(table.length * 4).order(ByteOrder.BIG_ENDIAN); - for (float v : table) buf.putFloat(v); - return buf.array(); + /** + * Quantizes a float array to 8-bit unsigned by linearly mapping + * {@code [min, max] → [0, 255]}. Returns the byte array; {@code min} + * and {@code max} are computed from the input. + * + *

Stored in v6 model files as 8-bit log-prob tables; reader + * dequantizes via {@code min + (b/255) * (max - min)}. + * + * @return three-element record: byte[] quantized, float min, float max + */ + public static QuantizedFloats quantizeFloats(float[] in) { + float min = Float.POSITIVE_INFINITY; + float max = Float.NEGATIVE_INFINITY; + for (float v : in) { + if (Float.isFinite(v)) { + if (v < min) min = v; + if (v > max) max = v; + } + } + if (!Float.isFinite(min) || !Float.isFinite(max) || max == min) { + // Degenerate input — emit zeros, store dummy range. + return new QuantizedFloats(new byte[in.length], 0f, 1f); + } + byte[] out = new byte[in.length]; + float range = max - min; + for (int i = 0; i < in.length; i++) { + float v = Float.isFinite(in[i]) ? in[i] : min; + int q = Math.round(((v - min) / range) * 255.0f); + if (q < 0) q = 0; + else if (q > 255) q = 255; + out[i] = (byte) q; + } + return new QuantizedFloats(out, min, max); + } + + /** Return type of {@link #quantizeFloats(float[])}. */ + public static final class QuantizedFloats { + public final byte[] bytes; + public final float min; + public final float max; + public QuantizedFloats(byte[] bytes, float min, float max) { + this.bytes = bytes; + this.min = min; + this.max = max; + } } // ----------------------------------------------------------------------- @@ -1307,5 +1516,9 @@ private static void printUsage() { System.err.println(" (default: ~/datasets/madlad/junkdetect)"); System.err.println(" --output Output model file"); System.err.println(" (default: {data-dir}/junkdetect.bin)"); + System.err.println(); + System.err.println("All other training parameters (Bloom filter size, min bigram count, etc.)"); + System.err.println("are fixed in JunkDetectorTrainingConfig and tracked in git. 
Edit that"); + System.err.println("file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index feb9da112e7..644d46bad05 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java index 88a5a8c16fa..e670f9e1639 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorSmokeTest.java @@ -17,6 +17,7 @@ package org.apache.tika.ml.junkdetect; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.StandardCharsets; @@ -31,6 +32,7 @@ /** * Smoke tests verifying the bundled model meets minimum quality thresholds. * Failures indicate the model needs more data or feature extraction is wrong. + * */ public class JunkDetectorSmokeTest { @@ -110,7 +112,7 @@ void cp1252VsCp1257OnBalticText() throws Exception { System.out.println("Baltic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1257", result.winner(), "cp1257 should be identified as the correct encoding for Lithuanian text"); // Delta is weak (pooled LATIN model dilutes Baltic-specific bigrams). // Production threshold is delta > 1.0; PoC floor is 0.1. 
@@ -142,7 +144,7 @@ void cp1252VsCp1251OnRussianText() throws Exception { System.out.println("Russian Cyrillic comparison: " + result); - assertEquals("B", result.winner(), + assertEquals("cp1251", result.winner(), "cp1251 should be identified as the correct encoding for Russian text"); assertTrue(result.delta() > 1.0, "Cyrillic codec separation should be strong: delta=" + result.delta()); @@ -195,10 +197,50 @@ void shiftJisZipEntryNameVsUtf8() throws Exception { System.out.println("Shift-JIS zip entry: " + result); - assertEquals("A", result.winner(), + assertEquals("Shift-JIS", result.winner(), "Shift-JIS decode should beat garbled UTF-8 for short Japanese filename"); } + /** + * Regression: a single CJK codepoint sandwiched between modeled-script + * runs used to NaN-poison the entire score, because the byte-length + * filter ({@code runUtf8.length >= 2}) and the UTF-16 char-length + * filter inside {@code computeF1MeanLogP} ({@code text.length() >= 2}) + * disagreed. A single CJK char is 3 UTF-8 bytes (1 UTF-16 unit), so + * it passed the outer filter, computed NaN inside, and poisoned the + * weighted aggregate — surfacing as UNKNOWN to callers. This was the + * root cause of the AIT5-class regressions (UTF-8 Malayalam decoded as + * GB18030 returns lots of single-Han-char runs). + */ + @Test + void singleCjkCharDoesNotNaNPoisonScore() { + // Latin sentence with a stray CJK char dropped in — exactly the + // shape of a GB18030-mojibake-of-UTF-8 decode at the run-boundary + // level. The CJK char forms a single-codepoint HAN run. + String text = "The quick brown 中 fox jumps over the lazy dog. " + + "Pack 中 my box with five dozen liquor jugs."; + TextQualityScore score = detector.score(text); + assertFalse(score.isUnknown(), + "score should not be UNKNOWN — single-CJK run should be skipped, " + + "not poison the aggregate. 
Got: " + score); + } + + /** + * Sibling regression: the same NaN-poisoning case caused by a single + * supplementary-plane (4-byte UTF-8, 2-UTF-16-unit) codepoint. Less + * load-bearing than the BMP-CJK case — supplementary chars decode to + * {@code text.length() == 2} so they pass the inner filter — but + * worth pinning the behaviour. + */ + @Test + void supplementaryPlaneCharSurvivesScoring() { + // U+1F600 (😀) is a 2-UTF-16-unit supplementary char with script COMMON, + // so it attaches to a preceding modeled run rather than forming its own. + String text = "Hello world 😀 this is some plain English text."; + TextQualityScore score = detector.score(text); + assertFalse(score.isUnknown(), "supplementary char should not break scoring: " + score); + } + // ----------------------------------------------------------------------- /** diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java new file mode 100644 index 00000000000..b846064c52f --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkDetectorV7Test.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.TreeMap; +import java.util.zip.GZIPOutputStream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.ml.junkdetect.tools.JunkDetectorTrainingConfig; +import org.apache.tika.ml.junkdetect.tools.TrainJunkModel; +import org.apache.tika.quality.TextQualityScore; + +/** + * Validates the v7 model file format end-to-end: a synthetic small model is + * constructed in-memory with known table values, saved via + * {@link TrainJunkModel#saveModelV7}, loaded via {@link JunkDetector#load}, + * scored against known input, and the output verified against hand-computed + * expected values. + * + *

This is the architectural-decision validation: it confirms that the v7 + * file format spec, the trainer's save path, the loader, and the scoring + * path (per-script open-addressing codepoint-bigram + unigram backoff) all + * agree on the semantics. Does not require the production training corpus. + */ +public class JunkDetectorV7Test { + + @Test + void v7RoundTripSeenPairAndUnigramBackoff(@TempDir Path tmp) throws IOException { + // ----------------------------------------------------------------- + // Build a tiny synthetic v7 model for LATIN. + // + // codepointIndex = ['A', 'B'] (indices 0, 1) + // Pair (A, B) stored with log-prob -1.0 + // (B, A) is *not* in the bigram table — falls back to unigram. + // Unigram log-prob = -2.0 for both 'A' and 'B'. + // backoffAlpha = 1.0 → backoff sum = -4.0 + // + // Expected mean log-prob over "ABAB": + // (A,B) seen: -1.0 + // (B,A) backoff: 1.0 * (-2 + -2) = -4.0 + // (A,B) seen: -1.0 + // mean = -2.0 + // f1Cal mu=-5, sigma=1 → z1 = (-2 - -5) / 1 = +3.0 + // Classifier w1=1, rest 0, bias=0 → logit = +3.0 + // ----------------------------------------------------------------- + V7Tables tables = buildLatinTablesAB(); + + Path modelFile = tmp.resolve("v7-test.bin"); + saveMinimalV7Model(tables, modelFile); + + // Verify the file roundtrips through the loader. + JunkDetector detector = JunkDetector.loadFromPath(modelFile); + assertEquals(7, detector.getModelVersion(), "Loaded model should be v7"); + + TextQualityScore score = detector.score("ABAB"); + assertEquals("LATIN", score.getDominantScript(), "Dominant script should be LATIN"); + // Quantization of [-4, -1] to 8 bits introduces ~0.012 nat / level. + // Net z-error over 3 pairs bounded ~0.05; allow 0.3 to be safe. 
+ assertEquals(3.0f, score.getZScore(), 0.3f, + "Expected z ≈ +3.0 for 'ABAB' (seen-pair + backoff mix)"); + } + + @Test + void v7RoundTripAllSeenPairsScoreHigher(@TempDir Path tmp) throws IOException { + // Same shape as the first test but with BOTH (A,B) and (B,A) in the + // bigram table. mean log-prob = -1.0, z1 = +4.0, logit = +4.0. + int[] cpIndex = new int[]{'A', 'B'}; + int[] keys = new int[4]; + Arrays.fill(keys, V7Tables.EMPTY_KEY); + byte[] values = new byte[4]; + float bMin = -10.0f; + float bMax = -1.0f; + byte b = quantizeOne(-1.0f, bMin, bMax); + insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); + insertOA(keys, values, JunkDetector.packBigramKey(1, 0), b); + + float uMin = -5.0f; + float uMax = -2.0f; + byte[] unigramBytes = new byte[]{ + quantizeOne(-2.0f, uMin, uMax), + quantizeOne(-2.0f, uMin, uMax), + }; + + V7Tables tables = new V7Tables(cpIndex, keys, values, unigramBytes, + bMin, bMax, uMin, uMax, + -10.0f, 1.0f); + + Path modelFile = tmp.resolve("v7-test-allseen.bin"); + saveMinimalV7Model(tables, modelFile); + JunkDetector detector = JunkDetector.loadFromPath(modelFile); + + TextQualityScore score = detector.score("ABAB"); + // mean = -1.0, z1 = (-1 - -5) / 1 = +4.0 + assertEquals(4.0f, score.getZScore(), 0.3f, + "All-seen 'ABAB' should score z ≈ +4"); + } + + /** + * End-to-end trainer integration: drives {@link + * TrainJunkModel#trainV7TablesForScript} on a tiny synthetic corpus, + * calibrates F1, saves a model, loads it, and scores text. Catches + * drift between trainer F1 math and inference F1 math — the FNV + * mix-hash, packed-key layout, and codepoint-pair iteration order all + * have to agree exactly, or scoring produces nonsense. + * + *

F2/F3/F4 are zeroed out (placeholder data) — the test isolates + * F1's trainer↔inference round-trip. + */ + @Test + void trainerRoundTripIntegration(@TempDir Path tmp) throws IOException { + // --- 1. Build a tiny LATIN corpus on disk --- + Path trainFile = tmp.resolve("LATIN.train.gz"); + writeGzippedLines(trainFile, + "the quick brown fox jumps over the lazy dog", + "pack my box with five dozen liquor jugs", + "how vexingly quick daft zebras jump", + "the five boxing wizards jump quickly", + "sphinx of black quartz judge my vow"); + Path devFile = tmp.resolve("LATIN.dev.gz"); + writeGzippedLines(devFile, + "the rain in spain falls mainly on the plain", + "a stitch in time saves nine", + "all that glitters is not gold"); + + // --- 2. Phase 1: train V7 F1 tables for this script --- + // Tiny corpus → min_count=1 so all pairs survive. + V7Tables tables = TrainJunkModel.trainV7TablesForScript(trainFile, + 1, JunkDetectorTrainingConfig.OA_LOAD_FACTOR, + JunkDetectorTrainingConfig.KEY_INDEX_BITS); + + // Sanity: 'h' should be in the codepoint index (appears in "the"). + assertTrue(Arrays.binarySearch(tables.codepointIndex, (int) 'h') >= 0, + "'h' should be in codepoint index — it appears in training"); + assertTrue(Arrays.binarySearch(tables.codepointIndex, (int) 'x') >= 0, + "'x' should be in codepoint index — appears in 'box', 'fox'"); + + // The pair (t, h) is in training; the OA lookup should find it. + int idxT = Arrays.binarySearch(tables.codepointIndex, (int) 't'); + int idxH = Arrays.binarySearch(tables.codepointIndex, (int) 'h'); + assertTrue(idxT >= 0 && idxH >= 0); + int slot = JunkDetector.lookupBigramSlot(tables, idxT, idxH); + assertTrue(slot >= 0, "OA lookup should find seen pair (t, h)"); + + // --- 3. 
F1 raw scoring sanity --- + double meanLogP = JunkDetector.computeF1MeanLogP("the quick brown fox", tables); + assertTrue(Double.isFinite(meanLogP), + "Mean log-prob on training text should be finite, got " + meanLogP); + assertTrue(meanLogP > -15 && meanLogP < 0, + "Score on training text should be sensible, got " + meanLogP); + + // --- 4. Phase 1.5: F1 calibration on dev --- + float[] f1CalLatin = TrainJunkModel.calibrateF1PerScript(devFile, tables); + assertTrue(Float.isFinite(f1CalLatin[0]), "mu1 should be finite"); + assertTrue(Float.isFinite(f1CalLatin[1]) && f1CalLatin[1] > 0, + "sigma1 should be positive finite"); + + // --- 5. Assemble + save a minimal v7 model --- + int blockN = UnicodeBlockRanges.bucketCount(); + TreeMap f1Tables = new TreeMap<>(); + f1Tables.put("LATIN", tables); + TreeMap blockTables = new TreeMap<>(); + blockTables.put("LATIN", new float[blockN * blockN]); + TreeMap blockCal = new TreeMap<>(); + blockCal.put("LATIN", new float[]{0f, 1f}); + TreeMap controlCal = new TreeMap<>(); + controlCal.put("LATIN", new float[]{0f, 1f}); + TreeMap f1CalMap = new TreeMap<>(); + f1CalMap.put("LATIN", f1CalLatin); + TreeMap classifierWeights = new TreeMap<>(); + classifierWeights.put("LATIN", new float[]{1f, 0f, 0f, 0f, 0f}); + + List scriptBuckets = List.of("LATIN", "OTHER"); + float[] scriptTransTable = new float[scriptBuckets.size() * scriptBuckets.size()]; + float[] scriptTransCal = new float[]{0f, 1f}; + + Path modelPath = tmp.resolve("junkdetect.bin"); + TrainJunkModel.saveModelV7( + f1Tables, f1CalMap, blockTables, blockCal, controlCal, + classifierWeights, scriptBuckets, scriptTransTable, + scriptTransCal, modelPath); + + // --- 6. 
Load via JunkDetector and score --- + JunkDetector detector = JunkDetector.loadFromPath(modelPath); + assertEquals(7, detector.getModelVersion(), + "Loaded model should be v7"); + assertTrue(detector.knownScripts().contains("LATIN"), + "Loaded model should know LATIN"); + + TextQualityScore score = detector.score("the quick brown fox jumps"); + assertEquals("LATIN", score.getDominantScript()); + assertTrue(Float.isFinite(score.getZScore()), + "Score on in-distribution text should be finite, got " + score); + + // --- 7. Train/infer consistency check --- + // The inference path should compute the same raw F1 score as + // JunkDetector.computeF1MeanLogP on the same text — if these + // two ever disagree, the model's calibration is silently wrong. + String probe = "pack my box with five dozen liquor jugs"; + double trainerRawMean = JunkDetector.computeF1MeanLogP(probe, tables); + float expectedZ1 = (float) ((trainerRawMean - f1CalLatin[0]) / f1CalLatin[1]); + TextQualityScore probeScore = detector.score(probe); + // logit = w1 * z1 + 0 + 0 + 0 + 0 = z1 in this test configuration. + assertEquals(expectedZ1, probeScore.getZScore(), 0.001f, + "Inference z1 must match trainer-computed z1 " + + "(train/infer F1 math drift)"); + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + /** + * Builds a V7Tables with codepoint index ['A', 'B'], where (A,B) has a + * stored log-prob of -1.0 but (B,A) is absent (forces unigram backoff). + * Unigram log-prob = -2.0 for both A and B. + * + *

Bigram quant range is set explicitly to {@code [-10, -1]} so that + * the single stored value at -1.0 maps to byte 255 (avoids the + * degenerate {@code min == max} branch in + * {@link TrainJunkModel#quantizeFloats}). Same idea for the unigram + * range {@code [-5, -2]} so the (-2.0, -2.0) values map to byte 255. + */ + private static V7Tables buildLatinTablesAB() { + int[] cpIndex = new int[]{'A', 'B'}; + + // 4 slots ≈ 25% load for 1 pair. Open-addressing with linear probe. + int[] keys = new int[4]; + Arrays.fill(keys, V7Tables.EMPTY_KEY); + byte[] values = new byte[4]; + + // Manual quantization with a chosen range so we don't hit the + // degenerate single-element case. range=[-10, -1] → -1.0 → byte 255. + float bMin = -10.0f; + float bMax = -1.0f; + byte b = quantizeOne(-1.0f, bMin, bMax); + insertOA(keys, values, JunkDetector.packBigramKey(0, 1), b); + + float uMin = -5.0f; + float uMax = -2.0f; + byte[] unigramBytes = new byte[]{ + quantizeOne(-2.0f, uMin, uMax), + quantizeOne(-2.0f, uMin, uMax), + }; + + return new V7Tables(cpIndex, keys, values, unigramBytes, + bMin, bMax, + uMin, uMax, + -10.0f, 1.0f); + } + + /** Quantize a single float to 8-bit unsigned using the explicit range. */ + private static byte quantizeOne(float v, float min, float max) { + float range = max - min; + int q = Math.round(((v - min) / range) * 255.0f); + if (q < 0) q = 0; + else if (q > 255) q = 255; + return (byte) q; + } + + /** + * Replica of {@code TrainJunkModel.insertOA} (package-private) for the + * test's hand-constructed tables. Uses the same mix-hash as the + * production code path. 
+ */ + private static void insertOA(int[] keys, byte[] values, int packedKey, byte value) { + int mask = keys.length - 1; + int h = JunkDetector.mixIndexKey(packedKey) & mask; + while (keys[h] != V7Tables.EMPTY_KEY) { + if (keys[h] == packedKey) { + values[h] = value; + return; + } + h = (h + 1) & mask; + } + keys[h] = packedKey; + values[h] = value; + } + + /** + * Saves a minimal v7 model containing only LATIN, with F2/F3/F4 zeroed + * out and pure-F1 classifier weights (w1=1, rest 0, bias 0). Scoring + * a window thus reduces to z1 directly. F1 calibration: mu=-5, sigma=1. + */ + private static void saveMinimalV7Model(V7Tables tables, Path modelFile) throws IOException { + TreeMap f1Tables = new TreeMap<>(); + f1Tables.put("LATIN", tables); + + TreeMap f1Cal = new TreeMap<>(); + f1Cal.put("LATIN", new float[]{-5.0f, 1.0f}); + + int blockN = UnicodeBlockRanges.bucketCount(); + + TreeMap blockTables = new TreeMap<>(); + blockTables.put("LATIN", new float[blockN * blockN]); + TreeMap blockCal = new TreeMap<>(); + blockCal.put("LATIN", new float[]{0f, 1f}); + + TreeMap controlCal = new TreeMap<>(); + controlCal.put("LATIN", new float[]{0f, 1f}); + + List scriptBuckets = List.of("LATIN", "OTHER"); + float[] scriptTransTable = new float[scriptBuckets.size() * scriptBuckets.size()]; + float[] scriptTransCal = new float[]{0f, 1f}; + + TreeMap classifierWeights = new TreeMap<>(); + classifierWeights.put("LATIN", new float[]{1.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + TrainJunkModel.saveModelV7( + f1Tables, f1Cal, blockTables, blockCal, controlCal, + classifierWeights, scriptBuckets, scriptTransTable, + scriptTransCal, modelFile); + } + + private static void writeGzippedLines(Path path, String... 
lines) throws IOException { + try (BufferedWriter w = new BufferedWriter(new OutputStreamWriter( + new GZIPOutputStream(Files.newOutputStream(path)), + StandardCharsets.UTF_8))) { + for (String line : lines) { + w.write(line); + w.write('\n'); + } + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java index 0b97a9a0bdb..eac556f139b 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java @@ -161,4 +161,54 @@ public void noopWhenAllDecodingsIdentical() throws Exception { // constructor — ServiceLoader cannot instantiate it. Wiring JunkDetector // up as a proper SPI provider is tracked as follow-up work for TIKA-4720; // at that point this test can be added to exercise the real SPI path. 
+ + @Test + void expandHtmlEntities_numericDecimalResolvesToCodepoint() { + // U+0D4D = Malayalam Sign Virama + assertEquals("്", + JunkFilterEncodingDetector.expandHtmlEntities("്")); + // Surrounding ASCII preserved + assertEquals("a്b", + JunkFilterEncodingDetector.expandHtmlEntities("a്b")); + } + + @Test + void expandHtmlEntities_numericHexResolvesToCodepoint() { + // U+4E2D = 中 (Han ideograph "middle") + assertEquals("中", + JunkFilterEncodingDetector.expandHtmlEntities("中")); + assertEquals("中", + JunkFilterEncodingDetector.expandHtmlEntities("中")); + } + + @Test + void expandHtmlEntities_namedReferences() { + assertEquals("&", JunkFilterEncodingDetector.expandHtmlEntities("&")); + assertEquals("<", JunkFilterEncodingDetector.expandHtmlEntities("<")); + assertEquals(">", JunkFilterEncodingDetector.expandHtmlEntities(">")); + assertEquals("\"", JunkFilterEncodingDetector.expandHtmlEntities(""")); + assertEquals("a & b < c", JunkFilterEncodingDetector.expandHtmlEntities("a & b < c")); + } + + @Test + void expandHtmlEntities_malformedPassesThrough() { + // No semicolon → not matched, left as literal + assertEquals("്", JunkFilterEncodingDetector.expandHtmlEntities("്")); + // Unknown named entity → left as literal + assertEquals("&unknown;", + JunkFilterEncodingDetector.expandHtmlEntities("&unknown;")); + // Out-of-range numeric → left as literal (passes overflow guard) + assertEquals("�", + JunkFilterEncodingDetector.expandHtmlEntities("�")); + } + + @Test + void expandHtmlEntities_mixedEntityAndRawCodepoints() { + // Simulates an AIT5-style document: mix of raw Malayalam codepoints + // and numeric entity references encoding more Malayalam codepoints. 
+ // ത = ത ് = ് (virama) + String input = "ത്ര"; + String expected = "ത്ര"; + assertEquals(expected, JunkFilterEncodingDetector.expandHtmlEntities(input)); + } } diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java new file mode 100644 index 00000000000..e25cff9204e --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/UnicodeBlockRangesTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +/** + * Sanity checks for the static {@link UnicodeBlockRanges} lookup table. + * + *

The table is the single source of truth for F2 block bucketing across + * trainer and inference, so any silent drift here would silently corrupt + * the block-transition feature for the entire model. These tests assert + * a handful of known-codepoint → known-bucket facts plus the table's + * internal invariants (sorted, non-overlapping, contiguous bucket ids). + */ +public class UnicodeBlockRangesTest { + + @Test + void bucketCountIs339() { + // 338 named ranges in the static table + 1 unassigned = 339 total. + // If this ever fails, the static table has changed — check that + // SCHEME_VERSION was bumped and downstream models retrained. + assertEquals(339, UnicodeBlockRanges.bucketCount()); + assertEquals(338, UnicodeBlockRanges.UNASSIGNED); + } + + @Test + void wellKnownCodepointsMapToExpectedBuckets() { + // 'A' (U+0041) → BASIC_LATIN bucket 0 + assertEquals(0, UnicodeBlockRanges.bucketOf('A')); + // 'a' (U+0061) → BASIC_LATIN + assertEquals(0, UnicodeBlockRanges.bucketOf('a')); + // U+00FF (ÿ) → LATIN_1_SUPPLEMENT bucket 1 (last codepoint in range) + assertEquals(1, UnicodeBlockRanges.bucketOf(0x00FF)); + // U+0100 (Ā) → LATIN_EXTENDED_A bucket 2 (first codepoint in next range) + assertEquals(2, UnicodeBlockRanges.bucketOf(0x0100)); + // 中 (U+4E2D) → CJK_UNIFIED_IDEOGRAPHS bucket 120 + assertEquals(120, UnicodeBlockRanges.bucketOf(0x4E2D)); + // 国 (U+56FD) → CJK_UNIFIED_IDEOGRAPHS bucket 120 + assertEquals(120, UnicodeBlockRanges.bucketOf(0x56FD)); + // U+0D24 (ത, Malayalam letter ta) → MALAYALAM bucket 30 + assertEquals(30, UnicodeBlockRanges.bucketOf(0x0D24)); + // Hangul syllables - U+AC00 → bucket 147 + assertEquals(147, UnicodeBlockRanges.bucketOf(0xAC00)); + // Cyrillic А (U+0410) → CYRILLIC bucket 8 + assertEquals(8, UnicodeBlockRanges.bucketOf(0x0410)); + } + + @Test + void codepointsInGapsBetweenBlocksReturnUnassigned() { + // The Unicode standard leaves gaps where no block is assigned. 
+ // Examples (verified by enumeration on JDK 25): + // U+10200 falls between PHAISTOS_DISC (U+101D0..U+101FF) and + // LYCIAN (U+10280..U+1029F). + assertEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x10200)); + // U+0860 changed in Unicode 10 — verify it's in some block (SYRIAC_SUPPLEMENT). + assertNotEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x0860)); + } + + @Test + void codepointsBeyondSupplementaryReturnUnassigned() { + // Negative codepoints, supplementary range edges, and beyond U+10FFFF + // are not valid input but the lookup must not crash; UNASSIGNED is fine. + assertEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(-1)); + // U+10FFFF is the last codepoint and is in SUPPLEMENTARY_PRIVATE_USE_AREA_B. + assertNotEquals(UnicodeBlockRanges.UNASSIGNED, UnicodeBlockRanges.bucketOf(0x10FFFF)); + } + + @Test + void schemeVersionIsBumpedOnAnyTableChange() { + // If the static table is ever modified, SCHEME_VERSION MUST be bumped + // — otherwise loaded models silently re-map to the new bucketing. + // This test enforces awareness: anyone changing the table will see + // this assertion fail and be forced to think about the consequence. + // Update the expected value here and bump SCHEME_VERSION together. + assertEquals(1, UnicodeBlockRanges.SCHEME_VERSION); + } + + @Test + void bucketIdsCoverContiguousRange() { + // Every named block id 0..337 must be reachable. Hits a representative + // codepoint in each range and asserts all 338 ids are produced (plus + // UNASSIGNED for the gaps). 
+ boolean[] seen = new boolean[UnicodeBlockRanges.bucketCount()]; + for (int cp = 0; cp <= 0x10FFFF; cp++) { + int bucket = UnicodeBlockRanges.bucketOf(cp); + assertTrue(bucket >= 0 && bucket < UnicodeBlockRanges.bucketCount(), + "Bucket out of range at cp=U+" + Integer.toHexString(cp) + + ": " + bucket); + seen[bucket] = true; + } + for (int b = 0; b < UnicodeBlockRanges.bucketCount(); b++) { + assertTrue(seen[b], "Bucket id " + b + " is never produced by any codepoint"); + } + } +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java new file mode 100644 index 00000000000..55398307191 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +/** + * Pin-test for {@link JunkDetectorTrainingConfig}. + * + *

The values exercised here are the durable choices that define the + * shipping junk-detector model's identity. This test exists so that any + * change to those values requires updating an assertion in the same + * commit, surfacing the change in code review rather than letting it + * slip silently. + * + *

If you are intentionally tuning a parameter, update both the + * constant and the matching assertion below in the same change. Do not + * "fix" a failing assertion in isolation. + */ +class JunkDetectorTrainingConfigTest { + + @Test + void corpusBuildValues() { + assertEquals(500_000_000L, + JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES); + assertEquals(5_000_000L, + JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES); + assertEquals(0.05, + JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC, 1e-9); + assertEquals(50, + JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE); + assertEquals(0.30, + JunkDetectorTrainingConfig.MAX_PUNC_FRAC, 1e-9); + assertEquals(500, + JunkDetectorTrainingConfig.MIN_DEV_SENTENCES); + assertEquals(2_000, + JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES); + assertEquals(200_000L, + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES); + assertEquals(42, + JunkDetectorTrainingConfig.SEED); + } + + @Test + void droppedScripts() { + Set drop = JunkDetectorTrainingConfig.DROP_SCRIPTS; + assertEquals(Set.of("GOTHIC", "THAANA"), drop); + // Must be immutable: any caller that tries to mutate the set + // should fail loudly rather than corrupting the shared config. + assertThrows(UnsupportedOperationException.class, + () -> drop.add("FAKE")); + } + + @Test + void scriptBudgetOverridesEmpty() { + // v7 hypothesis test (HAN=60MB) ran but gave only marginal gains. + // Override map is intentionally empty pending a more decisive + // experiment. 
+ assertTrue(JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES.isEmpty()); + } + + @Test + void modelTrainValues() { + assertEquals(3, JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT); + assertEquals(0.5, JunkDetectorTrainingConfig.OA_LOAD_FACTOR, 1e-9); + assertEquals(16, JunkDetectorTrainingConfig.KEY_INDEX_BITS); + assertTrue(JunkDetectorTrainingConfig.KEY_INDEX_BITS <= 16, + "KEY_INDEX_BITS must be <= 16 to fit packed key in an int"); + } + + @Test + void notInstantiable() { + // The class is a frozen configuration container; making it + // instantiable would invite per-call mutation. + java.lang.reflect.Constructor[] ctors = + JunkDetectorTrainingConfig.class.getDeclaredConstructors(); + assertEquals(1, ctors.length, "expected exactly one constructor"); + assertFalse(java.lang.reflect.Modifier.isPublic(ctors[0].getModifiers()), + "constructor should not be public"); + } +}