In [None]:
import os
import io
import csv
import json
from collections import Counter, defaultdict
from pathlib import Path

import pandas as pd

# ---------- CONFIG ----------
# Dataset root; main() scans it for `application_*` directories.
ROOT = Path("/kaggle/input/log-data-for-anomaly-detection/Hadoop_log/Hadoop_log")
# Label listing consumed by parse_labels().
LABEL_FILE = ROOT / "abnormal_label.txt"
# Destination for every artifact main() writes (CSVs, JSONL samples, report).
OUT_DIR = Path("/kaggle/working")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # exist_ok=True: re-running the cell is harmless

# Sampling
SAMPLES_PER_LABEL = 15  # upper bound on sampled lines written per label
SAMPLES_PER_APP = 10    # upper bound on sampled lines written per application
RANDOM_SEED = 42        # default seed of reservoir_sample()


In [None]:

def parse_labels(label_file: Path) -> dict:
    """
    Read the label listing and map each application id to its section label.

    Expected layout (one entry per line):
        abnormal:
        +application_123
        +application_456
        normal:
        +application_789

    A line ending in ':' opens a label section; a line starting with '+'
    assigns that application to the section currently open. Blank lines,
    lines starting with '#', and '+' entries that appear before any section
    header are skipped.

    Returns a dict such as {'application_123': 'abnormal', ...}. When the file
    does not exist a warning is printed and an empty dict is returned.
    """
    mapping = {}
    if not label_file.exists():
        print(f"[WARN] Label file not found: {label_file}")
        return mapping

    section = None
    with label_file.open("r", encoding="utf-8", errors="ignore") as handle:
        for text in map(str.strip, handle):
            if text == "" or text[0] == "#":
                continue
            if text[-1] == ":":
                # Section header: everything before the trailing colon is the label.
                section = text[:-1].strip()
            elif text[0] == "+" and section is not None:
                name = text[1:].strip()
                if name:
                    mapping[name] = section
    return mapping


def safe_count_lines(file_path: Path) -> int:
    """
    Return the number of lines in ``file_path``.

    The file is iterated lazily, so it is never held in memory as a whole;
    bytes that are not valid UTF-8 are dropped instead of raising.
    """
    with file_path.open("r", encoding="utf-8", errors="ignore") as handle:
        return sum(1 for _line in handle)


def stream_lines(file_path: Path, max_lines=None):
    """
    Yield the non-empty lines of a text file, decoded leniently.

    Parameters
    ----------
    file_path : Path
        File to read (UTF-8, undecodable bytes are ignored).
    max_lines : int or None
        Upper bound on the number of physical lines READ from the file.
        Blank lines count toward the limit even though they are not yielded.
        ``None`` reads the whole file; a value <= 0 yields nothing.

    Yields
    ------
    str
        Each non-empty line with its trailing newline removed.
    """
    # Check the limit before touching the file so that max_lines=0 really
    # means "no lines" instead of leaking the first line.
    if max_lines is not None and max_lines <= 0:
        return

    with file_path.open("r", encoding="utf-8", errors="ignore") as f:
        for n, line in enumerate(f, start=1):
            line = line.rstrip("\n")
            if line:
                yield line
            if max_lines is not None and n >= max_lines:
                break


def reservoir_sample(iterable, k, seed=RANDOM_SEED):
    """
    Draw up to ``k`` items from ``iterable`` in a single pass (reservoir sampling).

    Parameters
    ----------
    iterable : any iterable; it is consumed once and never materialised.
    k : int
        Reservoir size. If the input has fewer than ``k`` items, all of them
        are returned in input order. ``k <= 0`` returns an empty list.
    seed : hashable or None
        Seed for the private random generator used for replacement decisions.

    Returns
    -------
    list
        The sampled items.
    """
    import random

    # Use a private generator instead of random.seed(): the draws are the same
    # for a given seed, but the process-wide `random` state is left untouched,
    # so calling this helper cannot perturb other seeded code.
    rng = random.Random(seed)

    sample = []
    for i, item in enumerate(iterable, start=1):
        if i <= k:
            sample.append(item)
        else:
            # Item i replaces a reservoir slot with probability k / i.
            j = rng.randint(1, i)
            if j <= k:
                sample[j - 1] = item
    return sample


In [None]:


def main():
    """
    Inventory the Hadoop log dataset and write summary artifacts to OUT_DIR.

    Steps:
      1. Parse the application -> label mapping from LABEL_FILE.
      2. Walk every ``application_*`` directory under ROOT and record the size
         and line count of each ``*.log`` file.
      3. Write ``file_inventory.csv`` (one row per log file) and
         ``app_stats.csv`` (one row per application with at least one log file).
      4. Write ``label_counts.csv`` (applications and total lines per label).
      5. Write ``sample_logs.jsonl`` with per-label and per-application samples.
      6. Write ``report.md`` and print its first lines plus the artifact paths.

    An inventory with zero log files produces empty (header-only) CSVs rather
    than failing while sorting a column-less DataFrame.

    Raises
    ------
    FileNotFoundError
        If ROOT does not exist.
    """
    # 1) Parse labels
    app_to_label = parse_labels(LABEL_FILE)

    # 2) Inventory: list application_* dirs and their *.log files
    file_rows = []   # one dict per log file -> file_inventory.csv
    app_stats = defaultdict(lambda: {"num_files": 0, "total_lines": 0, "label": "unknown"})

    if not ROOT.exists():
        raise FileNotFoundError(f"Dataset root not found: {ROOT}")

    for entry in sorted(ROOT.iterdir()):
        if not (entry.is_dir() and entry.name.startswith("application_")):
            continue
        app_name = entry.name
        label = app_to_label.get(app_name, "unknown")

        log_files = sorted(p for p in entry.iterdir() if p.is_file() and p.suffix == ".log")
        for lf in log_files:
            size_bytes = lf.stat().st_size
            line_count = safe_count_lines(lf)

            file_rows.append({
                "application": app_name,
                "label": label,
                "file_path": str(lf),
                "file_name": lf.name,
                "size_bytes": size_bytes,
                "line_count": line_count,
            })

            # Accumulate per-application totals.
            stats = app_stats[app_name]
            stats["num_files"] += 1
            stats["total_lines"] += line_count
            stats["label"] = label

    # 3) Convert to DataFrames and persist.
    #    Columns are named explicitly so that an empty inventory still yields
    #    frames with the expected schema (sort_values would otherwise KeyError).
    inv_columns = ["application", "label", "file_path", "file_name", "size_bytes", "line_count"]
    inv_df = pd.DataFrame(file_rows, columns=inv_columns).sort_values(["application", "file_name"])
    inv_path = OUT_DIR / "file_inventory.csv"
    inv_df.to_csv(inv_path, index=False)

    app_columns = ["application", "num_files", "total_lines", "label"]
    app_df = (
        pd.DataFrame(
            [{"application": app, **vals} for app, vals in app_stats.items()],
            columns=app_columns,
        )
        # No-op for populated frames; gives an empty frame numeric dtypes so
        # the groupby-sum and nlargest calls below keep working.
        .astype({"num_files": "int64", "total_lines": "int64"})
        .sort_values(["label", "total_lines"], ascending=[True, False])
    )
    app_path = OUT_DIR / "app_stats.csv"
    app_df.to_csv(app_path, index=False)

    # 4) Label distribution (applications and lines)
    label_counts_apps = app_df["label"].value_counts().rename_axis("label").reset_index(name="num_apps")
    lines_per_label = app_df.groupby("label")["total_lines"].sum().reset_index(name="total_lines")
    label_summary = pd.merge(label_counts_apps, lines_per_label, on="label", how="outer").fillna(0)
    label_path = OUT_DIR / "label_counts.csv"
    label_summary.to_csv(label_path, index=False)

    # 5) Sampling: a) per-label sample, b) per-app sample.
    #    Each JSONL record looks like one of:
    #    {"kind": "per_label", "label": ..., "application": ..., "file": ..., "line": "..."}
    #    {"kind": "per_app", "application": ..., "label": ..., "file": ..., "line": "..."}
    samples_path = OUT_DIR / "sample_logs.jsonl"
    with samples_path.open("w", encoding="utf-8") as jout:
        # a) Per-label samples: take a small sample from every file of the
        #    label, then re-sample the pooled candidates down to SAMPLES_PER_LABEL.
        for label in sorted(label_summary["label"].unique()):
            per_file_samples = []
            for _, row in inv_df[inv_df["label"] == label].iterrows():
                lines = reservoir_sample(stream_lines(Path(row["file_path"])), k=3)  # small per-file sample
                for ln in lines:
                    per_file_samples.append({
                        "kind": "per_label",
                        "label": label,
                        "application": row["application"],
                        "file": row["file_name"],
                        "line": ln
                    })
            # reservoir_sample returns the whole pool when it holds fewer than
            # k items, so no extra min/max arithmetic on k is required.
            selected = reservoir_sample(per_file_samples, k=SAMPLES_PER_LABEL)
            for item in selected:
                jout.write(json.dumps(item, ensure_ascii=False) + "\n")

        # b) Per-app samples: same two-stage scheme, scoped to one application.
        for app_name in app_df["application"]:
            label = app_stats[app_name]["label"]
            per_app_lines = []
            rows = inv_df[inv_df["application"] == app_name]
            for _, row in rows.iterrows():
                lines = reservoir_sample(stream_lines(Path(row["file_path"])), k=2)  # tiny per-file for spread
                for ln in lines:
                    per_app_lines.append({
                        "kind": "per_app",
                        "application": app_name,
                        "label": label,
                        "file": row["file_name"],
                        "line": ln
                    })
            selected = reservoir_sample(per_app_lines, k=SAMPLES_PER_APP)
            for item in selected:
                jout.write(json.dumps(item, ensure_ascii=False) + "\n")

    # 6) Quick textual report
    total_apps = len(app_df)
    total_files = len(inv_df)
    total_lines = int(inv_df["line_count"].sum()) if not inv_df.empty else 0
    known_labels = sorted(set(app_to_label.values()))
    unknown_apps = int((app_df["label"] == "unknown").sum())

    report_lines = []
    report_lines.append("# Hadoop Log Dataset – Quick Report")
    report_lines.append("")
    report_lines.append(f"- Root: `{ROOT}`")
    report_lines.append(f"- Applications found: **{total_apps}**")
    report_lines.append(f"- Log files: **{total_files}**")
    report_lines.append(f"- Total log lines (approx.): **{total_lines:,}**")
    report_lines.append(f"- Labels in label file: {known_labels if known_labels else 'None found'}")
    report_lines.append(f"- Applications without a label (`unknown`): **{unknown_apps}**")
    report_lines.append("")
    report_lines.append("## Label Distribution (by applications)")
    report_lines.append(label_summary.to_markdown(index=False))
    report_lines.append("")
    report_lines.append("## Top Applications by Line Count")
    top_apps = app_df.nlargest(10, "total_lines")[["application", "label", "num_files", "total_lines"]]
    report_lines.append(top_apps.to_markdown(index=False))
    report_lines.append("")
    report_lines.append("## Outputs")
    report_lines.append(f"- `file_inventory.csv` – {inv_path}")
    report_lines.append(f"- `app_stats.csv` – {app_path}")
    report_lines.append(f"- `label_counts.csv` – {label_path}")
    report_lines.append(f"- `sample_logs.jsonl` – {samples_path}")

    rpt_path = OUT_DIR / "report.md"
    with rpt_path.open("w", encoding="utf-8") as f:
        f.write("\n".join(report_lines))

    # 7) Print a concise on-screen summary: the first 12 report entries cover
    #    the header, the high-level stats and the label distribution table.
    print("\n".join(report_lines[:12]))
    print("\nWrote full report to:", rpt_path)
    print("Artifacts:")
    print(" -", inv_path)
    print(" -", app_path)
    print(" -", label_path)
    print(" -", samples_path)


if __name__ == "__main__":
    main()


# Hadoop Log Dataset – Quick Report

- Root: `/kaggle/input/log-data-for-anomaly-detection/Hadoop_log/Hadoop_log`
- Applications found: **55**
- Log files: **978**
- Total log lines (approx.): **394,310**
- Labels in label file: ['Disk full', 'Machine down', 'Network disconnection', 'Normal']
- Applications without a label (`unknown`): **0**

## Label Distribution (by applications)
| label                 |   num_apps |   total_lines |
|:----------------------|-----------:|--------------:|
| Disk full             |          9 |         27946 |
| Machine down          |         28 |        102960 |
| Network disconnection |          7 |        237979 |
| Normal                |         11 |         25425 |


Wrote full report to: /kaggle/working/report.md
Artifacts:
 - /kaggle/working/file_inventory.csv
 - /kaggle/working/app_stats.csv
 - /kaggle/working/label_counts.csv
 - /kaggle/working/sample_logs.jsonl


In [None]:
import os
import re
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import random
import joblib

import tensorflow as tf
from tensorflow.keras import layers, callbacks, optimizers, regularizers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score


2025-09-21 20:47:29.781566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758487649.975964      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758487650.031532      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Dataset location, label listing and output directory for the modelling cells.
ROOT = Path("/kaggle/input/log-data-for-anomaly-detection/Hadoop_log/Hadoop_log")
LABEL_FILE = ROOT / "abnormal_label.txt"
OUT_DIR = Path("/kaggle/working")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed every random source used below: NumPy, Python's `random` (used by the
# app shuffle in _collect_one_split) and TensorFlow/Keras.
RANDOM_SEED = 42
TF_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.keras.utils.set_random_seed(TF_SEED)

# Maximum number of lines kept per label in each split
MAX_LINES_PER_LABEL_TRAIN = 30000
MAX_LINES_PER_LABEL_VAL   = 5000
MAX_LINES_PER_LABEL_TEST  = 5000

# Maximum number of lines kept per application in each split, so that no
# single application dominates its label
MAX_LINES_PER_APP_TRAIN   = 4000
MAX_LINES_PER_APP_VAL     = 1500
MAX_LINES_PER_APP_TEST    = 1500

# Budget for low-priority ("INFO-like") lines per app in each split
MAX_INFO_RATIO = 0.30  # low-priority lines per app are capped at MAX_INFO_RATIO * per-app cap (a share of the cap, not of the lines actually kept)

# Text vectorization (character-level)
SEQ_LEN = 512      # output_sequence_length of the TextVectorization layer and model input width
VOCAB = None       # passed as TextVectorization(vocabulary=...); None => vocabulary is learned from the training lines

# Model / training
EMBED_DIM = 64     # character embedding width
DROPOUT = 0.35     # dropout rate before the output layer
BATCH_SIZE = 512
EPOCHS = 12        # upper bound; EarlyStopping may end training sooner
BASE_LR = 1e-3     # initial Adam learning rate; ReduceLROnPlateau may lower it during training

# App-level aggregation: a line's prediction is used only when its top class
# probability is >= CONF_FLOOR (all lines are used if none qualify)
CONF_FLOOR = 0.50

In [None]:

def parse_labels(label_file: Path) -> dict:
    """Map each '+application' entry to the nearest 'label:' header above it.

    Blank lines and lines starting with '#' are ignored, as are '+' entries
    that appear before the first header. The file is opened unconditionally,
    so a missing file raises from ``open``.

    Returns a dict: application -> label.
    """
    result = {}
    active = None
    with label_file.open("r", encoding="utf-8", errors="ignore") as fh:
        stripped_rows = (row.strip() for row in fh)
        for entry in stripped_rows:
            if not entry or entry.startswith("#"):
                continue
            if entry.endswith(":"):
                # Header line: the text before the trailing colon is the label.
                active = entry[:-1].strip()
            elif active is not None and entry.startswith("+"):
                app_id = entry[1:].strip()
                if app_id:
                    result[app_id] = active
    return result

def iter_log_lines(file_path: Path):
    """Yield every non-blank line of the file with surrounding whitespace stripped.

    The file is decoded as UTF-8 and undecodable bytes are ignored.
    """
    with file_path.open("r", encoding="utf-8", errors="ignore") as fh:
        # filter(None, ...) drops the empty strings left by whitespace-only lines.
        yield from filter(None, map(str.strip, fh))

def stratified_app_split(app_to_label, train_size=0.7, val_size=0.15, test_size=0.15, seed=RANDOM_SEED):
    """Split applications into train / val / test sets, stratified by label.

    The split is done in two stages on the sorted application names:
      1. ``train_size`` of the applications go to train, the rest are held out.
      2. The held-out applications are divided into val and test, with test
         receiving ``test_size / (val_size + test_size)`` of them.
    ``val_size`` and ``test_size`` are only used to compute that ratio.
    The two stages use ``seed`` and ``seed + 1`` as random states.

    Returns a tuple of three sets: (train apps, val apps, test apps).
    """
    ordered_apps = sorted(app_to_label)
    app_arr = np.array(ordered_apps)
    label_arr = np.array([app_to_label[name] for name in ordered_apps])

    # Stage 1: train vs. held-out (val + test).
    outer_splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=seed)
    train_pos, holdout_pos = next(outer_splitter.split(app_arr, label_arr))
    holdout_apps = app_arr[holdout_pos]
    holdout_labels = label_arr[holdout_pos]

    # Stage 2: divide the held-out applications into val and test.
    test_share = test_size / (val_size + test_size)
    inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_share, random_state=seed+1)
    val_pos, test_pos = next(inner_splitter.split(holdout_apps, holdout_labels))

    return set(app_arr[train_pos]), set(holdout_apps[val_pos]), set(holdout_apps[test_pos])

# Keyword patterns used to rank lines for sampling. Both are case-insensitive
# and searched anywhere in the line (no word boundaries).
ERR_PAT = re.compile(
    r"(ERROR|FATAL|EXCEPTION|fail(ed)?|refused|timeout|disk|full|network|connect|lost|down)",
    flags=re.IGNORECASE,
)
WARN_PAT = re.compile(r"(WARN|warning|retry|backoff|throttle)", flags=re.IGNORECASE)

def line_priority(line: str) -> float:
    """Score a log line: 3.0 if it matches ERR_PAT, else 1.5 if it matches WARN_PAT, else 1.0.

    Matching is a plain substring search, so a keyword embedded in a longer
    word also counts (e.g. "successfully" contains "full").
    """
    for pattern, score in ((ERR_PAT, 3.0), (WARN_PAT, 1.5)):
        if pattern.search(line):
            return score
    return 1.0  # info/other

def _collect_one_split(apps_subset, app_to_label, per_label_cap, per_app_cap, max_info_ratio):
    """Priority-based line sampler with per-label and per-app caps.

    Parameters
    ----------
    apps_subset : iterable of application names belonging to this split.
    app_to_label : dict mapping application name -> label.
    per_label_cap : int, maximum number of lines kept per label.
    per_app_cap : int, maximum number of lines kept per application.
    max_info_ratio : float, budget for lowest-priority lines (score 1.0).
        At most ``int(max_info_ratio * per_app_cap)`` such lines are kept per
        application; the budget is a share of the cap, not of the number of
        lines actually kept.

    Returns
    -------
    (texts, labels, app_to_indices)
        ``texts`` and ``labels`` are parallel lists; ``app_to_indices`` maps
        an application name to the positions of its lines in ``texts``.

    Notes
    -----
    Applications are visited in a shuffled order drawn from the global
    ``random`` state. The names are sorted first because the iteration order
    of a set of strings can differ between interpreter runs, which would make
    the shuffle (and therefore which applications fill the per-label cap)
    irreproducible even with a fixed seed.
    """
    per_label_counts = Counter()
    texts, labels = [], []
    app_to_indices = defaultdict(list)

    apps_list = sorted(apps_subset)  # deterministic starting order
    random.shuffle(apps_list)

    # Loop-invariant: the low-priority budget depends only on the arguments.
    info_limit = int(max_info_ratio * per_app_cap) if per_app_cap > 0 else 0

    for app in apps_list:
        lab = app_to_label[app]

        # Label quota already exhausted: no line of this app could be kept,
        # so skip reading and scoring its log files altogether.
        if per_label_counts[lab] >= per_label_cap:
            continue

        app_dir = ROOT / app
        if not app_dir.exists():
            continue

        # Gather and score every line of this app, files in sorted order.
        scored = []
        for lf in sorted(p for p in app_dir.iterdir() if p.is_file() and p.suffix == ".log"):
            for ln in iter_log_lines(lf):
                scored.append((line_priority(ln), ln))

        if not scored:
            continue

        # Highest priority first; the sort is stable, so lines with equal
        # scores keep their file/line order.
        scored.sort(key=lambda x: x[0], reverse=True)

        # Fill up to per_app_cap, admitting at most info_limit low-priority lines.
        chosen, info_count = [], 0
        for sc, ln in scored:
            if len(chosen) >= per_app_cap:
                break
            is_info = (sc == 1.0)
            if is_info and info_count >= info_limit:
                continue
            chosen.append(ln)
            if is_info:
                info_count += 1

        # Append to the global buffers until the label quota is reached.
        for ln in chosen:
            if per_label_counts[lab] >= per_label_cap:
                break
            idx = len(texts)
            texts.append(ln)
            labels.append(lab)
            app_to_indices[app].append(idx)
            per_label_counts[lab] += 1

    return texts, labels, app_to_indices

def collect_lines_for_split(apps_subset, app_to_label, per_label_cap, per_app_cap, max_info_ratio=MAX_INFO_RATIO):
    """Collect the sampled lines for one split by delegating to _collect_one_split.

    ``max_info_ratio`` defaults to the notebook-level MAX_INFO_RATIO.
    Returns the (texts, labels, app_to_indices) tuple produced by the sampler.
    """
    return _collect_one_split(
        apps_subset=apps_subset,
        app_to_label=app_to_label,
        per_label_cap=per_label_cap,
        per_app_cap=per_app_cap,
        max_info_ratio=max_info_ratio,
    )


In [None]:

# Load the application -> label mapping and show the label inventory.
app_to_label = parse_labels(LABEL_FILE)
labels_set = sorted(set(app_to_label.values()))  # sorted list of distinct labels (a list, despite the name)
print("Labels:", labels_set)
print("Apps per label:", Counter(app_to_label.values()))

# Split at application level: each application's lines end up in exactly one split.
apps_train, apps_val, apps_test = stratified_app_split(app_to_label)
print(f"Train apps: {len(apps_train)}, Val apps: {len(apps_val)}, Test apps: {len(apps_test)}")

# Sample lines per split. Each call returns the line texts, their string
# labels, and a mapping application -> indices into the text list.
X_train_lines, y_train_labels, appidx_train = collect_lines_for_split(
    apps_train, app_to_label,
    per_label_cap=MAX_LINES_PER_LABEL_TRAIN, per_app_cap=MAX_LINES_PER_APP_TRAIN
)
X_val_lines, y_val_labels, appidx_val = collect_lines_for_split(
    apps_val, app_to_label,
    per_label_cap=MAX_LINES_PER_LABEL_VAL, per_app_cap=MAX_LINES_PER_APP_VAL
)
X_test_lines, y_test_labels, appidx_test = collect_lines_for_split(
    apps_test, app_to_label,
    per_label_cap=MAX_LINES_PER_LABEL_TEST, per_app_cap=MAX_LINES_PER_APP_TEST
)

print("Line counts:",
      "train", len(X_train_lines),
      "val", len(X_val_lines),
      "test", len(X_test_lines))

# Encode string labels as integer class ids
le = LabelEncoder()
le.fit(list(labels_set))  # fit on every label from the label file, not only those sampled into a split
y_train = le.transform(y_train_labels)
y_val   = le.transform(y_val_labels)
y_test  = le.transform(y_test_labels)

num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

# Save the number of lines per label that each split actually contains
pd.Series(Counter(y_train_labels)).to_csv(OUT_DIR / "train_label_counts.csv")
pd.Series(Counter(y_val_labels)).to_csv(OUT_DIR / "val_label_counts.csv")
pd.Series(Counter(y_test_labels)).to_csv(OUT_DIR / "test_label_counts.csv")


Labels: ['Disk full', 'Machine down', 'Network disconnection', 'Normal']
Apps per label: Counter({'Machine down': 28, 'Normal': 11, 'Disk full': 9, 'Network disconnection': 7})
Train apps: 38, Val apps: 8, Test apps: 9
Line counts: train 57981 val 5289 test 5871
Classes: ['Disk full', 'Machine down', 'Network disconnection', 'Normal']


In [None]:

# Character-level tokenizer: no text standardization, one token per character,
# every line padded/truncated to SEQ_LEN integer ids.
vec = layers.TextVectorization(
    standardize=None,
    split="character",
    output_mode="int",
    output_sequence_length=SEQ_LEN,
    vocabulary=VOCAB
)
# Learn the vocabulary from the training lines only when no fixed vocabulary
# was supplied; Keras rejects adapt() on a layer built with a static one.
if VOCAB is None:
    vec.adapt(tf.data.Dataset.from_tensor_slices(np.array(X_train_lines, dtype=object)).batch(2048))
vocab = vec.get_vocabulary()

# Persist the vocabulary, one token per line, in index order.
# NOTE(review): whitespace characters can be tokens here, so a reader of this
# file should not strip lines — confirm against the loading code.
with open(OUT_DIR / "char_vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(tok + "\n" for tok in vocab)
print("Vocab size:", len(vocab))

# -------------------
# tf.data Pipelines
# -------------------
def make_ds(texts, labels, batch_size, shuffle=False):
    """Build a batched, prefetched tf.data.Dataset of (raw text, integer label) pairs.

    texts      -- sequence of strings.
    labels     -- sequence of integer class ids, converted to int64.
    batch_size -- number of examples per batch.
    shuffle    -- when True, shuffle before batching with a buffer of
                  min(len(texts), 100000) elements, seeded with RANDOM_SEED
                  and reshuffled on every iteration.
    """
    text_arr = np.array(texts, dtype=object)
    label_arr = np.array(labels, dtype=np.int64)
    dataset = tf.data.Dataset.from_tensor_slices((text_arr, label_arr))
    if shuffle:
        buffer = min(len(texts), 100000)
        dataset = dataset.shuffle(buffer_size=buffer, seed=RANDOM_SEED, reshuffle_each_iteration=True)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

def vectorize_batch(text, label):
    """Apply the module-level `vec` layer to `text`; `label` is passed through unchanged."""
    token_ids = vec(text)
    return token_ids, label

# make_ds batches the raw strings, so vectorize_batch runs once per batch.
# Only the training pipeline is shuffled; val/test keep the order of their
# line lists.
train_ds = make_ds(X_train_lines, y_train, BATCH_SIZE, shuffle=True).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
val_ds   = make_ds(X_val_lines,   y_val,   BATCH_SIZE, shuffle=False).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
test_ds  = make_ds(X_test_lines,  y_test,  BATCH_SIZE, shuffle=False).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)

I0000 00:00:1758487673.484357      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Vocab size: 89


In [None]:

def build_model(vocab_size, num_classes):
    """Build the (uncompiled) character-level CNN + BiLSTM line classifier.

    Architecture, in order:
      * Embedding(vocab_size, EMBED_DIM) over SEQ_LEN int64 token ids
      * three parallel Conv1D branches (96 filters, kernel sizes 3 / 5 / 7),
        concatenated, then BatchNormalization, MaxPooling1D(2), SpatialDropout1D
      * Bidirectional LSTM(80) returning sequences, then GlobalMaxPooling1D
      * Dense(128, relu), Dropout(DROPOUT), Dense(num_classes, softmax)
    The embedding, convolution, LSTM and hidden dense kernels carry an L2
    penalty of 1e-6.

    Returns a tf.keras.Model mapping (batch, SEQ_LEN) ids to class probabilities.
    """
    def l2_penalty():
        # A fresh regularizer object per layer, as in a hand-written layer list.
        return regularizers.l2(1e-6)

    inputs = layers.Input(shape=(SEQ_LEN,), dtype=tf.int64)
    embedded = layers.Embedding(
        vocab_size, EMBED_DIM, mask_zero=True, embeddings_regularizer=l2_penalty()
    )(inputs)

    # Multi-kernel CNN block: one branch per n-gram width, built in ascending order.
    branches = [
        layers.Conv1D(96, width, padding="same", activation="relu",
                      kernel_regularizer=l2_penalty())(embedded)
        for width in (3, 5, 7)
    ]
    features = layers.Concatenate()(branches)
    features = layers.BatchNormalization()(features)
    features = layers.MaxPooling1D(pool_size=2)(features)
    features = layers.SpatialDropout1D(0.25)(features)

    recurrent_cell = layers.LSTM(80, return_sequences=True, dropout=0.2,
                                 kernel_regularizer=l2_penalty())
    features = layers.Bidirectional(recurrent_cell)(features)
    features = layers.GlobalMaxPooling1D()(features)

    hidden = layers.Dense(128, activation="relu", kernel_regularizer=l2_penalty())(features)
    hidden = layers.Dropout(DROPOUT)(hidden)
    outputs = layers.Dense(num_classes, activation="softmax")(hidden)

    return tf.keras.Model(inputs, outputs)

# Instantiate the network for the vectorizer's vocabulary size and the number
# of label classes, then print its layer summary.
model = build_model(vocab_size=len(vocab), num_classes=num_classes)
model.summary()



In [None]:

# Integer labels + softmax output -> sparse categorical cross-entropy.
opt = optimizers.Adam(learning_rate=BASE_LR)
model.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# All three callbacks watch val_loss: stop after 3 stagnant epochs (restoring
# the best weights), halve the learning rate after 1 stagnant epoch (floor
# 1e-5), and keep the best checkpoint on disk.
cb = [
    callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-5, verbose=1),
    callbacks.ModelCheckpoint(OUT_DIR / "line_charcnn_lstm.keras", save_best_only=True, monitor="val_loss")
]

# -------------------
# Train
# -------------------
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=cb,
    verbose=1
)

# -------------------
# Evaluation: line-level
# -------------------
# test_ds is not shuffled, so row i of y_pred_prob corresponds to y_test[i].
y_pred_prob = model.predict(test_ds, verbose=0)
y_pred = y_pred_prob.argmax(axis=1)

# Pass the full list of class ids explicitly: without it the report and the
# confusion matrix only cover classes present in y_test / y_pred, which no
# longer matches the names in le.classes_ when a class is missing.
class_ids = np.arange(len(le.classes_))

print("\n[Line-level] Accuracy: %.4f  Macro-F1: %.4f" %
      (accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average="macro")))
print("\n[Line-level] Classification Report\n",
      classification_report(y_test, y_pred, labels=class_ids, target_names=list(le.classes_)))
print("\n[Line-level] Confusion Matrix\n",
      pd.DataFrame(confusion_matrix(y_test, y_pred, labels=class_ids),
                   index=[f"true_{c}" for c in le.classes_],
                   columns=[f"pred_{c}" for c in le.classes_]))


Epoch 1/12


I0000 00:00:1758487685.716605      56 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 191ms/step - accuracy: 0.6389 - loss: 0.9130 - val_accuracy: 0.1526 - val_loss: 1.4670 - learning_rate: 0.0010
Epoch 2/12
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 188ms/step - accuracy: 0.9203 - loss: 0.2268 - val_accuracy: 0.1579 - val_loss: 1.3863 - learning_rate: 0.0010
Epoch 3/12
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 182ms/step - accuracy: 0.9484 - loss: 0.1454 - val_accuracy: 0.5224 - val_loss: 1.0564 - learning_rate: 0.0010
Epoch 4/12
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 182ms/step - accuracy: 0.9549 - loss: 0.1286 - val_accuracy: 0.6563 - val_loss: 0.7988 - learning_rate: 0.0010
Epoch 5/12
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - accuracy: 0.9588 - loss: 0.1199
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [None]:

# Invert appidx_test: for every test-line index, the application it came from.
idx_to_app = [None] * len(X_test_lines)
for app, indices in appidx_test.items():
    for idx in indices:
        if 0 <= idx < len(idx_to_app):
            idx_to_app[idx] = app

app_true = {}                   # application -> true class id
app_probs = defaultdict(list)   # application -> probability vectors of its confident lines

for i, app in enumerate(idx_to_app):
    if app is None:
        continue
    # Every line of an application carries the same label, so the first one suffices.
    app_true.setdefault(app, y_test[i])
    # keep only reasonably confident lines (tune CONF_FLOOR 0.4–0.6)
    if float(np.max(y_pred_prob[i])) >= CONF_FLOOR:
        app_probs[app].append(y_pred_prob[i])

# Application prediction = argmax of the mean probability vector of its lines.
app_level_true, app_level_pred = [], []
for app in app_true.keys():
    probs = app_probs.get(app, None)
    if not probs:  # if all lines filtered, fallback to all lines for that app
        probs = [y_pred_prob[i] for i, a in enumerate(idx_to_app) if a == app]
    mean_prob = np.mean(probs, axis=0)
    app_level_pred.append(int(np.argmax(mean_prob)))
    app_level_true.append(int(app_true[app]))

# The test split holds only a handful of applications, so a class can easily
# be absent. Listing all class ids explicitly keeps the report and the
# confusion matrix aligned with the names in le.classes_ in that case.
class_ids = np.arange(len(le.classes_))

print("\n[App-level from mean prob] Accuracy: %.4f  Macro-F1: %.4f" %
      (accuracy_score(app_level_true, app_level_pred),
       f1_score(app_level_true, app_level_pred, average="macro")))
print("\n[App-level] Classification Report\n",
      classification_report(app_level_true, app_level_pred,
                            labels=class_ids, target_names=list(le.classes_)))
print("\n[App-level] Confusion Matrix\n",
      pd.DataFrame(confusion_matrix(app_level_true, app_level_pred, labels=class_ids),
                   index=[f"true_{c}" for c in le.classes_],
                   columns=[f"pred_{c}" for c in le.classes_]))


[App-level from mean prob] Accuracy: 1.0000  Macro-F1: 1.0000

[App-level] Classification Report
                        precision    recall  f1-score   support

            Disk full       1.00      1.00      1.00         2
         Machine down       1.00      1.00      1.00         5
Network disconnection       1.00      1.00      1.00         1
               Normal       1.00      1.00      1.00         1

             accuracy                           1.00         9
            macro avg       1.00      1.00      1.00         9
         weighted avg       1.00      1.00      1.00         9


[App-level] Confusion Matrix
                             pred_Disk full  pred_Machine down  \
true_Disk full                           2                  0   
true_Machine down                        0                  5   
true_Network disconnection               0                  0   
true_Normal                              0                  0   

                            pred_Netw

In [None]:
# Final model weights (the best-val_loss checkpoint is written during training).
model.save(OUT_DIR / "line_charcnn_lstm_final.keras")

# Persist the vectorizer. The adapted vocabulary is stored explicitly:
# NOTE(review): depending on the Keras version, get_config() / get_weights()
# of an adapted TextVectorization layer may not carry the learned vocabulary,
# so config + weights alone might not be enough to rebuild it — confirm.
# The extra "vocabulary" key is additive; existing readers of "config" and
# "weights" are unaffected.
vec_config = vec.get_config()
vec_weights = vec.get_weights()
joblib.dump(
    {"config": vec_config, "weights": vec_weights, "vocabulary": vec.get_vocabulary()},
    OUT_DIR / "textvectorization_char.pkl",
)

# Fitted LabelEncoder: maps class ids back to label names.
joblib.dump(le, OUT_DIR / "label_encoder_line.joblib")

print("\nSaved artifacts to /kaggle/working/:")
print(" - line_charcnn_lstm.keras (best checkpoint)")
print(" - line_charcnn_lstm_final.keras (final)")
print(" - textvectorization_char.pkl")
print(" - label_encoder_line.joblib")
print(" - train_label_counts.csv / val_label_counts.csv / test_label_counts.csv")



Saved artifacts to /kaggle/working/:
 - line_charcnn_lstm.keras (best checkpoint)
 - line_charcnn_lstm_final.keras (final)
 - textvectorization_char.pkl
 - label_encoder_line.joblib
 - train_label_counts.csv / val_label_counts.csv / test_label_counts.csv
