In [1]:
import glob
import json
import math
import os
from collections import defaultdict
from datetime import datetime

import numpy as np

In [2]:
# --- Numeric tunables ---------------------------------------------------
# Floor applied to probabilities so that no entry ends up with zero mass.
EPSILON = 1e-6
# Safety margin added on top of the largest normal-sample divergence when
# deriving a threshold.
MARGIN = 0.05

# --- Sample folders -------------------------------------------------------
# Normal (benign) samples.
NORMAL_DIR = "normal"
# NOTE(review): points at the same folder as NORMAL_DIR and is not referenced
# in the visible cells — confirm whether a separate folder was intended.
SHORT_DIR = "normal"

# Attack samples, one folder per attack variant.
ATTACK_DIR = "attack"
ATTACKBODY_DIR = "attackbody"
ATTACKREAD_DIR = "attackread"

# Mixed samples.
MIX_DIR = "mix"

def load_all_probabilities(folder):
    """Load the ``"probabilities"`` entry of every ``*.json`` file in ``folder``.

    Files are visited in sorted path order, so the two returned lists are
    index-aligned and reproducible from run to run.

    Args:
        folder: Directory to search (non-recursively) for ``*.json`` files.

    Returns:
        A ``(filenames, result)`` tuple: the base names of the files that were
        read and, at the same index, each file's ``"probabilities"`` value.
        Both lists are empty when the folder contains no matching file.

    Raises:
        KeyError: If a file has no top-level ``"probabilities"`` entry.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    result = []
    filenames = []
    for path in sorted(glob.glob(os.path.join(folder, "*.json"))):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding, which differs between operating systems.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        result.append(data["probabilities"])
        filenames.append(os.path.basename(path))
    return filenames, result

def to_full_vector(prob_dict, all_keys):
    """Project a probability dict onto ``all_keys`` and renormalise it.

    Args:
        prob_dict: Mapping from key to probability for one sample.
        all_keys: Ordered keys defining the positions of the output vector.

    Returns:
        A NumPy array with one entry per key in ``all_keys``. Keys missing
        from ``prob_dict`` start at 0.0; every entry is then clipped into
        ``[EPSILON, 1.0]`` and the vector is divided by its sum.
    """
    raw_values = [prob_dict.get(key, 0.0) for key in all_keys]
    # Clipping lifts absent / zero entries to EPSILON so none stays at zero.
    clipped = np.clip(np.array(raw_values), EPSILON, 1.0)
    total = clipped.sum()
    return clipped / total

def kl_divergence(p, q, epsilon=1e-6):
    """Compute the KL divergence KL(p || q) of two dict-based distributions.

    Only keys of ``p`` with a strictly positive probability contribute to the
    sum; natural logarithms are used.

    Args:
        p: Mapping from key to probability (the "true" distribution).
        q: Mapping from key to probability (the approximating distribution).
        epsilon: Value substituted for ``q[key]`` when the key is missing from
            ``q`` or its stored probability is not positive.

    Returns:
        The divergence as a float, clamped below at 0.0.
    """
    divergence = 0.0
    for key, p_val in p.items():
        if p_val <= 0:
            continue
        q_val = q.get(key, epsilon)
        # A key that is present in q with probability 0 (or less) used to
        # bypass the epsilon default and divide by zero; floor it as well.
        if q_val <= 0:
            q_val = epsilon
        divergence += p_val * math.log(p_val / q_val)
    return max(divergence, 0.0)

def js_divergence(p, q, epsilon=1e-6):
    """Jensen-Shannon divergence (natural log) between two distributions.

    This is a single definition that accepts both input styles, so neither
    form is shadowed by a later ``def`` of the same name:

    * two dicts mapping key -> probability (keys may differ; a key missing
      from one side counts as probability 0.0), or
    * two array-likes of probabilities aligned position by position.

    Terms whose probability is 0 contribute 0 to the sum (the usual
    ``0 * log 0 = 0`` convention) instead of producing NaN.

    Args:
        p: First distribution (dict or array-like).
        q: Second distribution (same kind as ``p``).
        epsilon: Lower bound applied to the mixture ``m = (p + q) / 2`` for
            dict inputs only. Array inputs are used as given, so results for
            strictly positive arrays are unaffected by this parameter.

    Returns:
        ``0.5 * KL(p || m) + 0.5 * KL(q || m)`` as a NumPy float.
    """
    if isinstance(p, dict) and isinstance(q, dict):
        # Union of keys in first-seen order: deterministic and needs no
        # ordering between keys.
        keys = list(dict.fromkeys([*p, *q]))
        p_vec = np.array([p.get(k, 0.0) for k in keys], dtype=float)
        q_vec = np.array([q.get(k, 0.0) for k in keys], dtype=float)
        m = np.maximum(0.5 * (p_vec + q_vec), epsilon)
    else:
        p_vec = np.asarray(p, dtype=float)
        q_vec = np.asarray(q, dtype=float)
        m = 0.5 * (p_vec + q_vec)

    def _kl_to_mixture(dist):
        # Where dist is not positive the ratio is left at 1, so log(ratio) is 0
        # and the term vanishes; elsewhere this is exactly dist * log(dist / m).
        ratio = np.divide(dist, m, out=np.ones_like(m), where=dist > 0)
        return np.sum(dist * np.log(ratio))

    return 0.5 * (_kl_to_mixture(p_vec) + _kl_to_mixture(q_vec))

In [3]:
# Load the normal (benign) samples.
normal_names, normal_probs = load_all_probabilities(NORMAL_DIR)

# Per state pair: the summed probability and the number of normal samples
# in which that pair appears.
prob_sums = defaultdict(float)
prob_counts = defaultdict(int)
for sample in normal_probs:
    for state_pair, probability in sample.items():
        prob_sums[state_pair] += probability
        prob_counts[state_pair] += 1

# Universe of state pairs seen in any normal sample, in a fixed sorted order
# that defines the vector layout used below.
all_keys_set = set(prob_sums)
all_keys = sorted(all_keys_set)

# NOTE(review): `reference` averages each key only over the samples that
# contain it, whereas `P` averages over all samples (a missing key enters as
# EPSILON before renormalisation). The two baselines can therefore differ and
# `reference` need not sum to 1 — confirm which one consumers expect.
reference = {key: prob_sums[key] / prob_counts[key] for key in all_keys}

# Baseline distribution P: element-wise mean of the normalised sample vectors.
normal_vectors = [to_full_vector(sample, all_keys) for sample in normal_probs]
P = np.mean(normal_vectors, axis=0)

In [4]:
# JS divergence of every normal sample against the baseline distribution P.
normal_js = [
    js_divergence(P, to_full_vector(prob, all_keys))
    for prob in normal_probs
]

print("🔵 JS divergence among normal samples (to baseline):")
for name, js in zip(normal_names, normal_js):
    print(f"  {name}: JS = {js:.6f}")

# Largest divergence among the normal samples (0 when there are none).
max_normal_js = max(normal_js, default=0)
print(f"  Max normal JS  = {max_normal_js:.6f}")

🔵 JS divergence among normal samples (to baseline):
  baseline_short20250706_010457.json: JS = 0.013348
  baseline_short20250706_010733.json: JS = 0.012670
  baseline_short20250706_010948.json: JS = 0.015712
  baseline_short20250706_011131.json: JS = 0.010563
  baseline_short20250706_201505.json: JS = 0.004781
  baseline_short20250706_202218.json: JS = 0.010668
  Max normal JS  = 0.015712


In [5]:
def report_js_group(folder, header, baseline, keys, min_label=None):
    """Score one folder of samples against the baseline and print the results.

    Args:
        folder: Directory passed to ``load_all_probabilities``.
        header: Line printed before the per-sample results.
        baseline: Baseline distribution vector the samples are compared with.
        keys: Ordered keys used to vectorise each sample.
        min_label: When given, also print the group's smallest divergence
            under this label (0 is printed for an empty group).

    Returns:
        ``(names, probs, scores)``: the file names, the raw probability dicts
        and the JS divergence of each sample, all index-aligned.
    """
    names, probs = load_all_probabilities(folder)
    scores = []
    print(header)
    for name, prob in zip(names, probs):
        score = js_divergence(baseline, to_full_vector(prob, keys))
        scores.append(score)
        print(f"  {name}: JS = {score:.6f}")
    if min_label is not None:
        # Guard on the list itself; the previous check looked at the last
        # loop value, which is unrelated to whether the list is empty.
        print(f"  {min_label}  = {min(scores, default=0):.6f}")
    return names, probs, scores


ATTACK_HEADER = "\n🔴 JS divergence from attacks (to baseline):"
MIN_LABEL = "Min attack JS"  # the metric is JS divergence, not KL

# Attack samples.
attack_names, attack_probs, attack_only_js = report_js_group(
    ATTACK_DIR, ATTACK_HEADER, P, all_keys, min_label=MIN_LABEL
)

# Attack samples: attackbody variant.
attackbody_names, attackbody_probs, attackbody_js = report_js_group(
    ATTACKBODY_DIR, ATTACK_HEADER, P, all_keys, min_label=MIN_LABEL
)

# Attack samples: attackread variant.
attackread_names, attackread_probs, attackread_js = report_js_group(
    ATTACKREAD_DIR, ATTACK_HEADER, P, all_keys, min_label=MIN_LABEL
)

# Mixed samples.
mix_names, mix_probs, mix_js = report_js_group(
    MIX_DIR, "\n🟠 JS divergence from mix samples (to baseline):", P, all_keys
)

# Overall minimum across every attack group plus the mixed samples. The list
# used to be reset before each group, so earlier groups never reached the
# final minimum. Mixed samples stay included, as they were before.
attack_js = attack_only_js + attackbody_js + attackread_js + mix_js
min_attack_js = min(attack_js, default=0)
print(f"  {MIN_LABEL}  = {min_attack_js:.6f}")



🔴 JS divergence from attacks (to baseline):
  baseline__probs_sliding20250704_125843.json: JS = 0.179678
  baseline__probs_sliding20250704_131445.json: JS = 0.181870
  baseline__probs_sliding20250704_132535.json: JS = 0.171797
  baseline__probs_sliding20250706_214705.json: JS = 0.204996
  baseline__probs_sliding20250706_232605.json: JS = 0.211804
  baseline__probs_sliding20250706_232722.json: JS = 0.208847

🔴 JS divergence from attacks (to baseline):
  baseline_short20250706_220537.json: JS = 0.204475
  baseline_short20250706_220650.json: JS = 0.187897
  baseline_short20250706_220801.json: JS = 0.204475
  baseline_short20250706_222008.json: JS = 0.208847
  baseline_short20250706_222142.json: JS = 0.204148
  baseline_short20250706_222406.json: JS = 0.204148
  Min attack KL  = 0.187897

🔴 JS divergence from attacks (to baseline):
  baseline_short20250706_224429.json: JS = 0.187897
  baseline_short20250706_224536.json: JS = 0.208847
  baseline_short20250706_224642.json: JS = 0.207656
  b

In [6]:
# Method 1: largest normal-sample JS divergence plus a fixed margin.
margin_threshold = max_normal_js + MARGIN

# Method 2: mean + 3 standard deviations of the normal-sample JS divergences.
# NOTE(review): np.std uses ddof=0 by default — confirm that is intended
# rather than the sample estimate (ddof=1).
mean_js = np.mean(normal_js)
std_js = np.std(normal_js)
stat_threshold = mean_js + 3 * std_js

# Printed as one joined block; the emitted text is the same as printing each
# line separately.
summary_lines = [
    "\n📊 JS",
    f"  Max normal JS  = {max_normal_js:.6f}",
    f"  Min attack JS  = {min_attack_js:.6f}",
    "\n📊 Threshold Comparison:(Based on Mean + 3σ):",
    f"  Method 1 - Max Normal JS + Margin = {margin_threshold:.6f}",
    f"  Method 2 - Mean + 3σ              = {stat_threshold:.6f}",
]
print("\n".join(summary_lines))

# Baseline profile written to disk.
# NOTE(review): "method" mentions the 3σ rule while "used_threshold" stores
# the max+margin value — confirm which threshold consumers should apply.
baseline_data = {
    "probabilities": reference,
    "max_normal_js": max_normal_js,
    "mean_normal_js": mean_js,
    "std_normal_js": std_js,
    "min_attack_js": min_attack_js,
    "threshold_max_margin": margin_threshold,
    "threshold_mean_3sigma": stat_threshold,
    "used_threshold": margin_threshold,  # set to whichever threshold you choose
    "used_threshold2": stat_threshold,  # set to whichever threshold you choose
    "epsilon": EPSILON,
    "method": "JS divergence with 3σ rule",
    "description": "Statistical baseline profile for TCP state transition"
}

# Timestamped file name so earlier profiles are not overwritten.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"baseline_profile_{timestamp}.json"
with open(filename, "w") as out_file:
    json.dump(baseline_data, out_file, indent=4)

print(f"\nBaseline profile saved to {filename}")


📊 JS
  Max normal JS  = 0.015712
  Min attack JS  = 0.107266

📊 Threshold Comparison:(Based on Mean + 3σ):
  Method 1 - Max Normal JS + Margin = 0.065712
  Method 2 - Mean + 3σ              = 0.021462

Baseline profile saved to baseline_profile_20250707_151815.json
