# Rare Pattern Mining

Mine rare feature combinations from the test data and build a rare-pattern graph from them.


In [1]:
# torch loads and saves the graph objects; numpy handles the dense feature matrix.
import torch
import numpy as np
# NOTE(review): pickle is not referenced in the cells visible here — confirm
# nothing else needs it before removing.
import pickle
from pathlib import Path

# Visible marker that the import cell has been executed.
print("Libraries imported")


Libraries imported


## Configuration


In [2]:
# Input and output locations, relative to the notebook's working directory.
DATA_DIR = "data"
OUTPUT_DIR = "rare_patterns"

# A pattern is kept when the number of processes containing it lies between
# MIN_COUNT and MAX_COUNT; the mining cell turns these counts into support
# fractions.
MIN_COUNT = 10
MAX_COUNT = 1000
# Largest itemset size handed to the miner.
MAX_PATTERN_LENGTH = 2

# parents=True lets OUTPUT_DIR be a nested path (e.g. "results/rare_patterns")
# without raising FileNotFoundError; exist_ok=True keeps re-runs idempotent.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print(f"Data: {DATA_DIR}/")
print(f"Output: {OUTPUT_DIR}/")
print(f"Pattern range: {MIN_COUNT}-{MAX_COUNT} processes")


Data: data/
Output: rare_patterns/
Pattern range: 10-1000 processes


## Load Test Data


In [3]:
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, so this file must come from a trusted source.
test_graph_path = Path(DATA_DIR) / "test_graph.pt"
test_graph = torch.load(test_graph_path, weights_only=False)

# Nodes labelled 1 are reported as attack nodes.
n_attack_nodes = (test_graph.y == 1).sum().item()

graph_report = [
    "Test graph:",
    f"  Nodes: {test_graph.num_nodes:,}",
    f"  Edges: {test_graph.num_edges:,}",
    f"  Features: {test_graph.num_node_features}",
    f"  Attack nodes: {n_attack_nodes}",
]
print("\n".join(graph_report))


Test graph:
  Nodes: 81,731
  Edges: 888,913
  Features: 299
  Attack nodes: 25


## Convert Features to Transactions


In [4]:
print("Converting one-hot features to transactions...")

test_features = test_graph.x.cpu().numpy()

# Threshold the whole matrix once, then collect the active column indices of
# each row. This replaces a rows x columns Python loop of scalar lookups with
# one vectorized comparison plus one np.flatnonzero call per row.
active_mask = test_features > 0.5
test_transactions = [
    {f"feat_{j}" for j in np.flatnonzero(row)}
    for row in active_mask
]

# Guard the average so an empty graph reports 0.0 instead of NaN with a
# RuntimeWarning from np.mean([]).
avg_active = (
    np.mean([len(t) for t in test_transactions]) if test_transactions else 0.0
)

print(f"Created {len(test_transactions):,} transactions")
print(f"Total features: {test_features.shape[1]}")
print(f"Avg active features per process: {avg_active:.1f}")


Converting one-hot features to transactions...
Created 81,731 transactions
Total features: 299
Avg active features per process: 6.1


## Mine Rare Patterns


In [5]:
from utils.apriori import mine_rare_patterns

# Convert the configured absolute count bounds into the support fractions
# passed to mine_rare_patterns.
n_total = len(test_transactions)
min_support = MIN_COUNT / n_total
max_support = MAX_COUNT / n_total

print(f"Target: Patterns in {MIN_COUNT}-{MAX_COUNT} processes")
print(f"Support: [{min_support*100:.3f}%, {max_support*100:.2f}%]")
print()

# The second return value is not used in this notebook (presumably the
# association rules mentioned in the miner's log — TODO confirm). It is bound
# to a named throwaway rather than `_`, because assigning `_` shadows IPython's
# last-output variable.
rare_itemsets, _rules = mine_rare_patterns(
    test_transactions,
    min_support=min_support,
    max_support=max_support,
    max_length=MAX_PATTERN_LENGTH,
    min_confidence=0.5,
    verbose=True
)

# A real newline escape: the previous "\\n" printed a literal backslash-n.
print(f"\nFound {len(rare_itemsets):,} rare feature combinations")


Target: Patterns in 10-1000 processes
Support: [0.012%, 1.22%]

Mining itemsets with Apriori algorithm
   Transactions: 81,731
   Support range: [0.00012235259571031799, 0.0122352595710318]
   Max itemset size: 2
   Counting 1-itemsets...


Scanning transactions: 100%|██████████| 81731/81731 [00:00<00:00, 794459.85it/s]


   1-itemsets: 162 frequent (support >= 0.00012235259571031799)


Checking 2-itemsets: 100%|██████████| 81731/81731 [01:48<00:00, 755.64it/s]


   2-itemsets: 2,876 frequent
Total itemsets found: 3,746

Rare itemsets (support in [0.00012235259571031799, 0.0122352595710318]):
   Count: 2,782
   Rarest: 0.0001 support
   Least rare: 0.0122 support

Generating association rules
   Min confidence: 0.5
Generated 2,896 rules
\nFound 2,782 rare feature combinations


## Display Top Patterns


In [6]:
if len(rare_itemsets) > 0:
    print("Top 10 rarest combinations:")
    # Shows the first ten entries; this assumes rare_itemsets is already
    # ordered rarest-first by the miner — TODO confirm.
    for rank, (itemset, support) in enumerate(rare_itemsets[:10], 1):
        # support is a float fraction, so support * n_total can land just
        # below the true integer count (e.g. 9.999...). Round to the nearest
        # integer instead of truncating; int() keeps the result a plain int
        # for the "," format even if support is a NumPy scalar.
        count = int(round(support * n_total))
        features = ", ".join(sorted(itemset))
        print(f"{rank:2d}. [{features}]")
        print(f"    Appears in {count:,} processes ({support*100:.3f}%)")
else:
    print("No patterns found")


Top 10 rarest combinations:
 1. [feat_184]
    Appears in 10 processes (0.012%)
 2. [feat_96]
    Appears in 10 processes (0.012%)
 3. [feat_78]
    Appears in 10 processes (0.012%)
 4. [feat_184, feat_8]
    Appears in 10 processes (0.012%)
 5. [feat_184, feat_3]
    Appears in 10 processes (0.012%)
 6. [feat_13, feat_30]
    Appears in 10 processes (0.012%)
 7. [feat_17, feat_22]
    Appears in 10 processes (0.012%)
 8. [feat_12, feat_192]
    Appears in 10 processes (0.012%)
 9. [feat_17, feat_192]
    Appears in 10 processes (0.012%)
10. [feat_200, feat_43]
    Appears in 10 processes (0.012%)


## Analyze Pattern Distribution


In [7]:
from utils.rare_patterns import analyze_rare_pattern_distribution

# Runs only when mining produced at least one pattern; otherwise `stats` is
# never defined.
if len(rare_itemsets) > 0:
    # Node labels moved to CPU and converted to a NumPy array for the helper.
    y_true = test_graph.y.cpu().numpy()

    stats = analyze_rare_pattern_distribution(test_transactions, rare_itemsets, y_true)



Rare Pattern Distribution Analysis:
   Normal processes:
      Avg rare patterns: 2.29 ± 21.34
      % with rare patterns: 15.7%
   Attack processes:
      Avg rare patterns: 43.60 ± 74.90
      % with rare patterns: 84.0%
   Attacks have MORE rare patterns (good signal!)


## Build Rare Pattern Graph


In [8]:
from utils.rare_patterns import build_rare_graph_fully_connected
from torch_geometric.data import Data

if len(rare_itemsets) > 0:
    print("Building rare pattern graph...")

    # Build the graph from the mined patterns, the per-node transactions and
    # the original node features and labels.
    rare_graph_test = build_rare_graph_fully_connected(
        rare_itemsets,
        test_transactions,
        test_graph.x,
        test_graph.y
    )

    # A real newline escape: the previous "\\n" printed a literal backslash-n.
    print("\nRare graph created:")
    print(f"  Nodes: {rare_graph_test.num_nodes:,}")
    print(f"  Edges: {rare_graph_test.num_edges:,}")
else:
    print("No patterns found, creating empty graph...")

    # Fallback: same node features and labels, with a (2, 0) edge_index,
    # i.e. a graph with no edges, so the later cells still have an object
    # to save and summarize.
    rare_graph_test = Data(
        x=test_graph.x,
        edge_index=torch.zeros((2, 0), dtype=torch.long),
        y=test_graph.y
    )


Building rare pattern graph...
Building rare graph (fully connected mode)
   Processing 2,782 rare patterns...
Rare graph created:
   Nodes: 81,731
   Edges: 9,056,962
\nRare graph created:
  Nodes: 81,731
  Edges: 9,056,962


## Save Results


In [9]:
# Persist the rare-pattern graph under the configured output directory.
rare_graph_path = Path(OUTPUT_DIR) / "rare_graph_test.pt"
torch.save(rare_graph_test, rare_graph_path)

print(
    f"Saved to {OUTPUT_DIR}/:\n"
    f"  rare_graph_test.pt ({rare_graph_test.num_edges:,} edges)"
)


Saved to rare_patterns/:
  rare_graph_test.pt (9,056,962 edges)


## Summary


In [10]:
# Closing recap: number of mined patterns and edge count of the resulting graph.
recap_lines = (
    "Rare pattern mining complete:",
    f"- Mined {len(rare_itemsets):,} rare patterns",
    f"- Built graph with {rare_graph_test.num_edges:,} edges",
)
print(*recap_lines, sep="\n")


Rare pattern mining complete:
- Mined 2,782 rare patterns
- Built graph with 9,056,962 edges
