In [3]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.neighbors import LocalOutlierFactor
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GAT, GCN, GCNConv

# NOTE(review): later cells also use `plt` (matplotlib.pyplot) and `nx` (networkx);
# no import for either is visible in this notebook — confirm and add them here.

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Read the edge-event table; the code below uses its src, dst, timestamp and label columns.
dataset = pd.read_csv('./data/edge_events.csv')

# Distinct endpoint names, taken from the flattened (src, dst) pairs.
nodes = pd.unique(dataset[["src", "dst"]].to_numpy().ravel())

# Two-way lookup between a node name and its integer id.
node2idx = {name: position for position, name in enumerate(nodes)}
idx2node = {position: name for name, position in node2idx.items()}

# Integer ids for both endpoints of every event.
src, dst = (dataset[column].map(node2idx).to_numpy() for column in ("src", "dst"))

# Integer timestamps and one-hot encoded labels, one row per event.
t = dataset["timestamp"].astype(int).to_numpy()
msg = pd.get_dummies(dataset["label"]).to_numpy()

In [None]:
# Load graph data
# NOTE(review): pd.read_csv returns a DataFrame, so `df.adj`, `df.x` and `df.y` only
# resolve if the CSV has columns with exactly those names, and a pandas Series has no
# `.toarray()` method. This looks written for a different container (e.g. a sparse
# matrix) — TODO confirm the real format of graph_data.csv.
df = pd.read_csv('graph_data.csv')
adj = df.adj.toarray()
x = df.x
y = df.y

# Define graph embedding model
# NOTE(review): data (`x`, `adj`) is passed where torch_geometric's GCN presumably
# expects channel sizes, and `hidden_dim` may not be an accepted keyword — verify
# against the PyG documentation. `.to('cuda')` also requires a GPU, while the inputs
# below are never moved to that device.
model = GCN(x, adj, num_layers=2, hidden_dim=64).to('cuda')

# Define anomaly detection model
# X, y_pred and outlier_scores are assigned here but not read again in this cell.
X = torch.tensor(x)
y_pred = np.zeros_like(x)
scores = np.zeros_like(x)
outlier_scores = np.zeros_like(x)

# NOTE(review): a single Data object is handed to DataLoader where a dataset
# (sequence of graphs) is presumably expected, and the model is called with node
# features only (no edge information) — TODO confirm intent.
for i, (data) in enumerate(DataLoader(Data(x=x, adj=adj, y=y), batch_size=len(x))):
    out = model(data.x)
    # Copy this batch's output into the matching slice of the preallocated buffer.
    scores[i*len(x):(i+1)*len(x)] = out

# Score every embedding row with Local Outlier Factor.
# `fit_predict` is not available when novelty=True, and it only yields -1/+1 labels
# (which the old clip collapsed to 0/1). Fit in the default outlier-detection mode and
# read the continuous factor instead.
lof = LocalOutlierFactor(n_neighbors=20)
lof.fit(scores)

# negative_outlier_factor_ is the negated LOF: close to -1 for inliers, more negative
# for outliers. Flip the sign so that larger means more anomalous.
raw_lof = -lof.negative_outlier_factor_

# Min-max normalize scores to [0, 1]; a constant vector maps to all zeros.
lof_range = raw_lof.max() - raw_lof.min()
scores = (raw_lof - raw_lof.min()) / lof_range if lof_range > 0 else np.zeros_like(raw_lof)

# Output anomaly scores
anomaly_scores = pd.DataFrame({'node_id': np.arange(len(x)), 'anomaly_score': scores})

# Define LLM interface
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NOTE(review): this rebinds the module-level name `model`, replacing the graph model
# created earlier in this cell.
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
tokenizer = AutoTokenizer.from_pretrained('t5-base')

def get_anomalies(question):
    """Run the question through the seq2seq model and return the anomaly table.

    Parameters:
    question (str): Natural-language question about the graph.

    Returns:
    list: Rows of the module-level `anomaly_scores` DataFrame as nested lists.
    """
    # Calling the tokenizer (rather than `.encode`) returns a mapping with
    # `input_ids` and `attention_mask`; `.encode` returns a bare tensor, which
    # cannot be indexed by key.
    encoded = tokenizer(question, return_tensors='pt')
    # NOTE(review): the generated tokens are not decoded or used to filter the
    # result, so the question currently has no influence on what is returned.
    outputs = model.generate(**encoded)

    # Read the global table without assigning to its name: assigning to
    # `anomaly_scores` here would make it local and raise UnboundLocalError.
    return anomaly_scores.to_numpy().tolist()

# Example usage
question = "What are the top 5 most anomalous nodes in the graph?"
out = get_anomalies(question)
print(out)

In [None]:
# NOTE(review): `combined_data` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. Later cells read df['src'], df['dst'],
# df['label'] and df['timestamp'] — TODO confirm where combined_data is built.
# This also rebinds `df`, which an earlier cell assigned from graph_data.csv.
df = pd.DataFrame(combined_data)
print("Dataset:")
print(df.head(10))
print(f"\nTotal records: {len(df)}")

In [None]:
class DynamicKnowledgeGraph:
    def __init__(self):
        self.node_to_idx = {}
        self.idx_to_node = {}
        self.edge_type_to_idx = {}
        self.idx_to_edge_type = {}
        self.node_features = {}
        self.edges = []
        self.timestamps = []
        
    def build_graph(self, df):
        """Build knowledge graph from DataFrame"""
        node_idx = 0
        edge_type_idx = 0
        
        # Create node mappings
        all_nodes = set(df['src'].unique()) | set(df['dst'].unique())
        for node in all_nodes:
            if node not in self.node_to_idx:
                self.node_to_idx[node] = node_idx
                self.idx_to_node[node_idx] = node
                node_idx += 1
        
        # Create edge type mappings
        for edge_type in df['label'].unique():
            if edge_type not in self.edge_type_to_idx:
                self.edge_type_to_idx[edge_type] = edge_type_idx
                self.idx_to_edge_type[edge_type_idx] = edge_type
                edge_type_idx += 1
        
        # Build edges
        for _, row in df.iterrows():
            src_idx = self.node_to_idx[row['src']]
            dst_idx = self.node_to_idx[row['dst']]
            edge_type_idx = self.edge_type_to_idx[row['label']]
            self.edges.append([src_idx, dst_idx])
            self.timestamps.append(row['timestamp'])
        
        # Create node features (degree, edge type distribution, etc.)
        self._create_node_features(df)
        
        return self._to_pyg_data()
    
    def _create_node_features(self, df):
        """Create node features based on graph statistics"""
        # Initialize features for all nodes
        for node in self.node_to_idx:
            self.node_features[node] = {
                'degree': 0,
                'in_degree': 0,
                'out_degree': 0,
                'edge_types': defaultdict(int)
            }
        
        # Calculate features
        for _, row in df.iterrows():
            src = row['src']
            dst = row['dst']
            edge_type = row['label']
            
            # Update degrees
            self.node_features[src]['out_degree'] += 1
            self.node_features[dst]['in_degree'] += 1
            self.node_features[src]['degree'] += 1
            self.node_features[dst]['degree'] += 1
            
            # Update edge type distribution
            self.node_features[src]['edge_types'][edge_type] += 1
            self.node_features[dst]['edge_types'][edge_type] += 1
    
    def _to_pyg_data(self):
        """Convert to PyTorch Geometric Data format"""
        edge_index = torch.tensor(self.edges).t().contiguous()
        
        # Create feature matrix
        num_nodes = len(self.node_to_idx)
        num_edge_types = len(self.edge_type_to_idx)
        
        # Features: [degree, in_degree, out_degree, edge_type_features]
        x = torch.zeros((num_nodes, 3 + num_edge_types))
        
        for node, idx in self.node_to_idx.items():
            features = self.node_features[node]
            x[idx, 0] = features['degree']
            x[idx, 1] = features['in_degree']
            x[idx, 2] = features['out_degree']
            
            # Edge type features
            for edge_type, count in features['edge_types'].items():
                edge_type_idx = self.edge_type_to_idx[edge_type]
                x[idx, 3 + edge_type_idx] = count
        
        return Data(x=x, edge_index=edge_index)

# Build the knowledge graph from whatever `df` currently holds, then report its size.
# `kg` keeps the name<->index mappings; `pyg_data` is the object returned by build_graph.
kg = DynamicKnowledgeGraph()
pyg_data = kg.build_graph(df)
print(f"Graph has {pyg_data.num_nodes} nodes and {pyg_data.num_edges} edges")
print(f"Node features shape: {pyg_data.x.shape}")

In [None]:
class GNNAnomalyDetector(nn.Module):
    """Three graph-convolution layers followed by a linear layer that maps the
    embedding back to the input feature size.

    Parameters:
    input_dim (int): Number of input features per node.
    hidden_dim (int): Width of the first two convolution layers.
    output_dim (int): Width of the embedding produced by the third layer.
    dropout (float): Dropout probability applied after the first two layers
        (only active in training mode). Defaults to 0.5.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)
        self.dropout = dropout

        # Reconstruction layer for anomaly scoring
        self.reconstruction = nn.Linear(output_dim, input_dim)

    def forward(self, x, edge_index):
        """Return `(embedding, reconstructed_features)` for the given graph."""
        # GNN layers: the first two are each followed by ReLU and dropout
        h = x
        for conv in (self.conv1, self.conv2):
            h = F.relu(conv(h, edge_index))
            h = F.dropout(h, p=self.dropout, training=self.training)
        h = F.relu(self.conv3(h, edge_index))

        # Reconstruction for anomaly detection
        reconstructed = self.reconstruction(h)

        return h, reconstructed

# Size the network: input width comes from the node feature matrix, the other two are fixed.
hidden_dim, output_dim = 32, 16
input_dim = pyg_data.x.size(1)

model = GNNAnomalyDetector(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
print("GNN Anomaly Detector Model:", model, sep="\n")

In [None]:
def train_anomaly_detector(model, data, epochs=200, lr=0.01, weight_decay=5e-4, log_every=50):
    """Train the GNN anomaly detector in an unsupervised manner

    The model is fitted to reproduce `data.x` from its own output (mean squared error).

    Parameters:
    model: Module called as `model(data.x, data.edge_index)` that returns
        `(embeddings, reconstructed)`.
    data: Object exposing `x` and `edge_index`.
    epochs (int): Number of full-graph optimisation steps.
    lr (float): Adam learning rate.
    weight_decay (float): Adam weight decay.
    log_every (int): Print the loss every this many epochs; 0 or None disables printing.

    Returns:
    list: The loss value of every epoch, in order.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.train()

    losses = []
    for epoch in range(epochs):
        optimizer.zero_grad()
        _, reconstructed = model(data.x, data.edge_index)

        # Reconstruction loss (mean squared error)
        loss = F.mse_loss(reconstructed, data.x)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)

        if log_every and epoch % log_every == 0:
            print(f'Epoch {epoch}, Loss: {loss_value:.4f}')

    return losses

def compute_anomaly_scores(model, data):
    """Score each node by how badly the model reconstructs its feature row.

    Puts the model in eval mode, runs one forward pass without gradients and sums
    the element-wise squared error over dimension 1.

    Returns:
    tuple: (per-node score tensor, embeddings returned by the model)
    """
    model.eval()
    with torch.no_grad():
        latent, decoded = model(data.x, data.edge_index)
        # Element-wise squared error, kept unreduced so it can be summed per row.
        squared_error = F.mse_loss(decoded, data.x, reduction='none')
        per_node_error = torch.sum(squared_error, dim=1)
    return per_node_error, latent

# Fit the detector on the full graph and keep the per-epoch loss history.
losses = train_anomaly_detector(model, pyg_data, epochs=200)

# Loss curve over the training run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses)
ax.set_title('Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
plt.show()

In [None]:
# Score every node with the trained model, then key the scores by node name.
node_anomaly_scores, embeddings = compute_anomaly_scores(model, pyg_data)
anomaly_scores_dict = {
    node: node_anomaly_scores[idx].item() for node, idx in kg.node_to_idx.items()
}

# Rank nodes from highest to lowest score.
sorted_anomalies = sorted(
    anomaly_scores_dict.items(), key=lambda pair: pair[1], reverse=True
)

print("Top anomalies in the knowledge graph:")
print("-------------------------------------")
for i, (node, score) in enumerate(sorted_anomalies[:10]):
    print(f"{i+1}. Node: {node:<15} Anomaly Score: {score:.4f}")

# Bar chart of all node scores, in ranked order.
nodes = [name for name, _ in sorted_anomalies]
scores = [value for _, value in sorted_anomalies]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(range(len(nodes)), scores)
ax.set_title('Node Anomaly Scores')
ax.set_xlabel('Nodes')
ax.set_ylabel('Anomaly Score')
ax.set_xticks(range(len(nodes)))
ax.set_xticklabels(nodes, rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
class TemporalAnomalyDetector:
    """Scores edge events by how far their timestamp is from the rhythm of earlier
    events with the same (src, dst, label).

    Parameters:
    window_size (int): Stored on the instance; no method of this class reads it.
    """

    def __init__(self, window_size=2):
        self.window_size = window_size
        # (src, dst, label) -> chronologically sorted list of timestamps
        self.temporal_patterns = defaultdict(list)

    def build_temporal_patterns(self, df):
        """Build temporal patterns for edge types between node pairs

        Every row's timestamp is appended to the history of its (src, dst, label)
        key. Repeated calls accumulate. Each touched history is re-sorted, because
        the interval statistics in `detect_temporal_anomalies` are only meaningful
        on chronologically ordered timestamps and `df` need not be sorted.
        """
        touched = set()
        for src, dst, label, timestamp in zip(df['src'], df['dst'], df['label'], df['timestamp']):
            key = (src, dst, label)
            self.temporal_patterns[key].append(timestamp)
            touched.add(key)

        for key in touched:
            self.temporal_patterns[key].sort()

    def detect_temporal_anomalies(self, df):
        """Detect anomalies based on temporal patterns

        Returns:
        list: One float in [0, 1] per row of `df`, in row order.
        """
        # `.get` keeps lookups of unseen keys from inserting empty histories into
        # the defaultdict.
        return [
            self._score_observation(self.temporal_patterns.get((src, dst, label), ()), timestamp)
            for src, dst, label, timestamp in zip(df['src'], df['dst'], df['label'], df['timestamp'])
        ]

    @staticmethod
    def _score_observation(history, current_time):
        """Score one timestamp against the sorted history of its edge key."""
        # An edge seen at most once has no rhythm to compare against: maximally anomalous.
        if len(history) <= 1:
            return 1.0

        intervals = np.diff(history)
        std_interval = float(np.std(intervals))
        # Perfectly regular (or undefined) spacing: fixed low score.
        if not std_interval > 0:
            return 0.1

        # Distance from the next expected occurrence, in units of (std + 1), capped at 1.
        expected_next = history[-1] + float(np.mean(intervals))
        temporal_deviation = abs(current_time - expected_next) / (std_interval + 1)
        return float(min(temporal_deviation, 1.0))

# Learn the per-edge timestamp histories from the event table.
temporal_detector = TemporalAnomalyDetector()
temporal_detector.build_temporal_patterns(df)

# Score the same events and attach the result as a new column (mutates `df`).
temporal_anomaly_scores = temporal_detector.detect_temporal_anomalies(df)
df['temporal_anomaly_score'] = temporal_anomaly_scores

print("Temporal anomaly detection:")
print("--------------------------")
# Ten events with the highest temporal score.
df_temporal_sorted = df.sort_values(by='temporal_anomaly_score', ascending=False)
print(df_temporal_sorted[['src', 'dst', 'label', 'timestamp', 'temporal_anomaly_score']].head(10))

In [None]:
class ComprehensiveAnomalyDetector:
    """Combines node-, edge- and temporal-level anomaly scores into one score per edge.

    Parameters:
    node_weight, edge_weight, temporal_weight (float): Weights of the three components
        in the combined score.
    hidden_dim, output_dim (int): Layer sizes passed to GNNAnomalyDetector.
    epochs (int): Training epochs passed to train_anomaly_detector.
    """

    def __init__(self, node_weight=0.4, edge_weight=0.3, temporal_weight=0.3,
                 hidden_dim=32, output_dim=16, epochs=100):
        self.node_weight = node_weight
        self.edge_weight = edge_weight
        self.temporal_weight = temporal_weight
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.epochs = epochs
        self.kg = None
        self.model = None
        self.edge_detector = None
        self.temporal_detector = None

    def fit(self, df):
        """Fit the anomaly detector on the knowledge graph data

        Returns:
        ComprehensiveAnomalyDetector: self, so calls can be chained.
        """
        # Build knowledge graph
        self.kg = DynamicKnowledgeGraph()
        pyg_data = self.kg.build_graph(df)

        # Initialize and train GNN model
        input_dim = pyg_data.x.shape[1]
        self.model = GNNAnomalyDetector(input_dim, self.hidden_dim, self.output_dim)
        train_anomaly_detector(self.model, pyg_data, epochs=self.epochs)

        # Initialize edge detector
        # NOTE(review): EdgeAnomalyDetector is not defined in any cell visible in this
        # notebook — confirm where it comes from.
        self.edge_detector = EdgeAnomalyDetector(self.kg)

        # Initialize temporal detector
        self.temporal_detector = TemporalAnomalyDetector()
        self.temporal_detector.build_temporal_patterns(df)

        return self

    def detect_anomalies(self, df):
        """Detect anomalies in the knowledge graph

        Returns:
        tuple: (combined score per row of `df` in row order, node name -> node score)

        Raises:
        RuntimeError: If called before `fit`.
        """
        if self.model is None or self.kg is None:
            raise RuntimeError("detect_anomalies() called before fit()")

        # Node anomaly scores
        node_anomaly_scores, _ = compute_anomaly_scores(self.model, self.kg._to_pyg_data())
        node_scores_dict = {
            node: node_anomaly_scores[idx].item()
            for node, idx in self.kg.node_to_idx.items()
        }

        # Edge anomaly scores
        # Assumed to be one score per row in row order, like the temporal scores — TODO confirm.
        edge_anomaly_scores = list(self.edge_detector.compute_edge_anomaly_scores(df))

        # Temporal anomaly scores
        temporal_anomaly_scores = self.temporal_detector.detect_temporal_anomalies(df)

        # Combine scores. Rows are addressed by position, not by index label, so a
        # frame that was filtered or re-sorted (non 0..n-1 index) still lines up.
        # NOTE(review): node scores are raw reconstruction errors with no upper bound,
        # while temporal scores lie in [0, 1]; the weights do not compare like with like.
        combined_scores = []
        for position, (src, dst) in enumerate(zip(df['src'], df['dst'])):
            node_score = (node_scores_dict.get(src, 0) + node_scores_dict.get(dst, 0)) / 2

            # Weighted combination
            combined_scores.append(
                self.node_weight * node_score
                + self.edge_weight * edge_anomaly_scores[position]
                + self.temporal_weight * temporal_anomaly_scores[position]
            )

        return combined_scores, node_scores_dict

# Fit the combined detector on the event table.
comprehensive_detector = ComprehensiveAnomalyDetector()
comprehensive_detector.fit(df)

# Score the same events and attach the per-edge result as a new column (mutates `df`).
combined_scores, node_scores = comprehensive_detector.detect_anomalies(df)
df['combined_anomaly_score'] = combined_scores

print("Comprehensive anomaly detection results:")
print("=======================================")
print("Top anomalous edges:")
df_combined_sorted = df.sort_values(by='combined_anomaly_score', ascending=False)
print(df_combined_sorted[['src', 'dst', 'label', 'timestamp', 'combined_anomaly_score']].head(10))

print("\nNode anomaly scores:")
sorted_nodes = sorted(node_scores.items(), key=lambda pair: pair[1], reverse=True)
for i, (node, score) in enumerate(sorted_nodes[:10]):
    print(f"{i+1}. {node:<15} : {score:.4f}")

In [None]:
from io import StringIO

def detect_kg_anomalies(csv_data, top_k=5):
    """
    Main function to detect anomalies in a knowledge graph from CSV data

    Parameters:
    csv_data (str or DataFrame): CSV data with columns src, dst, label, timestamp, event_type.
        A string is parsed as CSV *content* (not a file path); a DataFrame is copied,
        so the caller's frame is not modified.
    top_k (int): Number of entries in the `most_anomalous_*` lists. Defaults to 5.

    Returns:
    dict: Anomaly scores for nodes and edges, with keys
        'node_anomaly_scores', 'edge_anomaly_scores',
        'most_anomalous_nodes' and 'most_anomalous_edges'.
    """

    # Handle input data
    if isinstance(csv_data, str):
        # If string, assume it's CSV content
        df = pd.read_csv(StringIO(csv_data))
    else:
        # If DataFrame, use directly
        df = csv_data.copy()

    # Initialize and fit the comprehensive detector
    detector = ComprehensiveAnomalyDetector()
    detector.fit(df)

    # Detect anomalies
    combined_scores, node_scores = detector.detect_anomalies(df)

    # Add scores to dataframe
    df['combined_anomaly_score'] = combined_scores

    ranked_nodes = sorted(node_scores.items(), key=lambda item: item[1], reverse=True)
    top_edges = df.nlargest(top_k, 'combined_anomaly_score')

    # Results
    results = {
        'node_anomaly_scores': node_scores,
        'edge_anomaly_scores': df[['src', 'dst', 'label', 'timestamp', 'combined_anomaly_score']].to_dict('records'),
        'most_anomalous_nodes': ranked_nodes[:top_k],
        'most_anomalous_edges': top_edges[['src', 'dst', 'label', 'combined_anomaly_score']].to_dict('records')
    }

    return results

# Inline sample of the event table used to exercise the end-to-end function.
csv_content = """src,dst,label,timestamp,event_type
bng,trunk,HAS_PORT,0,add
bng,trunk,HAS_PORT,0,add
concentrator,trunk,DEPENDS_ON,0,add
concentrator,trunk,DEPENDS_ON,0,add
concentrator,port,HAS_PORT,0,add
cpe,port,DEPENDS_ON,0,add
cpe,trunk,DEPENDS_ON,0,add
cpe,trunk,DEPENDS_ON,0,add"""

# Parse the sample and run the full pipeline on it.
test_df = pd.read_csv(StringIO(csv_content))
anomaly_results = detect_kg_anomalies(test_df)

print("ANOMALY DETECTION RESULTS SUMMARY")
print("=================================")

# Three highest-ranked nodes.
print(f"\nTop 3 Anomalous Nodes:")
for i, (node, score) in enumerate(anomaly_results['most_anomalous_nodes'][:3]):
    print(f"  {i+1}. {node}: {score:.4f}")

# Three highest-ranked edges.
print(f"\nTop 3 Anomalous Edges:")
for i, edge in enumerate(anomaly_results['most_anomalous_edges'][:3]):
    print(f"  {i+1}. {edge['src']} --[{edge['label']}]--> {edge['dst']} (Score: {edge['combined_anomaly_score']:.4f})")

# Closing summary of the techniques used; printed unconditionally.
for summary_line in (
    "\n\nThe system successfully detects anomalies in knowledge graphs using:",
    "- GNN-based node embedding and reconstruction",
    "- Edge pattern analysis for unusual relationships",
    "- Temporal analysis for timing irregularities",
    "- Weighted combination of all anomaly scores",
):
    print(summary_line)

In [None]:
def visualize_combined_anomalies_fixed(df, node_scores):
    """Visualize the knowledge graph with combined anomaly insights (fixed version)

    Parameters:
    df (DataFrame): Edge table with columns src, dst, label and combined_anomaly_score.
    node_scores (dict): Node name -> node anomaly score; missing nodes count as 0.

    Returns:
    tuple: (G, pos) — the directed graph that was drawn and its layout positions.
    """
    G = nx.DiGraph()

    # Add all nodes
    for node in set(df['src'].unique()) | set(df['dst'].unique()):
        G.add_node(node, node_score=node_scores.get(node, 0))

    # Collapse parallel rows: a DiGraph holds at most one edge per (src, dst)
    edge_data = {}
    for src, dst, label, combined in zip(df['src'], df['dst'], df['label'],
                                         df['combined_anomaly_score']):
        bucket = edge_data.setdefault((src, dst), {'labels': [], 'combined_scores': []})
        bucket['labels'].append(str(label))
        bucket['combined_scores'].append(combined)

    # Add edges to graph (average scores for duplicate edges). `edgelist` fixes the
    # order that the colour/width sequences below are built in.
    edgelist = list(edge_data)
    edge_colors = []
    edge_widths = []
    edge_labels = {}

    for src, dst in edgelist:
        data = edge_data[(src, dst)]
        avg_combined_score = float(np.mean(data['combined_scores']))
        G.add_edge(src, dst, combined_score=avg_combined_score)
        edge_colors.append(avg_combined_score)
        edge_widths.append(max(1, avg_combined_score))
        # Each distinct label once, in first-seen order
        edge_labels[(src, dst)] = '/'.join(dict.fromkeys(data['labels']))

    # Node sizes based on node anomaly scores (never smaller than 300)
    node_sizes = [max(300, node_scores.get(node, 0) * 50) for node in G.nodes()]
    node_colors = [node_scores.get(node, 0) for node in G.nodes()]

    plt.figure(figsize=(15, 10))

    # Create layout (fixed seed keeps the picture stable between runs)
    pos = nx.spring_layout(G, seed=42)

    # Draw nodes
    nodes = nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                                   node_color=node_colors, cmap=plt.cm.Reds, alpha=0.8)

    # Draw edges. `edgelist` is passed explicitly because G.edges() iterates by source
    # node, not in insertion order, so the default order would pair colours and widths
    # with the wrong edges. `node_size` lets arrows stop at the node border.
    nx.draw_networkx_edges(G, pos, edgelist=edgelist, edge_color=edge_colors,
                           edge_cmap=plt.cm.Blues, width=edge_widths, alpha=0.6,
                           arrowstyle='->', arrowsize=20, node_size=node_sizes)

    # Draw labels
    nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

    # Draw edge labels
    nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=8)

    # Add colorbar (node scores only)
    plt.colorbar(nodes, label='Node Anomaly Score', shrink=0.8)

    plt.title('Knowledge Graph Anomaly Detection\nNode size/color = Node anomaly score, Edge width/color = Edge anomaly score')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    return G, pos

# Create final visualization.
# `df` must already carry the 'combined_anomaly_score' column and `node_scores` must be
# bound; both are produced by the comprehensive-detector cell, so run that cell first.
G, pos = visualize_combined_anomalies_fixed(df, node_scores)