# Cybersecurity Analysis with Community Detection

In this notebook, we'll apply our community detection methods to cybersecurity data from the UNSW-NB15 dataset. We'll:

1. Process the UNSW-NB15 dataset and construct a graph
2. Perform feature selection to identify important network attributes
3. Apply various community detection methods
4. Evaluate how well communities align with attack patterns
5. Compare the performance of different methods

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import rustworkx as rx
import torch
import time
import pickle
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Import community detection methods
from community_detection.traditional_methods import (
    run_louvain, run_leiden, run_infomap, run_label_propagation, run_spectral_clustering
)
from community_detection.gnn_community_detection import (
    run_gcn, run_graphsage, run_gat, run_vgae
)
from community_detection.overlapping_community_detection import (
    run_bigclam, run_demon, run_slpa, run_gnn_overlapping
)
from community_detection.visualization import visualize_communities

# Filesystem layout: every input and artifact lives under <cwd>/../data/unsw
DATA_DIR = os.path.join(os.getcwd(), '..', 'data', 'unsw')
RESULTS_DIR = os.path.join(DATA_DIR, "results")

# Manually downloaded inputs and the cached graph file
UNSW_FEATURES_PATH = os.path.join(DATA_DIR, "UNSW-NB15_features.csv")
UNSW_DATA_PATH_1 = os.path.join(DATA_DIR, "UNSW-NB15_1.csv")
GRAPH_PATH = os.path.join(DATA_DIR, "unsw_graph.pt")

# Create both directories up front so later cells can write without checking
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool'}


2025-03-31 17:34:48.390494: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743456888.463590  630887 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743456888.483185  630887 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743456890.894644  630887 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743456890.894694  630887 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743456890.894696  630887 computation_placer.cc:177] computation placer alr

ImportError: cannot import name 'run_gcn' from 'community_detection.gnn_community_detection' (/home/braden/gnn-cd/community_detection/gnn_community_detection.py)

## 1. Download and Process the UNSW-NB15 Dataset

The UNSW-NB15 dataset is a comprehensive network traffic dataset that contains normal traffic and attack traffic. It includes 49 features and labels for different attack types.

Note: You'll need to manually download the dataset from https://research.unsw.edu.au/projects/unsw-nb15-dataset and place the CSV files in the data/unsw directory.

In [None]:
def load_dataset(data_paths, features_path):
    """
    Read the UNSW-NB15 feature description and concatenate the data CSVs

    Parameters:
    -----------
    data_paths: list
        Candidate paths to CSV data files; paths that do not exist are skipped
    features_path: str
        Path to the CSV file describing the features

    Returns:
    --------
    data: pandas.DataFrame or None
        All readable data files stacked with a fresh 0..n-1 index, or None
        when no file exists or none could be read
    features_info: pandas.DataFrame or None
        Feature information, or None when the features file is missing
    """
    print(f"Loading feature information from {features_path}")
    features_info = None
    try:
        features_info = pd.read_csv(features_path)
    except FileNotFoundError:
        print(f"Features file not found: {features_path}")
    else:
        print(f"Loaded {len(features_info)} features")

    # Only attempt to read the data files that are actually on disk
    available_paths = list(filter(os.path.exists, data_paths))
    if len(available_paths) == 0:
        print("No data files found. Please download the UNSW-NB15 dataset first.")
        print("Download from: https://research.unsw.edu.au/projects/unsw-nb15-dataset")
        return None, features_info

    # Best effort: a file that fails to parse is reported and skipped
    frames = []
    for csv_path in available_paths:
        print(f"Loading data from {csv_path}")
        try:
            frame = pd.read_csv(csv_path)
            frames.append(frame)
            print(f"Loaded {len(frame)} records from {csv_path}")
        except Exception as err:
            print(f"Error loading {csv_path}: {err}")

    if len(frames) == 0:
        return None, features_info

    combined = pd.concat(frames, ignore_index=True)
    print(f"Combined dataset has {len(combined)} records")
    return combined, features_info

# Load the dataset
# Only the first UNSW-NB15 part file is listed here; load_dataset skips paths that do not exist.
data_paths = [UNSW_DATA_PATH_1]
data, features_info = load_dataset(data_paths, UNSW_FEATURES_PATH)

# If no real data is available, create synthetic data for demonstration
# NOTE: the order of the np.random calls below determines the generated values,
# so reordering these statements changes the synthetic dataset.
if data is None:
    print("Creating synthetic data for demonstration...")
    # Create synthetic data with similar structure to UNSW-NB15
    np.random.seed(42)
    n_samples = 1000
    n_features = 10
    
    # Generate synthetic features (uniform values in [0, 1))
    X = np.random.rand(n_samples, n_features)
    
    # Generate synthetic source and destination IPs
    # randint's upper bound is exclusive, so the last octet ranges over 1..254
    src_ips = [f"192.168.1.{np.random.randint(1, 255)}" for _ in range(n_samples)]
    dst_ips = [f"10.0.0.{np.random.randint(1, 255)}" for _ in range(n_samples)]
    
    # Generate synthetic labels (20% attacks)
    labels = np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2])
    
    # Create feature names
    feature_names = [f"feature_{i}" for i in range(n_features)]
    
    # Create DataFrame
    data = pd.DataFrame(X, columns=feature_names)
    data['srcip'] = src_ips
    data['dstip'] = dst_ips
    data['label'] = labels
    
    # Create some correlation between features and labels
    # (shift the first three features upward for attack rows so they carry signal)
    for i in range(3):
        data.loc[data['label'] == 1, f'feature_{i}'] += 0.3
    
    print(f"Created synthetic dataset with {len(data)} records")

# Display the first few rows
if data is not None:
    print("\nDataset preview:")
    display(data.head())
    
    # Display class distribution
    if 'label' in data.columns:
        # Summing the column counts attacks only if labels are 0/1 — assumed here
        attack_count = data['label'].sum()
        normal_count = len(data) - attack_count
        print(f"\nClass distribution:\n- Normal: {normal_count} ({normal_count/len(data)*100:.1f}%)\n- Attack: {attack_count} ({attack_count/len(data)*100:.1f}%)")
        
        # Plot class distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='label', data=data)
        plt.title('Class Distribution')
        plt.xlabel('Label (0=Normal, 1=Attack)')
        plt.ylabel('Count')
        plt.show()

## 2. Feature Selection and Analysis

Let's perform feature selection to identify the most important features for attack detection.

In [None]:
def feature_selection(data, k=20):
    """
    Rank features by ANOVA F-score against the label and keep the top k

    Parameters:
    -----------
    data: pandas.DataFrame
        Dataset with features and labels
    k: int
        Number of features to select (capped at the number of columns that
        remain after one-hot encoding)

    Returns:
    --------
    selected_features: list
        Names of the selected columns (one-hot encoded names for categoricals)
    X_selected: numpy.ndarray
        Standardized feature matrix restricted to the selected columns
    y: pandas.Series or numpy.ndarray
        The 'label' column when present, otherwise an all-zero array
    """
    # Separate features and labels; identifiers and label-like columns are not features
    exclude_cols = ['label', 'attack_cat', 'srcip', 'dstip']
    X = data.drop(columns=[col for col in exclude_cols if col in data.columns])
    y = data['label'] if 'label' in data.columns else np.zeros(len(data))

    # One-hot encode string-typed columns so every remaining column is numeric
    categorical_cols = X.select_dtypes(include=['object']).columns
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # Standardize numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply feature selection
    n_selected = min(k, X.shape[1])
    selector = SelectKBest(f_classif, k=n_selected)
    X_selected = selector.fit_transform(X_scaled, y)

    # Get selected feature names
    selected_features = X.columns[selector.get_support()].tolist()
    print(f"Selected {len(selected_features)} features:\n{', '.join(selected_features)}")

    # The F-score can be NaN (e.g. for a constant column). NaN never compares
    # as smaller or larger, so sorting on it yields an arbitrary order; treat
    # NaN as 0 so such columns rank last in the chart.
    raw_scores = np.asarray(selector.scores_, dtype=float)
    scores = np.where(np.isnan(raw_scores), 0.0, raw_scores)
    order = np.argsort(-scores, kind='stable')[:n_selected]
    top_names = [X.columns[i] for i in order]
    top_scores = scores[order]

    # Visualize feature importance scores; the title reports how many bars are
    # actually drawn, which is fewer than k when the dataset has fewer columns
    plt.figure(figsize=(10, 6))
    plt.barh(top_names, top_scores)
    plt.xlabel('F-Score')
    plt.ylabel('Feature')
    plt.title(f'Top {len(top_names)} Features by F-Score')
    plt.tight_layout()
    plt.show()

    return selected_features, X_selected, y

# Perform feature selection
# Number of top-ranked features that become the node feature vector downstream
N_SELECTED_FEATURES = 10
if data is not None:
    selected_features, X_selected, y = feature_selection(data, k=N_SELECTED_FEATURES)

## 3. Construct Network Graph

Now we'll construct a graph from the dataset where nodes represent devices (IPs) and edges represent connections.

In [None]:
def construct_graph(data, feature_cols, target_col='label'):
    """
    Construct a graph from the dataset where devices are nodes

    One node is created per distinct IP (in order of first appearance, sources
    before destinations, so node numbering is reproducible across runs). One
    undirected edge is created per communicating pair, weighted by the number
    of records between the two IPs in either direction.

    Parameters:
    -----------
    data: pandas.DataFrame
        Dataset with features and labels. It is only read, never modified.
    feature_cols: list
        List of feature columns to use. The node feature vector has one slot
        per entry, in this order; entries that are not columns of `data` stay 0.
    target_col: str
        Target column name; a node is labelled 1 when any record it takes part
        in has this column equal to 1

    Returns:
    --------
    G: rustworkx.PyGraph
        Constructed graph; each node payload is a dict with keys
        'features' (numpy array), 'label' (int) and 'ip'
    node_mapping: dict
        Mapping from device IP to node index
    """
    feature_cols = list(feature_cols)
    n_rows = len(data)

    # Work on positional arrays so the caller's DataFrame is left untouched
    if 'srcip' in data.columns and 'dstip' in data.columns:
        src = data['srcip'].to_numpy()
        dst = data['dstip'].to_numpy()
    else:
        print("IP columns not found. Using synthetic node IDs based on row indices.")
        src = np.arange(n_rows)
        dst = (src + 1) % max(n_rows, 1)  # Create connections in a ring

    # Records with a missing endpoint cannot be placed in the graph
    valid = ~(pd.isna(src) | pd.isna(dst))
    row_pos = np.flatnonzero(valid)
    src_valid = src[valid].astype(object)
    dst_valid = dst[valid].astype(object)

    # Integer-code every endpoint: code i <-> all_ips[i], first appearance first
    codes, all_ips = pd.factorize(np.concatenate([src_valid, dst_valid]))
    src_code = codes[:len(row_pos)]
    dst_code = codes[len(row_pos):]
    n_nodes = len(all_ips)

    # Incidence list: one (node, row) entry per record a node takes part in.
    # A record whose source equals its destination is counted once, not twice.
    not_loop = src_code != dst_code
    incident_node = np.concatenate([src_code, dst_code[not_loop]])
    incident_row = np.concatenate([row_pos, row_pos[not_loop]])

    # Node features: mean over the node's records, written into the slot that
    # matches the feature's position in feature_cols
    present = [i for i, col in enumerate(feature_cols) if col in data.columns]
    if not present:
        # None of the requested features exist in `data`: use random features
        feature_matrix = np.random.rand(n_nodes, len(feature_cols))
    else:
        feature_matrix = np.zeros((n_nodes, len(feature_cols)))
        if n_nodes:
            present_cols = [feature_cols[i] for i in present]
            per_incidence = data[present_cols].iloc[incident_row]
            node_means = per_incidence.groupby(incident_node).mean()
            feature_matrix[:, present] = (
                node_means.reindex(range(n_nodes)).to_numpy(dtype=float)
            )

    # Node labels: 1 if any traffic involving this IP is malicious
    labels = np.zeros(n_nodes, dtype=int)
    if target_col in data.columns and n_nodes:
        attack_row = data[target_col].eq(1).fillna(False).to_numpy(dtype=bool)
        labels[incident_node[attack_row[incident_row]]] = 1

    # Create graph and add nodes with feature vectors and labels
    G = rx.PyGraph()
    node_mapping = {}
    node_ids = []
    for code in range(n_nodes):
        ip = all_ips[code]
        node_idx = G.add_node({
            'features': feature_matrix[code].copy(),
            'label': int(labels[code]),
            'ip': ip
        })
        node_mapping[ip] = node_idx
        node_ids.append(node_idx)

    # The graph is undirected, so A->B and B->A traffic is merged into one edge
    # by ordering each endpoint pair; self-loops are skipped
    low = np.minimum(src_code, dst_code)[not_loop]
    high = np.maximum(src_code, dst_code)[not_loop]
    if low.size:
        pairs, counts = np.unique(np.stack([low, high], axis=1), axis=0, return_counts=True)
        for (u, v), weight in zip(pairs.tolist(), counts.tolist()):
            G.add_edge(node_ids[u], node_ids[v], weight)

    print(f"Created graph with {len(G)} nodes and {G.num_edges()} edges")
    return G, node_mapping

def visualize_graph(G, node_mapping, max_nodes=100):
    """
    Draw the first `max_nodes` nodes of the graph, coloured by attack label,
    and save the figure as 'unsw_graph.png' in DATA_DIR

    Parameters:
    -----------
    G: rustworkx.PyGraph
        Graph to visualize; node payloads must contain a 'label' entry and
        edge payloads must be numeric weights
    node_mapping: dict
        Mapping from device IPs to node IDs (accepted for interface
        compatibility; not used by the drawing code)
    max_nodes: int
        Maximum number of nodes to show
    """
    # Limit to max_nodes if graph is too large
    nodes_to_show = min(len(G), max_nodes)
    print(f"Showing {nodes_to_show} out of {len(G)} nodes")

    # Convert to NetworkX for visualization
    # assumes node indices are 0..len(G)-1 (no nodes were removed from G)
    G_nx = nx.Graph()
    for i in range(nodes_to_show):
        G_nx.add_node(i, label=G.get_node_data(i)['label'])

    # weighted_edge_list yields each edge with its own payload, so parallel
    # edges between the same pair are summed instead of read ambiguously
    for source, target, weight in G.weighted_edge_list():
        if source >= nodes_to_show or target >= nodes_to_show:
            continue
        if G_nx.has_edge(source, target):
            G_nx[source][target]['weight'] += weight
        else:
            G_nx.add_edge(source, target, weight=weight)

    fig, ax = plt.subplots(figsize=(12, 10))
    pos = nx.spring_layout(G_nx, seed=42)

    # Get node colors based on labels; vmin/vmax pin the colour scale to 0..1
    # so nodes use the same scale as the colorbar even if only one class is shown
    node_colors = [G_nx.nodes[n]['label'] for n in G_nx.nodes()]
    nx.draw_networkx_nodes(G_nx, pos,
                           ax=ax,
                           node_color=node_colors,
                           cmap=plt.cm.coolwarm,
                           vmin=0,
                           vmax=1,
                           alpha=0.8,
                           node_size=100)

    # Draw edges with width based on weight
    edge_widths = [G_nx[u][v].get('weight', 1) / 10 for u, v in G_nx.edges()]
    nx.draw_networkx_edges(G_nx, pos, ax=ax, width=edge_widths, alpha=0.3)

    ax.set_title(f"Network Graph from UNSW-NB15 (showing {nodes_to_show} nodes)")
    ax.axis('off')

    # Add colorbar. The mappable is not attached to any Axes, so the target
    # Axes must be passed explicitly; recent Matplotlib raises otherwise.
    sm = plt.cm.ScalarMappable(cmap=plt.cm.coolwarm, norm=plt.Normalize(vmin=0, vmax=1))
    sm.set_array([])
    fig.colorbar(sm, ax=ax, label='Attack Label (1=Attack)')

    fig.tight_layout()
    fig.savefig(os.path.join(DATA_DIR, 'unsw_graph.png'))
    plt.show()

def save_graph(G, filepath):
    """
    Save the graph to a file

    The file holds a dict of plain Python containers ('num_nodes',
    'edge_list', 'node_data', 'edge_data'): numpy arrays become lists and
    numpy scalars become Python scalars. The graph itself is not modified.

    Parameters:
    -----------
    G: rustworkx.PyGraph
        Graph to save
    filepath: str
        Path to save the graph
    """
    def _plain(value):
        # Recursively copy a payload into builtin types only
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, dict):
            return {key: _plain(item) for key, item in value.items()}
        return value

    # Nodes are written in index order and renumbered 0..n-1, because loading
    # re-adds them sequentially; this keeps edges valid even if G has gaps
    node_indices = list(G.node_indices())
    position = {int(node_idx): pos for pos, node_idx in enumerate(node_indices)}
    node_data = [_plain(G.get_node_data(node_idx)) for node_idx in node_indices]

    # Take each edge together with its own payload; looking weights up by
    # endpoint pair is ambiguous when parallel edges exist
    edge_list = []
    edge_data = []
    for source, target, weight in G.weighted_edge_list():
        edge_list.append((position[int(source)], position[int(target)]))
        edge_data.append(_plain(weight))

    # Save using torch
    data = {
        'num_nodes': len(node_indices),
        'edge_list': edge_list,
        'node_data': node_data,
        'edge_data': edge_data
    }
    torch.save(data, filepath)
    print(f"Graph saved to {filepath}")

def load_graph(filepath):
    """
    Load a graph from a file

    Parameters:
    -----------
    filepath: str
        Path to the saved graph; expected to hold a dict with the keys
        'node_data', 'edge_list' and 'edge_data'

    Returns:
    --------
    G: rustworkx.PyGraph
        Loaded graph; node 'features' stored as lists are returned as numpy arrays
    """
    # The cache holds arbitrary Python objects rather than tensors, which the
    # weights_only=True default of recent PyTorch refuses to unpickle.
    # NOTE(security): weights_only=False runs the pickle machinery and can
    # execute code embedded in the file; only load caches written locally.
    try:
        payload = torch.load(filepath, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not accept the weights_only argument
        payload = torch.load(filepath)

    G = rx.PyGraph()

    # Add nodes (on a copy, so the loaded payload is not modified)
    for stored_node in payload['node_data']:
        node_data = dict(stored_node) if isinstance(stored_node, dict) else stored_node
        # Convert features back to numpy array if needed
        if isinstance(node_data, dict) and isinstance(node_data.get('features'), list):
            node_data['features'] = np.array(node_data['features'])
        G.add_node(node_data)

    # Add edges
    for (src, dst), edge_data in zip(payload['edge_list'], payload['edge_data']):
        G.add_edge(src, dst, edge_data)

    print(f"Loaded graph with {len(G)} nodes and {G.num_edges()} edges")
    return G

# Construct and visualize the graph
if data is not None and 'selected_features' in locals():
    # Build the graph only when no cached copy exists on disk
    if not os.path.exists(GRAPH_PATH):
        print("Constructing graph from data...")
        G, node_mapping = construct_graph(data, selected_features)
        save_graph(G, GRAPH_PATH)
    else:
        print(f"Loading existing graph from {GRAPH_PATH}")
        G = load_graph(GRAPH_PATH)
        # The cache does not store the IP -> node mapping, so use an identity placeholder
        node_mapping = dict(zip(range(len(G)), range(len(G))))

    # Visualize graph
    visualize_graph(G, node_mapping)

## 4. Apply Community Detection Methods

Now we'll apply various community detection methods and evaluate their performance for identifying attack-related communities.

In [None]:
def evaluate_communities(G, communities, community_type='non-overlapping', attack_threshold=0.8):
    """
    Evaluate community detection results for cybersecurity

    Every node is predicted as the majority label of its community (ties count
    as attack), and the predictions are scored against the node labels.

    Parameters:
    -----------
    G: rustworkx.PyGraph
        Graph with ground truth labels; node payloads must contain 'label'
        (0 or 1) and node indices are expected to be 0..len(G)-1
    communities: dict or list
        Detected communities (dict node -> community for non-overlapping,
        list of lists of nodes for overlapping)
    community_type: str
        'non-overlapping' or 'overlapping'
    attack_threshold: float
        Minimum attack fraction for a community to count as an attack community

    Returns:
    --------
    metrics: dict
        Evaluation metrics
    y_pred: numpy.ndarray
        Predicted label per node. Nodes in no community stay 0; for
        overlapping input the last community containing a node decides.
    attack_concentration: dict
        Fraction of attack nodes per community, keyed by the community's
        position (0, 1, 2, ...) in order of first appearance, not by its label
    """
    if community_type == 'non-overlapping':
        # Group nodes by community label, keeping first-appearance order
        members_by_label = {}
        for node, community in communities.items():
            members_by_label.setdefault(community, []).append(node)
        communities_list = list(members_by_label.values())
    else:
        # Already in list of lists format
        communities_list = communities

    y_true = np.array([G.get_node_data(i)['label'] for i in range(len(G))])
    y_pred = np.zeros_like(y_true)

    # One pass per community yields prediction, purity and attack concentration
    purities = []
    attack_concentration = {}
    for comm_idx, community in enumerate(communities_list):
        members = list(community)
        if not members:
            attack_concentration[comm_idx] = 0
            continue

        n_attack = int(y_true[members].sum())
        attack_fraction = n_attack / len(members)
        attack_concentration[comm_idx] = attack_fraction

        # Assign the majority label to all nodes in this community
        y_pred[members] = 1 if attack_fraction >= 0.5 else 0
        purities.append(max(n_attack, len(members) - n_attack) / len(members))

    # Calculate metrics
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)

    # AUC is undefined when y_true holds a single class; scikit-learn signals
    # that with ValueError. Any other exception is a real error and propagates.
    try:
        metrics['auc'] = roc_auc_score(y_true, y_pred)
    except ValueError:
        metrics['auc'] = 0.5  # Default for random classifier

    metrics['avg_purity'] = float(np.mean(purities)) if purities else 0

    # Number of communities and sizes
    metrics['num_communities'] = len(communities_list)
    community_sizes = [len(comm) for comm in communities_list]
    metrics['avg_community_size'] = float(np.mean(community_sizes)) if community_sizes else 0

    # Identify communities with high attack concentration
    high_attack_comms = [idx for idx, conc in attack_concentration.items()
                         if conc >= attack_threshold]
    metrics['num_attack_communities'] = len(high_attack_comms)
    metrics['attack_communities_ratio'] = (
        len(high_attack_comms) / len(communities_list) if communities_list else 0
    )

    return metrics, y_pred, attack_concentration

# Apply traditional community detection methods
if 'G' in locals():
    # Check if results file exists
    results_file = os.path.join(RESULTS_DIR, 'community_detection_results.pkl')
    if os.path.exists(results_file):
        # NOTE(security): pickle.load can execute code embedded in the file;
        # only load result caches that this notebook wrote itself.
        with open(results_file, 'rb') as f:
            all_results = pickle.load(f)
        print(f"Loaded results for {len(all_results)} methods")
    else:
        # (result key, display name, runner, draw the communities?)
        # To compare another method, add a row here instead of copying a block.
        methods_to_run = [
            ('louvain', 'Louvain', run_louvain, True),
            ('label_propagation', 'Label Propagation', run_label_propagation, False),
        ]

        all_results = {}
        for method_key, display_name, run_method, draw_communities in methods_to_run:
            print(f"\nRunning {display_name} algorithm...")
            start_time = time.time()
            method_communities, _ = run_method(G)
            elapsed = time.time() - start_time

            print(f"Found {len(set(method_communities.values()))} communities in {elapsed:.2f} seconds")
            method_metrics, method_pred, method_attack_conc = evaluate_communities(G, method_communities)
            print(f"Metrics: Accuracy={method_metrics['accuracy']:.4f}, F1={method_metrics['f1']:.4f}")

            all_results[method_key] = {
                'communities': method_communities,
                'execution_time': elapsed,
                'metrics': method_metrics,
                'predictions': method_pred,
                'attack_concentrations': method_attack_conc
            }

            if draw_communities:
                visualize_communities(G, method_communities, title=f"{display_name} Communities")

        # Save results
        with open(results_file, 'wb') as f:
            pickle.dump(all_results, f)

    # Display summary of results
    # `rows` stays a top-level name because the conclusion cell rebuilds its table from it
    print("\nCommunity Detection Results Summary:")
    rows = []
    for method_name, result in all_results.items():
        metrics = result['metrics']
        rows.append({
            'Method': method_name,
            'Num Communities': metrics['num_communities'],
            'Avg Purity': metrics['avg_purity'],
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1': metrics['f1'],
            'Attack Comm. Ratio': metrics.get('attack_communities_ratio', 0),
            'Execution Time (s)': result['execution_time']
        })

    summary_df = pd.DataFrame(rows)
    display(summary_df)

    # Plot metrics comparison (long format: one row per method/metric pair)
    plt.figure(figsize=(12, 6))
    metrics_df = summary_df.melt(
        id_vars=['Method'],
        value_vars=['Accuracy', 'Precision', 'Recall', 'F1', 'Avg Purity'],
        var_name='Metric',
        value_name='Value'
    )
    sns.barplot(x='Method', y='Value', hue='Metric', data=metrics_df)
    plt.title('Performance Metrics by Method')
    plt.ylim(0, 1)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 5. Analyze Attack-Related Communities

Let's analyze the communities to identify those with high concentrations of attack traffic.

In [None]:
def analyze_attack_communities(G, communities, attack_concentrations, threshold=0.7):
    """
    Analyze communities with high concentrations of attack traffic

    Prints the communities whose attack concentration reaches `threshold` and
    plots the mean feature profile (with standard deviation) of the five most
    concentrated ones.

    Parameters:
    -----------
    G: rustworkx.PyGraph
        Graph with features and labels
    communities: dict or list
        Community assignments (dict node -> community) or a list of node lists
    attack_concentrations: dict
        Attack concentration per community, keyed by the community's position
        in order of first appearance (the keys produced by evaluate_communities)
    threshold: float
        Threshold for considering a community attack-focused
    """
    # Identify communities with high attack concentration
    attack_comms = {comm_id: conc for comm_id, conc in attack_concentrations.items()
                    if conc >= threshold}

    if not attack_comms:
        print(f"No communities with attack concentration >= {threshold} found.")
        return

    ranked = sorted(attack_comms.items(), key=lambda item: item[1], reverse=True)
    print(f"Found {len(attack_comms)} communities with attack concentration >= {threshold}:")
    for comm_id, conc in ranked:
        print(f"  Community {comm_id}: {conc:.2f} attack concentration")

    # Rebuild the member lists in first-appearance order and address them by
    # position, because that is how attack_concentrations is keyed. Looking
    # them up by community label would miss or mismatch communities.
    if isinstance(communities, dict):
        grouped = {}
        for node, community in communities.items():
            grouped.setdefault(community, []).append(node)
        member_lists = list(grouped.values())
    else:
        member_lists = [list(community) for community in communities]
    members_by_index = dict(enumerate(member_lists))

    # Analyze characteristics of attack communities, most concentrated first
    attack_community_stats = []
    for comm_id, conc in ranked:
        if comm_id not in members_by_index:
            continue
        comm_nodes = members_by_index[comm_id]

        # Get node features for this community
        features = []
        for node in comm_nodes:
            node_data = G.get_node_data(node)
            if 'features' in node_data and isinstance(node_data['features'], (list, np.ndarray)):
                features.append(node_data['features'])

        if features:
            features = np.array(features)
            attack_community_stats.append({
                'community_id': comm_id,
                'attack_concentration': conc,
                'size': len(comm_nodes),
                'mean_features': np.mean(features, axis=0),
                'std_features': np.std(features, axis=0)
            })

    if attack_community_stats:
        # Visualize feature profiles of attack communities
        plt.figure(figsize=(14, 6))

        # Show top 5 communities; bars of different communities are offset side by side
        for i, stats in enumerate(attack_community_stats[:5]):
            plt.bar(
                np.arange(len(stats['mean_features'])) + i * 0.15,
                stats['mean_features'],
                width=0.15,
                yerr=stats['std_features'],
                alpha=0.7,
                label=f"Comm {stats['community_id']} (Atk={stats['attack_concentration']:.2f})"
            )

        plt.xlabel('Feature Index')
        plt.ylabel('Feature Value')
        plt.title('Feature Profiles of High-Attack Communities')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Analyze attack communities for the best performing method
if 'all_results' in locals() and all_results:
    # Rank the (method name, result) pairs by F1 score and keep the winner
    best_method = max(all_results.items(), key=lambda entry: entry[1]['metrics']['f1'])
    best_name, best_result = best_method
    print(f"\nAnalyzing attack communities for the best performing method: {best_name}")

    analyze_attack_communities(
        G,
        communities=best_result['communities'],
        attack_concentrations=best_result['attack_concentrations'],
    )

## 6. Conclusion and Performance Comparison

In this notebook, we've applied community detection methods to the UNSW-NB15 dataset for cybersecurity analysis. Here's a summary of our findings:

In [None]:
# Generate a comprehensive summary of all methods
if 'all_results' in locals() and all_results:
    # Rebuild the summary table from the rows collected in the results cell
    summary_df = pd.DataFrame(rows)

    # Pick the winning row for each criterion first, then report them together
    best_accuracy = summary_df.loc[summary_df['Accuracy'].idxmax()]
    best_f1 = summary_df.loc[summary_df['F1'].idxmax()]
    most_efficient = summary_df.loc[summary_df['Execution Time (s)'].idxmin()]
    best_purity = summary_df.loc[summary_df['Avg Purity'].idxmax()]
    best_attack_ratio = summary_df.loc[summary_df['Attack Comm. Ratio'].idxmax()]

    print("Key Findings:")
    print(f"1. Best method for accuracy: {best_accuracy['Method']} ({best_accuracy['Accuracy']:.4f})")
    print(f"2. Best method for F1 score: {best_f1['Method']} ({best_f1['F1']:.4f})")
    print(f"3. Most efficient method: {most_efficient['Method']} ({most_efficient['Execution Time (s)']:.4f}s)")
    print(f"4. Method with highest community purity: {best_purity['Method']} ({best_purity['Avg Purity']:.4f})")
    print(f"5. Method best at isolating attack traffic: {best_attack_ratio['Method']} ({best_attack_ratio['Attack Comm. Ratio']:.4f})")

    # Final comparison: F1 against runtime, marker area scaled by community count
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(
        summary_df['Execution Time (s)'],
        summary_df['F1'],
        s=summary_df['Num Communities'] * 20,
        alpha=0.7
    )

    # Label each point with its method name, nudged away from the marker
    for _, row in summary_df.iterrows():
        ax.annotate(
            row['Method'],
            (row['Execution Time (s)'], row['F1']),
            xytext=(5, 5),
            textcoords='offset points'
        )

    ax.set_xscale('log')
    ax.set_xlabel('Execution Time (s)')
    ax.set_ylabel('F1 Score')
    ax.set_title('Community Detection Method Performance')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, 'performance_comparison.png'))
    plt.show()

    print("\nConclusion:")
    print("Community detection methods can effectively identify network traffic patterns related to cyber attacks.")
    print(f"The {best_f1['Method']} method provided the best balance of precision and recall for identifying attack traffic.")
    print(f"The {best_purity['Method']} method created the most homogeneous communities in terms of traffic type.")
    print("These methods can be valuable tools for network security monitoring and anomaly detection.")