In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import gc
import os
import h5py

# Option 1: Add PyG classes to safe globals
torch.serialization.add_safe_globals([
    'torch_geometric.data.data.Data',
    'torch_geometric.data.data.DataEdgeAttr',
    'torch_geometric.data.storage.EdgeStorage',
    'torch_geometric.data.storage.NodeStorage'
])

# Modified load function
def load_pyg_graph(input_dir=r"C:\Users\IIT BHILAI\Desktop\Reddit SNA\cleaned_graph_data"):
    """Load the PyG graph and user mapping from disk"""
    # Load the graph data with weights_only=False or use the safer approach above
    try:
        
        # Try with safe globals (preferred approach)
        data = torch.load(os.path.join(input_dir, "cleaned_user_graph.pt"))
    except Exception as e:
        # Fallback option - less secure but will work
        print("Using fallback loading method...")
        data = torch.load(os.path.join(input_dir, "cleaned_user_graph.pt"), weights_only=False)
    
    # Load the user mapping
    user_to_idx = {}
    idx_to_user = {}
    
    with h5py.File(os.path.join(input_dir, "cleaned_user_mapping.h5"), 'r') as f:
        users = [u.decode('utf-8') for u in f['users'][:]]
        indices = f['indices'][:]
        
        for user, idx in zip(users, indices):
            user_to_idx[user] = idx
            idx_to_user[idx] = user
    
    return data, user_to_idx, idx_to_user

In [2]:
data, user_to_idx, idx_to_user = load_pyg_graph()

Using fallback loading method...


In [3]:
import torch
import torch_geometric as pyg
from torch_geometric.utils import homophily, k_hop_subgraph
from torch_geometric.utils import degree, to_scipy_sparse_matrix
from torch_geometric.transforms import AddLaplacianEigenvectorPE
import matplotlib.pyplot as plt
import numpy as np
from scipy import sparse
import scipy.stats as stats
from tqdm import tqdm
import pandas as pd
from collections import Counter
import community.community_louvain as community_louvain
import networkx as nx  # Still needed for some algorithms but used minimally
import torch_geometric.nn as pyg_nn
# from torch_sparse import coalesce
from torch_geometric.utils import remove_self_loops
# import torch_cluster

# 1. Basic Graph Analysis

def compute_basic_stats(data):
    """Compute basic statistics without using NetworkX"""
    print(f"Number of nodes: {data.num_nodes}")
    print(f"Number of edges: {data.num_edges // 2} (undirected)")
    
    # Calculate degree distribution
    row, col = data.edge_index
    degrees = degree(row, num_nodes=data.num_nodes)
    
    # Average degree
    avg_degree = degrees.float().mean().item()
    print(f"Average degree: {avg_degree:.2f}")
    
    # Max degree and node with max degree
    max_degree = degrees.max().item()
    max_degree_node = degrees.argmax().item()
    print(f"Max degree: {max_degree} (Node {max_degree_node}: {idx_to_user[max_degree_node]})")
    
    # Degree distribution statistics
    degree_values = degrees.numpy()
    print(f"Degree distribution stats: Min={degree_values.min()}, Median={np.median(degree_values)}, Mean={degree_values.mean():.2f}")
    
    # Calculate degree percentiles
    percentiles = [25, 50, 75, 90, 95, 99]
    degree_percentiles = np.percentile(degree_values, percentiles)
    for p, val in zip(percentiles, degree_percentiles):
        print(f"{p}th percentile: {val:.2f}")
    
    return degrees


def find_influential_users(data, degrees, user_mapping, top_n=20):
    """Find the most influential users in the graph"""
    idx_to_user = user_mapping
    
    # Get top users by degree
    top_indices = torch.argsort(degrees, descending=True)[:top_n].tolist()
    
    print(f"\nTop {top_n} users by connections:")
    for i, idx in enumerate(top_indices):
        print(f"{i+1}. {idx_to_user[idx]}: {degrees[idx].item()} connections")
    
    return top_indices


def extremely_safe_analyze_degrees(degrees, chunk_size=10000):
    """
    Extremely safe degree analysis that processes data in small chunks
    to avoid memory issues
    """
    print("Starting extremely safe degree analysis...")
    
    # Convert tensor to list in chunks to avoid memory issues
    degree_list = []
    total_nodes = degrees.shape[0]
    
    for i in range(0, total_nodes, chunk_size):
        end = min(i + chunk_size, total_nodes)
        chunk = degrees[i:end].tolist()
        degree_list.extend(chunk)
        print(f"Processed chunk {i//chunk_size + 1}/{(total_nodes + chunk_size - 1)//chunk_size}")
        # Force garbage collection after each chunk
        gc.collect()
    
    print("Basic statistics calculation...")
    # Basic statistics - calculated manually to avoid numpy memory issues
    min_degree = min(degree_list)
    max_degree = max(degree_list)
    
    sum_degrees = 0
    for d in degree_list:
        sum_degrees += d
    mean_degree = sum_degrees / len(degree_list)
    
    # Print results
    print(f"Total nodes: {len(degree_list)}")
    print(f"Min degree: {min_degree}")
    print(f"Max degree: {max_degree}")
    print(f"Mean degree: {mean_degree:.2f}")
    
    # Count degrees in buckets to create a simple histogram without plotting
    print("Creating degree frequency table...")
    degree_counts = {}
    
    # Process in chunks to avoid memory issues
    for i in range(0, len(degree_list), chunk_size):
        chunk = degree_list[i:min(i+chunk_size, len(degree_list))]
        for d in chunk:
            if d in degree_counts:
                degree_counts[d] += 1
            else:
                degree_counts[d] = 1
        print(f"Processed count chunk {i//chunk_size + 1}/{(len(degree_list) + chunk_size - 1)//chunk_size}")
        gc.collect()
    
    # Save degree counts to a text file
    print("Saving degree counts to file...")
    with open("degree_counts.txt", "w") as f:
        f.write("Degree,Count\n")
        for d in sorted(degree_counts.keys()):
            f.write(f"{d},{degree_counts[d]}\n")
    
    print("Degree count data saved to degree_counts.txt")
    print("You can use this file later to create visualizations without memory issues")
    
    # Return simple summary stats
    return {
        "nodes": len(degree_list),
        "min_degree": min_degree,
        "max_degree": max_degree,
        "mean_degree": mean_degree,
    }



In [4]:
def visualize_random_subgraph(data, num_nodes=500, seed=42):
    """
    Sample a random subgraph for visualization
    """
    np.random.seed(seed)
    
    # Sample random nodes
    sampled_nodes = np.random.choice(data.num_nodes, size=min(num_nodes, data.num_nodes), replace=False)
    sampled_nodes = torch.tensor(sampled_nodes)
    
    # Extract the subgraph
    row, col = data.edge_index
    mask = torch.isin(row, sampled_nodes) & torch.isin(col, sampled_nodes)
    sub_edge_index = data.edge_index[:, mask]
    
    # Create node mapping
    node_map = {int(node): i for i, node in enumerate(sampled_nodes)}
    
    # Relabel nodes
    sub_row = torch.tensor([node_map[int(node)] for node in sub_edge_index[0]])
    sub_col = torch.tensor([node_map[int(node)] for node in sub_edge_index[1]])
    sub_edge_index = torch.stack([sub_row, sub_col])
    
    # Calculate node degrees
    sub_degrees = degree(sub_edge_index[0], num_nodes=len(sampled_nodes))
    
    # Create a simple NetworkX graph for layout
    G = nx.Graph()
    for i in range(len(sampled_nodes)):
        G.add_node(i)
    for i in range(sub_edge_index.shape[1]):
        G.add_edge(sub_edge_index[0, i].item(), sub_edge_index[1, i].item())
    
    # Calculate layout
    pos = nx.spring_layout(G, seed=seed)
    
    # Plot
    plt.figure(figsize=(12, 12))
    
    # Plot edges
    for i, j in zip(sub_edge_index[0], sub_edge_index[1]):
        i, j = i.item(), j.item()
        plt.plot([pos[i][0], pos[j][0]], [pos[i][1], pos[j][1]], 'k-', alpha=0.2, linewidth=0.5)
    
    # Plot nodes
    node_sizes = 10 + 50 * (sub_degrees / max(sub_degrees.max(), 1))
    for i in range(len(sampled_nodes)):
        plt.scatter(pos[i][0], pos[i][1], s=node_sizes[i].item(), 
                    c=plt.cm.viridis(float(sub_degrees[i])/max(sub_degrees.max(), 1)), 
                    alpha=0.7)
    
    plt.title(f'Random Subgraph with {len(sampled_nodes)} Nodes')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('output_123/random_subgraph.png', dpi=300)
    plt.close()  # Close to free memory
    print("Random subgraph visualization saved to 'random_subgraph.png'")

In [5]:
def analyze_homophily(data, node_features, feature_name="feature"):
    """
    Analyze homophily in the graph with respect to a node feature
    """
    h_score = homophily(data.edge_index, node_features, method='edge')
    print(f"Homophily score for {feature_name}: {h_score:.4f}")
    
    # Calculate feature distribution
    unique_values, counts = torch.unique(node_features, return_counts=True)
    
    # Plot distribution
    plt.figure(figsize=(10, 6))
    plt.bar(unique_values.numpy(), counts.numpy())
    plt.xlabel(feature_name)
    plt.ylabel('Count')
    plt.title(f'Distribution of {feature_name} Values')
    plt.savefig(f'{feature_name}_distribution.png', dpi=300)
    plt.close()  # Close to free memory
    
    return h_score

In [6]:
def analyze_and_visualize_graph(data, user_mapping=None):
    """
    Full analysis pipeline with memory-efficient visualization
    """
    # Import gc to help with garbage collection
    import gc
    
    # Compute degrees (this is required for most analyses)
    print("Computing node degrees...")
    row, col = data.edge_index
    degrees = degree(row, num_nodes=data.num_nodes)
    
    # Save degrees to a file for later use
    print("Saving degree data to file...")
    with open("degree_data.txt", "w") as f:
        for i, d in enumerate(degrees):
            f.write(f"{i},{d.item()}\n")
    
    # Create degree counts file (for plotting later) - without using bincount
    print("Creating degree counts file...")
    # Convert to int if it's float
    if degrees.dtype.is_floating_point:
        degrees_int = degrees.long()
    else:
        degrees_int = degrees
        
    # Count frequencies manually to avoid memory issues
    degree_count_dict = {}
    for d in degrees_int:
        d_val = d.item()
        if d_val in degree_count_dict:
            degree_count_dict[d_val] += 1
        else:
            degree_count_dict[d_val] = 1
    
    # Write to file
    with open("degree_counts.txt", "w") as f:
        f.write("Degree,Count\n")
        for d in sorted(degree_count_dict.keys()):
            f.write(f"{d},{degree_count_dict[d]}\n")
    
    # Plot degree distribution from file (memory efficient)
    print("Plotting degree distribution...")
    plot_degree_distribution("degree_counts.txt", log_scale=True)
    
    # Force garbage collection
    gc.collect()
    
    # Plot CCDF
    print("Plotting CCDF...")
    plot_ccdf(degrees)
    gc.collect()
    
    # Get top users if user mapping exists
    if user_mapping:
        top_users = find_influential_users(data, degrees, user_mapping, top_n=20)
        
        # Visualize ego network of the most connected user
        print("Visualizing ego network of the most connected user...")
        visualize_ego_network(data, top_users[0], hops=1)
        gc.collect()
    
    # Visualize a random subgraph (very memory efficient)
    print("Visualizing random subgraph...")
    visualize_random_subgraph(data, num_nodes=500)
    gc.collect()
    
    # Calculate connected components (this can be memory intensive for very large graphs)
    print("Calculating connected components...")
    try:
        visualize_connected_components_sizes(data)
    except MemoryError:
        print("Memory error when calculating connected components - skipping this step")
    
    gc.collect()
    
    print("Analysis complete!")

In [7]:
# 2. Community Detection and Network Structure

def detect_communities(data, method="louvain", resolution=1.0, max_nodes=100000):
    """Detect communities in the graph using different methods"""
    print("\nPerforming community detection...")
    start_time = time.time()
    
    # For very large graphs, we might need to sample
    if data.num_nodes > max_nodes:
        print(f"Graph is large, sampling {max_nodes} nodes for community detection...")
        sample_start = time.time()
        # Sample nodes (prefers higher degree nodes with some randomness)
        degrees = degree(data.edge_index[0], num_nodes=data.num_nodes)
        probs = degrees.float() / degrees.sum()
        sampled_nodes = torch.multinomial(probs, max_nodes, replacement=False)
        print(f"Sampled {len(sampled_nodes)} nodes in {time.time() - sample_start:.2f} seconds")
        
        # Extract subgraph
        print("Extracting subgraph...")
        subgraph_start = time.time()
        subset, edge_index, _, _ = k_hop_subgraph(
            node_idx=sampled_nodes, 
            num_hops=1,
            edge_index=data.edge_index, 
            relabel_nodes=True,
            num_nodes=data.num_nodes
        )
        print(f"Subgraph extraction completed in {time.time() - subgraph_start:.2f} seconds")
        print(f"Subgraph has {len(subset)} nodes and {edge_index.shape[1]} edges")
        
        # Create a new data object for the subgraph
        sub_data = pyg.data.Data(edge_index=edge_index, num_nodes=len(subset))
    else:
        print(f"Using full graph with {data.num_nodes} nodes and {data.edge_index.shape[1]} edges")
        sub_data = data
        subset = torch.arange(data.num_nodes)
    
    # Convert to scipy sparse matrix for community detection
    print("Converting to scipy sparse matrix...")
    sparse_start = time.time()
    adj_sparse = to_scipy_sparse_matrix(sub_data.edge_index, num_nodes=sub_data.num_nodes)
    print(f"Conversion to sparse matrix completed in {time.time() - sparse_start:.2f} seconds")
    
    # Convert to networkx for community detection algorithms
    print("Converting to networkx graph...")
    nx_start = time.time()
    G = nx.from_scipy_sparse_array(adj_sparse)
    print(f"Conversion to networkx completed in {time.time() - nx_start:.2f} seconds")
    print(f"NetworkX graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    
    # Perform community detection
    print(f"Starting {method} community detection with resolution={resolution}...")
    comm_start = time.time()
    
    if method == "louvain":
        # Use python-louvain for community detection
        print("Running Louvain algorithm...")
        partition = community_louvain.best_partition(G, resolution=resolution)
        print(f"Louvain completed in {time.time() - comm_start:.2f} seconds")
        
        communities = {}
        for node, comm_id in partition.items():
            if comm_id not in communities:
                communities[comm_id] = []
            communities[comm_id].append(node)
        
    elif method == "label_propagation":
        # Label propagation is faster but less accurate
        print("Running Label Propagation algorithm...")
        communities = {i: list(c) for i, c in enumerate(nx.algorithms.community.label_propagation_communities(G))}
        print(f"Label Propagation completed in {time.time() - comm_start:.2f} seconds")
    else:
        raise ValueError(f"Unknown community detection method: {method}")
    
    # Map back to original node indices if we sampled
    if data.num_nodes > max_nodes:
        print("Mapping sampled nodes back to original indices...")
        map_start = time.time()
        orig_communities = {}
        for comm_id, nodes in communities.items():
            orig_communities[comm_id] = [subset[node].item() for node in nodes]
        communities = orig_communities
        print(f"Mapping back completed in {time.time() - map_start:.2f} seconds")
    
    # Print community statistics
    sizes = [len(nodes) for nodes in communities.values()]
    print(f"Found {len(communities)} communities")
    print(f"Average community size: {np.mean(sizes):.2f}")
    print(f"Largest community size: {max(sizes)}")
    
    # Analyze top communities
    top_communities = sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)[:5]
    print("\nTop 5 communities by size:")
    for i, (comm_id, nodes) in enumerate(top_communities):
        print(f"Community {i+1}: {len(nodes)} nodes")
    
    print(f"Total community detection completed in {time.time() - start_time:.2f} seconds")
    return communities, top_communities


def analyze_top_communities(communities, top_communities, data, idx_to_user, degrees, top_n=5):
    """Analyze the characteristics of top communities"""
    print("\nAnalyzing top communities...")
    start_time = time.time()
    
    for i, (comm_id, nodes) in enumerate(top_communities[:top_n]):
        print(f"\nAnalyzing Community {i+1} ({len(nodes)} nodes)...")
        comm_start = time.time()
        
        # Get top users in this community by degree
        print("  Identifying top users by degree...")
        user_start = time.time()
        community_degrees = [(node, degrees[node].item()) for node in nodes]
        top_users = sorted(community_degrees, key=lambda x: x[1], reverse=True)[:10]
        print(f"  Top users identified in {time.time() - user_start:.2f} seconds")
        
        print("  Top users in this community:")
        for j, (node, deg) in enumerate(top_users):
            print(f"    {j+1}. {idx_to_user[node]} (degree: {deg})")
        
        # Calculate average degree within community
        print("  Calculating average degree...")
        avg_start = time.time()
        community_avg_degree = np.mean([degrees[node].item() for node in nodes])
        print(f"  Average degree in community: {community_avg_degree:.2f}")
        print(f"  Average degree calculated in {time.time() - avg_start:.2f} seconds")
        
        # Calculate density (edges within community / possible edges)
        if len(nodes) > 1:
            print("  Calculating community density...")
            density_start = time.time()
            
            # Get subgraph
            node_tensor = torch.tensor(nodes)
            row, col = data.edge_index
            print("    Finding internal edges...")
            mask = torch.isin(row, node_tensor) & torch.isin(col, node_tensor)
            internal_edges = mask.sum().item() // 2  # divide by 2 for undirected
            possible_edges = len(nodes) * (len(nodes) - 1) // 2
            density = internal_edges / possible_edges
            print(f"  Community density: {density:.5f}")
            print(f"  Density calculated in {time.time() - density_start:.2f} seconds")
        else:
            print("  Community has only one node")
        
        print(f"Community {i+1} analysis completed in {time.time() - comm_start:.2f} seconds")
    
    print(f"Total community analysis completed in {time.time() - start_time:.2f} seconds")

# Add import at the top of your file
import time

In [8]:
# 3. Node Embeddings and Advanced Analysis

def compute_node_embeddings(data, dim=128, num_iterations=10):
    """Compute node embeddings using Node2Vec"""
    print("\nComputing node embeddings...")
    
    # Remove self loops if any exist
    edge_index, _ = remove_self_loops(data.edge_index)
    
    # Use torch_geometric's Node2Vec
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    
    # Create model
    model = pyg_nn.models.Node2Vec(
        edge_index=edge_index,
        embedding_dim=dim,
        walk_length=10,
        context_size=5,
        walks_per_node=5,
        num_negative_samples=1,
        sparse=True
    ).to(device)
    
    # Train model
    loader = model.loader(batch_size=128, shuffle=True)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)
    
    model.train()
    for _ in tqdm(range(num_iterations), desc="Training Node2Vec"):
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # print(f'Loss: {total_loss / len(loader):.4f}')
    
    # Get embeddings
    model.eval()
    with torch.no_grad():
        embeddings = model()
    
    return embeddings.cpu()


def visualize_embeddings(embeddings, nodes_to_highlight=None, labels=None, method='tsne'):
    """Visualize node embeddings using t-SNE or UMAP"""
    print(f"\nVisualizing embeddings using {method}...")
    
    # If more than 10,000 nodes, sample
    if embeddings.shape[0] > 10000:
        sample_size = 10000
        indices = torch.randperm(embeddings.shape[0])[:sample_size]
        sample_embeddings = embeddings[indices]
        
        # Adjust highlight indices if provided
        if nodes_to_highlight is not None:
            # Keep only those in the sample
            mask = torch.isin(indices, torch.tensor(nodes_to_highlight))
            highlight_indices = torch.where(mask)[0]
            if len(highlight_indices) == 0:
                # Force include some highlights if none were sampled
                sample_size = 9900
                indices = torch.randperm(embeddings.shape[0])[:sample_size]
                sample_indices = torch.cat([indices, torch.tensor(nodes_to_highlight[:100])])
                sample_embeddings = embeddings[sample_indices]
                highlight_indices = torch.arange(sample_size, sample_size + min(100, len(nodes_to_highlight)))
        else:
            highlight_indices = None
            
        # Adjust labels if provided
        if labels is not None:
            sample_labels = [labels[i.item()] for i in indices]
        else:
            sample_labels = None
    else:
        sample_embeddings = embeddings
        highlight_indices = nodes_to_highlight
        sample_labels = labels
    
    # Choose dimensionality reduction method
    if method == 'tsne':
        from sklearn.manifold import TSNE
        reducer = TSNE(n_components=2, random_state=42)
    elif method == 'umap':
        try:
            import umap
            reducer = umap.UMAP(random_state=42)
        except ImportError:
            print("UMAP not installed, falling back to t-SNE")
            from sklearn.manifold import TSNE
            reducer = TSNE(n_components=2, random_state=42)
    else:
        raise ValueError(f"Unknown visualization method: {method}")
    
    # Apply dimensionality reduction
    reduced = reducer.fit_transform(sample_embeddings.numpy())
    
    # Plot
    plt.figure(figsize=(12, 10))
    
    # Plot all points first
    if sample_labels is not None:
        unique_labels = sorted(set(sample_labels))
        colors = plt.cm.tab20(np.linspace(0, 1, len(unique_labels)))
        label_to_color = {label: color for label, color in zip(unique_labels, colors)}
        
        for label in unique_labels:
            mask = [l == label for l in sample_labels]
            plt.scatter(reduced[mask, 0], reduced[mask, 1], s=5, alpha=0.5, 
                       color=label_to_color[label], label=f"Community {label}")
        plt.legend(markerscale=2)
    else:
        plt.scatter(reduced[:, 0], reduced[:, 1], s=5, alpha=0.3, color='blue')
    
    # Highlight specific nodes if provided
    if highlight_indices is not None:
        plt.scatter(reduced[highlight_indices, 0], reduced[highlight_indices, 1], 
                   s=30, color='red', marker='x')
    
    plt.title(f"Node Embeddings Visualization ({method.upper()})")
    plt.tight_layout()
    plt.savefig(f'node_embeddings_{method}.png', dpi=300)
    plt.show()



In [9]:
# 4. Path Analysis and Structural Properties

def find_shortest_paths(data, source_nodes, target_nodes, max_nodes=1000):
    """Find shortest paths between source and target nodes"""
    # For large graphs, we need to limit the search
    if data.num_nodes > max_nodes:
        # Use bidirectional search or approximation methods
        print("Graph too large for exhaustive shortest path search")
        print("Computing approximate shortest paths...")
        
        # Convert to scipy sparse matrix for efficient path computation
        adj = to_scipy_sparse_matrix(data.edge_index, num_nodes=data.num_nodes)
        
        results = []
        for source in source_nodes:
            for target in target_nodes:
                if source == target:
                    results.append((source, target, 0, [source]))
                    continue
                    
                # Compute shortest path length using scipy sparse
                dist = sparse.csgraph.shortest_path(adj, directed=False, indices=[source])
                distance = dist[0, target]
                
                if np.isfinite(distance):
                    # We found a path, but we don't have the full path
                    # For large graphs, we approximate
                    results.append((source, target, int(distance), 
                                    [source, '...', target]))
                else:
                    results.append((source, target, float('inf'), []))
        
        return results
    else:
        # Use networkx for smaller graphs to get actual paths
        G = nx.Graph()
        edge_index = data.edge_index.numpy()
        edges = list(zip(edge_index[0], edge_index[1]))
        G.add_edges_from(edges)
        
        results = []
        for source in source_nodes:
            for target in target_nodes:
                try:
                    path = nx.shortest_path(G, source=source, target=target)
                    results.append((source, target, len(path)-1, path))
                except nx.NetworkXNoPath:
                    results.append((source, target, float('inf'), []))
        
        return results


def compute_structural_properties(data):
    """Compute structural properties of the graph"""
    print("\nComputing structural properties...")
    
    # Create edge weights tensor if not present
    if not hasattr(data, 'edge_attr') or data.edge_attr is None:
        data.edge_attr = torch.ones(data.edge_index.size(1))
    
    # Convert to scipy sparse matrix
    adj = to_scipy_sparse_matrix(data.edge_index, edge_attr=data.edge_attr, num_nodes=data.num_nodes)
    
    # Compute average clustering coefficient for a sample of nodes
    if data.num_nodes > 1000:
        # Sample nodes for clustering coefficient calculation
        sample_size = min(1000, data.num_nodes)
        sample_nodes = np.random.choice(data.num_nodes, size=sample_size, replace=False)
        
        # Convert to networkx for clustering coefficient calculation
        G = nx.Graph()
        edge_index = data.edge_index.numpy()
        edges = list(zip(edge_index[0], edge_index[1]))
        G.add_edges_from(edges)
        
        clustering_coefficients = []
        for node in tqdm(sample_nodes, desc="Computing clustering coefficients"):
            cc = nx.clustering(G, node)
            clustering_coefficients.append(cc)
        
        avg_clustering = np.mean(clustering_coefficients)
        print(f"Average clustering coefficient (sampled): {avg_clustering:.4f}")
    else:
        G = nx.Graph()
        edge_index = data.edge_index.numpy()
        edges = list(zip(edge_index[0], edge_index[1]))
        G.add_edges_from(edges)
        avg_clustering = nx.average_clustering(G)
        print(f"Average clustering coefficient: {avg_clustering:.4f}")
    
    # Compute graph diameter (or estimate for large graphs)
    if data.num_nodes > 1000:
        print("Graph too large for exact diameter computation")
        print("Estimating effective diameter...")
        
        # Sample some source nodes for BFS
        sample_size = min(20, data.num_nodes)
        sample_nodes = np.random.choice(data.num_nodes, size=sample_size, replace=False)
        
        max_distances = []
        for source in tqdm(sample_nodes, desc="Estimating diameter"):
            # Run BFS from source
            dist = sparse.csgraph.shortest_path(adj, directed=False, indices=[source])
            # Get maximum finite distance
            finite_distances = dist[np.isfinite(dist)]
            if len(finite_distances) > 0:
                max_distances.append(np.max(finite_distances))
        
        if max_distances:
            estimated_diameter = np.max(max_distances)
            print(f"Estimated graph diameter: {estimated_diameter:.1f}")
        else:
            print("Could not estimate diameter - graph may be disconnected")
    else:
        try:
            diameter = nx.diameter(G)
            print(f"Graph diameter: {diameter}")
        except nx.NetworkXError:
            print("Graph is not connected, diameter undefined")
    
    # Compute assortativity coefficient
    if data.num_nodes <= 10000:
        try:
            degrees = dict(G.degree())
            assortativity = nx.degree_assortativity_coefficient(G)
            print(f"Degree assortativity coefficient: {assortativity:.4f}")
            if assortativity > 0:
                print("Positive assortativity: nodes tend to connect with similar-degree nodes")
            else:
                print("Negative assortativity: nodes tend to connect with different-degree nodes")
        except:
            print("Could not compute assortativity coefficient")
    
    return {
        'avg_clustering': avg_clustering,
        'estimated_diameter': estimated_diameter if 'estimated_diameter' in locals() else None
    }




In [10]:
# 5. User Similarity and Recommendation

def compute_user_similarities(embeddings, user_ids, top_k=10):
    """Compute pairwise similarities between specified users based on embeddings"""
    user_embeddings = embeddings[user_ids]
    
    # Normalize embeddings for cosine similarity
    norm_embeddings = user_embeddings / user_embeddings.norm(dim=1, keepdim=True)
    
    # Compute pairwise similarities
    similarities = torch.mm(norm_embeddings, norm_embeddings.t())
    
    # Set self-similarities to 0 to find non-identical matches
    similarities[torch.eye(len(user_ids), dtype=bool)] = 0
    
    # Get top_k similar users for each user
    top_similarities, top_indices = similarities.topk(top_k, dim=1)
    
    return top_similarities, top_indices


def find_similar_users(embeddings, query_user_idx, idx_to_user, top_k=10):
    """Find users similar to the query user based on embeddings"""
    # Get query user embedding
    query_embedding = embeddings[query_user_idx].unsqueeze(0)
    
    # Normalize for cosine similarity
    query_norm = query_embedding / query_embedding.norm()
    all_norm = embeddings / embeddings.norm(dim=1, keepdim=True)
    
    # Compute similarities
    similarities = torch.mm(query_norm, all_norm.t()).squeeze()
    
    # Set self-similarity to -1 to exclude from results
    similarities[query_user_idx] = -1
    
    # Get top_k similar users
    top_similarities, top_indices = similarities.topk(top_k)
    
    # Print results
    print(f"\nUsers most similar to {idx_to_user[query_user_idx]}:")
    for i, (idx, score) in enumerate(zip(top_indices, top_similarities)):
        print(f"{i+1}. {idx_to_user[idx.item()]} (similarity: {score.item():.4f})")
    
    return top_indices.numpy(), top_similarities.numpy()


def recommend_connections(data, query_user_idx, embeddings, idx_to_user, top_k=10):
    """Recommend new connections for a user based on embeddings and graph structure"""
    # Get query user's current neighbors
    row, col = data.edge_index
    mask = (row == query_user_idx)
    neighbors = col[mask].tolist()
    
    # Get node embeddings
    query_embedding = embeddings[query_user_idx].unsqueeze(0)
    
    # Normalize for cosine similarity
    query_norm = query_embedding / query_embedding.norm()
    all_norm = embeddings / embeddings.norm(dim=1, keepdim=True)
    
    # Compute similarities
    similarities = torch.mm(query_norm, all_norm.t()).squeeze()
    
    # Set scores to -1 for the query user and existing neighbors
    mask = torch.ones(data.num_nodes, dtype=torch.bool)
    mask[query_user_idx] = False
    mask[neighbors] = False
    
    # Only consider nodes that aren't already neighbors
    similarities[~mask] = -1
    
    # Get top_k recommendations
    top_similarities, top_indices = similarities.topk(top_k)
    
    # Print results
    print(f"\nRecommended connections for {idx_to_user[query_user_idx]}:")
    for i, (idx, score) in enumerate(zip(top_indices, top_similarities)):
        print(f"{i+1}. {idx_to_user[idx.item()]} (similarity: {score.item():.4f})")
    
    return top_indices.numpy(), top_similarities.numpy()




In [11]:
# 6. Main Analysis Function

def analyze_large_reddit_graph(data, user_to_idx, idx_to_user, compute_embeddings=True):
    """Main function to analyze a large Reddit interaction graph"""
    print("Starting analysis of large Reddit interaction graph...")
    
    # 1. Basic graph analysis
    degrees = compute_basic_stats(data)
    top_users = find_influential_users(data, degrees, idx_to_user)
    degree_counts = analyze_degree_distribution(degrees)
    
    # 2. Community detection
    communities, top_communities = detect_communities(data, method="louvain", max_nodes=100000)
    analyze_top_communities(communities, top_communities, data, idx_to_user, degrees)
    
    # 3. Node embeddings (optional - can be computationally intensive)
    if compute_embeddings:
        embeddings = compute_node_embeddings(data, dim=128, num_iterations=5)
        
        # Create community labels for visualization
        sample_size = min(10000, data.num_nodes)
        sample_indices = torch.randperm(data.num_nodes)[:sample_size]
        
        # Map sampled nodes to communities
        sample_community_labels = []
        for idx in sample_indices:
            idx_item = idx.item()
            found = False
            for comm_id, nodes in communities.items():
                if idx_item in nodes:
                    sample_community_labels.append(comm_id)
                    found = True
                    break
            if not found:
                sample_community_labels.append(-1)  # No community
        
        # Visualize embeddings with community colors
        visualize_embeddings(embeddings, nodes_to_highlight=top_users, 
                            labels=sample_community_labels, method='tsne')
        
        # Find similar users to a few top users
        for user_idx in top_users[:3]:
            find_similar_users(embeddings, user_idx, idx_to_user, top_k=5)
            recommend_connections(data, user_idx, embeddings, idx_to_user, top_k=5)
    
    # 4. Structural properties
    structural_props = compute_structural_properties(data)
    
    print("\nAnalysis complete!")
    return {
        'degrees': degrees,
        'communities': communities,
        'structural_properties': structural_props,
        'embeddings': embeddings if compute_embeddings else None
    }


In [12]:
degrees = compute_basic_stats(data)


Number of nodes: 532659
Number of edges: 28462425 (undirected)
Average degree: 106.87
Max degree: 57911.0 (Node 461542: u/charavaka)
Degree distribution stats: Min=0.0, Median=0.0, Mean=106.87
25th percentile: 0.00
50th percentile: 0.00
75th percentile: 10.00
90th percentile: 130.00
95th percentile: 428.00
99th percentile: 2244.00


In [13]:
top_users = find_influential_users(data, degrees, idx_to_user)


Top 20 users by connections:
1. u/charavaka: 57911.0 connections
2. u/svmk1987: 46140.0 connections
3. u/whatsthebigdeal: 41992.0 connections
4. u/siriusleesam: 40919.0 connections
5. u/bhodrolok: 38602.0 connections
6. u/ivarun: 36846.0 connections
7. u/critfin: 36551.0 connections
8. u/mrfreeze2000: 36454.0 connections
9. u/uncertn_laaife: 35347.0 connections
10. u/anthonygonsalvez: 34689.0 connections
11. u/platinumgus18: 34394.0 connections
12. u/froogler: 33118.0 connections
13. u/-judeanpeoplesfront-: 32691.0 connections
14. u/viksi: 32678.0 connections
15. u/kash_if: 32123.0 connections
16. u/moojo: 30623.0 connections
17. u/tool_of_justice: 30442.0 connections
18. u/wanderingmind: 30415.0 connections
19. u/piezod: 29416.0 connections
20. u/evereddy: 28858.0 connections


In [14]:
# Call the extremely safe function
try:
    print("\n=== PERFORMING EXTREMELY SAFE DEGREE ANALYSIS ===\n")
    basic_stats = extremely_safe_analyze_degrees(degrees)
    print(f"\nDegree analysis completed successfully")
    print(f"Summary: {basic_stats}")
except Exception as e:
    print(f"Error during analysis: {str(e)}")


=== PERFORMING EXTREMELY SAFE DEGREE ANALYSIS ===

Starting extremely safe degree analysis...
Processed chunk 1/54
Processed chunk 2/54
Processed chunk 3/54
Processed chunk 4/54
Processed chunk 5/54
Processed chunk 6/54
Processed chunk 7/54
Processed chunk 8/54
Processed chunk 9/54
Processed chunk 10/54
Processed chunk 11/54
Processed chunk 12/54
Processed chunk 13/54
Processed chunk 14/54
Processed chunk 15/54
Processed chunk 16/54
Processed chunk 17/54
Processed chunk 18/54
Processed chunk 19/54
Processed chunk 20/54
Processed chunk 21/54
Processed chunk 22/54
Processed chunk 23/54
Processed chunk 24/54
Processed chunk 25/54
Processed chunk 26/54
Processed chunk 27/54
Processed chunk 28/54
Processed chunk 29/54
Processed chunk 30/54
Processed chunk 31/54
Processed chunk 32/54
Processed chunk 33/54
Processed chunk 34/54
Processed chunk 35/54
Processed chunk 36/54
Processed chunk 37/54
Processed chunk 38/54
Processed chunk 39/54
Processed chunk 40/54
Processed chunk 41/54
Processed ch

In [17]:
import torch
import matplotlib.pyplot as plt
import numpy as np
from torch_geometric.utils import degree, k_hop_subgraph, to_scipy_sparse_matrix
import networkx as nx
from scipy import sparse
import pandas as pd
import gc

def save_degrees_to_file(data, filename="degree_data.txt"):
    """Save degree information to file in a memory-efficient way"""
    print("Computing and saving node degrees...")
    row, col = data.edge_index
    
    # Process in chunks to avoid memory issues
    chunk_size = 1000000  # Adjust based on available memory
    total_nodes = data.num_nodes
    
    with open(filename, "w") as f:
        f.write("NodeID,Degree\n")
        
        for start in range(0, total_nodes, chunk_size):
            end = min(start + chunk_size, total_nodes)
            print(f"Processing nodes {start} to {end-1}...")
            
            # Calculate degrees for this chunk of nodes
            node_indices = torch.arange(start, end)
            chunk_degrees = torch.zeros(end - start, dtype=torch.long)
            
            # Count edges for each node in this chunk
            mask = (row >= start) & (row < end)
            chunk_row = row[mask] - start
            for idx in chunk_row:
                chunk_degrees[idx] += 1
            
            # Write to file
            for i, d in enumerate(chunk_degrees):
                node_id = start + i
                f.write(f"{node_id},{d.item()}\n")
            
            # Force garbage collection
            del chunk_degrees, mask, chunk_row
            gc.collect()
    
    print(f"Degree data saved to {filename}")
    return filename

def create_degree_counts_file(degree_file="degree_data.txt", output_file="degree_counts.txt"):
    """Create a frequency count of degrees from the degree data file"""
    print("Creating degree counts file...")
    
    # Count frequencies from file to avoid memory issues
    degree_counts = {}
    
    # Read the file in chunks
    chunk_size = 100000  # Adjust based on available memory
    chunk_iter = pd.read_csv(degree_file, chunksize=chunk_size)
    
    for chunk in chunk_iter:
        degrees = chunk['Degree'].values
        for d in degrees:
            if d in degree_counts:
                degree_counts[d] += 1
            else:
                degree_counts[d] = 1
        
        # Force garbage collection
        del degrees
        gc.collect()
    
    # Write counts to file
    with open(output_file, "w") as f:
        f.write("Degree,Count\n")
        for d in sorted(degree_counts.keys()):
            f.write(f"{d},{degree_counts[d]}\n")
    
    print(f"Degree counts saved to {output_file}")
    return output_file

def plot_degree_ccdf(counts_file="degree_counts.txt"):
    """
    Plot the Complementary Cumulative Distribution Function (CCDF)
    from the counts file, which is more memory efficient than using raw degrees
    """
    print("Plotting CCDF...")
    # Load the degree counts from file
    df = pd.read_csv(counts_file)
    degrees = df['Degree'].values
    counts = df['Count'].values
    
    # Calculate total number of nodes
    total_nodes = np.sum(counts)
    
    # Calculate CCDF points
    unique_degrees = np.sort(degrees)
    ccdf_values = np.zeros_like(unique_degrees, dtype=float)
    
    # For each degree value, sum up all counts for degrees >= that value
    for i, d in enumerate(unique_degrees):
        ccdf_values[i] = np.sum(counts[degrees >= d]) / total_nodes
    
    plt.figure(figsize=(10, 6))
    plt.loglog(unique_degrees, ccdf_values, 'bo', markersize=2, alpha=0.5)
    plt.xlabel('Degree (log scale)')
    plt.ylabel('CCDF: P(X ≥ x) (log scale)')
    plt.title('Degree CCDF (Complementary Cumulative Distribution Function)')
    plt.grid(True, alpha=0.3)
    plt.savefig('output_123/degree_ccdf.png', dpi=300)
    plt.close()  # Close to free memory
    print("CCDF plot saved to 'degree_ccdf.png'")

def find_top_nodes(degree_file="degree_data.txt", top_n=20, user_mapping=None):
    """Find the top N nodes by degree in a memory-efficient way"""
    print(f"Finding top {top_n} nodes by degree...")
    
    # Use pandas to read the file and find top nodes
    # This is more memory efficient than loading all degrees into memory
    top_df = pd.read_csv(degree_file).nlargest(top_n, 'Degree')
    
    print(f"\nTop {top_n} nodes by connections:")
    for i, (idx, row) in enumerate(top_df.iterrows()):
        node_id = row['NodeID']
        degree_val = row['Degree']
        
        if user_mapping and node_id in user_mapping:
            print(f"{i+1}. Node {node_id} ({user_mapping[node_id]}): {degree_val} connections")
        else:
            print(f"{i+1}. Node {node_id}: {degree_val} connections")
    
    return top_df['NodeID'].values.tolist()

def visualize_ego_network(data, center_node, hops=1, max_nodes=100):
    """
    Visualize the ego network (local neighborhood) of a specific node
    """
    print(f"Visualizing ego network for node {center_node}...")
    # Extract k-hop subgraph
    subset, edge_index, mapping, edge_mask = k_hop_subgraph(
        node_idx=[center_node], 
        num_hops=hops,
        edge_index=data.edge_index,
        relabel_nodes=True,
        num_nodes=data.num_nodes
    )
    
    # If too large, sample nodes
    if len(subset) > max_nodes:
        print(f"Ego network too large ({len(subset)} nodes). Sampling {max_nodes} nodes.")
        # Keep the center node and sample the rest
        center_idx = mapping.tolist().index(center_node)
        other_indices = list(range(len(subset)))
        other_indices.remove(center_idx)
        
        # Randomly sample additional nodes
        sampled_indices = np.random.choice(other_indices, min(max_nodes-1, len(other_indices)), replace=False).tolist()
        sampled_indices.append(center_idx)  # Add center node back
        
        # Create a new mapping and edge index with only the sampled nodes
        new_subset = subset[sampled_indices]
        node_map = {old_idx: new_idx for new_idx, old_idx in enumerate(sampled_indices)}
        
        # Filter edges to only include sampled nodes
        row, col = edge_index
        edge_mask = torch.zeros(row.size(0), dtype=torch.bool)
        for i, (src, dst) in enumerate(zip(row, col)):
            if src.item() in node_map and dst.item() in node_map:
                edge_mask[i] = True
        
        row = row[edge_mask]
        col = col[edge_mask]
        new_row = torch.tensor([node_map[idx.item()] for idx in row])
        new_col = torch.tensor([node_map[idx.item()] for idx in col])
        new_edge_index = torch.stack([new_row, new_col])
        
        subset = new_subset
        edge_index = new_edge_index
    
    # Calculate node sizes based on degree
    row, col = edge_index
    degrees = degree(row, num_nodes=len(subset))
    node_sizes = 10 + 50 * (degrees / max(degrees.max(), 1))
    
    # Create a simple plot using matplotlib
    plt.figure(figsize=(10, 10))
    
    # Extract node positions using a force-directed layout (this is lightweight)
    pos = {}
    # Convert to NetworkX just for layout calculation (this is the most efficient approach)
    G = nx.Graph()
    for i in range(len(subset)):
        G.add_node(i)
    for i in range(edge_index.shape[1]):
        G.add_edge(edge_index[0, i].item(), edge_index[1, i].item())
    
    pos = nx.spring_layout(G, seed=42)
    
    # Draw the network
    for i, j in zip(edge_index[0], edge_index[1]):
        i, j = i.item(), j.item()
        plt.plot([pos[i][0], pos[j][0]], [pos[i][1], pos[j][1]], 'k-', alpha=0.1, linewidth=0.5)
    
    # Draw the center node with a different color
    center_idx = mapping.tolist().index(center_node)
    for i in range(len(subset)):
        if i == center_idx:
            plt.scatter(pos[i][0], pos[i][1], s=node_sizes[i].item()*2, c='red', alpha=0.8)
        else:
            plt.scatter(pos[i][0], pos[i][1], s=node_sizes[i].item(), c='blue', alpha=0.5)
    
    plt.title(f"{hops}-hop Ego Network of Node {center_node}")
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(f'ego_network_node_{center_node}.png', dpi=300)
    plt.close()  # Close to free memory
    print(f"Ego network visualization saved to 'ego_network_node_{center_node}.png'")
    
    # Return some statistics about the ego network
    return {
        "num_nodes": len(subset),
        "num_edges": edge_index.shape[1] // 2,
        "avg_degree": degrees.float().mean().item()
    }

def visualize_random_subgraph(data, num_nodes=500, seed=42):
    """
    Sample a random subgraph for visualization
    """
    print(f"Visualizing random subgraph with {num_nodes} nodes...")
    np.random.seed(seed)
    
    # Sample random nodes
    sampled_nodes = np.random.choice(data.num_nodes, size=min(num_nodes, data.num_nodes), replace=False)
    sampled_nodes = torch.tensor(sampled_nodes)
    
    # Extract the subgraph
    row, col = data.edge_index
    mask = torch.isin(row, sampled_nodes) & torch.isin(col, sampled_nodes)
    sub_edge_index = data.edge_index[:, mask]
    
    # Create node mapping
    node_map = {int(node): i for i, node in enumerate(sampled_nodes)}
    
    # Relabel nodes
    sub_row = torch.tensor([node_map[int(node)] for node in sub_edge_index[0] if int(node) in node_map])
    sub_col = torch.tensor([node_map[int(node)] for node in sub_edge_index[1] if int(node) in node_map])
    
    if len(sub_row) == 0:
        print("No edges found in the sampled subgraph. Try increasing the sample size.")
        return
        
    sub_edge_index = torch.stack([sub_row, sub_col])
    
    # Calculate node degrees
    sub_degrees = degree(sub_edge_index[0], num_nodes=len(sampled_nodes))
    
    # Create a simple NetworkX graph for layout
    G = nx.Graph()
    for i in range(len(sampled_nodes)):
        G.add_node(i)
    for i in range(sub_edge_index.shape[1]):
        G.add_edge(sub_edge_index[0, i].item(), sub_edge_index[1, i].item())
    
    # Calculate layout
    pos = nx.spring_layout(G, seed=seed)
    
    # Plot
    plt.figure(figsize=(12, 12))
    
    # Plot edges
    for i, j in zip(sub_edge_index[0], sub_edge_index[1]):
        i, j = i.item(), j.item()
        plt.plot([pos[i][0], pos[j][0]], [pos[i][1], pos[j][1]], 'k-', alpha=0.2, linewidth=0.5)
    
    # Plot nodes
    node_sizes = 10 + 50 * (sub_degrees / max(sub_degrees.max(), 1))
    for i in range(len(sampled_nodes)):
        if i in pos:  # Check if node is in the position dictionary
            plt.scatter(pos[i][0], pos[i][1], s=node_sizes[i].item(), 
                        c=plt.cm.viridis(float(sub_degrees[i])/max(sub_degrees.max().item(), 1)), 
                        alpha=0.7)
    
    plt.title(f'Random Subgraph with {len(sampled_nodes)} Nodes')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('output_123/random_subgraph.png', dpi=300)
    plt.close()  # Close to free memory
    print("Random subgraph visualization saved to 'random_subgraph.png'")

def analyze_connected_components(data, max_components_to_display=10, sample_fraction=None):
    """
    Analyze connected components of the graph
    
    Args:
        data: PyG data object
        max_components_to_display: Number of largest components to display
        sample_fraction: If not None, analyze a random sample of the graph to save memory
    """
    print("Analyzing connected components...")
    
    if sample_fraction is not None and sample_fraction < 1.0:
        # Sample a fraction of nodes to reduce memory usage
        num_sample_nodes = int(data.num_nodes * sample_fraction)
        print(f"Sampling {num_sample_nodes} nodes ({sample_fraction*100:.1f}% of graph) for component analysis...")
        
        # Sample random nodes
        np.random.seed(42)
        sampled_nodes = np.random.choice(data.num_nodes, size=num_sample_nodes, replace=False)
        sampled_nodes = torch.tensor(sampled_nodes)
        
        # Extract the subgraph
        row, col = data.edge_index
        mask = torch.isin(row, sampled_nodes) & torch.isin(col, sampled_nodes)
        sub_edge_index = data.edge_index[:, mask]
        
        # Create node mapping
        node_map = {int(node): i for i, node in enumerate(sampled_nodes)}
        
        # Relabel nodes
        sub_row = torch.tensor([node_map[int(node)] for node in sub_edge_index[0] if int(node) in node_map])
        sub_col = torch.tensor([node_map[int(node)] for node in sub_edge_index[1] if int(node) in node_map])
        
        if len(sub_row) == 0:
            print("No edges found in the sampled subgraph. Try increasing the sample size.")
            return
        
        sub_edge_index = torch.stack([sub_row, sub_col])
        
        # Convert to sparse matrix
        adj_sparse = to_scipy_sparse_matrix(sub_edge_index, num_nodes=num_sample_nodes)
        total_nodes = num_sample_nodes
        
    else:
        # Use the full graph
        # Convert to sparse matrix for efficient component calculation
        adj_sparse = to_scipy_sparse_matrix(data.edge_index, num_nodes=data.num_nodes)
        total_nodes = data.num_nodes
    
    print("Computing connected components...")
    try:
        n_components, labels = sparse.csgraph.connected_components(
            adj_sparse, directed=False, return_labels=True
        )
        
        print(f"Total number of connected components: {n_components}")
        
        # Count component sizes
        component_counts = {}
        for label in labels:
            if label in component_counts:
                component_counts[label] += 1
            else:
                component_counts[label] = 1
        
        # Sort by size (descending)
        sorted_components = sorted(component_counts.items(), key=lambda x: x[1], reverse=True)
        
        # Plot the top N component sizes
        plt.figure(figsize=(12, 6))
        top_components = sorted_components[:max_components_to_display]
        plt.bar(range(len(top_components)), [size for _, size in top_components])
        plt.xlabel('Component ID')
        plt.ylabel('Component Size (# nodes)')
        plt.title(f'Top {len(top_components)} Connected Component Sizes')
        plt.tight_layout()
        plt.savefig('output_123/component_sizes.png', dpi=300)
        plt.close()  # Close to free memory
        
        # Print component statistics
        largest_component_size = sorted_components[0][1]
        largest_component_percentage = (largest_component_size / total_nodes) * 100
        
        print(f"Largest component has {largest_component_size} nodes " 
              f"({largest_component_percentage:.2f}% of the analyzed graph)")
        print(f"Number of singleton nodes: {sum(1 for _, size in sorted_components if size == 1)}")
        
        # Create a new figure for component size distribution
        plt.figure(figsize=(10, 6))
        sizes = [size for _, size in sorted_components]
        plt.loglog(range(1, len(sizes) + 1), sizes, 'o-', markersize=3)
        plt.xlabel('Component Rank (log scale)')
        plt.ylabel('Component Size (log scale)')
        plt.title('Component Size Distribution')
        plt.grid(True, alpha=0.3)
        plt.savefig('output_123/component_size_distribution.png', dpi=300)
        plt.close()  # Close to free memory
        print("Component visualizations saved to 'component_sizes.png' and 'component_size_distribution.png'")
        
        if sample_fraction is not None:
            print(f"Note: Component analysis was performed on a {sample_fraction*100:.1f}% sample of the graph")
        
        return sorted_components
        
    except MemoryError:
        print("Memory error when calculating connected components.")
        print("Try using a smaller sample_fraction parameter (e.g., 0.1) to analyze a subset of the graph.")
        return None

def stepwise_analysis(data, user_mapping=None):
    """Run analysis steps one by one with memory cleanup between each step"""
    # Step 1: Save degrees to file
    degree_file = save_degrees_to_file(data)
    gc.collect()
    
    # Step 2: Create degree counts file
    counts_file = create_degree_counts_file(degree_file)
    gc.collect()
    
    # Step 3: Plot degree distribution
    plot_degree_distribution(counts_file, log_scale=True)
    gc.collect()
    
    # Step 4: Plot CCDF
    plot_degree_ccdf(counts_file)
    gc.collect()
    
    # Step 5: Find top nodes
    top_nodes = find_top_nodes(degree_file, top_n=20, user_mapping=user_mapping)
    gc.collect()
    
    # Step 6: Visualize ego network of most connected node
    if top_nodes:
        try:
            visualize_ego_network(data, top_nodes[0], hops=1, max_nodes=100)
            gc.collect()
        except Exception as e:
            print(f"Error visualizing ego network: {str(e)}")
    
    # Step 7: Visualize random subgraph
    try:
        visualize_random_subgraph(data, num_nodes=200)
        gc.collect()
    except Exception as e:
        print(f"Error visualizing random subgraph: {str(e)}")
        # Try with smaller sample
        try:
            visualize_random_subgraph(data, num_nodes=50)
            gc.collect()
        except:
            print("Failed to visualize even a smaller random subgraph")
    
    # Step 8: Analyze connected components (with sampling if needed)
    try:
        analyze_connected_components(data, sample_fraction=0.1)
        gc.collect()
    except Exception as e:
        print(f"Error analyzing connected components: {str(e)}")
        print("Try running component analysis separately with smaller sample")
    
    print("Analysis complete!")

In [22]:
import pandas as pd
import numpy as np
import os
import time
import gc  # Garbage collector
import sys
import warnings

def plot_degree_distribution_safe(counts_file="degree_counts.txt", log_scale=True, max_display=None):
    """
    Ultra-safe version of the degree distribution plotting function that uses
    minimal memory and has multiple safeguards against kernel crashes.
    """
    print(f"\nPlotting degree distribution from {counts_file} (ultra-safe mode)...")
    start_time = time.time()
    
    # Step 1: Check if file exists
    if not os.path.exists(counts_file):
        print(f"Error: File {counts_file} not found")
        return
    
    # Step 2: Load data without plotting first
    try:
        print("Reading data file...")
        df = pd.read_csv(counts_file)
        
        if 'Degree' not in df.columns or 'Count' not in df.columns:
            print(f"Error: Required columns 'Degree' and 'Count' not found in {counts_file}")
            return
        
        # Apply filtering
        if max_display:
            print(f"Limiting to degrees <= {max_display}")
            df = df[df['Degree'] <= max_display]
        
        # Convert to simple lists for minimal memory usage
        degrees = df['Degree'].tolist()
        counts = df['Count'].tolist()
        
        # Free memory
        del df
        gc.collect()
        
        print(f"Data loaded: {len(degrees)} data points")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return
    
    # Step 3: Save the data as a simple text file first (as backup)
    try:
        os.makedirs('output_123', exist_ok=True)
        backup_file = 'output_123/degree_data_backup.txt'
        print(f"Saving backup data to {backup_file}...")
        
        with open(backup_file, 'w') as f:
            f.write("Degree,Count\n")
            for d, c in zip(degrees, counts):
                f.write(f"{d},{c}\n")
        
        print("Backup saved successfully")
    except Exception as e:
        print(f"Warning: Could not save backup data: {str(e)}")
    
    # Step 4: Try to import matplotlib with a timeout
    try:
        print("Importing matplotlib...")
        
        # Try to use Agg backend which is more stable
        import matplotlib
        matplotlib.use('Agg')  # Use non-interactive backend
        
        import matplotlib.pyplot as plt
        print("Matplotlib imported successfully")
    except Exception as e:
        print(f"Error importing matplotlib: {str(e)}")
        print("Cannot generate plot. Data has been saved to backup file.")
        return
    
    # Step 5: Generate the plot with extensive error handling
    try:
        print("Preparing to generate plot...")
        
        # Turn off interactive mode
        plt.ioff()
        
        # Suppress warnings
        warnings.filterwarnings('ignore')
        
        # Create figure with minimal memory footprint
        plt.figure(figsize=(10, 6), dpi=100)
        
        if log_scale:
            print("Preparing log-log plot...")
            # Filter out zeros and negative values for log scale
            valid_points = [(d, c) for d, c in zip(degrees, counts) if d > 0 and c > 0]
            
            if not valid_points:
                print("Error: No valid data points for log-log plot")
                return
            
            plot_degrees, plot_counts = zip(*valid_points)
            
            # Use scatter instead of plot for potentially better memory usage
            plt.loglog(plot_degrees, plot_counts, 'o', markersize=2, alpha=0.5)
            plt.xlabel('Degree (log scale)')
            plt.ylabel('Count (log scale)')
            plt.title('Degree Distribution (Log-Log Scale)')
        else:
            print("Preparing linear plot...")
            plt.scatter(degrees, counts, s=2, alpha=0.5)
            plt.xlabel('Degree')
            plt.ylabel('Count')
            plt.title('Degree Distribution')
        
        plt.grid(True, alpha=0.3)
        
        # Save figure with minimal DPI first as a test
        test_file = 'output_123/degree_distribution_test.png'
        print(f"Saving test plot to {test_file}...")
        plt.savefig(test_file, dpi=72)
        print("Test plot saved successfully")
        
        # Now try the full resolution plot
        save_path = 'output_123/degree_distribution.png'
        print(f"Saving final plot to {save_path}...")
        plt.savefig(save_path, dpi=300)
        print("Final plot saved successfully")
        
        # Close plot to free memory
        plt.close('all')
        gc.collect()
        
        print(f"Degree distribution plot saved to '{save_path}'")
        print(f"Total plotting time: {time.time() - start_time:.2f} seconds")
        
    except Exception as e:
        print(f"Error during plotting: {str(e)}")
        
        # Try to save a simplified plot as a last resort
        try:
            print("Attempting to save a simplified plot...")
            plt.figure(figsize=(8, 5), dpi=72)
            
            # Use even simpler plotting
            if log_scale:
                filtered_degrees = []
                filtered_counts = []
                for d, c in zip(degrees, counts):
                    if d > 0 and c > 0:
                        filtered_degrees.append(d)
                        filtered_counts.append(c)
                        
                # Sample points if there are too many
                if len(filtered_degrees) > 1000:
                    indices = np.random.choice(len(filtered_degrees), 1000, replace=False)
                    filtered_degrees = [filtered_degrees[i] for i in indices]
                    filtered_counts = [filtered_counts[i] for i in indices]
                
                plt.loglog(filtered_degrees, filtered_counts, 'o', markersize=2)
            else:
                # Sample points if there are too many
                if len(degrees) > 1000:
                    indices = np.random.choice(len(degrees), 1000, replace=False)
                    sampled_degrees = [degrees[i] for i in indices]
                    sampled_counts = [counts[i] for i in indices]
                    plt.scatter(sampled_degrees, sampled_counts, s=2)
                else:
                    plt.scatter(degrees, counts, s=2)
            
            plt.grid(True)
            simple_path = 'output_123/degree_distribution_simple.png'
            plt.savefig(simple_path, dpi=72)
            plt.close('all')
            print(f"Simplified plot saved to {simple_path}")
            
        except Exception as e2:
            print(f"Failed to save simplified plot: {str(e2)}")
            print("Plot generation failed, but data was saved to backup file.")

# Alternative approach: Use plain text output if plotting is problematic
def text_based_degree_distribution(counts_file="degree_counts.txt", max_display=None, num_bins=10):
    """Generate a text-based visualization of the degree distribution"""
    try:
        print(f"\nGenerating text-based degree distribution from {counts_file}...")
        
        # Load data
        df = pd.read_csv(counts_file)
        
        if max_display:
            df = df[df['Degree'] <= max_display]
        
        degrees = df['Degree'].values
        counts = df['Count'].values
        
        # Get summary statistics
        max_degree = max(degrees)
        total_nodes = sum(counts)
        avg_degree = sum(degrees * counts) / total_nodes if total_nodes > 0 else 0
        
        print("\n=== DEGREE DISTRIBUTION SUMMARY ===")
        print(f"Total nodes: {total_nodes}")
        print(f"Maximum degree: {max_degree}")
        print(f"Average degree: {avg_degree:.2f}")
        
        # Create bins for text histogram
        if len(degrees) > num_bins:
            # Create logarithmic bins
            if max_degree > 100:
                bins = np.logspace(0, np.log10(max_degree), num_bins+1)
            else:
                bins = np.linspace(0, max_degree, num_bins+1)
            
            bin_counts = np.zeros(num_bins)
            
            for d, c in zip(degrees, counts):
                for i in range(num_bins):
                    if bins[i] <= d < bins[i+1]:
                        bin_counts[i] += c
                        break
            
            # Print text histogram
            print("\n=== TEXT HISTOGRAM ===")
            max_bar_width = 50
            max_count = max(bin_counts)
            
            for i in range(num_bins):
                if bins[i+1] - bins[i] < 1:
                    range_str = f"{bins[i]:.1f}"
                else:
                    range_str = f"{int(bins[i])}-{int(bins[i+1])}"
                
                bar_width = int((bin_counts[i] / max_count) * max_bar_width) if max_count > 0 else 0
                bar = "#" * bar_width
                
                print(f"Degree {range_str:10}: {bar} ({bin_counts[i]:.0f})")
        
        # Save summary to file
        with open('output_123/degree_distribution_summary.txt', 'w') as f:
            f.write("=== DEGREE DISTRIBUTION SUMMARY ===\n")
            f.write(f"Total nodes: {total_nodes}\n")
            f.write(f"Maximum degree: {max_degree}\n")
            f.write(f"Average degree: {avg_degree:.2f}\n")
            
        print("\nSummary saved to 'output_123/degree_distribution_summary.txt'")
        
    except Exception as e:
        print(f"Error generating text-based distribution: {str(e)}")

In [3]:
# Import necessary packages
import gc  # For garbage collection

# Run the full analysis pipeline
plot_degree_distribution_safe("degree_counts.txt", log_scale=True, max_display=1000)
gc.collect()


Plotting degree distribution from degree_counts.txt (ultra-safe mode)...
Reading data file...
Limiting to degrees <= 1000
Data loaded: 1001 data points
Saving backup data to output_123/degree_data_backup.txt...
Backup saved successfully
Importing matplotlib...
Matplotlib imported successfully
Preparing to generate plot...
Preparing log-log plot...
Saving test plot to output_123/degree_distribution_test.png...
Test plot saved successfully
Saving final plot to output_123/degree_distribution.png...
Final plot saved successfully
Degree distribution plot saved to 'output_123/degree_distribution.png'
Total plotting time: 1.80 seconds


0

In [6]:
plot_degree_ccdf("degree_counts.txt")
gc.collect()

Plotting CCDF...
CCDF plot saved to 'degree_ccdf.png'


23527

In [14]:
import torch
import gc
import time
import pandas as pd
import numpy as np
from torch_geometric.utils import degree
from collections import Counter

def save_degrees_to_file(data, filename="degree_data.txt", idx_to_user=None):
    """Save degree information to file in a memory-efficient and FAST way"""
    print("Computing and saving node degrees...")
    start_time = time.time()
    
    # Use PyG's optimized degree calculation instead of manual counting
    print("Calculating degrees using optimized method...")
    deg_start = time.time()
    
    # This is MUCH faster than counting manually
    degrees = degree(data.edge_index[0], num_nodes=data.num_nodes)
    
    print(f"Degree calculation completed in {time.time() - deg_start:.2f} seconds")
    
    # Saving to file
    print(f"Saving degrees to {filename}...")
    save_start = time.time()
    
    with open(filename, "w") as f:
        f.write("NodeID,Degree" + (",Username" if idx_to_user else "") + "\n")
        
        # Process in chunks to avoid memory issues
        chunk_size = 1000000  # Still chunk for very large graphs
        total_nodes = data.num_nodes
        
        for start in range(0, total_nodes, chunk_size):
            batch_start = time.time()
            end = min(start + chunk_size, total_nodes)
            
            # Write this chunk to file
            for node_id in range(start, end):
                degree_val = degrees[node_id].item()
                
                if idx_to_user:
                    username = idx_to_user.get(node_id, "")
                    f.write(f"{node_id},{degree_val},{username}\n")
                else:
                    f.write(f"{node_id},{degree_val}\n")
            
            if end - start >= 100000:  # Only log for large chunks
                print(f"Processed nodes {start:,} to {end-1:,} in {time.time() - batch_start:.2f} seconds")
    
    # Also compute and save degree distribution statistics
    print("Computing degree distribution statistics...")
    stats_start = time.time()
    
    # Count frequency of each degree
    degree_counts = Counter(degrees.tolist())
    
    # Save degree distribution
    dist_file = filename.replace(".txt", "_counts.txt")
    with open(dist_file, "w") as f:
        f.write("Degree,Count\n")
        for deg, count in sorted(degree_counts.items()):
            f.write(f"{deg},{count}\n")
    
    print(f"Degree distribution statistics saved to {dist_file}")
    print(f"Statistics computation completed in {time.time() - stats_start:.2f} seconds")
    
    # Print some statistics
    total_time = time.time() - start_time
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average degree: {degrees.float().mean().item():.2f}")
    print(f"Maximum degree: {degrees.max().item()}")
    print(f"Total nodes: {len(degrees):,}")
    
    return filename, dist_file

# Even faster version - if you only need the degree distribution
def save_degree_distribution_only(data, filename="degree_counts.txt"):
    """
    Calculate and save only the degree distribution (faster than saving all degrees)
    """
    print("Computing degree distribution (fast method)...")
    start_time = time.time()
    
    # Calculate degrees
    print("Calculating degrees...")
    degrees = degree(data.edge_index[0], num_nodes=data.num_nodes)
    print(f"Degree calculation completed in {time.time() - start_time:.2f} seconds")
    
    # Compute distribution directly
    print("Computing distribution...")
    dist_start = time.time()
    
    # Count frequency of each degree
    degree_counts = {}
    for d in degrees:
        d_val = d.item()
        if d_val in degree_counts:
            degree_counts[d_val] += 1
        else:
            degree_counts[d_val] = 1
    
    # Save degree distribution
    print(f"Saving distribution to {filename}...")
    with open(filename, "w") as f:
        f.write("Degree,Count\n")
        for deg, count in sorted(degree_counts.items()):
            f.write(f"{deg},{count}\n")
    
    print(f"Degree distribution saved to {filename}")
    print(f"Distribution computation completed in {time.time() - dist_start:.2f} seconds")
    print(f"Total processing time: {time.time() - start_time:.2f} seconds")
    
    # Print some statistics
    avg_degree = sum(d * c for d, c in degree_counts.items()) / sum(degree_counts.values())
    max_degree = max(degree_counts.keys())
    print(f"Average degree: {avg_degree:.2f}")
    print(f"Maximum degree: {max_degree}")
    print(f"Total nodes: {sum(degree_counts.values()):,}")
    
    return filename

In [15]:
# Full version - saves both node degrees and distribution
node_degrees_file, distribution_file = save_degrees_to_file(data, filename="degree_data.txt")

# Fast version - only saves degree distribution
# distribution_file = save_degree_distribution_only(data)

Computing and saving node degrees...
Calculating degrees using optimized method...
Degree calculation completed in 0.15 seconds
Saving degrees to degree_data.txt...
Processed nodes 0 to 532,658 in 3.29 seconds
Computing degree distribution statistics...
Degree distribution statistics saved to degree_data_counts.txt
Statistics computation completed in 0.11 seconds
Total processing time: 4.18 seconds
Average degree: 106.87
Maximum degree: 57911.0
Total nodes: 532,659


In [20]:
top_nodes = find_top_nodes("degree_data.txt", top_n=20, user_mapping=None)
gc.collect()

# Step 6: Visualize ego network of most connected node
if top_nodes:
    try:
        visualize_ego_network(data, top_nodes[0], hops=1, max_nodes=100)
        gc.collect()
    except Exception as e:
        print(f"Error visualizing ego network: {str(e)}")

Finding top 20 nodes by degree...

Top 20 nodes by connections:
1. Node 461542.0: 57911.0 connections
2. Node 479353.0: 46140.0 connections
3. Node 313875.0: 41992.0 connections
4. Node 356599.0: 40919.0 connections
5. Node 223927.0: 38602.0 connections
6. Node 228186.0: 36846.0 connections
7. Node 258305.0: 36551.0 connections
8. Node 38569.0: 36454.0 connections
9. Node 60011.0: 35347.0 connections
10. Node 177072.0: 34689.0 connections
11. Node 28304.0: 34394.0 connections
12. Node 351694.0: 33118.0 connections
13. Node 45434.0: 32691.0 connections
14. Node 125196.0: 32678.0 connections
15. Node 24712.0: 32123.0 connections
16. Node 137953.0: 30623.0 connections
17. Node 446734.0: 30442.0 connections
18. Node 32426.0: 30415.0 connections
19. Node 8970.0: 29416.0 connections
20. Node 504981.0: 28858.0 connections
Visualizing ego network for node 461542...
Ego network too large (33646 nodes). Sampling 100 nodes.
Error visualizing ego network: 461542 is not in list
