In [1]:
!pip install pandas

[0m

In [6]:
import pandas as pd

In [11]:
def get_nodes_and_edges(k):
    dataset_list = [
        'bitcoin_alpha',
        'bitcoin_otc',
        'slashdot',
        'epinions'
    ]

    for dataset in dataset_list:
    
        train_filepath = f"/home/cegt/experiment-data/{dataset}/{dataset}-train-{k}.edgelist"
        test_filepath = f"/home/cegt/experiment-data/{dataset}/{dataset}-test-1.edgelist"
        
        # train_edgelist = pd.read_csv(train_filepath, sep=" ", header=None, names=["source", "target", "sign"])
        # test_edgelist = pd.read_csv(test_filepath, sep=" ", header=None, names=["source", "target", "sign"])

        train_G = nx.read_edgelist(train_filepath, create_using=nx.DiGraph(), data=(("sign", int),))
        test_G = nx.read_edgelist(test_filepath, create_using=nx.DiGraph(), data=(("sign", int),))
        
        # # Number of edges
        # train_num_edges = len(train_edgelist)
        # test_num_edges = len(test_edgelist)
        
        # # Number of nodes (unique nodes from source and target)
        # train_num_nodes = len(set(train_edgelist["source"]).union(set(train_edgelist["target"])))
        # test_num_nodes = len(set(test_edgelist["source"]).union(set(test_edgelist["target"])))

        print(f"Dataset: {dataset}")
        print("--------")
        print(f"Train Nodes: {train_G.number_of_nodes()}")
        print(f"Train Edges: {train_G.number_of_edges()}")
        print("--------")
        print(f"Test Nodes: {test_G.number_of_nodes()}")
        print(f"Test Edges: {test_G.number_of_edges()}")
        print("\n----------------------------------\n")

### Dataset statistics

In [12]:
get_nodes_and_edges(1)

Dataset: bitcoin_alpha
--------
Train Nodes: 3650
Train Edges: 19348
--------
Test Nodes: 2333
Test Edges: 4838

----------------------------------

Dataset: bitcoin_otc
--------
Train Nodes: 4863
Train Edges: 28473
--------
Test Nodes: 1722
Test Edges: 7119

----------------------------------

Dataset: slashdot
--------
Train Nodes: 22433
Train Edges: 43936
--------
Test Nodes: 13468
Test Edges: 10983

----------------------------------

Dataset: epinions
--------
Train Nodes: 28119
Train Edges: 67309
--------
Test Nodes: 17415
Test Edges: 16827

----------------------------------



In [15]:
get_nodes_and_edges("sampled")

Dataset: bitcoin_alpha
--------
Train Nodes: 2284
Train Edges: 4835
--------
Test Nodes: 2333
Test Edges: 4838

----------------------------------

Dataset: bitcoin_otc
--------
Train Nodes: 3102
Train Edges: 6501
--------
Test Nodes: 1722
Test Edges: 7119

----------------------------------

Dataset: slashdot
--------
Train Nodes: 13233
Train Edges: 16824
--------
Test Nodes: 13468
Test Edges: 10983

----------------------------------

Dataset: epinions
--------
Train Nodes: 3176
Train Edges: 2101
--------
Test Nodes: 17415
Test Edges: 16827

----------------------------------



In [7]:
!nvidia-smi

Sun Mar 23 10:14:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  |   00000000:1D:00.0 Off |                    0 |
| N/A   43C    P0             42W /  300W |       1MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import networkx as nx
import random

def undersample_signed_graph(edgelist_path, sampling_ratio=0.5):
    """
    Undersample edges in a signed graph network using stratified random sampling
    to preserve the proportion of positive and negative edges.

    Parameters:
    - edgelist_path: str, path to the .edgelist file
    - sampling_ratio: float, ratio of edges to sample (between 0 and 1, default is 0.5)

    Returns:
    - sampled_G: NetworkX DiGraph, the undersampled graph
    """
    # Step 1: Load the graph from the edgelist file
    G = nx.read_edgelist(edgelist_path, create_using=nx.DiGraph(), data=(("sign", int),))
    
    # Step 2: Separate positive and negative edges
    positive_edges = [(u, v, d['sign']) for u, v, d in G.edges(data=True) if d['sign'] > 0]
    negative_edges = [(u, v, d['sign']) for u, v, d in G.edges(data=True) if d['sign'] < 0]
    
    # Step 3: Calculate total edges and proportions
    total_edges = G.number_of_edges()
    num_positive = len(positive_edges)
    num_negative = len(negative_edges)
    p_positive = num_positive / total_edges if total_edges > 0 else 0
    p_negative = num_negative / total_edges if total_edges > 0 else 0
    
    # Step 4: Determine number of edges to sample
    k = int(total_edges * sampling_ratio)
    sample_size_positive = min(int(k * p_positive), num_positive)
    sample_size_negative = min(int(k * p_negative), num_negative)
    
    # Step 5: Sample edges proportionally
    sampled_positive = random.sample(positive_edges, sample_size_positive) if sample_size_positive > 0 else []
    sampled_negative = random.sample(negative_edges, sample_size_negative) if sample_size_negative > 0 else []
    
    # Combine sampled edges
    sampled_edges_with_signs = sampled_positive + sampled_negative
    
    # Step 6: Create the sampled graph
    sampled_G = nx.DiGraph()
    for u, v, sign in sampled_edges_with_signs:
        sampled_G.add_edge(u, v, sign=sign)
    
    # Add all original nodes to ensure the node set is preserved
    sampled_G.add_nodes_from(G.nodes())
    
    return sampled_G

In [5]:
dataset = 'slashdot'
edgelist_file = f"/home/cegt/experiment-data/{dataset}/{dataset}-train-1.edgelist"

# Undersample the graph with a 50% sampling ratio
sampled_graph = undersample_signed_graph(edgelist_file, sampling_ratio=0.5)

# Print basic information about the original and sampled graphs
original_graph = nx.read_edgelist(edgelist_file, create_using=nx.DiGraph(), data=(("sign", int),))
print(f"Original graph: {original_graph.number_of_nodes()} nodes, {original_graph.number_of_edges()} edges")
print(f"Sampled graph: {sampled_graph.number_of_nodes()} nodes, {sampled_graph.number_of_edges()} edges")

# Optionally, save the sampled graph to a new .edgelist file
nx.write_edgelist(sampled_graph, f"/home/cegt/experiment-data/{dataset}/{dataset}-train-sampled.edgelist", data=['sign'])

Original graph: 22433 nodes, 43936 edges
Sampled graph: 22433 nodes, 16824 edges


In [4]:
import networkx as nx
import random

def stratified_edge_sampling(edgelist_path, sampling_ratio=0.5):
    """
    Undersample edges in a signed graph using stratified random sampling
    to preserve the proportion of positive and negative edges.

    Parameters:
    - edgelist_path: str, path to the .edgelist file
    - sampling_ratio: float, ratio of edges to sample (between 0 and 1, default is 0.5)

    Returns:
    - sampled_G: NetworkX DiGraph, the undersampled graph
    """
    # Step 1: Load the graph from the edgelist file
    G = nx.read_edgelist(edgelist_path, create_using=nx.DiGraph(), data=(("sign", int),))
    
    # Step 2: Separate positive and negative edges
    positive_edges = [(u, v) for u, v, d in G.edges(data=True) if d['sign'] == 0]
    negative_edges = [(u, v) for u, v, d in G.edges(data=True) if d['sign'] == 1]
    
    # Step 3: Calculate sampling sizes
    num_positive = len(positive_edges)
    num_negative = len(negative_edges)
    total_edges = num_positive + num_negative
    p_positive = num_positive / total_edges if total_edges > 0 else 0
    p_negative = num_negative / total_edges if total_edges > 0 else 0
    
    k = int(total_edges * sampling_ratio)
    sample_size_positive = min(int(k * p_positive), num_positive)
    sample_size_negative = min(int(k * p_negative), num_negative)
    
    # Step 4: Sample edges proportionally
    sampled_positive = random.sample(positive_edges, sample_size_positive) if sample_size_positive > 0 else []
    sampled_negative = random.sample(negative_edges, sample_size_negative) if sample_size_negative > 0 else []
    
    # Combine sampled edges
    sampled_edges = sampled_positive + sampled_negative
    
    # Step 5: Create the sampled graph
    sampled_G = G.edge_subgraph(sampled_edges).copy()
    return sampled_G

In [6]:
dataset = 'slashdot'
edgelist_file = f"/home/cegt/experiment-data/{dataset}/{dataset}-train-1.edgelist"
sampled_graph = stratified_edge_sampling(edgelist_file, sampling_ratio=0.5)

# Print basic information for comparison
original_graph = nx.read_edgelist(edgelist_file, create_using=nx.DiGraph(), data=(("sign", int),))
print(f"Original graph: {original_graph.number_of_nodes()} nodes, {original_graph.number_of_edges()} edges")
print(f"Sampled graph: {sampled_graph.number_of_nodes()} nodes, {sampled_graph.number_of_edges()} edges")

verify_sequential_labels(sampled_graph)

# Optionally, save the sampled graph
# nx.write_edgelist(sampled_graph, f"/home/cegt/experiment-data/{dataset}/{dataset}-test-sampled.edgelist", data=['sign'])

Original graph: 22433 nodes, 43936 edges
Sampled graph: 15645 nodes, 21967 edges
Number of unique nodes: 15645
Maximum node label: 9999
Nodes are not sequentially renumbered. There might be gaps or non-sequential labeling.


In [1]:
import networkx as nx
import random

def stratified_edge_sampling_from_file(edgelist_path, sampling_ratio=0.5, renumber_nodes=True):
    """
    Undersample edges in a signed graph from an edgelist file using stratified random sampling
    to preserve the proportion of positive and negative edges.
    
    Parameters:
    - edgelist_path: str, path to the .edgelist file
    - sampling_ratio: float, ratio of edges to sample (between 0 and 1, default is 0.5)
    - renumber_nodes: bool, whether to renumber nodes sequentially (default is True)
    
    Returns:
    - sampled_G: NetworkX DiGraph, the undersampled (and optionally renumbered) graph
    """
    # Step 1: Load the graph from the edgelist file.
    # The file is expected to have each line in the format: node1 node2 sign
    # where sign is an integer (0 for positive, 1 for negative).
    G = nx.read_edgelist(edgelist_path, create_using=nx.DiGraph(), data=(("sign", int),))
    
    # Step 2: Separate positive and negative edges
    positive_edges = [(u, v) for u, v, d in G.edges(data=True) if d['sign'] == 1]
    negative_edges = [(u, v) for u, v, d in G.edges(data=True) if d['sign'] == -1]
    
    # Step 3: Calculate sampling sizes based on the ratio
    num_positive = len(positive_edges)
    num_negative = len(negative_edges)
    total_edges = num_positive + num_negative
    p_positive = num_positive / total_edges if total_edges > 0 else 0
    p_negative = num_negative / total_edges if total_edges > 0 else 0
    
    k = int(total_edges * sampling_ratio)
    sample_size_positive = min(int(k * p_positive), num_positive)
    sample_size_negative = min(int(k * p_negative), num_negative)
    
    # Step 4: Sample edges proportionally using random sampling
    sampled_positive = random.sample(positive_edges, sample_size_positive) if sample_size_positive > 0 else []
    sampled_negative = random.sample(negative_edges, sample_size_negative) if sample_size_negative > 0 else []
    sampled_edges = sampled_positive + sampled_negative
    
    # Step 5: Create the sampled graph (only nodes incident to sampled edges remain)
    sampled_G = G.edge_subgraph(sampled_edges).copy()
    
    # Step 6 (Optional): Renumber nodes sequentially if desired
    if renumber_nodes:
        mapping = {old_label: new_label for new_label, old_label in enumerate(sampled_G.nodes())}
        sampled_G = nx.relabel_nodes(sampled_G, mapping)
    
    return sampled_G

def verify_sequential_labels(G):
    """
    Verify if the nodes in graph G are renumbered sequentially.
    
    Parameters:
    - G: NetworkX graph where nodes are expected to be renumbered (e.g., 0, 1, 2, ...)
    
    This function prints:
    - The total number of unique nodes.
    - The maximum node label.
    - A message indicating whether the nodes are sequentially numbered.
    """
    unique_nodes = set(G.nodes())
    count = len(unique_nodes)
    
    try:
        max_node = max(unique_nodes)
    except TypeError:
        print("Node labels are not numeric. Cannot verify sequential numbering.")
        return

    print("Number of unique nodes:", count)
    print("Maximum node label:", max_node)
    
    if count > 0 and max_node == count - 1:
        print("Nodes are sequentially renumbered (0 to {}), with no gaps.".format(max_node))
    else:
        print("Nodes are not sequentially renumbered. There might be gaps or non-sequential labeling.")

# Example usage:
# Provide the path to your .edgelist file.
# The file should have lines formatted like: A B 0  (for an edge from A to B with sign 0)



In [7]:
dataset = 'epinions'
edgelist_file_path = f"/home/cegt/experiment-data/{dataset}/{dataset}-train-sampled.edgelist"
sampled_G = stratified_edge_sampling_from_file(edgelist_file_path, sampling_ratio=0.5, renumber_nodes=True)

original_graph = nx.read_edgelist(edgelist_file_path, create_using=nx.DiGraph(), data=(("sign", int),))
print(f"Original graph: {original_graph.number_of_nodes()} nodes, {original_graph.number_of_edges()} edges")

verify_sequential_labels(sampled_G)

print(f"Sampeld graph: {sampled_G.number_of_nodes()} nodes, {sampled_G.number_of_edges()} edges")

Original graph: 16078 nodes, 28112 edges
Number of unique nodes: 10828
Maximum node label: 10827
Nodes are sequentially renumbered (0 to 10827), with no gaps.
Sampeld graph: 10828 nodes, 14056 edges


In [None]:
nx.write_edgelist(sampled_G, f"/home/cegt/experiment-data/{dataset}/{dataset}-train-sampled.edgelist", data=['sign'])