Importing Required Libraries

In [5]:
import networkx as nx
import pandas as pd
import json


 Load Email Data 

In [6]:
def load_email_data(edge_file, label_file):
    """
    Load the email dataset and the department labels.

    Parameters:
    edge_file (str): Path to the file containing the edges of the email network
        (one space-separated "source target" pair of integer node ids per line).
    label_file (str): Path to the file containing the department labels of the nodes
        (one whitespace-separated "node label" pair of integers per line).

    Returns:
    G (networkx.Graph): The email network graph (undirected, integer node ids).
    labels (dict): Dictionary mapping node id (int) to department label (int).

    Raises:
    ValueError: If a non-blank line of the label file does not hold exactly two
        integers.
    """
    # Load the email network graph from the edge file
    G = nx.read_edgelist(edge_file, delimiter=' ', nodetype=int)

    # Load the department labels
    labels = {}
    # read_edgelist decodes as UTF-8 by default; read the labels the same way
    # instead of relying on the platform's locale encoding.
    with open(label_file, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{label_file}:{line_number}: expected 'node label', got {line!r}"
                )
            node, label = map(int, fields)
            labels[node] = label

    return G, labels

# Read the raw edge list and the department labels from the data directory
email_graph, email_labels = load_email_data(
    edge_file='data/email-Eu-core.txt',
    label_file='data/email-Eu-core-department-labels.txt',
)
loaded_node_count = email_graph.number_of_nodes()
loaded_edge_count = email_graph.number_of_edges()
print("Email Network Loaded. Number of nodes:", loaded_node_count)
print("Email Network Loaded. Number of edges:", loaded_edge_count)


Email Network Loaded. Number of nodes: 1005
Email Network Loaded. Number of edges: 16706


Remove Self-Loops 

In [7]:
def remove_self_loops(G):
    """
    Remove self-loops from the graph.

    The graph is modified in place; the same object is returned for convenience.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    G (networkx.Graph): The graph without self-loops (the same object that was passed in).
    """
    # selfloop_edges yields edges lazily while walking the graph's adjacency.
    # Snapshot them first so no edge is deleted from a structure that is still
    # being iterated.
    self_loops = list(nx.selfloop_edges(G))
    G.remove_edges_from(self_loops)
    return G

# Strip self-loops; the helper edits the graph in place and hands it back
email_graph = remove_self_loops(G=email_graph)
edges_without_self_loops = email_graph.number_of_edges()
print("Self-loops removed. Number of edges:", edges_without_self_loops)


Self-loops removed. Number of edges: 16064


Remove Isolated Nodes

In [8]:
def remove_isolated_nodes(G):
    """
    Drop every node that has no incident edges.

    The graph is modified in place; the same object is returned for convenience.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    G (networkx.Graph): The graph without isolated nodes (the same object that was passed in).
    """
    # Collect the isolates up front so the graph is not altered while it is
    # still being scanned for them.
    degree_zero_nodes = tuple(nx.isolates(G))
    G.remove_nodes_from(degree_zero_nodes)
    return G

# Drop nodes left without any edge; the helper edits the graph in place
email_graph = remove_isolated_nodes(G=email_graph)
nodes_after_pruning = email_graph.number_of_nodes()
print("Isolated nodes removed. Number of nodes:", nodes_after_pruning)


Isolated nodes removed. Number of nodes: 986


 Calculate Graph Statistics

In [9]:
def calculate_graph_statistics(G):
    """
    Calculate basic statistics of the graph.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    stats (dict): Dictionary containing basic graph statistics with the keys
        'number_of_nodes', 'number_of_edges', 'average_degree', 'density' and
        'average_clustering_coefficient'. For a graph without nodes the three
        averaged/ratio statistics are reported as 0.0 instead of raising.
    """
    node_count = G.number_of_nodes()
    edge_count = G.number_of_edges()

    if node_count == 0:
        # Dividing by the node count below would raise ZeroDivisionError on an
        # empty graph, so report zeros instead.
        return {
            'number_of_nodes': 0,
            'number_of_edges': edge_count,
            'average_degree': 0.0,
            'density': 0.0,
            'average_clustering_coefficient': 0.0
        }

    total_degree = sum(degree for _, degree in G.degree())
    stats = {
        'number_of_nodes': node_count,
        'number_of_edges': edge_count,
        'average_degree': total_degree / node_count,
        'density': nx.density(G),
        'average_clustering_coefficient': nx.average_clustering(G)
    }
    return stats

# Summarize the cleaned graph and show the resulting statistics
email_stats = calculate_graph_statistics(G=email_graph)
print("Email Network Statistics:", email_stats)


Email Network Statistics: {'number_of_nodes': 986, 'number_of_edges': 16064, 'average_degree': 32.5841784989858, 'density': 0.033080384262929745, 'average_clustering_coefficient': 0.40705044751953817}


Normalize Node Features 

In [10]:
def _zscore(values):
    """Standardize a Series to zero mean and unit sample std; constant columns become 0.0."""
    centered = values - values.mean()
    spread = values.std()
    # std is 0 when every value is identical and NaN when there are fewer than
    # two values; dividing by either would fill the whole column with NaN.
    if pd.isna(spread) or spread == 0:
        return centered * 0.0
    return centered / spread


def normalize_node_features(G):
    """
    Normalize node features (degree, clustering coefficient).

    Each feature is z-score standardized: the column mean is subtracted and the
    result is divided by the column's sample standard deviation (pandas default).
    A feature with no variation across nodes is set to 0.0 for every node
    instead of NaN.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    features (pd.DataFrame): DataFrame containing normalized node features, one
        row per node, with the columns 'node', 'degree' and
        'clustering_coefficient'.
    """
    degrees = dict(G.degree())
    clustering_coeffs = nx.clustering(G)

    # Fix the node order once so all three columns are guaranteed to line up
    nodes = list(G.nodes())
    features = pd.DataFrame({
        'node': nodes,
        'degree': [degrees[node] for node in nodes],
        'clustering_coefficient': [clustering_coeffs[node] for node in nodes]
    })

    features['degree'] = _zscore(features['degree'])
    features['clustering_coefficient'] = _zscore(features['clustering_coefficient'])

    return features

# Build the standardized per-node feature table and export it as CSV
email_features = normalize_node_features(G=email_graph)
features_csv_path = 'data/email_node_features.csv'
email_features.to_csv(features_csv_path, index=False)
print("Node features normalized and saved to 'data/email_node_features.csv'.")


Node features normalized and saved to 'data/email_node_features.csv'.


Save Preprocessed Data

In [11]:
# Save the preprocessed graph and labels
nx.write_gml(email_graph, 'data/email_graph.gml')

# email_labels was read before nodes were pruned from the graph, so it can still
# hold entries for nodes that no longer exist. Write only the labels of nodes
# that are present in the saved graph so the two files stay consistent.
# A new name is used so re-running earlier cells still sees the full mapping.
email_labels_kept = {
    node: label for node, label in email_labels.items() if node in email_graph
}

# NOTE: JSON object keys are always strings, so the integer node ids come back
# as strings when this file is loaded again.
with open('data/email_labels.json', 'w') as f:
    json.dump(email_labels_kept, f)
    
print("Email dataset preprocessing completed. Graph saved to 'data/email_graph.gml' and labels saved to 'data/email_labels.json'.")


Email dataset preprocessing completed. Graph saved to 'data/email_graph.gml' and labels saved to 'data/email_labels.json'.
