Importing Required Libraries

In [5]:
import networkx as nx
import pandas as pd


Load LastFM Data 

In [6]:
def load_lastfm_data(edge_file, target_file, source_col='node_1', target_col='node_2'):
    """
    Load the LastFM dataset.

    Parameters:
    edge_file (str): Path to the CSV file containing the edges of the LastFM network.
    target_file (str): Path to the CSV file containing the target labels of the nodes.
    source_col (str): Name of the edge-file column holding the first endpoint
        of each edge. Defaults to 'node_1'.
    target_col (str): Name of the edge-file column holding the second endpoint
        of each edge. Defaults to 'node_2'.

    Returns:
    G (networkx.Graph): The LastFM network graph.
    targets (pandas.DataFrame): DataFrame of node targets, exactly as read
        from target_file.

    Raises:
    KeyError: If the edge file does not contain source_col or target_col.
    """
    # Load the edges into a DataFrame
    edges = pd.read_csv(edge_file)

    # Fail early, with a message naming the file and the columns, instead of
    # surfacing a bare column lookup error from inside the graph construction.
    missing = [col for col in (source_col, target_col) if col not in edges.columns]
    if missing:
        raise KeyError(
            f"Edge file {edge_file!r} is missing required column(s) {missing}; "
            f"found columns {list(edges.columns)}"
        )

    # Create the network graph from the edge DataFrame
    G = nx.from_pandas_edgelist(edges, source=source_col, target=target_col)

    # Load the target labels into a DataFrame
    targets = pd.read_csv(target_file)

    return G, targets

# Read the edge list and the node labels, then report the size of the graph.
lastfm_graph, lastfm_targets = load_lastfm_data(
    edge_file='data/lastfm_asia_edges.csv',
    target_file='data/lastfm_asia_target.csv',
)
print(
    "LastFM Network Loaded. Number of nodes:",
    lastfm_graph.number_of_nodes(),
)
print(
    "LastFM Network Loaded. Number of edges:",
    lastfm_graph.number_of_edges(),
)


LastFM Network Loaded. Number of nodes: 7624
LastFM Network Loaded. Number of edges: 27806


Remove Isolated Nodes

In [7]:
def remove_isolated_nodes(G):
    """
    Strip every isolated node out of the graph.

    The graph passed in is modified in place; no copy is made.

    Parameters:
    G (networkx.Graph): The network graph. It is mutated by this call.

    Returns:
    G (networkx.Graph): The same graph object, now without isolated nodes.
    """
    # Materialise the isolates into a list first so the node set is not
    # being iterated while nodes are removed from it.
    G.remove_nodes_from(list(nx.isolates(G)))
    return G

# Drop isolated nodes; the helper edits the graph in place and returns the same object.
lastfm_graph = remove_isolated_nodes(G=lastfm_graph)
print(
    "Isolated nodes removed. Number of nodes:",
    lastfm_graph.number_of_nodes(),
)


Isolated nodes removed. Number of nodes: 7624


Calculate Graph Statistics 

In [8]:
def calculate_graph_statistics(G):
    """
    Calculate basic statistics of the graph.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    stats (dict): Dictionary containing basic graph statistics, with keys
        'number_of_nodes', 'number_of_edges', 'average_degree', 'density'
        and 'average_clustering_coefficient'. For a graph with no nodes the
        three averaged/ratio statistics are reported as 0.0.
    """
    node_count = G.number_of_nodes()
    edge_count = G.number_of_edges()

    # The average degree divides by the node count, so an empty graph has to
    # be answered up front rather than raising ZeroDivisionError.
    if node_count == 0:
        return {
            'number_of_nodes': node_count,
            'number_of_edges': edge_count,
            'average_degree': 0.0,
            'density': 0.0,
            'average_clustering_coefficient': 0.0
        }

    # Sum the degrees straight from the degree view; no intermediate dict needed.
    degree_total = sum(degree for _, degree in G.degree())

    stats = {
        'number_of_nodes': node_count,
        'number_of_edges': edge_count,
        'average_degree': degree_total / node_count,
        'density': nx.density(G),
        'average_clustering_coefficient': nx.average_clustering(G)
    }
    return stats

# Summarise the graph (size, average degree, density, clustering) and show the result.
lastfm_stats = calculate_graph_statistics(G=lastfm_graph)
print(
    "LastFM Network Statistics:",
    lastfm_stats,
)


LastFM Network Statistics: {'number_of_nodes': 7624, 'number_of_edges': 27806, 'average_degree': 7.2943336831059815, 'density': 0.0009568849118596328, 'average_clustering_coefficient': 0.2194184243270858}


Normalize Node Features

In [1]:
def _standardize(values):
    """Z-score a numeric Series; a column with no spread maps to all zeros."""
    spread = values.std()
    # std() is NaN when there are fewer than two values and 0 when every value
    # is identical; dividing by either would fill the column with NaN.
    if pd.isna(spread) or spread == 0:
        return pd.Series(0.0, index=values.index)
    return (values - values.mean()) / spread


def normalize_node_features(G):
    """
    Normalize node features (degree, clustering coefficient).

    Each feature is standardized to zero mean and unit (sample) standard
    deviation across the nodes of the graph. A feature whose standard
    deviation is zero or undefined (all nodes share one value, or the graph
    has fewer than two nodes) is set to 0.0 for every node instead of NaN.

    Parameters:
    G (networkx.Graph): The network graph.

    Returns:
    features (pd.DataFrame): DataFrame with one row per node and the columns
        'node', 'degree' and 'clustering_coefficient', the latter two normalized.
    """
    # Fix the node order once so all three columns are built from the same sequence.
    nodes = list(G.nodes())
    degrees = dict(G.degree())
    clustering_coeffs = nx.clustering(G)

    features = pd.DataFrame({
        'node': nodes,
        'degree': [degrees[node] for node in nodes],
        'clustering_coefficient': [clustering_coeffs[node] for node in nodes]
    })

    for column in ('degree', 'clustering_coefficient'):
        features[column] = _standardize(features[column])

    return features

# Build the normalized per-node feature table and write it to CSV.
# This cell needs `lastfm_graph` from the data-loading cell: run the notebook
# top to bottom, since executing it alone on a fresh kernel raises NameError.
lastfm_features = normalize_node_features(G=lastfm_graph)
lastfm_features.to_csv(
    'data/lastfm_node_features.csv',
    index=False,
)
print("Node features normalized and saved to 'data/lastfm_node_features.csv'.")


NameError: name 'lastfm_graph' is not defined

Save Preprocessed Data

In [10]:
# Persist the graph as GML and the node targets as CSV.
# `lastfm_targets` is written exactly as it was loaded; nothing in this
# notebook filters or transforms it.
# NOTE(review): if remove_isolated_nodes ever drops nodes, the targets may need
# to be restricted to the surviving nodes - confirm against the target file's schema.
nx.write_gml(
    lastfm_graph,
    'data/lastfm_graph.gml',
)
lastfm_targets.to_csv(
    'data/lastfm_targets_processed.csv',
    index=False,
)

print("LastFM dataset preprocessing completed. Graph saved to 'data/lastfm_graph.gml' and targets saved to 'data/lastfm_targets_processed.csv'.")


LastFM dataset preprocessing completed. Graph saved to 'data/lastfm_graph.gml' and targets saved to 'data/lastfm_targets_processed.csv'.
