## Search for the most important nodes in a graph given its adjacency matrix

Fazer um pré-processamento dos dados antes de treinar o modelo global: identificar os nós mais importantes e ir inserindo nós para mostrar a flexibilidade do modelo.

Betweenness centrality (nodes that connect different parts of the graph)


In [19]:
import pandas as pd
import networkx as nx

# Load the edge list CSV; its `source`, `target` and `weight` columns are used below.
edges = pd.read_csv("../data/PEMS03/edges.csv")

# Build an undirected graph, keeping the `weight` column as an edge attribute.
G = nx.from_pandas_edgelist(
    edges,
    source='source',
    target='target',
    edge_attr='weight',
    create_using=nx.Graph,
)

# Betweenness centrality ranks nodes by how many shortest paths pass through them.
# NOTE(review): networkx interprets `weight` as a distance when computing the
# weighted shortest paths. If `weight` in edges.csv is actually a similarity or
# affinity, it needs to be inverted before this call — TODO confirm.
centrality = nx.betweenness_centrality(G, weight='weight')

# Keep the top-N nodes, highest centrality first.
top_n = 50
top_nodes = sorted(centrality, key=centrality.get, reverse=True)[:top_n]

# Induce the subgraph on those nodes and drop the ones that have no edge inside it.
top_subgraph = G.subgraph(top_nodes)
top_nodes_connected = [n for n in top_subgraph.nodes if top_subgraph.degree(n) > 0]
print(top_nodes_connected)

# Re-induce the subgraph on the connected nodes only.
top_subgraph = G.subgraph(top_nodes_connected)

# Convert to an adjacency matrix (as a DataFrame).
# `nodelist` pins the row/column order to `top_nodes_connected`. Without it the
# order comes from the subgraph view's own iteration, which is not guaranteed to
# match the list that is later used to select and order the data columns.
adj_matrix_top = nx.to_pandas_adjacency(
    top_subgraph,
    nodelist=top_nodes_connected,
    dtype=float,
)
assert list(adj_matrix_top.index) == top_nodes_connected, "adjacency rows out of order"
assert list(adj_matrix_top.columns) == top_nodes_connected, "adjacency columns out of order"

# Save the adjacency matrix to CSV (node ids are written as the index and header).
adj_matrix_top.to_csv("top_nodes_adj_matrix.csv")


[314371, 315017, 314121, 316045, 313111, 312343, 313114, 315804, 317853, 318620, 316063, 316064, 312865, 312098, 318632, 312745, 316328, 313132, 314668, 315054, 318383, 316084, 314936, 313658, 317884, 314559, 318282, 312139, 314955, 316364, 314968, 315993, 311903, 313951, 316387, 316388, 314982, 316009, 317168, 312689, 316018, 318451, 316019, 312694, 312439, 311930, 313852, 316414]


In [21]:
# Collect one record per edge of the filtered subgraph, reading its `weight` attribute.
edge_records = []
for src, dst, attrs in top_subgraph.edges(data=True):
    edge_records.append({'source': src, 'target': dst, 'weight': attrs['weight']})

# Fix the column order explicitly so the frame has the same layout even with no edges.
edges_top = pd.DataFrame(edge_records, columns=['source', 'target', 'weight'])

# Write the edge list to CSV, leaving out the DataFrame index.
edges_top.to_csv("edges_top.csv", index=False)


In [22]:
# Load the node metadata CSV
nodes = pd.read_csv("../data/PEMS03/nodes.csv")

# Filter to only nodes in top_nodes_connected
nodes_top = nodes[nodes['sensor'].isin(top_nodes_connected)]
nodes_top.to_csv("nodes_top.csv", index=False)

# Load the data CSV
data = pd.read_csv("../data/PEMS03/data.csv")

# Filter to only columns corresponding to top_nodes_connected (plus timestamp)
columns_to_keep = ['timestamp'] + [str(n) for n in top_nodes_connected]
data_top = data[columns_to_keep]
data_top.to_csv("data_top.csv", index=False)

