In [1]:
import networkx as nx
from node2vec import Node2Vec
import pandas as pd
import os
import numpy as np

# 1. Defining Parameters

In [2]:
BASE_PATH = r"./Final_Graph"
GRAPHML_PATH = os.path.join(BASE_PATH, "ml_ready_graph.graphml")
EMBEDDINGS_PATH = os.path.join(BASE_PATH, "street_embeddings.csv")

# Output embedding vector dimension (numeric vector size)
EMBEDDING_DIM = 64
# Length of each random walk from each node
WALK_LENGTH = 30
# Number of walks from each node
NUM_WALKS = 10
# Return parameter (p): Controls the probability of revisiting a node
P_PARAM = 1.0
# Input/output parameter (q): Controls the probability of exploring distant neighbors
Q_PARAM = 1.0
# Number of processor threads/cores to parallelize
WORKERS = 4

# 2. Loading and Preprocessing Graph

In [3]:
G = nx.read_graphml(GRAPHML_PATH)

# Node2Vec requires that the graph be undirected and homogeneous (for traversal)
# and must not contain multiple arcs between the same nodes.
G_undirected = G.to_undirected()
G_simple = nx.Graph(G_undirected)

# 3. Embeddings Generation

In [4]:
node2vec = Node2Vec(
    G_simple,
    dimensions=EMBEDDING_DIM,
    walk_length=WALK_LENGTH,
    num_walks=NUM_WALKS,
    p=P_PARAM,
    q=Q_PARAM,
    workers=WORKERS
)

model = node2vec.fit(window=10, min_count=1, batch_words=4)

Computing transition probabilities:   0%|          | 0/48739 [00:00<?, ?it/s]

# 4. Mounting Training DataBase

In [5]:
final_data = []

# Iterate over the edges (streets) of the original graph G (which has the attributes)
for u, v, data in G.edges(data=True):

    # 5.1. Extract the embeddings of the NODES (u and v)
    try:
        # Get the embedding vector of the source node (u) and the destination node (v)
        emb_u = model.wv[str(u)]
        emb_v = model.wv[str(v)]

    except KeyError:

        # Ignore nodes that were not included in the model (very rare)
        continue

    # 5.2. Combine the node embeddings to form the EDGE embedding
    # We use the average to combine the vectors, a common method for edge embeddings
    edge_embedding = (emb_u + emb_v) / 2

    # 5.3. Extract the features and label that are already on the edge
    features = {
    'mean_twi': data.get('mean_twi', 0.0),
    'mean_slope': data.get('mean_slope', 0.0),
    'mean_dist_river': data.get('mean_dist_river', 0.0),
    'mm_len': data.get('mm_len', 0.0),
    'flood_label': data.get('flood_label', 0),
    'microbasin_id': data.get('microbacia_id', -1)
    }

    #5.4. Create the final row of data (Features + Embeddings + Label)
    row = features

    # Add the edge embedding as separate columns (emb_0, emb_1, ...)
    for i, val in enumerate(edge_embedding):

        row[f'emb_{i}'] = val

    final_data.append(row)

df_final = pd.DataFrame(final_data)

# 6. Saving Training DataBase

In [6]:
cols = df_final.columns.tolist()
cols.remove('flood_label')
cols.append('flood_label')
df_final = df_final[cols]

df_final.to_csv(EMBEDDINGS_PATH, index=False)

print(f"final DataFrame has {len(df_final)} streets and {len(df_final.columns)} columns (3 features + {EMBEDDING_DIM} embeddings + microbasin_id + 1 label).")

final DataFrame has 73076 streets and 70 columns (3 features + 64 embeddings + microbasin_id + 1 label).
