# Seed selection experiments based on node2vec and k-means algorithms

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import torch

from misc.loader import get_lazega_network
from misc.utils import set_seed
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec
from torch_geometric.utils.convert import from_networkx
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [None]:
def make_deterministic(random_seed=123):
    """Seed the random number generators used here and make cuDNN deterministic.

    Args:
        random_seed: Seed value passed to the project's ``set_seed`` helper
            and to the PyTorch (CPU and CUDA) and NumPy seeding functions.
    """
    # The project helper goes first, followed by the library-level seeders.
    for seed_fn in (
        set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
    ):
        seed_fn(random_seed)

    # Turn cuDNN benchmark mode off and request deterministic behaviour.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# make_deterministic()

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"target device: {device}")

## Load network

In [None]:
# Load the example network that is also used in other experiments.

nx_net = get_lazega_network().layers["friendship"]
egs = len(nx_net.edges())  # edge count of the "friendship" layer
print("NX edges: ", egs, "nodes: ", len(nx_net.nodes()))

# Copy only the node ids and the edge endpoints into a fresh graph before
# converting it to a PyG data object (presumably to drop any node/edge
# attributes - hence the name ``no_data_graph``).
no_data_graph = nx.Graph()
no_data_graph.add_nodes_from(nx_net.nodes())
no_data_graph.add_edges_from(nx_net.edges())

# Both attribute-grouping arguments are passed as None.
tg_net = from_networkx(no_data_graph, None, None)
print("PyG ", tg_net)

# Alternatively, use an out-of-the-box dataset:
# tg_net = Planetoid(root='/tmp/Cora', name='Cora')[0]
# print("PyG -> ", tg_net)

## Define and train node2vec

In [None]:
# Size the embedding space as the rounded natural logarithm of half the edge
# count. The previous expression referenced an undefined name ``e`` and raised
# NameError; the edge count is now read directly from the NetworkX graph.
# TODO: amend this heuristic - curse of dimensionality!
num_edges = len(nx_net.edges())
# Clamp the argument so that log() never sees a value below 1, and the result
# so that the embedding always has at least one dimension. The value is
# converted to a plain Python int before being handed to the model.
embedding_dim = max(1, int(np.round(np.log(max(num_edges, 2) / 2))))

model = Node2Vec(
    tg_net.edge_index,
    embedding_dim=embedding_dim,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True
).to(device)

# Materialise the trainable parameters as a list so that they can be iterated
# more than once (a bare ``filter`` object is exhausted after a single pass).
model_parameters = [p for p in model.parameters() if p.requires_grad]

num_params = sum(p.numel() for p in model_parameters)
print(f"number of trainable parameters: {num_params}")
print(f"embedding dimension space: {model.embedding_dim}")

# Loader yielding batches of positive and negative random walks.
loader = model.loader(batch_size=50, shuffle=True, num_workers=4)
# The model is created with sparse=True, so a sparse-aware optimiser is used.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [None]:
# Training loop: run at most 200 epochs and stop early as soon as the mean
# epoch loss rises above the value recorded for the previous epoch.

last_loss = 10000

for epoch in range(200):
    model.train()

    # Accumulate the loss over all batches, then average it.
    epoch_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(loader)

    print(f"Epoch: {epoch:03d}, Loss: {epoch_loss:.4f}, Best loss: {last_loss:.4f}")

    # Any increase of the loss ends the training.
    if epoch_loss > last_loss:
        print(f"No progress in training! Finishin in epoch  {epoch:03d}")
        break
    last_loss = epoch_loss

In [None]:
# Extract the embedding coordinates as a NumPy array: the model is called
# without arguments (presumably returning the embeddings of all nodes), and
# the result is detached from autograd and moved to the CPU.
embeddings = model().detach().cpu().numpy()
print(f"obtained following embedding tensor: {embeddings.shape}")

## Define and fit k-means

In [None]:
# Cluster the node embeddings with k-means; n_init=300 asks scikit-learn for
# 300 initialisations of the algorithm.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, n_init=300)
kmeans.fit(embeddings)

## Visualisation of embeddings

In [None]:
# obtain labels of generated clusters (one label per embedded node)
embedding_labels = kmeans.labels_

# each centroid is labelled with its own index
centroids = kmeans.cluster_centers_
centroid_labels = np.arange(len(centroids))

# make a map of cluster ids and unique colors
cluster_labels = np.unique(embedding_labels)
# ``plt.cm.get_cmap`` was deprecated in Matplotlib 3.7 and removed in 3.9;
# ``plt.get_cmap`` accepts the same (name, lut) arguments.
cluster_colors = plt.get_cmap("jet", len(cluster_labels))
color_map = {cl: cluster_colors(idx) for idx, cl in enumerate(cluster_labels)}

# assign color to each node of the network: one RGBA row per node
embedding_colors = np.array([color_map[label] for label in embedding_labels])

In [None]:
def plot_embeddings(ax, emb_points, emb_labels, cluster_color_map, marker, s):
    """Scatter 2-D points on ``ax``, issuing one ``scatter`` call per cluster.

    Args:
        ax: Object exposing a matplotlib-style ``scatter`` method.
        emb_points: Array indexed as ``emb_points[mask, column]``; columns 0
            and 1 are used as the x and y coordinates.
        emb_labels: Array of cluster ids compared element-wise against every
            key of ``cluster_color_map`` to select the rows of a cluster.
        cluster_color_map: Mapping from cluster id to the colour of its points.
        marker: Marker style forwarded to ``scatter``.
        s: Marker size forwarded to ``scatter``.
    """
    for cluster_id, cluster_color in cluster_color_map.items():
        # Boolean mask selecting the points that belong to this cluster.
        members = emb_labels == cluster_id
        xs = emb_points[members, 0]
        ys = emb_points[members, 1]
        ax.scatter(xs, ys, s=s, marker=marker, color=cluster_color)

fig, ax = plt.subplots(figsize=(10, 8))

tsne = TSNE(n_components=2)

# Reduce node embeddings and centroids in a single t-SNE fit so that both
# end up in the same 2-D space; the centroids occupy the trailing rows.
concat_embs = np.concatenate((embeddings, centroids))
concat_embs_reduced = tsne.fit_transform(concat_embs)

node_count = embeddings.shape[0]

# Nodes are drawn as small circles, centroids as slightly larger crosses.
plot_embeddings(ax, concat_embs_reduced[:node_count], embedding_labels, color_map, "o", 20)
plot_embeddings(ax, concat_embs_reduced[node_count:], centroid_labels, color_map, "X", 30)

plt.axis("off")
plt.show()

In [None]:
def plot_network(network, color_map):
    """Draw ``network`` with a spring layout in a new 10x8 figure.

    Args:
        network: NetworkX graph to draw.
        color_map: Node colours, forwarded as ``node_color`` to the drawing
            routine.
    """
    plt.figure(figsize=(10, 8))
    # Spelled-out form of nx.draw_spring: compute the layout, then draw.
    layout = nx.spring_layout(network)
    nx.draw(network, pos=layout, node_size=30, arrows=False, node_color=color_map)
    plt.show()


plot_network(nx_net, embedding_colors)


## To do

1. match centroids with nodes
2. optimise parameters of node2vec
3. optimise parameters of k-means (especially number of dimensions)
4. implement seed selector based on this pipeline