In [40]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import from_networkx
import numpy as np
import random

# Internal imports
from src.utils import memoize
from src.deepLearning import get_similar_nodes
from src.deepLearning import node2vec_embedding

from src.NetworkGraphs import NetworkGraphs

from src.machineLearning import get_communities
from src.visualisation_src.ML_visualisation import generate_static_cluster
from src.visualisation_src.DL_visualisation import TSNE_visualisation
from torch_geometric.utils import train_test_split_edges

In [2]:
class GAE(torch.nn.Module):
    """Two-stage node model: a single GCN layer followed by a linear read-out.

    :param num_features: size of each input node feature vector
    :param hidden_dim: output size of the GCN layer
    :param embed_dim: output size of the linear layer
    """

    def __init__(self, num_features, hidden_dim, embed_dim):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as-is.
        self.encoder = GCNConv(num_features, hidden_dim)
        self.decoder = torch.nn.Linear(hidden_dim, embed_dim)

    def forward(self, data):
        """Run ``data.x`` / ``data.edge_index`` through encoder, then decoder."""
        hidden = self.encoder(data.x, data.edge_index)
        return self.decoder(hidden)


In [3]:
class GAE2(torch.nn.Module):
    """Same layout as a GCN auto-encoder, but with a GraphSAGE layer as encoder.

    :param num_features: size of each input node feature vector
    :param hidden_dim: output size of the SAGEConv layer
    :param embed_dim: output size of the linear layer
    """

    def __init__(self, num_features, hidden_dim, embed_dim):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as-is.
        self.encoder = SAGEConv(num_features, hidden_dim)
        self.decoder = torch.nn.Linear(hidden_dim, embed_dim)

    def forward(self, data):
        """Run ``data.x`` / ``data.edge_index`` through encoder, then decoder."""
        hidden = self.encoder(data.x, data.edge_index)
        return self.decoder(hidden)


In [4]:
def unsupervised_loss(recon_x, x):
    """Reconstruction objective: mean squared error between two tensors.

    :param recon_x: reconstructed values
    :param x: reference values to compare against
    :return: scalar tensor holding the mean squared error
    """
    return F.mse_loss(input=recon_x, target=x)


In [5]:
def contrastive_loss(embeddings, positive_pairs, negative_pairs, margin=1.0):
    """Margin-based contrastive loss over index pairs.

    Positive pairs are penalised by their squared Euclidean distance; negative
    pairs are penalised by the squared shortfall of their distance below
    ``margin`` (zero once they are at least ``margin`` apart).

    :param embeddings: 2-D tensor, one row per node
    :param positive_pairs: 2-column index array/tensor of pairs to pull together
    :param negative_pairs: 2-column index array/tensor of pairs to push apart
    :param margin: distance beyond which a negative pair contributes nothing
    :return: scalar tensor, half the sum of the two mean penalties
    """

    def pair_distances(pairs):
        # Row-wise Euclidean distance between the two endpoints of each pair.
        first = embeddings[pairs[:, 0]]
        second = embeddings[pairs[:, 1]]
        return (first - second).norm(dim=1)

    pull_term = pair_distances(positive_pairs).pow(2).mean()
    shortfall = (margin - pair_distances(negative_pairs)).clamp(min=0.0)
    push_term = shortfall.pow(2).mean()

    return 0.5 * (pull_term + push_term)

In [6]:
def pairs_to_indices(data, positive_pairs, negative_pairs):
    """Translate pairs of node labels into pairs of integer node positions.

    Positions are assigned by enumerating ``data.mapping`` in iteration order.
    Pairs in which either label is missing from ``data.mapping`` are dropped.

    :param data: object exposing a ``mapping`` attribute keyed by node label
    :param positive_pairs: iterable of ``(u, v)`` node-label pairs
    :param negative_pairs: iterable of ``(u, v)`` node-label pairs
    :return: ``(positive_indices, negative_indices)``, each an ``int64`` array
        of shape ``(k, 2)``; ``k`` may be 0
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    node_to_index = {node: i for i, node in enumerate(data.mapping)}

    def to_index_array(pairs):
        rows = [
            [node_to_index[u], node_to_index[v]]
            for u, v in pairs
            if u in node_to_index and v in node_to_index
        ]
        # reshape keeps the result 2-D even when every pair was filtered out,
        # so callers can always slice columns with [:, 0] / [:, 1].
        return np.array(rows, dtype=np.int64).reshape(-1, 2)

    return to_index_array(positive_pairs), to_index_array(negative_pairs)

In [7]:
def generate_pairs(networkx_graph, num_negative_pairs=None, seed=None):
    """Build positive (edge) and negative (non-edge) node pairs from a graph.

    Positive pairs are the graph's edges. Negative pairs are drawn by
    repeatedly sampling two distinct nodes and keeping the pair when
    ``has_edge(u, v)`` is false; the same negative pair may be drawn more
    than once.

    :param networkx_graph: graph exposing ``nodes()``, ``edges`` and ``has_edge``
    :param num_negative_pairs: number of negative pairs to draw; defaults to
        the number of positive pairs
    :param seed: optional seed for a private random generator; when ``None``
        the module-level ``random`` state is used
    :return: ``(positive_pairs, negative_pairs)`` as arrays with two columns
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    :raises ValueError: if negative pairs are requested but the graph has no
        pair of distinct nodes without an edge between them
    """
    nodes = list(networkx_graph.nodes())
    rng = random.Random(seed) if seed is not None else random

    # Positive pairs
    edge_rows = [[u, v] for u, v in networkx_graph.edges]
    # Keep an explicit (0, 2) shape when there are no edges.
    positive_pairs = np.array(edge_rows) if edge_rows else np.empty((0, 2))

    # Negative pairs
    if num_negative_pairs is None:
        num_negative_pairs = len(positive_pairs)

    if num_negative_pairs > 0:
        # Rejection sampling below would never terminate without at least one
        # non-adjacent ordered pair. The scan stops at the first one it finds.
        has_non_edge = any(
            not networkx_graph.has_edge(u, v)
            for u in nodes
            for v in nodes
            if u != v
        )
        if not has_non_edge:
            raise ValueError(
                "Cannot sample negative pairs: the graph has no pair of "
                "distinct nodes without an edge."
            )

    negative_rows = []
    while len(negative_rows) < num_negative_pairs:
        u, v = rng.sample(nodes, 2)
        if not networkx_graph.has_edge(u, v):
            negative_rows.append([u, v])
    negative_pairs = np.array(negative_rows) if negative_rows else np.empty((0, 2))

    return positive_pairs, negative_pairs

In [8]:
def train(model, optimizer, data, device):
    """Run one optimisation step on the nodes selected by ``data.train_mask``.

    :param model: module called as ``model(data)``
    :param optimizer: optimiser holding the model's parameters
    :param data: graph object with ``x`` and ``train_mask`` attributes
    :param device: device the data is moved to before the forward pass
    :return: training loss of this step as a Python float
    """
    model.train()
    optimizer.zero_grad()
    # Use the object returned by ``.to`` for everything that follows, so the
    # prediction, targets and mask are guaranteed to live on the same device
    # even if ``.to`` returns a copy instead of moving in place.
    data = data.to(device)
    out = model(data)
    loss = unsupervised_loss(out[data.train_mask], data.x[data.train_mask])
    # Alternative objective (currently disabled):
    # loss = contrastive_loss(out, data.positive_pairs, data.negative_pairs)
    loss.backward()
    optimizer.step()
    return loss.item()


In [9]:
def test(model, data, device):
    """Evaluate the reconstruction loss on the nodes in ``data.test_mask``.

    :param model: module called as ``model(data)``
    :param data: graph object with ``x`` and ``test_mask`` attributes
    :param device: device the data is moved to before the forward pass
    :return: evaluation loss as a Python float
    """
    model.eval()
    data = data.to(device)
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        out = model(data)
        loss = unsupervised_loss(out[data.test_mask], data.x[data.test_mask])
        # Alternative objective (currently disabled):
        # loss = contrastive_loss(out, data.positive_pairs, data.negative_pairs)
    return loss.item()

In [10]:
def train_model(model, optimizer, data, device, epochs):
    """Train for ``epochs`` epochs and restore the weights with the lowest test loss.

    After every epoch the loss on ``data.test_mask`` is computed; the weights
    of the epoch with the lowest such loss are loaded back before returning.

    :param model: module to train
    :param optimizer: optimiser holding the model's parameters
    :param data: graph object passed to ``train`` and ``test``
    :param device: device used for training and evaluation
    :param epochs: number of epochs to run
    :return: the same ``model`` instance, holding the best weights seen
    """
    best_loss = float('inf')
    best_weights = None
    for epoch in range(1, epochs + 1):
        loss = train(model, optimizer, data, device)
        test_loss = test(model, data, device)
        if test_loss < best_loss:
            best_loss = test_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimiser steps overwrite in place. Clone them so the
            # snapshot really is this epoch's weights.
            best_weights = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        print('Epoch: {:03d}, Loss: {:.5f}, Test Loss: {:.5f}'.format(epoch, loss, test_loss))
    # No snapshot exists when epochs == 0 or the loss never became finite.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model


In [11]:
def preprocess_data(networkx_graph, node_features):
    """
    :Function: Convert a networkx graph into a ``torch_geometric`` data object
        carrying node features, a train/test node split and a node mapping.

        The first half of the nodes (by position) form the training mask, the
        remaining nodes form the test mask.
    :param networkx_graph: Networkx graph
    :param node_features: Node features, one row per node
    :return: data with ``x``, ``train_mask``, ``test_mask`` and ``mapping`` set
    :rtype: torch_geometric.data.Data
    :raises ValueError: if the number of feature rows differs from the number
        of nodes in the graph
    """
    # Convert to torch_geometric.data.Data
    data = from_networkx(networkx_graph)

    # Add node features
    features = torch.tensor(node_features, dtype=torch.float)
    expected_rows = networkx_graph.number_of_nodes()
    if features.size(0) != expected_rows:
        raise ValueError(
            f"node_features has {features.size(0)} rows but the graph has "
            f"{expected_rows} nodes"
        )
    data.x = features

    # Add train and test mask. Boolean masks are used because indexing a
    # tensor with a uint8 mask is deprecated in PyTorch.
    num_nodes = data.num_nodes
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[:num_nodes // 2] = True
    data.train_mask = train_mask
    data.test_mask = ~train_mask  # every node not used for training
    data.mapping = {node: i for i, node in enumerate(networkx_graph.nodes)}

    return data

In [12]:
# Load the railway dataset into the project's graph wrapper.
railway_csv_path = '../../datasets/Railway.csv'
networkGraphs = NetworkGraphs(railway_csv_path, type="RAILWAY")

Excluded 0 stations
twopi
sfdp


Option A: use one-hot node features. Run either this cell or the metric-features cell below, not both; each one overwrites `data`.

In [60]:
# One indicator column per node: the feature matrix is the identity matrix.
num_nodes = networkGraphs.Graph.number_of_nodes()
one_hot_features = np.identity(num_nodes)
data = preprocess_data(networkGraphs.Graph, one_hot_features)

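Option B: use graph-metric node features (degree, PageRank, k-core, triangles), each min-max normalised to [0, 1]. Running this cell overwrites `data`.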

In [13]:
from src.metrics import get_metrics

# One feature column per graph metric, each min-max scaled to [0, 1].
metric_names = ['degree', 'pagerank', 'kcore', 'triangles']
features = []
for metric in metric_names:
    df = get_metrics(networkGraphs, metric, directed=False, multi=False)
    # NOTE(review): assumes column 1 holds the metric value and that rows are
    # in the same order as the graph's nodes; confirm against get_metrics.
    np_arr = np.array(df.iloc[:, 1].values, dtype=float)
    value_min = np_arr.min()
    value_range = np_arr.max() - value_min
    if value_range > 0:
        np_arr = (np_arr - value_min) / value_range
    else:
        # A constant metric would divide by zero and fill the column with NaN;
        # it carries no information, so map it to all zeros instead.
        np_arr = np.zeros_like(np_arr)
    features.append(np_arr)
features = np.array(features).T
data = preprocess_data(networkGraphs.Graph, features)

[0;94mCACHE: Computing value for compute_nodes_degree, hash: [0;93m31f0ab76850555acd3c40fcc94a42615 
[0;94mCACHE: Computing value for compute_page_rank, hash: [0;93m7a98fdce508707fcc7885a16e218def5 
[0;94mCACHE: Computing value for compute_kcore, hash: [0;93mfaa21788490ebd1e30b49e93b46ec8b4 
[0;94mCACHE: Computing value for compute_triangles, hash: [0;93m6598f368c351016f17ad5184551605b1 


In [14]:
# Seed every RNG this cell touches so that Restart & Run All reproduces the
# same weight initialisation and the same sampled negative pairs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_dim = 256
# The decoder reconstructs the input features, so its output width matches them.
embed_dim = data.num_features
gae_model = GAE2(data.num_features, hidden_dim, embed_dim).to(device)
optimizer = torch.optim.Adam(gae_model.parameters(), lr=0.001)
epochs = 300

# The pairs are attached to `data` for the contrastive objective, which is
# currently commented out in train()/test(); the reconstruction loss ignores them.
positive_pairs, negative_pairs = generate_pairs(networkGraphs.Graph, num_negative_pairs=1000)
positive_indices, negative_indices = pairs_to_indices(data, positive_pairs, negative_pairs)
data.positive_pairs = positive_indices
data.negative_pairs = negative_indices

gae_model = train_model(gae_model, optimizer, data, device, epochs)

Epoch: 001, Loss: 0.08752, Test Loss: 0.08898
Epoch: 002, Loss: 0.05818, Test Loss: 0.06162
Epoch: 003, Loss: 0.03716, Test Loss: 0.04026
Epoch: 004, Loss: 0.02202, Test Loss: 0.02576
Epoch: 005, Loss: 0.01316, Test Loss: 0.01780
Epoch: 006, Loss: 0.00998, Test Loss: 0.01466
Epoch: 007, Loss: 0.01062, Test Loss: 0.01428
Epoch: 008, Loss: 0.01297, Test Loss: 0.01503
Epoch: 009, Loss: 0.01548, Test Loss: 0.01583
Epoch: 010, Loss: 0.01721, Test Loss: 0.01599
Epoch: 011, Loss: 0.01765, Test Loss: 0.01519
Epoch: 012, Loss: 0.01668, Test Loss: 0.01352
Epoch: 013, Loss: 0.01460, Test Loss: 0.01141
Epoch: 014, Loss: 0.01190, Test Loss: 0.00931
Epoch: 015, Loss: 0.00915, Test Loss: 0.00760
Epoch: 016, Loss: 0.00675, Test Loss: 0.00650
Epoch: 017, Loss: 0.00494, Test Loss: 0.00607
Epoch: 018, Loss: 0.00381, Test Loss: 0.00624
Epoch: 019, Loss: 0.00333, Test Loss: 0.00683
Epoch: 020, Loss: 0.00336, Test Loss: 0.00760
Epoch: 021, Loss: 0.00370, Test Loss: 0.00832
Epoch: 022, Loss: 0.00417, Test Lo

In [15]:
# Node embeddings are the encoder output only; the decoder is not applied.
gae_model.eval()
x = data.x
edge_index = data.edge_index
with torch.no_grad():
    embeddings = gae_model.encoder(x, edge_index)

In [16]:
# Sanity check: one row per node, one column per hidden unit.
embeddings.size()

torch.Size([2719, 256])

In [19]:
# Cluster the learned embeddings, then render a t-SNE view and a map view.
n_clusters = 4
clusters = get_communities(
    networkGraphs,
    method='kmeans',
    noOfClusters=n_clusters,
    embedding=embeddings,
)
TSNE_visualisation(
    networkGraphs,
    embeddings,
    filename='TSNE_GAE.html',
    clusters=clusters,
)
generate_static_cluster(
    networkGraphs,
    clusters,
    'sfdp.html',
    'kmeans',
    layout_='map',
    nbr=10,
)

torch.Size([2719, 256])

In [37]:
# Split the edges into train/val/test sets for link prediction.
# The full edge_index is saved first and written back afterwards, presumably
# because train_test_split_edges replaces/removes it (TODO confirm).
# NOTE(review): restoring the full edge set means the encoder still sees the
# held-out edges during message passing; confirm this is intended.
data_edge_index = data.edge_index
data = train_test_split_edges(data)
data.edge_index = data_edge_index
# Display the resulting Data object as the cell output.
data

Data(
  pos=[2719, 2],
  start=[12338],
  end=[12338],
  color=[12338],
  weight=[12338],
  x=[2719, 4],
  train_mask=[2719],
  test_mask=[2719],
  mapping={
    692=0,
    1351=1,
    698=2,
    944=3,
    232=4,
    1639=5,
    2335=6,
    1512=7,
    127=8,
    259=9,
    1835=10,
    1480=11,
    2362=12,
    503=13,
    1009=14,
    2058=15,
    802=16,
    681=17,
    1704=18,
    2314=19,
    314=20,
    783=21,
    542=22,
    1148=23,
    683=24,
    983=25,
    390=26,
    804=27,
    1005=28,
    1309=29,
    627=30,
    123=31,
    2330=32,
    529=33,
    530=34,
    826=35,
    2007=36,
    381=37,
    382=38,
    158=39,
    202=40,
    131=41,
    903=42,
    160=43,
    161=44,
    2140=45,
    92=46,
    1146=47,
    2168=48,
    1030=49,
    1786=50,
    2030=51,
    561=52,
    752=53,
    1384=54,
    317=55,
    997=56,
    2175=57,
    1508=58,
    1580=59,
    2020=60,
    1254=61,
    1625=62,
    1633=63,
    47=64,
    1573=65,
    2441=66,
    1256=67,
    1

In [None]:
# Define a link predictor model
class LinkPredictor(torch.nn.Module):
    """Scores a candidate edge from the embeddings of its two endpoints.

    The two endpoint embeddings are concatenated and passed through a single
    linear layer that outputs one raw score (logit) per pair.

    :param in_channels: size of one endpoint embedding
    """

    def __init__(self, in_channels):
        super().__init__()
        # Input is two concatenated embeddings, hence twice the width.
        self.lin = torch.nn.Linear(2 * in_channels, 1)

    def forward(self, x_i, x_j):
        """Return one score per row for the pairs ``(x_i[k], x_j[k])``."""
        pair_features = torch.cat((x_i, x_j), dim=-1)
        return self.lin(pair_features)

# Keep the graph data, the frozen encoder and the predictor on one device.
data = data.to(device)
link_predictor = LinkPredictor(gae_model.encoder.out_channels).to(device)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(link_predictor.parameters(), lr=0.01)

# Only the link predictor is optimised, so the encoder output is constant:
# compute it once, without gradients, instead of once per epoch.
gae_model.eval()
with torch.no_grad():
    out = gae_model.encoder(data.x, data.edge_index)

# train_neg_adj_mask is a dense node-by-node boolean mask, not an edge list.
# Turn it into a [2, M] list of candidate negative edges to sample from.
train_neg_candidates = data.train_neg_adj_mask.nonzero(as_tuple=False).t()


def sample_train_negatives(num_samples):
    """Draw ``num_samples`` negative training edges (with replacement)."""
    picks = torch.randint(
        train_neg_candidates.size(1), (num_samples,), device=train_neg_candidates.device
    )
    return train_neg_candidates[:, picks]


def score_edges(edge_index):
    """Return a 1-D tensor of logits for the edges in a [2, E] index tensor."""
    return link_predictor(out[edge_index[0]], out[edge_index[1]]).squeeze(-1)


def edge_labels(num_pos, num_neg):
    """Return float labels: ones for positives followed by zeros for negatives."""
    return torch.cat([torch.ones(num_pos), torch.zeros(num_neg)]).to(out.device)


def link_accuracy(pos_edge_index, neg_edge_index):
    """Fraction of edges whose predicted label (logit > 0) matches the truth."""
    link_predictor.eval()
    with torch.no_grad():
        edge_scores = torch.cat([score_edges(pos_edge_index), score_edges(neg_edge_index)])
    label = edge_labels(pos_edge_index.size(1), neg_edge_index.size(1))
    pred = (edge_scores > 0).float()
    return (pred == label).float().mean().item()


# Train the model
for epoch in range(1, 201):
    link_predictor.train()
    optimizer.zero_grad()
    pos_edge_index = data.train_pos_edge_index
    # Without negatives the classifier can minimise the loss by always
    # predicting "edge"; draw as many negatives as there are positives.
    neg_edge_index = sample_train_negatives(pos_edge_index.size(1))
    edge_scores = torch.cat([score_edges(pos_edge_index), score_edges(neg_edge_index)])
    loss = criterion(edge_scores, edge_labels(pos_edge_index.size(1), neg_edge_index.size(1)))
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        accs = [
            link_accuracy(pos_edge_index, neg_edge_index),
            link_accuracy(data.val_pos_edge_index, data.val_neg_edge_index),
            link_accuracy(data.test_pos_edge_index, data.test_neg_edge_index),
        ]
        print(f"Epoch {epoch:03d}", f"Train: {accs[0]:.4f}", f"Val: {accs[1]:.4f}", f"Test: {accs[2]:.4f}")
