In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import from_networkx
import numpy as np
import random

# Internal imports
from src.utils import memoize
from src.deepLearning import get_similar_nodes
from src.deepLearning import node2vec_embedding

from src.NetworkGraphs import NetworkGraphs

from src.machineLearning import get_communities
from src.visualisation_src.ML_visualisation import generate_static_cluster
from src.visualisation_src.DL_visualisation import TSNE_visualisation

Using TensorFlow backend.


In [5]:
class GAE(torch.nn.Module):
    """Graph autoencoder made of one ``GCNConv`` encoder layer and a linear decoder.

    No activation is applied between the encoder and the decoder.

    :param num_features: size of each input node-feature vector
    :param hidden_dim: output size of the ``GCNConv`` encoder
    :param embed_dim: output size of the linear decoder
    """

    def __init__(self, num_features, hidden_dim, embed_dim):
        super().__init__()
        # Registration order (encoder, then decoder) fixes the state_dict layout.
        self.encoder = GCNConv(num_features, hidden_dim)
        self.decoder = torch.nn.Linear(hidden_dim, embed_dim)

    def forward(self, data):
        """Encode ``data.x`` over ``data.edge_index`` and decode the result.

        :param data: object exposing ``x`` and ``edge_index`` attributes
        :return: decoder output computed from the encoder output
        """
        hidden = self.encoder(data.x, data.edge_index)
        return self.decoder(hidden)


In [6]:
class GAE2(torch.nn.Module):
    """Graph autoencoder made of one ``SAGEConv`` encoder layer and a linear decoder.

    No activation is applied between the encoder and the decoder.

    :param num_features: size of each input node-feature vector
    :param hidden_dim: output size of the ``SAGEConv`` encoder
    :param embed_dim: output size of the linear decoder
    """

    def __init__(self, num_features, hidden_dim, embed_dim):
        super().__init__()
        # Registration order (encoder, then decoder) fixes the state_dict layout.
        self.encoder = SAGEConv(num_features, hidden_dim)
        self.decoder = torch.nn.Linear(hidden_dim, embed_dim)

    def forward(self, data):
        """Encode ``data.x`` over ``data.edge_index`` and decode the result.

        :param data: object exposing ``x`` and ``edge_index`` attributes
        :return: decoder output computed from the encoder output
        """
        node_features = data.x
        connectivity = data.edge_index
        latent = self.encoder(node_features, connectivity)
        return self.decoder(latent)


In [7]:
def unsupervised_loss(recon_x, x):
    """Reconstruction loss: mean squared error between two tensors.

    :param recon_x: reconstructed tensor
    :param x: target tensor
    :return: result of ``F.mse_loss(recon_x, x)``
    """
    return F.mse_loss(recon_x, x)


In [8]:
def contrastive_loss(embeddings, positive_pairs, negative_pairs, margin=1.0):
    """Margin-based contrastive loss over pairs of embedding rows.

    Positive pairs are penalised by their squared Euclidean distance; negative
    pairs are penalised by the squared shortfall of their distance below
    ``margin``. The two mean terms are averaged.

    :param embeddings: 2-D tensor indexed by row
    :param positive_pairs: 2-column index array; column 0 and column 1 select the paired rows
    :param negative_pairs: 2-column index array with the same layout
    :param margin: distance below which negative pairs are penalised
    :return: scalar loss tensor
    """

    def _pair_distances(pairs):
        # Euclidean distance between the two rows referenced by each pair.
        left = embeddings[pairs[:, 0]]
        right = embeddings[pairs[:, 1]]
        return (left - right).norm(dim=1)

    attract_term = _pair_distances(positive_pairs).pow(2).mean()
    repel_term = (margin - _pair_distances(negative_pairs)).clamp(min=0.0).pow(2).mean()
    return (attract_term + repel_term) * 0.5

In [9]:
def pairs_to_indices(data, positive_pairs, negative_pairs):
    """Translate pairs of node identifiers into pairs of integer positions.

    Positions follow the iteration order of ``data.mapping``. Pairs in which
    either node is missing from ``data.mapping`` are dropped.

    :param data: object exposing a ``mapping`` iterable of node identifiers
    :param positive_pairs: iterable of ``(u, v)`` node pairs
    :param negative_pairs: iterable of ``(u, v)`` node pairs
    :return: ``(positive_indices, negative_indices)``, each an ``int64`` array of
        shape ``(k, 2)``; ``k`` may be 0
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    node_to_index = {node: i for i, node in enumerate(data.mapping)}

    def _to_index_array(pairs):
        # Shared conversion for both pair sets.
        rows = [
            (node_to_index[u], node_to_index[v])
            for u, v in pairs
            if u in node_to_index and v in node_to_index
        ]
        # reshape keeps two columns even when no pair survives, so callers can
        # still slice with [:, 0] / [:, 1].
        return np.array(rows, dtype=np.int64).reshape(-1, 2)

    return _to_index_array(positive_pairs), _to_index_array(negative_pairs)

In [10]:
def _has_negative_candidate(nodes, edge_list, directed):
    """Return True if at least one ordered pair of distinct nodes is not an edge."""
    occupied = set()
    for u, v in edge_list:
        if u == v:
            # Self-loops never collide with a sample of two distinct nodes.
            continue
        occupied.add((u, v))
        if not directed:
            occupied.add((v, u))
    return len(occupied) < len(nodes) * (len(nodes) - 1)


def generate_pairs(networkx_graph, num_negative_pairs=None, seed=None):
    """Build positive (edge) and negative (non-edge) node pairs from a graph.

    Positive pairs are the graph's edges. Negative pairs are drawn by rejection
    sampling: two distinct nodes are sampled and kept when
    ``networkx_graph.has_edge(u, v)`` is false. The same negative pair may be
    drawn more than once.

    :param networkx_graph: graph exposing ``nodes()``, ``edges``, ``has_edge`` and ``is_directed``
    :param num_negative_pairs: number of negative pairs; defaults to the number of positive pairs
    :param seed: optional seed for a private random generator; when ``None`` the
        global ``random`` module state is used
    :return: ``(positive_pairs, negative_pairs)`` arrays with one ``[u, v]`` row per pair;
        an empty result has shape ``(0, 2)``
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    :raises ValueError: if negative pairs are requested but the graph has no
        pair of distinct nodes without an edge
    """
    nodes = list(networkx_graph.nodes())
    rng = random if seed is None else random.Random(seed)

    # Positive pairs
    edge_list = [[u, v] for u, v in networkx_graph.edges]
    positive_pairs = np.array(edge_list) if edge_list else np.empty((0, 2), dtype=np.int64)

    # Negative pairs
    if num_negative_pairs is None:
        num_negative_pairs = len(positive_pairs)

    sampled = []
    if num_negative_pairs > 0:
        # Without this check the sampling loop below never terminates on a
        # graph in which every pair of distinct nodes is connected.
        if not _has_negative_candidate(nodes, edge_list, networkx_graph.is_directed()):
            raise ValueError(
                "Cannot sample negative pairs: the graph has no pair of distinct nodes without an edge"
            )
        while len(sampled) < num_negative_pairs:
            u, v = rng.sample(nodes, 2)
            if not networkx_graph.has_edge(u, v):
                sampled.append([u, v])
    negative_pairs = np.array(sampled) if sampled else np.empty((0, 2), dtype=np.int64)

    return positive_pairs, negative_pairs

In [11]:
def train(model, optimizer, data, device):
    """Run a single optimisation step on the nodes selected by ``data.train_mask``.

    :param model: module called as ``model(data)``
    :param optimizer: optimizer bound to the model's parameters
    :param data: object exposing ``to(device)``, ``x`` and ``train_mask``
    :param device: target device passed to ``data.to``
    :return: the step's loss as a Python float
    """
    optimizer.zero_grad()
    model.train()

    predictions = model(data.to(device))
    mask = data.train_mask
    # Reconstruction objective; the contrastive alternative would be
    # contrastive_loss(predictions, data.positive_pairs, data.negative_pairs).
    step_loss = unsupervised_loss(predictions[mask], data.x[mask])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()


In [12]:
def test(model, data, device):
    model.eval()
    out = model(data.to(device))
    loss = unsupervised_loss(out[data.test_mask], data.x[data.test_mask])
    # loss = contrastive_loss(out, data.positive_pairs, data.negative_pairs)
    return loss.item()

In [13]:
def train_model(model, optimizer, data, device, epochs):
    """Train for ``epochs`` epochs and restore the weights with the lowest test loss.

    Each epoch runs one ``train`` step followed by one ``test`` evaluation and
    prints both losses.

    :param model: module to train; modified in place
    :param optimizer: optimizer bound to the model's parameters
    :param data: graph data forwarded to ``train`` and ``test``
    :param device: device forwarded to ``train`` and ``test``
    :param epochs: number of epochs to run
    :return: the same ``model`` object, loaded with the best weights seen; left
        as-is when no epoch produced a finite improvement (e.g. ``epochs == 0``)
    """
    best_loss = float('inf')
    best_weights = None
    for epoch in range(1, epochs + 1):
        loss = train(model, optimizer, data, device)
        test_loss = test(model, data, device)
        if test_loss < best_loss:
            best_loss = test_loss
            # state_dict() returns tensors that share storage with the live
            # parameters, so later optimizer steps would overwrite the
            # snapshot. Clone each tensor to freeze this epoch's weights.
            best_weights = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        print('Epoch: {:03d}, Loss: {:.5f}, Test Loss: {:.5f}'.format(epoch, loss, test_loss))
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model


In [14]:
def preprocess_data(networkx_graph, node_features):
    """
    :Function: Convert a networkx graph into a torch_geometric ``Data`` object with
        node features, train/test masks and a node-to-position mapping
    :param networkx_graph: Networkx graph
    :param node_features: Node features, one row per node; rows are taken in the
        order given (presumably the graph's node order - verify against the caller)
    :return: data with ``x`` (float tensor), boolean ``train_mask`` / ``test_mask``
        and ``mapping`` (node -> position in ``networkx_graph.nodes``)
    :rtype: torch_geometric.data.Data
    :raises ValueError: if the number of feature rows differs from the number of nodes
    """
    # Convert to torch_geometric.data.Data
    data = from_networkx(networkx_graph)
    num_nodes = data.num_nodes

    # Add node features
    x = torch.tensor(node_features, dtype=torch.float)
    if x.dim() == 0 or x.size(0) != num_nodes:
        raise ValueError(
            "node_features must have one row per node: expected {} rows, got shape {}".format(
                num_nodes, tuple(x.shape)
            )
        )
    data.x = x

    # Add train and test mask: first half of the nodes trains, second half tests.
    # Boolean masks are used because indexing with uint8 tensors is deprecated.
    split = num_nodes // 2
    data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.train_mask[:split] = True
    data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.test_mask[split:] = True
    data.mapping = {node: i for i, node in enumerate(networkx_graph.nodes)}

    return data

In [59]:
# Build the project's NetworkGraphs object from the Railway CSV. The path is
# relative to the notebook's working directory. Later cells use its `.Graph`
# attribute as the networkx graph.
networkGraphs = NetworkGraphs('../../datasets/Railway.csv', type="RAILWAY")

Excluded 0 stations
twopi
sfdp


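Option 1: use one-hot node features (an identity matrix with one row per node). Note that the metric-features cell further down also assigns `data`; whichever of the two cells runs last is the one used for training.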

In [60]:
# One-hot features: an identity matrix, so the feature dimension equals the
# number of nodes.
num_nodes = networkGraphs.Graph.number_of_nodes()
one_hot_features = np.identity(num_nodes)
data = preprocess_data(networkGraphs.Graph, one_hot_features)

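Option 2: use graph-metric node features (min-max normalised degree, PageRank, k-core and triangle values). Running this cell replaces the `data` object built from the one-hot features above.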

In [63]:
from src.metrics import get_metrics

# Graph metrics used as node features; each becomes one feature column.
METRIC_NAMES = ['degree', 'pagerank', 'kcore', 'triangles']

features = []
for metric in METRIC_NAMES:
    df = get_metrics(networkGraphs, metric, directed=False, multi=False)
    # The second column is read as the metric value.
    # NOTE(review): assumes rows follow the graph's node order - confirm against get_metrics.
    np_arr = np.array(df.iloc[:, 1].values, dtype=float)
    value_range = np_arr.max() - np_arr.min()
    if value_range == 0:
        # A constant metric would divide by zero and fill the column with NaN;
        # it carries no information, so map it to zeros instead.
        np_arr = np.zeros_like(np_arr)
    else:
        # Min-max scale to [0, 1] so all metrics share a comparable range.
        np_arr = (np_arr - np_arr.min()) / value_range
    features.append(np_arr)
# Stack to shape (number of rows per metric, number of metrics).
features = np.array(features).T
data = preprocess_data(networkGraphs.Graph, features)

[0;92mCACHE: Using cache for compute_nodes_degree, hash: [0;93m2c3bff11aa5e34db6404f9948b036a49
[0;92mCACHE: Using cache for compute_page_rank, hash: [0;93m93fecb522e302e4ca9f35fd267594fc8
[0;92mCACHE: Using cache for compute_kcore, hash: [0;93mdba4e84a265a1128809d8b2d04e0bf42
[0;92mCACHE: Using cache for compute_triangles, hash: [0;93m3ca5a85187c716d839842022e2e2e41b


In [64]:
# Seed every random source used below so that weight initialisation and
# negative-pair sampling are reproducible across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_dim = 256
# The decoder output is compared against data.x by the MSE loss, so it must
# have the same width as the input features.
embed_dim = data.num_features
gae_model = GAE2(data.num_features, hidden_dim, embed_dim).to(device)
optimizer = torch.optim.Adam(gae_model.parameters(), lr=0.001)
epochs = 300

# Pair indices are attached to `data` for the contrastive loss, which is
# currently commented out in train()/test(); the MSE objective ignores them.
positive_pairs, negative_pairs = generate_pairs(networkGraphs.Graph, num_negative_pairs=1000)
positive_indices, negative_indices = pairs_to_indices(data, positive_pairs, negative_pairs)
data.positive_pairs = positive_indices
data.negative_pairs = negative_indices

gae_model = train_model(gae_model, optimizer, data, device, epochs)

Epoch: 001, Loss: 0.10681, Test Loss: 0.09578
Epoch: 002, Loss: 0.07120, Test Loss: 0.06672
Epoch: 003, Loss: 0.04581, Test Loss: 0.04870
Epoch: 004, Loss: 0.03109, Test Loss: 0.03891
Epoch: 005, Loss: 0.02439, Test Loss: 0.03451
Epoch: 006, Loss: 0.02300, Test Loss: 0.03316
Epoch: 007, Loss: 0.02449, Test Loss: 0.03272
Epoch: 008, Loss: 0.02659, Test Loss: 0.03171
Epoch: 009, Loss: 0.02771, Test Loss: 0.02971
Epoch: 010, Loss: 0.02737, Test Loss: 0.02695
Epoch: 011, Loss: 0.02573, Test Loss: 0.02375
Epoch: 012, Loss: 0.02312, Test Loss: 0.02047
Epoch: 013, Loss: 0.01989, Test Loss: 0.01748
Epoch: 014, Loss: 0.01653, Test Loss: 0.01512
Epoch: 015, Loss: 0.01344, Test Loss: 0.01353
Epoch: 016, Loss: 0.01087, Test Loss: 0.01272
Epoch: 017, Loss: 0.00895, Test Loss: 0.01265
Epoch: 018, Loss: 0.00777, Test Loss: 0.01321
Epoch: 019, Loss: 0.00729, Test Loss: 0.01413
Epoch: 020, Loss: 0.00735, Test Loss: 0.01499
Epoch: 021, Loss: 0.00764, Test Loss: 0.01545
Epoch: 022, Loss: 0.00784, Test Lo

In [65]:
# Extract node embeddings from the trained encoder only (the decoder is skipped).
gae_model.eval()
with torch.no_grad():
    x, edge_index = data.x, data.edge_index
    # Move the result to the CPU so it can be handed to the clustering and
    # plotting helpers after GPU training; this is a no-op on CPU.
    embeddings = gae_model.encoder(x, edge_index).cpu()

In [66]:
# Inspect the embedding matrix size; the second dimension is the encoder's
# output width (hidden_dim).
embeddings.shape

torch.Size([2719, 256])

In [71]:
# Cluster the nodes on their embeddings, then render a t-SNE view of the
# embeddings and a static cluster plot, both coloured by the same clusters.
clusters = get_communities(
    networkGraphs,
    method='kmeans',
    noOfClusters=4,
    embedding=embeddings,
)
TSNE_visualisation(
    networkGraphs,
    embeddings,
    filename='TSNE_GAE.html',
    clusters=clusters,
)
generate_static_cluster(
    networkGraphs,
    clusters,
    'sfdp.html',
    'kmeans',
    layout_='map',
    nbr=10,
)

100%|██████████| 2719/2719 [00:00<00:00, 4398.66it/s]


[0;92mCACHE: Using cache for generate_edge_trace, hash: [0;93mc5f7347b0a46338ab8e62d3150868a92
[0;92mCACHE: Using cache for get_layout, hash: [0;93mcc134767592b7e253f0b88e5ef755614


[0;94mCACHE: Computing value for kmeans_clustering, hash: [0;93mac7a588450fc9772a9ab29e8fe8efcef 


100%|██████████| 2719/2719 [00:00<00:00, 5672.96it/s]


[0;92mCACHE: Using cache for generate_edge_trace, hash: [0;93mc5f7347b0a46338ab8e62d3150868a92
[0;92mCACHE: Using cache for get_layout, hash: [0;93mcc134767592b7e253f0b88e5ef755614
