In [1]:
from preprocessing import Preprocessing as pp
from classes import transportnetwork as tn
import networkx as nx

In [74]:
# Load data
G = pp.create_network_from_trailway("../../data/Railway Data_JL.xlsx")
TN = tn.TransportNetwork(G, pos_argument=['lon', 'lat'], time_arguments=['dep_time', 'arr_time'], distance_argument='distance')

Network creation: 


100%|██████████| 69638/69638 [00:05<00:00, 12365.33it/s]


In [380]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu111.html


In [221]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, ChebConv, GATConv, SAGEConv, GINConv, GraphConv, TopKPooling, global_mean_pool
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import *

In [222]:
use_cuda_if_available = False

# Create the Dataset object

In [223]:
data = from_networkx(TN.multidigraph)
data

Data(edge_index=[2, 64155], lon=[2719], lat=[2719], dep_time=[64155], arr_time=[64155], train=[64155], train_max_speed=[64155], day=[64155], distance=[64155], num_nodes=2719)

In [224]:
# Create a one-hot encoding of the node features
data.x = torch.eye(data.num_nodes)[data.x]

In [354]:
import torch

def get_max_deg(data):
    """
    Find the max degree across all nodes in graphs.
    """
    max_deg = 0

    row, col = data.edge_index
    num_nodes = data.num_nodes
    deg = degree(row, num_nodes)
    deg = max(deg).item()
    if deg > max_deg:
        max_deg = int(deg)
    return max_deg

def cat_one_hot_feature(data, max_degree, in_degree=False, cat=True, features=['degree']):

    if 'degree' in features:
        idx, x = data.edge_index[1 if in_degree else 0], data.x
        deg = degree(idx, data.num_nodes, dtype=torch.long)
        deg = F.one_hot(deg, num_classes=max_degree + 1).to(torch.float)

        if x is not None and cat:
            x = x.view(-1, 1) if x.dim() == 1 else x
            data.x = torch.cat([x, deg.to(x.dtype)], dim=-1)
        else:
            data.x = deg

    return data

from torch_geometric.data import Data, InMemoryDataset

class TransportNetworkDataset(InMemoryDataset):
    def __init__(self, root, input_file, transform=None, pre_transform=None):
        self.input_file = input_file
        self.transform = transform
        super(TransportNetworkDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # If you have any raw data files, you can list them here
        return []

    @property
    def processed_file_names(self):
        # This is the name of the processed file that will be saved to disk
        return ['transport_network_data.pt']

    def download(self):
        # Downloading is not needed for this example
        pass

    def process(self):
        data_list = []

        G = pp.create_network_from_trailway(self.input_file)

        # Add features to network
        # Add node degree

        # Create the Data object
        data = from_networkx(G)

        # Create a one-hot encoding with one_hot function
        # Get a tensor with all the nodes ids
        indices = torch.tensor([i for i in range(data.num_nodes)])
        one_hot_encoding = one_hot(indices, num_classes=data.num_nodes)
        data.x = one_hot_encoding

        # Add node features
        # Add node degree
        max_degree = get_max_deg(data)
        data = cat_one_hot_feature(data, max_degree)

        # Split the data
        train_ratio = 0.2
        num_nodes = data.x.shape[0]
        num_train = int(num_nodes * train_ratio)
        idx = [i for i in range(num_nodes)]

        np.random.shuffle(idx)
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[idx[:num_train]] = True
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask[idx[num_train:]] = True

        data.train_mask = train_mask
        data.test_mask = test_mask

        # create a tensor with only 1 class
        data.y = torch.ones(data.num_nodes, dtype=torch.long)

        data = Data(x=data.x, edge_index=data.edge_index, train_mask=data.train_mask, test_mask=data.test_mask, y=data.y)

        print(data.edge_index.shape)

        data_list.append(data)

        if self.transform is not None:
            data_list = [self.transform(data) for data in data_list]

        # Store the processed data
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [355]:
import torch_geometric.transforms as T
# Define augmentations
transform = T.Compose([
    T.RandomNodeSplit(num_splits=2),
])

dataset = TransportNetworkDataset(root='./transport_dataset', input_file="../../data/Railway Data_JL.xlsx", transform=transform)

Processing...


Network creation: 


100%|██████████| 69638/69638 [00:15<00:00, 4509.63it/s]


torch.Size([2, 64155])


Done!


In [320]:
import random

DATA_SPLIT = [0.7, 0.2, 0.1] # Train / val / test split ratio


def split_dataset(dataset, train_data_percent=1.0):
    """
    Splits the data into train / val / test sets.
    Args:
        dataset (list): all graphs in the dataset.
        train_data_percent (float): Fraction of training data
            which is labelled. (default 1.0)
    """
    random.shuffle(dataset)

    n = len(dataset)
    train_split, val_split, test_split = DATA_SPLIT

    train_end = int(n * DATA_SPLIT[0] * train_data_percent)
    val_end = train_end + int(n * DATA_SPLIT[1])
    train_dataset, val_dataset, test_dataset = [i for i in dataset[:train_end]], [i for i in dataset[train_end:val_end]], [i for i in dataset[val_end:]]
    return train_dataset, val_dataset, test_dataset

In [321]:
# Data loader
from torch_geometric.loader import DataLoader

loader = DataLoader(dataset, batch_size=32, shuffle=True)



In [322]:
def train():
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = contrastive_loss(out)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

def contrastive_loss(out):
    batch_size = out.size(0) // 2
    emb_size = out.size(1)
    out1, out2 = out.split(batch_size, dim=0)
    labels = torch.arange(batch_size, device=out.device)
    mask = torch.eye(batch_size * 2, device=out.device)
    sim_matrix = torch.mm(out, out.t()) / emb_size
    sim_matrix = sim_matrix - mask * 1e9
    pos_mask = labels.unsqueeze(1) == labels.unsqueeze(0)
    neg_mask = ~pos_mask
    pos_sim = sim_matrix[pos_mask].view(batch_size, -1)
    neg_sim = sim_matrix[neg_mask].view(batch_size, -1)
    logits = torch.cat([pos_sim, neg_sim], dim=1)
    labels = torch.zeros(batch_size, dtype=torch.long, device=out.device)
    loss = F.cross_entropy(logits, labels)
    return loss

In [323]:
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        self.hidden_channels = hidden_channels
        super(GCN, self).__init__()
        self.conv1 = GCNConv(data.num_node_features, self.hidden_channels)
        self.conv2 = GCNConv(self.hidden_channels, data.num_nodes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [324]:
from torch.optim import Adam

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(hidden_channels=16).to(device)
optimizer = Adam(model.parameters(), lr=0.001)



In [327]:
for epoch in range(10):
    loss = train()
    print(f"Epoch {epoch}, Loss {loss:.4f}")

TypeError: GCN.forward() missing 1 required positional argument: 'edge_index'

In [239]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 1.0000


In [328]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch_geometric.datasets import TUDataset
from torch_geometric.nn import GCNConv
from torch_geometric.transforms import RandomNodeSplit
from torch_geometric.utils import to_networkx

# Load the dataset
dataset = TUDataset(root='data', name='MUTAG')
train_dataset = dataset[:150]
val_dataset = dataset[150:]

# Define the graph encoder
class GraphEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

# Define the contrastive loss function
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.5):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature

    def forward(self, z1, z2):
        batch_size = z1.size(0)
        z = torch.cat([z1, z2], dim=0)
        sim = torch.matmul(z, z.t())
        sim /= self.temperature
        mask = torch.eye(batch_size * 2, device=z.device).bool()
        loss = F.cross_entropy(sim[mask], torch.arange(batch_size * 2, device=z.device))
        return loss

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Extracting data/MUTAG/MUTAG.zip
Processing...
Done!


In [330]:
transform = RandomNodeSplit(num_splits=2)

In [331]:
def train(model, loader, optimizer, loss_fn, device):
    model.train()
    train_loss = 0
    for data in loader:
        data = data.to(device)
        data_aug1 = transform(data)
        data_aug2 = transform(data)
        z1 = model(data.x, data.edge_index)
        z2 = model(data_aug1.x, data_aug1.edge_index)
        z3 = model(data_aug2.x, data_aug2.edge_index)
        loss = loss_fn(z1, z2) + loss_fn(z1, z3)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.num_graphs
        return train_loss / len(loader.dataset)

In [332]:
def evaluate(model, loader, loss_fn, device):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            data_aug1 = transform(data)
            data_aug2 = transform(data)
            z1 = model(data.x, data.edge_index)
            z2 = model(data_aug1.x, data_aug1.edge_index)
            z3 = model(data_aug2.x, data_aug2.edge_index)
            loss = loss_fn(z1, z2) + loss_fn(z1, z3)
            val_loss += loss.item() * data.num_graphs
            return val_loss / len(loader.dataset)

In [333]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.01
hidden_channels = 32
out_channels = 16
temperature = 0.5
num_epochs = 100

In [334]:
model = GraphEncoder(dataset.num_node_features, hidden_channels, out_channels).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = ContrastiveLoss(temperature=temperature)

In [348]:
def collate_fn(data_list):
    batched_data = {}
    for key in data_list[0].keys:
        batched_data[key] = torch.cat([data[key] for data in data_list], dim=0)
    return batched_data

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

for data in train_loader:
    print(data)

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 38 but got size 36 for tensor number 1 in the list.

In [346]:
for epoch in range(1, num_epochs + 1):
    train_loss = train(model, train_loader, optimizer, loss_fn, device)
    val_loss = evaluate(model, DataLoader(val_dataset, batch_size=32), loss_fn, device)
    print(f'Epoch: {epoch:02d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torch_geometric.data.data.Data'>

# Create the dataset

In [404]:
import torch

def get_max_deg(data):
    """
    Find the max degree across all nodes in graphs.
    """
    max_deg = 0

    row, col = data.edge_index
    num_nodes = data.num_nodes
    deg = degree(row, num_nodes)
    deg = max(deg).item()
    if deg > max_deg:
        max_deg = int(deg)
    return max_deg

def cat_one_hot_feature(data, max_degree, in_degree=False, cat=True, features=['degree']):

    if 'degree' in features:
        idx, x = data.edge_index[1 if in_degree else 0], data.x
        deg = degree(idx, data.num_nodes, dtype=torch.long)
        deg = F.one_hot(deg, num_classes=max_degree + 1).to(torch.float)

        if x is not None and cat:
            x = x.view(-1, 1) if x.dim() == 1 else x
            data.x = torch.cat([x, deg.to(x.dtype)], dim=-1)
        else:
            data.x = deg

    return data

from torch_geometric.data import Data, InMemoryDataset, Dataset

class TransportNetworkDataset(InMemoryDataset):
    def __init__(self, root, input_file, transform=None, pre_transform=None):
        self.input_file = input_file
        self.transform = transform
        super(TransportNetworkDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # If you have any raw data files, you can list them here
        return []

    @property
    def processed_file_names(self):
        # This is the name of the processed file that will be saved to disk
        return ['transport_network_data.pt']

    def download(self):
        # Downloading is not needed for this example
        pass

    def process(self):
        data_list = []

        G = pp.create_network_from_trailway(self.input_file)

        # Add features to network
        # Add node degree

        # Create the Data object
        data = from_networkx(G)

        # Create a one-hot encoding with one_hot function
        # Get a tensor with all the nodes ids
        indices = torch.tensor([i for i in range(data.num_nodes)])
        one_hot_encoding = one_hot(indices, num_classes=data.num_nodes)
        data.x = one_hot_encoding

        # Add node features
        # Add node degree
        max_degree = get_max_deg(data)
        data = cat_one_hot_feature(data, max_degree)

        # Split the data
        DATA_SPLIT = [0.7, 0.2, 0.1]
        num_nodes = data.x.shape[0]
        num_train = int(num_nodes * DATA_SPLIT[0])
        num_val = int(num_nodes * DATA_SPLIT[1])
        num_test = int(num_nodes * DATA_SPLIT[2])
        idx = [i for i in range(num_nodes)]

        np.random.shuffle(idx)
        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[idx[:num_train]] = True
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask[idx[num_train:num_train + num_val]] = True
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask[idx[num_train + num_val:]] = True

        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask

        # create a tensor with only 1 class
        data.y = torch.ones(data.num_nodes, dtype=torch.long)

        data = Data(x=data.x, edge_index=data.edge_index, train_mask=data.train_mask, test_mask=data.test_mask, y=data.y)

        print(data.edge_index.shape)

        data_list.append(data)

        if self.transform is not None:
            data_list = [self.transform(data) for data in data_list]

        # Store the processed data
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


class MyDataset(Dataset):
	"""
	Dataset class that returns a graph and its augmented view in get() call.
	Augmentations are applied sequentially based on the augment_list.
	"""

	def __init__(self, dataset):
		super(MyDataset, self).__init__()

		self.dataset = dataset

	def get_positive_sample(self, current_graph):

         node_mask_prob = 0.2
         edge_mask_prob = 0.1
         edge_perturb_prob = 0.1

         graph_temp = current_graph

         if node_mask_prob > 0:
            num_nodes = graph_temp.x.size(0)
            mask = torch.rand(num_nodes) < node_mask_prob
            mask = mask.view(-1, 1).to(torch.float)
            graph_temp.x = graph_temp.x * mask

         if edge_mask_prob > 0 or edge_perturb_prob > 0:
            # Convert edge_index to a COO format
            graph_temp.edge_index, _ = add_self_loops(graph_temp.edge_index, num_nodes=graph_temp.num_nodes)

            # Dropout edges
            if edge_mask_prob > 0:
                graph_temp.edge_index, _ = dropout_adj(graph_temp.edge_index, p=edge_mask_prob)

            # Perturb edges
            if edge_perturb_prob > 0:
                edge_mask = torch.ones(graph_temp.edge_index.size(1)).to(torch.bool)
                edge_mask = F.dropout(edge_mask, p=edge_perturb_prob, training=True)
                mask = edge_mask.view(1, -1).repeat(2, 1)
                perturb = torch.randn(graph_temp.edge_index.size(1), 2) * edge_mask.float().unsqueeze(-1)
                graph_temp.edge_index = (graph_temp.edge_index + perturb).long()

         return graph_temp

	def get(self, idx):
		graph_anchor = self.dataset[idx]
		graph_pos = self.get_positive_sample(graph_anchor)
		return PairData(graph_anchor.edge_index, graph_anchor.x, graph_pos.edge_index, graph_pos.x)

	def len(self):
		return len(self.dataset)


class PairData(Data):
	"""
	Utility function to return a pair of graphs in dataloader.
	Adapted from https://pytorch-geometric.readthedocs.io/en/latest/notes/batching.html
	"""

	def __init__(self, edge_index_anchor = None, x_anchor = None, edge_index_pos = None, x_pos = None):
		super().__init__()
		self.edge_index_anchor = edge_index_anchor
		self.x_anchor = x_anchor

		self.edge_index_pos = edge_index_pos
		self.x_pos = x_pos

	def __inc__(self, key, value, *args, **kwargs):
		if key == "edge_index_anchor":
			return self.x_anchor.size(0)
		if key == "edge_index_pos":
			return self.x_pos.size(0)
		else:
			return super().__inc__(key, value, *args, **kwargs)


In [405]:
dataset = TransportNetworkDataset(root='./transport_dataset', input_file="../../data/Railway Data_JL.xlsx", transform=transform)

Processing...


Network creation: 


100%|██████████| 69638/69638 [00:13<00:00, 5168.80it/s]


torch.Size([2, 64155])


Done!


In [383]:
loader = DataLoader(MyDataset(dataset), num_workers=2, batch_size=64,
						shuffle=True)

# loader = DataLoader(MyDataset(dataset), num_workers=2, batch_size=64,
# 						shuffle=True, follow_batch=["x_anchor", "x_pos"])

In [385]:
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GCN(nn.Module):
    def __init__(self, feat_dim, hidden_dim, n_layers):
        super(GCN, self).__init__()

        self.convs = nn.ModuleList()
        self.acts = nn.ModuleList()
        self.n_layers = n_layers

        a = nn.ReLU()
        for i in range(n_layers):
            start_dim = hidden_dim if i else feat_dim
            conv = GCNConv(start_dim, hidden_dim)
            self.convs.append(conv)
            self.acts.append(a)

    def forward(self, data):
        x, edge_index, batch = data
        for i in range(self.n_layers):
            x = self.convs[i](x, edge_index)
            x = self.acts[i](x)
        return x

In [387]:
model = GCN(feat_dim=128, hidden_dim=64, n_layers=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(model)

GCN(
  (convs): ModuleList(
    (0): GCNConv(128, 64)
    (1-2): 2 x GCNConv(64, 64)
  )
  (acts): ModuleList(
    (0-2): 3 x ReLU()
  )
)


In [398]:
def infonce(readout_anchor, readout_positive, tau=0.5, norm=True):
    """
    The InfoNCE (NT-XENT) loss in contrastive learning. The implementation
    follows the paper `A Simple Framework for Contrastive Learning of
    Visual Representations <https://arxiv.org/abs/2002.05709>`.
    Args:
        readout_anchor, readout_positive: Tensor of shape [batch_size, feat_dim]
        tau: Float. Usually in (0,1].
        norm: Boolean. Whether to apply normlization.
    """

    batch_size = readout_anchor.shape[0]
    sim_matrix = torch.einsum("ik,jk->ij", readout_anchor, readout_positive)

    if norm:
        readout_anchor_abs = readout_anchor.norm(dim=1)
        readout_positive_abs = readout_positive.norm(dim=1)
        sim_matrix = sim_matrix / torch.einsum("i,j->ij", readout_anchor_abs, readout_positive_abs)

    sim_matrix = torch.exp(sim_matrix / tau)
    pos_sim = sim_matrix[range(batch_size), range(batch_size)]
    loss = pos_sim / (sim_matrix.sum(dim=1) - pos_sim)
    loss = - torch.log(loss).mean()
    return loss


def run(epoch, mode, dataloader):
	if mode == "train":
		model.train()
	elif mode == "val" or mode == "test":
		model.eval()

	losses = []
	for data in dataloader:
		data.to(device)

		# readout_anchor is the embedding of the original datapoint x on passing through the model
		readout_anchor = model((data.x_anchor,
								data.edge_index_anchor, data.x_anchor_batch))

		# readout_positive is the embedding of the positively augmented x on passing through the model
		readout_positive = model((data.x_pos,
									data.edge_index_pos, data.x_pos_batch))

		# negative samples for calculating the contrastive loss is computed in contrastive_fn
		loss = infonce(readout_anchor, readout_positive)

		if mode == "train":
			# backprop
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

		# keep track of loss values
		losses.append(loss.item())

	# gather the results for the epoch
	epoch_loss = sum(losses) / len(losses)
	return epoch_loss

In [399]:
import os

if not os.path.isdir(os.path.join("logs", "ssl_model")):
    os.makedirs(os.path.join("logs", "ssl_model"))

best_train_loss, best_val_loss = float("inf"), float("inf")

for epoch in range(20):
    train_loss = run(epoch, "train", loader)
    val_loss = run(epoch, "val", loader)
    log = "Epoch {}, Train Loss: {:.3f}, Val Loss: {:.3f}"
    print(log.format(epoch, train_loss, val_loss))

    # save model
    is_best_loss = False
    if val_loss < best_val_loss:
        best_epoch, best_train_loss, best_val_loss, is_best_loss = \
                                            epoch, train_loss, val_loss, True

    model.save_checkpoint(os.path.join("logs", args.save), optimizer, epoch,
                          best_train_loss, best_val_loss, is_best_loss)

print("Train Loss at epoch {} (best model): {:.3f}".format(best_epoch, best_train_loss))
print("Val Loss at epoch {} (best model): {:.3f}".format(best_epoch, best_val_loss))



RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/anthony/Desktop/Group Project/XYZnetwork_lib/venv/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/anthony/Desktop/Group Project/XYZnetwork_lib/venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/anthony/Desktop/Group Project/XYZnetwork_lib/venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/anthony/Desktop/Group Project/XYZnetwork_lib/venv/lib/python3.10/site-packages/torch_geometric/data/dataset.py", line 258, in __getitem__
    data = self.get(self.indices()[idx])
  File "/tmp/ipykernel_211312/691676025.py", line 159, in get
    graph_pos = self.get_positive_sample(graph_anchor)
  File "/tmp/ipykernel_211312/691676025.py", line 150, in get_positive_sample
    edge_mask = F.dropout(edge_mask, p=edge_perturb_prob, training=True)
  File "/home/anthony/Desktop/Group Project/XYZnetwork_lib/venv/lib/python3.10/site-packages/torch/nn/functional.py", line 1252, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
RuntimeError: result type Float can't be cast to the desired output type Bool


In [2]:
G = pp.create_network_from_trailway("../../data/Railway Data_JL.xlsx")
TN = tn.TransportNetwork(G, pos_argument=['lon', 'lat'], time_arguments=['dep_time', 'arr_time'], distance_argument='distance')

Network creation: 


100%|██████████| 69638/69638 [00:15<00:00, 4632.59it/s]


In [3]:
def get_max_deg(data):
    """
    Find the max degree across all nodes in graphs.
    """
    max_deg = 0

    row, col = data.edge_index
    num_nodes = data.num_nodes
    deg = degree(row, num_nodes)
    deg = max(deg).item()
    if deg > max_deg:
        max_deg = int(deg)
    return max_deg

def cat_one_hot_feature(data, max_degree, in_degree=False, cat=True, features=['degree']):

    feature_output = []
    for feature in features:
        if 'degree' in features:
            idx, x = data.edge_index[1 if in_degree else 0], data.x
            deg = degree(idx, data.num_nodes, dtype=torch.long)
            deg = F.one_hot(deg, num_classes=max_degree + 1).to(torch.float)
            feature_output.append(deg)

    for f in feature_output:
        if x is not None and cat:
            data.x = torch.cat([data.x, f], dim=-1)
        else:
            data.x = f

    return data

In [7]:
from torch_geometric.utils import from_networkx, degree
import torch

data = from_networkx(TN.get_higher_complexity())
max_degree = get_max_deg(data)
print(max_degree)
data = cat_one_hot_feature(data, max_degree, in_degree=False, cat=True, features=['degree'])

num_nodes = data.num_nodes
train_ratio = 0.8
val_ratio = 0.2

num_train_nodes = int(num_nodes * train_ratio)
num_val_nodes = int(num_nodes * val_ratio)

indices = np.arange(num_nodes)
np.random.shuffle(indices)

train_indices = indices[:num_train_nodes]
val_indices = indices[num_train_nodes:num_train_nodes + num_val_nodes]

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[train_indices] = True
val_mask[val_indices] = True

data.train_mask = train_mask
data.val_mask = val_mask

480


NameError: name 'torch' is not defined

In [5]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Sequential, Linear, ReLU

class GNN(torch.nn.Module):
    def __init__(self):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(data.num_node_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.fc = Sequential(Linear(128, 64), ReLU(), Linear(64, 32))

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = self.fc(x)
        return x

NameError: name 'torch' is not defined

In [None]:
import random

def augment_data(data, node_mask_rate=0.2, edge_perturb_rate=0.2):
    # Node attribute masking
    node_mask = torch.rand(data.num_nodes) < node_mask_rate
    data.x[node_mask] = 0

    # Edge perturbation
    edge_indices = torch.tensor(list(data.edge_index.T.cpu().numpy()))
    edge_mask = torch.rand(len(edge_indices)) < edge_perturb_rate
    edge_indices[edge_mask] = torch.randint(0, data.num_nodes, (edge_mask.sum(), 2)).to(edge_indices.device)
    data.edge_index = edge_indices.T
    return data

class GraphCL_GNN(torch.nn.Module):
    def __init__(self):
        super(GraphCL_GNN, self).__init__()
        self.gnn = GNN()

    def forward(self, data, data_aug):
        x, edge_index = data.x, data.edge_index
        x_aug, edge_index_aug = data_aug.x, data_aug.edge_index

        return self.gnn(x, edge_index), self.gnn(x_aug, edge_index_aug)

In [None]:
def info_nce_loss(z1, z2, temperature=0.1):
    z1 = F.normalize(z1, p=2, dim=-1)
    z2 = F.normalize(z2, p=2, dim=-1)
    sim_matrix = torch.matmul(z1, z2.t())

    pos_sim = torch.diag(sim_matrix)
    neg_sim = torch.exp(sim_matrix) / (torch.exp(sim_matrix).sum(dim=-1, keepdim=True) - torch.exp(pos_sim).unsqueeze(-1))
    neg_sim = neg_sim.sum(dim=-1)

    loss = -torch.log(torch.exp(pos_sim) / (torch.exp(pos_sim) + neg_sim)).mean()
    return loss / temperature

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = GraphCL_GNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
from torch_geometric.utils import subgraph

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Augment data
    G_pyg_train = data.subgraph(train_mask)
    G_pyg_train_aug = augment_data(G_pyg_train)

    # Forward pass with the original and augmented data
    z1_train, z2_train = model(G_pyg_train, G_pyg_train_aug.to(device))

    # Compute InfoNCE loss for training set
    train_loss = info_nce_loss(z1_train, z2_train)

    # Backpropagation
    train_loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
            # Augment data
            G_pyg_val = data.subgraph(val_mask)
            G_pyg_val_aug = augment_data(G_pyg_val)

            # Forward pass with the original and augmented data
            z1_val, z2_val = model(G_pyg_val, G_pyg_val_aug.to(device))

            # Compute InfoNCE loss for validation set
            val_loss = info_nce_loss(z1_val, z2_val)

    print(f'Epoch: {epoch + 1}, Train Loss: {train_loss.item()}, Val Loss: {val_loss.item()}')

    # Save the best model weights based on validation loss
    if val_loss.item() < best_loss:
        best_loss = val_loss.item()
        torch.save(model.state_dict(), "best_model_weights.pth")

In [None]:
model.eval()
with torch.no_grad():
    z, _ = model(data, data)  # You can use the same graph twice as input, as the second graph is not used in the forward pass when not training

In [None]:
from visualisation.visualisation import *

plot_tsne_embedding(z)


In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Set the number of clusters you want to obtain
n_clusters = 4

# Convert the embeddings tensor to a NumPy array
embeddings = z

# Run K-means clustering on the embeddings
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)

# Get the cluster labels for each node
cluster_labels = kmeans.labels_

cluster_labels

In [None]:
cluster_labels