In [1]:
import glob
import numpy as np
from tqdm import tqdm, trange
import argparse
import json
import math
import random
import tensorflow as tf
import os
import torch
from tensorflow import keras
from keras import backend as K
import torch.nn as nn
from sklearn.utils import shuffle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install tqdm
!pip install torch
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install texttable
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!cat /proc/cpuinfo | grep model\ name
!cat /proc/meminfo | grep MemTotal
!/opt/bin/nvidia-smi

model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
MemTotal:       13297228 kB
/bin/bash: /opt/bin/nvidia-smi: No such file or directory


# Data Processing

### Loading the test and traing data

In [5]:
# File is loaded into a list. The list contains dictionary of each *json files. (One json files consists of p
# pair graphs with its labels, edges and ged)

path_to_train = '/content/drive/MyDrive/Capstone_Project/syn_data/train/' #change the file location to the location where the train and test file is stored
train = []

for file_name in [file for file in os.listdir(path_to_train) if file.endswith('json')]:
    with open(path_to_train + file_name) as json_file:
        data = json.load(json_file)
        train.append(data)

path_to_test = '/content/drive/MyDrive/Capstone_Project/syn_data/test/' #change the file location to the location where the train and test file is stored
test = []

for file_name in [file for file in os.listdir(path_to_test) if file.endswith('json')]:
    with open(path_to_test + file_name) as json_file:
        data = json.load(json_file)
        test.append(data)

### Getting the unique labels

In [6]:
# this is used to gather the unique labels of all the graphs

train_and_test = train + test

labels = [(d["labels_1"], d["labels_2"]) for d in train_and_test]

flat_labels = []
for a_tuple in labels:
    flat_labels.extend(list(a_tuple))

flatten_labels = list(np.concatenate(flat_labels).flat)

unique_labels = set(flatten_labels)

unique_labels = list(unique_labels) 

features = {val:index  for index, val in enumerate(unique_labels)}

length_of_labels = len(unique_labels)

### Creating batches

In [7]:
def batches_gen(batch_size, train_data):
    batches = []
    train = shuffle(train_data)
    for graph in range(0, len(train), batch_size):
        batches.append(train[graph:graph+batch_size])
    return batches

### Creating Adjacency Matrix List

adjacency matrix converted code inspired from https://github.com/pulkit1joshi/SimGNN

In [8]:
def new_matrix(data, features):
        matrix_data = dict()

        edges_1 = data["graph_1"] + [[y, x] for x, y in data["graph_1"]]
        size = max(max(edges_1))+1
        r = [[0 for i in range(size)] for j in range(size)]
        for row,col in edges_1:
            r[row][col] = 1
        r=np.array(r)
        edges_1 = r
        edges_2 = data["graph_2"] + [[y, x] for x, y in data["graph_2"]]
        size = max(max(edges_2))+1
        r = [[0 for i in range(size)] for j in range(size)]
        for row,col in edges_2:
            r[row][col] = 1
        r=np.array(r)
        edges_2 = r

        features_1, features_2 = [], []
        for n in data["labels_1"]:
            features_1.append([1.0 if features[n] == i else 0.0 for i in features.values()])
        for n in data["labels_2"]:
            features_2.append([1.0 if features[n] == i else 0.0 for i in features.values()])
        features_1 = tf.convert_to_tensor(np.array(features_1), dtype=tf.float32)
        features_2 = tf.convert_to_tensor(np.array(features_2), dtype=tf.float32)
        matrix_data["edge_index_1"] = edges_1
        matrix_data["edge_index_2"] = edges_2
        matrix_data["features_1"] = features_1
        matrix_data["features_2"] = features_2
        norm_ged = data["ged"]/(0.5*(len(data["labels_1"])+len(data["labels_2"])))

        matrix_data["target"] = tf.reshape(tf.convert_to_tensor(np.exp(-norm_ged).reshape(1, 1)),-1)

        return matrix_data 

In [9]:
# This will be a list of the adjacency matrix for each pair of graphs. 

matrix_data = []
range_train = len(train)

for i in range(range_train):
    converted = new_matrix(train[i], features)
    matrix_data.append(converted)

In [10]:
# This will be a list of the adjacency matrix for each pair of graphs. 

matrix_data = []
range_test = len(test)

for i in range(range_test):
    converted = new_matrix(test[i], features)
    matrix_data.append(converted)

### Getting GED

In [11]:
# Get GED of train 
train_ged = [(d["ged"]) for d in train]

# Get GED of test
test_ged = [(d["ged"]) for d in test]


## GED

In [12]:
# used to gather the GED and shaped it accordingly for the model

def ged(x,k):
  if len(x.shape) == 3 and len(k.shape) == 3:
    b = x.shape[0]
    return np.matmul(np.matmul(x.reshape(b,1,-1), k), x.reshape(b,-1,1)).reshape(b)
  elif len(x.shape) == 2 and len(k.shape) == 2:
    return np.matmul(np.matmul.reshape(1,-1), x.reshape(-1,1)).reshape(1)
  else:
    raise ValueError('Input dimenstions not supported')


#SimGNN Model

## NTN Layer

In [13]:
class TenorNetworkModule(torch.nn.Module):
    """
    SimGNN Tensor Network module to calculate similarity vector.
    """
    def __init__(self, model_config):
        """
        :param args: Arguments object.
        """
        super(TenorNetworkModule, self).__init__()
        self.model_config = model_config
        self.setup_weights()
        self.init_parameters()

    def setup_weights(self):
        """
        Defining weights.
        """
        self.weight_matrix = torch.nn.Parameter(torch.Tensor(self.model_config["filters_3"],
                                                             self.model_config["filters_3"],
                                                             self.model_config["tensor_neurons"]))

        self.weight_matrix_block = torch.nn.Parameter(torch.Tensor(self.model_config["tensor_neurons"],
                                                                   2*self.model_config["filters_3"]))
        self.bias = torch.nn.Parameter(torch.Tensor(self.model_config["tensor_neurons"], 1))

    def init_parameters(self):
        """
        Initializing weights.
        """
        torch.nn.init.xavier_uniform_(self.weight_matrix)
        torch.nn.init.xavier_uniform_(self.weight_matrix_block)
        torch.nn.init.xavier_uniform_(self.bias)

    def forward(self, embedding_1, embedding_2):
        """
        Making a forward propagation pass to create a similarity vector.
        :param embedding_1: Result of the 1st embedding after attention.
        :param embedding_2: Result of the 2nd embedding after attention.
        :return scores: A similarity score vector.
        """
        scoring = torch.mm(torch.t(embedding_1), self.weight_matrix.view(self.model_config["filters_3"], -1))
        scoring = scoring.view(self.model_config["filters_3"], self.model_config["tensor_neurons"])
        scoring = torch.mm(torch.t(scoring), embedding_2)
        combined_representation = torch.cat((embedding_1, embedding_2))
        block_scoring = torch.mm(self.weight_matrix_block, combined_representation)
        scores = torch.nn.functional.relu(scoring + block_scoring + self.bias)
        return scores


## Attention

In [14]:
import torch

class AttentionModule(torch.nn.Module):
    """
    SimGNN Attention Module to make a pass on graph.
    """
    def __init__(self, model_config):
        """
        :param args: Arguments object.
        """
        super(AttentionModule, self).__init__()
        self.model_config = model_config
        self.setup_weights()
        self.init_parameters()

    def setup_weights(self):
        """
        Defining weights.
        """
        self.weight_matrix = torch.nn.Parameter(torch.Tensor(self.model_config["filters_3"],
                                                             self.model_config["filters_3"]))

    def init_parameters(self):
        """
        Initializing weights.
        """
        torch.nn.init.xavier_uniform_(self.weight_matrix)

    def forward(self, embedding):
        """
        Making a forward propagation pass to create a graph level representation.
        :param embedding: Result of the GCN.
        :return representation: A graph level representation vector.
        """
        global_context = torch.mean(torch.matmul(embedding, self.weight_matrix), dim=0)
        transformed_global = torch.tanh(global_context)
        sigmoid_scores = torch.sigmoid(torch.mm(embedding, transformed_global.view(-1, 1)))
        representation = torch.mm(torch.t(embedding), sigmoid_scores)
        return representation

## SimGNN Layer Set UP

In [15]:
import glob
import torch
import random
import numpy as np
from tqdm import tqdm, trange
from torch_geometric.nn import GCNConv
from torch_sparse import SparseTensor

class SimGNN(torch.nn.Module):
    """
    SimGNN: A Neural Network Approach to Fast Graph Similarity Computation
    https://arxiv.org/abs/1808.05689
    """
    def __init__(self, config, number_of_labels):
        """
        :param args: Arguments object.
        :param number_of_labels: Number of node labels.
        """
        super(SimGNN, self).__init__()
        self.model_config = config['model_config']
        self.number_labels = number_of_labels
        self.setup_layers()

    def calculate_bottleneck_features(self):
        """
        Deciding the shape of the bottleneck layer.
        """
        if self.model_config["histogram"] == True:
            self.feature_count = self.model_config["tensor_neurons"] + self.model_config["bins"]
        else:
            self.feature_count = self.model_config["tensor_neurons"]

    def setup_layers(self):
        """
        Creating the layers.
        """
        self.calculate_bottleneck_features()
        self.convolution_1 = GCNConv(self.number_labels, self.model_config["filters_1"])
        self.convolution_2 = GCNConv(self.model_config["filters_1"], self.model_config["filters_2"])
        self.convolution_3 = GCNConv(self.model_config["filters_2"], self.model_config["filters_3"])

        # self.convolution_4 = GCNConv(self.model_config["filters_3"], self.model_config["filters_4"])
        # self.convolution_5 = GCNConv(self.model_config["filters_4"], self.model_config["filters_5"])
        
        self.attention = AttentionModule(self.model_config)
        self.tensor_network = TenorNetworkModule(self.model_config)
        self.fully_connected_first = torch.nn.Linear(self.feature_count,
                                                     self.model_config["bottle_neck_neurons"])
        self.scoring_layer = torch.nn.Linear(self.model_config["bottle_neck_neurons"], 1)

    def calculate_histogram(self, abstract_features_1, abstract_features_2):
        """
        Calculate histogram from similarity matrix.
        :param abstract_features_1: Feature matrix for graph 1.
        :param abstract_features_2: Feature matrix for graph 2.
        :return hist: Histsogram of similarity scores.
        """
        scores = torch.mm(abstract_features_1, abstract_features_2).detach()
        scores = scores.view(-1, 1)
        hist = torch.histc(scores, bins=self.model_config["bins"])
        hist = hist/torch.sum(hist)
        hist = hist.view(1, -1)
        return hist

    def convolutional_pass(self, edge_index, features):
        """
        Making convolutional pass.
        :param edge_index: Edge indices.
        :param features: Feature matrix.
        :return features: Absstract feature matrix.
        """
        features = self.convolution_1(features, edge_index)
        features = torch.nn.functional.relu(features)
        features = torch.nn.functional.dropout(features,
                                               p=self.model_config["dropout"],
                                               training=self.training)

        features = self.convolution_2(features, edge_index)
        features = torch.nn.functional.relu(features)
        features = torch.nn.functional.dropout(features,
                                               p=self.model_config["dropout"],
                                               training=self.training)


        features = self.convolution_3(features, edge_index)

        return features

    def forward(self, data):
        """
        Forward pass with graphs.
        :param data: Data dictiyonary.
        :return score: Similarity score.
        """
        edge_index_1 = data["edge_index_1"]
        edge_index_2 = data["edge_index_2"]
        features_1 = data["features_1"]
        features_2 = data["features_2"]

        abstract_features_1 = self.convolutional_pass(edge_index_1, features_1)
        abstract_features_2 = self.convolutional_pass(edge_index_2, features_2)

        hist = self.calculate_histogram(abstract_features_1,
                                            torch.t(abstract_features_2))

        pooled_features_1 = self.attention(abstract_features_1)
        pooled_features_2 = self.attention(abstract_features_2)
        scores = self.tensor_network(pooled_features_1, pooled_features_2)
        scores = torch.t(scores)

        scores = torch.cat((scores, hist), dim=1).view(1, -1)

        scores = torch.nn.functional.relu(self.fully_connected_first(scores))
        
        # scores = torch.sigmoid(self.fully_connected_first(scores))

        # scores = torch.tanh(self.fully_connected_first(scores))
        score = torch.sigmoid(self.scoring_layer(scores))
        return score

## Train Model

In [16]:
global device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class SimGNNTrainer(object):
    """
    SimGNN model trainer.
    """
    def __init__(self, global_labels, config):
        self.global_labels = global_labels
        self.config = config
        self.setup_model()

    def setup_model(self):
        """
        Creating a SimGNN.
        """
        self.model = SimGNN(self.config, len(self.global_labels))

    def transfer_to_torch(self, data):
        new_data = dict()
        edges_1 = data["graph_1"] + [[y, x] for x, y in data["graph_1"]]

        edges_2 = data["graph_2"] + [[y, x] for x, y in data["graph_2"]]

        edges_1 = torch.from_numpy(np.array(edges_1, dtype=np.int64).T).type(torch.long)
        edges_2 = torch.from_numpy(np.array(edges_2, dtype=np.int64).T).type(torch.long)

        features_1, features_2 = [], []

        for n in data["labels_1"]:
            features_1.append([1.0 if self.global_labels[n] == i else 0.0 for i in self.global_labels.values()])

        for n in data["labels_2"]:
            features_2.append([1.0 if self.global_labels[n] == i else 0.0 for i in self.global_labels.values()])

        features_1 = torch.FloatTensor(np.array(features_1))
        features_2 = torch.FloatTensor(np.array(features_2))

        new_data["edge_index_1"] = edges_1
        new_data["edge_index_2"] = edges_2

        new_data["features_1"] = features_1
        new_data["features_2"] = features_2

        norm_ged = data["ged"]/(0.5*(len(data["labels_1"])+len(data["labels_2"])))

        new_data["target"] = torch.from_numpy(np.exp(-norm_ged).reshape(1, 1)).view(-1).float()
        return new_data

    def process_batch(self, batch):
        """
        Forward pass with a batch of data.
        :param batch: Batch of graph pair locations.
        :return loss: Loss on the batch.
        """
        self.optimizer.zero_grad()
        losses = 0
        for graph_pair in batch:
            data = graph_pair
            data = self.transfer_to_torch(data)
            target = data["target"]
            prediction = self.model(data)
            losses = losses + torch.nn.functional.mse_loss(target, prediction.view(-1))
        losses.backward(retain_graph=True)
        self.optimizer.step()
        loss = losses.item()
        return loss

    def fit(self):
        print("\nModel training.\n")

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.config['optim_config']['learning_rate'],
                                          weight_decay=self.config['optim_config']['weight_decay'])

        self.model.train()
        
        for epoch in range(1,self.config['optim_config']['epochs']+1):
            batches = self.config['data_config']['batches']
            self.loss_sum = 0
            main_index = 0
            batches = tqdm(enumerate(batches), total=len(batches), leave = True)
            for index, batch in batches:
                loss_score = self.process_batch(batch)/len(batch)
                main_index = main_index + len(batch)
                self.loss_sum = self.loss_sum + loss_score
                batches.set_description("(Batch Loss=%g)" % round(loss_score, 5))
            print("Epoch {epoch}/{len_epoches} Loss:{loss:.5f}".format(epoch=epoch,
                                                                       len_epoches=self.config['optim_config']['epochs'],
                                                                       loss=self.loss_sum/len(batches)))

    def calculate_normalized_ged(self, data):
        norm_ged = data["ged"]/(0.5*(len(data["labels_1"])+len(data["labels_2"])))
        return norm_ged

    def calculate_loss(self,prediction, target):
        score = torch.nn.functional.mse_loss(target, prediction)
        return score

    def score(self):
        print("\n\nModel evaluation.\n")
        self.model.eval()
        self.scores = []
        self.ground_truth = []
        for graph_pair in tqdm(self.config['data_config']['test_data']):
            data = graph_pair
            self.ground_truth.append(self.calculate_normalized_ged(data))
            data = self.transfer_to_torch(data)
            target = data["target"]
            with torch.no_grad():
                prediction = self.model(data)
            self.scores.append(self.calculate_loss(prediction.view(-1), target))
        self.print_evaluation()

    def print_evaluation(self):
        """
        Printing the error rates.
        """
        norm_ged_mean = np.mean(self.ground_truth)
        base_error = np.mean([(n-norm_ged_mean)**2 for n in self.ground_truth])
        model_error = np.mean(self.scores)
        print("\nBaseline error: " +str(round(base_error, 5))+".")
        print("\nModel test error: " +str(round(model_error, 5))+".")

    def save(self):
        torch.save(self.model.state_dict(), self.config['run_config']['out_dir'])

    def load(self):
        self.model.load_state_dict(torch.load(self.config['run_config']['load_dir']))

# Model Runner

## Running config

In [17]:
from collections import OrderedDict
def config_setup():
    model_config = OrderedDict([
        ('histogram', True),
        ('bins', 32),
        ('filters_1', 128),
        ('filters_2', 64),
        ('filters_3', 32),
        # ('filters_4', 32),
        # ('filters_5', 32),
        ('tensor_neurons', 16),
        ('bottle_neck_neurons', 16),
        ('dropout', 0.5)
    ])

    optim_config = OrderedDict([
        ('epochs', 5),
        ('batch_size', 256),
        ('learning_rate', 0.05),
        ('weight_decay', 0),
        ('momentum', 0.9),
        ('lr_decay', 0.1),
        ('epoch_decay_begin', 50)
    ])

    run_config = OrderedDict([
        ('seed', 1),
        ('training_root','./dataset/train/'),
        ('testing_root','./dataset/test/'),
        ('out_dir', False),
        ('load_dir', False)
    ])
    
    batches = batches_gen(optim_config['batch_size'],train)

    data_config = OrderedDict([
        ('training_data',train),
        ('test_data', test),
        ('batches', batches),

    ])

    config = OrderedDict([
        ('model_config', model_config),
        ('optim_config', optim_config),
        ('run_config', run_config),
        ('data_config', data_config)
    ])
    return config

In [18]:
def main(config):
    trainer = SimGNNTrainer(features, config)
    if config['run_config']['load_dir']:
        trainer.load()
    else:
        trainer.fit()
    trainer.score()
    if config['run_config']['out_dir']:
        trainer.save()

if __name__ == "__main__":
    config = config_setup()
    main(config)



Model training.



(Batch Loss=0.01397): 100%|██████████| 40/40 [01:09<00:00,  1.73s/it]


Epoch 1/5 Loss:0.02600


(Batch Loss=0.01397): 100%|██████████| 40/40 [01:01<00:00,  1.53s/it]


Epoch 2/5 Loss:0.02305


(Batch Loss=0.01397): 100%|██████████| 40/40 [01:00<00:00,  1.52s/it]


Epoch 3/5 Loss:0.02305


(Batch Loss=0.01397): 100%|██████████| 40/40 [01:02<00:00,  1.55s/it]


Epoch 4/5 Loss:0.02305


(Batch Loss=0.01397): 100%|██████████| 40/40 [01:00<00:00,  1.52s/it]


Epoch 5/5 Loss:0.02305


Model evaluation.



100%|██████████| 1000/1000 [00:04<00:00, 236.27it/s]


Baseline error: 0.54748.

Model test error: 0.02455.



