# Example 5: Exploring the similarity between classes

Here, we provide an example in which we construct three classes of graphs. We then compare each class pairwise to identify which classes are similar to each other. 


In [None]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Make sure the folders this notebook writes into exist.
# Path.mkdir(exist_ok=True) does nothing when the directory is already there,
# which avoids the check-then-create race of calling exists() before
# os.mkdir(). It also fails immediately if the path exists but is not a
# directory, instead of deferring the error to the first write.
for folder_name in ("datasets", "results"):
    Path(folder_name).mkdir(exist_ok=True)

%matplotlib inline

## Parameters for constructing synthetic data

In [None]:
# Bounds used when drawing the number of nodes of each graph
n_min, n_max = 20, 50

# How many graphs are generated for each class
num_g = 50

# Number of features per node; in this example the node features are random
# values, so they are not useful for classification
n_nf = 3

# Containers for the dataset: graphs, their class labels and their
# node-feature matrices (all start out empty)
graphs, labels, node_features = [], [], []

# Bounds used when drawing the probability parameter of the graph generators
p_min, p_max = 0.1, 0.5

# Bounds used when drawing the number of edges to add per node
m_min, m_max = 3, 6

## Create data for class 0

In [None]:
# Class 0: num_g G(n, p) random graphs, each with its own randomly drawn
# size and edge probability
for sample_idx in range(num_g):
    # np.random.randint excludes its upper bound
    n_nodes = np.random.randint(n_min, n_max)
    # draw an integer percentage, then scale it down to a probability
    edge_prob = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100

    random_graph = nx.fast_gnp_random_graph(n_nodes, edge_prob)
    graphs.append(random_graph)

    # one row of n_nf uniform random features per node
    node_features.append(np.random.random((n_nodes, n_nf)))

    labels.append(0)

## Create data for class 1

In [None]:
# Class 1: num_g powerlaw cluster graphs, each with its own randomly drawn
# size, edges-per-node count and probability parameter
for sample_idx in range(num_g):
    # np.random.randint excludes its upper bound
    n_nodes = np.random.randint(n_min, n_max)
    # draw an integer percentage, then scale it down to a probability
    cluster_prob = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    edges_per_node = np.random.randint(m_min, m_max)

    cluster_graph = nx.powerlaw_cluster_graph(n_nodes, edges_per_node, cluster_prob)
    graphs.append(cluster_graph)

    # one row of n_nf uniform random features per node
    node_features.append(np.random.random((n_nodes, n_nf)))

    labels.append(1)

## Create data for class 2

In [None]:
# Class 2: num_g Watts-Strogatz graphs, each with its own randomly drawn
# size, neighbour count and probability parameter
for sample_idx in range(num_g):
    # np.random.randint excludes its upper bound
    n_nodes = np.random.randint(n_min, n_max)
    # draw an integer percentage, then scale it down to a probability
    rewire_prob = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    ring_neighbours = np.random.randint(m_min, m_max)

    small_world_graph = nx.watts_strogatz_graph(n_nodes, ring_neighbours, rewire_prob)
    graphs.append(small_world_graph)

    # one row of n_nf uniform random features per node
    node_features.append(np.random.random((n_nodes, n_nf)))

    labels.append(2)

# Load graphs into graph object

We now have three lists of length 150 (50 graphs for each of the three classes). The graphs list is composed of networkx graph objects. The node features list is composed of numpy arrays that contain the node information for each graph. The labels list is a list of integers that corresponds to the class label for each graph.

The next step is to take this data and convert it into an appropriate format for hcga.

In [None]:
# Convert the generated data into the container format used by hcga.

from hcga.graph import Graph, GraphCollection

# Create an empty graph collection object
g_c = GraphCollection()

# Add the graphs together with their node features and labels. The three
# lists were filled in lockstep, so entry i of each list describes the same
# graph (presumably add_graph_list matches them by position - confirm in hcga).
g_c.add_graph_list(graphs, node_features, labels)

In [None]:
# Quick sanity checks on the collection: report how many graphs it holds
# and how many features each node carries
graph_count = len(g_c.graphs)
print("There are {} graphs".format(graph_count))

feature_count = g_c.get_n_node_features()
print("There are {} features per node".format(feature_count))

In [None]:
# Optionally save the graph collection to disk, so that the remaining steps
# can also be run from the command line.
# The dataset name and folder given here correspond to the .pkl path that is
# loaded again further down in this notebook.
from hcga.io import save_dataset

save_dataset(
    g_c,
    "custom_dataset_multilabel_similarity",
    folder="./datasets/custom_multilabel_similarity",
)

# Extracting features


In [None]:
# Import the main hcga class
from hcga.hcga import Hcga

# Create the hcga object; the following cells use it to load the dataset,
# extract features and run the analyses
h = Hcga()

In [None]:
# Load the previously saved dataset; the path is the folder and dataset name
# used in the save_dataset call, with a .pkl extension
h.load_data(
    "./datasets/custom_multilabel_similarity/custom_dataset_multilabel_similarity.pkl"
)

In [None]:
# Extract features from the loaded graphs using 4 workers.
# NOTE(review): mode="fast" presumably restricts extraction to a faster subset
# of features, and timeout is presumably a per-feature limit in seconds -
# confirm both against the hcga documentation.
h.extract(mode="fast", n_workers=4, timeout=5)

# Save the extracted features into a pickle so they can be reloaded later
h.save_features("./results/custom_multilabel_similarity/all_features.pkl")

# Analysis

In [None]:
# Load the features that were saved by the extraction step

h.load_features("./results/custom_multilabel_similarity/all_features.pkl")

In [None]:
# Run a classification analysis of the extracted features.
# feature_file points at the saved feature pickle; results_folder is
# presumably where the analysis outputs are written (confirm in hcga).

h.analyse_features(
    feature_file="./results/custom_multilabel_similarity/all_features.pkl",
    results_folder="./results/custom_multilabel_similarity",
)

# Pairwise classification

In [None]:
# Run the pairwise classification on the saved features. Two values come
# back: accuracy_matrix, which is plotted as a heatmap next, and
# top_features, which is indexed by a pair of class labels further down.
accuracy_matrix, top_features = h.pairwise_classification(
    feature_file="./results/custom_multilabel_similarity/all_features.pkl"
)

In [None]:
# Plot the pairwise accuracy matrix as a heatmap and display the figure
sns.heatmap(accuracy_matrix)
plt.show()

In [None]:
# What are the top features for classifying between class 0 and class 1?
# top_features is indexed by a pair of class labels.
print(top_features[(0.0, 1.0)])