In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

if not Path("datasets").exists():
    os.mkdir("datasets")
if not Path("results").exists():
    os.mkdir("results")

%matplotlib inline

In [2]:
# defining limits on number of nodes
n_min = 100
n_max = 250

# number of graphs
num_g = 100

# number of node features
n_nf = 3

# empty list of graphs and labels
graphs = []
labels = []
node_features = []

# setting limits on probability of edge existing for random graphs
p_min = 0.1
p_max = 0.5

# setting limits on number of edges to add per node
m_min = 5
m_max = 10

In [3]:
# adding 100 random graphs (label 0)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100

    g = nx.fast_gnp_random_graph(rand_n, rand_p)
    node_feat_matrix = np.random.random((rand_n, n_nf))

    graphs.append(g)
    node_features.append(node_feat_matrix)

    labels.append(0)

In [4]:
# adding 100 powerlaw cluster graphs (label 1)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p)
    node_feat_matrix = np.random.random((rand_n, n_nf))

    graphs.append(g)
    node_features.append(node_feat_matrix)

    labels.append(1)

In [5]:
# adding 100 watts strogatz graphs (label 2)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.watts_strogatz_graph(rand_n, rand_m, rand_p)
    node_feat_matrix = np.random.random((rand_n, n_nf))

    graphs.append(g)
    node_features.append(node_feat_matrix)

    labels.append(2)

In [6]:
# converting this data into the format required for hcga

from hcga.graph import Graph, GraphCollection

# create graph collection object
g_c = GraphCollection()

# add graphs, node features and labels to the object
g_c.add_graph_list(graphs, node_features, labels)

In [7]:
# perform some sanity checks

print("There are {} graphs".format(len(g_c.graphs)))
print("There are {} features per node".format(g_c.get_n_node_features()))

There are 300 graphs
There are 3 features per node


In [8]:
# we can save this if we want to and run everything from the command line
from hcga.io import save_dataset

save_dataset(
    g_c,
    "labelled_graph_dataset",
    folder="./datasets/labelled_graph",
)

In [9]:
# import hcga object
from hcga.hcga import Hcga

# define an object
h = Hcga()

In [10]:
# load previously saved dataset
h.load_data(
    "./datasets/labelled_graph/labelled_graph_dataset.pkl"
)

In [12]:
# extracting all features here
h.extract(mode="fast", n_workers=4, timeout=5)

# saving all features into a pickle
h.save_features("./results/labelled_graph/features.pkl")

INFO:hcga.extraction:Setting up feature classes...
 88%|████████▊ | 38/43 [00:05<00:00,  7.59it/s]


IndexError: tuple index out of range

In [None]:
# load the saved features

h.load_features("./results/labelled_graph/features.pkl")

In [None]:
# implement a classification analyse of the features

h.analyse_features(
    feature_file="./results/labelled_graph/features.pkl",
    results_folder="./results/labelled_graph",
)

In [None]:
accuracy_matrix, top_features = h.pairwise_classification(
    feature_file="./results/labelled_graph/featuress.pkl"
)

In [None]:
sns.heatmap(accuracy_matrix)
plt.show()

In [None]:
# what are the top features for classifying between class 0 and class 1?
print(top_features[(0.0, 1.0)])

In [13]:
# defining limits on number of nodes
n_min = 100
n_max = 250

# number of graphs
num_g = 20

# number of node features - in this example I will generate random node features that aren't useful for classifcation
n_nf = 3


# empty list of graphs and labels
graphs = []
node_features = []


# setting limits on probability of edge existing for random graphs
p_min = 0.1
p_max = 0.5

# adding 20 random graphs (label 0)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100

    g = nx.fast_gnp_random_graph(rand_n, rand_p)
    g.label = 0

    node_feat_matrix = np.random.random((rand_n, n_nf))

    for i, node in enumerate(g.nodes):
        g.nodes[node]["features"] = node_feat_matrix[i, :]

    # graphs.append(g)

    graphs.append(nx.to_numpy_array(g) * 2)

    node_features.append(node_feat_matrix)


# setting limits on number of edges to add per node
m_min = 5
m_max = 10

# adding 20 powerlaw cluster graphs (label 1)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p)
    g.label = 1

    node_feat_matrix = np.random.random((rand_n, n_nf))

    for i, node in enumerate(g.nodes):
        g.nodes[node]["features"] = node_feat_matrix[i, :]

    # graphs.append(g)

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)


# adding 20 watts strogatz graphs (label 2)
for i in range(num_g):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.watts_strogatz_graph(rand_n, rand_m, rand_p)
    g.label = 2

    node_feat_matrix = np.random.random((rand_n, n_nf))

    for i, node in enumerate(g.nodes):
        g.nodes[node]["features"] = node_feat_matrix[i, :]

    # graphs.append(g)

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)

In [14]:
# create graph collection object
graphs_unlabelled = GraphCollection()
graphs_unlabelled.add_graph_list(graphs, node_features)  # loaded without the labels

# save the unlabelled dataset
save_dataset(
    graphs_unlabelled, "unlabelled_graph", folder="./datasets"
)

# perform some sanity checks
print(
    "There are {} graphs in the unlabelled dataset".format(
        len(graphs_unlabelled.graphs)
    )
)
print("There are {} features per node".format(graphs_unlabelled.get_n_node_features()))

There are 60 graphs in the unlabelled dataset
There are 3 features per node


In [15]:
# extract features for the secondary dataset with no labels
h.load_data(
    "./datasets/unlabelled_graph_dataset.pkl"
)  # set prediction graphs to True
h.extract(mode="fast", n_workers=4, timeout=20)  # set prediction set to True

FileNotFoundError: [Errno 2] No such file or directory: './datasets/unlabelled_graph_dataset.pkl'

In [None]:
h.analyse_features(
    plot=False,
    trained_model="./results/test/fitted_model",
    results_folder="./results/test",
)

In [None]:
predictions = pd.read_csv("./results/test/prediction_results.csv", index_col=0)
print(predictions)

In [None]:
# first we use the default
model = "XG"
h.analyse_features(
    model=model,
    plot=False,
    feature_file="./results/labelled_graph_dataset/features.pkl",
    results_folder="./results/labelled_graph_dataset",
)

In [None]:
model = "RF"
h.analyse_features(
    model=model,
    plot=False,
    feature_file="./results/labelled_graph_dataset/features.pkl",
    results_folder="./results/labelled_graph_dataset",
)

In [None]:
from sklearn.svm import SVC

model = SVC(
    probability=True
)  # it is necessary to use probability=True to compute SHAP values

In [None]:
# we can compute with shap values
h.analyse_features(
    compute_shap=False,
    model=model,
    plot=False,
    feature_file="./results/labelled_graph_dataset/features.pkl",
    results_folder="./results/labelled_graph_dataset",
)

In [None]:
# or with shap values:
# WARNING the Kernel Explainer (for general models) is slow and requires a lot of memory
h.analyse_features(
    compute_shap=True,
    kfold=False,
    model=model,
    plot=False,
    feature_file="./results/labelled_graph_dataset/features.pkl",
    results_folder="./results/labelled_graph_dataset",
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
h.analyse_features(
    compute_shap=False,
    model=model,
    plot=False,
    feature_file="./results/labelled_graph_dataset/features.pkl",
    results_folder="./results/labelled_graph_dataset",
)