In [1]:
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy as sc
import seaborn as sns

from hcga.graph import Graph, GraphCollection
from hcga.io import save_dataset

# Create the top-level output folders used by this notebook.
# `exist_ok=True` makes the cell safe to re-run and removes the gap between
# a separate `exists()` check and the directory creation.
for output_dir in ("datasets", "results"):
    Path(output_dir).mkdir(exist_ok=True)

%matplotlib inline

# Generate labelled random graphs

In [2]:
# number of nodes in every generated graph
n = 100

# number of graphs generated per graph family
num_g = 50

# number of (random) features attached to each node
n_nf = 3

# empty lists of graphs, graph labels and node-feature matrices;
# the generation cells below append one entry per graph to each of them
graphs = []
labels = []
node_features = []

# probability parameter passed to all three generators below: edge probability
# for the random graphs, triangle probability for the powerlaw cluster graphs
# and rewiring probability for the watts strogatz graphs
p = 0.6

# integer parameter passed to the last two generators: number of edges added
# per new node (powerlaw cluster) and number of ring neighbours (watts strogatz)
m = 4

# Fix the random state so that re-running the notebook regenerates the same
# dataset. Both generators are seeded because the networkx graph generators
# draw from Python's global `random` module when no seed is given, while the
# node features are drawn with `np.random`.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Erdos-Renyi random graphs: num_g of them, all given label 0
for _ in range(num_g):
    # G(n, p) graph: n nodes, every possible edge present with probability p
    graphs.append(nx.fast_gnp_random_graph(n, p))
    # random feature matrix with one row per node and n_nf columns
    node_features.append(np.random.random((n, n_nf)))
    labels.append(0)

In [4]:
# powerlaw cluster graphs: num_g of them, all given label 1
for _ in range(num_g):
    # m random edges are added per new node; p is used here as the
    # probability of closing a triangle after each added edge
    graphs.append(nx.powerlaw_cluster_graph(n, m, p))
    # random feature matrix with one row per node and n_nf columns
    node_features.append(np.random.random((n, n_nf)))
    labels.append(1)

In [5]:
# watts strogatz graphs: num_g of them, all given label 2
for _ in range(num_g):
    # ring lattice in which every node is joined to its m nearest neighbours;
    # p is used here as the probability of rewiring each edge
    graphs.append(nx.watts_strogatz_graph(n, m, p))
    # random feature matrix with one row per node and n_nf columns
    node_features.append(np.random.random((n, n_nf)))
    labels.append(2)

In [6]:
# create an empty hcga graph collection object
g_c = GraphCollection()

# add graphs, node features and labels to the object; the three lists are
# index-aligned (entry k of each list describes the same graph) and hold
# 3 * num_g entries when every generation cell above was run exactly once
g_c.add_graph_list(graphs, node_features, labels)

In [7]:
# quick sanity report on the assembled collection

graph_total = len(g_c.graphs)
print("There are {} graphs".format(graph_total))

node_feature_total = g_c.get_n_node_features()
print("There are {} features per node".format(node_feature_total))

There are 150 graphs
There are 3 features per node


In [8]:
# we can save this if we want to and run everything from the command line;
# the pickle written here (./datasets/labelled_graph/labelled_graph_dataset.pkl)
# is the file that the feature-extraction section loads again

save_dataset(
    g_c,
    "labelled_graph_dataset",
    folder="./datasets/labelled_graph",
)

## Extracting features and generating the feature matrix

In [9]:
# import hcga object
from hcga.hcga import Hcga

# define an object; `h` is reused by the loading, extraction and analysis
# cells of this section
h = Hcga()

In [10]:
# load previously saved dataset (the pickle written by save_dataset above)
h.load_data(
    "./datasets/labelled_graph/labelled_graph_dataset.pkl"
)

In [11]:
# extracting all features here, in "fast" mode with n_workers=4; this is the
# slow step of the notebook (about 14 minutes for the 150 graphs in the
# recorded run)
# NOTE(review): timeout is presumably a per-feature time limit - verify
# against the hcga documentation. It is 5 here but 20 for the unlabelled
# dataset later in this notebook; confirm the difference is intended.
h.extract(mode="fast", n_workers=4, timeout=5)

# saving all features into a pickle; the analysis cells below read this file
h.save_features("./results/labelled_graph/features.pkl")

INFO:hcga.extraction:Setting up feature classes...
100%|██████████| 43/43 [00:04<00:00,  9.91it/s]
INFO:hcga.extraction:Extracting features from 150 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 150 graphs:
100%|██████████| 150/150 [14:15<00:00,  5.70s/it]
INFO:hcga.extraction:1264 feature extracted.


In [12]:
# display the extracted feature table: one row per graph, columns indexed by
# (feature_class, feature_name), with the graph label as the last column
h.features

feature_class,Cns,Cns,Cns,EF,EF,EF,EF,EF,EF,RC,...,DM,DM,DM,DM,DM,DM,DM,DM,DM,label
feature_name,connectance,connectance_N,connectance_E,local_efficiency,global_efficiency,local_efficiency_N,local_efficiency_E,global_efficiency_N,global_efficiency_E,rich_club_k_1,...,eccentricity_min_E,eccentricity_sum_N,eccentricity_sum_E,eccentricity_max_N,eccentricity_max_E,eccentricity_median_N,eccentricity_median_E,eccentricity_std_N,eccentricity_std_E,Unnamed: 21_level_1
0,0.599192,0.005992,0.000202,0.799500,0.799596,0.007995,0.000270,0.007996,0.000270,0.599192,...,0.000674,2.00,0.067431,0.02,0.000674,0.02,0.000674,0.000000,0.000000,0.0
1,0.620808,0.006208,0.000202,0.810647,0.810404,0.008106,0.000264,0.008104,0.000264,0.620808,...,0.000651,2.00,0.065083,0.02,0.000651,0.02,0.000651,0.000000,0.000000,0.0
2,0.590707,0.005907,0.000202,0.795415,0.795354,0.007954,0.000272,0.007954,0.000272,0.590707,...,0.000684,2.00,0.068399,0.02,0.000684,0.02,0.000684,0.000000,0.000000,0.0
3,0.604848,0.006048,0.000202,0.803169,0.802424,0.008032,0.000268,0.008024,0.000268,0.604848,...,0.000668,2.00,0.066800,0.02,0.000668,0.02,0.000668,0.000000,0.000000,0.0
4,0.593737,0.005937,0.000202,0.797144,0.796869,0.007971,0.000271,0.007969,0.000271,0.593737,...,0.000681,2.00,0.068050,0.02,0.000681,0.02,0.000681,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.040404,0.000404,0.000202,0.050262,0.330764,0.000503,0.000251,0.003308,0.001654,0.040404,...,0.020000,5.29,2.645000,0.06,0.030000,0.05,0.025000,0.004959,0.002479,2.0
146,0.040404,0.000404,0.000202,0.037000,0.330444,0.000370,0.000185,0.003304,0.001652,0.040404,...,0.025000,5.37,2.685000,0.06,0.030000,0.05,0.025000,0.004828,0.002414,2.0
147,0.040404,0.000404,0.000202,0.093702,0.326088,0.000937,0.000469,0.003261,0.001630,0.040404,...,0.020000,5.44,2.720000,0.06,0.030000,0.05,0.025000,0.005352,0.002676,2.0
148,0.040404,0.000404,0.000202,0.098778,0.326288,0.000988,0.000494,0.003263,0.001631,0.040404,...,0.025000,5.65,2.825000,0.07,0.035000,0.06,0.030000,0.006062,0.003031,2.0


## Classification and analysis

In [13]:
# load the saved features
# (the pickle that save_features wrote after the extraction step)

h.load_features("./results/labelled_graph/features.pkl")

In [27]:
# run a classification analysis of the extracted features with a
# k-nearest-neighbours classifier

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

# save_model=True stores the fitted classifier in the results folder; the
# prediction step on the unlabelled graphs later loads it from
# ./results/labelled_graph/fitted_model.
# Only the features of this notebook's own dataset are analysed: a second call
# that pointed at ./results/custom_multilabel_similarity/all_features.pkl was
# removed, because no cell of this notebook creates that file.
h.analyse_features(
    compute_shap=False,
    model=model,
    feature_file="./results/labelled_graph/features.pkl",
    results_folder="./results/labelled_graph",
    save_model=True,
    plot=False,
)

INFO:hcga.analysis:1264 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:957 valid features
INFO:hcga.analysis:957 with interpretability 1
INFO:hcga.analysis:Counts of graphs/label: 
label
0.0    50
1.0    50
2.0    50
Name: count, dtype: int64
INFO:hcga.analysis:Using 10 splits
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Accuracy: 1.0 +/- 0.0
INFO:hcga.analysis:Fitting model to all data


In [22]:
# classify every pair of classes against each other; returns the accuracies
# (plotted as a heatmap in the next cell) and the top features per pair,
# keyed by the pair of class labels, e.g. (0.0, 1.0)
accuracy_matrix, top_features = h.pairwise_classification(
    feature_file="./results/labelled_graph/features.pkl"
)

INFO:hcga.analysis:1264 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:957 valid features
INFO:hcga.analysis:957 with interpretability 1
INFO:hcga.analysis:... Using Xgboost classifier ...
  0%|          | 0/3 [00:00<?, ?it/s]INFO:hcga.analysis:Pairwise classification between classes 0.0 and 1.0
INFO:hcga.analysis:Counts of graphs/label: 
label
0    50
1    50
Name: count, dtype: int64
INFO:hcga.analysis:Using 10 splits
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 0.8 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Accuracy: 0.98 +/- 0.06
 3

In [23]:
# Heatmap of the accuracies returned by the pairwise classification above.
# An explicit figure/axes pair is used and the plot is given a title so that
# it can be understood on its own when the notebook is skimmed.
fig, ax = plt.subplots()
sns.heatmap(accuracy_matrix, ax=ax)
ax.set_title("Pairwise classification accuracy")
plt.show()

In [24]:
# what are the top features for classifying between class 0 and class 1?
# top_features is indexed by the pair of class labels, given as floats
print(top_features[(0.0, 1.0)])

['CM_communities_weighted_coverage_N', 'CM_communities_weighted_coverage_E', 'CM_communities_weighted_performance_N', 'CM_communities_performance_N', 'Cns_connectance']


# Generate unlabelled random graphs
Restart the kernel here (and re-run the import cell) to ensure that the feature-extraction step runs successfully.

In [2]:
# number of nodes in every generated graph
n = 100

# number of graphs generated per graph family
num_g = 50

# number of node features - in this example I will generate random node features that aren't useful for classification
n_nf = 3

# probability parameter passed to all three generators: edge probability for
# the random graphs, triangle probability for the powerlaw cluster graphs and
# rewiring probability for the watts strogatz graphs
p = 0.6

# integer parameter passed to the last two generators: number of edges added
# per new node (powerlaw cluster) and number of ring neighbours (watts strogatz)
m = 4

# Fix the random state so that re-running this cell regenerates the same
# dataset. Both generators are seeded because the networkx graph generators
# draw from Python's global `random` module when no seed is given, while the
# node features are drawn with `np.random`.
# NOTE: keep this value different from any seed used to generate training
# graphs, otherwise the prediction set would reproduce those graphs exactly.
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)

# one generator per graph family: random, powerlaw cluster and watts strogatz
# graphs, num_g graphs of each, generated in this order
graph_generators = (
    lambda: nx.fast_gnp_random_graph(n, p),
    lambda: nx.powerlaw_cluster_graph(n, m, p),
    lambda: nx.watts_strogatz_graph(n, m, p),
)

# empty lists of graphs and node features (no labels: the dataset is unlabelled)
graphs = []
node_features = []

for generate_graph in graph_generators:
    for _ in range(num_g):
        g = generate_graph()
        node_feat_matrix = np.random.random((n, n_nf))

        # Each graph is stored as its adjacency matrix scaled by 2
        # (presumably to exercise weighted-matrix input - TODO confirm).
        # Only this array and the feature matrix reach the dataset, so no
        # label or node attributes are set on the temporary networkx graph.
        graphs.append(nx.to_numpy_array(g) * 2)
        node_features.append(node_feat_matrix)

In [3]:
# assemble the collection from the adjacency matrices and node features only
# (no label list is passed, so the dataset stays unlabelled)
graphs_unlabelled = GraphCollection()
graphs_unlabelled.add_graph_list(graphs, node_features)

# write the unlabelled dataset to disk
save_dataset(
    graphs_unlabelled,
    "unlabelled_graph_dataset",
    folder="./datasets/unlabelled_graph",
)

# quick sanity report on the assembled collection
unlabelled_total = len(graphs_unlabelled.graphs)
print("There are {} graphs in the unlabelled dataset".format(unlabelled_total))

unlabelled_feature_total = graphs_unlabelled.get_n_node_features()
print("There are {} features per node".format(unlabelled_feature_total))

There are 150 graphs in the unlabelled dataset
There are 3 features per node


## Extracting features and generating the feature matrix

In [4]:
# import hcga object
from hcga.hcga import Hcga

# define an object; `h` is reused by the loading, extraction and prediction
# cells that follow
h = Hcga()

In [5]:
# extract features for the secondary dataset with no labels
h.load_data(
    "./datasets/unlabelled_graph/unlabelled_graph_dataset.pkl"
)  # the pickle written by save_dataset in the previous section
# same mode and number of workers as for the labelled dataset
# NOTE(review): timeout=20 here but timeout=5 for the labelled dataset -
# confirm the difference is intended
h.extract(mode="fast", n_workers=4, timeout=20)

INFO:hcga.extraction:Setting up feature classes...
100%|██████████| 43/43 [00:04<00:00,  9.96it/s]
INFO:hcga.extraction:Extracting features from 150 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 150 graphs:
100%|██████████| 150/150 [14:53<00:00,  5.96s/it]
INFO:hcga.extraction:1264 feature extracted.


In [6]:
# display the extracted feature table: one row per graph, columns indexed by
# (feature_class, feature_name); the final label column is empty because this
# dataset was built without labels
h.features

feature_class,Cns,Cns,Cns,EF,EF,EF,EF,EF,EF,RC,...,DM,DM,DM,DM,DM,DM,DM,DM,DM,label
feature_name,connectance,connectance_N,connectance_E,local_efficiency,global_efficiency,local_efficiency_N,local_efficiency_E,global_efficiency_N,global_efficiency_E,rich_club_k_1,...,eccentricity_min_E,eccentricity_sum_N,eccentricity_sum_E,eccentricity_max_N,eccentricity_max_E,eccentricity_median_N,eccentricity_median_E,eccentricity_std_N,eccentricity_std_E,Unnamed: 21_level_1
0,0.598182,0.005982,0.000202,0.799492,0.799091,0.007995,0.000270,0.007991,0.000270,0.598182,...,0.000675,2.00,0.067545,0.02,0.000675,0.02,0.000675,0.000000,0.000000,
1,0.596162,0.005962,0.000202,0.798198,0.798081,0.007982,0.000270,0.007981,0.000270,0.596162,...,0.000678,2.00,0.067774,0.02,0.000678,0.02,0.000678,0.000000,0.000000,
2,0.599798,0.005998,0.000202,0.799020,0.799899,0.007990,0.000269,0.007999,0.000269,0.599798,...,0.000674,2.00,0.067363,0.02,0.000674,0.02,0.000674,0.000000,0.000000,
3,0.604242,0.006042,0.000202,0.802186,0.802121,0.008022,0.000268,0.008021,0.000268,0.604242,...,0.000669,2.00,0.066867,0.02,0.000669,0.02,0.000669,0.000000,0.000000,
4,0.609293,0.006093,0.000202,0.805074,0.804646,0.008051,0.000267,0.008046,0.000267,0.609293,...,0.000663,2.00,0.066313,0.02,0.000663,0.02,0.000663,0.000000,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.040404,0.000404,0.000202,0.055643,0.325785,0.000556,0.000278,0.003258,0.001629,0.040404,...,0.025000,5.34,2.670000,0.06,0.030000,0.05,0.025000,0.004737,0.002369,
146,0.040404,0.000404,0.000202,0.031476,0.328258,0.000315,0.000157,0.003283,0.001641,0.040404,...,0.025000,5.48,2.740000,0.07,0.035000,0.05,0.025000,0.005381,0.002691,
147,0.040404,0.000404,0.000202,0.055119,0.332232,0.000551,0.000276,0.003322,0.001661,0.040404,...,0.020000,5.25,2.625000,0.06,0.030000,0.05,0.025000,0.004555,0.002278,
148,0.040404,0.000404,0.000202,0.056111,0.328414,0.000561,0.000281,0.003284,0.001642,0.040404,...,0.020000,5.39,2.695000,0.06,0.030000,0.05,0.025000,0.005272,0.002636,


In [9]:
# run the analysis on the unlabelled features with the model stored under
# ./results/labelled_graph/fitted_model; the next cell reads the resulting
# predictions from ./results/test/prediction_results.csv
# NOTE(review): save_model=True is passed even though a trained model is
# supplied - confirm whether it is needed here
# NOTE(review): the recorded log reports 956 valid features here versus 957
# for the labelled dataset - confirm how hcga aligns the two feature sets
h.analyse_features(
    plot=False,
    save_model=True,
    trained_model="./results/labelled_graph/fitted_model",
    results_folder="./results/test",
)

INFO:hcga.analysis:1264 total features


INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:956 valid features
INFO:hcga.analysis:956 with interpretability 1


In [10]:
# read the predictions written by the analysis step above, using the first
# CSV column as the index; the recorded output has one `y_prediction` row per graph
predictions = pd.read_csv("./results/test/prediction_results.csv", index_col=0)
print(predictions)

     y_prediction
0             0.0
1             0.0
2             0.0
3             0.0
4             0.0
..            ...
145           2.0
146           2.0
147           2.0
148           2.0
149           2.0

[150 rows x 1 columns]
