# Classification Example: synthetic data

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy as sc


In [2]:
# This cell generates a synthetic graph dataset with node features.
#
# Two classes of graphs are produced, num_g // 2 of each:
#   label 0 - Erdos-Renyi style random graphs (nx.fast_gnp_random_graph)
#   label 1 - powerlaw cluster graphs (nx.powerlaw_cluster_graph)
#
# The results are stored in three parallel lists: graphs, node_features, labels.

# fixed seed so that "Restart Kernel -> Run All" regenerates the same dataset;
# a local RandomState is used so the global numpy random state is left untouched
random_seed = 0
rng = np.random.RandomState(random_seed)

# limits on number of nodes (upper limit is exclusive, as in randint)
n_min = 20
n_max = 50

# number of graphs (half of them per class)
num_g = 100

# number of node features - in this example the node features are random
# and therefore not useful for classification
n_nf = 3

# limits on the edge probability (random graphs) / triangle probability
# (powerlaw cluster graphs); sampled in steps of 0.01, upper limit exclusive
p_min = 0.1
p_max = 0.5

# limits on number of edges to add per node (upper limit is exclusive)
m_min = 1
m_max = 5

# empty lists of graphs (adjacency matrices), labels and node feature matrices
graphs = []
labels = []
node_features = []


def _draw_size_and_probability():
    """Draw a node count in [n_min, n_max) and a probability in [p_min, p_max)."""
    n_nodes = rng.randint(n_min, n_max)
    prob = rng.randint(int(p_min * 100), int(p_max * 100)) / 100
    return n_nodes, prob


def _draw_graph_seed():
    """Draw an integer seed for a networkx generator from the notebook's rng."""
    # networkx generators use their own random source, so they need an explicit
    # seed to be reproducible; a plain int works for the `seed` argument
    return int(rng.randint(0, 2**31 - 1))


def _store_graph(g, n_nodes, label):
    """Append the adjacency matrix of g, random node features and its label."""
    graphs.append(nx.to_numpy_array(g))
    node_features.append(rng.random_sample((n_nodes, n_nf)))
    labels.append(label)


# adding num_g // 2 random graphs (label 0)
for _ in range(num_g // 2):
    n_nodes, prob = _draw_size_and_probability()
    g = nx.fast_gnp_random_graph(n_nodes, prob, seed=_draw_graph_seed())
    _store_graph(g, n_nodes, 0)

# adding num_g // 2 powerlaw cluster graphs (label 1)
for _ in range(num_g // 2):
    n_nodes, prob = _draw_size_and_probability()
    n_edges = rng.randint(m_min, m_max)
    g = nx.powerlaw_cluster_graph(n_nodes, n_edges, prob, seed=_draw_graph_seed())
    _store_graph(g, n_nodes, 1)

# Load graphs into graph object

We now have three lists of length 100. The `graphs` list contains numpy arrays, each representing the adjacency matrix of one graph. The `node_features` list contains numpy arrays holding the node features of each graph (one row per node). The `labels` list contains the integer class label of each graph.

The next step is to take this data and convert it into an appropriate format for hcga.

In [3]:
# wrap the raw python lists in the container type that hcga works with

from hcga.graph import Graph, GraphCollection

# start from an empty collection ...
g_c = GraphCollection()

# ... then register every adjacency matrix with its node features and label
g_c.add_graph_list(
    graphs,
    node_features,
    labels,
)

In [10]:
# quick sanity checks on the collection that was just built

n_graphs_loaded = len(g_c.graphs)
print('There are {} graphs'.format(n_graphs_loaded))

n_features_per_node = g_c.get_n_node_features()
print('There are {} features per node'.format(n_features_per_node))


There are 100 graphs
There are 3 features per node


In [17]:
# optionally write the collection to disk, so that the remaining steps
# could also be run from the command line
from hcga.io import save_dataset

save_dataset(
    g_c,
    'custom_dataset_classification',
    folder='./datasets/custom_dataset_classification',
)


# Extracting features

We have now saved our custom data as a pickle dataset. We can run the feature extraction from the command line using the following command:

hcga extract_features ./datasets/custom_dataset_classification/custom_dataset_classification.pkl -m fast -n 4 -sl advanced --timeout 10


Alternatively, we could import the Hcga class and run the feature extraction and analysis from within the notebook. We will do this below.

In [18]:
# bring in the Hcga class
from hcga.hcga import Hcga

# a single Hcga instance is used for loading, extraction and analysis
h = Hcga()

# read back the dataset that was saved to disk earlier in this notebook
h.load_data(
    './datasets/custom_dataset_classification/custom_dataset_classification.pkl'
)

In [19]:
# run the feature extraction ('fast' mode, 4 workers, timeout=20)
h.extract(
    mode='fast',
    n_workers=4,
    timeout=20,
)

# persist the extracted features as a pickle file
h.save_features(
    './results/custom_dataset_classification/all_features.pkl'
)


INFO:hcga.extraction:Extracting features from 100 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 100 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   11.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   13.6s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   17.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   24.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   28.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   33.3s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   37.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   41.3s finished
INFO:hcga.extraction:1093 feature extracted.


# Analysis - classification example

In [20]:
# read the previously saved feature file back into the Hcga object

h.load_features(
    './results/custom_dataset_classification/all_features.pkl'
)

In [21]:
# run the classification analysis on the extracted features;
# results are written to the given results folder

h.analyse_features(
    feature_file='./results/custom_dataset_classification/all_features.pkl',
    results_folder='./results/custom_dataset_classification',
)


INFO:hcga.analysis:... Using Xgboost classifier ...
INFO:hcga.analysis:1093 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:1018 valid features
INFO:hcga.analysis:1018 with interpretability 1
INFO:hcga.analysis:Counts of graphs/label: 
0.0    50
1.0    50
Name: label, dtype: int64
INFO:hcga.analysis:Using 10 splits
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Accuracy: 0.96 +/- 0.049
INFO:hcga.analysis:Now using a reduced set of 100 features with < 0.9 correlation.
INFO:hcga.analysis:Fold ac