# Example 2: Regression with Synthetic data


In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import scipy as sc

In [2]:
# This cell generates a synthetic graph dataset with random node features.
# Every graph is a G(n, p) random graph and its regression target (label)
# is the edge probability p that was used to generate it.


# seed for every random draw in this cell, so that re-running the notebook
# from a fresh kernel reproduces exactly the same dataset
SEED = 0
rng = np.random.default_rng(SEED)

# limits on the number of nodes per graph (the upper limit is exclusive)
n_min = 20
n_max = 50

# number of graphs
num_g = 50

# number of node features - in this example the node features are random,
# so they carry no information about the label
n_nf = 3


# empty lists of graphs (adjacency matrices), labels and node feature matrices
graphs = []
labels = []
node_features = []


# limits on the probability of an edge existing (the upper limit is exclusive)
p_min = 0.05
p_max = 0.9

# probabilities are drawn on a 0.01 grid; round() is used instead of int()
# because int() truncates floating point error, e.g. int(0.57*100) == 56
p_low = round(p_min*100)
p_high = round(p_max*100)

# adding num_g random graphs, each labelled with its edge probability
for i in range(num_g):
    # cast to built-in int/float so the values match what the legacy
    # np.random.randint(...) calls produced
    rand_n = int(rng.integers(n_min, n_max))
    rand_p = int(rng.integers(p_low, p_high))/100

    # the numpy generator does not control networkx's own random draws,
    # so an explicit per-graph seed is derived from it and passed in
    graph_seed = int(rng.integers(0, 2**32))
    g = nx.fast_gnp_random_graph(rand_n, rand_p, seed=graph_seed)

    # one row of n_nf uniform random features per node
    node_feat_matrix = rng.random((rand_n, n_nf))

    graphs.append(nx.to_numpy_array(g))
    node_features.append(node_feat_matrix)

    labels.append(rand_p)


In [3]:
# Convert the generated data into the container format used by hcga.
# NOTE(review): `Graph` is imported but not used in the cells shown here.

from hcga.graph import Graph, GraphCollection

# create an empty graph collection object
g_c = GraphCollection()

# add the adjacency matrices, the per-graph node feature matrices and the
# per-graph labels (the edge probabilities) to the collection in one call
g_c.add_graph_list(graphs,node_features,labels)

In [4]:
# Sanity checks: report the size of the collection and the number of
# features attached to each node.

graph_count = len(g_c.graphs)
print('There are {} graphs'.format(graph_count))

feature_count = g_c.get_n_node_features()
print('There are {} features per node'.format(feature_count))


There are 50 graphs
There are 3 features per node


In [5]:
# Optionally save the collection to disk so that the rest of the pipeline can
# also be run from the command line.
# NOTE(review): the load cell further down reads
# './datasets/custom_dataset_regression/custom_dataset_regression.pkl', so this
# call presumably writes '<folder>/<name>.pkl' - confirm against hcga.io.
from hcga.io import save_dataset

save_dataset(g_c, 'custom_dataset_regression', folder='./datasets/custom_dataset_regression')

# Extracting features

We have now saved our custom data as a pickle dataset. The feature extraction can be run from the command line using the following command:

hcga extract_features ./datasets/custom_dataset_regression/custom_dataset_regression.pkl -m fast -n 4 -sl advanced --timeout 10


Alternatively, we can import the Hcga class and run the feature extraction and analysis from within the notebook. We will do this below.

In [6]:
# Load the dataset that was pickled above back from disk.
# NOTE: this rebinds `graphs`, which earlier held the plain list of adjacency
# matrices; from here on it refers to whatever load_dataset returns, so the
# earlier cell that passes `graphs` to add_graph_list should not be re-run
# after this one without regenerating the data first.
from hcga.io import load_dataset

graphs = load_dataset('./datasets/custom_dataset_regression/custom_dataset_regression.pkl')

In [7]:
# import the Hcga class, which drives feature extraction and analysis below
from hcga.hcga import Hcga

# create an Hcga object
h = Hcga()

# attach the dataset loaded from disk to the object's `graphs` attribute
h.graphs = graphs

In [8]:
# Extract features from every graph in 'fast' mode using 4 parallel workers.
# NOTE(review): timeout=20 is presumably a per-feature time limit in seconds,
# and 'fast' presumably restricts extraction to the cheaper features rather
# than all of them - confirm both against the hcga documentation.
h.extract(mode='fast',n_workers=4,timeout=20)



INFO:hcga.extraction:Extracting features from 50 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 50 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   17.4s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   24.5s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   31.9s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   38.0s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   55.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.2min finished
INFO:hcga.extraction:1093 feature extracted.


In [9]:
# save the extracted features to a pickle file so the analysis can be re-run
# later without repeating the extraction step
h.save_features('./results/custom_dataset_regression/all_features.pkl')


# Analysis

In [10]:
# Load the features saved by the previous section back into the Hcga object.
# This reads the same file that save_features wrote above.

h.load_features('./results/custom_dataset_regression/all_features.pkl')

In [11]:
# Run a regression analysis of the extracted features against the graph
# labels (analysis_type='regression', not classification); the feature file
# is the pickle saved above and results_folder points at the same directory.
h.analyse_features(feature_file='./results/custom_dataset_regression/all_features.pkl',
                   analysis_type='regression',
                   results_folder='./results/custom_dataset_regression')


INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:hcga.analysis:... Using Xgboost regressor ...
INFO:hcga.analysis:1093 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:1001 valid features
INFO:hcga.analysis:1001 with interpretability 1
INFO:hcga.analysis:Counts of graphs/label: 
0.74    3
0.60    2
0.81    2
0.11    2
0.87    2
0.68    2
0.26    2
0.69    1
0.59    1
0.57    1
0.17    1
0.48    1
0.88    1
0.76    1
0.58    1
0.84    1
0.24    1
0.30    1
0.19    1
0.49    1
0.52    1
0.63    1
0.56    1
0.15    1
0.53    1
0.75    1
0.41    1
0.29    1
0.31    1
0.32    1
0.09    1
0.40    1
0.35    1
0.37    1
0.83    1
0.73    1
0.45    1
0.10    1
0.07    1
0.42    1
0.67    1
0.27    1
Name: label, dtype: int64
INFO:hcga.analysis:Using 2 splits
INFO:hcga.analysis:Mean Absolute Error: --- 0.0