# Example: Predicting samples with unknown labels

Here we provide an example of extracting features and training a model on samples with known labels. We then take a secondary dataset without labels and predict the class of each of its samples.

Of course, in such a case we are unable to evaluate the accuracy of the predictions, but it is a more realistic scenario that researchers may encounter.

In [14]:
import numpy as np
import networkx as nx
import pandas as pd
from hcga.io import save_dataset
from hcga.graph import Graph, GraphCollection
from hcga.hcga import Hcga


# Generate synthetic data with known labels

In [10]:
# This cell generates a synthetic labelled graph dataset with node features:
# half of the graphs are G(n, p) random graphs (label 0) and the other half are
# powerlaw cluster graphs (label 1).

# limits on the number of nodes per graph (the upper bound is exclusive)
n_min = 20
n_max = 50

# total number of graphs, split evenly between the two classes
num_g = 100

# number of node features - in this example the node features are random and
# therefore not useful for classification
n_nf = 3

# limits on the edge probability of the random graphs (the upper bound is exclusive)
p_min = 0.1
p_max = 0.5

# limits on the number of edges added per new node in the powerlaw cluster
# graphs (the upper bound is exclusive)
m_min = 1
m_max = 5

# seeded generator so that re-running this cell reproduces the same dataset
rng = np.random.RandomState(0)

# index-aligned lists: adjacency matrices, node feature matrices and class labels
graphs = []
node_features = []
labels = []

# first half of the dataset: random graphs (label 0)
for _ in range(num_g // 2):
    rand_n = rng.randint(n_min, n_max)
    rand_p = rng.randint(int(p_min * 100), int(p_max * 100)) / 100

    # networkx draws from its own random source, so hand it an explicit seed
    graph_seed = int(rng.randint(0, 2**31 - 1))
    g = nx.fast_gnp_random_graph(rand_n, rand_p, seed=graph_seed)

    # each graph is stored as a dense adjacency matrix; the factor 2 gives every
    # edge a weight of 2. Node features are stored separately, one row per node.
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(rng.random_sample((rand_n, n_nf)))
    labels.append(0)

# second half of the dataset: powerlaw cluster graphs (label 1)
for _ in range(num_g // 2):
    rand_n = rng.randint(n_min, n_max)
    # here rand_p is passed as the third argument of powerlaw_cluster_graph
    rand_p = rng.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = rng.randint(m_min, m_max)

    graph_seed = int(rng.randint(0, 2**31 - 1))
    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p, seed=graph_seed)

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(rng.random_sample((rand_n, n_nf)))
    labels.append(1)

In [11]:
# Wrap the adjacency matrices, node features and labels in a GraphCollection.
# GraphCollection and save_dataset are already imported in the first code cell,
# so they are not imported again here.
graphs_labelled = GraphCollection()
graphs_labelled.add_graph_list(graphs, node_features, labels)

# save the labelled dataset; it is loaded back from
# ./datasets/custom_dataset_classification_labelled.pkl further down
save_dataset(graphs_labelled, 'custom_dataset_classification_labelled', folder='./datasets')

# perform some sanity checks
print('There are {} graphs'.format(len(graphs_labelled.graphs)))
print('There are {} features per node'.format(graphs_labelled.get_n_node_features()))

There are 100 graphs
There are 3 features per node


# Construct synthetic data with no labels

Here we create synthetic data in the same way as the training data, but without labels.

In [21]:
# This cell generates a synthetic graph dataset with node features but WITHOUT
# labels: half of the graphs are G(n, p) random graphs and the other half are
# powerlaw cluster graphs. No class information is recorded anywhere.

# limits on the number of nodes per graph (the upper bound is exclusive)
n_min = 20
n_max = 50

# total number of graphs, split evenly between the two graph types
num_g = 20

# number of node features - in this example the node features are random and
# therefore not useful for classification
n_nf = 3

# limits on the edge probability of the random graphs (the upper bound is exclusive)
p_min = 0.1
p_max = 0.5

# limits on the number of edges added per new node in the powerlaw cluster
# graphs (the upper bound is exclusive)
m_min = 1
m_max = 5

# seeded generator so that re-running this cell reproduces the same dataset.
# NOTE: keep this seed different from any seed used to generate training data,
# otherwise the same random draws (and hence the same graphs) would be repeated.
rng = np.random.RandomState(1)

# index-aligned lists: adjacency matrices and node feature matrices (no labels)
graphs = []
node_features = []

# first half of the dataset: random graphs
for _ in range(num_g // 2):
    rand_n = rng.randint(n_min, n_max)
    rand_p = rng.randint(int(p_min * 100), int(p_max * 100)) / 100

    # networkx draws from its own random source, so hand it an explicit seed
    graph_seed = int(rng.randint(0, 2**31 - 1))
    g = nx.fast_gnp_random_graph(rand_n, rand_p, seed=graph_seed)

    # each graph is stored as a dense adjacency matrix; the factor 2 gives every
    # edge a weight of 2. Node features are stored separately, one row per node.
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(rng.random_sample((rand_n, n_nf)))

# second half of the dataset: powerlaw cluster graphs
for _ in range(num_g // 2):
    rand_n = rng.randint(n_min, n_max)
    # here rand_p is passed as the third argument of powerlaw_cluster_graph
    rand_p = rng.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = rng.randint(m_min, m_max)

    graph_seed = int(rng.randint(0, 2**31 - 1))
    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p, seed=graph_seed)

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(rng.random_sample((rand_n, n_nf)))



In [25]:
# Build the unlabelled collection: only the adjacency matrices and the node
# feature matrices are supplied, the labels argument is left out.
graphs_unlabelled = GraphCollection()
graphs_unlabelled.add_graph_list(graphs, node_features)

# write the unlabelled dataset into the ./datasets folder
save_dataset(
    graphs_unlabelled,
    'custom_dataset_classification_unlabelled',
    folder='./datasets',
)

# sanity checks: report the number of graphs, then the number of node features
n_unlabelled_graphs = len(graphs_unlabelled.graphs)
print('There are {} graphs in the unlabelled dataset'.format(n_unlabelled_graphs))
n_unlabelled_node_features = graphs_unlabelled.get_n_node_features()
print('There are {} features per node'.format(n_unlabelled_node_features))

There are 20 graphs in the unlabelled dataset
There are 3 features per node


# Load labelled and unlabelled data into hcga

In [16]:
# Create the Hcga object. The cells below call its load_data, extract and
# predict_unlabelled_graphs methods in turn on this same instance.
h = Hcga()

In [17]:
# Primary dataset (with training labels): load the pickle saved above,
# then extract features from its graphs.
labelled_dataset_path = './datasets/custom_dataset_classification_labelled.pkl'
h.load_data(labelled_dataset_path)
h.extract(
    mode='fast',
    n_workers=4,
    timeout=20,
)

INFO:hcga.extraction:Extracting features from 100 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 100 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    7.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   20.0s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   23.6s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   32.2s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   37.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   40.1s finished
INFO:hcga.extraction:970 feature extracted.


In [23]:
# Secondary dataset (no labels): load it as the set of prediction graphs and
# extract its features as the prediction set, using the same extraction settings.
unlabelled_dataset_path = './datasets/custom_dataset_classification_unlabelled.pkl'
h.load_data(unlabelled_dataset_path, prediction_graphs=True)  # set prediction graphs to True
h.extract(
    mode='fast',
    n_workers=4,
    timeout=20,
    prediction_set=True,  # set prediction set to True
)


INFO:hcga.extraction:Extracting features from 20 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 20 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done  16 out of  20 | elapsed:    6.6s remaining:    1.7s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    7.5s finished
INFO:hcga.extraction:970 feature extracted.


# Predict unlabelled data

In [24]:
# Predict the unlabelled graphs that were loaded with prediction_graphs=True and
# extracted with prediction_set=True above. The log printed below reports that an
# Xgboost classifier is used. `y` holds whatever predict_unlabelled_graphs
# returns - presumably one predicted class per unlabelled graph (TODO confirm).
y = h.predict_unlabelled_graphs()


INFO:hcga.analysis:970 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:897 valid features
INFO:hcga.analysis:897 with interpretability 1
INFO:hcga.analysis:... Using Xgboost classifier ...
