# Example 4: Predicting samples with unknown labels

Here we provide an example of extracting features and training a model on samples with known labels. We then take a secondary dataset without labels and predict the class of each of its samples.

Of course, in a real scenario we are unable to measure the accuracy of predictions on unlabelled samples. However, here we know how the data is generated and can therefore confirm that the pipeline works correctly.

In [1]:
import numpy as np
import networkx as nx
import pandas as pd
from hcga.io import save_dataset
from hcga.graph import Graph, GraphCollection

import os
from pathlib import Path

# Make sure the folders that the notebook writes to exist.
# Path.mkdir(exist_ok=True) is a no-op when the folder is already there, which
# avoids the separate "check, then create" step for each folder.
for folder in ("datasets", "results"):
    Path(folder).mkdir(exist_ok=True)

# Generate synthetic data with known labels

In [2]:
# Generate a synthetic, labelled graph dataset with node features.
# The first half of the graphs are G(n, p) random graphs (label 0) and the
# second half are powerlaw cluster graphs (label 1).

# seeded generator so that the dataset is reproducible on a fresh kernel
rng = np.random.default_rng(0)

# limits on the number of nodes (the upper limit is exclusive)
n_min = 20
n_max = 50

# number of graphs
num_g = 100

# number of node features - in this example the node features are random and
# therefore not useful for classification
n_nf = 3

# limits on the probability parameter of the graph generators,
# drawn in steps of 0.01 (the upper limit is exclusive)
p_min = 0.1
p_max = 0.5

# limits on the number of edges to add per node (the upper limit is exclusive)
m_min = 1
m_max = 5

# one entry per graph: adjacency matrix, node feature matrix and label
graphs = []
labels = []
node_features = []

# adding num_g / 2 random graphs (label 0)
for _ in range(num_g // 2):
    rand_n = int(rng.integers(n_min, n_max))
    rand_p = int(rng.integers(int(p_min * 100), int(p_max * 100))) / 100

    # pass an explicit seed so that the graph structure is reproducible as well
    g = nx.fast_gnp_random_graph(rand_n, rand_p, seed=int(rng.integers(2**32)))

    # one row of random features per node
    node_feat_matrix = rng.random((rand_n, n_nf))

    # only the adjacency matrix is stored (scaled so that every edge has weight 2);
    # the node features are kept in a separate list
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)
    labels.append(0)

# adding num_g / 2 powerlaw cluster graphs (label 1)
for _ in range(num_g // 2):
    rand_n = int(rng.integers(n_min, n_max))
    rand_p = int(rng.integers(int(p_min * 100), int(p_max * 100))) / 100
    rand_m = int(rng.integers(m_min, m_max))

    g = nx.powerlaw_cluster_graph(
        rand_n, rand_m, rand_p, seed=int(rng.integers(2**32))
    )

    node_feat_matrix = rng.random((rand_n, n_nf))

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)
    labels.append(1)

In [3]:
from hcga.graph import Graph, GraphCollection

# wrap the adjacency matrices, node features and labels in a GraphCollection
graphs_labelled = GraphCollection()
graphs_labelled.add_graph_list(graphs, node_features, labels)

# store the labelled collection in the ./datasets folder
save_dataset(
    graphs_labelled,
    "custom_dataset_classification_labelled",
    folder="./datasets",
)

# sanity checks: report the number of graphs and of node features
n_graphs = len(graphs_labelled.graphs)
print("There are {} graphs".format(n_graphs))
n_node_features = graphs_labelled.get_n_node_features()
print("There are {} features per node".format(n_node_features))

There are 100 graphs
There are 3 features per node


# Extract and analyse labelled data

In [4]:
# import the Hcga class and create the object `h` that is used below to load
# the datasets, extract features and analyse them
from hcga.hcga import Hcga

h = Hcga()

In [5]:
# load the labelled dataset saved above and extract its features;
# the extraction runs in "fast" mode with 4 workers
# (timeout=5 is presumably a time limit in seconds - TODO confirm against hcga docs)
h.load_data("./datasets/custom_dataset_classification_labelled.pkl")
h.extract(mode="fast", n_workers=4, timeout=5)

INFO:hcga.extraction:Extracting features from 100 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 100 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   10.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   15.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   17.6s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   20.3s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   23.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   25.1s finished
INFO:hcga.extraction:1096 feature extracted.


In [6]:
# analyse the features of the labelled dataset without plotting; save_model=True
# requests that the fitted model is saved (presumably inside results_folder,
# since the prediction step later reads "./results/test/fitted_model" - TODO confirm)
h.analyse_features(save_model=True, plot=False, results_folder="./results/test")

INFO:hcga.analysis:... Using Xgboost classifier ...
INFO:hcga.analysis:1096 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features
INFO:hcga.analysis:1024 valid features
INFO:hcga.analysis:1024 with interpretability 1
INFO:hcga.analysis:Counts of graphs/label: 
0.0    50
1.0    50
Name: label, dtype: int64
INFO:hcga.analysis:Using 10 splits
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Fold accuracy: --- 0.9 ---
INFO:hcga.analysis:Fold accuracy: --- 1.0 ---
INFO:hcga.analysis:Accuracy: 0.98 +/- 0.04
INFO:hcga.analysis:Now using a reduced set of 100 features with < 0.9 correlation.
INFO:hcga.analysis:Fold acc

# Construct synthetic data with no labels

We now create a second synthetic dataset, constructed in the same way as the training data but without labels.

In [7]:
# Generate a synthetic graph dataset with node features but WITHOUT labels.
# The first half of the graphs are G(n, p) random graphs and the second half
# are powerlaw cluster graphs; no label is recorded for either half.

# dedicated seeded generator so that the unlabelled dataset is reproducible
rng = np.random.default_rng(1)

# limits on the number of nodes (the upper limit is exclusive)
n_min = 20
n_max = 50

# number of graphs
num_g = 20

# number of node features - in this example the node features are random and
# therefore not useful for classification
n_nf = 3

# limits on the probability parameter of the graph generators,
# drawn in steps of 0.01 (the upper limit is exclusive)
p_min = 0.1
p_max = 0.5

# limits on the number of edges to add per node (the upper limit is exclusive)
m_min = 1
m_max = 5

# one entry per graph: adjacency matrix and node feature matrix
graphs = []
node_features = []

# adding num_g / 2 random graphs
for _ in range(num_g // 2):
    rand_n = int(rng.integers(n_min, n_max))
    rand_p = int(rng.integers(int(p_min * 100), int(p_max * 100))) / 100

    # pass an explicit seed so that the graph structure is reproducible as well
    g = nx.fast_gnp_random_graph(rand_n, rand_p, seed=int(rng.integers(2**32)))

    # one row of random features per node
    node_feat_matrix = rng.random((rand_n, n_nf))

    # only the adjacency matrix is stored (scaled so that every edge has weight 2);
    # the node features are kept in a separate list
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)

# adding num_g / 2 powerlaw cluster graphs
for _ in range(num_g // 2):
    rand_n = int(rng.integers(n_min, n_max))
    rand_p = int(rng.integers(int(p_min * 100), int(p_max * 100))) / 100
    rand_m = int(rng.integers(m_min, m_max))

    g = nx.powerlaw_cluster_graph(
        rand_n, rand_m, rand_p, seed=int(rng.integers(2**32))
    )

    node_feat_matrix = rng.random((rand_n, n_nf))

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)

In [8]:
# wrap the adjacency matrices and node features in a GraphCollection;
# no labels are passed for this dataset
graphs_unlabelled = GraphCollection()
graphs_unlabelled.add_graph_list(graphs, node_features)

# store the unlabelled collection in the ./datasets folder
save_dataset(
    graphs_unlabelled,
    "custom_dataset_classification_unlabelled",
    folder="./datasets",
)

# sanity checks: report the number of graphs and of node features
n_graphs_unlabelled = len(graphs_unlabelled.graphs)
print("There are {} graphs in the unlabelled dataset".format(n_graphs_unlabelled))
n_node_features_unlabelled = graphs_unlabelled.get_n_node_features()
print("There are {} features per node".format(n_node_features_unlabelled))

There are 20 graphs in the unlabelled dataset
There are 3 features per node


# Extract and predict unlabelled data using pre-trained model

In [9]:
# load the secondary dataset (no labels) into the same Hcga object `h`
# and extract its features in "fast" mode with 4 workers
# NOTE(review): timeout is 20 here but 5 for the labelled dataset - confirm
# that the difference is intended
h.load_data(
    "./datasets/custom_dataset_classification_unlabelled.pkl"
)
h.extract(mode="fast", n_workers=4, timeout=20)

INFO:hcga.extraction:Extracting features from 20 graphs (we disabled 0 graphs).
INFO:hcga.extraction:Computing features for 20 graphs:
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done  16 out of  20 | elapsed:    3.8s remaining:    0.9s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    4.7s finished
INFO:hcga.extraction:1096 feature extracted.


In [10]:
# analyse the features of the unlabelled dataset, passing the path of a
# previously fitted model via trained_model (presumably the model saved by the
# earlier analyse_features(save_model=True) call - TODO confirm)
# NOTE(review): the output recorded for this cell ends in a KeyError for the
# ('EU', 'semi_eulerian*') feature columns; the cause is not visible in this
# notebook and should be resolved before relying on the predictions
h.analyse_features(
    plot=False,
    trained_model="./results/test/fitted_model",
    results_folder="./results/test",
)

INFO:hcga.analysis:1096 total features
INFO:hcga.analysis:0 graphs were removed for more than 0.3 fraction of bad features


KeyError: "None of [MultiIndex([('EU',   'semi_eulerian'),\n            ('EU', 'semi_eulerian_E'),\n            ('EU', 'semi_eulerian_N')],\n           names=['feature_class', 'feature_name'])] are in the [columns]"

# Let's look at the predictions

In [None]:
# read the predictions from the results folder, using the first column of the
# csv file as the index, and show them
predictions_path = "./results/test/prediction_results.csv"
predictions = pd.read_csv(predictions_path, index_col=0)
print(predictions)