# Example 4: Predicting samples with unknown labels

Here we provide an example of extracting features and training a model on samples with known labels. We then take a secondary dataset without labels and predict the class of each of its graphs.

Of course, in a real scenario we cannot measure the accuracy of predictions on unlabelled samples. However, here we know how the data is generated, so we can confirm that the pipeline works correctly.

In [None]:
import numpy as np
import networkx as nx
import pandas as pd
from hcga.io import save_dataset
from hcga.graph import Graph, GraphCollection

import os
from pathlib import Path

# Make sure the output folders used throughout this notebook exist.
# exist_ok=True keeps the cell safe to re-run and avoids the gap between
# checking for a folder and creating it.
for folder in ("datasets", "results"):
    Path(folder).mkdir(exist_ok=True)

# Generate synthetic data with known labels

In [None]:
# This cell generates a synthetic, labelled graph dataset with node features.
# Two families of graphs are created so the classifier has something to
# separate: G(n, p) random graphs (label 0) and powerlaw cluster graphs
# (label 1).
#
# NOTE: np.random.randint excludes its upper bound, so every "max" below is an
# exclusive limit (e.g. graphs have between n_min and n_max - 1 nodes).

# limits on the number of nodes per graph
n_min = 20
n_max = 50

# total number of graphs (half of them per class)
num_g = 100

# number of node features - in this example the node features are random, so
# they are not useful for classification
n_nf = 3

# outputs of this cell: one adjacency matrix, one node-feature matrix and one
# label per graph, all kept in the same order
graphs = []
labels = []
node_features = []

# limits on the probability of an edge existing for the random graphs
p_min = 0.1
p_max = 0.5

# adding num_g // 2 random graphs (label 0)
for _ in range(num_g // 2):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100

    g = nx.fast_gnp_random_graph(rand_n, rand_p)

    # one row of random features per node
    node_feat_matrix = np.random.random((rand_n, n_nf))

    # only the adjacency matrix is passed on; multiplying by 2 gives every
    # edge a weight of 2
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)
    labels.append(0)

# limits on the number of edges to add per new node
m_min = 1
m_max = 5

# adding num_g // 2 powerlaw cluster graphs (label 1)
for _ in range(num_g // 2):
    rand_n = np.random.randint(n_min, n_max)
    # passed as the third argument of powerlaw_cluster_graph below
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p)

    node_feat_matrix = np.random.random((rand_n, n_nf))

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)
    labels.append(1)

In [None]:
from hcga.graph import Graph, GraphCollection

# Bundle the adjacency matrices, node-feature matrices and labels generated
# above into a single graph collection.
graphs_labelled = GraphCollection()
graphs_labelled.add_graph_list(graphs, node_features, labels)

# Save the labelled collection under ./datasets.
save_dataset(
    graphs_labelled,
    "custom_dataset_classification_labelled",
    folder="./datasets",
)

# Sanity checks: report the collection size and the node-feature count.
n_labelled = len(graphs_labelled.graphs)
print("There are {} graphs".format(n_labelled))
n_feats_per_node = graphs_labelled.get_n_node_features()
print("There are {} features per node".format(n_feats_per_node))

# Extract and analyse labelled data

In [None]:
# Create the Hcga object. The cells below call its load_data, extract and
# analyse_features methods, first on the labelled dataset and then on the
# unlabelled one.
from hcga.hcga import Hcga

h = Hcga()

In [None]:
# Load the primary (labelled) dataset and extract features from it; this is
# the data the model is trained on in the next cell.
# NOTE(review): timeout is 5 here, while the extraction of the unlabelled
# dataset later in this notebook uses timeout=20 - confirm this is intended.
h.load_data("./datasets/custom_dataset_classification_labelled.pkl")
h.extract(mode="fast", n_workers=4, timeout=5)

In [None]:
# Analyse the features of the labelled dataset, without plots, writing to
# ./results/test. save_model=True asks for the fitted model to be kept -
# presumably as ./results/test/fitted_model, which is the path the prediction
# cell further down passes as trained_model (confirm).
h.analyse_features(save_model=True, plot=False, results_folder="./results/test")

# Construct synthetic data with no labels

We now create synthetic data in the same way as the training data, but without labels.

In [None]:
# This cell generates a second synthetic graph dataset with node features,
# built exactly like the labelled one but without storing any labels.
# The first half of the list holds G(n, p) random graphs (the class that was
# labelled 0 in the training data) and the second half holds powerlaw cluster
# graphs (labelled 1 there), so the predictions can be checked by position.
#
# NOTE: np.random.randint excludes its upper bound, so every "max" below is an
# exclusive limit (e.g. graphs have between n_min and n_max - 1 nodes).

# limits on the number of nodes per graph
n_min = 20
n_max = 50

# total number of graphs (half of them per class)
num_g = 20

# number of node features - in this example the node features are random, so
# they are not useful for classification
n_nf = 3

# outputs of this cell: one adjacency matrix and one node-feature matrix per
# graph, kept in the same order (no labels)
graphs = []
node_features = []

# limits on the probability of an edge existing for the random graphs
p_min = 0.1
p_max = 0.5

# adding num_g // 2 random graphs (would be label 0)
for _ in range(num_g // 2):
    rand_n = np.random.randint(n_min, n_max)
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100

    g = nx.fast_gnp_random_graph(rand_n, rand_p)

    # one row of random features per node
    node_feat_matrix = np.random.random((rand_n, n_nf))

    # only the adjacency matrix is passed on; multiplying by 2 gives every
    # edge a weight of 2
    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)

# limits on the number of edges to add per new node
m_min = 1
m_max = 5

# adding num_g // 2 powerlaw cluster graphs (would be label 1)
for _ in range(num_g // 2):
    rand_n = np.random.randint(n_min, n_max)
    # passed as the third argument of powerlaw_cluster_graph below
    rand_p = np.random.randint(int(p_min * 100), int(p_max * 100)) / 100
    rand_m = np.random.randint(m_min, m_max)

    g = nx.powerlaw_cluster_graph(rand_n, rand_m, rand_p)

    node_feat_matrix = np.random.random((rand_n, n_nf))

    graphs.append(nx.to_numpy_array(g) * 2)
    node_features.append(node_feat_matrix)

In [None]:
# Build the collection from the graphs and node features only; no labels are
# passed for this dataset.
graphs_unlabelled = GraphCollection()
graphs_unlabelled.add_graph_list(graphs, node_features)

# Save the unlabelled collection under ./datasets.
save_dataset(
    graphs_unlabelled,
    "custom_dataset_classification_unlabelled",
    folder="./datasets",
)

# Sanity checks: report the collection size and the node-feature count.
n_unlabelled = len(graphs_unlabelled.graphs)
print("There are {} graphs in the unlabelled dataset".format(n_unlabelled))
n_feats_per_node = graphs_unlabelled.get_n_node_features()
print("There are {} features per node".format(n_feats_per_node))

# Extract and predict unlabelled data using pre-trained model

In [None]:
# Load the secondary (unlabelled) dataset into the same Hcga object and
# extract features from it.
# NOTE(review): no prediction-specific argument is passed to load_data or
# extract; presumably hcga handles the missing labels by itself - confirm.
# NOTE(review): timeout is 20 here, while the labelled dataset was extracted
# with timeout=5 - confirm the difference is intended.
h.load_data(
    "./datasets/custom_dataset_classification_unlabelled.pkl"
)
h.extract(mode="fast", n_workers=4, timeout=20)

In [None]:
# Run the analysis on the unlabelled features, passing the model saved during
# training via trained_model (presumably so that it is used for prediction
# rather than a new model being fitted - confirm). Results go to
# ./results/test; the final cell reads prediction_results.csv from there.
h.analyse_features(
    plot=False,
    trained_model="./results/test/fitted_model",
    results_folder="./results/test",
)

# Let's look at the predictions

In [None]:
# Read the predictions written to the results folder (first column is the
# index) and display them.
prediction_csv = "./results/test/prediction_results.csv"
predictions = pd.read_csv(prediction_csv, index_col=0)
print(predictions)