## Active Learning Library for Graphical Data
### Adi Faintuch
### University of California, Irvine   -  May 2021

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric import utils
import torch_scatter

In [None]:
import numpy as np
from scipy.stats import entropy
import copy
import matplotlib.pyplot as plt

In [None]:
from torch_geometric.datasets import Planetoid

In [None]:
# Dataset used by every cell below: the Planetoid dataset named 'Cora',
# with '/tmp/Cora' passed as its root directory. The commented-out cells
# that follow are alternative dataset choices.
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [None]:
#dataset = Planetoid(root='/tmp/CiteSeer', name='CiteSeer')

In [None]:
#dataset = Planetoid(root='/tmp/PubMed', name='PubMed')

In [None]:
class Net(torch.nn.Module):
    """Two-layer GCN that outputs per-node log-probabilities.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_node_features`` of the module-level ``dataset``.
        hidden_channels: Width of the hidden layer (16 by default).
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the module-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level dataset so a bare ``Net()`` keeps
        # working exactly as before.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x=None, edge_index=None):
        """Return log-softmax class scores for every node.

        Args:
            x: Node feature tensor. Defaults to ``data.x`` of the
                module-level ``data`` object.
            edge_index: Edge index tensor. Defaults to ``data.edge_index``
                of the module-level ``data`` object.

        Returns:
            Tensor of log-probabilities, normalised over dimension 1.
        """
        if x is None:
            x = data.x
        if edge_index is None:
            edge_index = data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [None]:
def train():
    """Perform a single optimisation step on the currently labelled nodes.

    Operates on the module-level ``model``, ``optimizer`` and ``data``:
    puts the model in training mode, computes the negative log-likelihood
    loss over the nodes selected by ``data.train_mask`` and applies one
    optimizer update. Returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    labelled = data.train_mask
    log_probs = model()
    loss = F.nll_loss(log_probs[labelled], data.y[labelled])

    loss.backward()
    optimizer.step()

def test():
    """Evaluate the module-level ``model`` on the three node masks.

    Returns:
        A list ``[train_acc, val_acc, test_acc]`` where each entry is the
        fraction (0..1) of correctly classified nodes under the
        corresponding mask of the module-level ``data`` object.
    """
    model.eval()
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        logits = model()

    accs = []
    # Calling ``data`` with attribute names yields (name, value) pairs.
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs
    
def make_and_update_queries(model, query_size=1, query_type="random"):
    """Pick nodes from the unlabelled pool and move them into the training set.

    Works on the module-level ``data`` object: the selected nodes are set to
    ``True`` in ``data.train_mask`` and to ``False`` in ``data.pool_mask``.

    Args:
        model: Callable module returning per-node log-probabilities (only
            used by the ``"entropy"`` strategy).
        query_size: Number of nodes to select. Capped at the number of nodes
            still in the pool.
        query_type: Selection strategy.
            ``"random"``  - uniformly at random from the pool;
            ``"degree"``  - pool nodes with the highest degree;
            ``"entropy"`` - pool nodes whose predicted class distribution
            has the highest entropy.

    Returns:
        1-D tensor with the indices of the selected nodes (empty when the
        pool is empty or ``query_size`` is not positive).

    Raises:
        ValueError: If ``query_type`` is not one of the known strategies.
    """
    if query_type not in ("random", "degree", "entropy"):
        raise ValueError(f"Unknown query_type: {query_type!r}")

    pool_ind = torch.where(data.pool_mask)[0]
    # Never ask for more nodes than the pool still holds.
    num_queries = min(max(int(query_size), 0), pool_ind.size(0))

    if query_type == "random":
        idx = torch.randperm(pool_ind.size(0))[:num_queries]
        queries = pool_ind[idx]
    else:
        if query_type == "degree":
            degrees = utils.degree(data.edge_index[0, :], num_nodes=data.x.shape[0])
            scores = degrees.detach().cpu().numpy()
        else:
            # Score in eval mode without autograd, then restore the caller's
            # mode so the model is left as it was found.
            was_training = model.training
            model.eval()
            try:
                with torch.no_grad():
                    log_probs = model()
            finally:
                model.train(was_training)
            # The model emits log-probabilities; exponentiate so the entropy
            # is taken over the actual predictive distribution.
            probs = log_probs.exp().cpu().numpy()
            scores = entropy(probs, base=2, axis=1)

        # Rank only the pool nodes (instead of zeroing non-pool scores) so a
        # non-pool node can never win, even when every pool score is 0.
        # The stable sort keeps the lowest index first among ties.
        pool_scores = scores[pool_ind.cpu().numpy()]
        ranked = np.argsort(-pool_scores, kind="stable")[:num_queries]
        queries = pool_ind[torch.as_tensor(ranked, dtype=torch.long)]

    # Invariant: none of the selected nodes is already a training node.
    assert not torch.any(data.train_mask[queries]).item()
    data.train_mask[queries] = True

    # Invariant: every selected node was still queryable.
    assert torch.all(data.pool_mask[queries]).item()
    data.pool_mask[queries] = False

    return queries

In [None]:
# Active-learning experiment: for every query strategy, repeat `num_runs`
# times: start from `initial_train_size` labelled nodes, retrain a fresh
# model, record its accuracies, then move `query_size` pool nodes into the
# training set, until the original training nodes are used up.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
initial_train_size = 10
query_size = 1
num_runs = 3  # independent repetitions per query strategy

query_methods = ["random", "degree", "entropy"]

random_results = []
degree_results = []
entropy_results = []
# Routes each strategy to the list the plotting cells read from.
results_by_method = {
    "random": random_results,
    "degree": degree_results,
    "entropy": entropy_results,
}

for query in query_methods:
    for run in range(num_runs):
        data = dataset[0]
        original_train_size = data.train_mask.sum().item()
        print("original train size: ", original_train_size)
        data = data.to(device)
        # Work on private copies of both masks: they are edited in place
        # below, and the tensors returned by `dataset[0]` may share storage
        # with the dataset itself (NOTE(review): confirm for the installed
        # torch_geometric version).
        data.pool_mask = data.train_mask.clone()
        data.train_mask = data.train_mask.clone()
        # The first `initial_train_size` node indices stay labelled; the
        # remaining original training nodes form the query pool.
        data.pool_mask[:initial_train_size] = False
        data.train_mask[initial_train_size:] = False
        all_queries = []
        active_accuracies = []
        for active_q in range((original_train_size - initial_train_size) // query_size):
            # Retrain from scratch on the current labelled set.
            model = Net().to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

            best_val = -1.
            best_model = None
            acc_list = [-1., -1., -1.]

            print(f"Train Size {data.train_mask.sum().item()} Pool Size {data.pool_mask.sum().item()}")
            for epoch in range(1, 201):
                train()
                train_acc, val_acc, test_acc = test()
                # Keep the accuracies and weights of the best-validation epoch.
                if val_acc > best_val:
                    acc_list[0] = train_acc
                    acc_list[1] = val_acc
                    acc_list[2] = test_acc
                    best_val = val_acc
                    best_model = copy.deepcopy(model)

            log = 'Active Run: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
            print(log.format(active_q, acc_list[0], acc_list[1], acc_list[2]))
            active_accuracies.append(acc_list)

            queries = make_and_update_queries(best_model, query_size=query_size, query_type=query)
            # Record each queried node exactly once.
            all_queries.extend(queries.tolist())

        # Rows: query rounds; columns: train / val / test accuracy.
        print(query.capitalize())
        results_by_method[query].append(np.array(active_accuracies))

In [None]:
# Test accuracy of every "random" run against the labelled-set size.
plt.title("Cora Random Results")
plt.xlabel("Train Size (# Nodes)")
plt.ylabel("Accuracy %")
for b, run_acc in enumerate(random_results):
    # Round k was trained on initial_train_size + k * query_size nodes, so
    # use that as x instead of the bare round index.
    train_sizes = initial_train_size + query_size * np.arange(len(run_acc))
    # Column 2 holds test accuracy as a fraction; scale to match the % label.
    plt.plot(train_sizes, 100 * run_acc[:, 2], label="random, " + str(b))

plt.legend();

In [None]:
# Test accuracy of every "degree" run against the labelled-set size.
plt.title("Cora Degree Results")
plt.xlabel("Train Size (# Nodes)")
plt.ylabel("Accuracy %")
for b, run_acc in enumerate(degree_results):
    # Round k was trained on initial_train_size + k * query_size nodes, so
    # use that as x instead of the bare round index.
    train_sizes = initial_train_size + query_size * np.arange(len(run_acc))
    # Column 2 holds test accuracy as a fraction; scale to match the % label.
    plt.plot(train_sizes, 100 * run_acc[:, 2], label="degree, " + str(b))

plt.legend();

In [None]:
# Test accuracy of every "entropy" run against the labelled-set size.
plt.title("Cora Entropy Results")
plt.xlabel("Train Size (# Nodes)")
plt.ylabel("Accuracy %")
for b, run_acc in enumerate(entropy_results):
    # Round k was trained on initial_train_size + k * query_size nodes, so
    # use that as x instead of the bare round index.
    train_sizes = initial_train_size + query_size * np.arange(len(run_acc))
    # Column 2 holds test accuracy as a fraction; scale to match the % label.
    plt.plot(train_sizes, 100 * run_acc[:, 2], label="entropy, " + str(b))

plt.legend();

In [None]:
# Mean test accuracy per strategy (averaged over runs) against the
# labelled-set size.
plt.title("Cora Combined Results")
plt.xlabel("Train Size (# Nodes)")
plt.ylabel("Accuracy %")

for label, runs in (
    ("random", random_results),
    ("degree", degree_results),
    ("entropy", entropy_results),
):
    # Average over runs (axis 0), then take column 2 = test accuracy.
    mean_test_acc = np.mean(runs, axis=0)[:, 2]
    # Round k was trained on initial_train_size + k * query_size nodes.
    train_sizes = initial_train_size + query_size * np.arange(len(mean_test_acc))
    # Accuracies are stored as fractions; scale to match the % label.
    plt.plot(train_sizes, 100 * mean_test_acc, label=label)

plt.legend();