## PROTEINS

In *message passing simplicial networks* they benchmark using some data sets from TUdatasets. Here we try the PROTEINS dataset. See "Protein Function Prediction via Graph Kernels", Bogwart et al., for details about the data set. 

There are two classes of graphs: *enzymes* and *not enzymes*

The data set has node features in $\{0,1 \}^3$ . 

### Todo: 
- Check initialization of cochains: I think right now its random 
- Try different models,  maybe more convolutional layers or higher powers of L 

In [7]:
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.datasets import TUDataset
import networkx as nx
import numpy as np
import gudhi as gd
import matplotlib.pyplot as plt
import numpy as np
  
import cochainlearning as cl

# Load the dataset

In [27]:
#dataset = TUDataset(root='data/TUDataset', name='PROTEINS')
dataset = TUDataset(root='tudata/TUDataset', name='AIDS')
dataset

Downloading https://www.chrsmrrs.com/graphkerneldatasets/AIDS.zip
Extracting tudata/TUDataset/AIDS/AIDS.zip
Processing...
Done!


AIDS(2000)

In [28]:
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: AIDS(2000):
Number of graphs: 2000
Number of features: 38
Number of classes: 2

Data(edge_index=[2, 106], x=[47, 38], edge_attr=[106, 3], y=[1])
Number of nodes: 47
Number of edges: 106
Average node degree: 2.26
Has isolated nodes: False
Has self-loops: False
Is undirected: True


#### Separate data into training and testing 

In [29]:
torch.manual_seed(12345)
dataset = dataset.shuffle()

len_train_set = 800

train_dataset = dataset[:len_train_set]
test_dataset = dataset[len_train_set:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 800
Number of test graphs: 1200


#### Make clique complexess with the graphs
clique complexes up to dimension 2 

In [82]:
def graph_to_chain(graph):
    """ 
    A function for turning a graph into a chain
    """

    # get node features
    node_features = torch.tensor(graph['x'])

    # get edges
    edge_index =torch.tensor(data['edge_index']).T

    # number of 1-simplices
    r = edge_index.shape[0]

    # embedding dimension
    n = node_features.shape[1]

    # sort the edge indices
    edges = torch.tensor([np.sort([edge_index[i][0],edge_index[i][1]]) for i in range(len(edge_index))])

    # initialize chain
    chain = torch.zeros((r,2,n))

    # turn edges into a 1-chain
    for i in range(r):
        chain[i,0,:] = node_features[edges[i][0]]
        chain[i,1,:] = node_features[edges[i][1]]

    return chain

# example
data = dataset[0]
graph_to_chain(data).shape


  node_features = torch.tensor(graph['x'])
  edge_index =torch.tensor(data['edge_index']).T


torch.Size([22, 2, 38])

## Train a model

In [118]:
class model(nn.Module):  
    
    """Define a simple model using convolutional layers and linear layers 
    to reduce the dim of the output """
    
    
    def __init__(self, n, out, c = 5, m1 = 50, m2 = 30, m3 = 20, m4 = 10): ## check channel sizes
        super().__init__()
        self.n = n
        self.c = c
        self.m1 = m1
        self.m2 = m2
        self.m3 = m3

        # initialise vector field
        self.vf = nn.Sequential(
                    nn.Linear(n, m1),
                    nn.ReLU(),
                    nn.Linear(m1, m2),
                    nn.ReLU(),
                    nn.Linear(m2, n*c)
                    )
        
        # initialise MLP classifier
        self.classifier = nn.Sequential(
                    nn.Linear(c, m3),
                    nn.ReLU(),
                    nn.Linear(m3, m4),
                    nn.ReLU(),
                    nn.Linear(m4, out)
                    )

        
        
    def forward(self, x):
        ## asses the dimensions are correct somewhere 
        "Here the input is a chain, and the output is a vector of probabilities"

        # generate cochain data matrix
        X = cl.gen_CDM(self.vf, x)

        # orientation invariant square L2-norm readout function
        X = torch.diag(X.T @ X)

        # put output through classifier
        output = self.classifier(X)

        # softmax
        sm = nn.functional.softmax(output)
        
        return sm
    
    

In [124]:
n = node_features.shape[1]
out = dataset.num_classes

basic_model = model(n = n, out = out)
basic_model.forward(chain)

  sm = nn.functional.softmax(output)


tensor([0.4674, 0.5326], grad_fn=<SoftmaxBackward0>)

In [129]:
dataset[0].y

tensor([1])

In [130]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(basic_model.parameters(), lr=1e-2)

criterion = torch.nn.CrossEntropyLoss() 

In [144]:
def train(dataset):
    
    basic_model.train()

    for data in dataset:  # Iterate in batches over the training dataset.

        chain = graph_to_chain(data)

        out = basic_model.forward(chain)  # Perform a single forward pass.

        # do a 1-hot encoding of data.y
        y = torch.zeros(dataset.num_classes)
        y[data.y] = 1
        
        print(out.shape)
        print(y)

        loss = criterion(out, y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

In [145]:
train(dataset)

torch.Size([2])
tensor([0., 1.])
torch.Size([2])
tensor([0., 1.])


  node_features = torch.tensor(graph['x'])
  edge_index =torch.tensor(data['edge_index']).T
  sm = nn.functional.softmax(output)


IndexError: index 7 is out of bounds for dimension 0 with size 7

In [None]:
epochs = 50

batch_size = 300

losses = torch.zeros((batch_size,epochs))

for j in range(epochs):

    # make a random choice of size batch_size
    idx = np.random.choice(len(paths), size=batch_size, replace=False)

    batch_paths = [paths[i] for i in idx]
    batch_labels = labels[idx]

        
    for i in range(batch_size):

        p = batch_paths[i]
        l = batch_labels[i]

        p = path_to_chain(p)
        
        X = gen_CDM2(vf, p, d = 5)
        X = torch.sum(X, dim = 0)

        sm = torch.nn.functional.softmax(X)


        loss = criterion(sm,l.float())

        losses[i,j] = loss.detach()

        
        loss.backward()

        # for some reason it works better doing the backprop/gradient step after each path

        optimizer.step()

        optimizer.zero_grad()

    # clear print statement from previous iteration
    # clear_output(wait=True)
    
    print("Epoch = ", j, "Loss = ", torch.sum(losses[:,j])/batch_size)


fig, axs = plt.subplots(3, 1, figsize=(5, 15))

for i in range(3):
    ax = axs[i]
    plot_component_vf(vf, ax, comp = i, x_range=10, y_range=10)
    ax.set_title('Component {}'.format(i+1))

# add a title to the figure
fig.suptitle('Final feature vector fields')

plt.show()