### TODO
1. Softmax backprop
2. Fix backprop
3. Gradient checking
4. Visualize loss
5. Embeddings

In [1]:
import config
import argparse
import random
import networkx as nx
import numpy as np
from collections import defaultdict
from pyvis.network import Network
import matplotlib.pyplot as plt
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

ModuleNotFoundError: No module named 'pyvis'

In [None]:
# Seed every RNG this notebook draws from so runs are reproducible:
# NumPy is used for weight initialisation, `random` by random_color().
seed = 100
np.random.seed(seed)
random.seed(seed)

### Graph data

In [None]:
# Load the karate club graph that ships with NetworkX; it is the data set
# used for every later cell.
G = nx.karate_club_graph()

In [None]:
# Sanity check: display (number of nodes, number of edges).
G.number_of_nodes(), G.number_of_edges()

### Generate labels from communities

In [None]:
# Detect communities and use the community index of each node as its class.
communities = greedy_modularity_communities(G)

# One class per community, numbered in the order the communities are returned.
classes = set(range(len(communities)))
num_classes = len(classes)

# colors[node] holds the community index of that node.
colors = np.zeros(G.number_of_nodes())
for i, c in enumerate(communities):
    colors[list(c)] = i

# One-hot encode and transpose so labels is laid out (num_classes, num_nodes).
labels = np.eye(num_classes)[colors.astype(int)].T

### Color nodes

In [None]:
def random_color():
    """Return a random colour as an upper-case ``#RRGGBB`` hex string."""
    # Draw the red, green and blue channels one after another.
    channels = [random.randint(0, 255) for _ in range(3)]
    return '#{:02X}{:02X}{:02X}'.format(*channels)

# uncomment for random colors
# color_map = {cls: random_color() for cls in classes}
color_map = {0: '#46FB47', 1: '#B9E6B5', 2: '#9F9EBF'}

# Build a pyvis network in which every node is coloured by its community.
colored_graph = Network(width='100%', notebook=True)

for node in G.nodes():
    community = int(colors[node])
    colored_graph.add_node(node, color=color_map[community])

for source, target in G.edges():
    colored_graph.add_edge(int(source), int(target))

# Write the rendering to an HTML file.
colored_graph.show('colored_graph.html')

#### Renormalization trick

$A$ is the adjacency matrix, $I$ is the identity matrix, and $N$ is the cardinality of the set of nodes in the graph.

$$
\begin{align}
    \tilde{A} &= A + I_{N}\\
       \tilde{\mathcal{D}}_{ii} &= \sum_{j}\tilde{A}_{ij}\\
    \hat{\mathcal{A}}&=\tilde{\mathcal{D}}^{-\frac{1}{2}}\tilde{A}\tilde{\mathcal{D}}^{-\frac{1}{2}}
\end{align}
$$

In [None]:
def renormalization(G):
    """Apply the GCN renormalization trick: D~^(-1/2) (A + I) D~^(-1/2).

    ``A`` is the adjacency matrix, ``I`` the identity, and ``D~`` the diagonal
    matrix holding the row sums of ``A + I``.

    inputs:
    G (nx.Graph or array-like) Graph to normalize, or an already-built square
                               adjacency matrix (ndarray / nested sequence).

    outputs:
    (np.matrix) Normalized adjacency matrix with self-loops, N x N. The
                np.matrix type is kept so callers see the same type as before.
    """
    if isinstance(G, (np.ndarray, list, tuple)):
        adjacency = np.array(G, dtype=float)
    else:
        # nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array is
        # the supported replacement.
        adjacency = np.asarray(nx.to_numpy_array(G), dtype=float)

    a_tilde = adjacency + np.eye(adjacency.shape[0])

    # Keep the degrees as floats: an integer degree matrix would truncate the
    # degrees of graphs with fractional edge weights.
    degrees = a_tilde.sum(axis=1)

    # D~ is diagonal, so D~^(-1/2) is the element-wise reciprocal square root
    # of the degrees; no dense matrix inversion is needed.
    # NOTE(review): assumes non-negative edge weights, so every degree is >= 1.
    inv_sqrt = 1.0 / np.sqrt(degrees)

    # Scaling row i and column j by inv_sqrt is the same as D @ A~ @ D.
    a_hat = a_tilde * inv_sqrt[:, np.newaxis] * inv_sqrt[np.newaxis, :]
    return np.asmatrix(a_hat)

In [None]:
# Display the raw adjacency matrix. nx.to_numpy_matrix was removed in
# NetworkX 3.0, so use its replacement to_numpy_array.
nx.to_numpy_array(G)

In [None]:
# Renormalized adjacency matrix of the graph; passed to the GCN model below.
A_hat = renormalization(G)
print(A_hat)

### Helper functions

In [None]:
def glorot_init(in_dim, out_dim):
    """Draw an (in_dim, out_dim) weight matrix with Glorot uniform initialisation.

    Values are sampled uniformly from [-limit, limit) with
    limit = sqrt(6 / (in_dim + out_dim)), using NumPy's global RNG.
    """
    limit = np.sqrt(6.0 / (in_dim + out_dim))
    shape = (in_dim, out_dim)
    return np.random.uniform(low=-limit, high=limit, size=shape)

In [None]:
class GradientDescent(object):
    """Plain gradient descent over layers exposing W, b, W_grad and b_grad."""

    def __init__(self, parameters, learning_rate):
        # parameters: iterable of layer objects; learning_rate: step size.
        self.parameters, self.learning_rate = parameters, learning_rate

    def zero_gradients(self):
        """Replace every layer's gradient buffers with fresh zero arrays."""
        for module in self.parameters:
            module.W_grad = np.zeros(module.W.shape)
            module.b_grad = np.zeros(module.b.shape)

    def step(self):
        """Update each layer's W and b in place by -learning_rate * gradient."""
        rate = self.learning_rate
        for module in self.parameters:
            module.W -= rate * module.W_grad
            module.b -= rate * module.b_grad

### Graph Convolutional Layer

$$
\text{ReLU}\bigl(W^{(1)}(\hat{A}X)^{\top}+b^{(1)}\bigr)
$$

In [None]:
class GCLayer(object):
    """Graph convolutional layer computing ``ReLU(W (G x)^T + b)``.

    ``W`` has shape (output_dim, input_dim) and ``b`` has shape
    (output_dim, 1). Gradients are accumulated into ``W_grad`` / ``b_grad``
    by ``backward`` and are expected to be reset by the optimizer.
    """

    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        # glorot_init(a, b) returns an (a, b) array, so this gives W in
        # (output_dim, input_dim) layout.
        self.W = glorot_init(output_dim, input_dim)
        self.W_grad = np.zeros(self.W.shape)
        self.b = np.ones((output_dim, 1))
        self.b_grad = np.zeros(self.b.shape)

    def relu(self, x):
        """Element-wise ReLU."""
        # np.maximum keeps the input's floating dtype. An np.vectorize'd
        # lambda infers the output dtype from the first element only, so a
        # non-positive first entry would truncate the whole result to int.
        return np.maximum(x, 0)

    def relu_derivative(self, x):
        """Element-wise ReLU derivative: 1 where x > 0, otherwise 0."""
        return np.asarray((x > 0) * 1)

    def __call__(self, G, x):
        """Forward pass.

        inputs:
        G (array-like) Square (n, n) propagation matrix, where n is the
                       number of nodes (e.g. the output of renormalization).
        x (np.ndarray) Node feature matrix, (n, f) with f == input_dim.

        outputs:
        (array-like)   Activations transposed to (n, h) with h == output_dim,
                       ready to be multiplied by G in a following layer.
        """
        self.i = x                          # (n, f)
        self.G = G                          # kept for the backward pass
        self.X = (G @ x).T                  # (n, n) @ (n, f) -> (n, f) -> (f, n)
        self.z = self.W @ self.X + self.b   # (h, f) @ (f, n) + (h, 1) -> (h, n)
        self.a = self.relu(self.z)          # (h, n)
        return self.a.T                     # (n, h)

    def backward(self, error, compute_error=True):
        """Accumulate parameter gradients and optionally back-propagate.

        inputs:
        error (np.ndarray)   Gradient of the loss w.r.t. this layer's
                             activations, in (h, n) layout.
        compute_error (bool) When False, skip the gradient w.r.t. the layer
                             input and return None.

        outputs:
        (np.ndarray or None) Gradient w.r.t. the transposed layer input,
                             shape (f, n), i.e. the (h, n) activation layout
                             of the layer that produced the input.
        """
        batch_size = self.X.shape[1]

        # Chain through the ReLU first: dL/dz = dL/da * ReLU'(z). Plain
        # ndarrays are used so that `*` is always element-wise.
        delta = np.asarray(error) * np.asarray(self.relu_derivative(self.z))  # (h, n)

        # (h, n) @ (n, f) -> (h, f), averaged over the nodes.
        self.W_grad += np.asarray(delta @ np.asarray(self.X).T) / batch_size

        # Uniform average over the nodes, kept as a column to match b_grad.
        self.b_grad += delta.mean(axis=1, keepdims=True)

        if not compute_error:
            return None

        # X = (G x)^T, so dL/d(x^T) = (W^T delta) G: (f, h) @ (h, n) @ (n, n).
        return np.asarray(self.W.T @ delta @ self.G)

    

### Linear Layer

In [None]:
class Linear(object):
    """Fully connected layer with ReLU activation: ``ReLU(W x + b)``.

    ``W`` has shape (output_dim, input_dim) and ``b`` has shape
    (output_dim, 1). Gradients are accumulated into ``W_grad`` / ``b_grad``
    by ``backward``.
    """

    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        # glorot_init(a, b) returns an (a, b) array, so this gives W in
        # (output_dim, input_dim) layout.
        self.W = glorot_init(output_dim, input_dim)
        self.W_grad = np.zeros(self.W.shape)
        self.b = np.ones((self.output_dim, 1))
        self.b_grad = np.zeros(self.b.shape)

    def relu(self, x):
        """Element-wise ReLU."""
        # np.maximum keeps the input's floating dtype. An np.vectorize'd
        # lambda infers the output dtype from the first element only, so a
        # non-positive first entry would truncate the whole result to int.
        return np.maximum(x, 0)

    def relu_derivative(self, x):
        """Element-wise ReLU derivative: 1 where x > 0, otherwise 0."""
        return (x > 0) * 1

    def __call__(self, x):
        """Forward pass.

        inputs:
        x (np.ndarray) Inputs to this layer, (f, n) with f == input_dim.

        outputs:
        a (np.ndarray) Output activations, (h, n) with h == output_dim.
        """
        self.x = x                         # (f, n)
        self.z = self.W @ x + self.b       # (h, f) @ (f, n) + (h, 1) -> (h, n)
        self.a = self.relu(self.z)         # (h, n)
        return self.a

    def backward(self, error):
        """Accumulate parameter gradients and back-propagate the error.

        inputs:
        error (np.ndarray) Gradient of the loss w.r.t. this layer's
                           activations, shape (output_dim, batch_size).

        outputs:
        (np.ndarray)       Gradient w.r.t. the layer input, shape
                           (input_dim, batch_size).
        """
        # The forward pass applies ReLU, so every gradient has to go through
        # its derivative: dL/dz = dL/da * ReLU'(z). Plain ndarrays are used so
        # that `*` is always element-wise.
        delta = np.asarray(error) * np.asarray(self.relu_derivative(self.z))  # (h, n)
        batch_size = delta.shape[1]

        # (h, n) @ (n, f) -> (h, f), averaged over the batch.
        self.W_grad += np.asarray(delta @ np.asarray(self.x).T) / batch_size

        # keepdims keeps the (h, 1) column shape of b_grad; a flat (h,) sum
        # cannot be added in place to an (h, 1) array.
        self.b_grad += delta.sum(axis=1, keepdims=True) / batch_size

        return self.W.T @ delta            # (f, h) @ (h, n) -> (f, n)
        

In [None]:
class Softmax(object):
    """Softmax over the class axis (axis 0) of a (classes, samples) array."""

    def __init__(self, stable=False):
        # When True, subtract each column's maximum before exponentiating to
        # avoid overflow; the resulting probabilities are unchanged.
        self.stable = stable

    def __call__(self, x):
        """Convert logits to class probabilities.

        inputs:
        x (np.array) Logits (unnormalized outputs) from the final layer,
                     laid out (classes, samples).

        outputs:
        x (np.array) Class probabilities with the same shape as the input;
                     every column sums to 1.
        """
        if self.stable:
            x = x - np.max(x, axis=0)
        exps = np.exp(x)
        # Normalize each sample (column) separately. Summing over the whole
        # array would make all samples share a single distribution.
        return exps / np.sum(exps, axis=0)

### Graph Convolutional Network

In [None]:
class GCN(object):
    """Two graph convolutional layers, a linear layer and a softmax output."""

    def __init__(self, graph, num_classes):
        # graph: square propagation matrix shared by both graph layers.
        self.G = graph
        self.nodes = self.G.shape[0]
        # Identity matrix with one row per node; not used by the methods below.
        self.embedding = np.eye(self.nodes)

        hidden_dim = 16
        self.l0 = GCLayer(self.nodes, hidden_dim)
        self.l1 = GCLayer(hidden_dim, hidden_dim)
        self.l2 = Linear(hidden_dim, num_classes)
        self.softmax = Softmax(stable=False)

        # Layers whose weights an optimizer should update.
        self.parameters = [self.l0, self.l1, self.l2]

    def __call__(self, x):
        """Run the forward pass on node features x and return the softmax output."""
        hidden = self.l0(self.G, x)
        # The second graph layer's output is transposed before it is fed to
        # the linear layer.
        hidden = self.l1(self.G, hidden).T
        logits = self.l2(hidden)
        return self.softmax(logits)

    def backward(self, x):
        """Back-propagate the error signal x through l2, l1 and l0, in that order.

        x is handed to the linear layer's backward; each returned error is
        passed on to the layer before it. Returns None.
        """
        delta = self.l2.backward(x)
        delta = self.l1.backward(delta)
        # The first layer has nothing upstream, so its input error is skipped.
        self.l0.backward(delta, compute_error=False)
        

### Forward pass

#### Layer 1
$$
\begin{align}
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{W}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,34}\\
            \vdots & \ddots & \vdots\\
            w_{16,1} & \ldots & w_{16,34}
        \end{bmatrix}}
        \Biggl(
        \
            \underset{\hat{\mathcal{A}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                \alpha_{1,1} & \ldots & \alpha_{1,34}\\
                \vdots & \ddots & \vdots\\
                \alpha_{34,1} & \ldots & \alpha_{34,34}
            \end{bmatrix}}
            \ 
            \underset{\mathcal{X}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                x_{1,1} & \ldots & x_{1,34}\\
                \vdots & \ddots & \vdots\\
                x_{34,1} & \ldots & x_{34,34}
            \end{bmatrix}}
        \
        \Biggl)^{\top}
        +
        \underset{\mathcal{b}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            \vdots & \ddots & \vdots\\
            b_{16,1} & \ldots & b_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\underset{\mathcal{A}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        \vdots & \ddots & \vdots\\
        a_{16,1} & \ldots & a_{16,34}
    \end{bmatrix}}
    \rightarrow
    \Biggl(
    \
        \underset{\mathcal{A}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            a_{1,1} & \ldots & a_{1,34}\\
            \vdots & \ddots & \vdots\\
            a_{16,1} & \ldots & a_{16,34}
        \end{bmatrix}}
    \
    \Biggl)^{\top}
\end{align}
$$

#### Layer 2
$$
\begin{align}
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{W}^{(2)}\ \in\ \mathbb{R}^{16\times16}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,16}\\
            \vdots & \ddots & \vdots\\
            w_{16,1} & \ldots & w_{16,16}
        \end{bmatrix}}
        \
        \Biggl(
        \
            \underset{\hat{\mathcal{A}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                \alpha_{1,1} & \ldots & \alpha_{1,34}\\
                \vdots & \ddots & \vdots\\
                \alpha_{34,1} & \ldots & \alpha_{34,34}
            \end{bmatrix}}
            \
            \underset{\mathcal{A}^{(2)^{{\top}}}\ \in\ \mathbb{R}^{34\times16}}{
            \begin{bmatrix}
                a_{1,1} & \ldots & a_{1,16}\\
                \vdots & \ddots & \vdots\\
                a_{34,1} & \ldots & a_{34,16}
            \end{bmatrix}}
        \
        \Biggl)^{\top}
        +
        \underset{\mathcal{b}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            \vdots & \ddots & \vdots\\
            b_{16,1} & \ldots & b_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\underset{\mathcal{A}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        \vdots & \ddots & \vdots\\
        a_{16,1} & \ldots & a_{16,34}
    \end{bmatrix}}
    \rightarrow
    \Biggl(
    \
        \underset{\mathcal{A}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            a_{1,1} & \ldots & a_{1,34}\\
            \vdots & \ddots & \vdots\\
            a_{16,1} & \ldots & a_{16,34}
        \end{bmatrix}}
    \Biggl)^{\top}
\end{align}
$$


#### Layer 3
$$
\begin{align}
    &\text{Softmax}
    \Biggl(
    \
        \underset{\mathcal{W}^{(3)}\ \in\ \mathbb{R}^{3\times16}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,16}\\
            w_{2,1} & \ldots & w_{2,16}\\
            w_{3,1} & \ldots & w_{3,16}
        \end{bmatrix}}
        \
        \underset{\mathcal{A}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            a_{1,1} & \ldots & a_{1,34}\\
            \vdots & \ddots & \vdots\\
            a_{16,1} & \ldots & a_{16,34}
        \end{bmatrix}}
        +
        \underset{\mathcal{b}^{(3)}\ \in\ \mathbb{R}^{3\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            b_{2,1} & \ldots & b_{2,34}\\
            b_{3,1} & \ldots & b_{3,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{Softmax}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(3)}\ \in\ \mathbb{R}^{3\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            z_{2,1} & \ldots & z_{2,34}\\
            z_{3,1} & \ldots & z_{3,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\quad\underset{\mathcal{A}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        a_{2,1} & \ldots & a_{2,34}\\
        a_{3,1} & \ldots & a_{3,34}
    \end{bmatrix}}
\end{align}
$$


### Loss function

Why we use cross entropy loss for classification when doing MLE:
https://en.wikipedia.org/wiki/Cross_entropy#Relation_to_maximum_likelihood

In [None]:
def cross_ent(predictions, targets):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    inputs:
    predictions (np.ndarray) Class probabilities, (classes, samples).
    targets (np.ndarray)     Target distribution with the same layout.

    outputs:
    (float) Cross-entropy summed over classes and averaged over samples.
    """
    N = predictions.shape[1]  # samples live on axis 1
    targets_ = np.squeeze(np.asarray(targets))
    predictions_ = np.squeeze(np.asarray(predictions))
    # Clip away exact zeros: log(0) is -inf and 0 * -inf is NaN, which would
    # poison the loss as soon as any probability saturates.
    safe_predictions = np.clip(predictions_, 1e-12, None)
    ce = -np.sum(targets_ * np.log(safe_predictions)) / N
    return ce

### Backpropagation

#### Cross entropy loss

$$
\begin{align}
    \delta^{(4)}=&\quad\frac{\partial}{\partial z^{(3)}}\ \Bigl(-\sum_{k=1}^{3} y_{k}\log a^{(4)}_{k}\Bigr)\\
    =&\quad\mathcal{A}^{(4)}-Y\\
    =&\underset{\mathcal{A}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        a_{2,1} & \ldots & a_{2,34}\\
        a_{3,1} & \ldots & a_{3,34}
    \end{bmatrix}}
    -
    \underset{\mathcal{Y}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        y_{1,1} & \ldots & y_{1,34}\\
        y_{2,1} & \ldots & y_{2,34}\\
        y_{3,1} & \ldots & y_{3,34}
    \end{bmatrix}}\\
    =&\underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 3
$$
\begin{align}
    \nabla W^{(3)} =& \delta^{(4)}A^{(3)^{\top}}\\
    =& 
    \underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
    \
    \underset{\mathcal{A}^{(3)^{\top}}\ \in\ \mathbb{R}^{34\times16}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,16}\\
        \vdots & \ddots & \vdots\\
        a_{34,1} & \ldots & a_{34,16}
    \end{bmatrix}}\\
    =&
    \underset{\nabla\mathcal{W}^{(3)}\ \in\ \mathbb{R}^{3\times16}}{
    \begin{bmatrix}
        w_{1,1} & \ldots & w_{1,16}\\
        w_{2,1} & \ldots & w_{2,16}\\
        w_{3,1} & \ldots & w_{3,16}
    \end{bmatrix}}\\
    \nabla b^{(3)}=&\delta^{(4)}\\
    =&\underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}\\
    \delta^{(3)} =&\mathcal{W}^{(3)^{\top}}\delta^{(4)}\odot\frac{\partial}{\partial z^{(2)}}\text{ReLU}(z^{(2)})\\
    =&
    \underset{\mathcal{W}^{(3)^{\top}}\ \in\ \mathbb{R}^{16\times3}}{
    \begin{bmatrix}
        w_{1,1} & w_{1,2} & w_{1,3}\\
        \vdots & \ddots & \vdots\\
        w_{16,1} & w_{16,2} & w_{16,3}
    \end{bmatrix}}
    \
    \underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
    \odot
    \frac{\partial}{\partial z^{(2)}}
    \text{ReLU}
    \Biggl(
    \ 
        \underset{\mathcal{Z}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \ \Biggl)\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 2
$$
\begin{align}
    \nabla W^{(2)} =& \delta^{(3)}A^{(2)^{\top}}\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \
    \underset{\mathcal{A}^{(2)^{{\top}}}\ \in\ \mathbb{R}^{34\times16}}{
            \begin{bmatrix}
                a_{1,1} & \ldots & a_{1,16}\\
                \vdots & \ddots & \vdots\\
                a_{34,1} & \ldots & a_{34,16}
            \end{bmatrix}}\\
    \nabla b^{(2)} =& \delta^{(3)}\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}\\
    \delta^{(2)} =&\mathcal{W}^{(2)^{\top}}\delta^{(3)}\odot\frac{\partial}{\partial z^{(1)}}\text{ReLU}(z^{(1)})\\
    =&
    \underset{\mathcal{W}^{(2)^{\top}}\ \in\ \mathbb{R}^{16\times16}}{
    \begin{bmatrix}
        w_{1,1} & \ldots & w_{1,16}\\
        \vdots & \ddots & \vdots\\
        w_{16,1} & \ldots & w_{16,16}
    \end{bmatrix}}
    \
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \
    \odot\frac{\partial}{\partial z^{(1)}}\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggl)\\
    =&
    \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 1
$$
\begin{align}
    \nabla W^{(1)} =& \delta^{(2)}\mathcal{X}^{\top}\\
    =&
    \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \ 
    \underset{\mathcal{X^{\top}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                x_{1,1} & \ldots & x_{1,34}\\
                \vdots & \ddots & \vdots\\
                x_{34,1} & \ldots & x_{34,34}
            \end{bmatrix}}\\
    \nabla b^{(1)} =& \delta^{(2)}\\
    =& \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

### Training loop

In [None]:
def train(model, loss, epochs, features, labels, opt):
    """Run full-batch training for a fixed number of epochs.

    Each epoch runs a forward pass, evaluates ``loss(output, labels)``, hands
    ``output - labels`` to ``model.backward`` as the error signal, then lets
    the optimizer take a step and reset the gradients. The loss is printed
    every 100th epoch, starting with epoch 0.
    """
    report_every = 100

    for e in range(epochs):
        output = model(features)
        loss_val = loss(output, labels)

        # Error signal sent back through the model.
        model.backward(output - labels)

        opt.step()
        opt.zero_gradients()

        if e % report_every != 0:
            continue
        print(f'epoch {e} loss: {loss_val.mean()}')

### Hyperparameters

In [None]:
# Number of full-batch training iterations.
epochs = 1500
# Step size handed to the gradient-descent optimizer.
lr = 0.001

In [None]:
# Identity features: every node gets its own one-hot feature vector.
features = np.eye(G.number_of_nodes())
# Build the model on the renormalized adjacency matrix.
model = GCN(A_hat, num_classes)
opt = GradientDescent(model.parameters, lr)
# Train with cross-entropy loss; progress is printed by train().
train(model, cross_ent, epochs, features, labels, opt)