### TODO
1. [GCN Backprop](https://github.com/dmlc/dgl/issues/4021)
2. Visualize loss
3. [Embeddings](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings)

In [1]:
import sys
import config
import argparse
import random
import networkx as nx
import numpy as np
from collections import defaultdict
from pyvis.network import Network
import matplotlib.pyplot as plt
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

In [2]:
seed = 100
hidden_dim = 16
hidden_dim2 = 20
np.random.seed(seed)

### Graph data

In [3]:
G = nx.karate_club_graph()

In [4]:
G.number_of_nodes(), G.number_of_edges()

(34, 78)

### Generate labels from communities

In [5]:
communities = greedy_modularity_communities(G)
colors = np.zeros(G.number_of_nodes())
classes = set()

for i, c in enumerate(communities):
    colors[list(c)] = i
    classes.add(i)
    
num_classes = len(classes)
labels = (np.eye(len(classes))[colors.astype(int)]).T

classes, samples = labels.shape
print(f'classes: {classes}\t samples: {samples}')

classes: 3	 samples: 34


### Color nodes

In [6]:
def random_color():
    return '#%02X%02X%02X' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))

# uncomment for random colors
# color_map = {cls: random_color() for cls in classes}
color_map = {0: '#46FB47', 1: '#B9E6B5', 2: '#9F9EBF'}

colored_graph = Network(width='100%', notebook=True)

for node in G.nodes():
    colored_graph.add_node(node, color=color_map[int(colors[node])])
    
for edge in G.edges():
    colored_graph.add_edge(int(edge[0]), int(edge[1]))
    
colored_graph.show('colored_graph.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


#### Renormalization trick

$A$ is the adjacency matrix, $I$ is the identity matrix, and $N$ is the cardinality of the set of nodes in the graph.

$$
\begin{align}
    \tilde{A} &= A + I_{N}\\
       \tilde{\mathcal{D}}_{ii} &= \sum_{i}\tilde{A}_{ij}\\
    \hat{\mathcal{A}}&=\tilde{\mathcal{D}}^{-\frac{1}{2}}\tilde{\mathcal{A}}\tilde{\mathcal{D}}^{-\frac{1}{2}}
\end{align}
$$

In [7]:
def renormalization(G):
    A = np.asarray(nx.to_numpy_matrix(G))
    I = np.eye(len(A))
    A_tilde = A + I
    D_tilde = np.zeros(A.shape, int)
    np.fill_diagonal(D_tilde, np.sum(A_tilde, axis=1).flatten())
    D_tilde = np.linalg.inv(D_tilde)
    D_tilde = np.power(D_tilde, 0.5)
    return D_tilde @ A_tilde @ D_tilde

In [8]:
nx.to_numpy_matrix(G)

matrix([[0., 4., 5., ..., 2., 0., 0.],
        [4., 0., 6., ..., 0., 0., 0.],
        [5., 6., 0., ..., 0., 2., 0.],
        ...,
        [2., 0., 0., ..., 0., 4., 4.],
        [0., 0., 2., ..., 4., 0., 5.],
        [0., 0., 0., ..., 4., 5., 0.]])

In [9]:
# Must pre-process offline
A_hat = renormalization(G)
print(A_hat)

[[0.02325581 0.11136921 0.13076645 ... 0.06502561 0.         0.        ]
 [0.11136921 0.03333333 0.18786729 ... 0.         0.         0.        ]
 [0.13076645 0.18786729 0.02941176 ... 0.         0.0549235  0.        ]
 ...
 [0.06502561 0.         0.         ... 0.04545455 0.13655775 0.12182898]
 [0.         0.         0.0549235  ... 0.13655775 0.02564103 0.11437725]
 [0.         0.         0.         ... 0.12182898 0.11437725 0.02040816]]


### Helper functions

In [10]:
def init(model, scheme):
    for i, layer in enumerate(model.parameters):
        model.parameters[i].W = scheme(*layer.W.shape)
        model.parameters[i].b = scheme(*layer.b.shape)
        
    return model

def glorot_init(in_dim, out_dim):
    sd = np.sqrt(6.0 / (in_dim + out_dim))
    return np.random.uniform(-sd, sd, size=(in_dim, out_dim))

In [11]:
class GradientDescent(object):
    def __init__(self, parameters, learning_rate):
        self.parameters = parameters
        self.learning_rate = learning_rate
        
        
    def zero_gradients(self):
        for layer in self.parameters:
            layer.dW = np.zeros(layer.W.shape)
            layer.db = np.zeros(layer.b.shape)
    
    
    def step(self):
        for i, layer in enumerate(self.parameters):
            # TODO: Replace with assertion
            if np.any(np.isnan(layer.db)):
                print(f'nans layer {i}')
                
            layer.W -= self.learning_rate * layer.dW
            layer.b -= self.learning_rate * layer.db

### Activation functions

In [12]:
def relu(x):
    return np.maximum(0, x)

def relu_(x):
    return (x > 0).astype(int)

def softmax(x, axis=0):
    return np.exp(x)/np.sum(np.exp(x), axis=axis, keepdims=True)

### Graph Convolutional Layer

$$
\text{ReLU}(\hat{A}XW^{1}+b^{1})
$$

In [13]:
class GCLayer(object):
    def __init__(self, input_dim, output_dim, name=''):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.W = glorot_init(output_dim, input_dim)
        self.dW = np.zeros(self.W.shape)
        self.b = np.ones((output_dim, 1))
        self.db = np.zeros(self.b.shape)
            
    '''
    inputs:
    G (nx.Graph)   Normalized Laplacian matrix for a static graph.
                   Dimensions: N x N where N is the number of nodes.
    x (np.ndarray) Embedding matrix
                   Dimensions: N x F where F is the number of features.
    '''
    def __call__(self, G, x, activation=None):
        if not activation:
            activation = lambda x: x
            
        # (nodes x nodes), (nodes x features), so need to transpose
        # before taking linear combination
        self.z = x # (n,f)
        
        # need to apply the activations along feature/hidden dimension
        # since x is (n,f), transpose to apply activations, then transpose back
        # to dot with the adjacency matrix
        self.a = activation(x.T).T
        
        # (n,n) x (n,f) -> (n,f).T -> (f,n) so can left-multiply weight with features
        # this is purely stylistic preference.
        X = (G @ self.a).T
        
        print(f'({self.name}) W.shape: {self.W.shape}\t X.shape: {X.shape}')
        
        # transpose so can multiply by adjacency matrix in next layer, (n,h)
        return (self.W @ X + self.b).T # (h,f) x (f,n) + (h,1) -> (h,n). Broadcast bias vector.
    
    
    def backward(self, error, derivative=None):
        if not derivative:
            derivative = lambda x: x
            
        print(f'({self.name}) W.T.shape: {self.W.T.shape}\t error.shape: {error.shape}\t a.shape: {self.a.shape}')
        
        #self.dW = error @ self.a.T # (h,n) x (n,f) -> (h,f) which matches W.shape
        self.dW = error @ self.a
        self.db = np.sum(error, axis=1, keepdims=True) # (h,n)
        
        return self.W.T @ error * derivative(self.z)

    

### Linear Layer

In [14]:
class Linear(object):
    def __init__(self, input_dim, output_dim, name=''):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.W = glorot_init(output_dim, input_dim)
        self.dW = np.zeros(self.W.shape)
        self.b = np.ones((self.output_dim, 1))
        self.db = np.zeros(self.b.shape)    

    '''
    inputs:
    x (np.ndarray) Inputs to this layer
    
    outputs:
    a (np.ndarray) Output activations
    '''
    def __call__(self, x, activation=None):
        if not activation:
            activation = lambda x: x
            
        self.z = x # (f,n)
        self.a = activation(x)
        print(f'({self.name}) W.shape: {self.W.shape}\t a.shape: {self.a.shape}')
        return self.W @ self.a + self.b # (h,f) x (f,n) + (h,1) -> (h,n). Broadcast bias vector.
    
    
    '''
    inputs:
    error (np.ndarray) Error signal of shape (W.out_dim, batch_size) from subsequent layer
    
    outputs:
    
    '''
    def backward(self, error, derivative=None):
        if not derivative:
            derivative = lambda x: x
            
        batch_size = error.shape[1]
        
        print(f'({self.name}) W.T.shape: {self.W.T.shape}\t error.shape: {error.shape}\t a.T.shape: {self.a.T.shape}\t z.shape: {self.z.shape}')
        
        self.dW = error @ self.a.T # (h,n) x (n,f)        
        self.db = np.sum(error, axis=1, keepdims=True) # (h,n)
        
        return self.W.T @ error * derivative(self.z) # (f,h) x (h,n) * (h,n)
        

### Graph Convolutional Network

In [15]:
class GCN(object):
    def __init__(self, graph, num_classes):
        self.G = graph
        self.nodes = self.G.shape[0]
        self.embedding = np.eye(self.nodes)
        self.l0 = GCLayer(self.nodes, hidden_dim, name='l0')
        self.l1 = GCLayer(hidden_dim, hidden_dim2, name='l1')
        self.l2 = Linear(hidden_dim2, num_classes, name='l2')
        self.parameters = [self.l0, self.l1, self.l2]
        
    
    def __call__(self, x):
        a0 = self.l0(self.G, x, activation=relu)
        a1 = self.l1(self.G, a0, activation=relu).T # transpose b/c Linear layer expects (f,n)
        a2 = self.l2(a1)
        return softmax(a2)
    
    
    def backward(self, x):
        # Transpose errors from (34,3) -> (3,34) because linear weights are (16,34), but transposed for BP
        # so computation must be (labels, batch_size) x (batch_size, hidden_dim) to get error @ x.T
        d2 = self.l2.backward(x, derivative=relu_)
        d1 = self.l1.backward(d2, derivative=relu_)
        self.l0.backward(d1, derivative=None)
        

### Forward pass

#### Layer 1
$$
\begin{align}
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{W}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,34}\\
            \vdots & \ddots & \vdots\\
            w_{16,1} & \ldots & w_{16,34}
        \end{bmatrix}}
        \Biggl(
        \
            \underset{\hat{\mathcal{A}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                \alpha_{1,1} & \ldots & \alpha_{1,34}\\
                \vdots & \ddots & \vdots\\
                \alpha_{34,1} & \ldots & \alpha_{34,34}
            \end{bmatrix}}
            \ 
            \underset{\mathcal{X}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                x_{1,1} & \ldots & x_{1,34}\\
                \vdots & \ddots & \vdots\\
                x_{34,1} & \ldots & x_{34,34}
            \end{bmatrix}}
        \
        \Biggl)^{\top}
        +
        \underset{\mathcal{b}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            \vdots & \ddots & \vdots\\
            b_{16,1} & \ldots & b_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\underset{\mathcal{A}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        \vdots & \ddots & \vdots\\
        a_{16,1} & \ldots & a_{16,34}
    \end{bmatrix}}
    \rightarrow
    \Biggl(
    \
        \underset{\mathcal{A}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            a_{1,1} & \ldots & a_{1,34}\\
            \vdots & \ddots & \vdots\\
            a_{16,1} & \ldots & a_{16,34}
        \end{bmatrix}}
    \
    \Biggl)^{\top}
\end{align}
$$

#### Layer 2
$$
\begin{align}
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{W}^{(2)}\ \in\ \mathbb{R}^{16\times16}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,16}\\
            \vdots & \ddots & \vdots\\
            w_{16,1} & \ldots & w_{16,16}
        \end{bmatrix}}
        \
        \Biggl(
        \
            \underset{\hat{\mathcal{A}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                \alpha_{1,1} & \ldots & \alpha_{1,34}\\
                \vdots & \ddots & \vdots\\
                \alpha_{34,1} & \ldots & \alpha_{34,34}
            \end{bmatrix}}
            \
            \underset{\mathcal{A}^{(2)^{{\top}}}\ \in\ \mathbb{R}^{34\times16}}{
            \begin{bmatrix}
                a_{1,1} & \ldots & a_{1,16}\\
                \vdots & \ddots & \vdots\\
                a_{34,1} & \ldots & a_{34,16}
            \end{bmatrix}}
        \
        \Biggl)^{\top}
        +
        \underset{\mathcal{b}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            \vdots & \ddots & \vdots\\
            b_{16,1} & \ldots & b_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\underset{\mathcal{A}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        \vdots & \ddots & \vdots\\
        a_{16,1} & \ldots & a_{16,34}
    \end{bmatrix}}
    \rightarrow
    \Biggl(
    \
        \underset{\mathcal{A}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            a_{1,1} & \ldots & a_{1,34}\\
            \vdots & \ddots & \vdots\\
            a_{16,1} & \ldots & a_{16,34}
        \end{bmatrix}}
    \Biggl)^{\top}
\end{align}
$$


#### Layer 3
$$
\begin{align}
    &\text{Softmax}
    \Biggl(
    \
        \underset{\mathcal{W}^{(3)}\ \in\ \mathbb{R}^{3\times16}}{
        \begin{bmatrix}
            w_{1,1} & \ldots & w_{1,16}\\
            w_{2,1} & \ldots & w_{2,16}\\
            w_{3,1} & \ldots & w_{3,16}
        \end{bmatrix}}
        \
        \Biggl(
        \
            \underset{\hat{\mathcal{A}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                \alpha_{1,1} & \ldots & \alpha_{1,34}\\
                \vdots & \ddots & \vdots\\
                \alpha_{34,1} & \ldots & \alpha_{34,34}
            \end{bmatrix}}
            \ 
            \underset{\mathcal{A}^{(3)^{\top}}\ \in\ \mathbb{R}^{34\times16}}{
            \begin{bmatrix}
                a_{1,1} & \ldots & a_{1,16}\\
                \vdots & \ddots & \vdots\\
                a_{34,1} & \ldots & a_{34,16}
            \end{bmatrix}}
        \Biggl)^{\top}
        +
        \underset{\mathcal{b}^{(3)}\ \in\ \mathbb{R}^{3\times34}}{
        \begin{bmatrix}
            b_{1,1} & \ldots & b_{1,34}\\
            b_{2,1} & \ldots & b_{2,34}\\
            b_{3,1} & \ldots & b_{3,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =\
    &\text{Softmax}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(3)}\ \in\ \mathbb{R}^{3\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            z_{2,1} & \ldots & z_{2,34}\\
            z_{3,1} & \ldots & z_{3,34}
        \end{bmatrix}}
    \
    \Biggr)\\
    =
    &\quad\quad\quad\quad\underset{\mathcal{A}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        a_{2,1} & \ldots & a_{2,34}\\
        a_{3,1} & \ldots & a_{3,34}
    \end{bmatrix}}
\end{align}
$$


### Loss function

Why we use cross entropy loss for classification when doing MLE:
https://en.wikipedia.org/wiki/Cross_entropy#Relation_to_maximum_likelihood

In [16]:
def cross_ent(predictions, targets):
    N = predictions.shape[1] # (3,34), so index 1 for samples
    targets_ = np.squeeze(np.asarray(targets))
    predictions_ = np.squeeze(np.asarray(predictions))
    ce = -np.sum(targets_*np.log(predictions_))/N
    return ce

### Backpropagation

#### Cross entropy loss

$$
\begin{align}
    \delta^{(4)}=&\quad\frac{\partial}{\partial z^{(3)}}\ \frac{1}{2} \Big\lVert Y-H_{\mathcal{W},\mathcal{b}}(\mathcal{X})\Big\rVert^{2}\\
    =&\quad\mathcal{A}^{(4)}-Y\\
    =&\underset{\mathcal{A}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,34}\\
        a_{2,1} & \ldots & a_{2,34}\\
        a_{3,1} & \ldots & a_{3,34}
    \end{bmatrix}}
    -
    \underset{\mathcal{Y}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        y_{1,1} & \ldots & y_{1,34}\\
        y_{2,1} & \ldots & y_{2,34}\\
        y_{3,1} & \ldots & y_{3,34}
    \end{bmatrix}}\\
    =&\underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 3
$$
\begin{align}
    \nabla W^{(3)} =& \delta^{(4)}A^{(3)^{\top}}\\
    =& 
    \underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
    \
    \underset{\mathcal{A}^{(3)^{\top}}\ \in\ \mathbb{R}^{34\times16}}{
    \begin{bmatrix}
        a_{1,1} & \ldots & a_{1,16}\\
        \vdots & \ddots & \vdots\\
        a_{34,1} & \ldots & a_{34,16}
    \end{bmatrix}}\\
    =&
    \underset{\nabla\mathcal{W}^{(3)}\ \in\ \mathbb{R}^{3\times16}}{
    \begin{bmatrix}
        w_{1,1} & \ldots & w_{1,16}\\
        w_{2,1} & \ldots & w_{2,16}\\
        w_{3,1} & \ldots & w_{3,16}
    \end{bmatrix}}\\
    \nabla b^{(3)}=&\delta^{(4)}\\
    =&\underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}\\
    \delta^{(3)} =&\mathcal{W}^{(3)^{\top}}\delta^{(4)}\odot\frac{\partial}{\partial z^{(2)}}\text{ReLU}(z^{(2)})\\
    =&
    \underset{\mathcal{W}^{(3)^{\top}}\ \in\ \mathbb{R}^{16\times3}}{
    \begin{bmatrix}
        w_{1,1} & w_{1,2} & w_{1,3}\\
        \vdots & \ddots & \vdots\\
        w_{16,1} & w_{16,2} & w_{16,3}
    \end{bmatrix}}
    \
    \underset{\mathcal{\delta}^{(4)}\ \in\ \mathbb{R}^{3\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        d_{2,1} & \ldots & d_{2,34}\\
        d_{3,1} & \ldots & d_{3,34}
    \end{bmatrix}}
    \odot
    \frac{\partial}{\partial z^{(2)}}
    \text{ReLU}
    \Biggl(
    \ 
        \underset{\mathcal{Z}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \ \Biggl)\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 2
$$
\begin{align}
    \nabla W^{(2)} =& \delta^{(3)}A^{(2)^{\top}}\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \
    \underset{\mathcal{A}^{(2)^{{\top}}}\ \in\ \mathbb{R}^{34\times16}}{
            \begin{bmatrix}
                a_{1,1} & \ldots & a_{1,16}\\
                \vdots & \ddots & \vdots\\
                a_{34,1} & \ldots & a_{34,16}
            \end{bmatrix}}\\
    \nabla b^{(2)} =& \delta^{(3)}\\
    =&
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}\\
    \delta^{(2)} =&\mathcal{W}^{(2)^{\top}}\delta^{(3)}\odot\frac{\partial}{\partial z^{(1)}}\text{ReLU}(z^{(1)})\\
    =&
    \underset{\mathcal{W}^{(2)^{\top}}\ \in\ \mathbb{R}^{16\times16}}{
    \begin{bmatrix}
        w_{1,1} & \ldots & w_{1,16}\\
        \vdots & \ddots & \vdots\\
        w_{16,1} & \ldots & w_{16,16}
    \end{bmatrix}}
    \
    \underset{\mathcal{\delta}^{(3)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \
    \odot\frac{\partial}{\partial z^{(1)}}\text{ReLU}
    \Biggl(
    \
        \underset{\mathcal{Z}^{(1)}\ \in\ \mathbb{R}^{16\times34}}{
        \begin{bmatrix}
            z_{1,1} & \ldots & z_{1,34}\\
            \vdots & \ddots & \vdots\\
            z_{16,1} & \ldots & z_{16,34}
        \end{bmatrix}}
    \
    \Biggl)\\
    =&
    \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

#### Layer 1
$$
\begin{align}
    \nabla W^{(1)} =& \delta^{(2)}\mathcal{X}^{\top}\\
    =&
    \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
    \ 
    \underset{\mathcal{X^{\top}}\ \in\ \mathbb{R}^{34\times34}}{
            \begin{bmatrix}
                x_{1,1} & \ldots & x_{1,34}\\
                \vdots & \ddots & \vdots\\
                x_{34,1} & \ldots & x_{34,34}
            \end{bmatrix}}\\
    \nabla b^{(1)} =& \delta^{(2)}\\
    =& \underset{\mathcal{\delta}^{(2)}\ \in\ \mathbb{R}^{16\times34}}{
    \begin{bmatrix}
        d_{1,1} & \ldots & d_{1,34}\\
        \vdots & \ddots & \vdots\\
        d_{16,1} & \ldots & d_{16,34}
    \end{bmatrix}}
\end{align}
$$

### Training loop

In [17]:
def train(model, loss, epochs, features, labels, opt):
    for e in range(epochs):
        output = model(features)
        loss_val = loss(output, labels)
        deriv_loss = output - labels
        
#         try:
        model.backward(deriv_loss)
        opt.step()
        opt.zero_gradients()
#         except:
#             print(f'error at epoch {e}')
#             sys.exit(0)
        
        if e % 100 == 0:
            print(f'epoch {e} loss: {loss_val.mean()}')

### Hyperparameters

In [18]:
epochs = 1500
lr = 0.001

In [19]:
features = np.eye(G.number_of_nodes())
model = GCN(A_hat, num_classes)
opt = GradientDescent(model.parameters, lr)
train(model, cross_ent, epochs, features, labels, opt)

(l0) W.shape: (16, 34)	 X.shape: (34, 34)
(l1) W.shape: (20, 16)	 X.shape: (16, 34)
(l2) W.shape: (3, 20)	 a.shape: (20, 34)
(l2) W.T.shape: (20, 3)	 error.shape: (3, 34)	 a.T.shape: (34, 20)	 z.shape: (20, 34)
(l1) W.T.shape: (16, 20)	 error.shape: (20, 34)	 a.T.shape: (16, 34)


AttributeError: 'GCLayer' object has no attribute 'relu_derivative'