# Explorando GNNs para predicción de nodos

1. Setup (librerías, datos, splits)
2. EDA (¿cómo se ve el grafo?)
3. Un GNN sencillo
4. Un GNN con GraphNets


# 1 - Setup
## 1a  Bajar colab_utils + repo



<a href="https://colab.research.google.com/github/beangoben/gnn_workshop_riiaa/blob/master/Prediccion%20de%20Nodos%20con%20Arxiv%20MAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [None]:
# Download the workshop helper module next to the notebook (IPython shell escape).
!wget https://raw.githubusercontent.com/beangoben/gnn_workshop_riiaa/master/colab_utils.py -O colab_utils.py
# Remove the `sample_data` directory; `-f` makes this a no-op when it is absent.
!rm -rf sample_data
github_repo = 'https://github.com/beangoben/gnn_workshop_riiaa'
# Imported only after the download above has written colab_utils.py.
import colab_utils
# NOTE(review): presumably clones the workshop repository into the working
# directory -- confirm against colab_utils.
colab_utils.clone_repo(github_repo)

## 1b Instala paquetes via pip

In [None]:
# The extra packages are only installed when the helper reports a Colab runtime.
if colab_utils.is_running_colab():
    colab_utils.pip_install([
        'umap-learn',
        'dm-sonnet',
        'graph_nets',
        'ogb',
        'ml-collections',
    ])

## 1c importa modulos


In [None]:
import os
from collections import OrderedDict, defaultdict

import tqdm.auto as tqdm
import colab_utils 
import ml_collections

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import networkx as nx

import sklearn
import umap
import tensorflow as tf
import sonnet as snt
import graph_nets as gn
import ogb

# Report the versions of the main libraries used in this notebook.
colab_utils.print_module_versions([umap, tf, snt, nx, ogb])
# TensorFlow device types are upper-case: filtering by "gpu" never matches and
# always prints an empty list, even when a GPU is available.
print(f'Tiene GPU? {tf.config.list_physical_devices("GPU")}')
colab_utils.matplotlib_settings()

## 1d Bajar dataset via ogb (arxiv)

https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv

In [None]:
from ogb.nodeproppred import NodePropPredDataset

# Load the ogbn-arxiv node-property-prediction dataset.
# NOTE(review): presumably downloads the data on first use -- confirm in the ogb docs.
dataset = NodePropPredDataset(name = 'ogbn-arxiv')
# Bare expression: shown as the cell output in the notebook.
dataset

In [None]:
# Display the metadata attached to the dataset object.
dataset.meta_info

Datos relacionados a los papers


In [214]:
!wget https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz -O titleabs.tsv.gz
paper_df = pd.read_csv('titleabs.tsv.gz', sep='\t', compression="gzip", names=['paper id', 'title', 'abstract'],)
paper_df = paper_df.drop(0,axis=0).dropna()
paper_df['paper id'] = paper_df['paper id'].astype(int)
paper_df.set_index('paper id', drop=True, inplace=True)
paper_df

^C


KeyError: ignored

## 1e Establecer categorias a predecir

In [None]:
def get_topk_labels(dataset, k=10):
    """Return the ``k`` most populated labels and their arXiv category names.

    Args:
        dataset: Object exposing a ``labels`` array; it is flattened before
            counting.
        k: Number of labels to keep.

    Returns:
        A tuple ``(top_labels, categories)`` where ``top_labels`` is an array
        of label ids ordered by decreasing frequency and ``categories`` is the
        list of matching category names read from the label mapping file.

    Raises:
        KeyError: If a selected label is missing from the mapping file.
    """
    unique, counts = np.unique(dataset.labels.ravel(), return_counts=True)
    # argsort returns positions into `unique`, not label ids; map them back so
    # the result is correct even when the label ids are not exactly 0..n-1.
    top_labels = unique[np.argsort(counts)[::-1]][:k]

    mapping_path = os.path.join('dataset/ogbn_arxiv/mapping', 'labelidx2arxivcategeory.csv.gz')
    adf = pd.read_csv(mapping_path, compression="gzip")
    label_to_category = dict(zip(adf['label idx'], adf['arxiv category']))
    return top_labels, [label_to_category[i] for i in top_labels]

# Keep the 10 most frequent labels; N_LABELS is later used as the number of
# output classes of the models.
top_labels, categories = get_topk_labels(dataset, 10)
N_LABELS = len(top_labels)
# Bare expression: shown as the cell output in the notebook.
top_labels, categories

In [None]:
def labels_to_nodecolors(labels, k=10):
    """Map every label to a color of the ``k``-color "Set3" palette.

    Args:
        labels: Array of labels; it is flattened and each entry is cast to int
            to index the palette.
        k: Number of colors in the palette.

    Returns:
        A list with one palette color per label.
    """
    palette = sns.color_palette("Set3", k)
    colors = []
    for label in labels.ravel():
        colors.append(palette[int(label)])
    return colors

def plot_color_legend(k=10):
    """Draw the "Set3" palette annotated with the category names.

    Args:
        k: Number of palette colors to draw. Uses the module-level
            ``categories`` list for the tick labels.
    """
    cols = sns.color_palette("Set3", k)
    sns.palplot(cols)
    # Label only as many swatches as both the palette and `categories` provide,
    # so a `k` different from the number of categories cannot produce a
    # mismatched number of ticks and labels.
    n_ticks = min(k, len(categories))
    plt.xticks(np.arange(n_ticks) - 0.5, categories[:n_ticks], rotation=45)
    plt.show()

plot_color_legend()

## 1f Construir train-test split

In [None]:
def _relabeled_subgraph(g, keep):
    """Return the subgraph induced by ``keep`` relabeled to 0..m-1, plus the
    original node ids listed in the order of the new labels."""
    sub = g.subgraph(keep)
    # convert_node_labels_to_integers numbers nodes in the view's iteration
    # order, which is not guaranteed to be the order of `keep`; record that
    # order so per-node arrays can be aligned with the new labels.
    order = np.array(list(sub.nodes), dtype=np.int64)
    return nx.relabel.convert_node_labels_to_integers(sub), order


def make_arxiv_subset(dataset, split, label_subset=None, large_cc=True):
    """Build one split of the arxiv dataset as a networkx graph.

    Args:
        dataset: Dataset object providing ``graph`` (with ``node_feat`` and
            ``edge_index``), ``labels`` and ``get_idx_split()``.
        split: Key of the split returned by ``dataset.get_idx_split()``.
        label_subset: Optional sequence of labels to keep. When given, nodes
            with other labels are removed and the kept labels are renumbered
            to ``0..len(label_subset)-1`` following the order of the sequence.
            When ``None`` the labels are returned unchanged.
        large_cc: If true, keep only the largest connected component of the
            (undirected) graph.

    Returns:
        A tuple ``(g, y, paperids)``: the graph with nodes labeled ``0..m-1``,
        the labels and the paper ids, where row ``i`` of ``y`` and
        ``paperids`` belongs to node ``i`` of ``g``.
    """
    n_edges = dataset.graph['edge_index'].shape[1]
    data_dict = {
        'nodes': dataset.graph['node_feat'].astype(np.float32),
        # The dataset carries no edge/global features; use zero placeholders.
        'edges': np.zeros((n_edges, 1), dtype=np.float32),
        'senders': dataset.graph['edge_index'][0],
        'receivers': dataset.graph['edge_index'][1],
        'globals': np.zeros(1, dtype=np.float32),
    }
    # Paper ids, indexed by node id.
    adf = pd.read_csv(os.path.join('dataset/ogbn_arxiv/mapping', 'nodeidx2paperid.csv.gz'), compression="gzip")
    # Convert to networkx.
    g = gn.utils_np.data_dict_to_networkx(data_dict)

    # Subset by split indices.
    indices = dataset.get_idx_split()[split]
    g, order = _relabeled_subgraph(g, indices)
    y = dataset.labels[order]
    paperids = adf['paper id'].values[order]
    print(len(indices), y.shape, len(g.nodes))

    # Subset by labels (uses the argument, not the notebook-level `top_labels`).
    if label_subset is not None:
        top_indices = np.flatnonzero(np.isin(y.ravel(), label_subset))
        g, order = _relabeled_subgraph(g, top_indices)
        y, paperids = y[order], paperids[order]

    # Keep the largest connected component.
    if large_cc:
        gcc = list(sorted(nx.connected_components(g.to_undirected()), key=len, reverse=True)[0])
        print(len(gcc), np.max(gcc), len(y))
        g, order = _relabeled_subgraph(g, gcc)
        y, paperids = y[order], paperids[order]

    # Renumber labels to 0..n_labels-1; only meaningful with a label subset.
    if label_subset is not None:
        new_labels = {j: i for i, j in enumerate(label_subset)}
        y = np.array([new_labels[i] for i in y.ravel()]).reshape(-1, 1)
    return g, y, paperids

# Training subset: restricted to `top_labels` and to the largest connected component.
g_train, y_train, paperids_train = make_arxiv_subset(dataset, 'train', top_labels, True)
# Validation subset: restricted to `top_labels` only (large_cc=False).
g_valid, y_valid, paperids_valid = make_arxiv_subset(dataset, 'valid', top_labels, False)

# Sanity check: the three numbers printed on each line should be equal.
print(len(g_train.nodes), len(y_train), len(paperids_train))
print(len(g_valid.nodes), len(y_valid), len(paperids_valid))

# 2 A explorar datos! (EDA)

## 2a: Los data dicts y graphtuples


In [None]:
# Convert the training graph to the graph_nets data-dict representation.
data_dict = gn.utils_np.networkx_to_data_dict(g_train)
# Bare expression: shown as the cell output in the notebook.
data_dict

In [None]:
def nx_to_graph_stuple(g):
    """Convert a single networkx graph into a GraphsTuple.

    The graph is first turned into a data dict and then wrapped in a
    one-element list for the GraphsTuple conversion.
    """
    as_data_dict = gn.utils_np.networkx_to_data_dict(g)
    return gn.utils_tf.data_dicts_to_graphs_tuple([as_data_dict])

# GraphsTuple versions of the training and validation graphs (model inputs).
x_train = nx_to_graph_stuple(g_train)
x_valid = nx_to_graph_stuple(g_valid)
# Bare expression: shown as the cell output in the notebook.
x_train

In [None]:
# Flatten the label arrays to one dimension.
y_train, y_valid = y_train.ravel(), y_valid.ravel()
print(y_train.shape, y_valid.shape)

## 2b explorando el espacio de nodos
Visualizaremos la informacion en los nodos via umap

Ojo, toma un poco de tiempo

In [None]:
import sklearn.pipeline
import sklearn.preprocessing

# Node feature matrix of the training graph.
node_info = data_dict['nodes']
print(node_info.shape)

# Standardize the features, then embed them with UMAP (default settings).
pipe = sklearn.pipeline.Pipeline(steps=[
    ('scaler', sklearn.preprocessing.StandardScaler()),
    ('dim_reduce', umap.UMAP()),
])
node_umap = pipe.fit_transform(node_info)
print(node_umap.shape)

In [None]:
# Show the color legend, then scatter the UMAP embedding of the training nodes
# with each point colored by its label.
plot_color_legend()
plt.scatter(node_umap[:,0], node_umap[:, 1],
            c=labels_to_nodecolors(y_train),
            s=1, alpha=0.5)
plt.show()

### Interactivo

In [None]:
# Random sample of at most 5000 embedded nodes for the interactive chart.
indices = np.random.permutation(len(node_umap))[:5000]

vis_df = pd.DataFrame({
    'UMAP1': node_umap[indices, 0],
    'UMAP2': node_umap[indices, 1],
    'label': y_train[indices].ravel(),
    'id': paperids_train[indices],
})
# paper_df only keeps papers without missing fields, so some sampled ids may be
# absent from its index. `.loc` with a missing label raises KeyError; `reindex`
# yields NaN for those titles instead.
vis_df['title'] = paper_df['title'].reindex(vis_df['id']).tolist()
vis_df

In [None]:
# Interactive scatter of the sampled embedding; the tooltip shows the id,
# label and title columns of the hovered point.
alt.Chart(vis_df).mark_circle(size=10).encode(
    x='UMAP1:Q',
    y='UMAP2:Q',
    color='label:N',
    tooltip=['id', 'label', 'title']
).interactive()

## 2c: Vamos a crear mini-batches de grafos

Para una version mas avanzada checa: https://arxiv.org/abs/2006.04311

In [None]:
def get_batch(x, y, center_node, radius=3, batch_size=128):
    """Sub-sample a graph around one node.

    Args:
        x: GraphsTuple; only its first graph is used.
        y: Per-node label array indexed by node id of that graph.
        center_node: Node around which the ego graph is extracted.
        radius: Maximum distance from ``center_node`` to include.
        batch_size: Maximum number of nodes kept from the ego graph.

    Returns:
        A tuple ``(g_batch, y_batch)``: the undirected sub-graph with nodes
        relabeled ``0..m-1`` and the flattened labels, where ``y_batch[i]``
        belongs to node ``i`` of ``g_batch``.
    """
    nx_graph = gn.utils_np.graphs_tuple_to_networkxs(x)[0].to_undirected()
    sub_graph = nx.generators.ego_graph(nx_graph, n=center_node, radius=radius)
    batch_view = sub_graph.subgraph(list(sub_graph.nodes)[:batch_size])
    # The relabeling below numbers nodes in the view's own iteration order,
    # which can differ from the order of the truncated list; read the order
    # back from the view so labels and nodes stay aligned.
    node_indices = list(batch_view.nodes)
    g_batch = nx.relabel.convert_node_labels_to_integers(batch_view)
    y_batch = y[node_indices].ravel()
    return g_batch, y_batch

In [None]:
# Extract the radius-2 neighborhood of node 0 and draw it with nodes colored
# by label.
g_batch, y_batch = get_batch(x_train, y_train, center_node=0, radius=2)
plot_color_legend()
pos = nx.kamada_kawai_layout(g_batch)
nx.draw(g_batch, pos, node_size=60, node_color=labels_to_nodecolors(y_batch))

In [None]:
# GraphsTuple version of the sampled batch, used below to build the models.
x_batch = nx_to_graph_stuple(g_batch)
x_batch

# 3 Nuestro primer GNN

In [None]:
def get_num_parameters(model: snt.Module, trainable: bool = True) -> int:
  """Count the scalar entries of a model's variables.

  Args:
    model: Object exposing ``trainable_variables`` and ``variables``.
    trainable: If true count only the trainable variables, otherwise all.

  Returns:
    The sum over the selected variables of the product of their shapes.
  """
  if trainable:
    variables = model.trainable_variables
  else:
    variables = model.variables
  sizes = [np.prod(v.shape) for v in variables]
  return int(np.sum(sizes))


def print_model(model: snt.Module):
  """Print a summary of a model: class, name, variables and parameter counts."""
  header = f'{model.__class__.__name__} : {model.name}\n'
  print(header)
  print(snt.format_variables(model.variables))
  trainable_params = get_num_parameters(model, True)
  n_params = get_num_parameters(model, False)
  print(f'\nParams: {trainable_params} trainable out of {n_params}')

## 3a - Bloque de transformacion

In [None]:
def make_mlp_model(latent_size=32, n_layers=2, add_head=0):
    """Build an MLP followed by a LayerNorm and an optional linear head.

    Args:
        latent_size: Width of every MLP layer.
        n_layers: Number of MLP layers.
        add_head: Output size of the final linear layer; no linear layer is
            appended unless this is greater than zero.

    Returns:
        A ``snt.Sequential`` chaining the created modules.
    """
    mlp = snt.nets.MLP([latent_size] * n_layers, activate_final=True)
    norm = snt.LayerNorm(axis=-1, create_offset=True, create_scale=True)
    head = [snt.Linear(add_head)] if add_head > 0 else []
    return snt.Sequential([mlp, norm] + head)
    
# Build a block without a linear head and call it once on the batch's node features.
# NOTE(review): the call presumably creates the variables so that print_model
# has something to list -- confirm against the Sonnet docs.
bloque = make_mlp_model(32, 2, 0)
bloque(x_batch.nodes)
print_model(bloque)

## 3b - Un GNN independiente

In [None]:
# GraphIndependent model whose node function is an MLP block with an
# N_LABELS-wide linear head; it is called once on a small batch before printing.
gnn = gn.modules.GraphIndependent(node_model_fn=lambda: make_mlp_model(32, 2, N_LABELS))
x_batch = nx_to_graph_stuple(g_batch)
out = gnn(x_batch)
print_model(gnn)

## 3c - ¡A entrenar!

In [None]:
# Number of iterations of the training loop below.
NUM_ITER = 100
optimizer = tf.optimizers.Adam(3e-4)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
# from_logits=True: the loss is given the raw node outputs, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
@tf.function(experimental_relax_shapes=True)
def forward_pass(x):
    """Run the current ``gnn`` on a graph and return its node outputs."""
    return gnn(x).nodes

def logits_to_stats(y_true, node_logits):
    """Compute the loss and accuracy of one set of predictions.

    Args:
        y_true: Integer class label per node.
        node_logits: Unnormalized class scores per node.

    Returns:
        A tuple ``(loss, accuracy)`` of numpy scalars describing only the
        inputs of this call.
    """
    loss = loss_fn(y_true, node_logits)
    probs = tf.nn.softmax(node_logits)
    # A stateful Keras metric object accumulates over every call, which would
    # blend train and validation predictions and all previous iterations into
    # one running average. The functional form is stateless, so each call
    # reports the accuracy of its own inputs.
    matches = tf.keras.metrics.sparse_categorical_accuracy(y_true, probs)
    acc = tf.reduce_mean(matches).numpy()
    return loss.numpy(), acc

In [None]:
# Full-graph training: every iteration runs the whole training graph through
# the model, applies one optimizer step and records train/validation stats.
pbar = tqdm.tqdm(range(NUM_ITER))
stats = defaultdict(list)

for i in pbar:
    # Record the forward pass and loss so gradients can be taken.
    with tf.GradientTape() as tape:
      node_logits = forward_pass(x_train)
      loss = loss_fn(y_train, node_logits)
    grads = tape.gradient(loss, gnn.trainable_variables)
    optimizer.apply_gradients(zip(grads, gnn.trainable_variables))
    # Train statistics (computed from the logits obtained before this step's update).
    train_loss, train_acc = logits_to_stats(y_train, node_logits)
    stats['train_loss'].append(train_loss)
    stats['train_acc'].append(train_acc)
    # Validation statistics.
    node_logits = forward_pass(x_valid)
    valid_loss, valid_acc = logits_to_stats(y_valid, node_logits)
    stats['valid_loss'].append(valid_loss)
    stats['valid_acc'].append(valid_acc)
    # Update progress bar with the latest value of every tracked statistic.
    pbar.set_postfix({key:values[-1] for key, values in stats.items()})
    

In [None]:
# Loss curves. The training loop stores validation results under 'valid_*';
# asking the defaultdict for 'test_*' silently plots an empty series.
for key in ['train_loss', 'valid_loss']:
    plt.plot(stats[key], label=key)
plt.yscale('log')
plt.legend()
plt.show()

# Accuracy curves.
for key in ['train_acc', 'valid_acc']:
    plt.plot(stats[key], label=key)
plt.legend()
plt.show()

# 4 GNN más avanzados: GraphNets

In [None]:
# Output head: applies an MLP block with an N_LABELS-wide linear layer to the nodes.
cabeza = gn.modules.GraphIndependent(
    node_model_fn=lambda: make_mlp_model(32, 2, N_LABELS))

# Three GraphNetwork layers, each with its own edge, node and global MLP blocks.
gnn_layers = [gn.modules.GraphNetwork(
    edge_model_fn=lambda: make_mlp_model(32, 2),
    node_model_fn=lambda: make_mlp_model(32, 2),
    global_model_fn=lambda: make_mlp_model(32, 2)) for i in range(3)]

# Chain the layers and the head; this rebinds `gnn`, replacing the earlier model.
gnn = snt.Sequential( gnn_layers + [cabeza])
x_batch = nx_to_graph_stuple(g_batch)
out = gnn(x_batch)
print_model(gnn)

In [None]:
# Fresh optimizer, metric and loss objects for training the new model.
NUM_ITER = 100
optimizer = tf.optimizers.Adam(3e-4)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
# from_logits=True: the loss is given the raw node outputs, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
@tf.function(experimental_relax_shapes=True)
def forward_pass(x):
    """Run the current ``gnn`` on a graph and return its node outputs."""
    return gnn(x).nodes

def logits_to_stats(y_true, node_logits):
    """Compute the loss and accuracy of one set of predictions.

    Args:
        y_true: Integer class label per node.
        node_logits: Unnormalized class scores per node.

    Returns:
        A tuple ``(loss, accuracy)`` of numpy scalars describing only the
        inputs of this call.
    """
    loss = loss_fn(y_true, node_logits)
    probs = tf.nn.softmax(node_logits)
    # A stateful Keras metric object accumulates over every call, which would
    # blend train and validation predictions and all previous iterations into
    # one running average. The functional form is stateless, so each call
    # reports the accuracy of its own inputs.
    matches = tf.keras.metrics.sparse_categorical_accuracy(y_true, probs)
    acc = tf.reduce_mean(matches).numpy()
    return loss.numpy(), acc

In [None]:
# Full-graph training of the GraphNets model: every iteration runs the whole
# training graph, applies one optimizer step and records train/validation stats.
pbar = tqdm.tqdm(range(NUM_ITER))
stats = defaultdict(list)

for i in pbar:
    # Record the forward pass and loss so gradients can be taken.
    with tf.GradientTape() as tape:
      node_logits = forward_pass(x_train)
      loss = loss_fn(y_train, node_logits)
    grads = tape.gradient(loss, gnn.trainable_variables)
    # Variables whose gradient is None are skipped instead of being passed to
    # the optimizer.
    optimizer.apply_gradients((grad, var) 
    for (grad, var) in zip(grads, gnn.trainable_variables) 
    if grad is not None)
    # Train statistics (computed from the logits obtained before this step's update).
    train_loss, train_acc = logits_to_stats(y_train, node_logits)
    stats['train_loss'].append(train_loss)
    stats['train_acc'].append(train_acc)
    # Validation statistics.
    node_logits = forward_pass(x_valid)
    valid_loss, valid_acc = logits_to_stats(y_valid, node_logits)
    stats['valid_loss'].append(valid_loss)
    stats['valid_acc'].append(valid_acc)
    # Update progress bar with the latest value of every tracked statistic.
    pbar.set_postfix({key:values[-1] for key, values in stats.items()})

In [None]:
# Loss curves. The training loop stores validation results under 'valid_*';
# asking the defaultdict for 'test_*' silently plots an empty series.
for key in ['train_loss', 'valid_loss']:
    plt.plot(stats[key], label=key)
plt.yscale('log')
plt.legend()
plt.show()

# Accuracy curves.
for key in ['train_acc', 'valid_acc']:
    plt.plot(stats[key], label=key)
plt.legend()
plt.show()