In [11]:
import os
import re
import json
import random
import string
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt

import tensorflow as tf
print(tf.__version__)
import tensorflow_io as tfio
print(tfio.__version__)

from tensorflow.keras import layers
from tensorflow.keras import models

2.10.0
0.27.0


In [19]:
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

random_id = id_generator(size=8)
print('Experiment Id: ', random_id)


configs = Namespace(
    num_frames = 32,
    batch_size = 128,
    epochs = 60,
    resizing_interpolation = "nearest",
    learning_rate = 1e-3,
    num_steps = 1.0,
    experiment_id = random_id,
    num_jcd_features = 3321, # ((82*82)-82)/2
    num_classes = 250,
    filters = 256,
)


LIP = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]

RIGHT_EYE = [
    246, 161, 160, 159, 158, 157, 173,
    33, 7, 163, 144, 145, 153, 154, 155, 133,
    247, 30, 29, 27, 28, 56, 190,
    130, 25, 110, 24, 23, 22, 26, 112, 243,
    113, 225, 224, 223, 222, 221, 189,
    226, 31, 228, 229, 230, 231, 232, 233, 244,
    143, 111, 117, 118, 119, 120, 121, 128, 245,
]

LEFT_EYE = [
    466, 387, 386, 385, 384, 398,
    263, 249, 390, 373, 374, 380, 381, 382, 362,
    467, 260, 259, 257, 258, 286, 414,
    359, 255, 339, 254, 253, 252, 256, 341, 463,
    342, 445, 444, 443, 442, 441, 413,
    446, 261, 448, 449, 450, 451, 452, 453, 464,
    372, 340, 346, 347, 348, 349, 350, 357, 465,
]

LEFT_HAND = [
    468, 469, 470, 471, 472, 473, 474, 475,
    476, 477, 478, 479, 480, 481, 482, 483,
    484, 485, 486, 487, 488
]

RIGHT_HAND = [
    522, 523, 524, 525, 526, 527, 528, 529,
    530, 531, 532, 533, 534, 535, 536, 537,
    538, 539, 540, 541, 542
]

Experiment Id:  Q6UNKCGK


In [13]:
data_path = "../data/tfrecords"


def natural_keys(text):
    ""
    def atoi(text):
        return int(text) if text.isdigit() else text
    
    return [atoi(c) for c in re.split(r'(\d+)', text)]

tfrecords = sorted(glob(f"{data_path}/*.tfrec"), key=natural_keys)

In [17]:
train_tfrecords, valid_tfrecords = tfrecords[:19], tfrecords[19:]

def parse_sequence(serialized_sequence):
    return tf.io.parse_tensor(
        serialized_sequence,
        out_type=tf.float32,
    )


def parse_tfrecord_fn(example):
    feature_description = {
        "n_frames": tf.io.FixedLenFeature([], tf.float32),
        "frames": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    
    return tf.io.parse_single_example(example, feature_description)


@tf.function
def preprocess_frames(frames):
    """
    In this preprocessing function:
    - Fill NaN values to 0.
    - Use `tf.image.resize` to interpolate.
    """
    frames = tf.where(tf.math.is_nan(frames), 0.0, frames)

    frames = tf.image.resize(
        frames, (configs.num_frames, 543), method=configs.resizing_interpolation
    )

    return frames


def parse_data(example):
    # Parse Frames
    n_frames = example["n_frames"]
    frames = tf.reshape(parse_sequence(example["frames"]), shape=(n_frames, 543, 3))
#     frames = preprocess_frames(frames)
    
    # Parse Labels
    label = tf.one_hot(example["label"], depth=250)

    return frames, label


AUTOTUNE = tf.data.AUTOTUNE

train_ds = tf.data.TFRecordDataset(train_tfrecords)
valid_ds = tf.data.TFRecordDataset(valid_tfrecords)

trainloader = (
    train_ds
    .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    .shuffle(1024)
    .map(parse_data, num_parallel_calls=AUTOTUNE)
    .batch(1)
    .prefetch(AUTOTUNE)
)

validloader = (
    valid_ds
    .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    .map(parse_data, num_parallel_calls=AUTOTUNE)
    .batch(128)
    .prefetch(AUTOTUNE)
)

In [18]:
samples, _ = next(iter(trainloader))
sample = samples[0]
sample

<tf.Tensor: shape=(10, 543, 3), dtype=float32, numpy=
array([[[ 0.5411273 ,  0.5323652 , -0.04391564],
        [ 0.5405812 ,  0.49619186, -0.05565592],
        [ 0.539276  ,  0.5099292 , -0.03568446],
        ...,
        [ 0.28491077,  0.57865244, -0.13284244],
        [ 0.28494364,  0.5564076 , -0.15725341],
        [ 0.28038225,  0.53750515, -0.1695216 ]],

       [[ 0.5423614 ,  0.5348249 , -0.04183666],
        [ 0.54156756,  0.4996673 , -0.05482912],
        [ 0.54018104,  0.512451  , -0.03478922],
        ...,
        [        nan,         nan,         nan],
        [        nan,         nan,         nan],
        [        nan,         nan,         nan]],

       [[ 0.54324853,  0.53755105, -0.04118378],
        [ 0.5415171 ,  0.502253  , -0.05495819],
        [ 0.5407671 ,  0.5145851 , -0.03427748],
        ...,
        [ 0.23306374,  0.6457217 , -0.13134223],
        [ 0.22389913,  0.67878467, -0.11890843],
        [ 0.2213714 ,  0.7007487 , -0.09989434]],

       ...,

      

> There are N (10) frames with 543 joints (nodes) and 3 features.

> Some of the joints are missing in some frames - right hand and left hand.


Let us select the parts of interest - lips, right hand and left hand (we can in future extend it to right eye and left eye.

In [139]:
lips = tf.gather(sample, indices=LIP, axis=1)
rh = tf.gather(sample, indices=RIGHT_HAND, axis=1)
lh = tf.gather(sample, indices=LEFT_HAND, axis=1)

lips.shape, rh.shape, lh.shape

(TensorShape([10, 40, 3]), TensorShape([10, 21, 3]), TensorShape([10, 21, 3]))

> Since lips, rh, and lh are not connected joints we can create adjacency matrix to represent local connections (graph representation).

In [140]:
rh_edges = {
    0: (1,5,17), 1: (0,2), 2: (1,3), 3: (2,4), 4: (3,),
    5: (0,6,9), 6: (5, 7), 7: (6, 8), 8: (7,),
    9: (5,10,13), 10: (9, 11), 11: (10, 12), 12: (11,),
    13: (9,14,17), 14: (13,15), 15: (14,16), 16: (15,),
    17: (0,13,18), 18: (17,19), 19: (18,20), 20: (19,)
}

lh_edges = {
    k+21: tuple(j+21 for j in v) for k, v in rh_edges.items()
}

In [173]:
edges = pd.DataFrame(columns=["target", "source"])

i = 0

for k, v in rh_edges.items():
    for j in v:
        edges.loc[i] = [k, j]
        i+=1
        
for k, v in lh_edges.items():
    for j in v:
        edges.loc[i] = [k, j]
        i+=1
        
edges

Unnamed: 0,target,source
0,0,1
1,0,5
2,0,17
3,1,0
4,1,2
...,...,...
79,39,38
80,39,40
81,40,39
82,40,41


In [174]:
edges = edges.to_numpy().T
edges = tf.convert_to_tensor(edges, dtype=tf.int32)
edges

<tf.Tensor: shape=(2, 84), dtype=int32, numpy=
array([[ 0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  5,  5,  5,  6,  6,  7,
         7,  8,  9,  9,  9, 10, 10, 11, 11, 12, 13, 13, 13, 14, 14, 15,
        15, 16, 17, 17, 17, 18, 18, 19, 19, 20, 21, 21, 21, 22, 22, 23,
        23, 24, 24, 25, 26, 26, 26, 27, 27, 28, 28, 29, 30, 30, 30, 31,
        31, 32, 32, 33, 34, 34, 34, 35, 35, 36, 36, 37, 38, 38, 38, 39,
        39, 40, 40, 41],
       [ 1,  5, 17,  0,  2,  1,  3,  2,  4,  3,  0,  6,  9,  5,  7,  6,
         8,  7,  5, 10, 13,  9, 11, 10, 12, 11,  9, 14, 17, 13, 15, 14,
        16, 15,  0, 13, 18, 17, 19, 18, 20, 19, 22, 26, 38, 21, 23, 22,
        24, 23, 25, 24, 21, 27, 30, 26, 28, 27, 29, 28, 26, 31, 34, 30,
        32, 31, 33, 32, 30, 35, 38, 34, 36, 35, 37, 36, 21, 34, 39, 38,
        40, 39, 41, 40]], dtype=int32)>

In [175]:
edge_weights = tf.ones(shape=edges.shape[1])
edge_weights

<tf.Tensor: shape=(84,), dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
      dtype=float32)>

In [147]:
node_features = tf.concat([rh[0], lh[0]], axis=0)
node_features

<tf.Tensor: shape=(42, 3), dtype=float32, numpy=
array([[ 2.1019016e-01,  6.8588758e-01,  9.8952035e-10],
       [ 2.8388086e-01,  7.0495671e-01, -3.3344377e-02],
       [ 3.6415419e-01,  6.9332105e-01, -5.4764863e-02],
       [ 4.2491096e-01,  6.7820156e-01, -7.2426505e-02],
       [ 4.6550936e-01,  6.6995484e-01, -9.0417974e-02],
       [ 3.9068621e-01,  6.1251056e-01, -7.0830353e-02],
       [ 4.2774025e-01,  6.3708228e-01, -1.1835943e-01],
       [ 4.2169711e-01,  6.7327696e-01, -1.4734654e-01],
       [ 4.1342527e-01,  6.9977760e-01, -1.6354330e-01],
       [ 3.5835549e-01,  5.9624678e-01, -7.3845766e-02],
       [ 4.0113658e-01,  6.2684619e-01, -1.3077991e-01],
       [ 3.9249700e-01,  6.7660195e-01, -1.5948087e-01],
       [ 3.8580406e-01,  7.1198761e-01, -1.7318243e-01],
       [ 3.1274539e-01,  5.9283233e-01, -7.9512097e-02],
       [ 3.5000527e-01,  6.1845076e-01, -1.4067358e-01],
       [ 3.5016501e-01,  6.7015713e-01, -1.6457407e-01],
       [ 3.4562823e-01,  7.0844293e-01,

In [130]:
node_features.shape

(42, 3)

In [148]:
graph_info = (node_features, edges, edge_weights)

In [190]:
from tensorflow.keras import layers
from tensorflow.keras import models



def create_ffn(hidden_units, dropout_rate, name=None):
    fnn_layers = []

    for units in hidden_units:
        fnn_layers.append(layers.BatchNormalization())
        fnn_layers.append(layers.Dropout(dropout_rate))
        fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))

    return keras.Sequential(fnn_layers, name=name)


class GraphConvLayer(layers.Layer):
    def __init__(
        self,
        hidden_units,
        dropout_rate=0.2,
        aggregation_type="mean",
        combination_type="concat",
        normalize=False,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize

        self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
        if self.combination_type == "gated":
            self.update_fn = layers.GRU(
                units=hidden_units,
                activation="tanh",
                recurrent_activation="sigmoid",
                dropout=dropout_rate,
                return_state=True,
                recurrent_dropout=dropout_rate,
            )
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate)

    def prepare(self, node_repesentations, weights=None):
        # node_repesentations shape is [num_edges, embedding_dim].
        messages = self.ffn_prepare(node_repesentations)
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages

    def aggregate(self, node_indices, neighbour_messages, node_repesentations):
        # node_indices shape is [num_edges].
        # neighbour_messages shape: [num_edges, representation_dim].
        # node_repesentations shape is [num_nodes, representation_dim].
        num_nodes = node_repesentations.shape[0]
        if self.aggregation_type == "sum":
            aggregated_message = tf.math.unsorted_segment_sum(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "mean":
            aggregated_message = tf.math.unsorted_segment_mean(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "max":
            aggregated_message = tf.math.unsorted_segment_max(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")

        return aggregated_message

    def update(self, node_repesentations, aggregated_messages):
        # node_repesentations shape is [num_nodes, representation_dim].
        # aggregated_messages shape is [num_nodes, representation_dim].
        if self.combination_type == "gru":
            # Create a sequence of two elements for the GRU layer.
            h = tf.stack([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            # Concatenate the node_repesentations and aggregated_messages.
            h = tf.concat([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            # Add node_repesentations and aggregated_messages.
            h = node_repesentations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")

        # Apply the processing function.
        node_embeddings = self.update_fn(h)
        if self.combination_type == "gru":
            node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]

        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings

    def call(self, inputs):
        """Process the inputs to produce the node_embeddings.

        inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
        Returns: node_embeddings of shape [num_nodes, representation_dim].
        """

        node_repesentations, edges, edge_weights = inputs
        # Get node_indices (source) and neighbour_indices (target) from edges.
        node_indices, neighbour_indices = edges[0], edges[1]
        # neighbour_repesentations shape is [num_edges, representation_dim].
        neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)

        # Prepare the messages of the neighbours.
        neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
        # Aggregate the neighbour messages.
        aggregated_messages = self.aggregate(
            node_indices, neighbour_messages, node_repesentations
        )
        # Update the node embedding with the neighbour messages.
        return self.update(node_repesentations, aggregated_messages)

In [191]:
hidden_units = [32, 32]
dropout_rate = 0.5

conv1 = GraphConvLayer(
    hidden_units,
    dropout_rate,
    name="graph_conv1",
)

In [193]:
# features = tf.keras.Input(shape=(10, 42, 3))
# edges = tf.keras.Input(shape=(10, 2, 84), dtype=tf.int32)
# e_weights = tf.keras.Input(shape=(10, 84))
# graph_inputs = (features, edges, e_weights)

# outputs = tf.keras.layers.TimeDistributed(conv1)((features, edges, e_weights))
# outputs.shape

In [194]:
!pip install stellargraph

Collecting stellargraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.2/435.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hCollecting gensim>=3.4.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<3.20,>=3.9.2
  Using cached protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)

Installing collected packages: smart-open, protobuf, gensim, google-auth-oauthlib, stellargraph
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
  Attempting uninstall: google-auth-oauthlib
    Found existing installation: google-auth-oauthlib 1.0.0
    Uninstalling google-auth-oauthlib-1.0.0:
      Successfully uninstalled google-auth-oauthlib-1.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.42.0 requires pyarrow<8.0.0,>=0.15.1, but you have pyarrow 10.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed gensim-4.2.0 google-auth-oauthlib-0.4.6 protobuf-3.19.6 smart-open-6.3.0 stellargraph-1.2.1


In [196]:
!pip install chardet

Collecting chardet
  Downloading chardet-5.1.0-py3-none-any.whl (199 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.1.0


In [197]:
from stellargraph import datasets
dataset = datasets.PROTEINS()
graphs, graph_labels = dataset.load()

In [205]:
from stellargraph.mapper import PaddedGraphGenerator
generator = PaddedGraphGenerator(graphs=graphs)

In [206]:
from stellargraph.layer import DeepGraphCNN
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.losses import binary_crossentropy
import tensorflow as tf
nrows = 35  # the number of rows for the output tensor
layer_dims = [32, 32, 32, 1]
# backbone part of the model (Encoder)
dgcnn_model = DeepGraphCNN(
    layer_sizes=layer_dims,
    activations=["tanh", "tanh", "tanh", "tanh"],
    k=nrows,
    bias=False,
    generator=generator,
)

In [209]:
# necessary for connecting the backbone to the head
gnn_inp, gnn_out = dgcnn_model.in_out_tensors()
# head part of the model (classification)
x_out = Conv1D(filters=16, kernel_size=sum(layer_dims), strides=sum(layer_dims))(gnn_out)
x_out = MaxPool1D(pool_size=2)(x_out)
x_out = Conv1D(filters=32, kernel_size=5, strides=1)(x_out)
x_out = Flatten()(x_out)
x_out = Dense(units=128, activation="relu")(x_out)
x_out = Dropout(rate=0.5)(x_out)
predictions = Dense(units=1, activation="sigmoid")(x_out)

Instructions for updating:
Use fn_output_signature instead


In [210]:
model = Model(inputs=gnn_inp, outputs=predictions)
model.compile(optimizer=Adam(lr=0.0001), loss=binary_crossentropy, metrics=["acc"])

  super().__init__(name, **kwargs)


In [211]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_28 (InputLayer)          [(None, None, 4)]    0           []                               
                                                                                                  
 dropout_36 (Dropout)           (None, None, 4)      0           ['input_28[0][0]']               
                                                                                                  
 input_30 (InputLayer)          [(None, None, None)  0           []                               
                                ]                                                                 
                                                                                                  
 graph_convolution (GraphConvol  (None, None, 32)    128         ['dropout_36[0][0]',         

In [187]:
graph_info

(<tf.Tensor: shape=(42, 3), dtype=float32, numpy=
 array([[ 2.1019016e-01,  6.8588758e-01,  9.8952035e-10],
        [ 2.8388086e-01,  7.0495671e-01, -3.3344377e-02],
        [ 3.6415419e-01,  6.9332105e-01, -5.4764863e-02],
        [ 4.2491096e-01,  6.7820156e-01, -7.2426505e-02],
        [ 4.6550936e-01,  6.6995484e-01, -9.0417974e-02],
        [ 3.9068621e-01,  6.1251056e-01, -7.0830353e-02],
        [ 4.2774025e-01,  6.3708228e-01, -1.1835943e-01],
        [ 4.2169711e-01,  6.7327696e-01, -1.4734654e-01],
        [ 4.1342527e-01,  6.9977760e-01, -1.6354330e-01],
        [ 3.5835549e-01,  5.9624678e-01, -7.3845766e-02],
        [ 4.0113658e-01,  6.2684619e-01, -1.3077991e-01],
        [ 3.9249700e-01,  6.7660195e-01, -1.5948087e-01],
        [ 3.8580406e-01,  7.1198761e-01, -1.7318243e-01],
        [ 3.1274539e-01,  5.9283233e-01, -7.9512097e-02],
        [ 3.5000527e-01,  6.1845076e-01, -1.4067358e-01],
        [ 3.5016501e-01,  6.7015713e-01, -1.6457407e-01],
        [ 3.4562823e-0

In [192]:
conv1(graph_info)

<tf.Tensor: shape=(42, 32), dtype=float32, numpy=
array([[-0.01795586, -0.0093726 ,  0.00034456, ..., -0.07718842,
        -0.01776318, -0.03412152],
       [-0.01923367, -0.01144987, -0.00065695, ..., -0.08074419,
        -0.01443034, -0.03598208],
       [-0.02181519, -0.0110498 , -0.00290867, ..., -0.08290777,
        -0.00989566, -0.03743376],
       ...,
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]], dtype=float32)>

In [171]:
model = models.Sequential()
model.add(layers.TimeDistributed(layers.Dense(8), input_shape=(10, 16)))

In [172]:
model.output_shape

(None, 10, 8)

In [29]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [49]:
zip_file = keras.utils.get_file(
    fname="cora.tgz",
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
    extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")

In [50]:
citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
print("Citations shape:", citations.shape)

Citations shape: (5429, 2)


In [51]:
citations

Unnamed: 0,target,source
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
...,...,...
5424,853116,19621
5425,853116,853155
5426,853118,1140289
5427,853155,853118


In [52]:
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"]
papers = pd.read_csv(
    os.path.join(data_dir, "cora.content"), sep="\t", header=None, names=column_names,
)
print("Papers shape:", papers.shape)

Papers shape: (2708, 1435)


In [53]:
papers

Unnamed: 0,paper_id,term_0,term_1,term_2,term_3,term_4,term_5,term_6,term_7,term_8,...,term_1424,term_1425,term_1426,term_1427,term_1428,term_1429,term_1430,term_1431,term_1432,subject
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based


In [54]:
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

In [55]:
paper_idx

{35: 0,
 40: 1,
 114: 2,
 117: 3,
 128: 4,
 130: 5,
 164: 6,
 288: 7,
 424: 8,
 434: 9,
 463: 10,
 504: 11,
 506: 12,
 887: 13,
 906: 14,
 910: 15,
 936: 16,
 940: 17,
 941: 18,
 943: 19,
 1026: 20,
 1033: 21,
 1034: 22,
 1035: 23,
 1213: 24,
 1237: 25,
 1246: 26,
 1272: 27,
 1365: 28,
 1385: 29,
 1481: 30,
 1688: 31,
 1694: 32,
 1717: 33,
 1786: 34,
 1817: 35,
 1919: 36,
 1949: 37,
 1951: 38,
 1952: 39,
 1953: 40,
 1955: 41,
 1956: 42,
 1959: 43,
 1997: 44,
 1999: 45,
 2354: 46,
 2440: 47,
 2653: 48,
 2654: 49,
 2658: 50,
 2663: 51,
 2665: 52,
 2695: 53,
 2696: 54,
 2698: 55,
 2702: 56,
 3084: 57,
 3085: 58,
 3095: 59,
 3097: 60,
 3101: 61,
 3112: 62,
 3187: 63,
 3191: 64,
 3192: 65,
 3217: 66,
 3218: 67,
 3220: 68,
 3222: 69,
 3223: 70,
 3229: 71,
 3231: 72,
 3232: 73,
 3233: 74,
 3235: 75,
 3236: 76,
 3237: 77,
 3240: 78,
 3243: 79,
 3828: 80,
 3932: 81,
 4274: 82,
 4329: 83,
 4330: 84,
 4335: 85,
 4553: 86,
 4584: 87,
 4637: 88,
 4649: 89,
 4660: 90,
 4804: 91,
 4878: 92,
 4983: 93

In [56]:
papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

In [57]:
papers

Unnamed: 0,paper_id,term_0,term_1,term_2,term_3,term_4,term_5,term_6,term_7,term_8,...,term_1424,term_1425,term_1426,term_1427,term_1428,term_1429,term_1430,term_1431,term_1432,subject
0,462,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2
1,1911,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5
2,2002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,248,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,519,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,2370,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2704,2371,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2705,2372,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2706,955,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

In [59]:
citations

Unnamed: 0,target,source
0,0,21
1,0,905
2,0,906
3,0,1909
4,0,1940
...,...,...
5424,1873,328
5425,1873,1876
5426,1874,2586
5427,1876,1874


In [61]:
feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

feature_names

{'term_998',
 'term_589',
 'term_1050',
 'term_408',
 'term_224',
 'term_576',
 'term_546',
 'term_232',
 'term_1039',
 'term_1400',
 'term_1154',
 'term_1149',
 'term_1277',
 'term_199',
 'term_312',
 'term_32',
 'term_101',
 'term_613',
 'term_722',
 'term_1259',
 'term_979',
 'term_946',
 'term_1209',
 'term_1282',
 'term_1074',
 'term_927',
 'term_142',
 'term_777',
 'term_251',
 'term_1306',
 'term_438',
 'term_1361',
 'term_790',
 'term_509',
 'term_84',
 'term_1015',
 'term_1401',
 'term_286',
 'term_496',
 'term_1276',
 'term_721',
 'term_587',
 'term_434',
 'term_1351',
 'term_874',
 'term_1117',
 'term_19',
 'term_596',
 'term_759',
 'term_122',
 'term_674',
 'term_1122',
 'term_369',
 'term_96',
 'term_928',
 'term_161',
 'term_1348',
 'term_1118',
 'term_162',
 'term_824',
 'term_1289',
 'term_700',
 'term_302',
 'term_378',
 'term_185',
 'term_1292',
 'term_913',
 'term_654',
 'term_644',
 'term_1130',
 'term_1224',
 'term_43',
 'term_661',
 'term_1402',
 'term_500',
 'ter

In [62]:
# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 5429)
Nodes shape: (2708, 1433)


In [63]:
edges

array([[  21,  905,  906, ..., 2586, 1874, 2707],
       [   0,    0,    0, ..., 1874, 1876, 1897]])

In [64]:
edge_weights

<tf.Tensor: shape=(5429,), dtype=float32, numpy=array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)>

In [65]:
node_features

<tf.Tensor: shape=(2708, 1433), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [94]:
citations[["source", "target"]].to_numpy().T

array([[  21,  905,  906, ..., 2586, 1874, 2707],
       [   0,    0,    0, ..., 1874, 1876, 1897]])