In [1]:
%load_ext autoreload
%autoreload 2

In [74]:
import os
# Step up one directory at a time until the working-directory path no longer
# contains the substring 'notebooks'.
# NOTE(review): presumably this lands on the repository root so project-relative
# paths resolve -- confirm against the repo layout.
while os.getcwd().find('notebooks') != -1:
    os.chdir('..')

import pandas as pd
from IPython.display import Markdown, display, Latex

In [3]:
from src.data.gamma import structural_arxiv, structural_citeseer, structural_cora, structural_pubmed



In [111]:
# Accumulators filled in by the cells below; each one is keyed by dataset name.

# Graph-level descriptors.
total_nodes, total_edges, n_classes, n_features = {}, {}, {}, {}
homophily = {}

# Node counts per split.
train_nodes, valid_nodes, test_nodes = {}, {}, {}

# Edge counts per split.
train_edges, valid_edges, test_edges = {}, {}, {}

## Arxiv

In [112]:
# Key under which the arxiv statistics are stored in the per-dataset dicts.
dataset_name = "ogbn-arxiv"

In [113]:
# Load the arxiv dataset, then obtain the graph object together with its
# train / validation / test edge sets.
dataset = structural_arxiv.load_dataset()
(
    data,
    edges_train,
    edges_val,
    edges_test,
) = structural_arxiv.get_train_val_test_edges(dataset)

In [114]:
# Share of edge_index columns whose two endpoints carry the same label.
(
    ((data.y[data.edge_index[0]] == data.y[data.edge_index[1]]) * 1.0)
    .mean()
    .item()
)

0.6550830602645874

In [115]:
# Record the arxiv statistics under `dataset_name` in the per-dataset dicts.
n_classes[dataset_name] = data.y.unique().shape[0]
n_features[dataset_name] = data.x.shape[1]

total_nodes[dataset_name] = data.num_nodes
total_edges[dataset_name] = data.num_edges
# Share of edge_index columns whose two endpoints carry the same label.
homophily[dataset_name] = (
    ((data.y[data.edge_index[0]] == data.y[data.edge_index[1]]) * 1.0)
    .mean()
    .item()
)

# Fetch the node split once and reuse it, instead of calling
# `get_idx_split()` separately for every partition.
split_idx = dataset.get_idx_split()
train_nodes[dataset_name] = split_idx["train"].shape[0]
valid_nodes[dataset_name] = split_idx["valid"].shape[0]
test_nodes[dataset_name] = split_idx["test"].shape[0]

# The second dimension of each edge set is used as its edge count.
train_edges[dataset_name] = edges_train.shape[1]
valid_edges[dataset_name] = edges_val.shape[1]
test_edges[dataset_name] = edges_test.shape[1]

In [116]:
# Report the arxiv node and edge split sizes, one line each.
print(
    f"Arxiv dataset has {train_nodes[dataset_name]} train nodes, "
    f"{valid_nodes[dataset_name]} valid nodes and "
    f"{test_nodes[dataset_name]} test nodes.",
    f"Arxiv dataset has {train_edges[dataset_name]} train edges, "
    f"{valid_edges[dataset_name]} valid edges and "
    f"{test_edges[dataset_name]} test edges.",
    sep="\n",
)

Arxiv dataset has 90941 train nodes, 29799 valid nodes and 48603 test nodes.
Arxiv dataset has 374839 train edges, 247627 valid edges and 543777 test edges.


## Cora, Citeseer, Pubmed

In [117]:
# Display name -> loader module for the three smaller citation datasets.
dataset_name_module_dict = dict(
    Cora=structural_cora,
    Citeseer=structural_citeseer,
    Pubmed=structural_pubmed,
)

# Fill the per-dataset dicts with the same statistics gathered for arxiv.
for dataset_name in dataset_name_module_dict:
    module = dataset_name_module_dict[dataset_name]
    dataset = module.load_dataset()
    splits = module.get_train_val_test_edges(dataset)
    data, edges_train, edges_val, edges_test = splits

    # Graph-level descriptors.
    total_nodes[dataset_name] = data.num_nodes
    total_edges[dataset_name] = data.num_edges
    n_features[dataset_name] = data.x.shape[1]
    n_classes[dataset_name] = len(data.y.unique())

    # Share of edge_index columns whose two endpoints carry the same label.
    src, dst = data.edge_index[0], data.edge_index[1]
    same_label = data.y[src] == data.y[dst]
    homophily[dataset_name] = (same_label * 1.0).mean().item()

    # Node split sizes, read from the masks stored on the graph object.
    train_nodes[dataset_name] = data.train_mask.sum().item()
    valid_nodes[dataset_name] = data.val_mask.sum().item()
    test_nodes[dataset_name] = data.test_mask.sum().item()

    # Edge split sizes: second dimension of each edge set.
    train_edges[dataset_name] = edges_train.shape[1]
    valid_edges[dataset_name] = edges_val.shape[1]
    test_edges[dataset_name] = edges_test.shape[1]

In [118]:
# Peek at the feature matrix of whichever dataset `data` currently refers to
# (the last one loaded by the loop above when cells are run in order).
data.x

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1046, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0194, 0.0080,  ..., 0.0000, 0.0000, 0.0000],
        [0.1078, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0266, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [119]:
# Number of distinct labels in whichever dataset `data` currently refers to.
len(data.y.unique())

3

## Summary

In [123]:
def _format_count(stat_by_dataset):
    """Return per-dataset integer counts as comma-grouped strings (e.g. "1,234").

    Args:
        stat_by_dataset: mapping from dataset name to a numeric count.

    Returns:
        A pandas Series indexed by dataset name holding the formatted strings.
    """
    return pd.Series(stat_by_dataset).astype(int).map("{:,}".format)


# One row per dataset, one column per statistic; the column headers are LaTeX
# math so they can be emitted into a table unchanged.
summary_df = pd.concat(
    [
        pd.Series(n_features).astype(int).rename("$|X_i|$"),
        # Raw string: "\{" is not a valid escape sequence in a regular literal
        # and triggers a warning on recent Python versions.
        pd.Series(n_classes).astype(int).rename(r"$|\{Y\}|$"),
        pd.Series(homophily).round(3).rename("$h$"),
        _format_count(total_nodes).rename("$|V|$"),
        _format_count(train_nodes).rename("$|V_{Train}|$"),
        _format_count(valid_nodes).rename("$|V_{Validation}|$"),
        _format_count(test_nodes).rename("$|V_{Test}|$"),
        _format_count(total_edges).rename("$|E|$"),
        _format_count(train_edges).rename("$|E_{Train}|$"),
        _format_count(valid_edges).rename("$|E_{Validation}|$"),
        _format_count(test_edges).rename("$|E_{Test}|$"),
    ],
    axis=1,
)

summary_df

Unnamed: 0,$|X_i|$,$|\{Y\}|$,$h$,$|V|$,$|V_{Train}|$,$|V_{Validation}|$,$|V_{Test}|$,$|E|$,$|E_{Train}|$,$|E_{Validation}|$,$|E_{Test}|$
ogbn-arxiv,128,40,0.655,169343,90941,29799,48603,1166243,374839,247627,543777
Cora,1433,7,0.81,2708,1208,500,1000,10556,2308,2130,6118
Citeseer,3703,6,0.736,3327,1827,500,1000,9104,2642,1712,4750
Pubmed,500,3,0.802,19717,18217,500,1000,88648,75472,4140,9036


In [107]:
# The Styler formats floats with six decimals by default, regardless of the
# earlier `.round(3)`; pin the precision so $h$ is emitted with three decimals.
print(summary_df.style.format(precision=3).to_latex())

\begin{tabular}{lrrllllllll}
 & $|\{Y\}|$ & $h$ & $|V|$ & $|V_{Train}|$ & $|V_{Validation}|$ & $|V_{Test}|$ & $|E|$ & $|E_{Train}|$ & $|E_{Validation}|$ & $|E_{Test}|$ \\
ogbn-arxiv & 40 & 0.655000 & 169,343 & 90,941 & 29,799 & 48,603 & 1,166,243 & 374,839 & 247,627 & 543,777 \\
Cora & 7 & 0.810000 & 2,708 & 1,208 & 500 & 1,000 & 10,556 & 2,308 & 2,130 & 6,118 \\
Citeseer & 6 & 0.736000 & 3,327 & 1,827 & 500 & 1,000 & 9,104 & 2,642 & 1,712 & 4,750 \\
Pubmed & 3 & 0.802000 & 19,717 & 18,217 & 500 & 1,000 & 88,648 & 75,472 & 4,140 & 9,036 \\
\end{tabular}



\begin{table}
    \resizebox{\columnwidth}{!}{%
        \centering
        \begin{tabular}{lrrrrrrrrrr}
            \toprule
                                &           &       & \multicolumn{4}{c}{$|V|$}                         & \multicolumn{4}{c}{$|E|$} \\
                                                        \cmidrule(lr){4-7}                                  \cmidrule(lr){8-11}
            Dataset             & $|\{Y\}|$ & $h$   & Total     & Train     & Validation    & Test      & Total     & Train     & Validation    & Test \\
            \midrule
            \texttt{ogbn-arxiv} & 40        & 0.655 & 169,343   & 90,941    & 29,799        & 48,603    & 1,166,243 & 374,839   & 247,627       & 543,777 \\
            Cora                & 7         & 0.810 & 2,708     & 1,208     & 500           & 1,000     & 10,556    & 2,308     & 2,130         & 6,118 \\
            Citeseer            & 6         & 0.736 & 3,327     & 1,827     & 500           & 1,000     & 9,104     & 2,642     & 1,712         & 4,750 \\
            Pubmed              & 3         & 0.802 & 19,717    & 18,217    & 500           & 1,000     & 88,648    & 75,472    & 4,140         & 9,036 \\
            \bottomrule
        \end{tabular}%
    }
    \caption{Descriptive statistics of the datasets used.}
\end{table}