<a href="https://colab.research.google.com/github/bariscemb/saatUygulamas-/blob/main/GCN_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download the required packages and preprocessed Elliptic dataset from the given links.

## Note: We temporarily uploaded our nodes.csv data to GitHub as a Git LFS object, but we have since been notified that our LFS quota is depleted. If the data-download cell below fails, please download nodes.csv manually from https://drive.google.com/file/d/1xOxc6VN0qkLjqBES1SXDjX1Kku7MNxp3/view?usp=sharing. Thanks! 

In [None]:
# Install the PyTorch Geometric stack. torch-scatter and torch-sparse are resolved
# against the PyG wheel index built for torch 1.10.0 + CUDA 11.3.
# NOTE(review): torch-geometric is not version-pinned; the captured install log
# shows 2.0.2 being downloaded — consider pinning it for reproducibility.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu113.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 5.2 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu113.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.9 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Collecting torch-geometric
  Downloading torch_geometric-2.0.2.tar.gz (325 kB)
[K     |████████████████████████████████| 325 kB 5.4 MB/s 
Collecting rdflib
  Downloading rdflib-6.0.2-py3-none-any.whl (407 kB)
[K     |████████████████████████████████| 407 kB 38.1 MB/s 
Collecting 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'axes.facecolor':'dimgrey', 'grid.color':'lightgrey'})

import numpy as np
import pandas as pd
import networkx as nx
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch_scatter
from torch_geometric.data import Data
print(torch.__version__)

# # The PyG built-in GCNConv
# from torch_geometric.nn import GCNConv
from torch_geometric.nn.conv import MessagePassing
import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax, degree

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score
import scipy.sparse as scsp
from sklearn.cluster import KMeans
import copy

1.10.0+cu111


In [None]:
# Edge list (pairs of transaction ids), read directly from the GitHub raw endpoint.
url_data_e = (r'https://raw.githubusercontent.com/yuchenWYC/'
              r'Elliptic_dataset/master/edges.csv')
edges = pd.read_csv(url_data_e)

# Node table (id, class, timestamp and features), read from the GitHub media
# endpoint. This requires network access every time the cell is executed.
url_data_n = (r'https://media.githubusercontent.com/media/yuchenWYC/'
              r'Elliptic_dataset/master/nodes.csv')
nodes = pd.read_csv(url_data_n)

We can take a look at what the preprocessed dataset looks like.

In [None]:
nodes.iloc[0:2, 0:10]

Unnamed: 0,txId,class,timestamp,2,3,4,5,6,7,8
0,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558
1,232029206,0,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305


In [None]:
edges.head()

Unnamed: 0,txId1,txId2
0,232344069,27553029
1,3881097,232457116
2,232051089,232470704
3,230473487,7089694
4,231182296,14660781


# Time-Step Splitting Script



In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def time_step_split_helper(new_nodes, new_edges):
    """
    Split the graph and store node features, edges (represented by adjacency list),
    and labels separately by timestamp t (from the smallest to the largest
    timestamp found in `new_nodes`; 1 to 49 for the Elliptic dataset).

    Columns are addressed by position. In `new_nodes`: 0 = node id, 1 = label,
    2 = timestamp; columns 2 onwards form the feature matrix, so the timestamp
    is itself the first feature. In `new_edges`: 0 = source id, 1 = target id.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list

    Returns:
        features_t    A list of (|N_t|, d) feature matrices by timestamp
        edge_indices  A list of (2, |E_t|) adjacency list by timestamp; a time
                        step without edges yields an empty (2, 0) tensor
        labels_t      A list of (|N_t|) labels by timestamp

    Raises:
        KeyError      If an edge references a node id missing from `new_nodes`
    """
    features = torch.FloatTensor(new_nodes.iloc[:, 2:].to_numpy())
    times = torch.LongTensor(new_nodes.iloc[:, 2].to_numpy().reshape(-1))
    labels = torch.LongTensor(new_nodes.iloc[:, 1].to_numpy().reshape(-1))
    nodes_id = torch.LongTensor(new_nodes.iloc[:, 0].to_numpy().reshape(-1))

    # Nothing to split: min/max of an empty tensor would raise.
    if times.numel() == 0:
        return [], [], []

    min_t = int(times.min())
    max_t = int(times.max())

    # Construct nodes of the directed graph for each time step;
    # features by time step are stored in "features_t"; labels by
    # time step are stored in "labels_t".
    features_t = []
    labels_t = []

    # Map node_id -> (node_index_in_time_t_subgraph, time_t).
    id2idx = {}
    for t in range(min_t, max_t + 1):
        in_step = times == t
        features_t.append(features[in_step, :])
        labels_t.append(labels[in_step])
        for local_idx, node_id in enumerate(nodes_id[in_step].tolist()):
            id2idx[node_id] = (local_idx, t)

    # Construct adjacency lists of the directed graph (non-symmetric) for each
    # time step. Reading whole columns once avoids a slow per-row `.iloc` lookup.
    edge_idx_t = [[] for _ in range(min_t, max_t + 1)]
    src_ids = new_edges.iloc[:, 0].to_numpy().tolist()
    dst_ids = new_edges.iloc[:, 1].to_numpy().tolist()
    for src_id, dst_id in zip(src_ids, dst_ids):
        src_idx, src_t = id2idx[src_id]
        dst_idx, dst_t = id2idx[dst_id]
        if src_t != dst_t:
            # The local index of the target is only meaningful inside its own
            # subgraph, so an edge crossing time steps cannot be represented.
            continue
        # Offset by min_t (not a hard-coded 1) so any first time step works.
        edge_idx_t[src_t - min_t].append([src_idx, dst_idx])

    # Reshape explicitly so that an empty edge list becomes (2, 0), not (0,).
    edge_indices = [torch.LongTensor(pairs).reshape(len(pairs), 2).t()
                    for pairs in edge_idx_t]
    return features_t, edge_indices, labels_t

In [None]:
def time_step_split(new_nodes, new_edges, device, train_lt = 31, val_lt = 36, test_lt = 49):
    """
    Build the training, validation, and test sets by time step: every
    time-step subgraph becomes one separate input graph for the GCN model.

    The boundaries are used as half-open positions into the list of
    time-step subgraphs: [0, train_lt) is training, [train_lt, val_lt) is
    validation and [val_lt, test_lt) is test.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        device        Computing device
        train_lt      The last time step index of training set
        val_lt        The last time step index of validation set
        test_lt       The last time step index of test set

    Returns:
        data          A dictionary that stores training, validation, and test set,
                        each value is a list of Data object
        graph_info    A matrix with one row per time-step subgraph:
                      [position, num_of_nodes, num_of_edges, num_of_illicit_nodes]
                      where position is the 0-based index of the subgraph and
                      illicit nodes are those labelled 1
    """
    features_t, edge_indices, labels_t = time_step_split_helper(new_nodes, new_edges)

    # Summary statistics, one row per subgraph.
    num_steps = len(labels_t)
    graph_info = np.zeros((num_steps, 4), dtype = np.int64)
    for step in range(num_steps):
        node_count = features_t[step].shape[0]
        edge_count = edge_indices[step].shape[1]
        step_labels = labels_t[step]
        illicit_count = step_labels[step_labels == 1].shape[0]
        graph_info[step, :] = np.array([step, node_count, edge_count, illicit_count])

    def build_graphs(first, last):
        # One Data object per subgraph position in [first, last).
        graphs = []
        for step in np.arange(first, last):
            graph = Data(x = features_t[step], edge_index = edge_indices[step],
                         y = labels_t[step])
            graphs.append(graph.to(device))
        return graphs

    data = {}
    data['train'] = build_graphs(0, train_lt)
    data['val'] = build_graphs(train_lt, val_lt)
    data['test'] = build_graphs(val_lt, test_lt)

    return data, graph_info

In [None]:
# Build the per-time-step split with the default boundaries and print how many
# time-step graphs ended up in each of the three sets.
data, graph_info = time_step_split(nodes, edges, device)
for key in data:
  print(key, len(data[key]))

train 31
val 5
test 13


In [None]:
def time_group_split(new_nodes, new_edges, device, train_lt = 31, val_lt = 36, test_lt = 49):
    """
    Create and return the training, validation, and test set, splitted by specific
    time step intervals, where the combination of subgraphs within the time step
    interval is considered as an input of GCN model.

    Time steps 1..train_lt form the training graph, train_lt+1..val_lt the
    validation graph and val_lt+1..test_lt the test graph. Columns of the
    dataframes are addressed by position (nodes: 0 = id, 1 = label,
    2 = timestamp, 2.. = features; edges: 0 = source id, 1 = target id).

    Only edges whose two endpoints fall in the same set are kept, because a node
    index is only meaningful inside its own set.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        device        Computing device
        train_lt      The last time step index of training set
        val_lt        The last time step index of validation set
        test_lt       The last time step index of test set

    Returns:
        data          A dictionary that stores training, validation, and test set
                        each value is one Data object

    Raises:
        KeyError      If an edge references a node that is in none of the three
                        sets (unknown id, or a timestamp outside 1..test_lt)
    """
    features = torch.FloatTensor(new_nodes.iloc[:, 2:].to_numpy())
    times = torch.LongTensor(new_nodes.iloc[:, 2].to_numpy().reshape(-1))
    labels = torch.LongTensor(new_nodes.iloc[:, 1].to_numpy().reshape(-1))
    nodes_id = torch.LongTensor(new_nodes.iloc[:, 0].to_numpy().reshape(-1))

    data_names = {'train': np.arange(1, train_lt + 1),
                  'val': np.arange(train_lt + 1, val_lt + 1),
                  'test': np.arange(val_lt + 1, test_lt + 1)}

    # Construct nodes of the directed graph for specific time step intervals.
    # Rows are gathered time step by time step, so nodes inside one set are
    # ordered by time step first and by dataframe order second.
    id2idx = {}      # node id -> (index inside its set, set name)
    raw_data = {}
    for name, steps in data_names.items():
        row_chunks = [torch.nonzero(times == int(step)).reshape(-1) for step in steps]
        if row_chunks:
            rows = torch.cat(row_chunks, 0)
        else:
            # An empty interval (e.g. val_lt == train_lt) yields an empty set
            # instead of failing inside torch.cat.
            rows = torch.zeros(0, dtype = torch.long)
        raw_data[name] = {'features': features[rows], 'labels': labels[rows]}
        for local_idx, node_id in enumerate(nodes_id[rows].tolist()):
            id2idx[node_id] = (local_idx, name)

    # Construct adjacency lists of the directed graph (non-symmetric) for
    # specific time intervals. Adjacency lists are stored with key "edge_indices".
    edge_idx_set = {name: [] for name in data_names}
    src_ids = new_edges.iloc[:, 0].to_numpy().tolist()
    dst_ids = new_edges.iloc[:, 1].to_numpy().tolist()
    for src_id, dst_id in zip(src_ids, dst_ids):
        src_idx, src_set = id2idx[src_id]
        dst_idx, dst_set = id2idx[dst_id]
        if src_set != dst_set:
            # A cross-set edge would pair indices from two different graphs.
            continue
        edge_idx_set[src_set].append([src_idx, dst_idx])

    for name in data_names:
        pairs = edge_idx_set[name]
        # Explicit reshape keeps the (2, |E|) layout even when |E| == 0.
        raw_data[name]['edge_indices'] = torch.LongTensor(pairs).reshape(len(pairs), 2).t()

    # Construct the training, validation, test set by 'raw_data' and store
    # in a dictionary, 'data'.
    data = {}
    for name in data_names:
        data[name] = Data(x = raw_data[name]['features'],
                          edge_index = raw_data[name]['edge_indices'],
                          y = raw_data[name]['labels']).to(device)
    return data

In [None]:
# Build the three merged train / val / test graphs with the default time
# boundaries and display their sizes.
data2 = time_group_split(nodes, edges, device)
data2

{'test': Data(x=[13621, 166], edge_index=[2, 11576], y=[13621]),
 'train': Data(x=[27615, 166], edge_index=[2, 21045], y=[27615]),
 'val': Data(x=[5328, 166], edge_index=[2, 4003], y=[5328])}

# Random Splitting Script


In [None]:
def random_split_transd(new_nodes, new_edges, train_size, test_size, device, seed = 42):
    """
    Create and return the training, validation, and test set by randomly splitting
    the node indices to these three sets. Keep edge_index known for all sets.

    Columns of the dataframes are addressed by position (nodes: 0 = id,
    1 = label, 2.. = features; edges: 0 = source id, 1 = target id). Node indices
    are dataframe row positions.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        train_size    The node size proportion in training set
        test_size     The node size proportion in test set
        device        Computing device the returned Data object is moved to
        seed          Random seed for data splitting

    Returns:
        data          A Data object that stores node features, edge_index, and labels
        dict          A dictionary that stores training, validation, test set node indices

    Raises:
        KeyError      If an edge references a node id missing from `new_nodes`
    """
    features = torch.FloatTensor(new_nodes.iloc[:, 2:].to_numpy())
    labels = torch.LongTensor(new_nodes.iloc[:, 1].to_numpy().reshape(-1))

    # Create a dictionary that maps nodeId to index in the dataframe.
    node_ids = new_nodes.iloc[:, 0].to_numpy().tolist()
    id2idx = {node_id: row for row, node_id in enumerate(node_ids)}

    # Construct edge_index with same node indexing as in features and labels.
    # Whole columns are read once instead of one `.iloc` lookup per edge.
    src_rows = [id2idx[node_id] for node_id in new_edges.iloc[:, 0].to_numpy().tolist()]
    dst_rows = [id2idx[node_id] for node_id in new_edges.iloc[:, 1].to_numpy().tolist()]
    edge_idx = np.array([src_rows, dst_rows], dtype = np.int64).reshape(2, -1)
    edge_index = torch.LongTensor(edge_idx)

    # First cut off the training nodes, then divide the remainder into
    # validation and test. `seed` drives both cuts.
    train_index, test_index = train_test_split(np.arange(labels.shape[0]),
                                               test_size = 1 - train_size,
                                               random_state = seed)
    val_index, test_index = train_test_split(test_index,
                                             test_size = test_size / (1 - train_size),
                                             random_state = seed)

    # Store the whole graph on the device requested by the caller.
    data = Data(x = features, edge_index = edge_index, y = labels).to(device)

    return data, {'train': train_index, 'val': val_index, 'test': test_index}

In [None]:
# Derive the split proportions from the per-time-step node counts (column 1 of
# graph_info): rows 0-30 give the training share and rows 36 onwards the test
# share. These row ranges match the default train_lt = 31 / val_lt = 36 of the
# time-based splits — presumably so that all splits have comparable set sizes.
node_sum = nodes.shape[0]
train_node_size = np.sum(graph_info[0: 31, 1]) / node_sum
test_node_size = np.sum(graph_info[36:, 1]) / node_sum
data3, split_idx3 = random_split_transd(nodes, edges, train_size = train_node_size,
                                        test_size = test_node_size, device = device)
data3

Data(x=[46564, 166], edge_index=[2, 36624], y=[46564])

In [None]:
train_node_size, test_node_size

(0.5930547203848466, 0.29252212009277556)

In [None]:
def random_split_ind(new_nodes, new_edges, train_size, test_size, device, seed = 42):
    """
    Create and return the training, validation, and test set by randomly splitting
    the node indices to these three sets. Keep only the node-induced edges within
    each set.

    Columns of the dataframes are addressed by position (nodes: 0 = id,
    1 = label, 2.. = features; edges: 0 = source id, 1 = target id).

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        train_size    The node size proportion in training set
        test_size     The node size proportion in test set
        device        Computing device
        seed          Random seed for data splitting

    Returns:
        data          A dictionary that stores training, validation, and test set
                        each value is one Data object; a set without induced
                        edges gets an empty (2, 0) edge_index

    Raises:
        KeyError      If an edge references a node id missing from `new_nodes`
    """
    features = torch.FloatTensor(new_nodes.iloc[:, 2:].to_numpy())
    labels = torch.LongTensor(new_nodes.iloc[:, 1].to_numpy().reshape(-1))
    nodes_id = torch.LongTensor(new_nodes.iloc[:, 0].to_numpy().reshape(-1))

    # Create random splitting node indices. Ids and row positions are split
    # together so they stay aligned.
    nodes_id_train, nodes_id_test, train_idx, test_idx = \
        train_test_split(nodes_id, range(nodes_id.shape[0]), test_size = 1 - train_size,
                         random_state = seed)
    nodes_id_valid, nodes_id_test, val_idx, test_idx = \
        train_test_split(nodes_id_test, test_idx, test_size = test_size / (1 - train_size),
                         random_state = seed)
    features_set = {'train': features[train_idx], 'val': features[val_idx],
                    'test': features[test_idx]}
    labels_set = {'train': labels[train_idx], 'val': labels[val_idx],
                  'test': labels[test_idx]}

    # Map node id -> (index inside its set, set name).
    id2idx = {}
    for name, ids in (('train', nodes_id_train), ('val', nodes_id_valid),
                      ('test', nodes_id_test)):
        for local_idx, node_id in enumerate(ids):
            id2idx[int(node_id)] = (local_idx, name)

    # Find the induced edge indices: keep an edge only if both endpoints were
    # assigned to the same set.
    edge_index = {'train': [], 'val': [], 'test': []}
    src_ids = new_edges.iloc[:, 0].to_numpy().tolist()
    dst_ids = new_edges.iloc[:, 1].to_numpy().tolist()
    for src_id, dst_id in zip(src_ids, dst_ids):
        src_idx, src_set = id2idx[src_id]
        dst_idx, dst_set = id2idx[dst_id]
        if src_set == dst_set:
            edge_index[src_set].append([src_idx, dst_idx])

    data = {}
    for name in ['train', 'val', 'test']:
        pairs = edge_index[name]
        # Explicit reshape keeps the (2, |E|) layout even when |E| == 0.
        edge_index[name] = torch.LongTensor(pairs).reshape(len(pairs), 2).t()
        data[name] = Data(x = features_set[name], edge_index = edge_index[name],
                          y = labels_set[name]).to(device)

    return data

In [None]:
# Inductive random split using the same node proportions as the transductive
# random split; only edges with both endpoints in the same set are kept.
data4 = random_split_ind(nodes, edges, train_size = train_node_size,
                         test_size = test_node_size, device = device)
data4

{'test': Data(x=[13621, 166], edge_index=[2, 3205], y=[13621]),
 'train': Data(x=[27615, 166], edge_index=[2, 12937], y=[27615]),
 'val': Data(x=[5328, 166], edge_index=[2, 428], y=[5328])}

# Community Splitting Script


In [None]:
def laplacian(A, alpha = 0.1):
    """
    Returns the Laplacian matrix of the given adjacency matrix. For the directed
    acyclic graph (not connected) with adjacency matrix A, we define a modified
    Laplacian matrix as follows:
            A_tilde = (1 - alpha) * (A + A^T) + alpha * 11^T
            L = I - D_tilde^{-1/2} A_tilde D_tilde^{-1/2}
    where D_tilde is the diagonal matrix of the column sums of A_tilde.

    Args:
        A             Adjacency matrix of certain graph (scipy sparse or dense)
        alpha         Smoothing constant that prevents isolated nodes

    Returns:
        L             Modified Laplacian matrix of the adjacency matrix A, as a
                        csr-format sparse matrix
    """
    n = A.shape[0]
    sym = A + A.T
    sym = sym.toarray() if scsp.issparse(sym) else np.asarray(sym)
    sym = sym.astype(np.float64)

    # alpha * 11^T is the constant matrix alpha, so A_tilde is dense whenever
    # alpha != 0; working on a dense array avoids wrapping n^2 entries in CSR.
    A_tilde = (1 - alpha) * sym + alpha

    deg = A_tilde.sum(axis = 0)
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        deg_inv_sqrt = deg ** (-1/2)
    # A zero-degree node (only possible when alpha == 0) gets a zero scaling
    # factor rather than inf, which would otherwise turn into NaN entries.
    deg_inv_sqrt[~np.isfinite(deg_inv_sqrt)] = 0

    L = np.eye(n) - deg_inv_sqrt[:, None] * A_tilde * deg_inv_sqrt[None, :]
    return scsp.csr_matrix(L)

def adj_list_to_mtx(n, edge_index):
    """
    Create a csr-format adjacency matrix by the given adjacency list.
    Entry (i, j) is 1 when the list contains the edge i -> j (repeated edges
    still give 1) and 0 otherwise.

    Args:
        n             The number of nodes in the graph
        edge_index    The (2, |E|) adjacency list of the graph (array-like or
                        torch tensor)

    Returns:
        a csr-format adjacency matrix
    """
    # Torch tensors are converted explicitly so that tensors living on another
    # device or attached to a graph are handled as well.
    if hasattr(edge_index, 'detach'):
        edge_index = edge_index.detach().cpu().numpy()
    pairs = np.asarray(edge_index)

    if pairs.size == 0:
        return scsp.csr_matrix((n, n), dtype = np.float64)

    rows = pairs[0].astype(np.int64)
    cols = pairs[1].astype(np.int64)
    # Build straight from coordinates instead of filling a dense n x n array.
    mtx = scsp.coo_matrix((np.ones(rows.shape[0]), (rows, cols)), shape = (n, n)).tocsr()
    # The COO -> CSR conversion sums repeated edges; clamp them back to 1.
    mtx.data[:] = 1.0
    return mtx

def nearest_sum(arr, target):
    """ 
    Get a combination of numbers in the given array that sums nearest to 
    the target number without exceeding it (0/1 subset-sum dynamic programme).
    
    Args:
        arr      The given array of non-negative integers
        target   The target number (non-negative integer)

    Returns:
        resid           The residual between the summation and the target number
                          (equals target when no element fits at all)
        elt_idx_list    The indices of the subarray for summation, listed from
                          the highest index to the lowest

    Raises:
        ValueError      If `arr` contains a negative number
    """
    n = len(arr)
    target = int(target)

    # opt_arr[i, s] > 0  <=>  some non-empty subset of the first i elements
    # sums to s. Row i exceeds row i - 1 at s exactly when element i - 1 can be
    # the last element added to reach s; the backtracking below relies on that.
    opt_arr = np.zeros((n + 1, target + 1))
    for i in range(1, n + 1):
        value = int(arr[i - 1])
        if value < 0:
            raise ValueError("nearest_sum expects non-negative numbers")
        opt_arr[i, :] = opt_arr[i - 1, :]
        if value > target:
            # An element larger than the target can never be part of a solution.
            continue
        # Every sum s >= 1 reachable before can be extended to s + value.
        reachable = opt_arr[i - 1, 1: target - value + 1] > 0
        shifted = opt_arr[i, 1 + value: target + 1]   # view into row i
        shifted[reachable] += 1
        # The element on its own.
        opt_arr[i, value] += 1

    # Largest reachable sum that does not exceed the target (0 if none).
    target_sum = target
    while target_sum > 0 and opt_arr[n, target_sum] == 0:
        target_sum -= 1
    resid = target - target_sum

    # Walk back through the table to recover one subset with that sum.
    elt_idx_list = []
    idx = n
    while (idx > 0 and target_sum != 0):
        if (opt_arr[idx, target_sum] - opt_arr[idx - 1, target_sum] > 0):
            elt_idx_list.append(idx - 1)
            target_sum -= int(arr[idx - 1])
        idx = idx - 1
    return resid, elt_idx_list

In [None]:
def community_split_transd(new_nodes, new_edges, train_size, test_size, device):
    """
    Create and return the training, validation, and test set by merging small
    clusters of the graphs. Keep edge_index known for all sets.

    Nodes are embedded with eigenvectors of the per-time-step modified
    Laplacians, grouped by K-means, and whole clusters are then assigned to the
    three sets so that the set sizes match the requested proportions; leftover
    nodes are moved from the test set to reach the requested counts.

    Args:
        new_nodes     A dataframe of the node features
        new_edges     A dataframe of the graph's adjacency list
        train_size    The node size proportion in training set
        test_size     The node size proportion in test set
        device        Computing device the returned Data object is moved to

    Returns:
        data          A Data object that stores node features, edge_index, and labels
        dict          A dictionary that stores training, validation, test set node
                        indices (row positions in `new_nodes`)
    """
    cluster_num = 500
    features_t, edge_indices, labels_t = time_step_split_helper(new_nodes, new_edges)
    num_steps = len(features_t)

    # Construct the features, labels, and edge_index
    features = torch.FloatTensor(new_nodes.iloc[:, 2:].to_numpy())
    labels = torch.LongTensor(new_nodes.iloc[:, 1].to_numpy().reshape(-1))
    # Create a dictionary that maps nodeId to index in the dataframe.
    node_ids = new_nodes.iloc[:, 0].to_numpy().tolist()
    id2idx = {node_id: row for row, node_id in enumerate(node_ids)}
    # Construct edge_index with same node indexing as in features and labels
    src_rows = [id2idx[node_id] for node_id in new_edges.iloc[:, 0].to_numpy().tolist()]
    dst_rows = [id2idx[node_id] for node_id in new_edges.iloc[:, 1].to_numpy().tolist()]
    edge_index = torch.LongTensor(np.array([src_rows, dst_rows],
                                           dtype = np.int64).reshape(2, -1))

    # The clustering below works on nodes ordered time step by time step (the
    # order produced by time_step_split_helper). This maps such a position
    # back to the dataframe row; it is the identity when the dataframe is
    # already sorted by timestamp.
    times = new_nodes.iloc[:, 2].to_numpy().reshape(-1)
    row_of_position = np.argsort(times, kind = 'stable')

    # Perform spectral clustering on the entire graph.
    # Since the entire graph's adjacency matrix A can be written as a block
    # diagonal matrix (blocked by time steps), we can recreate the eigenvalues
    # and eigenvectors of A by the eigenvalues and eigenvectors of its blocks.
    # Records are kept in a list (not a dict keyed by eigenvalue) so that equal
    # eigenvalues from different blocks cannot overwrite each other.
    eig_records = []     # (eigenvalue, time-step position, eigenvector)
    node_num = []
    for t in range(num_steps):
        n = features_t[t].shape[0]
        A = adj_list_to_mtx(n, edge_indices[t])
        L = laplacian(A)
        # NOTE(review): k = n // 40 is 0 for blocks with fewer than 40 nodes,
        # which eigsh rejects — confirm every time step is large enough.
        evals, evecs = scsp.linalg.eigsh(L, k = n // 40, which = 'SM')

        for i in range(evals.shape[0]):
            eig_records.append((evals[i], t, evecs[:, i]))
        node_num.append(n)

    # 'node_blk' store node indices that mark time group separation.
    node_blk = np.insert(np.cumsum(node_num), 0, 0)
    # Block diagonal matrix has the first number-of-block smallest eigenvalues
    # to be 0; skip those and keep the next `cluster_num` eigenvectors.
    eig_records.sort(key = lambda record: record[0])
    selected = eig_records[num_steps: (num_steps + cluster_num)]

    node_mtx = np.zeros((node_blk[-1], cluster_num))
    for col, (_, t, evec) in enumerate(selected):
        node_mtx[node_blk[t]: node_blk[t + 1], col] = evec

    # Use K-means algorithm to create certain number of clusters (e.g., 500).
    kmeans = KMeans(n_clusters = cluster_num, init = 'random', random_state = 42,
                    n_init = 3, max_iter = 10).fit(node_mtx)
    cluster_of = kmeans.labels_

    # minlength keeps one count per cluster id even if the last ids are unused.
    comm_count = np.bincount(cluster_of, minlength = cluster_num)

    # Split and merge the clusters into three sets by the given number of nodes
    # in each set.
    total_nodes = new_nodes.shape[0]
    train_num = int(np.round(total_nodes * train_size))
    val_num = int(np.round(total_nodes * (1 - train_size - test_size)))

    train_num_resid, train_pick = nearest_sum(comm_count, train_num)
    train_clust_idx = np.asarray(train_pick, dtype = np.int64)
    val_test_comm_count = np.delete(comm_count, train_clust_idx)
    val_test_clust_idx = np.delete(np.arange(cluster_num), train_clust_idx)

    # Indices returned here refer to the reduced array, so translate them back
    # to real cluster ids.
    val_num_resid, val_pick = nearest_sum(val_test_comm_count, val_num)
    val_clust_idx = val_test_clust_idx[np.asarray(val_pick, dtype = np.int64)]

    # Split node positions by their clusters into three datasets; every
    # cluster that is neither train nor val belongs to test.
    in_train = np.isin(cluster_of, train_clust_idx)
    in_val = np.isin(cluster_of, val_clust_idx) & ~in_train
    train_idx = np.flatnonzero(in_train).tolist()
    val_idx = np.flatnonzero(in_val).tolist()
    test_idx = np.flatnonzero(~(in_train | in_val)).tolist()

    # Top up train / val with nodes taken from the end of the test set so the
    # requested sizes are met exactly (extend, so the lists stay flat).
    if train_num_resid > 0:
        train_idx.extend(test_idx[-train_num_resid:])
        test_idx = test_idx[:-train_num_resid]
    if val_num_resid > 0:
        val_idx.extend(test_idx[-val_num_resid:])
        test_idx = test_idx[:-val_num_resid]

    train_idx = row_of_position[np.asarray(train_idx, dtype = np.int64)]
    val_idx = row_of_position[np.asarray(val_idx, dtype = np.int64)]
    test_idx = row_of_position[np.asarray(test_idx, dtype = np.int64)]

    # Construct the full graph; the split itself is returned as index arrays.
    data = Data(x = features, edge_index = edge_index, y = labels).to(device)

    return data, {'train': train_idx, 'val': val_idx, 'test': test_idx}

In [None]:
# Community-based transductive split using the same node proportions as the
# random splits; returns the full graph plus per-set node index arrays.
data5, split_idx5 = community_split_transd(nodes, edges, train_size = train_node_size,
                                    test_size = test_node_size, device = device)
data5

Data(x=[46564, 166], edge_index=[2, 36624], y=[46564])

# GCN Model

In [None]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds = False):
        """
        Initialize a GCN model made of `num_layers` GCN layers: `num_layers - 1`
        hidden layers (each followed by batch normalization) and one output layer.

        Args:
            input_dim       Input dimension of node embeddings
            hidden_dim      Hidden dimension of node embeddings
            output_dim      Output dimension of node embeddings
            num_layers      The number of GCN layers (at least 1)
            dropout         The dropout ratio
                              (dropout: the probability of an element getting zeroed)
            return_embeds   A boolean value determining whether we skip the
                              classification layer and return node embeddings

        Raises:
            ValueError      If num_layers is smaller than 1
        """

        super(GCN, self).__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1")
        self.num_layers = num_layers
        hidden_layers = self.num_layers - 1

        # Hidden GCN layers: the first maps input_dim -> hidden_dim, the others
        # hidden_dim -> hidden_dim.
        in_dims = [input_dim] + [hidden_dim] * max(hidden_layers - 1, 0)
        self.convs = torch.nn.ModuleList([GCNLayer(in_dims[l], hidden_dim, directed = False)
                                          for l in range(hidden_layers)])

        # One batch normalization per hidden layer.
        self.bns = torch.nn.ModuleList([torch.nn.BatchNorm1d(hidden_dim)
                                        for _ in range(hidden_layers)])

        # Last GCN layer; with a single-layer model it consumes the raw input.
        last_in_dim = hidden_dim if hidden_layers > 0 else input_dim
        self.last_conv = GCNLayer(last_in_dim, output_dim, directed = False)
        self.softmax = torch.nn.LogSoftmax(dim = -1)
        
        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """
        Reset all learnable parameters in GCN layers (including the output
        layer) and Batch Normalization Layers.
        """
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()
        self.last_conv.reset_parameters()

    def forward(self, x, edge_index):
        """
        Produce a forward propagation of GCN model. Before the last GCN layer,
        we transform the embedding (x) in the following sequence:
          x -> GCN_Layer -> Batch_Norm -> ReLU -> Dropout.
        At the last GCN layer, the following sequence is applied:
          x -> GCN Layer -> LogSoftmax -> output.
        
        Args:
            x             The node embedding
            edge_index    The adjacency list of the graph
        
        Returns:
            out           The log-probabilities of labels, or the raw output of
                            the last GCN layer when return_embeds is set
        """
        # Work on a detached copy so the caller's tensor is never touched.
        x = torch.clone(x.detach())
        # Unweighted graph has weight 1. Created once, on the same device as
        # the features so the model also runs on a GPU.
        edge_weight = torch.ones(edge_index.shape[1], device = x.device)

        for l in range(self.num_layers - 1):
            x = self.convs[l](x, edge_index, edge_weight)
            x = self.bns[l](x)
            x = F.relu(x)
            x = F.dropout(x, p = self.dropout, training = self.training)

        x = self.last_conv(x, edge_index, edge_weight)
        if self.return_embeds:
            out = x
        else:
            out = self.softmax(x)

        return out

In [None]:
class GCNLayer(MessagePassing):
    def __init__(self, in_channels, out_channels, bias = True, 
                 directed = False, self_loop = True, **kwargs):
        """
        Initialize a GCN layer (sum aggregation over incoming messages).
        Args:
            in_channels      In-channel dimension of node embeddings
            out_channels     Out-channel dimension of node embeddings
            bias             A boolean value determining whether we add a
                                learnable bias term in linear transformation
            directed         A boolean value determining whether we use directed
                                message passing D^{-1}A or use symmetric normalized
                                adjacency matrix D^{-1/2}AD^{-1/2}
            self_loop        A boolean value determining whether we add a self-
                                loop for each node
        """
        super(GCNLayer, self).__init__(**kwargs, aggr = 'add')

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.directed = directed
        self.self_loop = self_loop

        # self.lin is the linear transformation that we apply to the embedding
        # before messages are exchanged.
        self.lin = nn.Linear(self.in_channels, self.out_channels, bias = bias)
        
        self.reset_parameters()

    def reset_parameters(self):
        """
        Reset all learnable parameters in the linear transformation.
        """
        self.lin.reset_parameters()

    def forward(self, x, edge_index, edge_weight = None):
        """
        Produce a forward propagation of GCN layer.
        
        Args:
            x             The node embedding
            edge_index    The (2, |E|) adjacency list of the graph
            edge_weight   The (|E|) vector specifying the edge weights in the graph
                            (for unweighted graph, edge weight is 1); None means
                            every edge has weight 1
        
        Returns:
            An updated node embedding
        """
        # Keep the weights on the same device as the features so the layer
        # works on CPU and GPU alike.
        if edge_weight is None:
            edge_weight = torch.ones(edge_index.size(1), device = x.device)
        else:
            edge_weight = edge_weight.to(x.device)

        # Add self-loops to the adjacency matrix, each with weight 1.
        if self.self_loop:
            edge_index, _ = add_self_loops(edge_index, num_nodes = x.size(0))
            loop_weight = torch.ones(x.size(0), dtype = edge_weight.dtype,
                                     device = edge_weight.device)
            edge_weight = torch.cat((edge_weight, loop_weight), dim = -1)
        
        # Apply linear transformation on node features.
        x = self.lin(x)

        # Compute normalization by updated node degree. Degrees count edges;
        # the edge weights only scale the messages.
        if self.directed:
            row , _ = edge_index
            deg = degree(row, x.size(0), dtype = x.dtype) # only out-degree
            deg_inv = deg.pow(-1)
            deg_inv[deg_inv == float('inf')] = 0   # nodes without outgoing edges
            norm = deg_inv[row]
        else:
            row, col = edge_index
            deg = degree(col, x.size(0), dtype = x.dtype)
            deg_inv_sqrt = deg.pow(-0.5)
            deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0   # nodes without incoming edges
            norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        return self.propagate(edge_index, x = (x, x), norm = norm, edge_weight = edge_weight)

    def message(self, x_j, edge_weight, norm):
        """
        Build the message carried by every edge: the transformed embedding of
        the sending node, scaled by the normalization constant and the edge weight.
        
        Args:
            x_j           The embeddings gathered for the sending end of each edge
            edge_weight   The weight of each edge
            norm          Normalization constant determined by self.directed
        
        Returns:
            One message per edge, to be summed at the receiving node
        """
        return norm.view(-1, 1) * x_j * edge_weight.view(-1, 1)

In [None]:
def train_ind_time_step(model, train_data, optimizer, loss_fn):
    """
    Run one optimization step on the mean loss over all training time steps.

    Args:
        model       The GCN model
        train_data  A sequence of Data objects (one per time step) that store
                      x, edge_index, and labels only for training set
        optimizer   The optimizer
        loss_fn     The loss function (must return a scalar per time step)

    Returns
        The average prediction loss of each time step in the training set
          by the given loss function

    Raises
        ValueError  If train_data contains no time step
    """
    if len(train_data) == 0:
        raise ValueError("train_data must contain at least one time step")

    model.train()
    optimizer.zero_grad()

    # Collect the per-step losses and stack them instead of writing into a
    # preallocated buffer: the result lives on whatever device the model
    # output is on, so the function no longer relies on a global `device`.
    step_losses = []
    for data_t in train_data:
        train_slice = model.forward(data_t.x, data_t.edge_index)
        step_losses.append(loss_fn(train_slice, data_t.y))
    mean_loss = torch.stack(step_losses).mean()

    # A single backward pass / optimizer step for the whole set of time steps.
    mean_loss.backward()
    optimizer.step()
    return mean_loss.item()

In [None]:
def train_ind(model, train_data, optimizer, loss_fn):
    """
    Perform a single full-batch optimization step on the training graph.

    Args:
        model       The GCN model
        train_data  The Data object that stores x, edge_index, and labels
                      only for training set
        optimizer   The optimizer
        loss_fn     The loss function

    Returns
        The prediction loss (as a Python float) by the given loss function
    """
    model.train()
    optimizer.zero_grad()

    # Forward pass over the whole training graph, then score against its labels.
    predictions = model.forward(train_data.x, train_data.edge_index)
    batch_loss = loss_fn(predictions, train_data.y)

    # Backpropagate and update the parameters once.
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
def train_transd(model, data, train_idx, optimizer, loss_fn):
    """
    Perform a single optimization step using only the training nodes' loss.

    The forward pass runs on the full graph; the loss is computed on the rows
    selected by train_idx.

    Args:
        model       The GCN model
        data        The Data object that stores x, edge_index, and labels
        train_idx   The node indices in the training set
        optimizer   The optimizer
        loss_fn     The loss function

    Returns
        The prediction loss (as a Python float) by the given loss function
    """
    model.train()
    optimizer.zero_grad()

    # Predict for every node, then keep only the training rows for the loss.
    full_output = model.forward(data.x, data.edge_index)
    step_loss = loss_fn(full_output[train_idx], data.y[train_idx])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

In [None]:
@torch.no_grad()
def test_ind_time_step(model, data, save_model_results=False):
    """
    Evaluate the model on every time step of each split and average the metrics.

    Args:
        model                 The GCN model
        data                  A dictionary mapping split names to lists of Data objects
                                (one per time step) that store x, edge_index, and labels;
                                the keys 'train', 'val', and 'test' must be present
        save_model_results    A boolean determining whether the test-set predictions
                                (all time steps, concatenated) are written to 'gcn_ind.csv'

    Returns
        The time-step-averaged classification reports (as DataFrames) of training,
        validation, and test set, followed by their time-step-averaged auc-roc scores
    """

    model.eval()
    # Averaged metrics of each split, keyed by split name.
    results = {}
    # Flattened test-set predictions of every time step, kept for the optional CSV.
    test_predictions = []
    for name, data_list in data.items():
        step_reports = []
        auc_roc_total = 0
        for data_i in data_list:
            out = model.forward(data_i.x, data_i.edge_index)
            # scikit-learn works on host memory, so move labels/predictions to CPU.
            y_pred = out.argmax(dim=-1, keepdim=True).cpu()
            y_true = torch.unsqueeze(data_i.y, -1).cpu()
            step_reports.append(classification_report(y_true, y_pred,
                                                      output_dict=True, zero_division=0))
            auc_roc_total += roc_auc_score(y_true, y_pred)
            if name == 'test':
                test_predictions.append(y_pred.view(-1))

        # Average the per-step reports entry by entry: nested dicts (per-class and
        # avg rows) are averaged column-wise, scalar entries (accuracy) directly.
        report = {}
        for key, first_value in step_reports[0].items():
            per_step = [sub_report[key] for sub_report in step_reports]
            if isinstance(first_value, dict):
                report[key] = pd.DataFrame(per_step).mean().to_dict()
            else:
                report[key] = np.mean(np.array(per_step))
        results[name] = {'report': pd.DataFrame(report),
                         'auc_roc': auc_roc_total / len(data_list)}

    # NOTE(review): the original TODO left open what to save and effectively wrote
    # only the predictions of the last evaluated time step. The test-set
    # predictions of all time steps are saved here; adjust if y_true or the
    # other splits are needed as well.
    if save_model_results:
        print ("Saving Model Predictions")

        df = pd.DataFrame(data={'y_pred': torch.cat(test_predictions).numpy()})
        # Save locally as csv
        df.to_csv('gcn_ind.csv', sep=',', index=False)

    return results['train']['report'], results['val']['report'], results['test']['report'], \
           results['train']['auc_roc'], results['val']['auc_roc'], results['test']['auc_roc']

In [None]:
@torch.no_grad()
def test_ind(model, data, save_model_results=False):
    """
    Evaluate the model on the training, validation, and test graphs.

    Args:
        model                 The GCN model
        data                  A dictionary of Data objects that store x, edge_index, and labels
                                for three sets (keys 'train', 'val', and 'test')
        save_model_results    A boolean determining whether the test-set predictions
                                are written to 'gcn_ind.csv'

    Returns
        The classification reports (text) of training, validation, and test set,
        followed by their auc-roc scores
    """

    model.eval()
    reports = {}
    auc_rocs = {}
    predictions = {}
    # Each split is its own graph, so the model is run once per split.
    for split in ('train', 'val', 'test'):
        split_data = data[split]
        out = model.forward(split_data.x, split_data.edge_index)
        # scikit-learn works on host memory, so move labels/predictions to CPU.
        y_pred = out.argmax(dim=-1, keepdim=True).cpu()
        y_true = torch.unsqueeze(split_data.y, -1).cpu()
        reports[split] = classification_report(y_true, y_pred, zero_division=0)
        auc_rocs[split] = roc_auc_score(y_true, y_pred)
        predictions[split] = y_pred

    # NOTE(review): the original TODO left open what to save, and the code
    # referenced an undefined `y_pred` (NameError). The test-set predictions are
    # saved here; adjust if y_true or the other splits are needed as well.
    if save_model_results:
        print ("Saving Model Predictions")

        df = pd.DataFrame(data={'y_pred': predictions['test'].view(-1).numpy()})
        # Save locally as csv
        df.to_csv('gcn_ind.csv', sep=',', index=False)

    return reports['train'], reports['val'], reports['test'], \
           auc_rocs['train'], auc_rocs['val'], auc_rocs['test']

In [None]:
@torch.no_grad()
def test_transd(model, data, split_idx, save_model_results=False):
    """
    Evaluate the model on the full graph and score each split by its node indices.

    Args:
        model                 The GCN model
        data                  The Data object that stores x, edge_index, and labels
        split_idx             A dictionary that stores node indices for three sets
                                (keys 'train', 'val', and 'test')
        save_model_results    A boolean determining whether the predictions of all
                                nodes are written to 'gcn_transd.csv'

    Returns
        The classification reports (text) of training, validation, and test set,
        followed by their auc-roc scores
    """

    model.eval()
    # The output of model on all data
    out = model.forward(data.x, data.edge_index)
    y_pred = out.argmax(dim=-1, keepdim=True)

    reports = {}
    auc_rocs = {}
    for split in ('train', 'val', 'test'):
        index = split_idx[split]
        # Select the split's rows first, then move to CPU for scikit-learn.
        split_true = torch.unsqueeze(data.y[index], -1).cpu()
        split_pred = y_pred[index].cpu()
        reports[split] = classification_report(split_true, split_pred, zero_division=0)
        auc_rocs[split] = roc_auc_score(split_true, split_pred)

    if save_model_results:
        print ("Saving Model Predictions")

        df = pd.DataFrame(data={'y_pred': y_pred.view(-1).cpu().detach().numpy()})
        # Save locally as csv
        df.to_csv('gcn_transd.csv', sep=',', index=False)

    return reports['train'], reports['val'], reports['test'], \
           auc_rocs['train'], auc_rocs['val'], auc_rocs['test']

In [None]:
# Hyperparameters shared by every training run below.
args = dict(
    device=device,
    num_layers=2,
    hidden_dim=256,
    dropout=0.5,
    lr=0.01,
    epochs=50,
    # Passed as `weight` to NLLLoss in the training cells: one entry per class.
    label_weight=torch.Tensor([0.5, 0.5]),
)
args

{'device': 'cpu',
 'dropout': 0.5,
 'epochs': 50,
 'hidden_dim': 256,
 'label_weight': tensor([0.5000, 0.5000]),
 'lr': 0.01,
 'num_layers': 2}

# Random split & community split - transductive
Note: Feed `data5` into the model to run the community split, or `data3` to run the random split.


In [None]:
# Build the GCN for the transductive runs and move it to the target device.
# The first argument is the second dimension of data5.x; the literal 2 is
# presumably the number of output classes (label_weight has two entries) —
# confirm against the GCN definition.
# NOTE(review): the training cell below uses data3; this assumes data3 and
# data5 have the same feature width — confirm.
model = GCN(
    data5.x.shape[1],
    args['hidden_dim'],
    2,
    args['num_layers'],
    args['dropout'],
).to(device)

In [None]:
import copy

# Start every run from freshly initialized weights.
model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
# The class weights are created on CPU; move them to the training device so the
# loss also works when the model output is on a GPU.
loss_fn = torch.nn.NLLLoss(weight=args['label_weight'].to(args['device']))

# Track the model with the highest validation auc-roc seen so far.
best_model = None
best_valid_auc = 0
best_result = None
losses = []

for epoch in range(1, 1 + args["epochs"]):
    # train with random split
    loss = train_transd(model, data3, split_idx3['train'], optimizer, loss_fn)
    losses.append(loss)
    result = test_transd(model, data3, split_idx3)
    train_acc, val_acc, test_acc, train_auc, val_auc, test_auc = result 
    if val_auc > best_valid_auc:
        best_valid_auc = val_auc
        # Snapshot the weights: `model` keeps changing in later epochs.
        best_model = copy.deepcopy(model)
        best_result = [train_acc, val_acc, test_acc, train_auc, val_auc, test_auc]

    print('Epoch: {:02},'.format(epoch),
          'Loss:{:.4f}'.format(loss),
          'Train:\n{}\n'.format(train_acc),
          'Train_auc_roc: {}'.format(train_auc),
          '\n\n'
          'Valid:\n{}\n'.format(val_acc),
          'Val_auc_roc: {}'.format(val_auc),
          '\n\n'
          'Test:\n{}\n'.format(test_acc),
          'Test_auc_roc: {}'.format(test_auc),
          '\n'
          )
  

Epoch: 01, Loss:1.0271 Train:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     24915
           1       0.44      0.21      0.28      2700

    accuracy                           0.90     27615
   macro avg       0.68      0.59      0.61     27615
weighted avg       0.87      0.90      0.88     27615

 Train_auc_roc: 0.588975405266796 

Valid:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      4821
           1       0.46      0.21      0.29       507

    accuracy                           0.90      5328
   macro avg       0.69      0.59      0.62      5328
weighted avg       0.88      0.90      0.88      5328

 Val_auc_roc: 0.5910010322197388 

Test:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     12283
           1       0.42      0.20      0.28      1338

    accuracy                           0.89     13621
   macro avg  

# Time group split - inductive

In [None]:
# Build the GCN for the time group split and move it to the target device.
# The first argument is the second dimension of the training graph's x; the
# literal 2 is presumably the number of output classes (label_weight has two
# entries) — confirm against the GCN definition.
model = GCN(
    data2['train'].x.shape[1],
    args['hidden_dim'],
    2,
    args['num_layers'],
    args['dropout'],
).to(device)

In [None]:
# Imported here as well so this section runs without the transductive cells.
import copy

# Start every run from freshly initialized weights.
model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
# The class weights are created on CPU; move them to the training device so the
# loss also works when the model output is on a GPU.
loss_fn = torch.nn.NLLLoss(weight=args['label_weight'].to(args['device']))

# Track the model with the highest validation auc-roc seen so far.
best_model = None
best_valid_auc = 0
best_result = None
losses = []

for epoch in range(1, 1 + args["epochs"]):
    # train with time group split (inductive: the training graph only)
    loss = train_ind(model, data2['train'], optimizer, loss_fn)
    losses.append(loss)
    result = test_ind(model, data2)
    train_acc, val_acc, test_acc, train_auc, val_auc, test_auc = result 
    if val_auc > best_valid_auc:
        best_valid_auc = val_auc
        # Snapshot the weights: `model` keeps changing in later epochs.
        best_model = copy.deepcopy(model)
        best_result = [train_acc, val_acc, test_acc, train_auc, val_auc, test_auc]

    print('Epoch: {:02},'.format(epoch),
          'Loss:{:.4f}'.format(loss),
          'Train:\n{}\n'.format(train_acc),
          'Train_auc_roc: {}'.format(train_auc),
          '\n\n'
          'Valid:\n{}\n'.format(val_acc),
          'Val_auc_roc: {}'.format(val_auc),
          '\n\n'
          'Test:\n{}\n'.format(test_acc),
          'Test_auc_roc: {}'.format(test_auc),
          '\n'
          )


# Temporal step split - inductive

In [None]:
# Build the GCN for the temporal step split and move it to the target device.
# The first argument is the second dimension of the first training step's x;
# the literal 2 is presumably the number of output classes (label_weight has
# two entries) — confirm against the GCN definition.
model = GCN(
    data['train'][0].x.shape[1],
    args['hidden_dim'],
    2,
    args['num_layers'],
    args['dropout'],
).to(device)

In [None]:
# Imported here as well so this section runs without the transductive cells.
import copy

# Start every run from freshly initialized weights.
model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
# The class weights are created on CPU; move them to the training device so the
# loss also works when the model output is on a GPU.
loss_fn = torch.nn.NLLLoss(weight=args['label_weight'].to(args['device']))

# Track the model with the highest validation auc-roc seen so far.
best_model = None
best_valid_auc = 0
best_result = None
losses = []

for epoch in range(1, 1 + args["epochs"]):
    # train with temporal step split (inductive: one graph per training time step)
    loss = train_ind_time_step(model, data['train'], optimizer, loss_fn)
    losses.append(loss)
    result = test_ind_time_step(model, data)
    train_acc, val_acc, test_acc, train_auc, val_auc, test_auc = result 
    if val_auc > best_valid_auc:
        best_valid_auc = val_auc
        # Snapshot the weights: `model` keeps changing in later epochs.
        best_model = copy.deepcopy(model)
        best_result = [train_acc, val_acc, test_acc, train_auc, val_auc, test_auc]

    print('Epoch: {:02},'.format(epoch),
          'Loss:{:.4f}'.format(loss),
          'Train:\n{}\n'.format(train_acc),
          'Train_auc_roc: {}'.format(train_auc),
          '\n\n'
          'Valid:\n{}\n'.format(val_acc),
          'Val_auc_roc: {}'.format(val_auc),
          '\n\n'
          'Test:\n{}\n'.format(test_acc),
          'Test_auc_roc: {}'.format(test_auc),
          '\n'
          )
