In [1]:
# dataset : 'https://snap.stanford.edu/data/reddit_threads.html'

import torch
from torch import nn
import torch.nn.functional as F

from torch_geometric.nn import GCNConv, TopKPooling
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_max_pool

import numpy as np
import pandas as pd
import json
import os

graphs_path = './reddit_threads/reddit_edges.json'
graphs_target_path = './reddit_threads/reddit_target.csv'


In [2]:
with open(graphs_path) as f:
    graphs = json.load(f)

graphs_target = pd.read_csv(graphs_target_path)

In [3]:
print('Number of Graph : {} \n'.format(len(graphs.keys())))

print(graphs['0'])
print(graphs['200000'])

Number of Graph : 203088 

[[0, 2], [1, 5], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9], [2, 10], [3, 8]]
[[0, 9], [1, 9], [2, 9], [3, 9], [4, 12], [5, 9], [6, 9], [7, 9], [8, 9], [9, 10], [9, 11], [9, 12]]


In [4]:
graphs_target

Unnamed: 0,id,target
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0
...,...,...
203083,203083,1
203084,203084,0
203085,203085,0
203086,203086,1


In [5]:
print(graphs_target['target'].value_counts())
print()

true_con = graphs_target['target'] == 1
false_con = graphs_target['target'] == 0

train_ratio = 0.9
train_trues = graphs_target.loc[true_con].sample(frac = train_ratio)
test_trues = pd.concat([graphs_target.loc[true_con],train_trues]).drop_duplicates(keep=False)

train_falses = graphs_target.loc[false_con].sample(frac = train_ratio)
test_falses = pd.concat([graphs_target.loc[false_con], train_falses]).drop_duplicates(keep=False)

train_indices = pd.concat([train_trues, train_falses])['id'].tolist()
test_indices = pd.concat([test_trues, test_trues])['id'].tolist()

print(len(train_indices))
print(len(test_indices))

1    104065
0     99023
Name: target, dtype: int64

182779
20814


In [6]:
graph_number = 1

print(graphs[str(graph_number)])
print(graphs_target['target'][graph_number])

[[0, 3], [0, 6], [1, 8], [2, 8], [4, 8], [5, 8], [6, 8], [7, 8], [8, 9], [8, 10], [8, 11], [8, 12]]
1


In [7]:

def _get_node_features(graph):
    '''[Number of Nodes, Node, Feature size]'''
    
    nodes = set(np.array(graph).flatten())

    all_node_feats = np.empty(shape = (len(nodes),))
    
    all_node_feats = np.expand_dims(all_node_feats, axis = 1)
    all_node_feats = torch.tensor(all_node_feats, dtype = torch.float)
    
    return all_node_feats


def _get_adjacency_info(graph):
    ''' [2, number of edge]'''
    
    all_edge_feats = []
    
    trans_graph = list(zip(*graph))
    
        
    edge_index = np.asarray(trans_graph)
    edge_index = torch.tensor(edge_index, dtype = torch.float)
        
    return edge_index


def _get_label(target):

    label = np.asarray([target])
    label = torch.tensor(label, dtype = torch.int64)

    return label


In [8]:
node_feats = _get_node_features(graphs[str(graph_number)])

print(node_feats)
print(node_feats.shape)

edge_index = _get_adjacency_info(graphs[str(graph_number)])
print()
print(edge_index)
print(edge_index.shape)

y = _get_label(graphs_target['target'][graph_number])
print()
print(y)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
torch.Size([13, 1])

tensor([[ 0.,  0.,  1.,  2.,  4.,  5.,  6.,  7.,  8.,  8.,  8.,  8.],
        [ 3.,  6.,  8.,  8.,  8.,  8.,  8.,  8.,  9., 10., 11., 12.]])
torch.Size([2, 12])

tensor([1])


In [24]:
from torch_geometric.data import Data, DataLoader

def process(graph_number):
    
    global graphs, graphs_target
    
    node_feats = _get_node_features(graphs[str(graph_number)])
    edge_index = _get_adjacency_info(graphs[str(graph_number)])
    label = _get_label(graphs_target['target'][graph_number])
    
    data = Data(x = node_feats,
                edge_index = edge_index,
                y = label)

    return data

In [25]:
train_dataset = []
test_dataset = []

for idx in train_indices:   
    data = process(idx)
    train_dataset.append(data)

for idx in test_indices:
    data = process(idx)
    test_dataset.append(data)
    
print(train_dataset[0])
print(test_dataset[0])

Data(x=[35, 1], edge_index=[2, 35], y=[1])
Data(x=[19, 1], edge_index=[2, 19], y=[1])


In [26]:
train_dataset[:5]

[Data(x=[35, 1], edge_index=[2, 35], y=[1]),
 Data(x=[18, 1], edge_index=[2, 17], y=[1]),
 Data(x=[12, 1], edge_index=[2, 12], y=[1]),
 Data(x=[12, 1], edge_index=[2, 11], y=[1]),
 Data(x=[13, 1], edge_index=[2, 14], y=[1])]

In [27]:
test_dataset[:5]

[Data(x=[19, 1], edge_index=[2, 19], y=[1]),
 Data(x=[17, 1], edge_index=[2, 16], y=[1]),
 Data(x=[14, 1], edge_index=[2, 15], y=[1]),
 Data(x=[32, 1], edge_index=[2, 35], y=[1]),
 Data(x=[18, 1], edge_index=[2, 17], y=[1])]

In [None]:
train_loader = DataLoader(train_dataset, batch_size = 128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 128, shuffle=True)

In [57]:
for batch in train_loader[:5]:
    print(batch)
    
    

TypeError: 'DataLoader' object is not subscriptable