- **x**:
    节点特征，shape为(N_node,17)；
- **y**:
    节点共有(0,1,2,3)四类label，shape为(N_node,)，其中测试样本对应的label被标为-100；
- **edge_index**:
    有向边信息，shape为(N_edge,2)，其中每一行为(id_a, id_b)，代表用户id_a指向用户id_b的有向边；
- **edge_type**:
    边类型，shape为(N_edge,)；
- **edge_timestamp**:
    边连接日期，shape为(N_edge,)，其中边日期为从1开始的整数，单位为天；
- **train_mask**:
    包含训练样本id的一维array；
- **test_mask**:
    包含测试样本id的一维array。

In [1]:
import numpy as np
import pandas as pd
import torch as th
import dgl
import torch

from collections import Counter

Using backend: pytorch


In [3]:
def build_graph():
    """Load the phase-1 graph archive and build a DGLGraph from it.

    Reads ``./phase1_gdata.npz`` (relative to the current working directory),
    prints the archive keys and the resulting graph, and attaches node
    features/labels and edge types/timestamps to the graph.

    Returns:
        tuple: ``(g, train_mask, test_mask)``. ``g`` is the DGLGraph with
        ``ndata['feat']``, ``ndata['label']``, ``edata['type']`` and
        ``edata['time']`` set; the two masks are the ``train_mask`` and
        ``test_mask`` arrays taken unchanged from the archive.
    """
    # np.load on an .npz returns an NpzFile that keeps the file open;
    # the context manager guarantees the handle is released.
    with np.load('./phase1_gdata.npz') as data:
        print(list(data.keys()))

        node_feat = data['x']
        node_label = data['y']
        edge_pair = data['edge_index']
        edge_type = data['edge_type']
        edge_time = data['edge_timestamp']

        train_mask = data['train_mask']
        test_mask = data['test_mask']

    # Each row of edge_pair is (src_id, dst_id). Hand DGL a (src, dst) pair of
    # int64 tensors instead of materialising a Python list of pairs.
    endpoints = th.as_tensor(edge_pair, dtype=th.int64)
    src = endpoints[:, 0].contiguous()
    dst = endpoints[:, 1].contiguous()
    g = dgl.graph((src, dst))

    # NOTE: th.Tensor(...) produces float32 tensors, so labels and edge types
    # are stored as floats; cast (e.g. .long()) before using them as class
    # indices.
    g.ndata['feat'] = th.Tensor(node_feat)
    g.ndata['label'] = th.Tensor(node_label)
    g.edata['type'] = th.Tensor(edge_type)
    g.edata['time'] = th.Tensor(edge_time)
    print(g)

    return g, train_mask, test_mask

# g, train_mask, test_mask = build_graph()

In [4]:
g, train_mask, test_mask = build_graph()

['x', 'y', 'edge_index', 'edge_type', 'edge_timestamp', 'train_mask', 'test_mask']
Graph(num_nodes=4059035, num_edges=4962032,
      ndata_schemes={'feat': Scheme(shape=(17,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'type': Scheme(shape=(), dtype=torch.float32), 'time': Scheme(shape=(), dtype=torch.float32)})


In [2]:
# Top-level version of the loading steps in build_graph(): read the archive,
# build the DGLGraph and attach node/edge data.
# The NpzFile keeps the underlying file open, so close it via a context manager
# once every array has been read into memory.
with np.load('./phase1_gdata.npz') as data:
    node_feat = data['x']
    node_label = data['y']
    edge_pair = data['edge_index']
    edge_type = data['edge_type']
    edge_time = data['edge_timestamp']

    train_mask = data['train_mask']
    test_mask = data['test_mask']

# Each row of edge_pair is (src_id, dst_id). Pass a (src, dst) pair of int64
# tensors instead of converting millions of rows to a Python list of pairs.
edge_endpoints = th.as_tensor(edge_pair, dtype=th.int64)
g = dgl.graph((edge_endpoints[:, 0].contiguous(), edge_endpoints[:, 1].contiguous()))

# NOTE: th.Tensor(...) yields float32 tensors (labels and edge types included).
g.ndata['feat'] = th.Tensor(node_feat)
g.ndata['label'] = th.Tensor(node_label)
g.edata['type'] = th.Tensor(edge_type)
g.edata['time'] = th.Tensor(edge_time)
g

Graph(num_nodes=4059035, num_edges=4962032,
      ndata_schemes={'feat': Scheme(shape=(17,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'type': Scheme(shape=(), dtype=torch.float32), 'time': Scheme(shape=(), dtype=torch.float32)})

In [6]:
class NeighborSampler(object):
    """Multi-hop neighbor sampler that yields one DGL block per hop."""

    def __init__(self, g, fanouts):
        """
        Args:
            g: the DGLGraph to sample from.
            fanouts: number of neighbors to sample at each hop. The
                experiments use ``[10, 25]``: 10 neighbors for the first hop
                and 25 for the second.
        """
        self.g = g
        self.fanouts = fanouts

    def sample_blocks(self, seeds):
        """Sample a neighborhood around ``seeds`` and return it as blocks.

        Args:
            seeds: node ids (any sequence accepted by ``np.asarray``) for
                which outputs are wanted.

        Returns:
            list: one block per entry of ``self.fanouts``, ordered from the
            outermost hop (input layer) to the block whose destination nodes
            are the original ``seeds``.
        """
        seeds = th.LongTensor(np.asarray(seeds))
        blocks = []
        for fanout in self.fanouts:
            # Sample `fanout` neighbors for every seed node and get the
            # sampled edges back as a subgraph. replace=True samples WITH
            # replacement, so the same edge may be drawn more than once.
            # Use the graph this sampler was built with, not a global `g`.
            frontier = dgl.sampling.sample_neighbors(
                self.g, seeds, fanout, replace=True)
            # Compact the frontier into a bipartite block (source nodes ->
            # destination nodes) suitable for message passing. The source
            # node set also contains the destination nodes themselves.
            block = dgl.to_block(frontier, seeds)
            # The block's source nodes become the seeds of the next hop:
            # sampling walks outward, the reverse of aggregation.
            seeds = block.srcdata[dgl.NID]
            # Prepend so the list ends up ordered input-layer first. The
            # list has one entry per hop, so the front insert is cheap.
            blocks.insert(0, block)
        return blocks

In [3]:
dgl.NID

'_ID'

In [3]:
dgl.sampling.sample_neighbors(g, th.LongTensor([123, 456]), -1)

Graph(num_nodes=4059035, num_edges=8,
      ndata_schemes={'feat': Scheme(shape=(17,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'type': Scheme(shape=(), dtype=torch.float32), 'time': Scheme(shape=(), dtype=torch.float32), '_ID': Scheme(shape=(), dtype=torch.int64)})

In [37]:
dgl.sampling.sample_neighbors(g, th.LongTensor([123, 456]), 10).all_edges()

(tensor([2000594,  723071, 1139829, 1285271, 2657984, 2972752, 3337103, 3976413]),
 tensor([123, 456, 456, 456, 456, 456, 456, 456]))

In [22]:
# Sample 5 neighbors, with replacement, for each of nodes 123 and 456; the
# result keeps the full node set of g and only the sampled edges.
frontier = dgl.sampling.sample_neighbors(g, th.LongTensor([123, 456]), 5, replace=True)
# Compact the sampled edges into a block whose destination nodes are 123 and 456.
block = dgl.to_block(frontier, [123, 456])

In [57]:
block.srcdata

{'feat': tensor([[ 0.0000e+00,  2.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00],
        [ 0.0000e+00,  2.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00],
        [ 0.0000e+00,  1.0000e+00,  1.3100e+00,  1.0100e+00,  7.7099e-01,
          6.0000e-01,  1.9320e+00,  4.6000e-02,  1.8550e+00,  9.6014e-01,
          1.0000e+00,  2.2901e-01,  3.9855e-02,  2.9703e-02,  6.6667e-01,
          3.8168e-03,  1.6667e-01],
        [ 0.0000e+00,  4.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e

In [13]:
g.edata

{'type': tensor([ 3.,  5.,  1.,  ...,  9., 10., 11.]), 'time': tensor([567., 195., 399.,  ..., 578., 578., 578.])}

In [48]:
frontier.edata

{'type': tensor([ 6.,  6.,  6.,  6.,  6., 10., 10., 10., 10., 10.]), 'time': tensor([ 35.,  35.,  35.,  35.,  35., 566.,  77.,  77., 216.,  17.]), '_ID': tensor([2206500, 2206500, 2206500, 2206500, 2206500, 4916695, 3387160, 3387160,
        3991564,  784824])}

In [47]:
g.nodes(), frontier.nodes()

(tensor([      0,       1,       2,  ..., 4059032, 4059033, 4059034]),
 tensor([      0,       1,       2,  ..., 4059032, 4059033, 4059034]))

In [43]:
g.edges(form='all')

(tensor([      0,       1,       1,  ..., 4057114, 4057117, 4057120]),
 tensor([3151644,  124194, 3678931,  ..., 2180496, 3628100, 1592353]),
 tensor([      0,       1,       2,  ..., 4962029, 4962030, 4962031]))

In [49]:
frontier.edges(form='all')

(tensor([2000594, 2000594, 2000594, 2000594, 2000594, 3976413, 2972752, 2972752,
         3337103,  723071]),
 tensor([123, 123, 123, 123, 123, 456, 456, 456, 456, 456]),
 tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [21]:
frontier.

10

In [32]:
block.find_edges()

(tensor([2]), tensor([0]))

In [27]:
block.dstnodes()

tensor([0, 1])

In [33]:
block.srcdata

{'feat': tensor([[ 0.0000e+00,  2.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00],
        [ 0.0000e+00,  2.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00],
        [ 0.0000e+00,  1.0000e+00,  1.3100e+00,  1.0100e+00,  7.7099e-01,
          6.0000e-01,  1.9320e+00,  4.6000e-02,  1.8550e+00,  9.6014e-01,
          1.0000e+00,  2.2901e-01,  3.9855e-02,  2.9703e-02,  6.6667e-01,
          3.8168e-03,  1.6667e-01],
        [ 0.0000e+00,  4.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
         -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00,
          0.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e+00, -1.0000e

In [12]:
frontier.edata

{'type': tensor([ 6.,  6.,  6.,  6.,  6., 10., 10., 10., 10., 10.]), 'time': tensor([ 35.,  35.,  35.,  35.,  35., 260.,  17.,  17.,  77., 216.]), '_ID': tensor([2206500, 2206500, 2206500, 2206500, 2206500, 1241333,  784824,  784824,
        3387160, 3991564])}

In [5]:
frontier.srcdata

{'feat': tensor([[ 0.0000,  2.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [ 0.0000,  2.0000,  0.6800,  ...,  0.8571,  0.0074,  0.1429],
        [ 0.0000,  2.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        ...,
        [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]), 'label': tensor([2., 0., 2.,  ..., 2., 2., 2.])}

In [16]:
frontier.all_edges()

(tensor([2000594, 2000594, 2000594, 2000594, 2000594, 1139829, 1139829, 2657984,
         2972752, 2972752]),
 tensor([123, 123, 123, 123, 123, 456, 456, 456, 456, 456]))

In [10]:
block = dgl.to_block(frontier, [123, 456])

In [14]:
block.srcdata['_ID']

tensor([    123,     456, 2000594, 1139829, 2657984, 2972752])

In [4]:
sampler = NeighborSampler(g, [5, 5])

NameError: name 'NeighborSampler' is not defined

In [5]:
sampler.sample_blocks([0, 1, 2, 3])

NameError: name 'sampler' is not defined

In [58]:
import torch.utils.data

In [59]:
torch.utils.data.DataLoader()

<numpy.lib.npyio.NpzFile at 0x7fd9eca99450>

In [11]:
# Toy graph with edges 1->2 and 2->3 for inspecting dgl.to_block behaviour.
# NOTE: this rebinds the global name `g`, replacing the full graph loaded earlier.
g = dgl.graph(([1, 2], [2, 3]))
g

Graph(num_nodes=4, num_edges=2,
      ndata_schemes={}
      edata_schemes={})

In [12]:
block = dgl.to_block(g, torch.LongTensor([3, 2]))

In [13]:
block.dstdata

{'_ID': tensor([3, 2])}

In [14]:
block.srcdata

{'_ID': tensor([3, 2, 1])}

In [19]:
block.edges()

(tensor([2, 1]), tensor([1, 0]))

In [15]:
block.edata

{'_ID': tensor([0, 1])}

In [9]:
dgl.EID

'_ID'

In [10]:
block.edges(order='eid')

(tensor([2, 1]), tensor([1, 0]))

In [3]:
g

Graph(num_nodes=4059035, num_edges=4962032,
      ndata_schemes={'feat': Scheme(shape=(17,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'type': Scheme(shape=(), dtype=torch.float32), 'time': Scheme(shape=(), dtype=torch.float32)})

In [4]:
4962032/4059035

1.222465930941714