In [1]:
import os

In [2]:
# Move two directory levels up so the relative paths used later in the notebook
# ('data/...', 'models/...', the `notebooks....` import) resolve.
# NOTE(review): presumably this lands in the project root — confirm.
#
# A bare os.chdir('../..') climbs two MORE levels every time the cell is
# re-run. Remember the starting directory once per kernel and always navigate
# relative to it, so re-running the cell is a no-op.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir, os.pardir))

In [3]:
from ogb.nodeproppred import DglNodePropPredDataset
import dgl
import torch

Using backend: pytorch


In [4]:
# Load the ogbn-arxiv node-property-prediction dataset through OGB's DGL
# loader, using 'data/dataset_dgl' (relative to the working directory set
# earlier via os.chdir) as the dataset root.
dataset = DglNodePropPredDataset(
    name='ogbn-arxiv',
    root='data/dataset_dgl',
)

In [5]:
# Echo the dataset object as a quick sanity check that loading succeeded
# (the recorded run shows the repr `DglNodePropPredDataset(1)`).
dataset

DglNodePropPredDataset(1)

In [6]:
# Take the first dataset item and, from it, the first element: the DGL graph.
# NOTE(review): presumably each item is a (graph, labels) pair as in the OGB
# examples; the labels are discarded here — confirm.
graph = dataset[0][0]

In [7]:
# Total number of nodes in the full ogbn-arxiv graph.
graph.num_nodes()

169343

In [8]:
# Total number of (directed) edges stored in the full graph.
graph.num_edges()

1166243

In [9]:
# Peek at the edge list; 'uv' (the default form) yields a pair of tensors
# holding the two endpoints of every edge.
graph.edges(form='uv')

(tensor([104447,  15858, 107156,  ...,  45118,  45118,  45118]),
 tensor([ 13091,  47283,  69161,  ..., 162473, 162537,  72717]))

In [10]:
# Is every edge reciprocated? edges() returns a 2-tuple of endpoint tensors;
# the [::-1] slice swaps them so each edge is looked up in the opposite
# direction. `.all()` is True only if every reversed edge exists.
graph.has_edges_between(*graph.edges()[::-1]).all()

tensor(False)

In [11]:
# Count how many edges do have a reverse counterpart: swap the two endpoint
# tensors returned by edges() and sum the per-edge boolean result.
graph.has_edges_between(*graph.edges()[::-1]).sum()

tensor(16888)

In [12]:
# Fetch the dataset's predefined split and display it. In the recorded run this
# is a dict with 'train', 'valid' and 'test' keys, each holding an integer
# tensor of node indices (it is used below to select nodes of `graph`).
split_idx = dataset.get_idx_split()
split_idx

{'train': tensor([     0,      1,      2,  ..., 169145, 169148, 169251]),
 'valid': tensor([   349,    357,    366,  ..., 169185, 169261, 169296]),
 'test': tensor([   346,    398,    451,  ..., 169340, 169341, 169342])}

## Train Embedding

In [13]:
# Restrict the graph to the training-split nodes so the embedding is trained
# without validation/test nodes.
# NOTE(review): per DGL's `subgraph` docs this should be the node-induced
# subgraph, with the parent node ids stored under ndata['_ID'] — confirm.
graph_train = graph.subgraph(split_idx['train'])

In [14]:
# Make the training graph symmetric by appending a reversed copy of every edge,
# then display the result.
#
# add_edges mutates graph_train in place, so running it on an already-augmented
# graph would append yet another full set of edges on every re-run of this
# cell. Rebuilding the training subgraph from `graph` first keeps the cell
# idempotent while producing the same graph on a fresh top-to-bottom run.
# NOTE(review): edges that already had a reverse counterpart end up duplicated,
# and the appended edges have no meaningful edata['_ID'] — confirm both are
# acceptable downstream.
graph_train = graph.subgraph(split_idx['train'])
train_src, train_dst = graph_train.edges()
graph_train.add_edges(train_dst, train_src)
graph_train

Graph(num_nodes=90941, num_edges=749678,
      ndata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32), '_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})

In [15]:
# Keep only nodes that have at least one outgoing edge, then display the result.
# In the recorded run this removes nodes but no edges (the node count drops
# while the edge count is unchanged).
# NOTE(review): graph_train is replaced by a subgraph of itself, so
# ndata['_ID'] presumably now refers to node ids of the previous graph_train
# rather than of the full graph — confirm before mapping embeddings back to
# original ogbn-arxiv node ids.
mask_tensor = graph_train.out_degrees() > 0
graph_train = graph_train.subgraph(mask_tensor)
graph_train

Graph(num_nodes=87599, num_edges=749678,
      ndata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32), '_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})

In [16]:
# Device index passed to DeepwalkTrainer as `device`.
# NOTE(review): the settings below use only_cpu=True and gpus=[-1], and the
# recorded training log reports a CPU run, so this value may be ignored —
# confirm in DeepwalkTrainer.
device = 0

In [17]:
# Settings forwarded one-to-one (same names) to DeepwalkTrainer in the cell
# that builds the trainer. Their exact semantics are defined by that class,
# which is not shown in this notebook; the grouping below is by name only.

# --- embedding size and walk / window settings ---
dim = 50
walk_length = 30
num_walks = 30
window_size = 2

# --- negative-sampling settings ---
negative = 1
neg_weight = 1.0
fast_neg = False

# --- optimisation settings ---
batch_size = 160
lr = 0.1
lap_norm = 0.05
norm = False
use_context_weight = False
async_update = False

# --- execution placement (CPU run requested via only_cpu) ---
only_cpu = True
only_gpu = False
mix = False
gpus = [-1]
num_threads = 16
num_sampler_threads = 0

# --- logging ---
print_loss = True
print_interval = 2000
count_params = False

# --- output ---
save_in_txt = False
save_in_pt = False
output_emb_file = 'models/deepwalk_01_tr_embedding.npy'

In [18]:
from notebooks.deepwalk_link_pred.src.deepwalk import DeepwalkTrainer

In [19]:
# Build the DeepWalk trainer on the symmetrised, isolated-node-free training
# graph. Every keyword simply forwards the notebook-level variable of the same
# name (except `graph`, which receives graph_train). Keywords are grouped by
# name for readability; keyword order does not affect the call.
trainer = DeepwalkTrainer(
    # input graph and device
    graph=graph_train,
    device=device,
    # embedding size and walk / window settings
    dim=dim,
    walk_length=walk_length,
    num_walks=num_walks,
    window_size=window_size,
    # negative-sampling settings
    negative=negative,
    neg_weight=neg_weight,
    fast_neg=fast_neg,
    # optimisation settings
    batch_size=batch_size,
    lr=lr,
    lap_norm=lap_norm,
    norm=norm,
    use_context_weight=use_context_weight,
    async_update=async_update,
    # execution placement
    only_cpu=only_cpu,
    only_gpu=only_gpu,
    mix=mix,
    gpus=gpus,
    num_threads=num_threads,
    num_sampler_threads=num_sampler_threads,
    # logging
    print_loss=print_loss,
    print_interval=print_interval,
    count_params=count_params,
    # output
    save_in_txt=save_in_txt,
    save_in_pt=save_in_pt,
    output_emb_file=output_emb_file,
)

2627970 seeds in 0.10s


In [20]:
# Run DeepWalk training. This is the expensive cell of the notebook: the
# recorded run took about six minutes on CPU and reported the loss every 2000
# batches (apparently governed by print_loss / print_interval).
trainer.train()

Run in CPU process
num batchs: 16425

Batch 2000 training time: 44.23s loss: 0.5693
Batch 4000 training time: 43.40s loss: 0.3015
Batch 6000 training time: 43.32s loss: 0.2805
Batch 8000 training time: 43.60s loss: 0.2747
Batch 10000 training time: 43.60s loss: 0.2711
Batch 12000 training time: 44.24s loss: 0.2689
Batch 14000 training time: 44.36s loss: 0.2673
Batch 16000 training time: 44.72s loss: 0.2662
Training used time: 360.67s
