https://github.com/williamleif/graphsage-simple

In [1]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable

import numpy as np
import time
import random
from sklearn.metrics import f1_score
from collections import defaultdict

from graphsage.encoders import Encoder
from graphsage.aggregators import MeanAggregator

The Cora dataset consists of Machine Learning papers. These papers are classified into one of the following seven classes:
		
        Case_Based
		
        Genetic_Algorithms
		
        Neural_Networks
		
        Probabilistic_Methods
		
        Reinforcement_Learning
		
        Rule_Learning
		
        Theory
        
The .content file contains descriptions of the papers in the following format:

		<paper_id> <word_attributes>+ <class_label>

The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper.

The .cites file contains the citation graph of the corpus. Each line describes a link in the following format:

		<ID of cited paper> <ID of citing paper>

Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second ID stands for the paper which contains the citation. The direction of the link is from right to left. If a line is represented by "paper1 paper2" then the link is "paper2->paper1".

In [9]:
"""
Simple supervised GraphSAGE model as well as examples running the model
on the Cora and Pubmed datasets.
"""

class SupervisedGraphSage(nn.Module):

    def __init__(self, num_classes, enc):
        super(SupervisedGraphSage, self).__init__()
        self.enc = enc
        self.xent = nn.CrossEntropyLoss()

        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        init.xavier_uniform(self.weight)

    def forward(self, nodes):
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        return scores.t()

    def loss(self, nodes, labels):
        scores = self.forward(nodes)
        return self.xent(scores, labels.squeeze())

def load_cora():
    num_nodes = 2708
    num_feats = 1433 # уникальных слов бинарного вектора каждой вершины
    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes,1), dtype=np.int64)
    node_map = {}
    label_map = {}
    with open("data/cora.content") as fp:
        for i,line in enumerate(fp):
            info = line.strip().split()
            feat_data[i,:] = list(map(float, info[1:-1]))
            node_map[info[0]] = i
            if not info[-1] in label_map:
                label_map[info[-1]] = len(label_map)
            labels[i] = label_map[info[-1]]

    adj_lists = defaultdict(set)
    with open("data/cora.cites") as fp:
        for i,line in enumerate(fp):
            info = line.strip().split()
            paper1 = node_map[info[0]]
            paper2 = node_map[info[1]]
            adj_lists[paper1].add(paper2)
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists

In [10]:
num_nodes = 2708
num_feats = 1433 # уникальных слов бинарного вектора каждой вершины
feat_data = np.zeros((num_nodes, num_feats))
labels = np.empty((num_nodes,1), dtype=np.int64)
node_map = {}
label_map = {} # словарь с кодировкой 7 классов
with open("data/cora.content") as fp:
    for i, line in enumerate(fp):
        info = line.strip().split()
        feat_data[i,:] = list(map(float, info[1:-1]))
        node_map[info[0]] = i
        if not info[-1] in label_map:
            label_map[info[-1]] = len(label_map)
            labels[i] = label_map[info[-1]]

adj_lists = defaultdict(set)
with open("data/cora.cites") as fp:
    for i,line in enumerate(fp):
        info = line.strip().split()
        paper1 = node_map[info[0]]
        paper2 = node_map[info[1]]
        adj_lists[paper1].add(paper2)
        adj_lists[paper2].add(paper1)

In [4]:
adj_lists

defaultdict(set,
            {163: {22,
              42,
              55,
              129,
              141,
              145,
              174,
              188,
              189,
              191,
              219,
              237,
              266,
              290,
              309,
              346,
              380,
              390,
              395,
              402,
              415,
              422,
              448,
              523,
              530,
              546,
              563,
              602,
              606,
              624,
              658,
              659,
              689,
              714,
              717,
              727,
              743,
              744,
              757,
              765,
              769,
              781,
              793,
              800,
              813,
              856,
              910,
              935,
              940,
              942,
              961,
            

In [5]:
label_map

{'Case_Based': 6,
 'Genetic_Algorithms': 5,
 'Neural_Networks': 0,
 'Probabilistic_Methods': 3,
 'Reinforcement_Learning': 2,
 'Rule_Learning': 1,
 'Theory': 4}

In [6]:
labels

array([[                  0],
       [                  1],
       [                  2],
       ..., 
       [8319395793583698029],
       [7020302995668494624],
       [8243124852499640931]])

In [14]:
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    feat_data, labels, adj_lists = load_cora()
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
   # features.cuda()

    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(lambda nodes : enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes : enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
            base_model=enc1, gcn=True, cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5
    

    graphsage = SupervisedGraphSage(7, enc2)
#    graphsage.cuda()
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p : p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []
    for batch in range(100):
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(batch_nodes, 
                Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time-start_time)
        print (batch, loss.data[0])

    val_output = graphsage.forward(val) 
    print ("Validation F1-macro:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="macro"))
    print ("Average batch time:", np.mean(times))

In [13]:
# Micro F1
if __name__ == "__main__":
    run_cora()

0 1.937639832496643
1 1.9147496223449707
2 1.887885332107544
3 1.8689135313034058
4 1.8268251419067383
5 1.7783761024475098
6 1.7468388080596924
7 1.7317333221435547
8 1.6568830013275146
9 1.601458191871643
10 1.5310522317886353
11 1.5150411128997803
12 1.4162908792495728
13 1.4085880517959595
14 1.3570396900177002
15 1.2648512125015259
16 1.1958935260772705
17 1.1283855438232422
18 1.0600134134292603
19 1.0936168432235718
20 0.9382601976394653
21 1.0473201274871826
22 0.8194878101348877
23 0.8784224390983582
24 0.7661449909210205
25 0.7471791505813599
26 0.79213547706604
27 1.114971399307251
28 1.0644900798797607
29 1.0470399856567383
30 0.6002852320671082
31 0.5742293000221252
32 0.54297935962677
33 0.533199667930603
34 0.4572354853153229
35 0.4821424186229706
36 0.49609968066215515
37 0.5377072691917419
38 0.4116293489933014
39 0.452080637216568
40 0.38660740852355957
41 0.4094346761703491
42 0.457983136177063
43 0.3903793394565582
44 0.3879888951778412
45 0.3278603255748749
46 0.36

In [15]:
# Macro F1
if __name__ == "__main__":
    run_cora()

0 1.9703139066696167
1 1.9331425428390503
2 1.9050146341323853
3 1.878568410873413
4 1.8451486825942993
5 1.796291708946228
6 1.7535303831100464
7 1.719900131225586
8 1.637424111366272
9 1.5677900314331055
10 1.5062410831451416
11 1.4635809659957886
12 1.346321940422058
13 1.3378537893295288
14 1.2888845205307007
15 1.2039886713027954
16 1.1415718793869019
17 1.0821990966796875
18 1.0097836256027222
19 1.0383180379867554
20 0.8714260458946228
21 0.9593992233276367
22 0.7846949696540833
23 0.8612028956413269
24 0.7367470264434814
25 0.6848942637443542
26 0.6722915172576904
27 0.7663747668266296
28 1.0628385543823242
29 0.9160172343254089
30 0.9285128712654114
31 0.5763152241706848
32 0.5256641507148743
33 0.5218801498413086
34 0.4563329517841339
35 0.4685043692588806
36 0.4938141107559204
37 0.5386505722999573
38 0.41832640767097473
39 0.45402219891548157
40 0.3811386823654175
41 0.3965171277523041
42 0.45558640360832214
43 0.3941207528114319
44 0.3970416188240051
45 0.3375629782676697


In [16]:
def run_cora():
    np.random.seed(1)
    random.seed(1)
    num_nodes = 2708
    feat_data, labels, adj_lists = load_cora()
    features = nn.Embedding(2708, 1433)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)
   # features.cuda()

    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features, 1433, 128, adj_lists, agg1, gcn=True, cuda=False)
    agg2 = MeanAggregator(lambda nodes : enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes : enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
            base_model=enc1, gcn=True, cuda=False)
    agg3 = MeanAggregator(lambda nodes : enc2(nodes).t(), cuda=False)
    enc3 = Encoder(lambda nodes : enc2(nodes).t(), enc1.embed_dim, 128, adj_lists, agg3,
            base_model=enc1, gcn=True, cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5
    enc3.num_samples = 5
    

    graphsage = SupervisedGraphSage(7, enc3)
#    graphsage.cuda()
    rand_indices = np.random.permutation(num_nodes)
    test = rand_indices[:1000]
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p : p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []
    for batch in range(100):
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(batch_nodes, 
                Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time-start_time)
        print (batch, loss.data[0])

    val_output = graphsage.forward(val) 
    print ("Validation F1-micro:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
    print ("Average batch time:", np.mean(times))

In [17]:
# C тремя слоями
# Micro F1

if __name__ == "__main__":
    run_cora()

0 1.949341058731079
1 1.935502529144287
2 1.9246896505355835
3 1.9082671403884888
4 1.8989578485488892
5 1.8841526508331299
6 1.8495084047317505
7 1.8477576971054077
8 1.7956022024154663
9 1.8051812648773193
10 1.7562358379364014
11 1.7065073251724243
12 1.6451534032821655
13 1.6076226234436035
14 1.5564063787460327
15 1.5537155866622925
16 1.532894492149353
17 1.4998431205749512
18 1.4196962118148804
19 1.4206531047821045
20 1.3466644287109375
21 1.3465982675552368
22 1.2867040634155273
23 1.2781925201416016
24 1.2396233081817627
25 1.0949335098266602
26 1.2188154458999634
27 1.115837574005127
28 1.0730262994766235
29 1.0475517511367798
30 1.0555176734924316
31 0.9429539442062378
32 1.0034583806991577
33 0.9392188191413879
34 0.9162774682044983
35 0.9041797518730164
36 0.9169316291809082
37 0.9290920495986938
38 0.7711609601974487
39 0.7382809519767761
40 0.7528645396232605
41 0.7866998910903931
42 0.6675924062728882
43 0.7047138214111328
44 0.6938216090202332
45 0.7771335244178772
46