![Multi-layer Graph Convolutional Network (GCN) with first-order filters,来源:http://tkipf.github.io/graph-convolutional-networks/](images/gcn_web.png)

# 数据集-Cora Dataset

## 下载地址

https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz

## 内容介绍

Cora Dataset是对Machine Learning Paper进行分类的数据集，它包含三个文件:

-- README: 对数据集的介绍;

-- cora.cites: 论文之间的引用关系图。文件中每行包含两个Paper ID， 第一个ID是被引用的Paper ID； 第二个是引用的Paper ID。格式如下: <ID of cited paper> <ID of citing paper>

-- cora.content: 包含了2708篇Paper的信息，每行的数据格式如下: <paper_id> <word_attributes>+ <class_label>。paper id是论文的唯一标识；word_attributes是一个维度为1433的词向量，词向量的每个元素对应一个词，0表示该元素对应的词不在Paper中，1表示该元素对应的词在Paper中。class_label是论文的类别，每篇Paper被映射到如下7个分类之一: Case_Based、Genetic_Algorithms、Neural_Networks、Probabilistic_Methods、Reinforcement_Learning、Rule_Learning、Theory。

In [10]:
import pandas as pd
import numpy as np
# Load cora.content. The file is tab-separated (not space-separated) and has
# no header row, so pandas assigns integer column names.
raw_data = pd.read_csv('data/cora/cora.content', sep='\t', header=None)
num = len(raw_data)  # number of samples (2708 papers in the full file)

# Only the first 10 rows are used below, to keep the printed preview small.
raw_data_sample = raw_data.iloc[:10]

# Drop the first column (paper id) and the last column (class label);
# what remains are the binary word-indicator features.
features = raw_data_sample.iloc[:, 1:-1]

# One-hot encode the class label stored in the last column.
labels = pd.get_dummies(raw_data_sample.iloc[:, -1])

# Each print emits the caption on one line followed by the frame itself.
print("features:", features, sep="\n")
print("labels:", labels, sep="\n")


features
   1     2     3     4     5     6     7     8     9     10    ...  1424  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     0     0     0  ...     0   
5     0     0     0     0     0     0     0     0     0     0  ...     0   
6     0     0     0     0     0     0     0     0     0     0  ...     0   
7     0     0     0     1     0     0     0     0     0     0  ...     0   
8     0     0     0     0     0     0     0     0     0     0  ...     0   
9     0     0     0     0     0     0     0     0     0     0  ...     0   

   1425  1426  1427  1428  1429  1430  1431  1432  1433  
0     0     0     1     0     0     0     0     0     0  
1     0     1     0     0     0     0 

In [5]:
# cora.cites is tab-separated with no header row; each row holds two paper
# ids: the cited paper first, then the citing paper.
raw_data_cites = pd.read_csv('data/cora/cora.cites', sep='\t', header=None)

# Preview the first 10 citation pairs.
print(raw_data_cites.iloc[:10])

    0        1
0  35     1033
1  35   103482
2  35   103515
3  35  1050679
4  35  1103960
5  35  1103985
6  35  1109199
7  35  1112911
8  35  1113438
9  35  1113831


In [14]:
from util import load_data

# load_data() yields the feature matrix, the adjacency matrix and the labels,
# in that order (it also logs the dataset summary itself).
features, adj, labels = load_data()

# Report the three shapes in a single line: features, labels, adjacency.
print('features shape: {}, labels shape: {}, adj shape: {}'.format(
    *(matrix.shape for matrix in (features, labels, adj))))

# Printing the adjacency matrix lists its non-zero (row, col) entries.
print(adj)

Loading cora dataset...
Dataset has 2708 nodes, 5429 edges, 1433 features.
features shape: (2708, 1433), labels shape: (2708, 7), adj shape: (2708, 2708)
  (0, 8)	1.0
  (0, 14)	1.0
  (0, 258)	1.0
  (0, 435)	1.0
  (0, 544)	1.0
  (1, 344)	1.0
  (2, 410)	1.0
  (2, 471)	1.0
  (2, 552)	1.0
  (2, 565)	1.0
  (3, 197)	1.0
  (3, 463)	1.0
  (3, 601)	1.0
  (4, 170)	1.0
  (5, 490)	1.0
  (5, 2164)	1.0
  (6, 251)	1.0
  (6, 490)	1.0
  (7, 258)	1.0
  (8, 0)	1.0
  (8, 14)	1.0
  (8, 258)	1.0
  (8, 435)	1.0
  (8, 751)	1.0
  (9, 308)	1.0
  :	:
  (2698, 2697)	1.0
  (2698, 2700)	1.0
  (2699, 2153)	1.0
  (2700, 2697)	1.0
  (2700, 2698)	1.0
  (2701, 2247)	1.0
  (2701, 2263)	1.0
  (2702, 881)	1.0
  (2702, 2624)	1.0
  (2703, 1221)	1.0
  (2703, 1409)	1.0
  (2703, 2200)	1.0
  (2704, 209)	1.0
  (2704, 2407)	1.0
  (2705, 1784)	1.0
  (2705, 1839)	1.0
  (2705, 1840)	1.0
  (2705, 2216)	1.0
  (2706, 1046)	1.0
  (2706, 1138)	1.0
  (2706, 1640)	1.0
  (2706, 1752)	1.0
  (2707, 774)	1.0
  (2707, 1389)	1.0
  (2707, 2344)	1.

拆分训练集、测试集和验证集

In [29]:
from util import get_splits, preprocess_adj

DATASET = 'cora'

# Reload the full dataset: X = node features, A = adjacency, y = labels.
X, A, y = load_data(dataset=DATASET)

# get_splits() returns the label arrays and index sets for train/val/test,
# plus a boolean mask marking the training nodes.
(y_train, y_val, y_test,
 idx_train, idx_val, idx_test,
 train_mask) = get_splits(y)

print(train_mask)

# Row-normalise the features in place: every row is divided by its own sum.
# NOTE(review): a row whose sum is 0 would produce NaN/inf here - confirm
# that every paper has at least one active word.
X /= X.sum(axis=1).reshape(-1, 1)

print(X)

# Normalised adjacency used by the GCN layers; the second argument is the
# same flag the training cell passes as SYM_NORM (symmetric normalisation).
A_ = preprocess_adj(A, True)

print(A_)

Loading cora dataset...
Dataset has 2708 nodes, 5429 edges, 1433 features.
[ True  True  True ... False False False]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
  (0, 0)	0.16666666666666666
  (0, 8)	0.16666666666666666
  (0, 14)	0.09128709291752768
  (0, 258)	0.11785113019775792
  (0, 435)	0.16666666666666666
  (0, 544)	0.18257418583505536
  (1, 1)	0.5000000000000001
  (1, 344)	0.12500000000000003
  (2, 2)	0.19999999999999998
  (2, 410)	0.18257418583505536
  (2, 471)	0.15811388300841897
  (2, 552)	0.06819943394704735
  (2, 565)	0.05031546054266276
  (3, 3)	0.25
  (3, 197)	0.22360679774997896
  (3, 463)	0.2041241452319315
  (3, 601)	0.2041241452319315
  (4, 4)	0.5000000000000001
  (4, 170)	0.408248290463863
  (5, 5)	0.3333333333333333
  (5, 490)	0.2041241452319315
  (5, 2164)	0.23570226039551584
  (6, 6)	0.3333333333333333
  (6, 251)	0.19245008972987523
  (6, 490)	0.2041241452

In [24]:
# https://github.com/tkipf/keras-gcn

from __future__ import print_function

import tensorflow as tf

class GraphConvolution(tf.keras.layers.Layer):
    """Basic graph convolution layer as in https://arxiv.org/abs/1609.02907.

    The layer is called on a list ``[features, basis_1, ..., basis_support]``.
    Every basis matrix is multiplied with the feature matrix, the products
    are concatenated along the feature axis and the result is projected by a
    single kernel of shape ``(input_dim * support, units)``.

    Args:
        units: Number of output features per node.
        support: Number of basis matrices that follow the features in the
            input list. Must be at least 1.
        activation: Activation name or callable; ``None`` means linear.
        use_bias: Whether to add a learned bias vector to the output.
        kernel_initializer: Initializer for the kernel weights.
        bias_initializer: Initializer for the bias vector.
        kernel_regularizer: Regularizer applied to the kernel weights.
        bias_regularizer: Regularizer applied to the bias vector.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``...).

    Raises:
        ValueError: If ``support`` is smaller than 1, or (in ``build``) if
            the feature input is not 2-dimensional.
    """

    def __init__(self, units, support=1,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        # Validate explicitly: `assert` is stripped when Python runs with -O.
        if support < 1:
            raise ValueError(
                '`support` must be >= 1, got {}'.format(support))

        super().__init__(**kwargs)
        self.units = units
        self.support = support
        # Resolve the activation once so that `call` can always apply it;
        # `activations.get(None)` yields the identity (linear) function.
        self.activation = tf.keras.activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)

        self.supports_masking = True

    def build(self, input_shapes):
        """Create the kernel (and optional bias) from the feature input shape."""
        features_shape = input_shapes[0]
        if len(features_shape) != 2:
            raise ValueError(
                'Expected 2-D node features, got shape {}'.format(
                    features_shape))
        input_dim = features_shape[1]

        # One kernel covers all supports: their outputs are concatenated
        # along the feature axis before the projection in `call`.
        self.kernel = self.add_weight(
            shape=(input_dim * self.support, self.units),
            initializer=self.kernel_initializer,
            name='kernel',
            regularizer=self.kernel_regularizer)
        if self.use_bias:
            self.bias = self.add_weight(
                shape=(self.units,),
                initializer=self.bias_initializer,
                name='bias',
                regularizer=self.bias_regularizer)
        else:
            self.bias = None

        self.built = True

    @staticmethod
    def _propagate(basis, features):
        """Return ``basis @ features`` for a dense or sparse basis matrix."""
        if isinstance(basis, tf.SparseTensor):
            return tf.sparse.sparse_dense_matmul(basis, features)
        return tf.matmul(basis, features)

    def call(self, inputs, mask=None):
        """Apply the graph convolution to ``[features, *basis]``."""
        features = inputs[0]
        basis = inputs[1:]

        supports = [self._propagate(basis[i], features)
                    for i in range(self.support)]
        output = tf.matmul(tf.concat(supports, axis=1), self.kernel)

        # Branch on the Python flag rather than on the weight itself: the
        # truth value of a variable/tensor is not a reliable "is set" test.
        if self.use_bias:
            output = output + self.bias

        return self.activation(output)

In [25]:
from __future__ import print_function

import tensorflow as tf
from tf.keras.layers import Input, Dropout
from tf.keras.models import Model
from tf.keras.optimizers import Adam
from tf.keras.regularizers import l2

from utils import *

import time

# Define parameters
DATASET = 'cora'
FILTER = 'localpool'  # 'chebyshev'
MAX_DEGREE = 2  # maximum polynomial degree
SYM_NORM = True  # symmetric (True) vs. left-only (False) normalization
NB_EPOCH = 200
PATIENCE = 10  # early stopping patience

if FILTER == 'localpool':
    """ Local pooling filters (see 'renormalization trick' in Kipf & Welling, arXiv 2016) """
    print('Using local pooling filters...')
    A_ = preprocess_adj(A, SYM_NORM)
    support = 1
    graph = [X, A_]
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

elif FILTER == 'chebyshev':
    """ Chebyshev polynomial basis filters (Defferard et al., NIPS 2016)  """
    print('Using Chebyshev polynomial basis filters...')
    L = normalized_laplacian(A, SYM_NORM)
    L_scaled = rescale_laplacian(L)
    T_k = chebyshev_polynomial(L_scaled, MAX_DEGREE)
    support = MAX_DEGREE + 1
    graph = [X]+T_k
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True) for _ in range(support)]

else:
    raise Exception('Invalid filter type.')

# Define model architecture
# NOTE: We pass arguments for graph convolutional layers as a list of tensors.
# This is somewhat hacky, more elegant options would require rewriting the Layer base class.
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', kernel_regularizer=l2(5e-4))([H]+G)
H = Dropout(0.5)(H)
Y = GraphConvolution(y.shape[1], support, activation='softmax')([H]+G)

# Compile model
model = Model(inputs=[X_in]+G, outputs=Y)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01))

# Helper variables for main training loop
wait = 0
preds = None
best_val_loss = 99999

# Fit
for epoch in range(1, NB_EPOCH+1):

    # Log wall-clock time
    t = time.time()

    # Single training iteration (we mask nodes without labels for loss calculation)
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=A.shape[0], epochs=1, shuffle=False, verbose=0)

    # Predict on full dataset
    preds = model.predict(graph, batch_size=A.shape[0])

    # Train / validation scores
    train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
                                                   [idx_train, idx_val])
    print("Epoch: {:04d}".format(epoch),
          "train_loss= {:.4f}".format(train_val_loss[0]),
          "train_acc= {:.4f}".format(train_val_acc[0]),
          "val_loss= {:.4f}".format(train_val_loss[1]),
          "val_acc= {:.4f}".format(train_val_acc[1]),
          "time= {:.4f}".format(time.time() - t))

    # Early stopping
    if train_val_loss[1] < best_val_loss:
        best_val_loss = train_val_loss[1]
        wait = 0
    else:
        if wait >= PATIENCE:
            print('Epoch {}: early stopping'.format(epoch))
            break
        wait += 1

# Testing
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results:",
      "loss= {:.4f}".format(test_loss[0]),
      "accuracy= {:.4f}".format(test_acc[0]))

ModuleNotFoundError: No module named 'tf'