In [1]:
import dgl
import torch
from transformers import *
from transformers import AdamW
import torch.utils.data as Data
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import pickle as pk
import copy
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import torch.nn.functional as F
from IPython.display import display,HTML
import os
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch.nn.utils.rnn import pack_sequence
from torch.nn import CrossEntropyLoss, MSELoss
import math
device=torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
import argparse
import glob
import json
import logging
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook as tqdm
import torch.utils.data as Data
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

gcn_msg = fn.copy_src(src='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')

Using backend: pytorch


## GCN GAT

In [2]:
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, feature):
        # Creating a local scope so that all the stored ndata and edata
        # (such as the `'h'` ndata below) are automatically popped out
        # when the scope exits.
        with g.local_scope():
            g.ndata['h'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            h = g.ndata['h']
            return self.linear(h)
class GATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim):
        super(GATLayer, self).__init__()
        self.g = g
        # equation (1)
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)

    def edge_attention(self, edges):
        # edge UDF for equation (2)
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e': F.leaky_relu(a)}

    def message_func(self, edges):
        # message UDF for equation (3) & (4)
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        # reduce UDF for equation (3) & (4)
        # equation (3)
        alpha = F.softmax(nodes.mailbox['e'], dim=1)
        # equation (4)
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
        return {'h': h}

    def forward(self, h):
        # equation (1)
        z = self.fc(h)
        self.g.ndata['z'] = z
        # equation (2)
        self.g.apply_edges(self.edge_attention)
        # equation (3) & (4)
        self.g.update_all(self.message_func, self.reduce_func)
        return self.g.ndata.pop('h')
class MultiHeadGATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList()
        for i in range(num_heads):
            self.heads.append(GATLayer(g, in_dim, out_dim))
        self.merge = merge

    def forward(self, h):
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        else:
            # merge using average
            return torch.mean(torch.stack(head_outs))
class GAT(nn.Module):
    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super(GAT, self).__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # Be aware that the input dimension is hidden_dim*num_heads since
        # multiple head outputs are concatenated together. Also, only
        # one attention head in the output layer.
        self.layer2 = MultiHeadGATLayer(g, hidden_dim * num_heads, out_dim, 1)

    def forward(self, h):
        h = self.layer1(h)
        h = F.elu(h)
        h = self.layer2(h)
        return h

In [3]:
class Net(nn.Module):
    def __init__(self,input_features,hidden_size,num_labels):
        super(Net, self).__init__()
        self.layer1 = GCNLayer(input_features,hidden_size)
        self.layer2 = GCNLayer(hidden_size, num_labels)
    
    def forward(self, g, features):
        x = F.relu(self.layer1(g, features))
        x = self.layer2(g, x)
        return x

In [27]:
from dgl.data import citation_graph as citegrh
import networkx as nx
def load_cora_data():
    data = citegrh.load_cora()
    features = th.FloatTensor(data.features)
    labels = th.LongTensor(data.labels)
    train_mask = th.BoolTensor(data.train_mask)
    test_mask = th.BoolTensor(data.test_mask)
    g = DGLGraph(data.graph)
    return g, features, labels, train_mask, test_mask
def build_graph():
    num_of_nodes=max(pd.read_csv("./text_classify/data/text.csv")["id"])+1
    g = dgl.DGLGraph()
    g.add_nodes(num_of_nodes)
    edge_list=pd.read_csv("./text_classify/data/reference.csv").values

    src, dst = tuple(zip(*edge_list))
    g.add_edges(src, dst)
    full_labeled_nodes=pd.read_csv("./text_classify/data/train.csv")['id'].values
    full_labels=pd.read_csv("./text_classify/data/train.csv")['label'].values
    id2features=pk.load(open("./text_classify/data/bert_hidden_and_probs.pk","rb"))
    features=np.zeros((len(id2features),768))
    for key,value in id2features.items():
        features[key]=value[1]
    features=torch.tensor(features).float()
    labels=np.zeros(g.number_of_nodes())
    labels[full_labeled_nodes]=full_labels
    labels = torch.LongTensor(labels)  # their labels are different
    train_labeled_nodes=full_labeled_nodes[:10000]
    val_labeled_nodes=full_labeled_nodes[10000:]
    idxs=np.zeros(g.number_of_nodes())
    idxs[train_labeled_nodes]=1
    train_mask=torch.BoolTensor(idxs)
    idxs=np.zeros(g.number_of_nodes())
    idxs[val_labeled_nodes]=1
    val_mask=torch.BoolTensor(idxs)
    idxs=np.ones(g.number_of_nodes())
    idxs[full_labeled_nodes]=0
    test_mask=torch.BoolTensor(idxs)
    return g, features, labels, train_mask, val_mask,test_mask

In [5]:
def evaluate(model, g, features, labels, mask):
    model.eval()
    with th.no_grad():
#         logits = model( features) #GAT
        logits=model(g,features) #GCN
        logits = logits[mask]
        labels = labels[mask]
        _, indices = th.max(logits, dim=1)
        correct = th.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)
def predict(model, g, features, labels, mask):
    model.eval()
    pred_indices=[]
    with th.no_grad():
#         logits = model( features) #GAT
        logits=model(g,features) #GCN
        logits = logits[mask]
        labels = labels[mask]
        _, indices = th.max(logits, dim=1)
        pred_indices.append(indices.detach().cpu().numpy())
        return  np.concatenate(pred_indices)

## 建图及读取特征

In [6]:
g, features, labels, train_mask, val_mask,test_mask=build_graph()
print('We have %d nodes.' % g.number_of_nodes())
print('We have %d edges.' % g.number_of_edges())

We have 25561 nodes.
We have 73313 edges.


In [7]:
features.shape

torch.Size([25561, 768])

## 训练

In [20]:
net = GAT(g,
          in_dim=features.size()[1],
          hidden_dim=384,
          out_dim=5,
          num_heads=2)

In [8]:
net=Net(768,384,5)

In [56]:
net

Net(
  (layer1): GCNLayer(
    (linear): Linear(in_features=768, out_features=384, bias=True)
  )
  (layer2): GCNLayer(
    (linear): Linear(in_features=384, out_features=5, bias=True)
  )
)

In [10]:
import time
import numpy as np
# g, features, labels, train_mask, test_mask = load_cora_data()
optimizer = th.optim.Adam(net.parameters(), lr=1e-3)
dur = []
# net=net.to(device)
# g=g.to(device)
# features=features.to(device)
# train_mask=train_mask.to(device)
# test_mask=test_mask.to(device)
for epoch in range(50):
    if epoch >=3:
        t0 = time.time()

    net.train()
#     logits = net(features) #GAT
    logits=net(g,features) #GCN
    logp = F.log_softmax(logits, 1)
    logp=logp.cpu()
    loss = F.nll_loss(logp[train_mask], labels[train_mask])
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch >=3:
        dur.append(time.time() - t0)
    
    acc = evaluate(net, g, features, labels, val_mask)
    print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), acc, np.mean(dur)))

Epoch 00000 | Loss 1.3490 | Test Acc 0.4192 | Time(s) nan
Epoch 00001 | Loss 1.4897 | Test Acc 0.4509 | Time(s) nan
Epoch 00002 | Loss 1.3383 | Test Acc 0.4520 | Time(s) nan
Epoch 00003 | Loss 1.3357 | Test Acc 0.4473 | Time(s) 1.1836
Epoch 00004 | Loss 1.3587 | Test Acc 0.4484 | Time(s) 1.1631
Epoch 00005 | Loss 1.3597 | Test Acc 0.4462 | Time(s) 1.2757
Epoch 00006 | Loss 1.3558 | Test Acc 0.4476 | Time(s) 1.2471
Epoch 00007 | Loss 1.3509 | Test Acc 0.4491 | Time(s) 1.2354
Epoch 00008 | Loss 1.3342 | Test Acc 0.4484 | Time(s) 1.2589
Epoch 00009 | Loss 1.3096 | Test Acc 0.4480 | Time(s) 1.2637
Epoch 00010 | Loss 1.2844 | Test Acc 0.4494 | Time(s) 1.2718
Epoch 00011 | Loss 1.2675 | Test Acc 0.4502 | Time(s) 1.2680
Epoch 00012 | Loss 1.2619 | Test Acc 0.4494 | Time(s) 1.3304
Epoch 00013 | Loss 1.2559 | Test Acc 0.4476 | Time(s) 1.3641
Epoch 00014 | Loss 1.2445 | Test Acc 0.4473 | Time(s) 1.3706
Epoch 00015 | Loss 1.2337 | Test Acc 0.4476 | Time(s) 1.3608
Epoch 00016 | Loss 1.2283 | Test 

## 预测

In [65]:
test_mask_array=test_mask.to(torch.long).detach().numpy()
test_ids=list(filter(lambda x:x>0,list(np.arange(test_mask_array.shape[0])*test_mask_array)))
test_pred_labels=predict(net,g,features,labels,test_mask)
test_submit=pd.read_csv("./text_classify/data/sample.csv")
for idx in tqdm(range(len(test_ids))):
    index=test_submit.query("id==%d"%test_ids[idx])['label'].index[0]
    test_submit.loc[index,'label']=test_pred_labels[idx]
test_submit.to_csv("./text_classify/data/bert_gcn.csv",index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(IntProgress(value=0, max=12782), HTML(value='')))


