<a href="https://colab.research.google.com/github/ananyas168/GNN/blob/main/link_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dgl-cu110 
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
%matplotlib inline
!pip install scikit-plot
import scikitplot as skplt

Collecting dgl-cu110
[?25l  Downloading https://files.pythonhosted.org/packages/76/e5/8536dc2f7048a91c63c913ece8c7b2e7f3e438e741aa4c2d847395486617/dgl_cu110-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (39.9MB)
[K     |████████████████████████████████| 39.9MB 33.9MB/s 
Installing collected packages: dgl-cu110
Successfully installed dgl-cu110-0.6.1
Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


Collecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [None]:
import dgl.data

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]
g = dgl.add_self_loop(g)

Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /root/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.


In [None]:
# Split edge set for training and testing
print(g.edges())
u, v = g.edges()
print(len(u))
print(len(v))
eids = np.arange(g.number_of_edges())
print(eids)
eids = np.random.permutation(eids)
print(len(eids))
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
print(adj.todense())
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
print(adj_neg)
neg_u, neg_v = np.where(adj_neg != 0)
print(len(neg_u))#, neg_v)
print(g.number_of_edges())
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

(tensor([   0,    0,    0,  ..., 2705, 2706, 2707]), tensor([ 633, 1862, 2582,  ..., 2705, 2706, 2707]))
13264
13264
[    0     1     2 ... 13261 13262 13263]
13264
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 1.]
 [0. 0. 0. ... 0. 1. 1.]]
[[-1.  1.  1. ...  1.  1.  1.]
 [ 1. -1.  0. ...  1.  1.  1.]
 [ 1.  0. -1. ...  1.  1.  1.]
 ...
 [ 1.  1.  1. ... -1.  1.  1.]
 [ 1.  1.  1. ...  1. -1.  0.]
 [ 1.  1.  1. ...  1.  0. -1.]]
7322708
13264


In [None]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [None]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [None]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [None]:
import dgl.function as fn
import pandas as pd

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [None]:
#model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
from sklearn.metrics import accuracy_score
def train(train_g,train_pos_g,train_neg_g, model):
    pred = DotPredictor()

    train=[]
    test=[]

    def compute_loss(pos_score, neg_score):
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
        #print(scores)
        #print(labels)
        return F.binary_cross_entropy_with_logits(scores, labels)

    def compute_auc(pos_score, neg_score):
        scores = torch.cat([pos_score, neg_score]).numpy()
        
        labels = torch.cat(
            [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
        return roc_auc_score(labels, scores)
    def accuracy(pos_score, neg_score):
        scores = torch.cat([pos_score, neg_score]).detach().numpy()
        scores = (scores>0.5)
        labels = torch.cat(
            [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
        print(scores)    
        return accuracy_score(labels, scores)    
    # ----------- 3. set up loss and optimizer -------------- #
    # in this case, loss will in training loop
    optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

    # ----------- 4. training -------------------------------- #
    all_logits = []
    for e in range(100):
        # forward
        h = model(train_g, train_g.ndata['feat'])
        pos_train_score = pred(train_pos_g, h)
        print('pos_train_score',pos_train_score)
        neg_train_score = pred(train_neg_g, h)
        print('neg_train_score',neg_train_score)
        loss = compute_loss(pos_train_score, neg_train_score)
        # train_auc = compute_auc(pos_train_score, neg_train_score)
        # pos_test_score = pred(test_pos_g, h)
        # neg_test_score = pred(test_neg_g, h)
        # test_loss = compute_loss(pos_test_score, neg_test_score)
        # #test_auc = compute_auc(pos_test_score, neg_test_score)
        # train.append(loss.item())
        # test.append(test_loss.item())
        

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print('In epoch {}, loss: {}'.format(e, loss))

    # ----------- 5. check results ------------------------ #
    # df= pd.DataFrame()
    # df['test_loss']= test
    # df['train_loss'] = train
    
    # print(df)
    # ax = df.plot.line(rot=0,ylim=(0,1),figsize=[4,4],title='comparative_accuracy_plot')#,colormap='flare')
    # #ax = bar_data.plot.bar(rot=0)
    # #ax.label_outer('comparative_accuracy_bar_plot')
    # fig=ax.get_figure()
    # fig.savefig(path)
    # ax.set_xlabel("iteration")
    # ax.set_ylabel("loss")
    from sklearn.metrics import roc_auc_score
    with torch.no_grad():
        pos_score = pred(test_pos_g, h)
        neg_score = pred(test_neg_g, h)
        print(neg_score)
        print('AUC', compute_auc(pos_score, neg_score))
        print('Auccuracy', accuracy(pos_score, neg_score))
        #confusion_matrix(pos_score, neg_score)


    

In [None]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
train(train_g,train_pos_g,train_neg_g, model)

pos_train_score tensor([0.0029, 0.0020, 0.0007,  ..., 0.0032, 0.0066, 0.0023],
       grad_fn=<SelectBackward>)
neg_train_score tensor([ 0.0007,  0.0012,  0.0006,  ...,  0.0008, -0.0003,  0.0005],
       grad_fn=<SelectBackward>)
In epoch 0, loss: 0.6927748322486877
pos_train_score tensor([0.0096, 0.0225, 0.0139,  ..., 0.0251, 0.0418, 0.0285],
       grad_fn=<SelectBackward>)
neg_train_score tensor([0.0120, 0.0156, 0.0121,  ..., 0.0123, 0.0071, 0.0012],
       grad_fn=<SelectBackward>)
pos_train_score tensor([0.0367, 0.0646, 0.0400,  ..., 0.0617, 0.1036, 0.1000],
       grad_fn=<SelectBackward>)
neg_train_score tensor([0.0379, 0.0441, 0.0280,  ..., 0.0384, 0.0179, 0.0190],
       grad_fn=<SelectBackward>)
pos_train_score tensor([0.0730, 0.0963, 0.0646,  ..., 0.0828, 0.1883, 0.2175],
       grad_fn=<SelectBackward>)
neg_train_score tensor([0.0538, 0.0566, 0.0111,  ..., 0.0478, 0.0084, 0.0436],
       grad_fn=<SelectBackward>)
pos_train_score tensor([0.1212, 0.1188, 0.0931,  ..., 0.0831,

In [None]:
from dgl.nn import GraphConv
g = dgl.add_self_loop(g)
class GCN(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, h_feats, allow_zero_in_degree=True)
        
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

     
    #     class gin_conv(nn.Module):
    # def __init__(self, in_feats, h_feats):
    #     super(gin_conv, self).__init__()
    #     self.conv1 =  GINConv(in_feats, h_feats, 'max')
    #     self.conv2 = GINConv(h_feats, h_feats, 'max')

    # def forward(self, g, in_feat):
    #     h = self.conv1(g, in_feat)
    #     h = F.relu(h)
    #     h = self.conv2(g, h)
    #     return h

# Create the model with given dimensions
model2 = GCN(train_g.ndata['feat'].shape[1], 16)
train(train_g,train_pos_g,train_neg_g, model2)

In epoch 0, loss: 0.6931235194206238
In epoch 5, loss: 0.6827359795570374
In epoch 10, loss: 0.6763810515403748
In epoch 15, loss: 0.668129026889801
In epoch 20, loss: 0.6570489406585693
In epoch 25, loss: 0.6411725282669067
In epoch 30, loss: 0.6169807314872742
In epoch 35, loss: 0.5817654132843018
In epoch 40, loss: 0.5407572984695435
In epoch 45, loss: 0.5216805934906006
In epoch 50, loss: 0.5121780633926392
In epoch 55, loss: 0.501544177532196
In epoch 60, loss: 0.49046215415000916
In epoch 65, loss: 0.4792875051498413
In epoch 70, loss: 0.4664355218410492
In epoch 75, loss: 0.45692914724349976
In epoch 80, loss: 0.45091113448143005
In epoch 85, loss: 0.4439762532711029
In epoch 90, loss: 0.43805623054504395
In epoch 95, loss: 0.4325326979160309
tensor([ 0.4324, -0.5252, -1.4770,  ...,  1.1590, -1.2474, -1.4790])
AUC 0.8818285638887183
[ True  True  True ...  True False False]
Auccuracy 0.7816742081447964


In [None]:
from dgl.nn.pytorch.conv import GATConv

class GAT(torch.nn.Module):
    def __init__(self, in_feats,hidden_dim,h_feats, num_heads):
        super(GAT, self).__init__()
        self.layer1 = GATConv(in_feats, hidden_dim, num_heads, allow_zero_in_degree=True)
        self.layer2 = GATConv(hidden_dim * num_heads, h_feats, 1, allow_zero_in_degree=True)
    def forward(self, g, h):
        h = self.layer1(g, h)
        h = h.view(-1, h.size(1) * h.size(2))
        h = F.elu(h)
        h = self.layer2(g, h)
        h = h.squeeze() 
        return h

model3 =GAT(train_g.ndata['feat'].shape[1], 16,30,2)
train(train_g,train_pos_g,train_neg_g, model3)


In epoch 0, loss: 0.6927238702774048
In epoch 5, loss: 0.6795674562454224
In epoch 10, loss: 0.6202573180198669
In epoch 15, loss: 0.5692955255508423
In epoch 20, loss: 0.5319256782531738
In epoch 25, loss: 0.5131011605262756
In epoch 30, loss: 0.4980435073375702
In epoch 35, loss: 0.478251188993454
In epoch 40, loss: 0.4710206985473633
In epoch 45, loss: 0.4587682783603668
In epoch 50, loss: 0.44706839323043823
In epoch 55, loss: 0.43710261583328247
In epoch 60, loss: 0.42780038714408875
In epoch 65, loss: 0.4187321364879608
In epoch 70, loss: 0.4091523289680481
In epoch 75, loss: 0.3995278477668762
In epoch 80, loss: 0.3898409307003021
In epoch 85, loss: 0.38005444407463074
In epoch 90, loss: 0.3705344498157501
In epoch 95, loss: 0.36109089851379395
tensor([ 0.7568,  0.2325, -2.2273,  ..., -0.0855, -0.4884, -2.2936])
AUC 0.9234130477808946
[ True  True  True ... False False False]
Auccuracy 0.802790346907994
