In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!pip install dgl==0.6.1
!pip install torch==1.9.1

Collecting dgl==0.6.1
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 33.2 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1
Collecting torch==1.9.1
  Downloading torch-1.9.1-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 5.6 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.9.1 which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.9.1 which is incompatible.
torchaudio 0.11.0+cu113 requires torch==1.11.0, but you

In [21]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import pandas as pd
import networkx as nx
import torch as th
import random

In [22]:
df = pd.read_csv('/content/gdrive/MyDrive/PP-Pathways_ppi.csv', header = None)
#ds = dgl.data.CSVDataset('/path/to/dataset')
random.seed(47)

In [23]:
G=nx.from_pandas_edgelist(df,  0,  1,create_using=nx.Graph())
g=dgl.from_networkx(G)
g=dgl.to_bidirected(g)

In [24]:
print("type: ",g.is_multigraph)
print("No of nodes: ",g.number_of_nodes())
print("No of edges: ",g.number_of_edges())

type:  False
No of nodes:  21557
No of edges:  680989


In [25]:
# creating features for dataset i,e; `Degree`
feature=g.in_degrees()
feature=torch.reshape(feature, (21557,1))
g.ndata['feat']=feature

#### formulate the link prediction problem as a binary classification problem as follows:

- Treat the edges in the graph as positive examples.
- Sample a number of non-existent edges (i.e. node pairs with no edges between them) as negative examples.
- Divide the positive examples and negative examples into a training set and a test set.
- Evaluate the model with any binary classification metric such as Area Under Curve (AUC).

- randomly pick 10% of the edges for positive examples in the test set, and leave the rest for the training set. Then samples the same number of edges for negative examples in both sets.

In [26]:
u, v = g.edges()
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)  #  Disorder the order 

test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))  #  Use a full 1 vector , And corresponding u and v To construct adjacency matrix 
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())  #  Obtain the negative sampling adjacency matrix  adj.todense() Indicates that the sparse matrix adj Become a dense matrix 
neg_u, neg_v = np.where(adj_neg != 0)  #  Negative sampling on adjacency matrix 

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [27]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [28]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [29]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [30]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [31]:

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)



In [32]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 24)
pred = DotPredictor()

In [33]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(350):
    # forward
    h = model(train_g, train_g.ndata['feat'].float())
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))


In epoch 0, loss: 358703.3125
In epoch 5, loss: 68061.453125
In epoch 10, loss: 19094.791015625
In epoch 15, loss: 16641.2578125
In epoch 20, loss: 11313.10546875
In epoch 25, loss: 7252.27197265625
In epoch 30, loss: 4897.7392578125
In epoch 35, loss: 2768.9716796875
In epoch 40, loss: 1808.370849609375
In epoch 45, loss: 1512.4710693359375
In epoch 50, loss: 1346.7034912109375
In epoch 55, loss: 1140.1376953125
In epoch 60, loss: 926.1487426757812
In epoch 65, loss: 775.4495849609375
In epoch 70, loss: 698.5653686523438
In epoch 75, loss: 658.1506958007812
In epoch 80, loss: 622.3213500976562
In epoch 85, loss: 585.63134765625
In epoch 90, loss: 552.7191162109375
In epoch 95, loss: 523.5584106445312
In epoch 100, loss: 497.858642578125
In epoch 105, loss: 475.5787353515625
In epoch 110, loss: 454.7919921875
In epoch 115, loss: 434.9115905761719
In epoch 120, loss: 416.3943176269531
In epoch 125, loss: 399.2481384277344
In epoch 130, loss: 382.4240417480469
In epoch 135, loss: 367.050

In [34]:
# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

AUC 0.8155434542306832
