## 1. Environment setup

In [1]:

import os
import dgl
import torch
import numpy as np
import pandas as pd
import networkx as nx
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from dgl.data import DGLDataset

Using backend: pytorch


## 2. Setting up the Dataset

In [102]:
class PolypharmacyDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='polypharmacy')

    def process(self):
        edges = pd.read_csv('../data/GNN_edges-toy.csv')
        properties = pd.read_csv('../data/GNN_properties-toy.csv')
        drug_comb = pd.read_csv('../data/GNN-TWOSIDE-train-PSE-964-toy.csv', sep=',') # or 3347
        features = pd.read_csv('../data/GNN-GSE_full_pkd_norm.csv', index_col = 'ProteinID', sep=',')
        
        self.graphs = []
        self.labels = []
        self.comb_graphs = []
        self.comb_labels = []
        
        num_features = len(features.columns) # no. of PSEs
        self.dim_nfeats = num_features
        self.gclasses = num_features

        # Create a graph for each graph ID from the edges table.
        # First process the properties table into two dictionaries with graph IDs as keys.
        # The label and number of nodes are values.
        label_dict = {}
        num_nodes_dict = {}
        
        for _, row in properties.iterrows():
            label_dict[row['graph_id']] = row['label']
            num_nodes_dict[row['graph_id']] = row['num_nodes']

        # For the edges, first group the table by graph IDs.
        edges_group = edges.groupby('graph_id')
        
        #Node features or PSEs dictionary
        feature_dic = {i+1:torch.tensor(features.loc[i+1,]) for i in range(len(features))}
        
        # For each graph ID...
        for graph_id in edges_group.groups:
            # Find the edges as well as the number of nodes and its label.
            edges_of_id = edges_group.get_group(graph_id)
            src = edges_of_id['src'].to_numpy()
            dst = edges_of_id['dst'].to_numpy()
            num_nodes = num_nodes_dict[graph_id]
            label = label_dict[graph_id]
            
            # Create a graph and add it to the list of graphs and labels.
            g = dgl.graph((src, dst), num_nodes=num_nodes)
            
            # Need to convert proteinsIDs for feature assigning
            prot_ids = edges_of_id['src_prot'].unique().tolist()
            for prot in edges_of_id['dst_prot'].unique().tolist():
                if prot not in prot_ids:
                    prot_ids.append(prot)
            convert_prot = {prot_ids.index(prot):prot for prot in prot_ids}
            
            #Adding features of each node
            g.ndata['PSE'] = torch.zeros(g.num_nodes(), num_features)
            for node in g.nodes().tolist():
                g.ndata['PSE'][node] = feature_dic[convert_prot[node]]
                
            self.graphs.append(g)
            self.labels.append(label)
        
        # conver drugid to their respective graph id
        #drug2graph = {properties['label'][i]:i for i in range(len(properties))} 
        drug2graph = {self.labels[i]:i for i in range(len(self.labels))} 

        for i in range(len(drug_comb)):
            row = drug_comb.loc[i]
            g1 = self.graphs[drug2graph[row[0]]] # Drug1 graph
            g2 = self.graphs[drug2graph[row[1]]] # Drug2 graph  
            self.comb_graphs.append([g1,g2])
            self.comb_labels.append(torch.tensor(row[2:])) # PSE values

            
        # Convert the label list to tensor for saving.
        #self.comb_labels = torch.LongTensor(self.comb_labels)

    def __getitem__(self, i):
       # return self.comb_graphs[i], self.comb_labels[i]
        return self.comb_graphs[i], self.comb_labels[i]

    def __len__(self):
        return len(self.comb_graphs)
    

dataset = PolypharmacyDataset()
graph, label = dataset[0]
print(graph)


[Graph(num_nodes=722, num_edges=38344,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=4515, num_edges=799034,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})]


In [110]:
print(graph)
print(graph[0])
print(graph[1])

[Graph(num_nodes=722, num_edges=38344,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=4515, num_edges=799034,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})]
Graph(num_nodes=722, num_edges=38344,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})
Graph(num_nodes=4515, num_edges=799034,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})


In [103]:
graph, label = dataset[0]
print(graph)

[Graph(num_nodes=722, num_edges=38344,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=4515, num_edges=799034,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})]


## 3. Data loading and batch

In [97]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))


train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)

In [111]:
len(dataset)

18

In [128]:
it = iter(train_dataloader)
batch = next(it)
print(batch[0][1])

Graph(num_nodes=14495, num_edges=2443660,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})


In [151]:
batched_graph, labels = batch
print('Number of nodes for each graph1 element in the batch:', batched_graph[0].batch_num_nodes())
print('Number of nodes for each graph2 element in the batch:', batched_graph[1].batch_num_nodes())
print('Number of edges for each graph1 element in the batch:', batched_graph[0].batch_num_edges())
print('Number of edges for each graph2 element in the batch:', batched_graph[1].batch_num_edges())

# Recover the original graph elements from the minibatch
graphs = dgl.unbatch(batched_graph[0])
print('The original graphs1 in the minibatch:')
print(graphs)

Number of nodes for each graph1 element in the batch: tensor([7585,  722, 5656, 7585, 7585])
Number of nodes for each graph2 element in the batch: tensor([2756, 4515,  846, 5656,  722])
Number of edges for each graph1 element in the batch: tensor([2270116,   38344, 1269352, 2270116, 2270116])
Number of edges for each graph2 element in the batch: tensor([ 292614,  799034,   44316, 1269352,   38344])
The original graphs1 in the minibatch:
[Graph(num_nodes=7585, num_edges=2270116,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=722, num_edges=38344,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=5656, num_edges=1269352,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nodes=7585, num_edges=2270116,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={}), Graph(num_nod

## 4. GNN Model: Siamese GCN

In [174]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)
        self.in_feats = in_feats
        self.num_classes=num_classes
        
    def forward(self, g):
        h = self.conv1(g, self.in_feats)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')
    '''
    def forward(self, g):
        out1 = self.forward_one(g[0])
        out2 = self.forward_one(g[1])
        out = F.relu((out1+out2)/2)
        #out = self.out(mean)
        return out
    '''  
'''
# for test
#if __name__ == '__main__':
 #   net = GCN()
 #   print(net)
 #   print(list(net.parameters()))
    def forward_one(self, g):
        h = self.conv1(g, self.in_feat)
        h = F.relu(h)
        h = self.conv2(h, self.n_classes)
        h = F.relu(h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')
        '''

"\n# for test\n#if __name__ == '__main__':\n #   net = GCN()\n #   print(net)\n #   print(list(net.parameters()))\n    def forward_one(self, g):\n        h = self.conv1(g, self.in_feat)\n        h = F.relu(h)\n        h = self.conv2(h, self.n_classes)\n        h = F.relu(h)\n        g.ndata['h'] = h\n        return dgl.mean_nodes(g, 'h')\n        "

In [176]:
# Create the model with given dimensions
model = GCN(dataset.dim_nfeats, 450, dataset.gclasses)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    for batched_graph, labels in dataset:
        pred = model(batched_graph[0])
        #pred2 = model(batched_graph[1], batched_graph[1].ndata['PSE'].float())
        #pred = F.relu((pred1 + pred2)/2)
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


DGLError: Invalid key "0". Must be one of the edge types.

In [146]:
# Create the model with given dimensions
model = GCN(dataset.dim_nfeats, 450, dataset.gclasses)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    for batched_graph, labels in train_dataloader:
        pred = model([batched_graph[0],batched_graph[1]])
        #pred2 = model(batched_graph[1], batched_graph[1].ndata['PSE'].float())
        #pred = F.relu((pred1 + pred2)/2)
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    pred = model(batched_graph)
    #pred2 = model(batched_graph[1], batched_graph[1].ndata['PSE'].float())
    #pred = F.relu((pred1 + pred2)/2)
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

AttributeError: 'int' object has no attribute 'dim'

### ==================== Testing ====================

In [123]:
edges = pd.read_csv('../data/GNN_edges-toy.csv')
properties = pd.read_csv('../data/GNN_properties-toy.csv')
features = pd.read_csv('../data/GNN-GSE_full_pkd_norm.csv', index_col = 'ProteinID', sep=',')

In [130]:
feature_dic = {i+1:torch.tensor(features.loc[i+1,]) for i in range(len(features))}
len(feature_dic)

19555

In [111]:
properties

Unnamed: 0,graph_id,label,num_nodes
0,1,85,722
1,2,119,719
2,3,137,935
3,4,143,5656
4,5,146,5376
5,6,158,2600
6,7,159,10469
7,8,160,6921
8,9,175,1158
9,10,187,6830


In [124]:
graph = edges.loc[edges['graph_id']==3]
src = graph['src'].to_numpy()
dst = graph['dst'].to_numpy()
graph

Unnamed: 0,graph_id,src,dst,src_prot,dst_prot
76538,3,0,172,32,4621
76539,3,0,50,32,1027
76540,3,0,54,32,1129
76541,3,0,828,32,18028
76542,3,0,717,32,16363
...,...,...,...,...,...
126803,3,933,128,19514,3408
126804,3,933,679,19514,15709
126805,3,933,664,19514,15442
126806,3,934,420,19530,10008


In [125]:
g = dgl.graph((src, dst), num_nodes=935)
g

Graph(num_nodes=935, num_edges=50270,
      ndata_schemes={}
      edata_schemes={})

In [137]:
edges_group = edges.groupby('graph_id')
edges_of_id = edges_group.get_group(3)
edges_of_id

Unnamed: 0,graph_id,src,dst,src_prot,dst_prot
76538,3,0,172,32,4621
76539,3,0,50,32,1027
76540,3,0,54,32,1129
76541,3,0,828,32,18028
76542,3,0,717,32,16363
...,...,...,...,...,...
126803,3,933,128,19514,3408
126804,3,933,679,19514,15709
126805,3,933,664,19514,15442
126806,3,934,420,19530,10008


In [142]:
prot_ids = edges_of_id['src_prot'].unique().tolist()
convert_prot = {prot_ids.index(prot):prot for prot in prot_ids}
g.ndata['PSE'] = torch.zeros(g.num_nodes(), 964)
for node in g.nodes().tolist():
    g.ndata['PSE'][node] = feature_dic[convert_prot[node]]
    
g

Graph(num_nodes=935, num_edges=50270,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})

In [150]:
features.loc[3291]

Arthralgia               0.781995
Diarrhoea                0.782882
Headache                 0.784467
Vomiting                 0.783447
Dyspepsia                0.780225
                           ...   
Hypertensive crisis      0.773061
Pneumonia bacterial      0.712454
Hepatocellular injury    0.782869
Shock haemorrhagic       0.737688
Haemorrhagic stroke      0.785618
Name: 3291, Length: 964, dtype: float64

In [2]:
g.ndata['PSE'][127]

In [3]:
convert_prot

In [None]:
g = dgl.graph(([0, 0, 1, 5], [1, 2, 2, 0])) # 6 nodes, 4 edges
g
g.ndata['x'] = th.ones(g.num_nodes(), 3)               # node feature of length 3
g.edata['x'] = th.ones(g.num_edges(), dtype=th.int32)  # scalar integer feature
g
# different names can have different shapes
g.ndata['y'] = th.randn(g.num_nodes(), 5)
g.ndata['x'][1]                  # get node 1's feature
g.edata['x'][th.tensor([0, 3])]  # get features of edge 0 and 3
g.ndata['x'][0] = th.zeros(1, 3)