In [1]:
import os
import dgl
import torch
import numpy as np
import pandas as pd
import networkx as nx
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from dgl.data import DGLDataset

Using backend: pytorch


In [8]:
class PolypharmacyDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='polypharmacy')

    def process(self):
        edges = pd.read_csv('../data/GNN_edges-toy.csv')
        properties = pd.read_csv('../data/GNN_properties-toy.csv')
        features = pd.read_csv('../data/GNN-GSE_full_pkd_norm.csv', index_col = 'ProteinID', sep=',')
        drug_comb = pd.read_csv('../data/GNN-TWOSIDE-train-PSE-964.csv', sep=',')
        self.graphs = []
        self.labels = []

        # Create a graph for each graph ID from the edges table.
        # First process the properties table into two dictionaries with graph IDs as keys.
        # The label and number of nodes are values.
        label_dict = {}
        num_nodes_dict = {}
        
        for _, row in properties.iterrows():
            label_dict[row['graph_id']] = row['label']
            num_nodes_dict[row['graph_id']] = row['num_nodes']

        # For the edges, first group the table by graph IDs.
        edges_group = edges.groupby('graph_id')
        
        #Node features or PSEs dictionary
        feature_dic = {i:torch.tensor(features.loc[i+1,]) for i in range(len(features))}
        
        # For each graph ID...
        for graph_id in edges_group.groups:
            # Find the edges as well as the number of nodes and its label.
            edges_of_id = edges_group.get_group(graph_id)
            src = edges_of_id['src'].to_numpy()
            dst = edges_of_id['dst'].to_numpy()
            num_nodes = num_nodes_dict[graph_id]
            label = label_dict[graph_id]
            
            # Need to convert proteinsIDs for feature assigning
            edges_of_id['scr_prot'].unique.tolist()

            # Create a graph and add it to the list of graphs and labels.
            g = dgl.graph((src, dst), num_nodes=num_nodes)
            #g.ndata['PSE'] = th.ones(g.num_nodes(), 964) # 964 is the #PSE 
            self.graphs.append(g)
            self.labels.append(label)

        # Convert the label list to tensor for saving.
        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)

dataset = PolypharmacyDataset()
graph, label = dataset[0]
print(graph, label)


Graph(num_nodes=825, num_edges=38344,
      ndata_schemes={}
      edata_schemes={}) tensor(85)


In [12]:
print('Node feature dimensionality:', dataset.dim_nfeats)
print('Number of graph categories:', dataset.gclasses)

AttributeError: 'SyntheticDataset' object has no attribute 'dim_nfeats'

In [13]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)

In [14]:
it = iter(train_dataloader)
batch = next(it)
print(batch)

[Graph(num_nodes=31451, num_edges=8772038,
      ndata_schemes={}
      edata_schemes={}), tensor([160, 159, 146, 158, 143])]


In [15]:
batched_graph, labels = batch
print('Number of nodes for each graph element in the batch:', batched_graph.batch_num_nodes())
print('Number of edges for each graph element in the batch:', batched_graph.batch_num_edges())

# Recover the original graph elements from the minibatch
graphs = dgl.unbatch(batched_graph)
print('The original graphs in the minibatch:')
print(graphs)

Number of nodes for each graph element in the batch: tensor([ 7009, 10575,  5454,  2677,  5736])
Number of edges for each graph element in the batch: tensor([1876090, 4199472, 1159084,  268040, 1269352])
The original graphs in the minibatch:
[Graph(num_nodes=7009, num_edges=1876090,
      ndata_schemes={}
      edata_schemes={}), Graph(num_nodes=10575, num_edges=4199472,
      ndata_schemes={}
      edata_schemes={}), Graph(num_nodes=5454, num_edges=1159084,
      ndata_schemes={}
      edata_schemes={}), Graph(num_nodes=2677, num_edges=268040,
      ndata_schemes={}
      edata_schemes={}), Graph(num_nodes=5736, num_edges=1269352,
      ndata_schemes={}
      edata_schemes={})]


In [16]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata['h'] = h
        return dgl.mean_nodes(g, 'h')

In [17]:
# Create the model with given dimensions
model = GCN(dataset.dim_nfeats, 16, dataset.gclasses)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    for batched_graph, labels in train_dataloader:
        pred = model(batched_graph, batched_graph.ndata['attr'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    pred = model(batched_graph, batched_graph.ndata['attr'].float())
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

AttributeError: 'SyntheticDataset' object has no attribute 'dim_nfeats'

### Testing

In [44]:
edges = pd.read_csv('../data/GNN_edges-toy.csv')
properties = pd.read_csv('../data/GNN_properties-toy.csv')
features = pd.read_csv('../data/GNN-GSE_full_pkd_norm.csv', index_col = 'ProteinID', sep=',')

In [84]:
#feature_dic = {i:torch.tensor(features.loc[i+1,]) for i in range(len(features))}
len(feature_dic)
#y = torch.tensor(features.loc[1,])
#y = torch.tensor(x.values[1:])

19555

In [18]:
prop = properties.loc[properties['graph_id'] == 3]
prop

Unnamed: 0,graph_id,label,num_nodes
2,3,137,1030


In [6]:
graph = edges.loc[edges['graph_id']==3]
src = graph['src'].to_numpy()
dst = graph['dst'].to_numpy()
graph

Unnamed: 0,graph_id,src,dst,src_prot,dst_prot
76538,3,1,176,32,4621
76539,3,1,52,32,1027
76540,3,1,56,32,1129
76541,3,1,876,32,18028
76542,3,1,756,32,16363
...,...,...,...,...,...
126803,3,1024,132,19514,3408
126804,3,1024,716,19514,15709
126805,3,1024,700,19514,15442
126806,3,1027,441,19530,10008


In [86]:
g = dgl.graph((src, dst), num_nodes=1030)
g

Graph(num_nodes=1030, num_edges=50270,
      ndata_schemes={}
      edata_schemes={})

In [100]:
len(g)



1030

In [77]:
g.ndata['PSE'] = torch.ones(g.num_nodes(), 964) 

In [78]:
g

Graph(num_nodes=1030, num_edges=50270,
      ndata_schemes={'PSE': Scheme(shape=(964,), dtype=torch.float32)}
      edata_schemes={})

In [90]:
edges_group = edges.groupby('graph_id')
edges_of_id = edges_group.get_group(3)
edges_of_id

Unnamed: 0,graph_id,src,dst,src_prot,dst_prot
76538,3,1,176,32,4621
76539,3,1,52,32,1027
76540,3,1,56,32,1129
76541,3,1,876,32,18028
76542,3,1,756,32,16363
...,...,...,...,...,...
126803,3,1024,132,19514,3408
126804,3,1024,716,19514,15709
126805,3,1024,700,19514,15442
126806,3,1027,441,19530,10008


In [99]:

edges_of_id = edges_group.get_group(3)
edges_of_id
prot_ids = edges_of_id['src_prot'].unique().tolist()
prot_dic = {prot:prot_ids.index(prot)+1 for prot in prot_ids}
prot_dic

{32: 1,
 90: 2,
 95: 3,
 100: 4,
 119: 5,
 153: 6,
 172: 7,
 182: 8,
 207: 9,
 210: 10,
 221: 11,
 225: 12,
 268: 13,
 302: 14,
 310: 15,
 315: 16,
 322: 17,
 332: 18,
 336: 19,
 355: 20,
 368: 21,
 405: 22,
 421: 23,
 525: 24,
 550: 25,
 615: 26,
 627: 27,
 638: 28,
 650: 29,
 663: 30,
 668: 31,
 673: 32,
 744: 33,
 766: 34,
 769: 35,
 803: 36,
 819: 37,
 834: 38,
 848: 39,
 850: 40,
 857: 41,
 887: 42,
 901: 43,
 906: 44,
 916: 45,
 929: 46,
 931: 47,
 947: 48,
 954: 49,
 1026: 50,
 1027: 51,
 1052: 52,
 1054: 53,
 1096: 54,
 1129: 55,
 1136: 56,
 1180: 57,
 1217: 58,
 1256: 59,
 1258: 60,
 1262: 61,
 1265: 62,
 1300: 63,
 1330: 64,
 1336: 65,
 1340: 66,
 1341: 67,
 1350: 68,
 1355: 69,
 1356: 70,
 1359: 71,
 1364: 72,
 1368: 73,
 1390: 74,
 1414: 75,
 1417: 76,
 1427: 77,
 1454: 78,
 1477: 79,
 1478: 80,
 1513: 81,
 1536: 82,
 1545: 83,
 1571: 84,
 1605: 85,
 1629: 86,
 1685: 87,
 1695: 88,
 1708: 89,
 1738: 90,
 1756: 91,
 1842: 92,
 1868: 93,
 1936: 94,
 1969: 95,
 1995: 96,
 2011

In [102]:
# Drug-protein file (DTI)
dti = pd.read_csv('../data/GNN-DTI_full.csv', sep=',')
dti['ProteinID'].loc[dti['DrugID'] == 137]
#genes = dti['ProteinID'].loc[dti['DrugID'] == drug].tolist()

1649       32
1650       90
1651       95
1652      100
1653      119
        ...  
2674    19517
2675    19530
2676    19539
2677    19544
2678    19554
Name: ProteinID, Length: 1030, dtype: int64

In [None]:
g = dgl.graph(([0, 0, 1, 5], [1, 2, 2, 0])) # 6 nodes, 4 edges
g
g.ndata['x'] = th.ones(g.num_nodes(), 3)               # node feature of length 3
g.edata['x'] = th.ones(g.num_edges(), dtype=th.int32)  # scalar integer feature
g
# different names can have different shapes
g.ndata['y'] = th.randn(g.num_nodes(), 5)
g.ndata['x'][1]                  # get node 1's feature
g.edata['x'][th.tensor([0, 3])]  # get features of edge 0 and 3
g.ndata['x'][0] = th.zeros(1, 3)