In [1]:
import dgl
import torch
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm

Using backend: pytorch


### Making the drug-combination (comb) files

In [None]:
# Evaluation drug combinations, read from the TWOSIDE evaluation CSV.
# The bare name on the last line makes the notebook render the frame.
eval_df = pd.read_csv(
    '../data/TWOSIDE-evaluation-PSE-964.csv',
    sep=',',
)
eval_df

In [None]:
# Unique drug names in first-seen order: every Drug1 name, followed by any
# Drug2 name that did not already appear. Concatenating the two columns and
# de-duplicating once replaces the per-name `not in list` scan, which was
# quadratic in the number of drugs.
drugs_list = (
    pd.concat([eval_df['Drug1'], eval_df['Drug2']], ignore_index=True)
    .unique()
    .tolist()
)

len(drugs_list)

In [None]:
%%time
from urllib.parse import quote
from urllib.request import urlopen

PUBCHEM_NAME_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'


def fetch_first_line(url):
    """Return the first line of the plain-text body served at ``url``.

    The body is decoded as UTF-8 rather than sliced out of the ``bytes`` repr,
    so no escape sequences leak into the result. An empty body yields ``''``.
    Any error raised by ``urlopen`` propagates to the caller.
    """
    # The timeout keeps one stalled request from hanging the whole loop.
    with urlopen(url, timeout=30) as response:
        lines = response.read().decode('utf-8').splitlines()
    return lines[0] if lines else ''


# Names that could not be resolved; kept so the skipped drugs are visible
# instead of being dropped silently.
failed_drugs = []

# Mode 'w' (not 'a'): the header is written unconditionally, so appending
# would duplicate the header and every row each time this cell is re-run.
with open('../data/Eval_drugs_964.tsv', 'w', encoding='utf-8') as f:
    f.write('Drug_name\tPubChemID\tSMILES\n')

    for drug_name in tqdm(drugs_list):
        # Percent-encode the whole name (spaces, slashes, commas, non-ASCII),
        # not only spaces.
        name_url = PUBCHEM_NAME_URL + quote(drug_name, safe='')

        try:
            # A name can match several compounds, one per line; only the first
            # line of each response is kept so a drug stays on one TSV row.
            # NOTE(review): assumes both endpoints list matches in the same
            # order, so the first SMILES belongs to the first CID - confirm.
            smiles = fetch_first_line(name_url + '/property/CanonicalSMILES/TXT')
            drugid = fetch_first_line(name_url + '/cids/TXT')
        except Exception:
            # Best-effort lookup: a drug whose request fails is skipped. Unlike
            # a bare `except:`, this still lets KeyboardInterrupt stop the loop.
            failed_drugs.append(drug_name)
            continue

        f.write(drug_name + '\t' + drugid + '\t' + smiles + '\n')

print('{} of {} drug names could not be resolved on PubChem'.format(
    len(failed_drugs), len(drugs_list)))

In [None]:
# Read the freshly written drug table back in to inspect it.
df = pd.read_csv(
    '../data/Eval_drugs_964.tsv',
    sep='\t',
)
df

### Making the DTI and DrugID files

#### DrugID

In [None]:
# DrugID table: load the drug list and keep only the name and PubChem ID
# columns (SMILES is not needed from here on).
eval_drugs = pd.read_csv('../data/Eval_drugs_964.tsv', sep='\t')[['Drug_name', 'PubChemID']]
eval_drugs

In [None]:
# Re-display of the same frame shown by the previous cell (no new computation).
eval_drugs

In [None]:
# Distinct PubChem IDs in ascending order.
sorted_ids = eval_drugs['PubChemID'].sort_values()
drugs = sorted_ids.unique().tolist()
drugs

In [None]:
# Assign each distinct PubChem ID a GraphID equal to its 1-based rank in
# ascending PubChem ID order (IDs start at 1, not 0).
drugs = eval_drugs.sort_values(by='PubChemID')['PubChemID'].unique().tolist()

# enumerate() yields the position directly; looking it up with list.index()
# for every entry rescans the list each time.
dic = {drug: graph_id for graph_id, drug in enumerate(drugs, start=1)}

# Rows sharing a PubChem ID receive the same GraphID.
eval_drugs['GraphID'] = eval_drugs['PubChemID'].map(dic)
eval_drugs

In [None]:
# Order the rows by their graph identifier.
eval_drugs = eval_drugs.sort_values('GraphID')
eval_drugs

In [None]:
# Rename to the column names used by the DrugID file and put GraphID first.
eval_drugs = (
    eval_drugs
    .rename(columns={'Drug_name': 'Name', 'PubChemID': 'DrugID'})
    [['GraphID', 'DrugID', 'Name']]
)
eval_drugs

In [None]:
# Persist the DrugID table without the pandas index column.
eval_drugs.to_csv(
    '../data/Eval_DrugID.csv',
    sep=',',
    index=False,
)

#### DTI

In [None]:
# Drug-target interaction table for the evaluation drugs.
eval_dti = pd.read_csv(
    '../data/Eval_affinity_cut_83.37.csv',
    sep=',',
)
eval_dti

In [None]:
# Gene table; its 'Name' column is used below to derive ProteinIDs.
gene_id = pd.read_csv(
    '../data/GeneID.csv',
    sep=',',
)
gene_id

In [None]:
# Map every gene name to its 1-based row position in the gene table; that
# position is used as the ProteinID.
genes = gene_id['Name'].tolist()

# One pass with enumerate() instead of list.index() per gene (which rescans
# the list every time). setdefault keeps the FIRST position of a repeated
# name, matching what list.index() returned.
gene_dic = {}
for position, gene in enumerate(genes, start=1):
    gene_dic.setdefault(gene, position)

# Genes absent from the gene table map to NaN.
eval_dti['ProteinID'] = eval_dti['GeneID'].map(gene_dic)
eval_dti

In [None]:
# Persist only the drug / protein ID pairs, without the pandas index column.
dti_pairs = eval_dti[['DrugID', 'ProteinID']]
dti_pairs.to_csv('../data/Eval_DTI_full.csv', sep=',', index=False)

## Evaluation 

In [2]:
import os
import torch.nn as nn
from sklearn import metrics
import torch.nn.functional as F
from dgl.data import DGLDataset
# Select the first GPU only when CUDA is actually usable, so this cell does
# not abort the notebook on a machine without a CUDA device.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

In [3]:
class PSE_eval(DGLDataset):
    """Drug-pair evaluation dataset built from per-drug PPI subgraphs.

    Item ``i`` is ``([g1, g2], label)``: ``g1`` and ``g2`` are the DGL graphs
    of the two drugs named in the first two columns of row ``i`` of the
    combination table, and ``label`` is a tensor of the remaining columns of
    that row (the PSE values).

    Attributes set by ``process``:
        graphs: one DGL graph per row of the DrugID table.
        labels: the drug name belonging to each entry of ``graphs``.
        comb_graphs: ``[g1, g2]`` for every row of the combination table.
        comb_labels: label tensor for every row of the combination table.
    """

    def __init__(self):
        super().__init__(name='PSE_eval')

    def process(self):
        """Read the CSV inputs and build the per-drug graphs and pair items.

        Raises:
            KeyError: if a protein in a drug's subgraph has no row in the
                feature table, or a drug name in the combination table has no
                graph.
        """
        features = pd.read_csv('../data/GNN-GSE_full_pkd_norm.csv', index_col='ProteinID', sep=',')
        drug_comb = pd.read_csv('../data/Eval_TWOSIDE-evaluation-PSE-964.csv', sep=',')
        # Only the two endpoint columns are used, so only those are parsed.
        edges = pd.read_csv('../data/GNN-PPI-net.csv', sep=',', usecols=['protein1', 'protein2'])
        dti = pd.read_csv('../data/Eval_DTI_full.csv', sep=',')
        DrugID = pd.read_csv('../data/Eval_DrugID.csv', sep=',')
        print('data loaded!')

        # Take the edge endpoints once. Re-slicing the edge table into a new
        # two-column frame for every drug copies the whole table each time.
        edge_src = edges['protein1']
        edge_dst = edges['protein2']
        edge_src_values = edge_src.to_numpy()
        edge_dst_values = edge_dst.to_numpy()

        # Node features as one float32 matrix plus a ProteinID -> row lookup.
        # The width comes from the table itself rather than a fixed number.
        feature_matrix = torch.tensor(features.to_numpy(), dtype=torch.float32)
        feature_row = {prot: pos for pos, prot in enumerate(features.index.tolist())}

        self.graphs = []
        self.labels = []
        self.comb_graphs = []
        self.comb_labels = []

        drug_rows = zip(DrugID['DrugID'].tolist(), DrugID['Name'].tolist())
        for drug, name in tqdm(drug_rows, total=len(DrugID)):
            g = self._drug_graph(
                drug, dti, edge_src, edge_dst, edge_src_values, edge_dst_values,
                feature_matrix, feature_row,
            )
            self.graphs.append(g)
            # Label with this row's own name, so several names that share one
            # DrugID each resolve to a graph.
            self.labels.append(name)

        # Drug name -> position of its graph in self.graphs.
        drug2graph = {label: pos for pos, label in enumerate(self.labels)}

        # Columns 0 and 1 hold the two drug names, the rest are PSE values.
        # Columns are taken positionally with .iloc; converting each label row
        # to a plain list lets torch infer the dtype from Python scalars.
        first_names = drug_comb.iloc[:, 0].tolist()
        second_names = drug_comb.iloc[:, 1].tolist()
        pse_rows = drug_comb.iloc[:, 2:].to_numpy().tolist()

        for first, second, pse in zip(first_names, second_names, pse_rows):
            g1 = self.graphs[drug2graph[first]]   # Drug1 graph
            g2 = self.graphs[drug2graph[second]]  # Drug2 graph
            self.comb_graphs.append([g1, g2])
            self.comb_labels.append(torch.tensor(pse))

    @staticmethod
    def _drug_graph(drug, dti, edge_src, edge_dst, edge_src_values,
                    edge_dst_values, feature_matrix, feature_row):
        """Build the PPI subgraph induced by the proteins listed for ``drug``.

        An edge is kept when both of its endpoints are among the drug's
        proteins in ``dti``. Nodes are numbered from 0 in order of first
        appearance, source endpoints first and then any protein that only
        occurs as a destination, so every endpoint receives a valid local id.
        """
        genes = dti.loc[dti['DrugID'] == drug, 'ProteinID'].tolist()
        keep = (edge_src.isin(genes) & edge_dst.isin(genes)).to_numpy()
        sub_src = edge_src_values[keep]
        sub_dst = edge_dst_values[keep]

        # factorize() numbers values by first appearance, giving local node
        # ids for both endpoints and the id -> protein table in a single pass.
        codes, node_prots = pd.factorize(np.concatenate([sub_src, sub_dst]))
        n_edges = len(sub_src)
        src = torch.as_tensor(codes[:n_edges], dtype=torch.int64)
        dst = torch.as_tensor(codes[n_edges:], dtype=torch.int64)
        g = dgl.graph((src, dst), num_nodes=len(node_prots))

        # Row i of the feature tensor belongs to local node i.
        rows = torch.tensor([feature_row[prot] for prot in node_prots.tolist()],
                            dtype=torch.long)
        g.ndata['PSE'] = feature_matrix[rows]
        return g

    def __getitem__(self, i):
        """Return ``([g1, g2], label)`` for combination ``i``."""
        return self.comb_graphs[i], self.comb_labels[i]

    def __len__(self):
        """Return the number of drug combinations."""
        return len(self.comb_graphs)

In [4]:
# Build the dataset and peek at the first drug pair.
dataset = PSE_eval()
first_item = dataset[0]
graph, label = first_item
print(graph)

data loaded!


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=605.0), HTML(value='')))




MemoryError: Unable to allocate 89.6 MiB for an array with shape (11738330,) and data type int64

In [None]:
# Making the batches
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Sample every example exactly once per pass, in random order, 3 per batch.
num_examples = len(dataset)
eval_indices = torch.arange(num_examples)
eval_sampler = SubsetRandomSampler(eval_indices)
eval_dataloader = GraphDataLoader(
    dataset,
    batch_size=3,
    sampler=eval_sampler,
    drop_last=False,
)


In [None]:
# GNN model
from dgl.nn import GraphConv

class GCN(nn.Module):
    """Two-layer graph convolutional network with a mean-node readout."""

    def __init__(self, in_feats, h_feats, num_classes):
        """Create the two GraphConv layers.

        Args:
            in_feats: size of each input node feature vector.
            h_feats: size of the hidden node representation.
            num_classes: size of the per-graph output vector.
        """
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Return one ReLU-activated output vector per graph in ``g``.

        Side effect: the node-level output of the second layer is stored on
        the graph as ``g.ndata['h']``.
        """
        hidden = F.relu(self.conv1(g, in_feat))
        g.ndata['h'] = self.conv2(g, hidden)
        # Average the node outputs of each graph, then apply the final ReLU.
        pooled = dgl.mean_nodes(g, 'h')
        return F.relu(pooled)