In [1]:
# Mount Google Drive so that the '/content/drive/MyDrive/...' paths used by the
# later cells (datasets, saved model, prediction output) resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# hickle is imported below as `hkl` and used to load the per-drug feature files.
!pip install hickle



In [5]:
import argparse
import random,os,sys
import numpy as np
import csv
from scipy import stats
import time
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import pandas as pd
import keras.backend as K
from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import Input,InputLayer,Multiply,ZeroPadding2D
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense,Activation,Dropout,Flatten,Concatenate
from keras.layers import BatchNormalization
from keras.layers import Lambda
from keras import optimizers,utils
from keras.constraints import max_norm
from keras import regularizers
from keras.callbacks import ModelCheckpoint,Callback,EarlyStopping,History
from tensorflow.keras.optimizers import Adam, SGD
from keras.models import model_from_json
import tensorflow as tf
from sklearn.metrics import average_precision_score
from scipy.stats import pearsonr

import hickle as hkl
import scipy.sparse as sp


####################################Constants Settings###########################

# Root folder holding the GDSC / CCLE metadata files.
DPATH = '/content/drive/MyDrive/Datasets/ModelAGCN'

# Metadata paths: passed to MetadataGenerate, which currently does not read them.
Drug_info_file = f'{DPATH}/GDSC/1.Drug_listMon Jun 24 09_00_55 2019.csv'
Cell_line_info_file = f'{DPATH}/CCLE/Cell_lines_annotations_20181226.txt'

# Drug inputs: a directory of per-drug hickle files and the response table
# whose columns name the drugs to predict for.
Drug_feature_file = '/content/drive/MyDrive/Datasets/103_smiles'
Cancer_response_exp_file = '/content/drive/MyDrive/Datasets/103BRCADrugs(normalized).xlsx'

# Per-sample omics workbooks (first column is used as the sample index).
Genomic_mutation_file = '/content/drive/MyDrive/Testing files/TCGAbiolinks/dae_mutation.xlsx'
Gene_expression_file = '/content/drive/MyDrive/Testing files/TCGAbiolinks/dae_expression.xlsx'
Methylation_file = '/content/drive/MyDrive/Testing files/TCGAbiolinks/dae_meth.xlsx'
cnv_file = '/content/drive/MyDrive/Testing files/TCGAbiolinks/dae_cnv.xlsx'

# Every drug graph is zero-padded to this many nodes (see CalculateGraphFeat).
Max_atoms = 100
# Not referenced by any of the cells visible here.
model_suffix = 'abc'

def MetadataGenerate(Drug_info_file,Cell_line_info_file,Genomic_mutation_file,Drug_feature_file,Gene_expression_file,Methylation_file,filtered,
                     Cnv_file=None,Response_file=None):
    """Load the omics tables and drug graphs and enumerate every (sample, drug) pair.

    Parameters
    ----------
    Drug_info_file, Cell_line_info_file, filtered
        Accepted for interface compatibility; not read by this function.
    Genomic_mutation_file, Gene_expression_file, Methylation_file : str
        Excel workbooks read with the first row as header and the first
        column as index.
    Drug_feature_file : str
        Directory in which every entry is loaded with ``hkl.load`` and must
        unpack to ``(feat_mat, adj_list, degree_list)``. The text before the
        first '.' of the entry name becomes the drug key.
    Cnv_file : str, optional
        Copy-number workbook. Defaults to the module-level ``cnv_file``, which
        is what this function previously read implicitly.
    Response_file : str, optional
        Drug-response workbook; only its column labels are used (as drug
        names). Defaults to the module-level ``Cancer_response_exp_file``.

    Returns
    -------
    tuple
        ``(mutation_feature, drug_feature, gexpr_feature, methylation_feature,
        cnv_feature, data_idx)`` where ``data_idx`` is a list of
        ``(sample, drug, 0.0)`` tuples, ordered sample-major and then by the
        response workbook's column order. The third element is a placeholder
        response value.

    Raises
    ------
    AssertionError
        If the mutation, expression and methylation tables differ in row count.
    """
    # These two paths used to be picked up from module globals without being
    # part of the signature; keep that as the default so existing calls work.
    if Cnv_file is None:
        Cnv_file = cnv_file
    if Response_file is None:
        Response_file = Cancer_response_exp_file

    # load genomic mutation features; their index defines the sample set
    mutation_feature = pd.read_excel(Genomic_mutation_file, header=0, index_col=0)

    # load drug features
    drug_feature = {}
    for each in os.listdir(Drug_feature_file):
        feat_mat,adj_list,degree_list = hkl.load('%s/%s'%(Drug_feature_file,each))
        drug_feature[each.split('.')[0]] = [feat_mat,adj_list,degree_list]

    # load gene expression features
    gexpr_feature = pd.read_excel(Gene_expression_file, header=0, index_col=0)

    # load methylation
    methylation_feature = pd.read_excel(Methylation_file, header=0, index_col=0)

    # load copy-number variation
    cnv_feature = pd.read_excel(Cnv_file, header=0, index_col=0)

    assert methylation_feature.shape[0]==gexpr_feature.shape[0]==mutation_feature.shape[0]

    experiment_data = pd.read_excel(Response_file, header=0, index_col=0)
    print(experiment_data)

    # One entry per (sample, drug); the response slot is a 0.0 placeholder.
    data_idx = [(each_cellline, each_drug, float(0))
                for each_cellline in mutation_feature.index
                for each_drug in experiment_data.columns]

    nb_celllines = len({item[0] for item in data_idx})
    nb_drugs = len({item[1] for item in data_idx})
    print('%d instances across %d cell lines and %d drugs were generated.'%(len(data_idx),nb_celllines,nb_drugs))
    return mutation_feature, drug_feature, gexpr_feature, methylation_feature, cnv_feature, data_idx

#split into training and test set
def DataSplit(data_idx,ratio = 0.95):
    """Randomly split ``data_idx`` into a training and a test list.

    Parameters
    ----------
    data_idx : sequence
        Instances to split (indexable, e.g. a list of tuples).
    ratio : float
        Fraction of instances placed in the training list; the training size
        is ``int(ratio * len(data_idx))``.

    Returns
    -------
    (list, list)
        ``(train, test)``. ``train`` is in sampled order, ``test`` keeps the
        original order of ``data_idx``. Every input position ends up in exactly
        one of the two lists, so ``len(train) + len(test) == len(data_idx)``
        even when ``data_idx`` contains equal items.
    """
    n_train = int(ratio*len(data_idx))
    # Sample positions rather than items: the draw depends only on the
    # population size, so for a given seed the same elements are selected as
    # with random.sample(data_idx, n_train). Position-based membership makes
    # the complement an O(n) set lookup and keeps duplicates of sampled items.
    train_positions = random.sample(range(len(data_idx)), n_train)
    chosen = set(train_positions)
    data_train_idx = [data_idx[pos] for pos in train_positions]
    data_test_idx = [item for pos, item in enumerate(data_idx) if pos not in chosen]
    return data_train_idx,data_test_idx

def NormalizeAdj(adj):
    """Add self-loops to ``adj`` and scale it by inverse square-root degrees.

    With ``A = adj + I`` and ``D`` the diagonal matrix of row sums of ``A``,
    returns ``(A @ D^-1/2).T @ D^-1/2`` as a dense array.
    """
    with_self_loops = adj + np.eye(adj.shape[0])
    row_degree = np.array(with_self_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(row_degree, -0.5).flatten(), 0).toarray()
    column_scaled = with_self_loops.dot(inv_sqrt_degree)
    return column_scaled.transpose().dot(inv_sqrt_degree)

def random_adjacency_matrix(n):
    """Return a random symmetric 0/1 ``n`` x ``n`` matrix (list of lists) with a zero diagonal.

    All ``n * n`` entries are drawn first, row by row; the upper triangle of
    that draw is then mirrored onto the lower triangle.
    """
    grid = [[random.randint(0, 1) for _ in range(n)] for _ in range(n)]
    for row in range(n):
        # No vertex connects to itself
        grid[row][row] = 0
        # If row is connected to col, col is connected to row
        for col in range(row + 1, n):
            grid[col][row] = grid[row][col]
    return grid

def CalculateGraphFeat(feat_mat,adj_list):
    """Pad one drug graph to ``Max_atoms`` nodes and normalise its adjacency.

    ``feat_mat`` holds one row per node and ``adj_list[i]`` lists the
    neighbour indices of node ``i``. Returns ``[feat, adj_mat]``: float32
    arrays of shape ``(Max_atoms, n_features)`` and ``(Max_atoms, Max_atoms)``.
    The block of real nodes and the block of padding nodes are passed through
    ``NormalizeAdj`` separately.
    """
    n_nodes = len(adj_list)
    assert feat_mat.shape[0] == n_nodes

    padded_feat = np.zeros((Max_atoms, feat_mat.shape[-1]), dtype='float32')
    padded_adj = np.zeros((Max_atoms, Max_atoms), dtype='float32')
    padded_feat[:feat_mat.shape[0], :] = feat_mat

    # Turn the neighbour lists into a dense 0/1 matrix.
    for node, neighbours in enumerate(adj_list):
        for neighbour in neighbours:
            padded_adj[node, int(neighbour)] = 1
    assert np.allclose(padded_adj, padded_adj.T)

    real = slice(None, n_nodes)
    padding = slice(n_nodes, None)
    # Both blocks are normalised from the raw 0/1 matrix before being written back.
    norm_real = NormalizeAdj(padded_adj[real, real])
    norm_padding = NormalizeAdj(padded_adj[padding, padding])
    padded_adj[real, real] = norm_real
    padded_adj[padding, padding] = norm_padding
    return [padded_feat, padded_adj]

def FeatureExtract(data_idx,drug_feature,mutation_feature,gexpr_feature,methylation_feature, cnv_feature):
    """Assemble per-instance model inputs for every entry of ``data_idx``.

    Each entry of ``data_idx`` is ``(sample, drug, response)``. The drug is
    looked up in ``drug_feature`` by ``str(drug)`` and padded with
    ``CalculateGraphFeat``; the sample is looked up by label in each omics
    table.

    Returns ``(drug_data, mutation_data, gexpr_data, methylation_data, target,
    cnv_data)``: ``drug_data`` is a list with one ``CalculateGraphFeat``
    result per instance, the rest are float32 arrays with one row (or one
    value, for ``target``) per instance. Note that ``target`` comes before
    ``cnv_data``.
    """
    n_rows = len(data_idx)

    def _blank(table):
        # One float32 row per instance, one column per feature of `table`.
        return np.zeros((n_rows, table.shape[1]), dtype='float32')

    mutation_data = _blank(mutation_feature)
    gexpr_data = _blank(gexpr_feature)
    methylation_data = _blank(methylation_feature)
    cnv_data = _blank(cnv_feature)
    target = np.zeros(n_rows, dtype='float32')

    drug_data = []
    for row, (sample, drug, response) in enumerate(data_idx):
        atom_feat, neighbours, _ = drug_feature[str(drug)]
        # drug graph, zero-padded to a common size
        drug_data.append(CalculateGraphFeat(atom_feat, neighbours))
        mutation_data[row, :] = mutation_feature.loc[sample].values
        gexpr_data[row, :] = gexpr_feature.loc[sample].values
        cnv_data[row, :] = cnv_feature.loc[sample].values
        methylation_data[row, :] = methylation_feature.loc[sample].values
        target[row] = response
    return drug_data,mutation_data,gexpr_data,methylation_data,target,cnv_data
    
        
    

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
import numpy as np
import keras.backend as K
from keras.layers import Lambda

In [7]:
random.seed(0)
mutation_feature, drug_feature,gexpr_feature,methylation_feature, cnv_feature, data_idx = MetadataGenerate(Drug_info_file,Cell_line_info_file,Genomic_mutation_file,Drug_feature_file,Gene_expression_file,Methylation_file,False)

# Extract model inputs for every (sample, drug) pair in data_idx.
# The "_train" suffix is kept for compatibility; these arrays are only used for prediction here.
X_drug_data_train, X_mutation_data_train, X_gexpr_data_train, X_methylation_data_train, Y_train, X_cnv_data_train = FeatureExtract(data_idx,drug_feature,mutation_feature,gexpr_feature,methylation_feature, cnv_feature)

X_drug_feat_data_train = np.array([item[0] for item in X_drug_data_train])#nb_instance * Max_atoms * feat_dim
X_drug_adj_data_train = np.array([item[1] for item in X_drug_data_train])#nb_instance * Max_atoms * Max_atoms

model = keras.models.load_model('/content/drive/MyDrive/Datasets/ModelAGCN')


pred = model.predict([X_drug_feat_data_train,X_drug_adj_data_train,X_mutation_data_train,X_gexpr_data_train,X_methylation_data_train,X_cnv_data_train])

# data_idx is ordered sample-major, then by the response workbook's column
# order, so the first prediction column reshapes directly into a
# samples x drugs table. Sizes come from the data rather than fixed literals.
sample_names = list(mutation_feature.index)
drug_names = list(pd.read_excel(Cancer_response_exp_file, header=0, index_col=0).columns)
pred_values = np.asarray(pred)[:, 0]
if pred_values.shape[0] != len(sample_names) * len(drug_names):
    raise ValueError('Expected %d predictions (%d samples x %d drugs) but the model returned %d.'
                     % (len(sample_names) * len(drug_names), len(sample_names), len(drug_names), pred_values.shape[0]))
df = pd.DataFrame(pred_values.reshape(len(sample_names), len(drug_names)), index=sample_names, columns=drug_names)
print(df)
df.to_excel('/content/drive/MyDrive/Testing files/TCGAbiolinks/prediction.xlsx')

            Camptothecin  Vinblastine  Cisplatin  Cytarabine  Docetaxel  \
cell_name                                                                 
AU565           0.187431     0.379760   0.260516    0.469145   0.088032   
BT-20           0.293815     0.492720   0.448197    0.679825   0.623601   
BT-474          1.000000     0.447836   0.938684    0.885903   0.335922   
BT-483          0.825094     1.000000   0.968622    1.000000   0.447496   
BT-549          0.418418     0.437978   0.567320    0.588275   0.253112   
CAL-120         0.428605     0.515088   0.925781    0.618654   0.301334   
CAL-148         0.000000     0.047356   0.020386    0.000000   0.007510   
CAL-51          0.141218     0.111984   0.187146    0.092361   0.138036   
CAL-85-1        0.425566     0.471763   0.525818    0.605667   0.307670   
CAMA-1          0.491949     0.457263   0.387248    0.518697   0.285255   
COLO-824        0.563146     0.331473   0.463572    0.477280   0.561618   
DU-4475         0.050216 