In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split

from rdkit import Chem
from rdkit.Chem import AllChem

# Gene Exp

In [2]:
gene_exp = pd.read_csv('nci60_gene_exp.csv', index_col=0).T
gene_exp.head()

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MCF7,1.92,2.89,0.03,0.03,0.21,0.0,0.0,0.0,2.46,0.0,...,8.52,22.48,0.44,1.51,5.59,0.97,0.94,6.99,2.52,3.05
MDA_MB_231,0.49,0.12,0.02,0.02,0.0,0.01,0.0,0.07,0.22,0.0,...,16.76,27.56,0.32,3.43,1.44,0.55,1.58,8.15,0.45,18.57
HS578T,3.37,1.17,0.04,0.04,0.0,0.09,0.0,0.0,0.35,0.3,...,1.97,3.22,0.16,0.34,2.11,0.14,1.42,103.53,1.27,1.65
BT_549,6.0,1.92,0.0,0.0,0.0,0.0,0.0,0.0,5.52,0.0,...,1.93,3.75,0.45,0.94,2.79,0.49,1.22,36.05,2.5,2.31
T47D,3.73,1.65,0.01,0.01,0.06,0.0,0.0,0.15,0.71,0.0,...,4.02,25.63,0.55,1.85,5.18,0.8,1.51,13.0,1.64,1.28


In [3]:
nsc_class = pd.read_csv('../Figs/nsc_cid_smiles_class_name.csv', index_col=0)[['NSC', 'MECHANISM']]
nsc_class = nsc_class[nsc_class.MECHANISM == 'DNA']
nsc_class.shape

(269, 2)

# Drug Response

In [4]:
drug_response = pd.read_csv('nci60Act.csv', index_col=0)
drug_response = drug_response[drug_response.index.isin(list(nsc_class.NSC))].fillna(0)
drug_response.columns = gene_exp.index
drug_response

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
740,0.703626,-1.219032,-1.892792,-0.877267,-1.156158,0.510978,0.536910,0.589970,-0.356387,-1.460314,...,0.626838,0.507493,0.682664,-0.891882,0.499337,0.541612,-1.400771,0.602244,-1.641942,0.231533
752,0.475296,-0.312852,-1.089067,-0.441030,-0.058619,0.057507,0.125700,0.111693,-3.285729,-0.114051,...,-0.321691,0.507798,0.384102,-1.314527,-0.318444,0.557175,0.345056,-0.047731,0.155244,-0.160223
755,0.704027,-0.438857,-0.548744,-1.441942,0.496864,0.096265,-0.082186,0.417634,-1.927502,-0.372021,...,0.123647,0.543639,0.623318,-1.374212,-0.173024,0.314436,-1.002183,-0.881252,0.491364,-0.183200
762,0.547964,-1.033803,-1.399273,-0.538268,1.137432,0.135942,-0.094460,0.562628,-1.398911,-1.050409,...,-0.294845,0.934592,0.591263,-0.552673,1.797227,1.260987,0.172806,0.869675,-0.529810,-0.634413
1390,0.517269,0.960399,-1.710657,-0.260192,-0.428596,-2.369012,0.224249,-1.481654,-2.369012,-0.950229,...,0.534589,0.819455,0.091197,-0.188655,0.800745,1.468891,0.182377,-0.100527,0.519999,-0.167908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,-0.672839,-0.009522,1.593912,-0.015442,-0.336637,0.263434,0.524381,1.450999,-0.073388,1.877515,...,0.140564,-0.863786,0.409866,2.250914,0.739791,1.910757,2.324851,0.728314,1.252344,0.959549
783107,0.780763,0.353024,-1.901341,-1.812769,0.402564,0.004460,0.329454,-0.072378,-2.294693,1.159211,...,0.031677,0.463469,0.134452,0.537280,0.483558,0.484789,0.790988,0.112736,-0.212100,0.516311
784722,1.544864,-0.722438,-0.591117,-0.292344,0.491538,0.250683,0.672894,0.135474,0.555915,0.754361,...,-0.565436,0.387815,0.702974,0.713895,0.149978,-1.265221,-0.104946,0.714125,-0.574573,-1.220123
789797,-0.561025,-0.217302,-0.638256,-0.638256,1.112001,-0.638256,-0.674451,-0.638256,-0.638256,0.861387,...,-0.266452,-0.638256,-0.638256,1.989888,-0.521750,2.481620,-0.331090,1.672402,-0.602061,2.006186


# DTI

In [5]:
dti = pd.read_csv('../Figs/dti_drugbank.csv', index_col=0)
dti.shape

(100, 403)

## All drugs are in drug response.

In [6]:
dti = dti[sorted(list(set(gene_exp.columns) & set(dti.columns)))]
dti

Unnamed: 0,AAK1,ABL1,ABL2,ACTB,ACVR1,ACVR1B,ADA,ADORA3,AHR,AKR1C3,...,UBA1,UGT1A1,UGT3A1,ULK1,ULK2,ULK3,WEE1,XDH,YES1,ZAP70
740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
767125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
772992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
783107,1,1,1,0,1,1,0,1,0,0,...,0,1,0,1,1,1,1,0,1,1


In [7]:
len(set(dti.index) & set(drug_response.index))

100

# Selected highly variable genes

In [8]:
print('Density: ', round((len(gene_exp.values.nonzero()[0])/gene_exp.size)*100, 2), '%')

Density:  75.09 %


In [9]:
variance = gene_exp.std()
variance = variance.sort_values(ascending=False)
variance = pd.DataFrame(variance > np.percentile(variance, 90))
variance = list(variance[variance[0] == True].index)
len(variance)

2383

In [10]:
print('DTI: ', len(dti.columns))
print('Top 90% variable genes: ', len(variance))
print('Total: ', len(set(variance) | set(dti.columns)))

DTI:  381
Top 90% variable genes:  2383
Total:  2718


# Preprocessed data dims

In [11]:
genes = sorted(list(set(variance) | set(dti.columns)))
gene_exp = gene_exp[genes]
gene_exp.shape

(60, 2718)

In [12]:
drug_response.shape

(269, 60)

In [13]:
dti.shape

(100, 381)

# Normalize

In [14]:
gene_norm = pd.DataFrame(
    StandardScaler().fit_transform(gene_exp), 
    index=gene_exp.index, 
    columns=gene_exp.columns
)
gene_norm

Unnamed: 0,A2M,AAK1,ABCB1,ABL1,ABL2,ABRACL,ACAT1,ACKR3,ACLY,ACO2,...,ZNF207,ZNF22,ZNF580,ZNF593,ZNF706,ZNHIT1,ZNHIT3,ZNRD1,ZWINT,ZYX
MCF7,-0.202516,-1.154608,-0.165123,-0.271267,-0.770514,-0.387611,-0.515989,5.267045,0.173049,0.072213,...,-0.217212,-0.199171,-0.587631,-0.425805,-0.053803,1.583642,0.133603,0.091624,0.544875,-1.000041
MDA_MB_231,-0.203121,0.473487,-0.168799,-1.245398,0.740272,1.908655,2.660994,-0.271766,0.505949,-1.159772,...,-0.758387,2.746291,-0.997701,-1.261613,3.785998,-0.289811,1.007383,0.971839,0.992416,-0.982226
HS578T,-0.201911,-0.289038,-0.162917,-0.126488,-0.58599,-0.743055,-0.500049,-0.224821,-0.680063,-0.730216,...,-1.285914,-0.223103,-0.13486,-0.54431,-0.551808,-0.490514,-0.908373,-0.5495,-1.151904,0.482635
BT_549,-0.20433,-0.927911,-0.168799,3.612744,2.450958,-0.073691,-0.310607,0.217375,-0.729776,-0.41757,...,-1.247937,-0.432291,2.169666,1.576875,-0.256067,0.015607,-0.815552,-0.596172,-1.105212,-0.553733
T47D,-0.203725,-0.927911,-0.168064,-0.47503,-0.68594,-0.377645,0.505403,-0.268738,0.514826,-0.520309,...,0.681187,-0.409245,-0.272454,-0.018873,-0.211746,-0.42876,1.302598,0.501845,0.822386,-0.907739
SF_268,-0.201306,-0.412691,-0.168799,-0.757439,-0.355336,-0.380137,2.46726,-0.218763,-1.206488,-1.049948,...,-1.039063,-0.270968,-0.05149,0.189922,-0.671878,-0.565694,-0.817645,-0.507741,-0.562524,-0.463888
SF_295,-0.203121,3.111413,-0.074688,0.143409,-0.043952,-0.189958,0.006969,-0.251322,2.953429,0.472541,...,3.527697,-0.156624,1.714861,2.626495,0.276589,4.171303,2.326429,0.19725,0.646189,2.257423
SF_539,-0.190421,1.318448,-0.168799,3.693177,1.489899,-0.28214,-0.59385,5.222371,0.213884,-0.33343,...,0.106188,-0.286923,0.502951,-0.247106,-0.430128,-0.528776,-0.825322,-0.080325,-0.539619,0.996058
SNB_19,-0.18014,-0.577562,-0.165123,0.161283,-0.347647,-0.568655,-0.604886,0.227219,-1.427534,-0.496396,...,-0.858671,0.007357,0.376203,-0.55058,-0.646897,0.617716,-0.403088,-0.61746,-1.280528,-0.639739
SNB_75,7.334074,0.968098,-0.168064,2.238236,0.317406,-0.394255,-0.401343,0.181788,-0.400427,-0.065954,...,-0.081918,-0.308197,0.101015,-0.648394,-0.382583,0.345189,-0.539878,-0.451243,-0.050671,0.261171


# Keep 90 % expressed cells (Association)

In [15]:
A_dc = drug_response[drug_response > 0].fillna(0)
A_dc

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
740,0.703626,0.000000,0.000000,0.0,0.000000,0.510978,0.536910,0.589970,0.000000,0.000000,...,0.626838,0.507493,0.682664,0.000000,0.499337,0.541612,0.000000,0.602244,0.000000,0.231533
752,0.475296,0.000000,0.000000,0.0,0.000000,0.057507,0.125700,0.111693,0.000000,0.000000,...,0.000000,0.507798,0.384102,0.000000,0.000000,0.557175,0.345056,0.000000,0.155244,0.000000
755,0.704027,0.000000,0.000000,0.0,0.496864,0.096265,0.000000,0.417634,0.000000,0.000000,...,0.123647,0.543639,0.623318,0.000000,0.000000,0.314436,0.000000,0.000000,0.491364,0.000000
762,0.547964,0.000000,0.000000,0.0,1.137432,0.135942,0.000000,0.562628,0.000000,0.000000,...,0.000000,0.934592,0.591263,0.000000,1.797227,1.260987,0.172806,0.869675,0.000000,0.000000
1390,0.517269,0.960399,0.000000,0.0,0.000000,0.000000,0.224249,0.000000,0.000000,0.000000,...,0.534589,0.819455,0.091197,0.000000,0.800745,1.468891,0.182377,0.000000,0.519999,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,0.000000,0.000000,1.593912,0.0,0.000000,0.263434,0.524381,1.450999,0.000000,1.877515,...,0.140564,0.000000,0.409866,2.250914,0.739791,1.910757,2.324851,0.728314,1.252344,0.959549
783107,0.780763,0.353024,0.000000,0.0,0.402564,0.004460,0.329454,0.000000,0.000000,1.159211,...,0.031677,0.463469,0.134452,0.537280,0.483558,0.484789,0.790988,0.112736,0.000000,0.516311
784722,1.544864,0.000000,0.000000,0.0,0.491538,0.250683,0.672894,0.135474,0.555915,0.754361,...,0.000000,0.387815,0.702974,0.713895,0.149978,0.000000,0.000000,0.714125,0.000000,0.000000
789797,0.000000,0.000000,0.000000,0.0,1.112001,0.000000,0.000000,0.000000,0.000000,0.861387,...,0.000000,0.000000,0.000000,1.989888,0.000000,2.481620,0.000000,1.672402,0.000000,2.006186


In [16]:
A_cg = gene_norm[gene_norm > 0].fillna(0)
A_cg

Unnamed: 0,A2M,AAK1,ABCB1,ABL1,ABL2,ABRACL,ACAT1,ACKR3,ACLY,ACO2,...,ZNF207,ZNF22,ZNF580,ZNF593,ZNF706,ZNHIT1,ZNHIT3,ZNRD1,ZWINT,ZYX
MCF7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.267045,0.173049,0.072213,...,0.0,0.0,0.0,0.0,0.0,1.583642,0.133603,0.091624,0.544875,0.0
MDA_MB_231,0.0,0.473487,0.0,0.0,0.740272,1.908655,2.660994,0.0,0.505949,0.0,...,0.0,2.746291,0.0,0.0,3.785998,0.0,1.007383,0.971839,0.992416,0.0
HS578T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.482635
BT_549,0.0,0.0,0.0,3.612744,2.450958,0.0,0.0,0.217375,0.0,0.0,...,0.0,0.0,2.169666,1.576875,0.0,0.015607,0.0,0.0,0.0,0.0
T47D,0.0,0.0,0.0,0.0,0.0,0.0,0.505403,0.0,0.514826,0.0,...,0.681187,0.0,0.0,0.0,0.0,0.0,1.302598,0.501845,0.822386,0.0
SF_268,0.0,0.0,0.0,0.0,0.0,0.0,2.46726,0.0,0.0,0.0,...,0.0,0.0,0.0,0.189922,0.0,0.0,0.0,0.0,0.0,0.0
SF_295,0.0,3.111413,0.0,0.143409,0.0,0.0,0.006969,0.0,2.953429,0.472541,...,3.527697,0.0,1.714861,2.626495,0.276589,4.171303,2.326429,0.19725,0.646189,2.257423
SF_539,0.0,1.318448,0.0,3.693177,1.489899,0.0,0.0,5.222371,0.213884,0.0,...,0.106188,0.0,0.502951,0.0,0.0,0.0,0.0,0.0,0.0,0.996058
SNB_19,0.0,0.0,0.0,0.161283,0.0,0.0,0.0,0.227219,0.0,0.0,...,0.0,0.007357,0.376203,0.0,0.0,0.617716,0.0,0.0,0.0,0.0
SNB_75,7.334074,0.968098,0.0,2.238236,0.317406,0.0,0.0,0.181788,0.0,0.0,...,0.0,0.0,0.101015,0.0,0.0,0.345189,0.0,0.0,0.0,0.261171


In [17]:
A_dg = dti.copy()
A_dg

Unnamed: 0,AAK1,ABL1,ABL2,ACTB,ACVR1,ACVR1B,ADA,ADORA3,AHR,AKR1C3,...,UBA1,UGT1A1,UGT3A1,ULK1,ULK2,ULK3,WEE1,XDH,YES1,ZAP70
740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
767125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
772992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
783107,1,1,1,0,1,1,0,1,0,0,...,0,1,0,1,1,1,1,0,1,1


In [18]:
A_dg = pd.DataFrame(np.ones([len(A_dc.index), len(A_cg.columns)]), index=A_dc.index, columns=A_cg.columns)
A_dg.loc[dti.index, dti.columns] = dti
A_dg

Unnamed: 0,A2M,AAK1,ABCB1,ABL1,ABL2,ABRACL,ACAT1,ACKR3,ACLY,ACO2,...,ZNF207,ZNF22,ZNF580,ZNF593,ZNF706,ZNHIT1,ZNHIT3,ZNRD1,ZWINT,ZYX
740,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
752,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
755,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
762,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1390,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
783107,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
784722,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
789797,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
print('Drug Density: ', round(len(A_dc.values.nonzero()[0])/drug_response.size, 4)*100, '%')
print('Cell Density: ', round(len(A_cg.values.nonzero()[0])/A_cg.size, 4)*100, '%')
print('Gene Density: ', round(len(A_dg.values.nonzero()[0])/A_dg.size, 4)*100, '%')

Drug Density:  47.24 %
Cell Density:  30.12 %
Gene Density:  94.86 %


# Similarity

In [20]:
cell_sim = rbf_kernel(drug_response.T)
cell_sim = pd.DataFrame(cell_sim, index=drug_response.columns)
cell_sim.to_csv('../data/cell_sim.csv')
cell_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
MCF7,1.0,0.016977,0.006904,0.125486,0.359068,0.409043,0.421733,0.392764,0.269231,0.236164,...,0.152521,0.53934,0.38798,0.186985,0.486847,0.301696,0.193685,0.473162,0.014011,0.233766
MDA_MB_231,0.016977,1.0,0.435066,0.272195,0.080522,0.042246,0.03338,0.028667,0.058538,0.090206,...,0.353464,0.046456,0.033035,0.196118,0.018457,0.024355,0.208644,0.042056,0.450865,0.091639
HS578T,0.006904,0.435066,1.0,0.181434,0.038879,0.020693,0.016911,0.015666,0.039075,0.055654,...,0.170999,0.017181,0.015786,0.11766,0.007654,0.009833,0.105482,0.019614,0.292555,0.033664
BT_549,0.125486,0.272195,0.181434,1.0,0.361048,0.261308,0.196176,0.200216,0.31207,0.410615,...,0.455587,0.243985,0.20528,0.436434,0.12518,0.139631,0.495337,0.263214,0.178574,0.277053
T47D,0.359068,0.080522,0.038879,0.361048,1.0,0.434107,0.334393,0.352774,0.409699,0.458068,...,0.339999,0.487191,0.319658,0.359887,0.286677,0.255855,0.470186,0.45419,0.059111,0.359769
SF_268,0.409043,0.042246,0.020693,0.261308,0.434107,1.0,0.593623,0.600189,0.532719,0.514663,...,0.272642,0.566086,0.553149,0.289098,0.47125,0.314059,0.358163,0.656817,0.025588,0.338044
SF_295,0.421733,0.03338,0.016911,0.196176,0.334393,0.593623,1.0,0.529468,0.353502,0.426596,...,0.215714,0.517691,0.484729,0.280242,0.494304,0.347474,0.300723,0.54429,0.023789,0.287047
SF_539,0.392764,0.028667,0.015666,0.200216,0.352774,0.600189,0.529468,1.0,0.340853,0.426342,...,0.185796,0.448341,0.512293,0.24821,0.487096,0.313607,0.304612,0.543071,0.01793,0.264741
SNB_19,0.269231,0.058538,0.039075,0.31207,0.409699,0.532719,0.353502,0.340853,1.0,0.443122,...,0.301123,0.422104,0.322641,0.341297,0.28501,0.18644,0.312568,0.51899,0.041214,0.289053
SNB_75,0.236164,0.090206,0.055654,0.410615,0.458068,0.514663,0.426596,0.426342,0.443122,1.0,...,0.297154,0.407679,0.343111,0.428763,0.268251,0.227977,0.511326,0.466112,0.055779,0.302924


In [21]:
gene_sim = rbf_kernel(gene_exp.T)
gene_sim = pd.DataFrame(gene_sim, index=gene_exp.columns)
gene_sim.to_csv('../data/gene_sim.csv')
gene_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2708,2709,2710,2711,2712,2713,2714,2715,2716,2717
A2M,1.000000e+00,2.216707e-118,7.358278e-207,1.003905e-120,2.151523e-117,1.596769e-219,5.128743e-311,1.186712e-194,0.000000e+00,6.810373e-218,...,0.000000e+00,8.906289e-190,0.000000e+00,0.000000e+00,6.370655e-221,0.000000e+00,0.000000e+00,4.544615e-212,7.803238e-256,0.0
AAK1,2.216707e-118,1.000000e+00,2.526145e-82,2.519342e-28,5.745041e-05,4.090727e-101,7.511206e-190,1.278749e-77,4.092189e-251,2.520826e-126,...,0.000000e+00,6.822693e-64,1.750848e-234,0.000000e+00,1.031054e-106,0.000000e+00,1.866494e-214,8.901781e-87,5.570512e-146,0.0
ABCB1,7.358278e-207,2.526145e-82,1.000000e+00,4.163332e-97,3.391134e-83,3.246683e-168,5.875094e-237,2.103074e-163,1.326788e-297,6.602733e-183,...,0.000000e+00,1.341920e-108,2.670048e-211,0.000000e+00,5.113221e-156,0.000000e+00,6.155055e-255,5.909694e-115,1.107043e-172,0.0
ABL1,1.003905e-120,2.519342e-28,4.163332e-97,1.000000e+00,1.360996e-17,1.258828e-82,2.854102e-167,2.911306e-72,1.169317e-163,4.734115e-88,...,0.000000e+00,7.423705e-71,1.668276e-152,6.453689e-253,2.555491e-99,9.716275e-258,1.758511e-169,3.609173e-75,6.966429e-101,0.0
ABL2,2.151523e-117,5.745041e-05,3.391134e-83,1.360996e-17,1.000000e+00,6.064609e-91,3.111763e-174,1.061499e-76,3.192944e-214,1.375733e-104,...,0.000000e+00,5.615268e-64,4.207832e-207,3.509834e-308,3.879723e-95,6.103285e-310,4.387479e-194,2.741662e-82,1.155204e-128,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNHIT1,0.000000e+00,0.000000e+00,0.000000e+00,9.716275e-258,6.103285e-310,3.080042e-213,2.826370e-268,0.000000e+00,2.637649e-121,1.054127e-179,...,3.675135e-123,1.317498e-278,2.780755e-118,4.675834e-100,1.167284e-251,1.000000e+00,1.340225e-115,1.275976e-231,2.088766e-144,0.0
ZNHIT3,0.000000e+00,1.866494e-214,6.155055e-255,1.758511e-169,4.387479e-194,1.272908e-78,7.525323e-148,4.294096e-280,1.632500e-102,6.364521e-168,...,1.186637e-160,3.149273e-126,3.475242e-99,1.572431e-136,2.151460e-106,1.340225e-115,1.000000e+00,1.001090e-96,1.260025e-51,0.0
ZNRD1,4.544615e-212,8.901781e-87,5.909694e-115,3.609173e-75,2.741662e-82,9.075149e-34,4.801041e-155,2.846274e-153,1.160407e-184,1.878696e-122,...,0.000000e+00,4.396768e-15,4.286102e-118,3.151819e-231,1.057493e-82,1.275976e-231,1.001090e-96,1.000000e+00,7.204850e-47,0.0
ZWINT,7.803238e-256,5.570512e-146,1.107043e-172,6.966429e-101,1.155204e-128,5.629936e-51,2.533434e-133,1.546226e-203,6.031128e-98,1.533522e-96,...,4.009744e-186,1.923722e-74,2.220074e-77,2.075226e-162,1.166268e-82,2.088766e-144,1.260025e-51,7.204850e-47,1.000000e+00,0.0


# NSC to SMILES

In [22]:
drug_response

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
740,0.703626,-1.219032,-1.892792,-0.877267,-1.156158,0.510978,0.536910,0.589970,-0.356387,-1.460314,...,0.626838,0.507493,0.682664,-0.891882,0.499337,0.541612,-1.400771,0.602244,-1.641942,0.231533
752,0.475296,-0.312852,-1.089067,-0.441030,-0.058619,0.057507,0.125700,0.111693,-3.285729,-0.114051,...,-0.321691,0.507798,0.384102,-1.314527,-0.318444,0.557175,0.345056,-0.047731,0.155244,-0.160223
755,0.704027,-0.438857,-0.548744,-1.441942,0.496864,0.096265,-0.082186,0.417634,-1.927502,-0.372021,...,0.123647,0.543639,0.623318,-1.374212,-0.173024,0.314436,-1.002183,-0.881252,0.491364,-0.183200
762,0.547964,-1.033803,-1.399273,-0.538268,1.137432,0.135942,-0.094460,0.562628,-1.398911,-1.050409,...,-0.294845,0.934592,0.591263,-0.552673,1.797227,1.260987,0.172806,0.869675,-0.529810,-0.634413
1390,0.517269,0.960399,-1.710657,-0.260192,-0.428596,-2.369012,0.224249,-1.481654,-2.369012,-0.950229,...,0.534589,0.819455,0.091197,-0.188655,0.800745,1.468891,0.182377,-0.100527,0.519999,-0.167908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,-0.672839,-0.009522,1.593912,-0.015442,-0.336637,0.263434,0.524381,1.450999,-0.073388,1.877515,...,0.140564,-0.863786,0.409866,2.250914,0.739791,1.910757,2.324851,0.728314,1.252344,0.959549
783107,0.780763,0.353024,-1.901341,-1.812769,0.402564,0.004460,0.329454,-0.072378,-2.294693,1.159211,...,0.031677,0.463469,0.134452,0.537280,0.483558,0.484789,0.790988,0.112736,-0.212100,0.516311
784722,1.544864,-0.722438,-0.591117,-0.292344,0.491538,0.250683,0.672894,0.135474,0.555915,0.754361,...,-0.565436,0.387815,0.702974,0.713895,0.149978,-1.265221,-0.104946,0.714125,-0.574573,-1.220123
789797,-0.561025,-0.217302,-0.638256,-0.638256,1.112001,-0.638256,-0.674451,-0.638256,-0.638256,0.861387,...,-0.266452,-0.638256,-0.638256,1.989888,-0.521750,2.481620,-0.331090,1.672402,-0.602061,2.006186


In [23]:
convert = dict(pd.read_csv('../Figs/nsc_cid_smiles_class_name.csv', index_col=0)[['NSC', 'SMILES']].values)
SMILES = [convert[i] for i in drug_response.index]
SMILES

['CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)NC(CCC(=O)O)C(=O)O',
 'C1=NC2=C(N1)C(=S)N=C(N2)N',
 'C1=NC2=C(N1)C(=S)N=CN2',
 'CN(CCCl)CCCl.Cl',
 'C1=NNC2=C1C(=O)NC=N2',
 'C1(=NC(=NN1)N)N',
 'CC1C(C(=O)NC(C(=O)N2CCCC2C(=O)N(CC(=O)N(C(C(=O)O1)C(C)C)C)C)C(C)C)NC(=O)C3=C4C(=C(C=C3)C)OC5=C(C(=O)C(=C(C5=N4)C(=O)NC6C(OC(=O)C(N(C(=O)CN(C(=O)C7CCCN7C(=O)C(NC6=O)C(C)C)C)C)C(C)C)C)N)C',
 'C1=CC(=CC=C1CCCC(=O)O)N(CCCl)CCCl',
 'C1CN1P(=S)(N2CC2)N3CC3',
 'C1=CC(=CC=C1CC(C(=O)O)N)N(CCCl)CCCl.Cl',
 'C1CN1C2=NC(=NC(=N2)N3CC3)N4CC4',
 'CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)Cl)OC.Cl',
 'C1=C(C(=O)NC(=O)N1)F',
 'CC1C(C(CC(O1)OC2CC(OC(C2O)C)OC3=CC4=CC5=C(C(=O)C(C(C5)C(C(=O)C(C(C)O)O)OC)OC6CC(C(C(O6)C)O)OC7CC(C(C(O7)C)O)OC8CC(C(C(O8)C)O)(C)O)C(=C4C(=C3C)O)O)O)O',
 'C1CN(CCN1C(=O)CCBr)C(=O)CCBr',
 'C1CNP(=O)(OC1)N(CCCl)CCCl',
 'CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N',
 'C1C(C(OC1N2C=C(C(=O)NC2=O)F)CO)O',
 'C(=O)(N)NO',
 'C1=C(C(=O)NC(=O)N1)N(CCCl)CCCl',
 'C1=CC=C(C(=C1)C(C2=CC=C(C=C2)

In [24]:
params = Chem.SmilesParserParams()
params.useChirality = True
params.radicalElectrons = 2
params.removeHs = False
params.replacements = {}

mfp = []

for i in SMILES:
    mol = Chem.MolFromSmiles(i, params=params)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    mfp.append(np.array(fp))

In [25]:
# pd.DataFrame(mfp, index=drug_response.index).to_csv('../data/mfp.csv')

In [26]:
drug_sim = rbf_kernel(mfp)
drug_sim = pd.DataFrame(drug_sim, index=drug_response.index)
drug_sim.to_csv('../data/drug_sim.csv')
drug_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,259,260,261,262,263,264,265,266,267,268
740,1.000000,0.968287,0.968287,0.969707,0.968287,0.970654,0.944011,0.970654,0.967814,0.969707,...,0.953741,0.957006,0.953741,0.957474,0.952810,0.950487,0.957006,0.939413,0.960283,0.957474
752,0.968287,1.000000,0.993187,0.981137,0.985458,0.986903,0.952345,0.974453,0.983055,0.974453,...,0.967814,0.964512,0.967814,0.967814,0.966870,0.959814,0.970180,0.947706,0.965926,0.964983
755,0.968287,0.993187,1.000000,0.984016,0.988350,0.985940,0.953275,0.977312,0.986903,0.976358,...,0.970654,0.966398,0.970654,0.971602,0.969707,0.962630,0.973977,0.951415,0.969707,0.968760
762,0.969707,0.981137,0.984016,1.000000,0.982096,0.986421,0.953741,0.985458,0.988350,0.985458,...,0.969233,0.967814,0.969233,0.971128,0.973027,0.962160,0.974453,0.950951,0.969233,0.968287
1390,0.968287,0.985458,0.988350,0.982096,1.000000,0.984977,0.953275,0.976358,0.983055,0.975405,...,0.972552,0.964512,0.972552,0.972552,0.968760,0.962630,0.976835,0.949559,0.967814,0.966870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,0.950487,0.959814,0.962630,0.962160,0.962630,0.962160,0.937580,0.956539,0.963100,0.955605,...,0.952810,0.952345,0.952810,0.960283,0.954673,1.000000,0.954207,0.938496,0.955605,0.952810
783107,0.957006,0.970180,0.973977,0.974453,0.976835,0.976358,0.947706,0.968760,0.975405,0.967814,...,0.968760,0.958877,0.968760,0.963100,0.968760,0.954207,1.000000,0.946781,0.964041,0.960283
784722,0.939413,0.947706,0.951415,0.950951,0.949559,0.950951,0.931193,0.949095,0.952810,0.947243,...,0.941709,0.947706,0.941709,0.947243,0.951880,0.938496,0.946781,1.000000,0.947243,0.950023
789797,0.960283,0.965926,0.969707,0.969233,0.967814,0.969233,0.944472,0.965455,0.969233,0.965455,...,0.957006,0.957474,0.957006,0.964512,0.961691,0.955605,0.964041,0.947243,1.000000,0.959814


# Unified Graph

In [27]:
A_cg

Unnamed: 0,A2M,AAK1,ABCB1,ABL1,ABL2,ABRACL,ACAT1,ACKR3,ACLY,ACO2,...,ZNF207,ZNF22,ZNF580,ZNF593,ZNF706,ZNHIT1,ZNHIT3,ZNRD1,ZWINT,ZYX
MCF7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.267045,0.173049,0.072213,...,0.0,0.0,0.0,0.0,0.0,1.583642,0.133603,0.091624,0.544875,0.0
MDA_MB_231,0.0,0.473487,0.0,0.0,0.740272,1.908655,2.660994,0.0,0.505949,0.0,...,0.0,2.746291,0.0,0.0,3.785998,0.0,1.007383,0.971839,0.992416,0.0
HS578T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.482635
BT_549,0.0,0.0,0.0,3.612744,2.450958,0.0,0.0,0.217375,0.0,0.0,...,0.0,0.0,2.169666,1.576875,0.0,0.015607,0.0,0.0,0.0,0.0
T47D,0.0,0.0,0.0,0.0,0.0,0.0,0.505403,0.0,0.514826,0.0,...,0.681187,0.0,0.0,0.0,0.0,0.0,1.302598,0.501845,0.822386,0.0
SF_268,0.0,0.0,0.0,0.0,0.0,0.0,2.46726,0.0,0.0,0.0,...,0.0,0.0,0.0,0.189922,0.0,0.0,0.0,0.0,0.0,0.0
SF_295,0.0,3.111413,0.0,0.143409,0.0,0.0,0.006969,0.0,2.953429,0.472541,...,3.527697,0.0,1.714861,2.626495,0.276589,4.171303,2.326429,0.19725,0.646189,2.257423
SF_539,0.0,1.318448,0.0,3.693177,1.489899,0.0,0.0,5.222371,0.213884,0.0,...,0.106188,0.0,0.502951,0.0,0.0,0.0,0.0,0.0,0.0,0.996058
SNB_19,0.0,0.0,0.0,0.161283,0.0,0.0,0.0,0.227219,0.0,0.0,...,0.0,0.007357,0.376203,0.0,0.0,0.617716,0.0,0.0,0.0,0.0
SNB_75,7.334074,0.968098,0.0,2.238236,0.317406,0.0,0.0,0.181788,0.0,0.0,...,0.0,0.0,0.101015,0.0,0.0,0.345189,0.0,0.0,0.0,0.261171


In [28]:
indexes = list(A_dc.index) + list(A_cg.index) + list(A_dg.columns)
n_all = len(indexes)
base = pd.DataFrame(np.zeros([n_all, n_all]), index=indexes, columns=indexes)
base

Unnamed: 0,740,752,755,762,1390,1895,3053,3088,6396,8806,...,ZNF207,ZNF22,ZNF580,ZNF593,ZNF706,ZNHIT1,ZNHIT3,ZNRD1,ZWINT,ZYX
740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNHIT1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNHIT3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNRD1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZWINT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
base.loc[A_cg.index, A_cg.columns] = A_cg
base.loc[A_cg.columns, A_cg.index] = A_cg.T
base.loc[A_dc.index, A_dc.columns] = A_dc
base.loc[A_dc.columns, A_dc.index] = A_dc.T
base.loc[A_dg.index, A_dg.columns] = A_dg
base.loc[A_dg.columns, A_dg.index] = A_dg.T

In [30]:
np.save('../data/idxs.npy', pd.DataFrame([list(range(len(base.index))), base.index]).values)

In [31]:
# edge_index = np.array(base.values.nonzero())
# edge_index

In [32]:
# np.save('edges.npy', edge_index)

# Create train, test, val data

In [33]:
drug_response

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
740,0.703626,-1.219032,-1.892792,-0.877267,-1.156158,0.510978,0.536910,0.589970,-0.356387,-1.460314,...,0.626838,0.507493,0.682664,-0.891882,0.499337,0.541612,-1.400771,0.602244,-1.641942,0.231533
752,0.475296,-0.312852,-1.089067,-0.441030,-0.058619,0.057507,0.125700,0.111693,-3.285729,-0.114051,...,-0.321691,0.507798,0.384102,-1.314527,-0.318444,0.557175,0.345056,-0.047731,0.155244,-0.160223
755,0.704027,-0.438857,-0.548744,-1.441942,0.496864,0.096265,-0.082186,0.417634,-1.927502,-0.372021,...,0.123647,0.543639,0.623318,-1.374212,-0.173024,0.314436,-1.002183,-0.881252,0.491364,-0.183200
762,0.547964,-1.033803,-1.399273,-0.538268,1.137432,0.135942,-0.094460,0.562628,-1.398911,-1.050409,...,-0.294845,0.934592,0.591263,-0.552673,1.797227,1.260987,0.172806,0.869675,-0.529810,-0.634413
1390,0.517269,0.960399,-1.710657,-0.260192,-0.428596,-2.369012,0.224249,-1.481654,-2.369012,-0.950229,...,0.534589,0.819455,0.091197,-0.188655,0.800745,1.468891,0.182377,-0.100527,0.519999,-0.167908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,-0.672839,-0.009522,1.593912,-0.015442,-0.336637,0.263434,0.524381,1.450999,-0.073388,1.877515,...,0.140564,-0.863786,0.409866,2.250914,0.739791,1.910757,2.324851,0.728314,1.252344,0.959549
783107,0.780763,0.353024,-1.901341,-1.812769,0.402564,0.004460,0.329454,-0.072378,-2.294693,1.159211,...,0.031677,0.463469,0.134452,0.537280,0.483558,0.484789,0.790988,0.112736,-0.212100,0.516311
784722,1.544864,-0.722438,-0.591117,-0.292344,0.491538,0.250683,0.672894,0.135474,0.555915,0.754361,...,-0.565436,0.387815,0.702974,0.713895,0.149978,-1.265221,-0.104946,0.714125,-0.574573,-1.220123
789797,-0.561025,-0.217302,-0.638256,-0.638256,1.112001,-0.638256,-0.674451,-0.638256,-0.638256,0.861387,...,-0.266452,-0.638256,-0.638256,1.989888,-0.521750,2.481620,-0.331090,1.672402,-0.602061,2.006186


In [34]:
df = pd.DataFrame()
for i in drug_response.columns:
    tmp = pd.DataFrame(drug_response[i]).reset_index().dropna()
    tmp['cell'] = [i] * len(tmp)
    tmp.columns = ['Drug', 'Value', 'Cell']
    tmp = tmp[['Drug', 'Cell', 'Value']]
    df = pd.concat([df, tmp])

In [35]:
df

Unnamed: 0,Drug,Cell,Value
0,740,MCF7,0.703626
1,752,MCF7,0.475296
2,755,MCF7,0.704027
3,762,MCF7,0.547964
4,1390,MCF7,0.517269
...,...,...,...
264,772992,UO_31,0.959549
265,783107,UO_31,0.516311
266,784722,UO_31,-1.220123
267,789797,UO_31,2.006186


In [36]:
X = df[['Drug', 'Cell']]
y = np.array(df['Value'] > 0, dtype=float)

In [37]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)

In [38]:
print("Train:")
print(X_train.shape, y_train.shape)
print("Test:")
print(X_test.shape, y_test.shape)
print("Val:")
print(X_val.shape, y_val.shape)

Train:
(9684, 2) (9684,)
Test:
(3228, 2) (3228,)
Val:
(3228, 2) (3228,)


In [39]:
X_train.to_csv('../data/train.csv', index=False)
X_test.to_csv('../data/test.csv', index=False)
X_val.to_csv('../data/val.csv', index=False)

np.save('../data/rain_labels.npy', y_train)
np.save('../data/test_labels.npy', y_test)
np.save('../data/val_labels.npy', y_val)