### Machine learning based drug sensitivity prediction
We will predict drug sensitivity in the GDSC dataset using Random Forest regression. First, we create the cell-specific and drug-specific features.

In [1]:
import pandas as pd
import numpy as np

from subprocess import call
import pickle
from cmapPy.pandasGEXpress.parse import parse

from sklearn.ensemble import RandomForestRegressor as RFR
from scipy.stats import pearsonr as pcor

In [63]:
#histology
# Build a binary cell line x tissue-descriptor matrix from the GDSC cell line
# annotation sheet and store it transposed (descriptors as rows).
# `sep` is not an option of read_excel (recent pandas raises TypeError when it
# is passed), so it is no longer given. .iloc[:-1] drops the last row of the
# sheet (presumably a summary/footer row - verify against the file).
data=pd.read_excel('../data/GDSC/Cell_Lines_Details.xlsx',header=0,index_col=[0]).iloc[:-1,:]
# COSMIC ids are used as string labels from here on
data['COSMIC identifier']=data['COSMIC identifier'].astype(int).astype(str)
data.index=data['COSMIC identifier']
tissue_cols=['GDSC\nTissue descriptor 1','GDSC\nTissue\ndescriptor 2']
# one feature column per distinct value found in either descriptor column
features=list(set(data[tissue_cols[0]])|set(data[tissue_cols[1]]))
features=pd.DataFrame(0,index=data.index,columns=features)
for cosmic in features.index:
    # flag both descriptors of this cell line
    features.loc[cosmic,data.loc[cosmic,tissue_cols]]=1
features.T.to_csv('../data/GDSC/ML/cell_features/histology.csv',sep=',')

In [64]:
#progeny features for pathway activity
# Pathway scores are the matrix product of the expression table and the
# PROGENy weight table, restricted to the row labels (genes) present in both.
data=pd.read_csv('../data/GDSC/norm_gex.csv',sep=',',header=0,index_col=0)
progeny=pd.read_csv('../data/Functional/PROGENy.csv',sep=',',header=0,index_col=0)
genes=list(set(data.index)&set(progeny.index))
# rows: columns of norm_gex.csv, columns: shared genes
expression_shared=data.T[genes]
# rows: shared genes, columns: PROGENy columns
weights_shared=progeny.loc[genes]
progeny_scores=pd.DataFrame(
    np.dot(expression_shared,weights_shared),
    index=data.columns,
    columns=progeny.columns,
)
# stored transposed: one row per PROGENy column
progeny_scores.T.to_csv('../data/GDSC/ML/cell_features/progeny.csv',sep=',')

In [58]:
#dorothea tf activities
# Run the companion R script and stop if it exits with a non-zero status, so
# that the following cells never continue after a failed run.
# NOTE(review): the script presumably writes cell_features/dorothea.csv, which
# the merge step reads - confirm against GDSC_TF_activity.R.
status=call(['Rscript','GDSC_TF_activity.R'])
if status!=0:
    raise RuntimeError('GDSC_TF_activity.R failed with exit status %d' % status)
status

0

In [101]:
#merge cell features
# Stack histology, PROGENy and DoRothEA features for the cell lines present in
# all three tables and write one cell line x feature matrix.
progeny=pd.read_csv('../data/GDSC/ML/cell_features/progeny.csv',sep=',',header=0,index_col=0)
histology=pd.read_csv('../data/GDSC/ML/cell_features/histology.csv',sep=',',header=0,index_col=0)
dorothea=pd.read_csv('../data/GDSC/ML/cell_features/dorothea.csv',sep=',',header=0,index_col=0)

# DoRothEA row labels: keep the part before ' - ', retain only labels whose
# last character is A or B, then cut the final two characters of the label.
short_labels=[label.split(' - ')[0] for label in dorothea.index]
dorothea.index=short_labels
keep_row=np.array([label[-1] in ['A','B'] for label in short_labels],dtype=bool)
dorothea=dorothea[keep_row]
dorothea.index=[label[:-2] for label in dorothea.index]

# cell lines (columns) shared by the three feature tables
cosmics=list(set(progeny.columns)&set(dorothea.columns)&set(histology.columns))

# prefix the feature names with their source, then keep the shared columns
progeny.index=['PRO_'+name for name in progeny.index]
dorothea.index=['TF_'+name for name in dorothea.index]
histology.index=['HIST_'+name for name in histology.index]
histology=histology[cosmics]
progeny=progeny[cosmics]
dorothea=dorothea[cosmics]

features=pd.concat([histology,progeny,dorothea]).T
features.to_csv('../data/GDSC/ML/cell_features/cell_features.csv',sep=',')

In [154]:
#drug target and pathway
# One-hot encode the annotated targets and the target pathway of every
# screened compound (rows are DRUG_IDs).
data=pd.read_excel('../data/GDSC/Screened_Compounds.xlsx')
data.index=data['DRUG_ID']
#correct some bad annotations
correct={'HDAC1-10, EGFR, ERBB2':'HDAC1, HDAC2, HDAC3, HDAC6, HDAC8, EGFR, ERBB2',
        'HDAC1 ':'HDAC1','HDAC,RAR':'HDAC1, RAR',
        'HDAC inhibitor Class I, IIa, IIb, IV':'HDAC1, HDAC2, HDAC3, HDAC6, HDAC8',
        'CDK1,CDK2,CDK5,CDK7,CDK9, PKC':'CDK1, CDK2, CDK5, CDK7, CDK9, PKC',
        'Antimetabolite (DNA & RNA)':'Antimetabolite',
        'CSF1R, KIT,  PDGFRA, PDGFRB':'CSF1R, KIT, PDGFRA, PDGFRB',
        'VEGFR, MET, RET, KIT, FLT1, FLT3, FLT4, TIE2,AXL':'VEGFR, MET, RET, KIT, FLT1, FLT3, FLT4, TIE2, AXL'}
# replace the annotations listed above, leave every other one untouched
data['TARGET']=[correct.get(annotation,annotation) for annotation in data['TARGET']]

# distinct targets (annotations are ', '-separated lists) and pathways
targets=list({name for annotation in data['TARGET'] for name in annotation.split(', ')})
pathways=list(set(data['TARGET_PATHWAY']))

target_feat=pd.DataFrame(0,index=data.index,columns=targets)
pathway_feat=pd.DataFrame(0,index=data.index,columns=pathways)
for drug_id,annotation,pathway_name in zip(data.index,data['TARGET'],data['TARGET_PATHWAY']):
    target_feat.loc[drug_id,annotation.split(', ')]=1
    pathway_feat.loc[drug_id,pathway_name]=1

# drop the unspecific catch-all columns
for unspecific in ['others','not defined']:
    del target_feat[unspecific]
del pathway_feat['Other']

target_feat.to_csv('../data/GDSC/ML/drug_features/target.csv',sep=',')
pathway_feat.to_csv('../data/GDSC/ML/drug_features/pathway.csv',sep=',')

In [10]:
#signatures
# Collect the 24 h LINCS Level 5 signatures (landmark genes only) of every BRD
# id matched to a GDSC drug, from both GSE92742 and GSE70138, and store the
# signatures together with their metadata.
# NOTE(review): pickle.load can execute code embedded in the file - only use a
# trusted l1000_gdsc_match.pkl.
# The context manager closes the file even if unpickling fails.
with open('../data/GDSC/l1000_gdsc_match.pkl','rb') as fin:
    gdsc_l1000_matching=pickle.load(fin,encoding='latin1')
# all BRD ids matched to any GDSC drug
brds=[]
for i in gdsc_l1000_matching:
    brds+=gdsc_l1000_matching[i]
brds=list(set(brds))
#select relevant signatures
# np.isin replaces the deprecated np.in1d; both give the same mask for 1-D input
sig_info_gse92742=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_sig_info.txt',
                                sep='\t',header=0,index_col=0,low_memory=False)
fil=np.isin(sig_info_gse92742['pert_itime'],['24 h'])
sig_info_gse92742=sig_info_gse92742[fil]
sig_info_gse70138=pd.read_csv('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info.txt',
                              sep='\t',header=0,index_col=0,low_memory=False)
fil=np.isin(sig_info_gse70138['pert_itime'],['24 h'])
sig_info_gse70138=sig_info_gse70138[fil]
fil=np.isin(sig_info_gse92742['pert_id'],brds)
sig_info_gse92742=sig_info_gse92742[fil]
fil=np.isin(sig_info_gse70138['pert_id'],brds)
sig_info_gse70138=sig_info_gse70138[fil]
sig_ids_gse70138=list(sig_info_gse70138.index)
sig_ids_gse92742=list(sig_info_gse92742.index)
#read landmark genes
gene_info=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',sep='\t')
fil=gene_info.loc[:,'pr_is_lm']==1
gene_ids = list(gene_info.loc[gene_info.index[fil],'pr_gene_id'].astype(str))
#read signatures
# only the selected signature columns and landmark gene rows are loaded;
# the result is transposed to signatures x genes
signatures_gse92742=parse('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                          cid=sig_ids_gse92742,rid=gene_ids)
signatures_gse92742=signatures_gse92742.data_df.T
signatures_gse70138=parse('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx',
                          cid=sig_ids_gse70138,rid=gene_ids)
signatures_gse70138=signatures_gse70138.data_df.T
signatures_gse70138.to_csv('../data/GDSC/ML/signatures_gse70138.csv',sep=',')
signatures_gse92742.to_csv('../data/GDSC/ML/signatures_gse92742.csv',sep=',')
#order everything
# same gene order in both datasets, same row order in signatures and metadata
signatures_gse70138=signatures_gse70138.loc[:,signatures_gse92742.columns]
sig_info_gse70138=sig_info_gse70138.loc[:,['pert_id','cell_id','pert_itime']]
sig_info_gse92742=sig_info_gse92742.loc[:,['pert_id','cell_id','pert_itime']]
signatures=pd.concat([signatures_gse70138,signatures_gse92742])
sig_info=pd.concat([sig_info_gse70138,sig_info_gse92742])
sig_info=sig_info.loc[signatures.index,:]

sig_info.to_csv('../data/GDSC/ML/sig_info.csv',sep=',')
signatures.to_csv('../data/GDSC/ML/signatures.csv',sep=',')

In [18]:
# Genes on which calc_MODZ computes the replicate correlations: the landmark
# gene ids collected in the signature extraction cell.
LM_GENES=gene_ids
# Spearman rank correlation, used below under the short alias `scor`.
from scipy.stats import spearmanr as scor
def calc_MODZ(data):
    """Collapse replicate signatures into one consensus (MODZ) signature.

    Based on the weighting used in the original CMAP/L1000 study: every
    replicate is weighted by the sum of its Spearman correlations with the
    other replicates (negative correlations are replaced by 0.01) and the
    weighted average of the replicates is returned. The correlations are
    computed on the landmark genes only (global ``LM_GENES``); the average
    is taken over all columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per replicate signature, one column per gene. With more
        than two rows it must contain every column listed in ``LM_GENES``.

    Returns
    -------
    pandas.Series
        Consensus signature indexed by ``data.columns``. A Series is
        returned for every number of replicates, so the result can always
        be assigned to a single DataFrame row.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    n_replicates=len(data)
    if n_replicates==0:
        raise ValueError('calc_MODZ needs at least one signature')
    if n_replicates==1:
        # a single replicate is its own consensus; hand it back as a Series
        # rather than as a one-row DataFrame
        return data.iloc[0]
    if n_replicates==2:
        # two replicates give a single correlation value, so both would get
        # the same weight anyway: use the plain mean
        return data.mean(axis=0)
    # replicate x replicate Spearman correlation on the landmark genes
    CM=np.array(scor(data[LM_GENES].T)[0])
    CM[CM<0]=0.01
    # subtract the self-correlation of 1 on the diagonal
    weights=np.sum(CM,1)-1
    weights=weights/np.sum(weights)
    return pd.Series(np.dot(data.T.values,weights),index=data.columns)

In [14]:
#calculating consensus signatures
# One consensus signature per GDSC drug: all stored signatures whose pert_id
# is among the BRD ids matched to the drug are collapsed with calc_MODZ.
sig_info=pd.read_csv('../data/GDSC/ML/sig_info.csv',sep=',',header=0,index_col=0)
signatures=pd.read_csv('../data/GDSC/ML/signatures.csv',sep=',',header=0,index_col=0)
# metadata and signatures must list the same signatures in the same order
assert (sig_info.index==signatures.index).all()
features=pd.DataFrame(index=list(gdsc_l1000_matching.keys()),columns=signatures.columns)
for gdsc_drug,brds in gdsc_l1000_matching.items():
    fil=np.in1d(sig_info['pert_id'],brds)
    drug_signatures=signatures.loc[sig_info.index[fil]]
    features.loc[gdsc_drug]=calc_MODZ(drug_signatures)
features.to_csv('../data/GDSC/ML/drug_features/signatures.csv',sep=',')

In [32]:
#PCA and cor matrix
# Two derived drug feature sets from the consensus signatures in `features`:
# the first 40 principal component scores and the drug x drug correlation
# matrix computed with `scor`.
from sklearn.decomposition import PCA
model=PCA(n_components=40)
pca_scores=model.fit_transform(features)
features_pca=pd.DataFrame(pca_scores,index=features.index,columns=range(40))
# drugs become the variables after transposing, giving a drug x drug matrix
drug_correlation=scor(features.T)[0]
features_sim=pd.DataFrame(drug_correlation,index=features.index,columns=features.index)
features_pca.to_csv('../data/GDSC/ML/drug_features/signatures_pca.csv',sep=',')
features_sim.to_csv('../data/GDSC/ML/drug_features/signatures_sim.csv',sep=',')

In [103]:
#fingerprints
# Assign every GDSC drug the fingerprint bits of one of its matched BRD ids
# and compute the drug x drug similarity of these bit vectors.
fingerprints=pd.read_table('../data/Chemical/LINCS_PertID_SMILES_MorganFP_256bits.csv',
                       sep=',',header=0,index_col=[0])
fingerprints.index=fingerprints['CompoundNames']
del fingerprints['CompoundNames']
fingerprints=fingerprints.astype(int)
# NOTE(review): pickle.load can execute code embedded in the file - only use a
# trusted l1000_gdsc_match.pkl.
# The context manager closes the file even if unpickling fails.
with open('../data/GDSC/l1000_gdsc_match.pkl','rb') as fin:
    gdsc_l1000_matching=pickle.load(fin,encoding='latin1')
brds=[]
for i in gdsc_l1000_matching:
    brds+=gdsc_l1000_matching[i]
# matched BRD ids for which a fingerprint exists
brds=list(set(brds)&set(fingerprints.index))
fingerprints=fingerprints.loc[brds]
features=pd.DataFrame(index=list(gdsc_l1000_matching.keys()),columns=fingerprints.columns)
available=set(brds)
for drug in gdsc_l1000_matching:
    # Use the first matched BRD id that actually has a fingerprint. Taking
    # element [0] unconditionally failed whenever that particular id was
    # missing from the fingerprint table although a later one was present.
    brd=next((b for b in gdsc_l1000_matching[drug] if b in available),None)
    if brd is None:
        raise KeyError('no fingerprint available for GDSC drug %s' % drug)
    features.loc[drug]=fingerprints.loc[brd]
features.to_csv('../data/GDSC/ML/drug_features/fingerprints.csv',sep=',')
features=features.astype(bool)
features_sim=pd.DataFrame(index=features.index,columns=features.index)
for drug in features_sim.index:
    # shared set bits divided by bits set in either fingerprint
    features_sim.loc[drug]=np.sum(features.loc[drug]&features,1)/np.sum(features.loc[drug]|features,1)
features_sim.to_csv('../data/GDSC/ML/drug_features/fingerprints_sim.csv',sep=',')

In [169]:
#drug response
# Load the fitted dose-response table. The following cells read its DRUG_ID,
# COSMIC_ID, AUC and LN_IC50 columns.
data=pd.read_excel('../data/GDSC/v17.3_fitted_dose_response.xlsx')

In [170]:
# Reload the merged cell line feature matrix written by the merge step
# (first CSV column is used as the row index).
cell_feat=pd.read_csv('../data/GDSC/ML/cell_features/cell_features.csv',sep=',',
                     header=0,index_col=0)

In [171]:
# Reload every drug feature table written above; in each file the first CSV
# column is used as the row index.
target=pd.read_csv('../data/GDSC/ML/drug_features/target.csv',sep=',',header=0,
                  index_col=0)
pathway=pd.read_csv('../data/GDSC/ML/drug_features/pathway.csv',sep=',',header=0,index_col=0)
signatures=pd.read_csv('../data/GDSC/ML/drug_features/signatures.csv',sep=',',header=0,index_col=0)
# NOTE: the misspelled names `singatures_pca` / `singatures_sim` are used
# again when the filtered tables are written back, so they must stay in sync.
singatures_pca=pd.read_csv('../data/GDSC/ML/drug_features/signatures_pca.csv',sep=',',header=0,index_col=0)
singatures_sim=pd.read_csv('../data/GDSC/ML/drug_features/signatures_sim.csv',sep=',',header=0,index_col=0)
fingerprints=pd.read_csv('../data/GDSC/ML/drug_features/fingerprints.csv',sep=',',header=0,index_col=0)
fingerprints_sim=pd.read_csv('../data/GDSC/ML/drug_features/fingerprints_sim.csv',sep=',',header=0,index_col=0)

In [172]:
# cell lines that were screened and have cell features
screened_cells=set(data['COSMIC_ID'])
cosmics=list(screened_cells&set(cell_feat.index))
# drugs that were screened and are present in all four drug feature tables
screened_drugs=set(data['DRUG_ID'])
drug_ids=list(
    screened_drugs
    &set(target.index)
    &set(pathway.index)
    &set(fingerprints.index)
    &set(signatures.index)
)

In [173]:
# Keep the measurements whose drug and cell line both have features, then
# export AUC and ln(IC50) as two tables with a common RESPONSE column.
drug_has_features=np.in1d(data['DRUG_ID'],drug_ids)
cell_has_features=np.in1d(data['COSMIC_ID'],cosmics)
fil=drug_has_features&cell_has_features
data=data[fil]
id_columns=['DRUG_ID','COSMIC_ID']
data_auc=data[id_columns+['AUC']].rename(columns={'AUC':'RESPONSE'})
data_ic50=data[id_columns+['LN_IC50']].rename(columns={'LN_IC50':'RESPONSE'})
data_auc.to_csv('../data/GDSC/ML/response/response_auc.csv',sep=',')
data_ic50.to_csv('../data/GDSC/ML/response/response_ic50.csv',sep=',')

In [174]:
# Overwrite each feature file with only the rows of the retained cell lines
# (cell features) or retained drugs (all drug feature tables).
cell_feat.loc[cosmics].to_csv('../data/GDSC/ML/cell_features/cell_features.csv',sep=',')
drug_tables=[
    (target,'../data/GDSC/ML/drug_features/target.csv'),
    (pathway,'../data/GDSC/ML/drug_features/pathway.csv'),
    (signatures,'../data/GDSC/ML/drug_features/signatures.csv'),
    (singatures_pca,'../data/GDSC/ML/drug_features/signatures_pca.csv'),
    (singatures_sim,'../data/GDSC/ML/drug_features/signatures_sim.csv'),
    (fingerprints,'../data/GDSC/ML/drug_features/fingerprints.csv'),
    (fingerprints_sim,'../data/GDSC/ML/drug_features/fingerprints_sim.csv'),
]
for drug_table,out_path in drug_tables:
    drug_table.loc[drug_ids].to_csv(out_path,sep=',')


Now we can start the prediction.

In [175]:
def read_data(drug_feature_type='target',response_type='auc'):
    """Load a response table and build the matching feature matrix.

    Parameters
    ----------
    drug_feature_type : str
        Base name of the drug feature file to use, i.e.
        ``../data/GDSC/ML/drug_features/<drug_feature_type>.csv``.
    response_type : str
        Suffix of the response file, i.e.
        ``../data/GDSC/ML/response/response_<response_type>.csv``.

    Returns
    -------
    response : pandas.DataFrame
        The response table as stored (its DRUG_ID and COSMIC_ID columns are
        used for the look-ups below).
    features : pandas.DataFrame
        One row per response row and with the same index: the cell features
        of the row's COSMIC_ID followed by the drug features of its DRUG_ID.
    """
    response=pd.read_csv('../data/GDSC/ML/response/response_%s.csv' % response_type,sep=',',header=0,index_col=0)
    cell_feat=pd.read_csv('../data/GDSC/ML/cell_features/cell_features.csv',sep=',',header=0,index_col=0)
    drug_feat=pd.read_csv('../data/GDSC/ML/drug_features/%s.csv' % drug_feature_type,sep=',',header=0,index_col=0)
    # repeat the feature rows once per measurement and give them the index of
    # the response table so that both blocks line up row by row
    cell_feat=cell_feat.loc[response['COSMIC_ID'].values]
    cell_feat.index=response.index
    drug_feat=drug_feat.loc[response['DRUG_ID'].values]
    drug_feat.index=response.index
    # axis must be given by keyword: pandas 2.x no longer accepts it positionally
    features=pd.concat([cell_feat,drug_feat],axis=1)
    return response,features

In [176]:
# Drug x target table (first CSV column is the row index). make_split reads it
# as a global and tests its entries against 1 to find the targets of a drug.
TARGET_MATRIX=pd.read_csv('../data/GDSC/ML/drug_features/target.csv',sep=',',header=0,
                  index_col=0)
def make_split(data,s,split_type='random'):
    """Split the rows of ``data`` into training and test rows by drug.

    All rows of one drug always end up on the same side. Drug target
    annotations are taken from the global ``TARGET_MATRIX`` (drug x target,
    entries equal to 1 mark a target).

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``DRUG_ID`` column.
    s : int
        Seed; NumPy's global random generator is re-seeded with it.
    split_type : {'random', 'same_target', 'diff_target'}
        * ``'random'``: half of the drugs (rounded down) are drawn at random
          for training.
        * ``'same_target'``: a random drug is picked together with all
          remaining drugs sharing at least one target with it; half of such
          a group (rounded down) goes to training, the rest to test. Groups
          of a single drug go to training.
        * ``'diff_target'``: a random drug is picked and the group of
          remaining drugs connected to it through shared targets is moved to
          training as a whole; this is repeated until at most half of the
          drugs are left, which form the test set.

    Returns
    -------
    tr, ts : pandas.Index
        Index labels of the training and the test rows of ``data``.

    Raises
    ------
    ValueError
        If ``split_type`` is not one of the three supported values.
    """
    if split_type not in ('random','same_target','diff_target'):
        raise ValueError('unknown split_type: %r' % (split_type,))
    np.random.seed(s)
    drugs=list(set(data.loc[:,'DRUG_ID']))
    if split_type=='random':
        tr_drugs=np.random.choice(drugs,int(len(drugs)/2),False)
    elif split_type=='same_target':
        tr_drugs=[]
        while len(drugs)>0:
            drug=np.random.choice(drugs,1)[0]
            fil=TARGET_MATRIX.loc[drug,:]==1
            targets_round=list(TARGET_MATRIX.columns[fil])
            # remaining drugs hitting at least one target of the picked drug
            fil=TARGET_MATRIX.loc[:,targets_round].sum(axis=1)>0
            drugs_round=list(set(TARGET_MATRIX.index[fil])&set(drugs))
            if drug not in drugs_round:
                # a drug without any annotated target matches nothing, not
                # even itself; handle it as its own group, otherwise it
                # would never leave `drugs` and the loop would not end
                drugs_round.append(drug)
            if len(drugs_round)>1:
                tr_drugs+=list(np.random.choice(drugs_round,int(len(drugs_round)/2),False))
            else:
                tr_drugs+=drugs_round
            drugs=list(set(drugs)-set(drugs_round))
    else:
        tr_drugs=[]
        l=len(drugs)
        while len(drugs)>l/2:
            drug=np.random.choice(drugs,1)[0]
            fil=TARGET_MATRIX.loc[drug,:]==1
            targets_round=list(TARGET_MATRIX.columns[fil])
            targets_round_prev=[]
            drugs_round=[drug]
            # grow the group until the set of covered targets stops changing
            while set(targets_round)!=set(targets_round_prev):
                targets_round_prev=list(targets_round)
                fil=TARGET_MATRIX.loc[:,targets_round].sum(axis=1)>0
                drugs_round=list(set(TARGET_MATRIX.index[fil])&set(drugs))
                fil=TARGET_MATRIX.loc[drugs_round,:].sum(axis=0)>0
                targets_round=TARGET_MATRIX.columns[fil]
            if drug not in drugs_round:
                # drug without annotated targets: the loop above did not
                # run, so it forms a group of its own
                drugs_round.append(drug)
            tr_drugs+=drugs_round
            drugs=list(set(drugs)-set(drugs_round))
    # every row of a training drug is a training row, all others are test rows
    fil=np.isin(data.loc[:,'DRUG_ID'],tr_drugs)
    tr=data.index[fil]
    ts=data.index[~fil]
    return tr,ts

In [177]:
def make_prediction(response,features,tr,ts,n_estimators=50,n_jobs=11,random_state=None):
    """Fit a random forest on the training rows and predict the test rows.

    Parameters
    ----------
    response : pandas.DataFrame
        Table with a ``RESPONSE`` column, indexed like ``features``.
    features : pandas.DataFrame
        Feature matrix with one row per row of ``response``.
    tr, ts : index labels
        Labels of the training and the test rows.
    n_estimators : int, optional
        Number of trees passed to the regressor (default 50, the value that
        was previously hard-coded).
    n_jobs : int, optional
        Number of parallel jobs passed to the regressor (default 11, the
        value that was previously hard-coded); lower it on smaller machines.
    random_state : optional
        Passed to the regressor unchanged; the default ``None`` keeps the
        previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of the test rows of ``response`` with an added ``Predicted``
        column holding the model output.
    """
    model=RFR(n_estimators=n_estimators,n_jobs=n_jobs,random_state=random_state)
    model.fit(features.loc[tr,:],response.loc[tr,'RESPONSE'])
    # copy so that the caller's response table is not modified
    results=response.loc[ts,:].copy()
    y_pr=model.predict(features.loc[ts,:])
    results['Predicted']=y_pr
    return results

In [231]:
# Run the full grid: response type x split strategy x drug feature set, each
# with 20 seeds, and store the test-set predictions of every run in its own
# CSV named <response>_<split>_<drug features>_<seed>.csv.
response_types=['auc','ic50']
split_types=['random','same_target','diff_target']
drug_feature_sets=['target','pathway','fingerprints','signatures_pca']
for resp in response_types:
    for split in split_types:
        for df in drug_feature_sets:
            # the tables only depend on response type and feature set,
            # so they are loaded once for all seeds
            data,features=read_data(df,response_type=resp)
            for s in range(20):
                tr,ts=make_split(data,s,split)
                results=make_prediction(data,features,tr,ts)
                rname='%s_%s_%s_%s' % (resp,split,df,s)
                results.to_csv('../results/GDSC/%s.csv' % rname,sep=',')

random target 0.472204734219 0.412156358878
random pathway 0.333446264452 0.395757030069
random fingerprints 0.235288938269 0.388158851678
random fingerprints_sim 0.457833915742 0.410212817378
random signatures_pca 0.556443298478 0.382209129066
random signatures_sim 0.453401503881 0.389723876593
same_target target 0.614074077636 0.470444210724
same_target pathway 0.557011874928 0.472560708842
same_target fingerprints 0.193678072803 0.380082422058
same_target fingerprints_sim 0.20762474886 0.419347263335
same_target signatures_pca 0.552093921308 0.39703378354
same_target signatures_sim 0.480424867998 0.412495558792
diff_target target 0.203392307531 0.417014795846
diff_target pathway 0.27057253416 0.410258632343
diff_target fingerprints 0.167917537525 0.417956297545
diff_target fingerprints_sim -0.00496247679401 0.395272783966
diff_target signatures_pca 0.395599773684 0.389644723629
diff_target signatures_sim 0.426905114335 0.375029364524
