### Machine learning based drug sensitivity prediction
We predict drug sensitivity in the GDSC dataset using Random Forest regression. First, we create cell-specific and drug-specific features.

In [155]:
import pandas as pd
import numpy as np
from subprocess import call
import pickle

In [63]:
#histology
# One-hot encode the two GDSC tissue descriptors of every cell line and write
# the matrix with descriptors as rows and COSMIC identifiers as columns.
tissue_cols = ['GDSC\nTissue descriptor 1', 'GDSC\nTissue\ndescriptor 2']
# `sep` is a CSV-only option that read_excel does not accept, so it is not
# passed here. iloc[:-1] drops the last row of the sheet (presumably a
# summary/total row -- confirm against the workbook).
data = pd.read_excel('../data/GDSC/Cell_Lines_Details.xlsx', header=0, index_col=[0]).iloc[:-1, :]
# COSMIC identifiers are used as string labels from here on.
data['COSMIC identifier'] = data['COSMIC identifier'].astype(int).astype(str)
data.index = data['COSMIC identifier']
# One feature column per distinct descriptor value found in either column.
features = list(set(data[tissue_cols[0]]) | set(data[tissue_cols[1]]))
features = pd.DataFrame(0, index=data.index, columns=features)
for cosmic in features.index:
    # Flag both descriptors of this cell line.
    features.loc[cosmic, data.loc[cosmic, tissue_cols]] = 1
features.T.to_csv('../data/GDSC/ML/cell_features/histology.csv', sep=',')

In [64]:
#progeny features for pathway activity
# Each score is the PROGENy-weighted sum of expression over the genes that
# appear in both tables; the result is written with PROGENy columns as rows.
data = pd.read_csv('../data/GDSC/norm_gex.csv', sep=',', header=0, index_col=0)
progeny = pd.read_csv('../data/Functional/PROGENy.csv', sep=',', header=0, index_col=0)
genes = list(set(data.index).intersection(progeny.index))
expression = data.T[genes]    # rows: columns of norm_gex.csv, columns: shared genes
weights = progeny.loc[genes]  # rows: shared genes, columns: PROGENy columns
progeny_scores = pd.DataFrame(
    np.dot(expression, weights),
    index=data.columns,
    columns=progeny.columns,
)
progeny_scores.T.to_csv('../data/GDSC/ML/cell_features/progeny.csv', sep=',')

In [58]:
#dorothea tf activities
# Run the external R script. A non-zero exit status is raised instead of being
# ignored, so later steps never continue after a failed R run.
status = call(['Rscript', 'GDSC_TF_activity.R'])
if status != 0:
    raise RuntimeError('Rscript GDSC_TF_activity.R failed with exit status %d' % status)

0

In [101]:
#merge cell features
# Load the three per-cell-line feature tables (features as rows, cell lines as
# columns), keep the cell lines present in all of them, tag each feature name
# with its source and write one cell-line x feature matrix.
progeny = pd.read_csv('../data/GDSC/ML/cell_features/progeny.csv', sep=',', header=0, index_col=0)
dorothea = pd.read_csv('../data/GDSC/ML/cell_features/dorothea.csv', sep=',', header=0, index_col=0)
histology = pd.read_csv('../data/GDSC/ML/cell_features/histology.csv', sep=',', header=0, index_col=0)

# Keep only the part of each dorothea row label that precedes ' - '.
dorothea.index = [label.split(' - ')[0] for label in dorothea.index]
# Retain rows whose label ends in 'A' or 'B', then strip that 2-character suffix.
fil = np.array([label[-1] in ['A', 'B'] for label in dorothea.index], dtype=bool)
dorothea = dorothea[fil]
dorothea.index = [label[:-2] for label in dorothea.index]

# Cell lines (columns) shared by all three tables.
cosmics = list(set(progeny.columns).intersection(dorothea.columns, histology.columns))

histology = histology[cosmics]
histology.index = ['HIST_' + name for name in histology.index]
progeny = progeny[cosmics]
progeny.index = ['PRO_' + name for name in progeny.index]
dorothea = dorothea[cosmics]
dorothea.index = ['TF_' + name for name in dorothea.index]

stacked = pd.concat([histology, progeny, dorothea])
features = stacked.T
features.to_csv('../data/GDSC/ML/cell_features/cell_features.csv', sep=',')

In [154]:
#drug target and pathway
# Build two binary drug-feature matrices indexed by DRUG_ID: one column per
# annotated target and one column per target pathway.
data = pd.read_excel('../data/GDSC/Screened_Compounds.xlsx')
data.index = data['DRUG_ID']
#correct some bad annotations
correct = {
    'HDAC1-10, EGFR, ERBB2': 'HDAC1, HDAC2, HDAC3, HDAC6, HDAC8, EGFR, ERBB2',
    'HDAC1 ': 'HDAC1',
    'HDAC,RAR': 'HDAC1, RAR',
    'HDAC inhibitor Class I, IIa, IIb, IV': 'HDAC1, HDAC2, HDAC3, HDAC6, HDAC8',
    'CDK1,CDK2,CDK5,CDK7,CDK9, PKC': 'CDK1, CDK2, CDK5, CDK7, CDK9, PKC',
    'Antimetabolite (DNA & RNA)': 'Antimetabolite',
    'CSF1R, KIT,  PDGFRA, PDGFRB': 'CSF1R, KIT, PDGFRA, PDGFRB',
    'VEGFR, MET, RET, KIT, FLT1, FLT3, FLT4, TIE2,AXL': 'VEGFR, MET, RET, KIT, FLT1, FLT3, FLT4, TIE2, AXL',
}
# Swap every TARGET string that matches a known bad annotation exactly.
data['TARGET'] = [correct.get(annotation, annotation) for annotation in data['TARGET']]

# Split each drug's TARGET string once; reused for the vocabulary and the fill.
drug_targets = {drug: annotation.split(', ') for drug, annotation in data['TARGET'].items()}
targets = list({name for names in drug_targets.values() for name in names})
pathways = list(set(data['TARGET_PATHWAY']))

target_feat = pd.DataFrame(0, index=data.index, columns=targets)
pathway_feat = pd.DataFrame(0, index=data.index, columns=pathways)
for drug, pathway in data['TARGET_PATHWAY'].items():
    target_feat.loc[drug, drug_targets[drug]] = 1
    pathway_feat.loc[drug, pathway] = 1

# Remove the catch-all categories from both matrices.
target_feat = target_feat.drop(['others', 'not defined'], axis=1)
pathway_feat = pathway_feat.drop(['Other'], axis=1)
target_feat.to_csv('../data/GDSC/ML/drug_features/target.csv', sep=',')
pathway_feat.to_csv('../data/GDSC/ML/drug_features/pathway.csv', sep=',')

In [156]:
#signatures
# Load the pickled GDSC <-> L1000 matching. The `with` block closes the file
# even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading; only use this
# with a trusted file. encoding='latin1' is the setting needed to read 8-bit
# strings from Python 2 pickles (presumably how this file was written).
with open('../data/GDSC/l1000_gdsc_match.pkl', 'rb') as fin:
    gdsc_l1000_matching = pickle.load(fin, encoding='latin1')

In [158]:
# Sanity check: number of entries in the loaded matching (notebook output below).
len(gdsc_l1000_matching)

150