In [1]:
from __future__ import print_function
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from scipy.stats import zscore
import seaborn as sns
import sys,os
import gzip
import ftplib
import re
import gzip
import shutil
from io import StringIO
#pd.options.mode.chained_assignment = None  # default='warn'
import warnings

# suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
# define directories

root_dir = "/Volumes/Expansion/Thesis Work/Results/"
tmp_dir = "/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/annotations/"
pdx_path = "/Volumes/Expansion/Thesis Work/Datasets/PDX/Mutation/"
tcga_path = "/Volumes/Expansion/Thesis Work/Supplementary Files/TCGA/"
gdsc_path = "/Volumes/Expansion/Thesis Work/Supplementary Files/GDSC/"

 # PDX 
RECIST Response Categories

# TCGA
RECIST Response Categories

# GDSC 
- binary 
- continious 

In [3]:
# create variables for inhibitor classes according to drug targets
# use the drugs having binary response information in GDSC   

EGFRi_drugs = ['Erlotinib',
               'Lapatinib',
               'CP724714',
               'EKB-569',
               'Gefitinib',
               'Afatinib',
               'Cetuximab',
               'HG-5-88-01']

DNA_REPi_drugs = ["Pyrimethamine",
                  "Doxorubicin",
                  "Etoposide",
                  "Gemcitabine",
                  "Mitomycin C",
                  "5-Fluorouracil",
                  "Bleomycin",
                  "Camptothecin",
                  "Cisplatin",
                  "Cytarabine",
                  "Methotrexate",
                  "Temozolomide",
                  "SN-38"]

CYTOi_drugs = ['Paclitaxel',
               'GSK269962A',
               'Vinorelbine',
               'PF-562271',
               'IPA-3',
               'Epothilone B',
               'GSK429286A',
               'Y-39983',
               'Vinblastine',
               'Docetaxel',
               'EHT 1864']

MITOSISi_drugs = ['VX-680',
                  'S-Trityl-L-cysteine',
                  'BI-2536',
                  'GW843682X',
                  'SB-715992',
                  'Genentech Cpd 10',
                  'GSK1070916',
                  'NPK76-II-72-1',
                  'MPS-1-IN-1',
                  'ZM-447439']

In [4]:
# create folders for training,testing and pre-training in the root_dir  

for folder in ["preprocessed_results2/","preprocessed_results2/annotations/"]:
    if not os.path.exists(root_dir+"/"+folder):
        os.makedirs(root_dir+"/"+folder)

# PDX
Supplementary file nm.3954-S2.xlsx from https://www.nature.com/articles/nm.3954, tab "PCT curve metrics"

- all combinational treatemnts were excluded
- records containing '-->' or '-->-->' signs in ResponseCategory were excluded; these records correspond non-stable response, e.g. PR --> PD means SD-->-->PD means
- we focus on 5 drugs: 'Cetuximab', 'Paclitaxel', 'Gemcitabine', '5-Fluorouracil', 'Erlotinib';
 'Tamoxifen' has no "S" xenografts
 

In [5]:
# read PDX annotation file 

df = pd.read_excel(pdx_path +"nm.3954-S2.xlsx", "PCT curve metrics")
print(df.shape)
df.drop_duplicates(inplace=True)
print("Combo drugs responses dropped:",df.loc[df["Treatment type"]=="combo",:].shape[0])
df = df.loc[df["Treatment type"]=="single",:]
print(df.shape)
df.head()

(4758, 11)
Combo drugs responses dropped: 1279
(3479, 11)


Unnamed: 0,Model,Treatment,Treatment target,Treatment type,BestResponse,Day_BestResponse,BestAvgResponse,Day_BestAvgResponse,TimeToDouble,Day_Last,ResponseCategory
0,X-007,BGJ398,FGFR,single,396.5,11,220.475,11,4.0,11,PD
1,X-007,BKM120,"PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K",single,189.1,14,77.05,11,6.207547,14,PD
2,X-007,BYL719,PIK3CA,single,303.7,11,196.175,11,4.0,11,PD
5,X-007,CLR457,"PIK3CA,PIK3CB,PIK3CG,PIK3CD,panPI3K",single,25.0,16,26.533333,16,36.835,37,SD
6,X-007,HDM201,MDM2,single,330.8,11,182.75,11,4.0,11,PD


In [6]:
# count of treatment size

df[["Model","Treatment"]].groupby("Treatment").size().head(5)

Treatment
5FU        43
BGJ398    112
BKM120    224
BYL719    212
CGM097    140
dtype: int64

In [7]:
# show response categories

df[["Model","ResponseCategory"]].groupby("ResponseCategory").size()

ResponseCategory
CR              60
CR-->-->PD      10
CR-->PD          4
PD            2250
PR              77
PR-->-->PD      27
PR-->PD         63
SD             230
SD-->-->PD      49
SD-->PD        709
dtype: int64

In [8]:
# create a dictionary for PDX drugs

drug_dict = {"5FU":"5-Fluorouracil",
             "erlotinib":"Erlotinib",
             "cetuximab":"Cetuximab",
             "gemcitabine-50mpk":"Gemcitabine",
             "paclitaxel":"Paclitaxel"}

response_dict = {"CR":"S","PR":"S","SD":"R","PD":"R"}

In [9]:
# records for PDX drugs

df = df.loc[df["Treatment"].isin(drug_dict.keys()),:]
print("Records for drugs",drug_dict.values(),df.shape[0])

Records for drugs dict_values(['5-Fluorouracil', 'Erlotinib', 'Cetuximab', 'Gemcitabine', 'Paclitaxel']) 249


In [10]:
# treatment size

df[["Model","Treatment"]].groupby("Treatment").size()

Treatment
5FU                  43
cetuximab            72
erlotinib            29
gemcitabine-50mpk    38
paclitaxel           67
dtype: int64

In [11]:
# show response categories

df = df.loc[~df["ResponseCategory"].str.contains("-->"),:]
df.groupby("ResponseCategory").size()

ResponseCategory
CR     11
PD    140
PR     13
SD     20
dtype: int64

In [12]:
# count of sensitive and resistant samples

df.loc[:,"drug"] = df["Treatment"].apply(lambda x : drug_dict[x])
df.loc[:,"response"] = df["ResponseCategory"].apply(lambda x : response_dict[x])
df = df[["Model","drug","response","ResponseCategory","Treatment","Treatment target",
   "Treatment type","BestResponse","Day_BestResponse","BestAvgResponse","Day_BestAvgResponse","TimeToDouble","Day_Last"]]

for drug in drug_dict.values():
    d = df.loc[df["drug"]==drug,:]
    d.set_index("Model",inplace = True,drop=True)
    d.index.name = "sample_name"
    d.sort_values(by="sample_name",inplace=True)
    d.to_csv(root_dir+"/preprocessed_results2/annotations/"+"PDX_response."+drug+".tsv",sep = "\t")
    print(drug,"R:",d[d["response"]=="R"].shape[0],
      "S:",d[d["response"]=="S"].shape[0])
d.head(3)

5-Fluorouracil R: 24 S: 1
Erlotinib R: 19 S: 4
Cetuximab R: 58 S: 6
Gemcitabine R: 19 S: 7
Paclitaxel R: 40 S: 6


Unnamed: 0_level_0,drug,response,ResponseCategory,Treatment,Treatment target,Treatment type,BestResponse,Day_BestResponse,BestAvgResponse,Day_BestAvgResponse,TimeToDouble,Day_Last
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
X-1008,Paclitaxel,S,PR,paclitaxel,Tubulin,single,-77.6,90,-44.7625,90,90.0,90
X-1156,Paclitaxel,R,PD,paclitaxel,Tubulin,single,395.4,21,83.9,14,7.573427,31
X-1172,Paclitaxel,R,PD,paclitaxel,Tubulin,single,321.8,20,-14.525,11,14.845745,20


# TCGA 

Ding et al. 2016, Supplementary tables , tab "Table S2"

"bcr_patient_barcode" matches with first 12 symbols in sample barcore. One patient in TCGA may have more than one tumor sample and even one or several normal samples. 


In [13]:
# create a dictionary for response categories

response_dict = {'Clinical Progressive Disease':"R",'Complete Response':"S",
                 'Partial Response':"S",'Stable Disease':"R"}

In [14]:
# read annotation file 

df = pd.read_excel(tcga_path + "/bioinfo16_supplementary_tables.xlsx",
                   "Table S2")
df = df.drop_duplicates()
df.drop([0,2],inplace=True)
cols = df.loc[1,:]
df = df.drop(1)
df.columns = cols
df.loc[:,"cohort"] = df["Cancer"].apply(lambda x: re.search(r'\((.*?)\)',x).group(1))
df.loc[:,"response"] =  df["measure_of_response"].apply(lambda x: response_dict[x])
print(df.shape)
dup_indices = df.loc[df[["bcr_patient_barcode",
           "days_to_drug_therapy_start","days_to_drug_therapy_end"]].duplicated(keep=False),:].index.values
df = df[["bcr_patient_barcode","cohort","drug_name","response","measure_of_response",
         "days_to_drug_therapy_start","days_to_drug_therapy_end","DrugBank ID",
         "days_to_initial_pathologic_diagnosis","method_of_sample_procurement",
         "days_to_sample_procurement","days_to_new_tumor_event_after_initial_treatment",
         "additional_pharmaceutical_therapy","new_tumor_event_additional_surgery_procedure",
         "history_of_neoadjuvant_treatment"]]
df.rename({"drug_name":"drug"},axis="columns",inplace=True)
df.head()

(2569, 16)


1,bcr_patient_barcode,cohort,drug,response,measure_of_response,days_to_drug_therapy_start,days_to_drug_therapy_end,DrugBank ID,days_to_initial_pathologic_diagnosis,method_of_sample_procurement,days_to_sample_procurement,days_to_new_tumor_event_after_initial_treatment,additional_pharmaceutical_therapy,new_tumor_event_additional_surgery_procedure,history_of_neoadjuvant_treatment
3,TCGA-OR-A5JM,ACC,Sunitinib,R,Clinical Progressive Disease,378,439,DB01268,0,Surgical Resection,1,72,YES,NO,Yes
4,TCGA-OR-A5JM,ACC,Ketoconazole,R,Clinical Progressive Disease,378,439,DB01026,0,Surgical Resection,1,72,YES,NO,Yes
5,TCGA-OU-A5PI,ACC,Etoposide,R,Stable Disease,69,239,DB00773,0,Surgical Resection,0,351,YES,YES,No
6,TCGA-OU-A5PI,ACC,Doxorubicin,R,Stable Disease,69,239,DB00997,0,Surgical Resection,0,351,YES,YES,No
7,TCGA-OU-A5PI,ACC,Cisplatin,R,Stable Disease,55,239,DB00515,0,Surgical Resection,0,351,YES,YES,No


In [15]:
# define functions for barcodes

for group in df.iloc[2:10,0:7].groupby("bcr_patient_barcode"):
    pass

def exclude_combos(df_group):
    if df_group.shape[0] == 1:
        return df_group
    d = df_group.T.to_dict()
    keys_to_remove = set()
    for key in d.keys():
        start = d[key]["days_to_drug_therapy_start"]
        end = d[key]["days_to_drug_therapy_end"]
        
        if end == "[Not Available]":
            end = 0
                
        if start == "[Not Available]":
            start = 0
        
        #print(key,start,end)
        for key2 in d.keys():
            if key2 != key:
                start2 = d[key2]["days_to_drug_therapy_start"]
                end2 = d[key2]["days_to_drug_therapy_end"]
                
                if end2 == "[Not Available]":
                    end2 = 0
                
                if start2 == "[Not Available]":
                    start2 = 0
                    
                if not (end < start2) and not (end2 < start):
                    # if not non-overlapping time intervals
                    keys_to_remove.add(key)
                    keys_to_remove.add(key2)
                    
    #print(list(keys_to_remove))
    return df_group.loc[~df_group.index.isin(keys_to_remove),:]
exclude_combos(group[1])

1,bcr_patient_barcode,cohort,drug,response,measure_of_response,days_to_drug_therapy_start,days_to_drug_therapy_end
12,TCGA-OU-A5PI,ACC,Carboplatin,R,Stable Disease,725,817


In [16]:
# exclude drug combinations

df_single = []
for group in df.groupby("bcr_patient_barcode"):
    df_single.append(exclude_combos(group[1]))
df_single = pd.concat(df_single)
print(df_single.shape)
print("Records with combo drugs excluded:",df.shape[0] - df_single.shape[0])
df_single.head()

(812, 15)
Records with combo drugs excluded: 1757


1,bcr_patient_barcode,cohort,drug,response,measure_of_response,days_to_drug_therapy_start,days_to_drug_therapy_end,DrugBank ID,days_to_initial_pathologic_diagnosis,method_of_sample_procurement,days_to_sample_procurement,days_to_new_tumor_event_after_initial_treatment,additional_pharmaceutical_therapy,new_tumor_event_additional_surgery_procedure,history_of_neoadjuvant_treatment
1355,TCGA-05-4402,LUAD,Erlotinib,S,Complete Response,122,122,DB00530,0,Other Method (please specify),0,,,,No
1360,TCGA-05-5425,LUAD,Gefitinib,R,Clinical Progressive Disease,608,669,DB00317,0,Other Method (please specify),31,,,,No
883,TCGA-06-1806,GBM,veliparib,R,Clinical Progressive Disease,81,256,,0,Subtotal Resection,0,256.0,YES,[Not Available],No
884,TCGA-06-1806,GBM,Cabozantinib,R,Clinical Progressive Disease,293,455,DB08875,0,Subtotal Resection,0,256.0,YES,[Not Available],No
885,TCGA-06-A5U0,GBM,Temozolomide,R,Clinical Progressive Disease,31,74,DB00853,0,Subtotal Resection,0,100.0,YES,[Not Available],No


In [17]:
# count of sensitive and resistant patient samples

drugs = list(set(['Docetaxel', 
                  'Cisplatin', 
                  'Gemcitabine', 
                  'Temozolomide',
                  'Fluorouracil',
                  'Cetuximab', 
                  'Paclitaxel', 
                  'Erlotinib',
                  'Docetaxel']))

for drug in drugs:
    d = df_single[df_single["drug"] == drug ]
    print(drug, d.shape[0],"R:",d[d["response"] =="R"].shape[0],"S:",d[d["response"] =="S"].shape[0] )
    if d.shape[0] > 0 :
        d.set_index("bcr_patient_barcode",drop=True,inplace=True)
        d.to_csv(root_dir+"/preprocessed_results2/annotations/"+"TCGA_response."+drug+".tsv",sep = "\t")

Fluorouracil 55 R: 19 S: 36
Cetuximab 9 R: 3 S: 6
Gemcitabine 71 R: 46 S: 25
Paclitaxel 52 R: 16 S: 36
Erlotinib 6 R: 4 S: 2
Temozolomide 109 R: 97 S: 12
Cisplatin 111 R: 20 S: 91
Docetaxel 21 R: 12 S: 9


In [18]:
# show TCGA annotation file

d = df_single[df_single["drug"] == drugs[1] ]
d.set_index("bcr_patient_barcode",drop=True,inplace=True)

d.head(3)

1,cohort,drug,response,measure_of_response,days_to_drug_therapy_start,days_to_drug_therapy_end,DrugBank ID,days_to_initial_pathologic_diagnosis,method_of_sample_procurement,days_to_sample_procurement,days_to_new_tumor_event_after_initial_treatment,additional_pharmaceutical_therapy,new_tumor_event_additional_surgery_procedure,history_of_neoadjuvant_treatment
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
TCGA-BA-A4IG,HNSC,Cetuximab,S,Complete Response,79,140,DB00002,0,[Not Available],41,,,,No
TCGA-CN-4731,HNSC,Cetuximab,R,Clinical Progressive Disease,827,869,DB00002,0,[Not Available],22,,,,No
TCGA-CN-A49C,HNSC,Cetuximab,S,Complete Response,26,78,DB00002,0,[Not Available],0,,,,No


# GDSC

###  Continuous response - log(IC50) values 

* Supplementary files from  "A landscape of pharmacogenomic interactions in cancer" by Iorio F et al. Cell. 2016:
TableS4A.xlsx from https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources//Data/suppData/TableS4A.xlsx , tab 'TableS4A-IC50s'

* Also, log(IC50) for are available here ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/current_release/v17.3_fitted_dose_response.xlsx
(ln(IC50), these values seem to be just slightly different)

###  Binary response 

*  Supplementary files from  Iorio F et al. 2016
https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources///Data/suppData/TableS5C.xlsx

Cell line names were replaced with corresponding COSMIC ids from 
https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources//Data/suppData/TableS1E.xlsx

### GDSC - binarized response 

In [19]:
# match GDSC cell line samples to COSMIC Ids

COSMIC_ids = pd.read_excel(gdsc_path + "GDSC_TableS1E.xlsx")
COSMIC_ids = COSMIC_ids.iloc[2:,[1,2]]
COSMIC_ids = COSMIC_ids.iloc[:-1,]
COSMIC_ids.columns = ["name",'COSMIC']
# 1002 pair, all IDs are unique
#print(COSMIC_ids.shape[0],len(set(COSMIC_ids["name"])),len(set(COSMIC_ids["COSMIC"])))
COSMIC_ids.set_index("name",inplace=True,drop=True)
names2COSMIC = dict(COSMIC_ids["COSMIC"])

In [20]:
# show sensitive and resistant cell lines

df = pd.read_excel(gdsc_path + "GDSC_Drug Resistance_TableS5C.xlsx")
df.set_index("Screened Compounds:",inplace=True,drop=True)
df.index.name = "cell_line"

IC50_thr = df.iloc[0,:]
IC50_thr.name = "logIC50_threshold"
df =  df.iloc[1:,:]

df.rename(names2COSMIC,axis="index",inplace=True)
drugs = list(set(EGFRi_drugs + MITOSISi_drugs + CYTOi_drugs + DNA_REPi_drugs).intersection(set(df.columns.values)))
df = df.loc[:,drugs]
df.sort_values(by="cell_line",inplace=True)
df.to_csv(root_dir+"/preprocessed_results2/annotations/"+"GDSC_response."+"all_drugs"+".tsv",sep = "\t")

df.head()

Unnamed: 0_level_0,Afatinib,Cytarabine,Epothilone B,HG-5-88-01,NPK76-II-72-1,Genentech Cpd 10,Temozolomide,5-Fluorouracil,Cisplatin,Mitomycin C,...,SB-715992,GSK429286A,Gefitinib,Camptothecin,Gemcitabine,Lapatinib,Erlotinib,CP724714,GW843682X,Y-39983
cell_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683665,R,S,R,R,S,S,R,S,R,R,...,R,R,R,R,R,R,R,R,R,S
683667,R,R,R,S,R,R,,R,R,R,...,R,R,R,R,R,,,R,,S
684052,,,R,R,R,R,R,R,,R,...,R,R,,,R,,,R,,R
684055,,,R,S,R,S,R,R,,S,...,R,R,,,R,R,R,S,R,S
684057,R,R,R,S,S,S,R,R,S,R,...,R,R,R,S,R,R,R,R,R,S


In [21]:
# show IC50 values of drugs screened against cell lines

df_ic50 = pd.read_excel(gdsc_path + "GDSC_logIC50_Values_TableS4A.xlsx",'TableS4A-IC50s')
df_ic50 = df_ic50.iloc[4:,:]
df_ic50.drop(['Unnamed: 1'],axis=1,inplace=True)
df_ic50.columns = df_ic50.iloc[0,:].values
df_ic50 = df_ic50.iloc[1:,:]
df_ic50.index = df_ic50.iloc[:,0].values
df_ic50.index.name = "cell_line"
df_ic50 = df_ic50.iloc[:,1:]
df_ic50.sort_values(by="cell_line",inplace=True)
df_ic50.to_csv(root_dir+"/preprocessed_results2/annotations/"+"GDSC_response."+"logIC50.all_drugs"+".tsv",sep = "\t")

df_ic50[list(drugs)].head()

Unnamed: 0_level_0,Afatinib,Cytarabine,Epothilone B,HG-5-88-01,NPK76-II-72-1,Genentech Cpd 10,Temozolomide,5-Fluorouracil,Cisplatin,Mitomycin C,...,SB-715992,GSK429286A,Gefitinib,Camptothecin,Gemcitabine,Lapatinib,Erlotinib,CP724714,GW843682X,Y-39983
cell_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
683665,1.490015,-2.954429,-4.92425,3.394775,-0.473633,0.270443,4.85884,0.145949,2.807269,-1.597524,...,-3.716158,4.066824,1.464855,-4.527144,-4.408972,2.684181,2.436586,4.291379,-3.038878,2.536259
683667,1.868376,0.201181,-4.246559,0.949009,2.341061,2.650363,,3.772202,1.757559,1.56661,...,-3.295407,5.205801,1.174825,-4.461084,-0.399711,,,4.665435,,3.300246
684052,,,-5.751133,2.041549,2.551975,2.133583,5.604447,4.707996,,-0.262992,...,-4.190831,4.640195,,,-3.707244,,,4.173326,,5.021895
684055,,,-4.616658,1.259116,1.202188,0.00018,5.595628,3.740453,,-3.491516,...,-4.125172,4.109771,,,-2.99645,3.226492,3.342826,2.682825,0.225636,3.5846
684057,0.463011,-1.562465,-3.475365,0.651254,-0.675081,-0.783992,5.904012,1.93716,1.131967,0.30313,...,-4.086624,3.937856,2.152026,-6.615419,-2.410022,3.571787,3.571787,4.426893,0.357716,3.561218


In [22]:
# IC50 thresholds

IC50_thr = IC50_thr[list(drugs)].to_dict()
IC50_thr

{'Afatinib': -0.22156,
 'Cytarabine': -1.9516,
 'Epothilone B': -7.4389,
 'HG-5-88-01': 1.8792,
 'NPK76-II-72-1': 0.47397,
 'Genentech Cpd 10': 0.33153,
 'Temozolomide': 4.6302,
 '5-Fluorouracil': 1.1236,
 'Cisplatin': 1.3801,
 'Mitomycin C': -2.9647,
 'Bleomycin': -1.4805,
 'EHT 1864': 2.4876,
 'IPA-3': 1.704,
 'BI-2536': -4.0077,
 'Vinorelbine': -5.9536,
 'Cetuximab': 5.144,
 'Vinblastine': -5.9201,
 'Methotrexate': -2.4743,
 'GSK1070916': 0.43383,
 'Etoposide': -1.2198,
 'EKB-569': -0.47851,
 'MPS-1-IN-1': 1.8522,
 'GSK269962A': 0.35074,
 'Doxorubicin': -3.9565,
 'S-Trityl-L-cysteine': -0.71324,
 'ZM-447439': 0.56209,
 'VX-680': -0.59242,
 'PF-562271': 0.49307,
 'Paclitaxel': -5.6772,
 'SN-38': -6.559,
 'Docetaxel': -6.897,
 'Pyrimethamine': 1.5774,
 'SB-715992': -4.6156,
 'GSK429286A': 3.4721,
 'Gefitinib': -0.05346,
 'Camptothecin': -6.584,
 'Gemcitabine': -5.9903,
 'Lapatinib': 1.6257,
 'Erlotinib': 1.5671,
 'CP724714': 3.257,
 'GW843682X': -4.9006,
 'Y-39983': 4.0613}

In [23]:
# count of sensitive and resistant samples

df_long = []
for drug in drugs:
    d1 = df_ic50.loc[:,[drug]]
    d1 = pd.DataFrame(d1.iloc[:,0])
    d1.columns = ["logIC50"]
    d2 = df.loc[:,[drug]]
    d2.columns = ["response"]
    d1.dropna(inplace=True)
    d2.dropna(inplace=True)
    d = pd.concat([d2,d1],axis=1)
    d.loc[:,"drug"] = drug
    d.index.name = "sample_name"
    df_long.append(d)
    if d.shape[0] >0 :
        d.to_csv(root_dir+"/preprocessed_results2/annotations/"+"GDSC_response."+drug+".tsv",sep = "\t")
    print(drug,"total:",d.shape[0],"R:",d.loc[d["logIC50"]>IC50_thr[drug],:].shape[0],
          "S:",d.loc[d["logIC50"]<=IC50_thr[drug],:].shape[0])

Afatinib total: 849 R: 696 S: 153
Cytarabine total: 846 R: 761 S: 85
Epothilone B total: 875 R: 816 S: 59
HG-5-88-01 total: 496 R: 437 S: 59
NPK76-II-72-1 total: 922 R: 829 S: 93
Genentech Cpd 10 total: 926 R: 823 S: 103
Temozolomide total: 910 R: 807 S: 103
5-Fluorouracil total: 916 R: 822 S: 94
Cisplatin total: 850 R: 771 S: 79
Mitomycin C total: 876 R: 805 S: 71
Bleomycin total: 867 R: 793 S: 74
EHT 1864 total: 924 R: 839 S: 85
IPA-3 total: 874 R: 824 S: 50
BI-2536 total: 400 R: 369 S: 31
Vinorelbine total: 885 R: 833 S: 52
Cetuximab total: 873 R: 749 S: 124
Vinblastine total: 850 R: 786 S: 64
Methotrexate total: 849 R: 717 S: 132
GSK1070916 total: 897 R: 737 S: 160
Etoposide total: 885 R: 830 S: 55
EKB-569 total: 923 R: 806 S: 117
MPS-1-IN-1 total: 920 R: 862 S: 58
GSK269962A total: 963 R: 831 S: 97
Doxorubicin total: 875 R: 812 S: 63
S-Trityl-L-cysteine total: 400 R: 383 S: 17
ZM-447439 total: 799 R: 720 S: 79
VX-680 total: 396 R: 313 S: 83
PF-562271 total: 866 R: 795 S: 71
Paclit

In [24]:
# show drugs with their responses

df_long = pd.concat(df_long)
df_long.loc[:,"logIC50"] = pd.DataFrame(df_long["logIC50"]).astype(np.float64)

df_long.head()

Unnamed: 0_level_0,response,logIC50,drug
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
683665,R,1.490015,Afatinib
683667,R,1.868376,Afatinib
684057,R,0.463011,Afatinib
684059,R,1.019383,Afatinib
684062,R,1.23602,Afatinib
