In [6]:
# import libraries

from __future__ import print_function
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from scipy.stats import zscore
import seaborn as sns
import sys,os
%matplotlib  inline
import warnings

# suppress all warnings
warnings.filterwarnings("ignore")

In [11]:
# define directories

gdsc_dir = "/Volumes/Expansion/Thesis Work/Supplementary Files/GDSC/"
gdsc_expr_dir = "/Volumes/Expansion/Thesis Work/Datasets/GDSC/Expression/"
root_dir = "/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/" 
preprocessed_data = root_dir 

### GDSC 

~20 sample names do not match between expression and response files - ignored

### TCGA 
 * expression profiles of normal samples with barcodes 10* and 11* were excluded
 - some cohorts seem to be duplicates, e.g. GBMLGG = GBM + LGG - excluded
 - try with the latest dataset in FireBrowse (2016, current is 2015) or take directly from GDC
 
### PDX 
 - some gene names failed to map to Entrez because of outdated version of HGNC. Try remapping with myGene.  

## Training dataset: 
#### Pre-training: 
- TCGA samples without drug response
    - expression, CNA, SNV
    - XXXX samples in total from XX cohorts
    
#### Training: 
- GDSC
    - expression, CNA, SNV
    - known responses for 9 drugs: 

## Testing dataset: 
#### Cell line validation: 
 - CCLE cell lines
 - expression, CNA, SNV
 - known responses for 9 drugs
  
#### Clinical trials (expression only)
 - GSE18864
 - GSE9782
 - GSE25065
 - GSE
#### TCGA samples with response to single drugs:




In [3]:
# create data folders

for directory in [root_dir]:
    for subdir in ["exprs","CNA","mutations"]:
        if not os.path.exists(directory+"/"+subdir):
            os.makedirs(directory+"/"+subdir)
            
response_dir = root_dir +"response/"
if not os.path.exists(response_dir):
    os.makedirs(response_dir)

In [4]:
# create lists for inhibitor classes

EGFRi_drugs = ['Erlotinib',
               'Lapatinib',
               'CP724714',
               'EKB-569',
               'Gefitinib',
               'Afatinib',
               'Cetuximab',
               'HG-5-88-01']

DNA_REPi_drugs = ["Pyrimethamine",
                  "Doxorubicin",
                  "Etoposide",
                  "Gemcitabine",
                  "Mitomycin C",
                  "5-Fluorouracil",
                  "Bleomycin",
                  "Camptothecin",
                  "Cisplatin",
                  "Cytarabine",
                  "Methotrexate",
                  "Temozolomide",
                  "SN-38"]

CYTOi_drugs = ['Paclitaxel',
               'GSK269962A',
               'Vinorelbine',
               'PF-562271',
               'IPA-3',
               'Epothilone B',
               'GSK429286A',
               'Y-39983',
               'Vinblastine',
               'Docetaxel',
               'EHT 1864']

MITOSISi_drugs = ['VX-680',
                  'S-Trityl-L-cysteine',
                  'BI-2536',
                  'GW843682X',
                  'SB-715992',
                  'Genentech Cpd 10',
                  'GSK1070916',
                  'NPK76-II-72-1',
                  'MPS-1-IN-1',
                  'ZM-447439']

# define TCGA cohorts used
tcga_cohorts = ["ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", 
             "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG", "LIHC", 
             "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",  
             "STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"]
gene_id = "ENTREZID"

# Trainig Dataset  - GDSC

##### Responses:
* binary and log(IC50) from Iorio et al. 2016
* COSMIC cell line IDs from Iorio et al. 2016

##### Expressions:
 * from E-MTAB-3610, preprocessed
 * Some cell lines not matched between ArrayExpress and drug response file were excluded.
 * 4 replicated cell lines profiles were averaged; Correlations of expression profiles of replicated samples was 0.96-0.99

In [9]:
# GDSC cell line details

exprs_names = pd.read_csv(gdsc_dir + "GDSC_E-MTAB-3610.sdrf.txt",sep = "\t")
exprs_names = exprs_names[["Assay Name","Source Name"]].copy()
exprs_names["COSMIC_ID"] = exprs_names["Source Name"].apply(lambda x: int(x.split("_")[-1]))
exprs_names["cell_line_name"] = exprs_names["Source Name"].apply(lambda x: x.split("_")[2])
dups = exprs_names.loc[exprs_names.duplicated(["COSMIC_ID"],keep=False),:]
exprs_names.set_index("Assay Name",inplace=True,drop=True)
exprs_names.sort_values(by="Assay Name",inplace=True)
exprs_names.to_csv(preprocessed_data+"annotations/"+"GDSC.Assay2COSMICID.tsv",sep = "\t")
Assay2COSMICID = exprs_names[["COSMIC_ID"]].to_dict()['COSMIC_ID']
dups = dups.sort_values(by=["COSMIC_ID"])
dups

Unnamed: 0,Assay Name,Source Name,COSMIC_ID,cell_line_name
870,5500994158987071513207_H10,J132_EPH10P5_SK-MEL-28_Skin_905954,905954,SK-MEL-28
993,5500994175999120813240_H11,J154_EPH04P12_SK-MEL-28_skin_905954,905954,SK-MEL-28
42,5500994172383112813928_A05,J132_EPA05P6_KM-H2_Blood_909976,909976,KM-H2
979,5500994175999120813240_G11,J154_EPG04P12_KM-H2_haematopoietic_and_lymphoi...,909976,KM-H2
316,5500994172383112813930_C10,J132_EPC10P8_OCI-AML5_Blood_1330983,1330983,OCI-AML5
838,5500994173212120213068_H07,J132_EPH07P10_OCI-AML5_Blood_1330983,1330983,OCI-AML5
66,5500994173212120213068_A08,J132_EPA08P10_OACp4C_Esophagus_1503362,1503362,OACp4C
429,5500994172383112813930_D10,J132_EPD10P8_OACp4C_Esophagus_1503362,1503362,OACp4C


In [12]:
# read GDSC expression dataset

gdsc_exprs = pd.read_csv(gdsc_expr_dir + "GDSC_micro.BrainArray.RMAlog2Average."+gene_id+".Expr.tsv",
                    sep = "\t",index_col=0)
gdsc_exprs.rename(Assay2COSMICID,axis = "columns",inplace=True)
gdsc_exprs.sort_index(axis=1,inplace=True)
gdsc_exprs.sort_index(axis=0,inplace=True)
gdsc_exprs.index.name = gene_id
gdsc_exprs.rename(int,axis="columns",inplace=True)
gdsc_exprs.head()

Unnamed: 0_level_0,683665,683667,684052,684055,684057,684059,684062,684072,684681,687448,...,1659823,1659928,1659929,1660034,1660035,1660036,1674021,1723793,1723794,11223344
ENTREZID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.805253,5.140331,4.252704,5.594708,4.903946,4.708577,4.357586,5.467886,3.793932,4.8177,...,3.126983,3.214622,3.567029,3.273336,2.976021,3.173461,2.974951,5.26616,5.150991,2.973192
2,3.655408,3.612306,3.337473,3.503549,6.705561,5.02577,3.414269,3.454856,6.388228,6.885098,...,3.759219,3.66071,3.648689,3.690009,3.365638,3.394408,3.142336,5.697181,5.81331,3.498103
9,7.64388,4.377345,5.563119,6.489859,5.722173,5.888262,6.422792,5.006021,4.596526,5.718388,...,6.758054,6.804065,8.043999,7.799323,6.168235,8.412595,5.594489,6.634082,7.144707,6.561471
10,3.039498,2.958211,3.157324,3.114843,3.202469,3.104771,3.274578,3.174249,3.208779,3.000329,...,7.130509,3.19818,5.202321,3.429123,3.766297,3.752278,3.267572,3.20852,3.162356,3.384278
12,3.119811,3.642967,2.969843,2.93349,3.153036,3.091319,2.88278,3.293808,3.873429,9.838993,...,3.012664,3.142688,3.1023,3.192675,3.37041,3.177591,5.349932,7.952317,9.556548,3.421077


In [13]:
# find duplicated samples

dups = gdsc_exprs.columns[gdsc_exprs.columns.duplicated()]
gdsc_exprs.loc[:,dups].head(3)

Unnamed: 0_level_0,905954,905954,909976,909976,1330983,1330983,1503362,1503362
ENTREZID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5.434676,5.339831,6.311768,5.563456,3.338442,3.560485,3.29099,2.990678
2,10.260771,8.860099,3.379947,3.817879,3.595809,3.432538,3.410747,3.545168
9,6.413309,6.176417,6.506337,7.735421,8.310339,8.399793,6.335858,6.260497


In [14]:
# take average of duplicated columns

#corrs = exprs[dups].corr().rename(Assay2COSMICID).rename(Assay2COSMICID,axis=1)
#corrs.sort_index().sort_index(axis=1)
for id in dups:
    averaged_profile  = gdsc_exprs[id].apply(sum,axis=1)/(gdsc_exprs[id].shape[1])
    gdsc_exprs.drop(columns=[id],inplace=True)
    gdsc_exprs.loc[:,id] = averaged_profile 
gdsc_exprs.loc[:,dups].head(3)

Unnamed: 0_level_0,905954,909976,1330983,1503362
ENTREZID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.387253,5.937612,3.449463,3.140834
2,9.560435,3.598913,3.514174,3.477958
9,6.294863,7.120879,8.355066,6.298177


In [17]:
# count of sensitive and resistant samples

gdsc_exprs.to_csv(gdsc_expr_dir + "GDSC_micro.BrainArray.RMAlog2Average."+gene_id+".Expr_renamed.tsv",
                    sep = "\t")
print("GDSC samples with responses")
gdsc_drugs = []
for drug in list(set(EGFRi_drugs + MITOSISi_drugs + CYTOi_drugs + DNA_REPi_drugs)):
    try:
        r = pd.read_csv(preprocessed_data+"/annotations/"+"GDSC_response."+drug+".tsv",
                               sep = "\t",index_col=0)
        R, S = r.loc[r["response"]=="R",:].shape[0], r.loc[r["response"]=="S",:].shape[0]
        print(drug, r.shape[0],"R:",R,"S:",S)
        gdsc_drugs.append(drug)
    except: 
        print("No response for drug",drug,"in GDSC", file = sys.stderr)


GDSC samples with responses
Paclitaxel 402 R: 357 S: 26
NPK76-II-72-1 922 R: 712 S: 83
Gemcitabine 870 R: 735 S: 51
S-Trityl-L-cysteine 400 R: 367 S: 15
HG-5-88-01 496 R: 377 S: 56
Genentech Cpd 10 926 R: 708 S: 90
Vinorelbine 885 R: 751 S: 49
Epothilone B 875 R: 738 S: 53
Pyrimethamine 402 R: 347 S: 36
Afatinib 849 R: 626 S: 133
VX-680 396 R: 298 S: 80
Gefitinib 846 R: 645 S: 110
Cisplatin 850 R: 690 S: 69
EKB-569 923 R: 698 S: 98
GW843682X 402 R: 369 S: 14
5-Fluorouracil 916 R: 706 S: 84
Temozolomide 910 R: 680 S: 91
CP724714 924 R: 703 S: 94
Methotrexate 849 R: 640 S: 119
Etoposide 885 R: 755 S: 45
SN-38 930 R: 731 S: 58
Docetaxel 850 R: 699 S: 60
IPA-3 874 R: 742 S: 48
ZM-447439 799 R: 643 S: 67
Y-39983 922 R: 477 S: 318
BI-2536 400 R: 352 S: 29
GSK269962A 963 R: 334 S: 50
Cytarabine 846 R: 681 S: 75
MPS-1-IN-1 920 R: 743 S: 50
Erlotinib 372 R: 308 S: 64
Bleomycin 867 R: 719 S: 65
Camptothecin 849 R: 706 S: 52
EHT 1864 924 R: 707 S: 74
Lapatinib 398 R: 322 S: 60
GSK1070916 897 R: 6

In [18]:
# standardize GDSC expression dataset

gdsc_exprs_z = (gdsc_exprs.T - gdsc_exprs.T.mean())/gdsc_exprs.T.std()
gdsc_exprs_z = gdsc_exprs_z.T

In [19]:
# store samples with multi-omics profiles an responses available

gdsc_responses_multiomics = {}
print("Expression profiles with response in GDSC:")
for drug in gdsc_drugs:

    r = pd.read_csv(preprocessed_data+"/annotations/"+"GDSC_response."+drug+".tsv",
                           sep = "\t",index_col=0)

    shared_samples = sorted(list(set(r.index.values).intersection(set(gdsc_exprs.columns.values))))
    exprs = gdsc_exprs.loc[:,shared_samples]
    gdsc_responses_exprs = r.loc[shared_samples,:]
    # save samples with multi-omics profiles ands responses available
    gdsc_responses_multiomics[drug] = {"exprs":gdsc_responses_exprs}
    gdsc_exprs.to_csv(root_dir+"/exprs/" +"GDSC_exprs."+drug+".tsv", sep = "\t")
    exprs_z =  gdsc_exprs_z.loc[:,shared_samples]
    gdsc_responses_exprs.to_csv(root_dir+"/response/"+"GDSC_response."+drug+".tsv", sep = "\t")
    gdsc_exprs_z.to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+drug+".tsv", sep = "\t")

    R = gdsc_responses_exprs.loc[gdsc_responses_exprs["response"]=="R",:].shape[0]
    S = gdsc_responses_exprs.loc[gdsc_responses_exprs["response"]=="S",:].shape[0]
    print(drug, gdsc_responses_exprs.shape[0],"R:",R,"S:",S)
    if not exprs.shape[1]==gdsc_responses_exprs.shape[0] == len(shared_samples):
        print(exprs.shape[1],gdsc_responses_exprs.shape[0], len(shared_samples),file=sys.stderr)


Expression profiles with response in GDSC:
Paclitaxel 389 R: 344 S: 26
NPK76-II-72-1 900 R: 693 S: 80
Gemcitabine 849 R: 714 S: 51
S-Trityl-L-cysteine 387 R: 354 S: 15
HG-5-88-01 485 R: 368 S: 54
Genentech Cpd 10 903 R: 689 S: 86
Vinorelbine 864 R: 731 S: 48
Epothilone B 854 R: 718 S: 52
Pyrimethamine 391 R: 336 S: 36
Afatinib 832 R: 612 S: 130
VX-680 383 R: 286 S: 79
Gefitinib 829 R: 632 S: 106
Cisplatin 833 R: 675 S: 67
EKB-569 901 R: 677 S: 97
GW843682X 389 R: 357 S: 13
5-Fluorouracil 894 R: 686 S: 82
Temozolomide 895 R: 667 S: 89
CP724714 902 R: 683 S: 92
Methotrexate 832 R: 628 S: 114
Etoposide 864 R: 736 S: 43
SN-38 916 R: 719 S: 56
Docetaxel 833 R: 683 S: 59
IPA-3 853 R: 722 S: 47
ZM-447439 787 R: 632 S: 66
Y-39983 900 R: 463 S: 310
BI-2536 387 R: 340 S: 28
GSK269962A 940 R: 325 S: 48
Cytarabine 829 R: 666 S: 73
MPS-1-IN-1 898 R: 723 S: 48
Erlotinib 362 R: 298 S: 64
Bleomycin 846 R: 698 S: 65
Camptothecin 832 R: 691 S: 50
EHT 1864 908 R: 693 S: 72
Lapatinib 387 R: 311 S: 60
GSK1

In [20]:
# plot histograms of some genes

gene =  7157    #5701, 2625, 3169 
plt.figure(figsize=(17,5))
plt.subplot(1,2,1)
tmp = plt.hist(gdsc_exprs.loc[gene,:], bins =50)
tmp= plt.xlabel(str(gene)+" log2(exprs)")
plt.subplot(1,2,2)
tmp = plt.hist(gdsc_exprs_z.loc[gene,:], bins =50)
tmp= plt.xlabel(str(gene)+" Z-scores")

### CNA and mutations 

In [21]:
# CNA profiles with response in GDSC

print("CNA profiles with response in GDSC:")
gdsc_cna = pd.read_csv(preprocessed_data+"/CNA/"+"GDSC.Segment_Mean.CNA.tsv",sep = "\t",index_col=0)
gdsc_cna.rename(int,axis="columns",inplace=True)
for drug in gdsc_drugs:

    r = pd.read_csv(preprocessed_data+"/annotations/"+"GDSC_response."+drug+".tsv",
                           sep = "\t",index_col=0)

    shared_samples = sorted(list(set(r.index.values).intersection(set(gdsc_cna.columns.values))))
    cna = gdsc_cna.loc[:,shared_samples]
    gdsc_responses_cna = r.loc[shared_samples,:]
    # save samples with multi-omics profiles ands responses available
    gdsc_responses_multiomics[drug]["CNA"] = gdsc_responses_cna
    cna.to_csv(root_dir+"/CNA/" +"GDSC_CNA."+drug+".tsv", sep = "\t")
    r.to_csv(root_dir+"/response/"+"GDSC_response.CNA."+drug+".tsv", sep = "\t")

    R = gdsc_responses_cna.loc[gdsc_responses_cna["response"]=="R",:].shape[0]
    S = gdsc_responses_cna.loc[gdsc_responses_cna["response"]=="S",:].shape[0]
    print(drug, gdsc_responses_cna.shape[0],"R:",R,"S:",S)
    if not cna.shape[1]==gdsc_responses_cna.shape[0] == len(shared_samples):
        print(cna.shape[1],gdsc_responses_cna.shape[0], len(shared_samples),file=sys.stderr)


CNA profiles with response in GDSC:
Paclitaxel 402 R: 357 S: 26
NPK76-II-72-1 917 R: 708 S: 83
Gemcitabine 865 R: 732 S: 50
S-Trityl-L-cysteine 400 R: 367 S: 15
HG-5-88-01 496 R: 377 S: 56
Genentech Cpd 10 921 R: 704 S: 90
Vinorelbine 880 R: 747 S: 49
Epothilone B 870 R: 734 S: 53
Pyrimethamine 402 R: 347 S: 36
Afatinib 845 R: 622 S: 133
VX-680 396 R: 298 S: 80
Gefitinib 842 R: 641 S: 110
Cisplatin 846 R: 686 S: 69
EKB-569 918 R: 694 S: 98
GW843682X 402 R: 369 S: 14
5-Fluorouracil 911 R: 702 S: 84
Temozolomide 905 R: 676 S: 91
CP724714 919 R: 699 S: 94
Methotrexate 845 R: 636 S: 119
Etoposide 880 R: 752 S: 44
SN-38 925 R: 728 S: 57
Docetaxel 846 R: 695 S: 60
IPA-3 869 R: 738 S: 48
ZM-447439 795 R: 639 S: 67
Y-39983 917 R: 475 S: 316
BI-2536 400 R: 352 S: 29
GSK269962A 958 R: 334 S: 50
Cytarabine 842 R: 678 S: 74
MPS-1-IN-1 915 R: 739 S: 50
Erlotinib 372 R: 308 S: 64
Bleomycin 862 R: 715 S: 65
Camptothecin 845 R: 702 S: 52
EHT 1864 919 R: 704 S: 73
Lapatinib 398 R: 322 S: 60
GSK1070916 

In [22]:
# Point mutations with response in GDSC

print("Point mutations with response in GDSC:")
gdsc_mut = pd.read_csv(preprocessed_data+"/mutations/"+"GDSC.non_bin_mutations.tsv",sep = "\t",index_col=0)
gdsc_mut.rename(int,axis="columns",inplace=True)
for drug in gdsc_drugs:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"GDSC_response."+drug+".tsv",
                           sep = "\t",index_col=0)

    shared_samples = sorted(list(set(r.index.values).intersection(set(gdsc_mut.columns.values))))

    mut = gdsc_mut.loc[:,shared_samples]
    gdsc_responses_mut = r.loc[shared_samples,:]
    # save samples with multi-omics profiles ands responses available
    gdsc_responses_multiomics[drug]["mutations"] = gdsc_responses_mut
    mut.to_csv(root_dir+"/mutations/" +"GDSC_mutations."+drug+".tsv", sep = "\t")

    R = gdsc_responses_mut.loc[gdsc_responses_mut["response"]=="R",:].shape[0]
    S = gdsc_responses_mut.loc[gdsc_responses_mut["response"]=="S",:].shape[0]
    print(drug, gdsc_responses_mut.shape[0],"R:",R,"S:",S)

    if not mut.shape[1]==gdsc_responses_mut.shape[0] == len(shared_samples):
        print(mut.shape[1],gdsc_responses_mut.shape[0], len(shared_samples),file=sys.stderr)


Point mutations with response in GDSC:
Paclitaxel 402 R: 357 S: 26
NPK76-II-72-1 922 R: 712 S: 83
Gemcitabine 870 R: 735 S: 51
S-Trityl-L-cysteine 400 R: 367 S: 15
HG-5-88-01 496 R: 377 S: 56
Genentech Cpd 10 926 R: 708 S: 90
Vinorelbine 885 R: 751 S: 49
Epothilone B 875 R: 738 S: 53
Pyrimethamine 402 R: 347 S: 36
Afatinib 849 R: 626 S: 133
VX-680 396 R: 298 S: 80
Gefitinib 846 R: 645 S: 110
Cisplatin 850 R: 690 S: 69
EKB-569 923 R: 698 S: 98
GW843682X 402 R: 369 S: 14
5-Fluorouracil 916 R: 706 S: 84
Temozolomide 910 R: 680 S: 91
CP724714 924 R: 703 S: 94
Methotrexate 849 R: 640 S: 119
Etoposide 885 R: 755 S: 45
SN-38 930 R: 731 S: 58
Docetaxel 850 R: 699 S: 60
IPA-3 874 R: 742 S: 48
ZM-447439 799 R: 643 S: 67
Y-39983 922 R: 477 S: 318
BI-2536 400 R: 352 S: 29
GSK269962A 963 R: 334 S: 50
Cytarabine 846 R: 681 S: 75
MPS-1-IN-1 920 R: 743 S: 50
Erlotinib 372 R: 308 S: 64
Bleomycin 867 R: 719 S: 65
Camptothecin 849 R: 706 S: 52
EHT 1864 924 R: 707 S: 74
Lapatinib 398 R: 322 S: 60
GSK10709

In [23]:
# GDSC samples with all multi-OMICs and responses

print("GDSC samples with all multi-OMICs and responses")
for drug in gdsc_drugs:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"GDSC_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    
    exprs_samples = gdsc_responses_multiomics[drug]["exprs"].index.values
    cna_samples = gdsc_responses_multiomics[drug]["CNA"].index.values
    mut_samples = gdsc_responses_multiomics[drug]["mutations"].index.values
    
    r["exprs"] = 0
    r.loc[exprs_samples,'exprs'] =1
    r["CNA"]= 0
    r.loc[cna_samples,'CNA'] =1
    r["mutations"]= 0
    r.loc[mut_samples,'mutations'] =1
    r = r.loc[r[['exprs',"CNA","mutations"]].sum(axis =1)==3,:]
    r.to_csv(root_dir+"/response/"+"GDSC_response."+drug+".tsv", sep = "\t")
    shared_samples = r.index.values
    # expession
    gdsc_exprs.loc[:,shared_samples].to_csv(root_dir+"/exprs/" +"GDSC_exprs."+drug+".tsv", sep = "\t")
    gdsc_exprs_z.loc[:,shared_samples].to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+drug+".tsv", sep = "\t")
    # CNA
    gdsc_cna.loc[:,shared_samples].to_csv(root_dir+"/CNA/" +"GDSC_CNA."+drug+".tsv", sep = "\t")
    # mutations
    gdsc_mut.loc[:,shared_samples].to_csv(root_dir+"/mutations/" +"GDSC_mutations."+drug+".tsv", sep = "\t")
    
    R, S = r.loc[r["response"]=="R",:].shape[0], r.loc[r["response"]=="S",:].shape[0]
    print(drug, r.shape[0],"R:",R,"S:",S)
    

GDSC samples with all multi-OMICs and responses
Paclitaxel 389 R: 344 S: 26
NPK76-II-72-1 895 R: 689 S: 80
Gemcitabine 844 R: 711 S: 50
S-Trityl-L-cysteine 387 R: 354 S: 15
HG-5-88-01 485 R: 368 S: 54
Genentech Cpd 10 898 R: 685 S: 86
Vinorelbine 859 R: 727 S: 48
Epothilone B 849 R: 714 S: 52
Pyrimethamine 391 R: 336 S: 36
Afatinib 828 R: 608 S: 130
VX-680 383 R: 286 S: 79
Gefitinib 825 R: 628 S: 106
Cisplatin 829 R: 671 S: 67
EKB-569 896 R: 673 S: 97
GW843682X 389 R: 357 S: 13
5-Fluorouracil 889 R: 682 S: 82
Temozolomide 890 R: 663 S: 89
CP724714 897 R: 679 S: 92
Methotrexate 828 R: 624 S: 114
Etoposide 859 R: 733 S: 42
SN-38 911 R: 716 S: 55
Docetaxel 829 R: 679 S: 59
IPA-3 848 R: 718 S: 47
ZM-447439 783 R: 628 S: 66
Y-39983 895 R: 461 S: 308
BI-2536 387 R: 340 S: 28
GSK269962A 935 R: 325 S: 48
Cytarabine 825 R: 663 S: 72
MPS-1-IN-1 893 R: 719 S: 48
Erlotinib 362 R: 298 S: 64
Bleomycin 841 R: 694 S: 65
Camptothecin 828 R: 687 S: 50
EHT 1864 903 R: 690 S: 71
Lapatinib 387 R: 311 S: 60

# Testing datasets



# PDX 



### expressions 

In [24]:
# store samples with multi-omics profiles an responses available

pdx_responses_multiomics = {}
pdx_exprs = pd.read_csv(preprocessed_data+"/exprs/"+"PDX.FPKM2TPMplus1log2.Expr.tsv",sep="\t",index_col=0)
pdx_exprs_z = (pdx_exprs.T - pdx_exprs.T.mean())/pdx_exprs.T.std()
pdx_exprs_z = pdx_exprs_z.T

for drug in ["Cetuximab","Paclitaxel","Gemcitabine","5-Fluorouracil","Erlotinib","Gemcitabine"]:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"PDX_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    shared_samples = sorted(list(set(r.index.values).intersection(set(pdx_exprs.columns.values))))
    pdx_responses_exprs = r.loc[shared_samples,:]
    pdx_responses_multiomics[drug] = {"exprs":pdx_responses_exprs}
    e = pdx_exprs.loc[:,shared_samples]
    e.index.name = gene_id
    e.to_csv(root_dir+"/exprs/"+"PDX_exprs."+drug+".tsv", sep = "\t")
    e_z = pdx_exprs_z.loc[:,shared_samples]
    e_z.to_csv(root_dir+"/exprs/"+"PDX_exprs.z."+drug+".tsv", sep = "\t")
    pdx_responses_exprs.to_csv(root_dir+"/response/"+"PDX_response."+drug+".tsv", sep = "\t")
    R = pdx_responses_exprs.loc[pdx_responses_exprs["response"]=="R",:].shape[0]
    S = pdx_responses_exprs.loc[pdx_responses_exprs["response"]=="S",:].shape[0]
    print(drug, pdx_responses_exprs.shape[0],"R:",R,"S:",S)

Cetuximab 60 R: 55 S: 5
Paclitaxel 43 R: 38 S: 5
Gemcitabine 25 R: 18 S: 7
5-Fluorouracil 23 R: 22 S: 1
Erlotinib 21 R: 18 S: 3
Gemcitabine 25 R: 18 S: 7


### CNA and mutations

In [25]:
# count of sensitive and resistant samples with CNA profiles

pdx_cna = pd.read_csv(preprocessed_data+"/CNA/"+"PDX.Segment_Mean.CNA.tsv",sep="\t",index_col=0)

for drug in ["Cetuximab","Paclitaxel","Gemcitabine","5-Fluorouracil","Erlotinib"]:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"PDX_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    
    shared_samples = sorted(list(set(r.index.values).intersection(set(pdx_cna.columns.values))))
    #print(drug,"with response",r.shape[0],"with Expressions:",e.shape[1],"Shared samples",len(shared_samples))
    pdx_responses_cna = r.loc[shared_samples,:]
    pdx_responses_multiomics[drug]["CNA"] = pdx_responses_cna
    cna = pdx_cna.loc[:,shared_samples]
    cna.index.name = gene_id
    cna.to_csv(root_dir+"/CNA/"+"PDX_CNA."+drug+".tsv", sep = "\t")
    r.to_csv(root_dir+"/response/"+"PDX_response."+drug+".tsv", sep = "\t")
    R = pdx_responses_cna.loc[pdx_responses_cna["response"]=="R",:].shape[0]
    S = pdx_responses_cna.loc[pdx_responses_cna["response"]=="S",:].shape[0]
    print(drug, pdx_responses_cna.shape[0],"R:",R,"S:",S)

Cetuximab 60 R: 55 S: 5
Paclitaxel 43 R: 38 S: 5
Gemcitabine 25 R: 18 S: 7
5-Fluorouracil 23 R: 22 S: 1
Erlotinib 21 R: 18 S: 3


In [26]:
# count of sensitive and resistant samples with mutation profiles

pdx_mut = pd.read_csv(preprocessed_data+"/mutations/"+"PDX.non_bin_mutations.tsv",sep="\t",index_col=0)

for drug in ["Cetuximab","Paclitaxel","Gemcitabine","5-Fluorouracil","Erlotinib"]:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"PDX_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    
    shared_samples = sorted(list(set(r.index.values).intersection(set(pdx_mut.columns.values))))
    #print(drug,"with response",r.shape[0],"with Expressions:",e.shape[1],"Shared samples",len(shared_samples))
    pdx_responses_mut = r.loc[shared_samples,:]
    pdx_responses_multiomics[drug]["mutations"] = pdx_responses_mut
    mut = pdx_mut.loc[:,shared_samples]
    mut.index.name = gene_id
    mut.to_csv(root_dir+"/mutations/"+"PDX_mutations."+drug+".tsv", sep = "\t")
    r.to_csv(root_dir+"/response/"+"PDX_response."+drug+".tsv", sep = "\t")
    R = pdx_responses_mut.loc[pdx_responses_mut["response"]=="R",:].shape[0]
    S = pdx_responses_mut.loc[pdx_responses_mut["response"]=="S",:].shape[0]
    print(drug, pdx_responses_mut.shape[0],"R:",R,"S:",S)

Cetuximab 60 R: 55 S: 5
Paclitaxel 43 R: 38 S: 5
Gemcitabine 25 R: 18 S: 7
5-Fluorouracil 23 R: 22 S: 1
Erlotinib 21 R: 18 S: 3


In [27]:
# count of sensitive and resistant samples with response profiles

print("PDX samples with responses")
for drug in ["Cetuximab","Paclitaxel","Gemcitabine","5-Fluorouracil","Erlotinib"]:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"PDX_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    
    R, S = r.loc[r["response"]=="R",:].shape[0], r.loc[r["response"]=="S",:].shape[0]
    print(drug, r.shape[0],"R:",R,"S:",S)

PDX samples with responses
Cetuximab 64 R: 58 S: 6
Paclitaxel 46 R: 40 S: 6
Gemcitabine 26 R: 19 S: 7
5-Fluorouracil 25 R: 24 S: 1
Erlotinib 23 R: 19 S: 4


In [28]:
# PDX samples with responses and all mutli-OMICs profiles

print("PDX samples with responses and all mutli-OMICs profiles")
for drug in ["Cetuximab","Paclitaxel","Gemcitabine","5-Fluorouracil","Erlotinib"]:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"PDX_response."+drug+".tsv",
                           sep = "\t",index_col=0)
    
    exprs_snames = pdx_responses_multiomics[drug]["exprs"].index.values
    cna_snames = pdx_responses_multiomics[drug]["CNA"].index.values
    mut_snames = pdx_responses_multiomics[drug]["mutations"].index.values
    
    r["exprs"] = 0
    r.loc[exprs_snames,'exprs'] =1
    r["CNA"]= 0
    r.loc[cna_snames,'CNA'] =1
    r["mutations"]= 0
    r.loc[mut_snames,'mutations'] =1
    
    r = r.loc[r[['exprs',"CNA","mutations"]].sum(axis =1)==3,:]
    r.to_csv(root_dir+"/response/"+"PDX_response."+drug+".tsv", sep = "\t")
    # multi-omics
    shared_samples = r.index.values
    pdx_exprs.loc[:,shared_samples].to_csv(root_dir+"/exprs/"+"PDX_exprs."+drug+".tsv", sep = "\t")
    pdx_exprs_z.loc[:,shared_samples].to_csv(root_dir+"/exprs/"+"PDX_exprs.z."+drug+".tsv", sep = "\t")
    pdx_cna.loc[:,shared_samples].to_csv(root_dir+"/CNA/"+"PDX_CNA."+drug+".tsv", sep = "\t")
    pdx_mut.loc[:,shared_samples].to_csv(root_dir+"/mutations/"+"PDX_mutations."+drug+".tsv", sep = "\t")
    R, S = r.loc[r["response"]=="R",:].shape[0], r.loc[r["response"]=="S",:].shape[0]
    print(drug, r.shape[0],"R:",R,"S:",S)

PDX samples with responses and all mutli-OMICs profiles
Cetuximab 60 R: 55 S: 5
Paclitaxel 43 R: 38 S: 5
Gemcitabine 25 R: 18 S: 7
5-Fluorouracil 23 R: 22 S: 1
Erlotinib 21 R: 18 S: 3


# TCGA
 
### Testing data:
 - match sample and patient names by first 12 symbols. 
      * Some patients have more than one sample - primary and metastatic tumors; metastatic were excluded
      * Some samples represent normal tissue (barcodes 10 and 11) - excluded
      * For some patients two tumor samples available - primary and metastatic (barcode 06) - metastatic samples excluded
 

In [220]:
# find and save expression, mutation and CNA profiles of TCGA samples
    
testing_tcga_samples = [] 
for drug in ['Docetaxel','Cisplatin','Cetuximab','Paclitaxel','Gemcitabine',
             'Doxorubicin', 'Fluorouracil']: 
    print(drug, file = sys.stderr)
    #try:
    r = pd.read_csv(preprocessed_data+"/annotations/"+"TCGA_response."+drug+".tsv",
                       sep = "\t",index_col=0)
    #print(drug, r.shape[0],"R:",r[r["response"]=="R"].shape[0],
    #      "S:",r[r["response"]=="S"].shape[0])

    expr_profiles = []
    expr_profiles_z = []
    exprs_response_per_sample = []
    cna_profiles = []
    cna_response_per_sample = []
    mut_profiles = []
    mut_response_per_sample = []
    
    for cohort in set(r["cohort"].values):
        r_ = r[r["cohort"]==cohort]
        print("\t",cohort,"samples with response:",r_.shape[0])
        #print(cohort, r_.shape[0],"R:",r_[r_["response"]=="R"].shape[0],
        #      "S:",r_[r_["response"]=="S"].shape[0])
###################### exprs ###########################################################
        try:
            e = pd.read_csv(preprocessed_data + "/exprs/"+"TCGA-"+cohort+"_exprs.RSEMscaled_est2TPMplus1log2.tsv",sep = "\t",index_col=0)
            no_data = False
        except:
            print("\t","no expression for cohort", cohort,file = sys.stderr) 
            no_data = True
        if not no_data:
            e_z = (e.T - e.T.mean())/e.T.std()
            e_z = e_z.T
            pat2sample = {}
            for sample_barcode in e.columns.values:
                sname = sample_barcode.split("-")
                patient = "-".join(sname[:3])
                tissue_code = sname[3][:-1]
                if not tissue_code in["10","11","06"]:
                    if patient in r_.index.values:
                        # save patient barcode to exclude it from training cohort
                        testing_tcga_samples.append("-".join(sname[:4]))
                        if patient in pat2sample.keys():
                            pat2sample[patient].append(sample_barcode)
                            print("\t","more than one sample for patient",patient,"from cohort",cohort,file = sys.stderr)
                        else:
                            pat2sample[patient]= [sample_barcode]
            samples = []
            for pat in pat2sample.keys():
                for sample in pat2sample[pat]:
                    samples.append(sample)
                    x = r.loc[pat,["drug","response","measure_of_response","cohort"]]
                    x["bcr_patient_barcode"] = pat
                    x["exprs_sample_barcode"] = sample
                    x.name = sample
                    exprs_response_per_sample.append(x)
            expr_profiles.append(e[samples])
            expr_profiles_z.append(e_z[samples])
            print("\t\t","with expression",len(samples))
        
###################### CNA ###########################################################
        try:
            cna = pd.read_csv(preprocessed_data+"/CNA/"+"TCGA-"+cohort+".Segment_Mean.CNA.tsv",sep = "\t",index_col=0)
            no_data = False
        except:
            print("\t","no CNA for cohort", cohort,file = sys.stderr)
            no_data = True
        if not no_data:
            pat2sample = {}
            for sample_barcode in cna.columns.values:
                sname = sample_barcode.split("-")
                patient = "-".join(sname[:3])
                tissue_code = sname[3][:-1]
                if not tissue_code in["10","11","06"]:
                    if patient in r_.index.values:
                        # save patient barcode to exclude it from training cohort
                        testing_tcga_samples.append("-".join(sname[:4]))
                        if patient in pat2sample.keys():
                            pat2sample[patient].append(sample_barcode)
                            print("\t","more than one sample for patient",patient,"from cohort",cohort,file = sys.stderr)
                        else:
                            pat2sample[patient]= [sample_barcode]
            samples = []
            for pat in pat2sample.keys():
                for sample in pat2sample[pat]:
                    samples.append(sample)
                    x = r.loc[pat,["drug","response","measure_of_response","cohort"]]
                    x["bcr_patient_barcode"] = pat
                    x["cna_sample_barcode"] = sample
                    x.name = sample
                    cna_response_per_sample.append(x)
            cna_profiles.append(cna[samples])
            print("\t\t","with CNA",len(samples))
        
###################### mutations ###########################################################
        try:
            mut = pd.read_csv(preprocessed_data+"/mutations/"+"TCGA-"+cohort+".non_bin_mutations.tsv",sep = "\t",index_col=0)
            no_data = False
        except:
            print("\t","no mutations for cohort", cohort,file = sys.stderr)
            no_data = True
        if not no_data:
            pat2sample = {}
            for sample_barcode in mut.columns.values:
                sname = sample_barcode.split("-")
                patient = "-".join(sname[:3])
                tissue_code = sname[3][:-1]
                if not tissue_code in["10","11","06"]:
                    if patient in r_.index.values:
                        # save patient barcode to exclude it from training cohort
                        testing_tcga_samples.append("-".join(sname[:4]))
                        if patient in pat2sample.keys():
                            pat2sample[patient].append(sample_barcode)
                            print("\t","more than one sample for patient",patient,"from cohort",cohort,file = sys.stderr)
                        else:
                            pat2sample[patient]= [sample_barcode]
            samples = []
            for pat in pat2sample.keys():
                for sample in pat2sample[pat]:
                    samples.append(sample)
                    x = r.loc[pat,["drug","response","measure_of_response","cohort"]]
                    x["bcr_patient_barcode"] = pat
                    x.name = sample
                    x["mut_sample_barcode"] = sample
                    mut_response_per_sample.append(x)
            mut_profiles.append(mut[samples])
            print("\t\t","with mutations",len(samples))

    exprs_response_per_sample = pd.concat(exprs_response_per_sample,axis=1).T
    exprs_response_per_sample.sort_index(inplace=True)
    exprs_response_per_sample.to_csv(root_dir+"/response/"+"TCGA_response."+drug+".tsv", sep = "\t")
    expr_profiles = pd.concat(expr_profiles,axis=1)
    expr_profiles_z = pd.concat(expr_profiles_z,axis=1)
    expr_profiles.rename(lambda x: x[:16], axis="columns", inplace=True)
    expr_profiles.sort_index(axis=1,inplace=True)
    expr_profiles_z.rename(lambda x: x[:16], axis="columns", inplace=True)
    expr_profiles_z.sort_index(axis=1,inplace=True)
    if exprs_response_per_sample.shape[0]!=expr_profiles.shape[1]:
        print(drug, exprs_response_per_sample.shape[0],expr_profiles.shape[1],file=sys.stderr)
    print(drug,"Response:",r.shape[0])
    print(drug,"Expression:", exprs_response_per_sample.shape[0],
          "R:",exprs_response_per_sample.loc[exprs_response_per_sample["response"]=="R"].shape[0],
         "S:",exprs_response_per_sample.loc[exprs_response_per_sample["response"]=="S"].shape[0])
    ##### CNA and mutations #####################

    cna_profiles = pd.concat(cna_profiles,axis=1)
    cna_profiles.rename(lambda x: x[:16], axis="columns", inplace=True)
    cna_profiles.sort_index(axis=1,inplace=True)
    mut_profiles = pd.concat(mut_profiles,axis=1)
    # because some genes were mutated not in ll cohorts NA values appear
    mut_profiles.fillna(0,inplace=True)
    mut_profiles.rename(lambda x: x[:16], axis="columns", inplace=True)
    mut_profiles.sort_index(axis=1,inplace=True)
    ########## multi-omics responses ############
    cna_response_per_sample = pd.concat(cna_response_per_sample,axis=1).T
    cna_response_per_sample.sort_index(inplace=True)
    mut_response_per_sample = pd.concat(mut_response_per_sample,axis=1).T
    mut_response_per_sample.sort_index(inplace=True)
    #exprs_response_per_sample.columns
    print(drug,"CNA:", cna_response_per_sample.shape[0],
          "R:",cna_response_per_sample.loc[cna_response_per_sample["response"]=="R"].shape[0],
         "S:",cna_response_per_sample.loc[cna_response_per_sample["response"]=="S"].shape[0])
    print(drug, "mutations:", mut_response_per_sample.shape[0],
          "R:",mut_response_per_sample.loc[mut_response_per_sample["response"]=="R"].shape[0],
         "S:",mut_response_per_sample.loc[mut_response_per_sample["response"]=="S"].shape[0])
        #mut_response_per_sample.to_csv(root_dir+"/response/"+"TCGA_response."+drug+".tsv", sep = "\t")
    
    exprs_response_per_sample.rename(lambda x: x[:12],axis="index",inplace=True)
    cna_response_per_sample.rename(lambda x: x[:12],axis="index",inplace=True)
    mut_response_per_sample.rename(lambda x: x[:12],axis="index",inplace=True)

    df_reset = exprs_response_per_sample.reset_index()
    duplicated_index = df_reset[["index","bcr_patient_barcode"]].duplicated(subset='index', keep='first')
    df_reset[~duplicated_index].set_index('index', inplace = True)
    df_reset = df_reset[~duplicated_index]
    df_reset.set_index('index', inplace = True)
    exprs_response_per_sample = df_reset

    df_reset = mut_response_per_sample.reset_index()
    duplicated_index = df_reset[["index","bcr_patient_barcode"]].duplicated(subset='index', keep='first')
    df_reset[~duplicated_index].set_index('index', inplace = True)
    df_reset = df_reset[~duplicated_index]
    df_reset.set_index('index', inplace = True)
    mut_response_per_sample = df_reset

    df_reset = cna_response_per_sample.reset_index()
    duplicated_index = df_reset[["index","bcr_patient_barcode"]].duplicated(subset='index', keep='first')
    df_reset[~duplicated_index].set_index('index', inplace = True)
    df_reset = df_reset[~duplicated_index]
    df_reset.set_index('index', inplace = True)
    cna_response_per_sample = df_reset

    multi_omics_response_per_sample = pd.concat([r[["cohort","drug","response","measure_of_response"]],
               exprs_response_per_sample[["exprs_sample_barcode"]],
              cna_response_per_sample[["cna_sample_barcode"]],
              mut_response_per_sample[["mut_sample_barcode"]]],axis=1,sort=True)
    multi_omics_response_per_sample["patient"] = multi_omics_response_per_sample.index.values
    
    multi_omics_response_per_sample["patient"] = multi_omics_response_per_sample.index
    
    multi_omics_response_per_sample.dropna(subset=["exprs_sample_barcode","cna_sample_barcode","mut_sample_barcode"],inplace=True)
    samples = multi_omics_response_per_sample[["exprs_sample_barcode","cna_sample_barcode","mut_sample_barcode"]].applymap(lambda x: x[:16])
    if not multi_omics_response_per_sample.shape[0] == samples.loc[samples["exprs_sample_barcode"] == samples["cna_sample_barcode"], :].shape[0] == samples.loc[samples["exprs_sample_barcode"] == samples["mut_sample_barcode"], :].shape[0]:
        print("Sample names in expression, CNA and SNA matrices do not match",file = sys.stderr)
    else:
        multi_omics_response_per_sample["sample"] = samples["exprs_sample_barcode"] 
        multi_omics_response_per_sample= multi_omics_response_per_sample[["sample","patient","cohort","drug","response","measure_of_response",
                                                                       "exprs_sample_barcode","cna_sample_barcode","mut_sample_barcode"]]
        multi_omics_response_per_sample.set_index("sample",inplace=True)
    
    ###### write tables ###########
    # multi-OMICs responses
    multi_omics_response_per_sample.to_csv(root_dir+"/response/"+"TCGA_response."+drug+".tsv", sep = "\t")  
    # expressions
    if drug == "Fluorouracil":
        expr_profiles.columns = expr_profiles.columns.str.replace(r"-(01A|01)$", "", regex=True)    
    expr_profiles = expr_profiles.loc[:,multi_omics_response_per_sample.index.values]
    expr_profiles.to_csv(root_dir+"/exprs/"+"TCGA_exprs."+drug+".tsv", sep = "\t")
    if drug == "Fluorouracil":
        expr_profiles_z.columns = expr_profiles_z.columns.str.replace(r"-(01A|01)$", "", regex=True)     
    expr_profiles_z = expr_profiles_z.loc[:,multi_omics_response_per_sample.index.values]
    expr_profiles_z.to_csv(root_dir+"/exprs/"+"TCGA_exprs.z."+drug+".tsv", sep = "\t")
    # cna
    if drug == "Fluorouracil":
        cna_profiles.columns = cna_profiles.columns.str.replace(r"-(01A|01)$", "", regex=True) 
    cna_profiles = cna_profiles.loc[:,multi_omics_response_per_sample.index.values]
    cna_profiles.to_csv(root_dir+"/CNA/"+"TCGA_CNA."+drug+".tsv", sep = "\t")
    # mutations
    if drug == "Fluorouracil":
        mut_profiles.columns = mut_profiles.columns.str.replace(r"-(01A|01)$", "", regex=True) 
    mut_profiles = mut_profiles.loc[:,multi_omics_response_per_sample.index.values]
    mut_profiles.to_csv(root_dir+"/mutations/"+"TCGA_mutations."+drug+".tsv", sep = "\t")
    print(drug, "samples with multi-OMICs+response",multi_omics_response_per_sample.shape[0],
          "R:",multi_omics_response_per_sample.loc[multi_omics_response_per_sample["response"]=="R"].shape[0],
          "S:",multi_omics_response_per_sample.loc[multi_omics_response_per_sample["response"]=="S"].shape[0])
    

Docetaxel


	 SARC samples with response: 2
		 with expression 2
		 with CNA 2
		 with mutations 2
	 STAD samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 1
	 BRCA samples with response: 10
		 with expression 10
		 with CNA 0
		 with mutations 9
	 BLCA samples with response: 3
		 with expression 3
		 with CNA 0
		 with mutations 3
	 LUSC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 0
	 LUAD samples with response: 3
		 with expression 3
		 with CNA 0
		 with mutations 3
	 HNSC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 1
Docetaxel Response: 21
Docetaxel Expression: 21 R: 12 S: 9
Docetaxel CNA: 5 R: 3 S: 2
Docetaxel mutations: 19 R: 10 S: 9
Docetaxel samples with multi-OMICs+response 4 R: 2 S: 2
	 UCS samples with response: 2


Cisplatin


		 with expression 2
		 with CNA 2
		 with mutations 2
	 SARC samples with response: 2
		 with expression 2
		 with CNA 2
		 with mutations 2
	 CESC samples with response: 54
		 with expression 52
		 with CNA 53
		 with mutations 20
	 UCEC samples with response: 3
		 with expression 1
		 with CNA 0
		 with mutations 0
	 LUSC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 0
	 SKCM samples with response: 1
		 with expression 0
		 with CNA 0
		 with mutations 0
	 MESO samples with response: 4
		 with expression 4
		 with CNA 4
	 STAD samples with response: 6


	 no mutations for cohort MESO


		 with expression 5
		 with CNA 6
		 with mutations 4
	 HNSC samples with response: 38
		 with expression 38
		 with CNA 38
		 with mutations 37
Cisplatin Response: 111
Cisplatin Expression: 105 R: 19 S: 86
Cisplatin CNA: 106 R: 20 S: 86
Cisplatin mutations: 65 R: 7 S: 58
Cisplatin samples with multi-OMICs+response 64 R: 6 S: 58
	 HNSC samples with response: 9


Cetuximab


		 with expression 9
		 with CNA 9
		 with mutations 9
Cetuximab Response: 9
Cetuximab Expression: 9 R: 3 S: 6
Cetuximab CNA: 9 R: 3 S: 6
Cetuximab mutations: 9 R: 3 S: 6
Cetuximab samples with multi-OMICs+response 9 R: 3 S: 6
	 UCS samples with response: 3


Paclitaxel


		 with expression 3
		 with CNA 3
		 with mutations 3
	 STAD samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 1
	 CESC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 0
	 UCEC samples with response: 6
		 with expression 2
		 with CNA 0
		 with mutations 1
	 BRCA samples with response: 33
		 with expression 33
		 with CNA 0
		 with mutations 27
	 LUSC samples with response: 2
		 with expression 2
		 with CNA 2
		 with mutations 0
	 SKCM samples with response: 1
		 with expression 0
		 with CNA 0
		 with mutations 0
	 BLCA samples with response: 1
		 with expression 1
		 with CNA 0
		 with mutations 1
	 LUAD samples with response: 3
		 with expression 3
		 with CNA 0
		 with mutations 3
	 HNSC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 0
Paclitaxel Response: 52
Paclitaxel Expression: 47 R: 13 S: 34
Paclitaxel CNA: 8 R: 3 S: 5
Paclitaxel mutations: 36 R: 10 S: 26
Paclitaxel samples with multi-OM

Gemcitabine


		 with expression 3
		 with CNA 3
		 with mutations 3
	 PCPG samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 1
	 CESC samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 0
	 UCEC samples with response: 1
		 with expression 0
		 with CNA 0
		 with mutations 0
	 LUSC samples with response: 3
		 with expression 3
		 with CNA 3
		 with mutations 1
	 BLCA samples with response: 5
		 with expression 5
		 with CNA 0
		 with mutations 5
	 SKCM samples with response: 1
		 with expression 0
		 with CNA 0
		 with mutations 0
	 LUAD samples with response: 2
		 with expression 2
		 with CNA 0
		 with mutations 2
	 MESO samples with response: 1
		 with expression 1
		 with CNA 1
	 PAAD samples with response: 51


	 no mutations for cohort MESO


		 with expression 49
		 with CNA 51
		 with mutations 50
	 LIHC samples with response: 2
		 with expression 2
		 with CNA 2
		 with mutations 2
Gemcitabine Response: 71
Gemcitabine Expression: 67 R: 42 S: 25
Gemcitabine CNA: 62 R: 41 S: 21
Gemcitabine mutations: 64 R: 40 S: 24
Gemcitabine samples with multi-OMICs+response 55 R: 35 S: 20
	 THCA samples with response: 2


Doxorubicin


		 with expression 2
		 with CNA 2
		 with mutations 2
	 UCS samples with response: 1
		 with expression 1
		 with CNA 1
		 with mutations 1
	 SARC samples with response: 17


	 more than one sample for patient TCGA-K1-A42X from cohort SARC


		 with expression 18


	 more than one sample for patient TCGA-K1-A42X from cohort SARC


		 with CNA 18
		 with mutations 15
	 UCEC samples with response: 3
		 with expression 1
		 with CNA 0
		 with mutations 0
	 SKCM samples with response: 1
		 with expression 0
		 with CNA 0
		 with mutations 0
	 MESO samples with response: 1
		 with expression 1
		 with CNA 1
Doxorubicin Response: 25
Doxorubicin Expression: 23 R: 16 S: 7
Doxorubicin CNA: 22 R: 16 S: 6
Doxorubicin mutations: 18 R: 14 S: 4


	 no mutations for cohort MESO


Doxorubicin samples with multi-OMICs+response 18 R: 14 S: 4
	 STAD samples with response: 35


Fluorouracil


		 with expression 32
		 with CNA 34
		 with mutations 19
	 COAD samples with response: 4
		 with expression 1
		 with CNA 0
		 with mutations 1
	 PAAD samples with response: 6
		 with expression 6
		 with CNA 6
		 with mutations 6
	 READ samples with response: 8
		 with expression 8
		 with CNA 7
		 with mutations 8
	 ESCA samples with response: 2
		 with expression 2
		 with CNA 2
		 with mutations 2
Fluorouracil Response: 55
Fluorouracil Expression: 49 R: 17 S: 32
Fluorouracil CNA: 49 R: 17 S: 32
Fluorouracil mutations: 36 R: 9 S: 27


Sample names in expression, CNA and SNA matrices do not match


Fluorouracil samples with multi-OMICs+response 33 R: 9 S: 24


# Drugs targeting mitotic, cytoskeleton, DNA replication, and EGFR singnalling pathways



### Trainig: GDSC


### 1. EGFRi

In [238]:
EGFRi_train_dict = {"GDSC":['Erlotinib',
                            'Lapatinib',
                            'CP724714',
                            'EKB-569',
                            'Gefitinib',
                            'Afatinib',
                            'Cetuximab',
                            'HG-5-88-01']}
exprs_z = []
exprs = []
cnas = []
mutations = []
responses = []

for cohort in EGFRi_train_dict.keys():
    for drug in EGFRi_train_dict[cohort]:
        r = pd.read_csv(root_dir + "response/"+cohort+"_response."+drug+".tsv",sep = "\t", index_col = 0)
        r.rename(str,axis="index",inplace=True)
        snames= r.index.values
        responses.append(r)
        e = pd.read_csv(root_dir + "exprs/"+cohort+"_exprs."+drug+".tsv", sep = "\t", index_col = 0)
        e.rename(str,axis="columns",inplace=True)
        e = e[snames]
        exprs.append(e) 
        e_z = pd.read_csv(root_dir+"exprs/"+cohort+"_exprs.z."+drug+".tsv", sep = "\t", index_col = 0)
        e_z.rename(str,axis="columns",inplace=True)
        e_z = e_z[snames]
        exprs_z.append(e_z)
        snames = e_z.columns.values
        cna = pd.read_csv(root_dir + "CNA/"+cohort+"_CNA."+drug+".tsv",sep = "\t", index_col = 0)
        cna.rename(str,axis="columns",inplace=True)
        cna = cna[snames]
        cnas.append(cna)
        muts = pd.read_csv(root_dir + "mutations/"+cohort+"_mutations."+drug+".tsv",sep = "\t", index_col = 0)
        muts.rename(str,axis="columns",inplace=True)
        muts = muts[snames]
        mutations.append(muts)
        print(cohort,drug, len(snames))

order  = [exprs, exprs_z, cnas, mutations]
order_names  = ["exprs", "exprs_z", "cnas", "mutations"]
for i in range(0,4):
    order[i] = pd.concat(order[i],axis =1)
    order[i] = order[i].loc[:,~order[i].columns.duplicated()]
    order[i] = order[i].dropna()
    print(f"{order_names[i]}: genes x samples", order[i].shape)

[exprs, exprs_z, cnas, mutations] = order
print(exprs.shape, exprs_z.shape, cnas.shape, mutations.shape)
exprs.to_csv(root_dir+"/exprs/" +"GDSC_exprs."+"EGFRi"+".tsv", sep = "\t")
exprs_z.to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+"EGFRi"+".tsv", sep = "\t")
cnas.to_csv(root_dir+"/CNA/" +"GDSC_CNA."+"EGFRi"+".tsv", sep = "\t")
mutations.to_csv(root_dir+"/mutations/" +"GDSC_mutations."+"EGFRi"+".tsv", sep = "\t")

GDSC Erlotinib 362
GDSC Lapatinib 387
GDSC CP724714 897
GDSC EKB-569 896
GDSC Gefitinib 825
GDSC Afatinib 828
GDSC Cetuximab 856
GDSC HG-5-88-01 485
exprs: genes x samples (18597, 954)
exprs_z: genes x samples (18597, 954)
cnas: genes x samples (24402, 954)
mutations: genes x samples (18377, 954)
(18597, 954) (18597, 954) (24402, 954) (18377, 954)


In [239]:
responses = pd.concat(responses,axis=0).loc[exprs.columns.values,:]
responses.drop_duplicates(inplace=True)
responses.to_csv(root_dir+"/response/"+"GDSC_response."+"EGFRi"+".tsv", sep = "\t")
responses.head(10)

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,R,2.436586,Erlotinib,1,1,1
683665,R,2.684181,Lapatinib,1,1,1
683665,R,4.291379,CP724714,1,1,1
683665,S,-0.966774,EKB-569,1,1,1
683665,R,1.464855,Gefitinib,1,1,1
683665,R,1.490015,Afatinib,1,1,1
683665,R,6.294447,Cetuximab,1,1,1
683665,R,3.394775,HG-5-88-01,1,1,1
684055,R,3.342826,Erlotinib,1,1,1
684055,R,3.226492,Lapatinib,1,1,1


In [240]:
responses.shape

(5536, 6)

### 2. MITOSISi

In [241]:
MITOSISi_train_dict = {"GDSC":['VX-680',
                               'S-Trityl-L-cysteine',
                               'BI-2536',
                               'GW843682X',
                               'SB-715992',
                               'Genentech Cpd 10',
                               'GSK1070916',
                               'NPK76-II-72-1',
                               'MPS-1-IN-1',
                               'ZM-447439']}
exprs_z = []
exprs = []
cnas = []
mutations = []
responses = []

for cohort in MITOSISi_train_dict.keys():
    for drug in MITOSISi_train_dict[cohort]:
        r = pd.read_csv(root_dir + "response/"+cohort+"_response."+drug+".tsv",sep = "\t", index_col = 0)
        r.rename(str,axis="index",inplace=True)
        snames= r.index.values
        responses.append(r)
        e = pd.read_csv(root_dir + "exprs/"+cohort+"_exprs."+drug+".tsv", sep = "\t", index_col = 0)
        e.rename(str,axis="columns",inplace=True)
        e = e[snames]
        exprs.append(e) 
        e_z = pd.read_csv(root_dir+"exprs/"+cohort+"_exprs.z."+drug+".tsv", sep = "\t", index_col = 0)
        e_z.rename(str,axis="columns",inplace=True)
        e_z = e_z[snames]
        exprs_z.append(e_z)
        snames = e_z.columns.values
        cna = pd.read_csv(root_dir + "CNA/"+cohort+"_CNA."+drug+".tsv",sep = "\t", index_col = 0)
        cna.rename(str,axis="columns",inplace=True)
        cna = cna[snames]
        cnas.append(cna)
        muts = pd.read_csv(root_dir + "mutations/"+cohort+"_mutations."+drug+".tsv",sep = "\t", index_col = 0)
        muts.rename(str,axis="columns",inplace=True)
        muts = muts[snames]
        mutations.append(muts)
        print(cohort,drug, len(snames))

order  = [exprs, exprs_z, cnas, mutations]
order_names  = ["exprs", "exprs_z", "cnas", "mutations"]
for i in range(0,4):
    order[i] = pd.concat(order[i],axis =1)
    order[i] = order[i].loc[:,~order[i].columns.duplicated()]
    order[i] = order[i].dropna()
    print(f"{order_names[i]}: genes x samples", order[i].shape)

[exprs, exprs_z, cnas, mutations] = order
print(exprs.shape, exprs_z.shape, cnas.shape, mutations.shape)
exprs.to_csv(root_dir+"/exprs/" +"GDSC_exprs."+"MITOSISi"+".tsv", sep = "\t")
exprs_z.to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+"MITOSISi"+".tsv", sep = "\t")
cnas.to_csv(root_dir+"/CNA/" +"GDSC_CNA."+"MITOSISi"+".tsv", sep = "\t")
mutations.to_csv(root_dir+"/mutations/" +"GDSC_mutations."+"MITOSISi"+".tsv", sep = "\t")

GDSC VX-680 383
GDSC S-Trityl-L-cysteine 387
GDSC BI-2536 387
GDSC GW843682X 389
GDSC SB-715992 897
GDSC Genentech Cpd 10 898
GDSC GSK1070916 870
GDSC NPK76-II-72-1 895
GDSC MPS-1-IN-1 893
GDSC ZM-447439 783
exprs: genes x samples (18597, 949)
exprs_z: genes x samples (18597, 949)
cnas: genes x samples (24402, 949)
mutations: genes x samples (18377, 949)
(18597, 949) (18597, 949) (24402, 949) (18377, 949)


In [242]:
responses = pd.concat(responses,axis=0).loc[exprs.columns.values,:]
responses.drop_duplicates(inplace=True)
responses.to_csv(root_dir+"/response/"+"GDSC_response."+"MITOSISi"+".tsv", sep = "\t")
responses.head(10)

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,R,0.704027,VX-680,1,1,1
683665,R,0.307773,S-Trityl-L-cysteine,1,1,1
683665,R,-1.862192,BI-2536,1,1,1
683665,R,-3.038878,GW843682X,1,1,1
683665,R,-3.716158,SB-715992,1,1,1
683665,S,0.270443,Genentech Cpd 10,1,1,1
683665,R,0.436689,GSK1070916,1,1,1
683665,S,-0.473633,NPK76-II-72-1,1,1,1
683665,R,2.369307,MPS-1-IN-1,1,1,1
683665,R,0.752087,ZM-447439,1,1,1


In [243]:
responses.shape

(6782, 6)

### 3. CYTOi

In [244]:
CYTOi_train_dict = {"GDSC":['Paclitaxel',
                            'GSK269962A',
                            'Vinorelbine',
                            'PF-562271',
                            'IPA-3',
                            'Epothilone B',
                            'GSK429286A',
                            'Y-39983',
                            'Vinblastine',
                            'Docetaxel',
                            'EHT 1864']}
exprs_z = []
exprs = []
cnas = []
mutations = []
responses = []

for cohort in CYTOi_train_dict.keys():
    for drug in CYTOi_train_dict[cohort]:
        r = pd.read_csv(root_dir + "response/"+cohort+"_response."+drug+".tsv",sep = "\t", index_col = 0)
        r.rename(str,axis="index",inplace=True)
        snames= r.index.values
        responses.append(r)
        e = pd.read_csv(root_dir + "exprs/"+cohort+"_exprs."+drug+".tsv", sep = "\t", index_col = 0)
        e.rename(str,axis="columns",inplace=True)
        e = e[snames]
        exprs.append(e) 
        e_z = pd.read_csv(root_dir+"exprs/"+cohort+"_exprs.z."+drug+".tsv", sep = "\t", index_col = 0)
        e_z.rename(str,axis="columns",inplace=True)
        e_z = e_z[snames]
        exprs_z.append(e_z)
        snames = e_z.columns.values
        cna = pd.read_csv(root_dir + "CNA/"+cohort+"_CNA."+drug+".tsv",sep = "\t", index_col = 0)
        cna.rename(str,axis="columns",inplace=True)
        cna = cna[snames]
        cnas.append(cna)
        muts = pd.read_csv(root_dir + "mutations/"+cohort+"_mutations."+drug+".tsv",sep = "\t", index_col = 0)
        muts.rename(str,axis="columns",inplace=True)
        muts = muts[snames]
        mutations.append(muts)
        print(cohort,drug, len(snames))

order  = [exprs, exprs_z, cnas, mutations]
order_names  = ["exprs", "exprs_z", "cnas", "mutations"]
for i in range(0,4):
    order[i] = pd.concat(order[i],axis =1)
    order[i] = order[i].loc[:,~order[i].columns.duplicated()]
    order[i] = order[i].dropna()
    print(f"{order_names[i]}: genes x samples", order[i].shape)

[exprs, exprs_z, cnas, mutations] = order
print(exprs.shape, exprs_z.shape, cnas.shape, mutations.shape)
exprs.to_csv(root_dir+"/exprs/" +"GDSC_exprs."+"CYTOi"+".tsv", sep = "\t")
exprs_z.to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+"CYTOi"+".tsv", sep = "\t")
cnas.to_csv(root_dir+"/CNA/" +"GDSC_CNA."+"CYTOi"+".tsv", sep = "\t")
mutations.to_csv(root_dir+"/mutations/" +"GDSC_mutations."+"CYTOi"+".tsv", sep = "\t")

GDSC Paclitaxel 389
GDSC GSK269962A 935
GDSC Vinorelbine 859
GDSC PF-562271 842
GDSC IPA-3 848
GDSC Epothilone B 849
GDSC GSK429286A 898
GDSC Y-39983 895
GDSC Vinblastine 829
GDSC Docetaxel 829
GDSC EHT 1864 903
exprs: genes x samples (18597, 955)
exprs_z: genes x samples (18597, 955)
cnas: genes x samples (24402, 955)
mutations: genes x samples (18377, 955)
(18597, 955) (18597, 955) (24402, 955) (18377, 955)


In [245]:
responses = pd.concat(responses,axis=0).loc[exprs.columns.values,:]
responses.drop_duplicates(inplace=True)
responses.to_csv(root_dir+"/response/"+"GDSC_response."+"CYTOi"+".tsv", sep = "\t")
responses.head(10)

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,R,-3.647287,Paclitaxel,1,1,1
683665,R,1.275538,GSK269962A,1,1,1
683665,R,-4.544286,Vinorelbine,1,1,1
683665,R,1.722199,PF-562271,1,1,1
683665,R,1.707361,IPA-3,1,1,1
683665,R,-4.92425,Epothilone B,1,1,1
683665,R,4.066824,GSK429286A,1,1,1
683665,S,2.536259,Y-39983,1,1,1
683665,R,-5.123138,Vinblastine,1,1,1
683665,R,-4.918727,Docetaxel,1,1,1


In [246]:
responses.shape

(9051, 6)

### 4. DNA_REPi

In [247]:
DNA_REPi_train_dict = {"GDSC":["Pyrimethamine",
                               "Doxorubicin",
                               "Etoposide",
                               "Gemcitabine",
                               "Mitomycin C",
                               "5-Fluorouracil",
                               "Bleomycin",
                               "Camptothecin",
                               "Cisplatin",
                               "Cytarabine",
                               "Methotrexate",
                               "Temozolomide",
                               "SN-38"]}
exprs_z = []
exprs = []
cnas = []
mutations = []
responses = []

for cohort in DNA_REPi_train_dict.keys():
    for drug in DNA_REPi_train_dict[cohort]:
        r = pd.read_csv(root_dir + "response/"+cohort+"_response."+drug+".tsv",sep = "\t", index_col = 0)
        r.rename(str,axis="index",inplace=True)
        snames= r.index.values
        responses.append(r)
        e = pd.read_csv(root_dir + "exprs/"+cohort+"_exprs."+drug+".tsv", sep = "\t", index_col = 0)
        e.rename(str,axis="columns",inplace=True)
        e = e[snames]
        exprs.append(e) 
        e_z = pd.read_csv(root_dir+"exprs/"+cohort+"_exprs.z."+drug+".tsv", sep = "\t", index_col = 0)
        e_z.rename(str,axis="columns",inplace=True)
        e_z = e_z[snames]
        exprs_z.append(e_z)
        snames = e_z.columns.values
        cna = pd.read_csv(root_dir + "CNA/"+cohort+"_CNA."+drug+".tsv",sep = "\t", index_col = 0)
        cna.rename(str,axis="columns",inplace=True)
        cna = cna[snames]
        cnas.append(cna)
        muts = pd.read_csv(root_dir + "mutations/"+cohort+"_mutations."+drug+".tsv",sep = "\t", index_col = 0)
        muts.rename(str,axis="columns",inplace=True)
        muts = muts[snames]
        mutations.append(muts)
        print(cohort,drug, len(snames))

order  = [exprs, exprs_z, cnas, mutations]
order_names  = ["exprs", "exprs_z", "cnas", "mutations"]
for i in range(0,4):
    order[i] = pd.concat(order[i],axis =1)
    order[i] = order[i].loc[:,~order[i].columns.duplicated()]
    order[i] = order[i].dropna()
    print(f"{order_names[i]}: genes x samples", order[i].shape)

[exprs, exprs_z, cnas, mutations] = order
print(exprs.shape, exprs_z.shape, cnas.shape, mutations.shape)
exprs.to_csv(root_dir+"/exprs/" +"GDSC_exprs."+"DNA_REPi"+".tsv", sep = "\t")
exprs_z.to_csv(root_dir+"/exprs/" +"GDSC_exprs.z."+"DNA_REPi"+".tsv", sep = "\t")
cnas.to_csv(root_dir+"/CNA/" +"GDSC_CNA."+"DNA_REPi"+".tsv", sep = "\t")
mutations.to_csv(root_dir+"/mutations/" +"GDSC_mutations."+"DNA_REPi"+".tsv", sep = "\t")

GDSC Pyrimethamine 391
GDSC Doxorubicin 849
GDSC Etoposide 859
GDSC Gemcitabine 844
GDSC Mitomycin C 850
GDSC 5-Fluorouracil 889
GDSC Bleomycin 841
GDSC Camptothecin 828
GDSC Cisplatin 829
GDSC Cytarabine 825
GDSC Methotrexate 828
GDSC Temozolomide 890
GDSC SN-38 911
exprs: genes x samples (18597, 954)
exprs_z: genes x samples (18597, 954)
cnas: genes x samples (24402, 954)
mutations: genes x samples (18377, 954)
(18597, 954) (18597, 954) (24402, 954) (18377, 954)


In [248]:
responses = pd.concat(responses,axis=0).loc[exprs.columns.values,:]
responses.drop_duplicates(inplace=True)
responses.to_csv(root_dir+"/response/"+"GDSC_response."+"DNA_REPi"+".tsv", sep = "\t")
responses.head(10)

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,R,5.120755,Pyrimethamine,1,1,1
683665,R,-3.037116,Doxorubicin,1,1,1
683665,R,-0.712119,Etoposide,1,1,1
683665,R,-4.408972,Gemcitabine,1,1,1
683665,R,-1.597524,Mitomycin C,1,1,1
683665,S,0.145949,5-Fluorouracil,1,1,1
683665,R,0.474214,Bleomycin,1,1,1
683665,R,-4.527144,Camptothecin,1,1,1
683665,R,2.807269,Cisplatin,1,1,1
683665,S,-2.954429,Cytarabine,1,1,1


In [249]:
responses.shape

(10634, 6)