This notebook does stratified 3 fold splits of the datasets to get splits for each experiment.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
def get_response_labels(category): # like in CODE-AE
    if category in ["Clinical Progressive Disease", "Stable Disease", "PD", "SD", "Progressive Disease"]:
        return 0
    if category in ["Partial Response", "Complete Response", "PR", "Non-PR/CR"]:
        return 1
    else:
        return None

In [3]:
# cancer type mapping done by searching on Google for "<Moore's tumor type> tcga" - choosing the TCGA Project page link that shows up in the search results
def get_cancer_type(cancer):
    if cancer in ["Breast"]:
        return "TCGA-BRCA" # based on info at https://portal.gdc.cancer.gov/projects/TCGA-BRCA
    if cancer in ["Brain"]:
        return "TCGA-GBM" # based on info at https://portal.gdc.cancer.gov/projects/TCGA-GBM
    if cancer in ["Lung"]:
        return "TCGA-LUAD" # based on info at https://portal.gdc.cancer.gov/projects/TCGA-LUAD
    if cancer in ["Skin/Melanoma"]:
        return "TCGA-SKCM" # based on info at https://portal.gdc.cancer.gov/projects/TCGA-SKCM
    if cancer in ["Head and neck"]:
        return "TCGA-HNSC" # based on info at https://portal.gdc.cancer.gov/projects/TCGA-HNSC
    else:
        return cancer # for GI, GU could not find project page for TCGA when searching for cancer type in Google Search

In [4]:
drugs_with_smiles = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/metadata/drug_smiles.csv", header=None, index_col=0)
drugs_with_smiles

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
JW-7-24-1,COC1=CC(=CC(=C1)C2=CC3=C4C(=CN=C3C=C2)C=CC(=O)...
KIN001-260,C1CC1COC2=CC=CC(=O)C2=C3C=C(C(=C(N3)N)C#N)C4CC...
NSC-87877,C1=CC2=C(C(=O)C(=NNC3=CC4=C(C=C3)C=C(C=C4)S(=O...
GNE-317,CC1=C(SC2=C1N=C(N=C2N3CCOCC3)C4=CN=C(N=C4)N)C5...
NAVITOCLAX,CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(...
...,...
SB590885,O=NC1=c2cc/c(=c/3\nc([nH]c3=c3cc[nH]cc3)c3ccc(...
STAUROSPORINE,[H][C@]1(C[C@@]2([H])O[C@](C)(N3C4=CC=CC=C4C4=...
TW 37,CC(C)C1=CC=CC=C1CC1=C(O)C(O)=C(O)C(=C1)C(=O)NC...
ULIXERTINIB,[H]N([C@H](CO)C1=CC(Cl)=CC=C1)C(=O)C1=CC(=CN1[...


### Patient datasets

#### TCGA

In [5]:
tcga_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/tcga_recist.csv")
tcga_response_df["dataset_name"] = "TCGA"
tcga_response_df

Unnamed: 0,sample_id,drug_name,recist_like,days_to_drug_therapy_start,days_to_drug_therapy_end,drug_category,dataset_name
0,TCGA-VQ-A928,FLUOROURACIL,Clinical Progressive Disease,98,138,1,TCGA
1,TCGA-JY-A93D,FLUOROURACIL,Complete Response,150,280,1,TCGA
2,TCGA-P3-A6T3,CISPLATIN,Complete Response,61,92,1,TCGA
3,TCGA-AZ-6606,FLUOROURACIL,Clinical Progressive Disease,165,165,1,TCGA
4,TCGA-VS-A954,CISPLATIN,Complete Response,94,122,1,TCGA
...,...,...,...,...,...,...,...
606,TCGA-A7-A2KD,TRASTUZUMAB,Complete Response,57,435,1,TCGA
607,TCGA-A2-A3XX,PACLITAXEL,Complete Response,143,185,1,TCGA
608,TCGA-A2-A0CK,DOCETAXEL,Complete Response,1451,1518,1,TCGA
609,TCGA-A2-A3XT,DOCETAXEL,Partial Response,97,161,1,TCGA


In [6]:
# merge with cancer type
tcga_metadata = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/metadata/tcga_metadata.csv", index_col = 0)
tcga_metadata

Unnamed: 0,case_submitter_id,project_id
0,TCGA-DD-AAVP,TCGA-LIHC
1,TCGA-KK-A7B2,TCGA-PRAD
2,TCGA-DC-6158,TCGA-READ
3,TCGA-DD-A4NP,TCGA-LIHC
4,TCGA-HQ-A5ND,TCGA-BLCA
...,...,...
11423,TCGA-BP-4790,TCGA-KIRC
11424,TCGA-N9-A4Q4,TCGA-UCS
11425,TCGA-RY-A847,TCGA-LGG
11426,TCGA-AB-2881,TCGA-LAML


In [7]:
tcga_response_df = tcga_response_df.merge(tcga_metadata, left_on="sample_id", right_on="case_submitter_id", how="left").drop(["case_submitter_id"], axis=1)
tcga_response_df

Unnamed: 0,sample_id,drug_name,recist_like,days_to_drug_therapy_start,days_to_drug_therapy_end,drug_category,dataset_name,project_id
0,TCGA-VQ-A928,FLUOROURACIL,Clinical Progressive Disease,98,138,1,TCGA,TCGA-STAD
1,TCGA-JY-A93D,FLUOROURACIL,Complete Response,150,280,1,TCGA,TCGA-ESCA
2,TCGA-P3-A6T3,CISPLATIN,Complete Response,61,92,1,TCGA,TCGA-HNSC
3,TCGA-AZ-6606,FLUOROURACIL,Clinical Progressive Disease,165,165,1,TCGA,TCGA-COAD
4,TCGA-VS-A954,CISPLATIN,Complete Response,94,122,1,TCGA,TCGA-CESC
...,...,...,...,...,...,...,...,...
606,TCGA-A7-A2KD,TRASTUZUMAB,Complete Response,57,435,1,TCGA,TCGA-BRCA
607,TCGA-A2-A3XX,PACLITAXEL,Complete Response,143,185,1,TCGA,TCGA-BRCA
608,TCGA-A2-A0CK,DOCETAXEL,Complete Response,1451,1518,1,TCGA,TCGA-BRCA
609,TCGA-A2-A3XT,DOCETAXEL,Partial Response,97,161,1,TCGA,TCGA-BRCA


In [8]:
tcga_response_df.recist_like.value_counts()

recist_like
Complete Response               252
Clinical Progressive Disease    220
Stable Disease                  109
Partial Response                 30
Name: count, dtype: int64

In [9]:
tcga_response_df["recist"] = tcga_response_df.recist_like.apply(lambda x: get_response_labels(x))
tcga_response_df

Unnamed: 0,sample_id,drug_name,recist_like,days_to_drug_therapy_start,days_to_drug_therapy_end,drug_category,dataset_name,project_id,recist
0,TCGA-VQ-A928,FLUOROURACIL,Clinical Progressive Disease,98,138,1,TCGA,TCGA-STAD,0
1,TCGA-JY-A93D,FLUOROURACIL,Complete Response,150,280,1,TCGA,TCGA-ESCA,1
2,TCGA-P3-A6T3,CISPLATIN,Complete Response,61,92,1,TCGA,TCGA-HNSC,1
3,TCGA-AZ-6606,FLUOROURACIL,Clinical Progressive Disease,165,165,1,TCGA,TCGA-COAD,0
4,TCGA-VS-A954,CISPLATIN,Complete Response,94,122,1,TCGA,TCGA-CESC,1
...,...,...,...,...,...,...,...,...,...
606,TCGA-A7-A2KD,TRASTUZUMAB,Complete Response,57,435,1,TCGA,TCGA-BRCA,1
607,TCGA-A2-A3XX,PACLITAXEL,Complete Response,143,185,1,TCGA,TCGA-BRCA,1
608,TCGA-A2-A0CK,DOCETAXEL,Complete Response,1451,1518,1,TCGA,TCGA-BRCA,1
609,TCGA-A2-A3XT,DOCETAXEL,Partial Response,97,161,1,TCGA,TCGA-BRCA,1


In [10]:
tcga_response_df.rename(columns={"project_id": "mappedProject"}, inplace=True)

#### Moore's

In [11]:
moores_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/moores_recist.csv")
moores_response_df["dataset_name"] = "Moores"
moores_response_df

Unnamed: 0,patient_id,drug_name,recist,drug_category,dataset_name
0,2,LAPATINIB,1,1,Moores
1,3,TAMOXIFEN,1,1,Moores
2,6,PONATINIB,0,1,Moores
3,7,BEVACIZUMAB,0,1,Moores
4,8,LAPATINIB,0,1,Moores
5,10,SORAFENIB,0,1,Moores
6,12,TRASTUZUMAB,1,1,Moores
7,13,BEVACIZUMAB,1,1,Moores
8,14,DABRAFENIB,0,1,Moores
9,16,LETROZOLE,0,1,Moores


In [12]:
# merge with cancer type
moores_metadata = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/metadata/labelled_moores_metadata.csv", skiprows=[1])
moores_metadata

Unnamed: 0,#,Tumor type,Alterations,Matched drug,SD ≥ 6 months/PR/CR
0,1,Lung,PTEN splice site 493-1 G>A,everolimus,YES
1,2,Breast,"EGFR amplification, CCND1 amplification, CDKN...",lapatinib,YES
2,3,Breast,ESR1a Y537S,tamoxifen,YES
3,4,Head and neck,"PTEN I67K, CDKN2A/B loss, CTNNB1 T257I, MCL1 a...",everolimus,NO
4,5,Head and neck,"ERBB2 amplification, FGFR4 amplification, NF1 ...",everolimus + lapatinib,NO
...,...,...,...,...,...
82,83,Breast,"IGF1R amplification, GATA3 M401fs*45+",exemestane for ER+,NO
83,84,Breast,"SRC amplification, TOP1 amplification, MDM2 am...",exemestane for ER+,NO
84,85,Breast,"RPTOR amplification, CDKN2A/B loss, CCND1 amp...",letrozole for ER+,NO
85,86,Breast,"AURKA amplification – equivocal⌘, CCND1 amplif...",exemestane for ER+,NO


In [13]:
moores_response_df = pd.merge(moores_response_df, moores_metadata[["#", "Tumor type"]], left_on="patient_id", right_on="#", how="left").drop(["#"], axis=1)
moores_response_df.rename(columns={"patient_id": "sample_id"}, inplace=True)
moores_response_df

Unnamed: 0,sample_id,drug_name,recist,drug_category,dataset_name,Tumor type
0,2,LAPATINIB,1,1,Moores,Breast
1,3,TAMOXIFEN,1,1,Moores,Breast
2,6,PONATINIB,0,1,Moores,GI
3,7,BEVACIZUMAB,0,1,Moores,GI
4,8,LAPATINIB,0,1,Moores,Brain
5,10,SORAFENIB,0,1,Moores,Breast
6,12,TRASTUZUMAB,1,1,Moores,Breast
7,13,BEVACIZUMAB,1,1,Moores,GI
8,14,DABRAFENIB,0,1,Moores,Skin/Melanoma
9,16,LETROZOLE,0,1,Moores,Breast


In [14]:
moores_response_df["Tumor type"].value_counts()

Tumor type
Breast           17
GU                7
Brain             6
GI                5
Lung              5
Skin/Melanoma     2
Head and neck     2
Name: count, dtype: int64

In [15]:
moores_response_df["mappedProject"] = moores_response_df["Tumor type"].apply(lambda x: get_cancer_type(x))
moores_response_df

Unnamed: 0,sample_id,drug_name,recist,drug_category,dataset_name,Tumor type,mappedProject
0,2,LAPATINIB,1,1,Moores,Breast,TCGA-BRCA
1,3,TAMOXIFEN,1,1,Moores,Breast,TCGA-BRCA
2,6,PONATINIB,0,1,Moores,GI,GI
3,7,BEVACIZUMAB,0,1,Moores,GI,GI
4,8,LAPATINIB,0,1,Moores,Brain,TCGA-GBM
5,10,SORAFENIB,0,1,Moores,Breast,TCGA-BRCA
6,12,TRASTUZUMAB,1,1,Moores,Breast,TCGA-BRCA
7,13,BEVACIZUMAB,1,1,Moores,GI,GI
8,14,DABRAFENIB,0,1,Moores,Skin/Melanoma,TCGA-SKCM
9,16,LETROZOLE,0,1,Moores,Breast,TCGA-BRCA


#### CBIO hcc_mskimpact_2018 (Hepatocellular Carcinoma HCC)

In [16]:
cbio_hcc_mskimpact_2018_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/cbio_hcc_mskimpact_2018_recist.csv")
cbio_hcc_mskimpact_2018_response_df["dataset_name"] = "CBIO_hcc_mskimpact_2018"
cbio_hcc_mskimpact_2018_response_df["mappedProject"] = "TCGA-LIHC" # Hepatocellular Carcinoma based on info at https://portal.gdc.cancer.gov/projects/TCGA-LIHC Google search "Hepatocellular carcinoma tcga" and selecting TCGA project page from results
cbio_hcc_mskimpact_2018_response_df

Unnamed: 0,sample_id,drug_name,recist_like,drug_category,dataset_name,mappedProject
0,P-0005038-T02-IM6,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
1,P-0015203-T01-IM6,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
2,P-0019058-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
3,P-0019238-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
4,P-0020359-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
5,P-0001324-T01-IM3,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
6,P-0003212-T01-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
7,P-0005757-T01-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
8,P-0006245-T02-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC
9,P-0012628-T01-IM5,SORAFENIB,PR,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC


In [17]:
cbio_hcc_mskimpact_2018_response_df.recist_like.value_counts()

recist_like
PD           25
SD           16
PR            1
Non-PR/CR     1
Name: count, dtype: int64

In [18]:
cbio_hcc_mskimpact_2018_response_df["recist"] = cbio_hcc_mskimpact_2018_response_df.recist_like.apply(lambda x: get_response_labels(x))
cbio_hcc_mskimpact_2018_response_df

Unnamed: 0,sample_id,drug_name,recist_like,drug_category,dataset_name,mappedProject,recist
0,P-0005038-T02-IM6,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
1,P-0015203-T01-IM6,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
2,P-0019058-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
3,P-0019238-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
4,P-0020359-T01-IM6,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
5,P-0001324-T01-IM3,SORAFENIB,SD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
6,P-0003212-T01-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
7,P-0005757-T01-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
8,P-0006245-T02-IM5,SORAFENIB,PD,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,0
9,P-0012628-T01-IM5,SORAFENIB,PR,1,CBIO_hcc_mskimpact_2018,TCGA-LIHC,1


#### CBIO brca_mskcc_2019

In [19]:
cbio_brca_mskcc_2019_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/cbio_brca_mskcc_2019_recist.csv")
cbio_brca_mskcc_2019_response_df["dataset_name"] = "CBIO_brca_mskcc_2019"
cbio_brca_mskcc_2019_response_df["mappedProject"] = "TCGA-BRCA" # Breast Cancer based on info at https://portal.gdc.cancer.gov/projects/TCGA-BRCA search for "Breast cancer tcga" and selecting TCGA Project page from results
cbio_brca_mskcc_2019_response_df

Unnamed: 0,sample_id,drug_name,recist_like,drug_category,dataset_name,mappedProject
0,s_DS_bkm_001_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
1,s_DS_bkm_002_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
2,s_DS_bkm_003_T,BUPARLISIB,Partial Response,2,CBIO_brca_mskcc_2019,TCGA-BRCA
3,s_DS_bkm_005_T,BUPARLISIB,Complete Response,2,CBIO_brca_mskcc_2019,TCGA-BRCA
4,s_DS_bkm_006_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
5,s_DS_bkm_007_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
6,s_DS_bkm_008_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
7,s_DS_bkm_009_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
8,s_DS_bkm_010_T,BUPARLISIB,Progressive Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA
9,s_DS_bkm_013_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA


In [20]:
cbio_brca_mskcc_2019_response_df.recist_like.value_counts()

recist_like
Stable Disease         16
Progressive Disease     9
Partial Response        1
Complete Response       1
Name: count, dtype: int64

In [21]:
cbio_brca_mskcc_2019_response_df["recist"] = cbio_brca_mskcc_2019_response_df.recist_like.apply(lambda x: get_response_labels(x))
cbio_brca_mskcc_2019_response_df

Unnamed: 0,sample_id,drug_name,recist_like,drug_category,dataset_name,mappedProject,recist
0,s_DS_bkm_001_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
1,s_DS_bkm_002_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
2,s_DS_bkm_003_T,BUPARLISIB,Partial Response,2,CBIO_brca_mskcc_2019,TCGA-BRCA,1
3,s_DS_bkm_005_T,BUPARLISIB,Complete Response,2,CBIO_brca_mskcc_2019,TCGA-BRCA,1
4,s_DS_bkm_006_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
5,s_DS_bkm_007_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
6,s_DS_bkm_008_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
7,s_DS_bkm_009_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
8,s_DS_bkm_010_T,BUPARLISIB,Progressive Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0
9,s_DS_bkm_013_T,BUPARLISIB,Stable Disease,2,CBIO_brca_mskcc_2019,TCGA-BRCA,0


In [22]:
combined_patient_response_df = pd.concat([
    tcga_response_df[["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]],
    moores_response_df[["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]],
    cbio_brca_mskcc_2019_response_df[["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]],
    cbio_hcc_mskimpact_2018_response_df[["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]],
])

In [23]:
combined_patient_response_df.shape

(725, 5)

In [24]:
combined_patient_response_df

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,TCGA-VQ-A928,FLUOROURACIL,0,TCGA-STAD,TCGA
1,TCGA-JY-A93D,FLUOROURACIL,1,TCGA-ESCA,TCGA
2,TCGA-P3-A6T3,CISPLATIN,1,TCGA-HNSC,TCGA
3,TCGA-AZ-6606,FLUOROURACIL,0,TCGA-COAD,TCGA
4,TCGA-VS-A954,CISPLATIN,1,TCGA-CESC,TCGA
...,...,...,...,...,...
38,P-0013161-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
39,P-0010148-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
40,P-0013312-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
41,P-0010226-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [25]:
len(combined_patient_response_df.drug_name.unique()) # 56 drugs in Exp 2

56

### Divide into train-test split based on cancer type and drug

* For each mappedProject, per dataset, in each drug, divide into stratified 3 fold when number of samples per mappedProject per drug is >= 20. Else use them for training.

In [26]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state = 42)

In [27]:
combined_patient_response_df.groupby(["drug_name", "mappedProject", "dataset_name"]).agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sample_id,recist
drug_name,mappedProject,dataset_name,Unnamed: 3_level_1,Unnamed: 4_level_1
AXITINIB,TCGA-KIRC,TCGA,1,1
AXITINIB,TCGA-SKCM,TCGA,1,1
BCG,TCGA-BLCA,TCGA,2,2
BEVACIZUMAB,GI,Moores,4,4
BEVACIZUMAB,GU,Moores,4,4
...,...,...,...,...
VINORELBINE,TCGA-MESO,TCGA,1,1
VISMODEGIB,TCGA-HNSC,Moores,1,1
VORINOSTAT,TCGA-MESO,TCGA,1,1
VORINOSTAT,TCGA-SARC,TCGA,1,1


In [28]:
# Drugs considered for experiments
combined_patient_response_df.groupby(["drug_name"]).agg("count")[combined_patient_response_df.groupby(["drug_name"])["sample_id"].agg("count") > 20]

Unnamed: 0_level_0,sample_id,recist,mappedProject,dataset_name
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BEVACIZUMAB,23,23,23,23
BUPARLISIB,27,27,27,27
CAPECITABINE,24,24,24,24
CISPLATIN,104,104,104,104
DOCETAXEL,26,26,26,26
FLUOROURACIL,50,50,50,50
GEMCITABINE,61,61,61,61
PACLITAXEL,40,40,40,40
SORAFENIB,60,60,60,60
TEMOZOLOMIDE,99,99,99,99


In [29]:
combined_patient_response_df[combined_patient_response_df.drug_name == "DASATINIB"].groupby(["drug_name", "mappedProject", "dataset_name"]).agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sample_id,recist
drug_name,mappedProject,dataset_name,Unnamed: 3_level_1,Unnamed: 4_level_1
DASATINIB,TCGA-BRCA,Moores,1,1
DASATINIB,TCGA-MESO,TCGA,7,7


In [30]:
combined_patient_response_df[combined_patient_response_df.drug_name == "BUPARLISIB"].recist.value_counts()

recist
0    25
1     2
Name: count, dtype: int64

In [31]:
combined_patient_response_df[combined_patient_response_df.dataset_name == "CBIO_hcc_mskimpact_2018"].recist.value_counts()

recist
0    41
1     2
Name: count, dtype: int64

In [32]:
groupings = list(combined_patient_response_df.groupby(["drug_name", "mappedProject", "dataset_name"]).agg("count").index)
len(groupings)

164

In [33]:
split0_samples = {"train": {}, "test": {}}
split1_samples = {"train": {}, "test": {}}
split2_samples = {"train": {}, "test": {}}

In [34]:
for drug, cancer_type, dataset in groupings:
    subset_df = combined_patient_response_df[
    (combined_patient_response_df.drug_name == drug) &
    (combined_patient_response_df.mappedProject == cancer_type) &
    (combined_patient_response_df.dataset_name == dataset)
    ].reset_index(drop=True) # same as count attribute above
    # divide into stratified 3 fold when number of samples per mappedProject per drug is >= 20. Else use them for training.
    if len(subset_df) >= 20:
        three_fold_indices = list(skf.split(subset_df["sample_id"], subset_df["recist"]))
        split0_samples["train"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[0][0]] # 0th index for split 0, 0th index for train
        # split0_samples["train"][(drug, cancer_type, dataset)], split0_samples["val"][(drug, cancer_type, dataset)] = train_test_split(subset_df.loc[three_fold_indices[0][0]].reset_index(drop=True), random_state=42, test_size=0.1) # 0th index for split 0, 0th index for train
        split0_samples["test"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[0][1]] # 0th index for split 0, 1st index for test
        split1_samples["train"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[1][0]] # 1st index for split 1, 0th index for train
        # split1_samples["train"][(drug, cancer_type, dataset)], split1_samples["val"][(drug, cancer_type, dataset)] = train_test_split(subset_df.loc[three_fold_indices[1][0]].reset_index(drop=True), random_state=42, test_size=0.1) # 1st index for split 1, 0th index for train
        split1_samples["test"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[1][1]] # 1st index for split 1, 1st index for test
        split2_samples["train"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[2][0]] # 2nd index for split 2, 0th index for train
        # split2_samples["train"][(drug, cancer_type, dataset)], split2_samples["val"][(drug, cancer_type, dataset)] = train_test_split(subset_df.loc[three_fold_indices[2][0]].reset_index(drop=True), random_state=42, test_size=0.1) # 2nd index for split 2, 0th index for train
        split2_samples["test"][(drug, cancer_type, dataset)] = subset_df.loc[three_fold_indices[2][1]] # 2nd index for split 2, 1st index for test
    else:
        # use these samples in training all 3 splits
        split0_samples["train"][(drug, cancer_type, dataset)] = subset_df
        split1_samples["train"][(drug, cancer_type, dataset)] = subset_df
        split2_samples["train"][(drug, cancer_type, dataset)] = subset_df



In [35]:
split0_samples["test"][("SORAFENIB", "TCGA-LIHC", "CBIO_hcc_mskimpact_2018")]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
1,P-0015203-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
2,P-0019058-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
4,P-0020359-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
10,P-0021780-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
12,P-0012317-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
14,P-0005357-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
20,P-0001409-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
22,P-0001188-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
23,P-0001852-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
24,P-0002697-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [36]:
split1_samples["test"][("SORAFENIB", "TCGA-LIHC", "CBIO_hcc_mskimpact_2018")]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,P-0005038-T02-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
3,P-0019238-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
6,P-0003212-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
11,P-0000228-T03-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
13,P-0009644-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
15,P-0002719-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
17,P-0002317-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
21,P-0001307-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
26,P-0001806-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
33,P-0006403-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [37]:
split2_samples["test"][("SORAFENIB", "TCGA-LIHC", "CBIO_hcc_mskimpact_2018")]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
5,P-0001324-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
7,P-0005757-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
8,P-0006245-T02-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
9,P-0012628-T01-IM5,SORAFENIB,1,TCGA-LIHC,CBIO_hcc_mskimpact_2018
16,P-0001321-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
18,P-0000587-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
19,P-0004247-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
25,P-0000182-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
29,P-0005782-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
30,P-0007787-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


### Experiment 1 - single drug models

In [38]:
combined_patient_response_df.groupby(["drug_name", "mappedProject"]).agg("count").sort_values(by="sample_id", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id,recist,dataset_name
drug_name,mappedProject,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TEMOZOLOMIDE,TCGA-LGG,91,91,91
SORAFENIB,TCGA-LIHC,56,56,56
CISPLATIN,TCGA-CESC,50,50,50
GEMCITABINE,TCGA-PAAD,46,46,46
CISPLATIN,TCGA-HNSC,38,38,38
...,...,...,...,...
DOXORUBICIN,TCGA-UCS,1,1,1
ERLOTINIB,TCGA-BLCA,1,1,1
ERLOTINIB,TCGA-PAAD,1,1,1
ETOPOSIDE,TCGA-ESCA,1,1,1


#### Setting A: train on pancancer data and test on single cancer type

* Select drugs with atleast 20 labelled samples in combined dataframe.
* Check if these drugs are present in the test splits - consider only if test data exists.

In [39]:
selected_drugs_setting1 = combined_patient_response_df.groupby(["drug_name"]).agg("count")[combined_patient_response_df.groupby(["drug_name"]).agg("count")["sample_id"] >= 20]
selected_drugs_setting1

Unnamed: 0_level_0,sample_id,recist,mappedProject,dataset_name
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BEVACIZUMAB,23,23,23,23
BUPARLISIB,27,27,27,27
CAPECITABINE,24,24,24,24
CARBOPLATIN,20,20,20,20
CISPLATIN,104,104,104,104
DOCETAXEL,26,26,26,26
DOXORUBICIN,20,20,20,20
FLUOROURACIL,50,50,50,50
GEMCITABINE,61,61,61,61
PACLITAXEL,40,40,40,40


In [40]:
list(split0_samples["test"].keys()) # no Docetaxel, no Bevacizumab, no Capecitabine, no Carboplatin, no Doxorubicin since they have 0 test samples

[('BUPARLISIB', 'TCGA-BRCA', 'CBIO_brca_mskcc_2019'),
 ('CISPLATIN', 'TCGA-CESC', 'TCGA'),
 ('CISPLATIN', 'TCGA-HNSC', 'TCGA'),
 ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'),
 ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'),
 ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'),
 ('SORAFENIB', 'TCGA-LIHC', 'CBIO_hcc_mskimpact_2018'),
 ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')]

In [41]:
setting1A_split0 = {"train": {}, "val": {}, "test": {}}
setting1A_split1 = {"train": {}, "val": {}, "test": {}}
setting1A_split2 = {"train": {}, "val": {}, "test": {}}

In [42]:
for drug in selected_drugs_setting1.index:
    if drug in [i[0] for i in list(split0_samples["test"].keys())]: # Buparlisib, Cisplatin, Fluorouracil, Gemcitabine, Temozolomide, Paclitaxel, Sorafenib, split number does not matter since they all match in keys
        # split 0 train
        df_list = []
        for k, v in split0_samples["train"].items():
            if k[0] == drug:
                df_list.append(v)
        # setting1A_split0["train"][drug] = pd.concat(df_list, ignore_index=True)
        train_data = pd.concat(df_list, ignore_index=True)
        # split 0 val carved out from train split
        setting1A_split0["train"][drug], setting1A_split0["val"][drug] = train_test_split(train_data, random_state=42, test_size=0.1)

        # split 0 test
        df_list = []
        for k, v in split0_samples["test"].items():
            if k[0] == drug:
                df_list.append(v)
        setting1A_split0["test"][drug] = pd.concat(df_list, ignore_index=True)

        # split 1 train
        df_list = []
        for k, v in split1_samples["train"].items():
            if k[0] == drug:
                df_list.append(v)
        # setting1A_split1["train"][drug] = pd.concat(df_list, ignore_index=True)
        train_data = pd.concat(df_list, ignore_index=True)
        # split 1 val carved out from train split
        setting1A_split1["train"][drug], setting1A_split1["val"][drug] = train_test_split(train_data, random_state=42, test_size=0.1)

        # split 1 test
        df_list = []
        for k, v in split1_samples["test"].items():
            if k[0] == drug:
                df_list.append(v)
        setting1A_split1["test"][drug] = pd.concat(df_list, ignore_index=True)
        
        # split 2 train
        df_list = []
        for k, v in split2_samples["train"].items():
            if k[0] == drug:
                df_list.append(v)
        # setting1A_split2["train"][drug] = pd.concat(df_list, ignore_index=True)
        train_data = pd.concat(df_list, ignore_index=True)
        # split 2 val carved out from train split
        setting1A_split2["train"][drug], setting1A_split2["val"][drug] = train_test_split(train_data, random_state=42, test_size=0.1)

        # split 2 test
        df_list = []
        for k, v in split2_samples["test"].items():
            if k[0] == drug:
                df_list.append(v)
        setting1A_split2["test"][drug] = pd.concat(df_list, ignore_index=True)
            
            
    

In [43]:
setting1A_split2["test"]["SORAFENIB"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,P-0001324-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
1,P-0005757-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
2,P-0006245-T02-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
3,P-0012628-T01-IM5,SORAFENIB,1,TCGA-LIHC,CBIO_hcc_mskimpact_2018
4,P-0001321-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
5,P-0000587-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
6,P-0004247-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
7,P-0000182-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
8,P-0005782-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
9,P-0007787-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [44]:
setting1A_split0["train"]["CISPLATIN"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
12,TCGA-VS-A94Y,CISPLATIN,0,TCGA-CESC,TCGA
55,TCGA-IQ-A61G,CISPLATIN,0,TCGA-HNSC,TCGA
65,TCGA-D3-A1Q3,CISPLATIN,1,TCGA-SKCM,TCGA
31,TCGA-VS-A9UH,CISPLATIN,1,TCGA-CESC,TCGA
9,TCGA-DS-A7WF,CISPLATIN,1,TCGA-CESC,TCGA
...,...,...,...,...,...
20,TCGA-VS-A9UA,CISPLATIN,1,TCGA-CESC,TCGA
60,TCGA-LK-A4NZ,CISPLATIN,0,TCGA-MESO,TCGA
71,TCGA-QS-A5YQ,CISPLATIN,1,TCGA-UCEC,TCGA
14,TCGA-IR-A3LL,CISPLATIN,1,TCGA-CESC,TCGA


In [45]:
setting1A_split1["val"]["PACLITAXEL"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
27,TCGA-N5-A4RO,PACLITAXEL,1,TCGA-UCS,TCGA
15,TCGA-A2-A3XW,PACLITAXEL,1,TCGA-BRCA,TCGA
23,TCGA-99-8033,PACLITAXEL,0,TCGA-LUAD,TCGA


In [46]:
setting1A_split0["train"].keys()

dict_keys(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE', 'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE'])

In [47]:
setting1A_split0["test"].keys()

dict_keys(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE', 'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE'])

In [48]:
setting1A_split0["test"]["PACLITAXEL"].shape

(10, 5)

#### Setting 1B: train and test on same cancer type
* Select drugs with atleast 20 labelled samples in combined dataframe `selected_drugs_setting1`
* Check if these drugs are present in the test splits - consider only if test data exists.
* Ensure test split has both class labels for calculation of AUROC, AUPRC and correlations.
* Unlike 1A, saving is per drug, per cancer type, per dataset

In [49]:
setting1B_split0 = {"train": {}, "val": {}, "test": {}}
setting1B_split1 = {"train": {}, "val": {}, "test": {}}
setting1B_split2 = {"train": {}, "val": {}, "test": {}}

In [50]:
split0_samples["test"].keys()

dict_keys([('BUPARLISIB', 'TCGA-BRCA', 'CBIO_brca_mskcc_2019'), ('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('SORAFENIB', 'TCGA-LIHC', 'CBIO_hcc_mskimpact_2018'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [51]:
selected_drugs_setting1

Unnamed: 0_level_0,sample_id,recist,mappedProject,dataset_name
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BEVACIZUMAB,23,23,23,23
BUPARLISIB,27,27,27,27
CAPECITABINE,24,24,24,24
CARBOPLATIN,20,20,20,20
CISPLATIN,104,104,104,104
DOCETAXEL,26,26,26,26
DOXORUBICIN,20,20,20,20
FLUOROURACIL,50,50,50,50
GEMCITABINE,61,61,61,61
PACLITAXEL,40,40,40,40


In [52]:
split1_samples["test"][('SORAFENIB',
  'TCGA-LIHC',
  'CBIO_hcc_mskimpact_2018')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,P-0005038-T02-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
3,P-0019238-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
6,P-0003212-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
11,P-0000228-T03-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
13,P-0009644-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
15,P-0002719-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
17,P-0002317-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
21,P-0001307-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
26,P-0001806-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
33,P-0006403-T01-IM5,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [53]:
# split 0
for k, v in split0_samples["test"].items(): # Buparlisib, Cisplatin, Fluorouracil, Temozolomide, Gemcitabine, Paclitaxel, Sorafenib, split number does not matter since they all match in keys
    if drug in selected_drugs_setting1.index:
        if len(v["recist"].unique()) > 1: # atleast 2 labels exist for later calculation of AUROC and AUPRC
            setting1B_split0["train"][k], setting1B_split0["val"][k] = train_test_split(split0_samples["train"][k], test_size=0.1, random_state=42)
            setting1B_split0["test"][k] = v

# split 1
for k, v in split1_samples["test"].items(): # Buparlisib, Cisplatin, Fluorouracil, Temozolomide, Gemcitabine, Paclitaxel, Sorafenib, split number does not matter since they all match in keys
    if drug in selected_drugs_setting1.index:
        if len(v["recist"].unique()) > 1: # atleast 2 labels exist for later calculation of AUROC and AUPRC
            setting1B_split1["train"][k], setting1B_split1["val"][k] = train_test_split(split1_samples["train"][k], test_size=0.1, random_state=42)
            setting1B_split1["test"][k] = v

# split 2
for k, v in split2_samples["test"].items(): # Buparlisib, Cisplatin, Fluorouracil, Temozolomide, Gemcitabine, Paclitaxel, Sorafenib, split number does not matter since they all match in keys
    if drug in selected_drugs_setting1.index:
        if len(v["recist"].unique()) > 1: # atleast 2 labels exist for later calculation of AUROC and AUPRC
            setting1B_split2["train"][k], setting1B_split2["val"][k] = train_test_split(split2_samples["train"][k], test_size=0.1, random_state=42)
            setting1B_split2["test"][k] = v
        

In [54]:
setting1B_split0["test"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('SORAFENIB', 'TCGA-LIHC', 'CBIO_hcc_mskimpact_2018'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [55]:
setting1B_split1["test"].keys()

dict_keys([('BUPARLISIB', 'TCGA-BRCA', 'CBIO_brca_mskcc_2019'), ('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [56]:
setting1B_split2["test"].keys()

dict_keys([('BUPARLISIB', 'TCGA-BRCA', 'CBIO_brca_mskcc_2019'), ('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('SORAFENIB', 'TCGA-LIHC', 'CBIO_hcc_mskimpact_2018'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [57]:
keys2retain_setting1B = set(setting1B_split0["test"].keys()) & set(setting1B_split1["test"].keys()) & set(setting1B_split2["test"].keys()) # only retain these
keys2retain_setting1B

{('CISPLATIN', 'TCGA-CESC', 'TCGA'),
 ('CISPLATIN', 'TCGA-HNSC', 'TCGA'),
 ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'),
 ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'),
 ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'),
 ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')}

In [58]:
split0_setting1B_keys2remove = set(setting1B_split0["train"].keys()) - keys2retain_setting1B
split1_setting1B_keys2remove = set(setting1B_split1["train"].keys()) - keys2retain_setting1B
split2_setting1B_keys2remove = set(setting1B_split2["train"].keys()) - keys2retain_setting1B

# split 0
for k in split0_setting1B_keys2remove:
    setting1B_split0["train"].pop(k)
    setting1B_split0["val"].pop(k)
    setting1B_split0["test"].pop(k)

# split 1
for k in split1_setting1B_keys2remove:
    setting1B_split1["train"].pop(k)
    setting1B_split1["val"].pop(k)
    setting1B_split1["test"].pop(k)
    
# split 2
for k in split2_setting1B_keys2remove:
    setting1B_split2["train"].pop(k)
    setting1B_split2["val"].pop(k)
    setting1B_split2["test"].pop(k)
        

In [59]:
setting1B_split2["test"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [60]:
setting1B_split2["val"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [61]:
setting1B_split2["train"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [62]:
setting1B_split0["train"][('CISPLATIN', 'TCGA-CESC', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
14,TCGA-EX-A3L1,CISPLATIN,1,TCGA-CESC,TCGA
16,TCGA-DS-A7WF,CISPLATIN,1,TCGA-CESC,TCGA
32,TCGA-DS-A7WI,CISPLATIN,1,TCGA-CESC,TCGA
35,TCGA-IR-A3LH,CISPLATIN,1,TCGA-CESC,TCGA
19,TCGA-VS-A94Y,CISPLATIN,0,TCGA-CESC,TCGA
0,TCGA-VS-A954,CISPLATIN,1,TCGA-CESC,TCGA
7,TCGA-VS-A9UD,CISPLATIN,1,TCGA-CESC,TCGA
25,TCGA-VS-A9U5,CISPLATIN,1,TCGA-CESC,TCGA
8,TCGA-VS-A94Z,CISPLATIN,1,TCGA-CESC,TCGA
21,TCGA-VS-A8QM,CISPLATIN,0,TCGA-CESC,TCGA


In [63]:
setting1B_split0["val"][('CISPLATIN', 'TCGA-CESC', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
48,TCGA-VS-A9UH,CISPLATIN,1,TCGA-CESC,TCGA
23,TCGA-Q1-A5R2,CISPLATIN,1,TCGA-CESC,TCGA
42,TCGA-VS-A9V3,CISPLATIN,1,TCGA-CESC,TCGA
26,TCGA-UC-A7PG,CISPLATIN,0,TCGA-CESC,TCGA


### Experiment 2: Use all drugs for training

#### Setting 2A: pan cancer to single cancer, all drugs for training
* Combine all patients across drugs, as long as drug SMILES is available.

In [64]:
len(split0_samples["train"].keys())

164

In [65]:
len(split0_samples["test"].keys())

8

In [66]:
setting2A_split0 = {"train": {}, "val": {}, "test": {}}
setting2A_split1 = {"train": {}, "val": {}, "test": {}}
setting2A_split2 = {"train": {}, "val": {}, "test": {}}

In [67]:
# split 0 train
df_list = []
for k, v in split0_samples["train"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split0["train"], setting2A_split0["val"] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
# split 0 test
df_list = []
for k, v in split0_samples["test"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split0["test"] = pd.concat(df_list, ignore_index=True)

# split 1 train
df_list = []
for k, v in split1_samples["train"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split1["train"], setting2A_split1["val"] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
# split 1 test
df_list = []
for k, v in split1_samples["test"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split1["test"] = pd.concat(df_list, ignore_index=True)

# split 2 train
df_list = []
for k, v in split2_samples["train"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split2["train"], setting2A_split2["val"] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
# split 2 test
df_list = []
for k, v in split2_samples["test"].items():
    if k[0] in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split2["test"] = pd.concat(df_list, ignore_index=True)

In [68]:
setting2A_split2["train"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
437,P-0021780-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018
76,TCGA-A5-A1OH,CARBOPLATIN,1,TCGA-UCEC,TCGA
227,TCGA-DX-A7EQ,DOXORUBICIN,0,TCGA-SARC,TCGA
334,TCGA-FB-A5VM,GEMCITABINE,0,TCGA-PAAD,TCGA
30,s_DS_bkm_035_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
...,...,...,...,...,...
71,TCGA-GN-A8LK,CARBOPLATIN,0,TCGA-SKCM,TCGA
106,TCGA-EX-A3L1,CISPLATIN,1,TCGA-CESC,TCGA
270,TCGA-3A-A9IC,FLUOROURACIL,0,TCGA-PAAD,TCGA
435,P-0020359-T01-IM6,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018


In [69]:
setting2A_split2["val"].shape

(59, 5)

In [70]:
setting2A_split2["test"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,s_DS_bkm_005_T,BUPARLISIB,1,TCGA-BRCA,CBIO_brca_mskcc_2019
1,s_DS_bkm_007_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
2,s_DS_bkm_018_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
3,s_DS_bkm_029_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
4,s_DS_bkm_030_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
...,...,...,...,...,...
111,TCGA-IK-8125,TEMOZOLOMIDE,1,TCGA-LGG,TCGA
112,TCGA-TM-A84B,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
113,TCGA-DH-A66B,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
114,TCGA-DH-A7UU,TEMOZOLOMIDE,0,TCGA-LGG,TCGA


In [71]:
combined_patient_response_df["mappedProject"].unique()

array(['TCGA-STAD', 'TCGA-ESCA', 'TCGA-HNSC', 'TCGA-COAD', 'TCGA-CESC',
       'TCGA-LGG', 'TCGA-BLCA', 'TCGA-PAAD', 'TCGA-BRCA', 'TCGA-UCEC',
       'TCGA-UCS', 'TCGA-LUAD', 'TCGA-MESO', 'TCGA-TGCT', 'TCGA-SARC',
       'TCGA-SKCM', 'TCGA-PRAD', 'TCGA-READ', 'TCGA-LUSC', 'TCGA-LIHC',
       'TCGA-KIRC', 'TCGA-THCA', 'TCGA-KICH', 'TCGA-KIRP', 'TCGA-GBM',
       'TCGA-PCPG', 'TCGA-OV', 'TCGA-ACC', 'GI', 'GU'], dtype=object)

#### Setting 2B: Cancer type specific models across drugs
* Combine samples within a specific cancer type.
* Ensure drug SMILES exists

In [72]:
# only considering cancer types where test data is available for Exp 2B. This means all cancer types in exp 1B.

In [73]:
setting1B_split0["test"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [74]:
cancer_types_considered = ["TCGA-BRCA", "TCGA-CESC", "TCGA-HNSC", "TCGA-STAD", "TCGA-PAAD", "TCGA-LGG"]

In [75]:
setting2B_split0 = {"train": {}, "val": {}, "test": {}}
setting2B_split1 = {"train": {}, "val": {}, "test": {}}
setting2B_split2 = {"train": {}, "val": {}, "test": {}}

In [76]:
for ctype in cancer_types_considered:
    # split 0 train
    df_list = []
    for k, v in split0_samples["train"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split0["train"][ctype], setting2B_split0["val"][ctype] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
    # split 0 test
    df_list = []
    for k, v in split0_samples["test"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split0["test"][ctype] = pd.concat(df_list, ignore_index=True)

    # split 1 train
    df_list = []
    for k, v in split1_samples["train"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split1["train"][ctype], setting2B_split1["val"][ctype] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
    # split 1 test
    df_list = []
    for k, v in split1_samples["test"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split1["test"][ctype] = pd.concat(df_list, ignore_index=True)

    # split 2 train
    df_list = []
    for k, v in split2_samples["train"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split2["train"][ctype], setting2B_split2["val"][ctype] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
    # split 1 test
    df_list = []
    for k, v in split2_samples["test"].items():
        if (k[0] in drugs_with_smiles.index) & (k[1] == ctype):
            df_list.append(v)
    setting2B_split2["test"][ctype] = pd.concat(df_list, ignore_index=True)

In [77]:
setting2B_split0["test"]["TCGA-BRCA"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,s_DS_bkm_001_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
1,s_DS_bkm_006_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
2,s_DS_bkm_013_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
3,s_DS_bkm_020_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
4,s_DS_bkm_021_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
5,s_DS_bkm_025_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
6,s_DS_bkm_028_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
7,s_DS_bkm_035_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
8,s_DS_bkm_043_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
9,TCGA-AQ-A54O,PACLITAXEL,1,TCGA-BRCA,TCGA


In [78]:
setting2B_split0["val"]["TCGA-BRCA"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
44,85,LETROZOLE,0,TCGA-BRCA,Moores
53,TCGA-GM-A2DF,PACLITAXEL,1,TCGA-BRCA,TCGA
30,TCGA-A2-A0CK,DOCETAXEL,1,TCGA-BRCA,TCGA
12,s_DS_bkm_031_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
49,TCGA-LL-A73Z,PACLITAXEL,0,TCGA-BRCA,TCGA
0,s_DS_bkm_002_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019
64,TCGA-GM-A2DB,PACLITAXEL,1,TCGA-BRCA,TCGA
18,TCGA-GM-A2DA,CAPECITABINE,0,TCGA-BRCA,TCGA
10,s_DS_bkm_029_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019


### Cell Lines

In [79]:
cell_line_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/ccle_auc.csv")
cell_line_response_df

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category
0,PR-00UtU3,5-FLUOROURACIL,0.952134,4.327152,1
1,PR-00UtU3,ABT737,0.595380,-0.445402,3
2,PR-00UtU3,ACETALAX,0.982784,5.474095,3
3,PR-00UtU3,AFATINIB,0.824703,1.064304,1
4,PR-00UtU3,AFURESERTIB,0.901798,2.332643,2
...,...,...,...,...,...
224490,PR-zwU925,WEE1 INHIBITOR,0.875812,2.080707,2
224491,PR-zwU925,WEHI-539,0.797865,1.240418,3
224492,PR-zwU925,WIKI4,0.968185,3.888230,3
224493,PR-zwU925,YK-4-279,0.762317,1.607404,3


In [80]:
# Create binary response labels based on median response of each drug
audrc_median_per_drug = {}
for drug in cell_line_response_df.drug_name.unique():
    subset_df = cell_line_response_df[cell_line_response_df.drug_name == drug]
    audrc_median_per_drug[drug] = subset_df.auc.median() # skips na by default

In [81]:
len(audrc_median_per_drug)

211

In [82]:
def get_response_label(x):
    drug_name = x["drug_name"]
    if x["auc"] < audrc_median_per_drug[drug_name]:
        return 1 # lower the audrc better the response
    else:
        return 0

In [83]:
cell_line_response_df["response_label"] = cell_line_response_df.apply(lambda x: get_response_label(x), axis = 1)

In [84]:
cell_line_response_df.replace("5-FLUOROURACIL", "FLUOROURACIL", inplace=True)
cell_line_response_df

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
0,PR-00UtU3,FLUOROURACIL,0.952134,4.327152,1,0
1,PR-00UtU3,ABT737,0.595380,-0.445402,3,1
2,PR-00UtU3,ACETALAX,0.982784,5.474095,3,0
3,PR-00UtU3,AFATINIB,0.824703,1.064304,1,1
4,PR-00UtU3,AFURESERTIB,0.901798,2.332643,2,0
...,...,...,...,...,...,...
224490,PR-zwU925,WEE1 INHIBITOR,0.875812,2.080707,2,0
224491,PR-zwU925,WEHI-539,0.797865,1.240418,3,1
224492,PR-zwU925,WIKI4,0.968185,3.888230,3,0
224493,PR-zwU925,YK-4-279,0.762317,1.607404,3,1


### Divide into train-test split based on cancer type and drug

* For each drug, divide into 1 train-test split.

In [85]:
split0_samples_cl = {"train": {}, "val": {}, "test": {}}

In [86]:
for drug in cell_line_response_df.drug_name.unique():
    subset_df = cell_line_response_df[cell_line_response_df.drug_name == drug].reset_index(drop=True)
    X_train, X_test = train_test_split(subset_df, test_size=0.1, random_state=42, stratify=subset_df["response_label"])
    split0_samples_cl["train"][drug], split0_samples_cl["val"][drug] = train_test_split(X_train.reset_index(drop=True), test_size=0.1, random_state=42)
    split0_samples_cl["test"][drug] = X_test.reset_index(drop=True)
    

In [87]:
split0_samples_cl["test"]["PACLITAXEL"].response_label.value_counts()

response_label
0    60
1    59
Name: count, dtype: int64

In [88]:
split0_samples_cl["val"]["PACLITAXEL"].response_label.value_counts()

response_label
0    54
1    53
Name: count, dtype: int64

In [89]:
split0_samples_cl["train"]["PACLITAXEL"].response_label.value_counts()

response_label
1    480
0    480
Name: count, dtype: int64

#### Experiment 1A: Pan cancer to single cancer - drug specific models

In [90]:
setting1A_split0_cl = {"train": {}, "val": {}, "test": {}} # only 1 train-test split

In [91]:
# Consider all cell lines with a specific drug given
for k in setting1A_split0["train"].keys():
    setting1A_split0_cl["train"][k], setting1A_split0_cl["val"][k] = train_test_split(split0_samples_cl["train"][k], test_size=0.1, random_state=42)
    setting1A_split0_cl["test"][k] = split0_samples_cl["test"][k]

In [92]:
setting1A_split0_cl["train"]["PACLITAXEL"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
224,PR-vSKNEp,PACLITAXEL,0.941572,-2.705048,1,0
1058,PR-TwNLyX,PACLITAXEL,0.843060,-4.920125,1,1
569,PR-c9OVYZ,PACLITAXEL,0.914369,-4.543239,1,0
542,PR-xcjpPu,PACLITAXEL,0.982848,-0.871209,1,0
606,PR-A0j8Yf,PACLITAXEL,0.903745,-4.072013,1,1
...,...,...,...,...,...,...
941,PR-GYfpLB,PACLITAXEL,0.992993,-2.198257,1,0
155,PR-O9KXmv,PACLITAXEL,0.971973,-1.509648,1,0
838,PR-OFRo5c,PACLITAXEL,0.982462,-0.854200,1,0
500,PR-8vXkr9,PACLITAXEL,0.950205,-4.173867,1,0


In [93]:
setting1A_split0_cl["val"]["PACLITAXEL"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
929,PR-ntXWoU,PACLITAXEL,0.989462,-1.531706,1,0
1045,PR-cEu8oY,PACLITAXEL,0.983805,-0.898478,1,0
164,PR-zGaxVZ,PACLITAXEL,0.950405,-2.058524,1,0
161,PR-MUPtwP,PACLITAXEL,0.868094,-4.085509,1,1
13,PR-UOveiJ,PACLITAXEL,0.954047,-2.371742,1,0
...,...,...,...,...,...,...
397,PR-VOyovx,PACLITAXEL,0.971391,-2.518605,1,0
816,PR-hVwjEz,PACLITAXEL,0.815892,-2.820735,1,1
797,PR-uIFwjz,PACLITAXEL,0.915753,-3.494100,1,0
745,PR-a2Kg84,PACLITAXEL,0.842480,-2.210910,1,1


In [94]:
setting1A_split0_cl["test"]["PACLITAXEL"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
0,PR-rCh6Mh,PACLITAXEL,0.878776,-4.400960,1,1
1,PR-RG9TgA,PACLITAXEL,0.954886,-0.790300,1,0
2,PR-5uQTam,PACLITAXEL,0.917854,-4.065784,1,0
3,PR-8PpFpX,PACLITAXEL,0.987112,-1.667986,1,0
4,PR-HN3C6C,PACLITAXEL,0.904540,-3.839013,1,1
...,...,...,...,...,...,...
114,PR-qY1GC4,PACLITAXEL,0.873364,-4.937147,1,1
115,PR-Tek4yt,PACLITAXEL,0.976530,0.440676,1,0
116,PR-HzNj7n,PACLITAXEL,0.980573,-1.473669,1,0
117,PR-dRFAtd,PACLITAXEL,0.975560,-1.453724,1,0


#### Experiment 1B: Single cancer to single cancer - drug specific models
* Here we use all cell lines which are given the drug under consideration, we do not consider cancer type info.
* This works out to be the same train-test split for cell lines as in experiment 1A.

In [95]:
setting1B_split0_cl = {"train": {}, "val": {}, "test": {}} # only 1 train-test split

In [96]:
setting1B_split0["train"].keys()

dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')])

In [97]:
# Consider all cell lines with a specific drug given
for k in setting1B_split0["train"].keys():
    setting1B_split0_cl["train"][k], setting1B_split0_cl["val"][k] = train_test_split(split0_samples_cl["train"][k[0]], test_size=0.1, random_state=42) # k[0] has the drug name
    setting1B_split0_cl["test"][k] = split0_samples_cl["test"][k[0]]

In [98]:
setting1B_split0_cl["train"][('CISPLATIN', 'TCGA-HNSC', 'TCGA')]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
690,PR-8EE2ka,CISPLATIN,0.776032,0.285054,1,1
747,PR-9ByeKo,CISPLATIN,0.779539,0.211263,1,1
21,PR-1ofDIZ,CISPLATIN,0.921701,1.621675,1,1
38,PR-kSuyzj,CISPLATIN,0.935353,2.934207,1,1
522,PR-EvLXSQ,CISPLATIN,0.981732,4.660050,1,0
...,...,...,...,...,...,...
539,PR-UOveiJ,CISPLATIN,0.920806,2.967637,1,1
755,PR-sQS7tJ,CISPLATIN,0.967626,5.570299,1,0
462,PR-O1Dni4,CISPLATIN,0.967654,4.333717,1,0
17,PR-kuCd2G,CISPLATIN,0.891760,2.305190,1,1


#### Experiment 2A: All drugs to single drug, pan cancer model
* Use all available labelled cell lines for training.

In [99]:
setting2A_split0_cl = {"train": {}, "val": {}, "test": {}}

In [100]:
len(split0_samples_cl["train"].keys())

211

In [101]:
len(split0_samples_cl["test"].keys())

211

In [102]:
df_list = []
for k, v in split0_samples_cl["train"].items():
    if k in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split0_cl["train"], setting2A_split0_cl["val"] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)

df_list = []
for k, v in split0_samples_cl["test"].items():
    if k in drugs_with_smiles.index:
        df_list.append(v)
setting2A_split0_cl["test"] = pd.concat(df_list, ignore_index=True)

In [103]:
setting2A_split0_cl["train"].shape, setting2A_split0_cl["val"].shape, setting2A_split0_cl["test"].shape

((156959, 6), (17440, 6), (21655, 6))

#### Setting 2B: Many drugs to single drug, one cancer type to the same.
* Here we ignore cancer types of cell lines. We use all labelled cell lines as in setting 2A.
* This works out to be the same as the train-test split in setting 2A.

In [104]:
setting2B_split0_cl = {"train": {}, "val": {}, "test": {}}

In [105]:
cancer_types_considered

['TCGA-BRCA', 'TCGA-CESC', 'TCGA-HNSC', 'TCGA-STAD', 'TCGA-PAAD', 'TCGA-LGG']

In [106]:
for ctype in cancer_types_considered:
    # split 0 train
    df_list = []
    for k, v in split0_samples_cl["train"].items():
        if (k in drugs_with_smiles.index):
            df_list.append(v)
    setting2B_split0_cl["train"][ctype], setting2B_split0_cl["val"][ctype] = train_test_split(pd.concat(df_list, ignore_index=True), test_size=0.1, random_state=42)
    
    # split 0 test
    df_list = []
    for k, v in split0_samples_cl["test"].items():
        if (k in drugs_with_smiles.index):
            df_list.append(v)
    setting2B_split0_cl["test"][ctype] = pd.concat(df_list, ignore_index=True)

In [107]:
setting2B_split0_cl["train"]["TCGA-CESC"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
50059,PR-132fPs,DOCETAXEL,0.191876,-4.662091,1,1
52792,PR-L3QLdq,ELEPHANTIN,0.940458,5.730421,3,0
95348,PR-NxSV8u,MITOXANTRONE,0.921925,4.070582,1,0
44655,PR-oLPbwB,DACTINOMYCIN,0.179515,-6.588337,1,1
36895,PR-4ngqZx,CCT007093,0.989986,3.724712,3,0
...,...,...,...,...,...,...
119879,PR-M4505H,PFI-1,0.919051,3.534174,3,1
103694,PR-Bz57NU,NILOTINIB,0.995489,4.073733,1,0
131932,PR-6SyWYo,SAPITINIB,0.492491,-1.567439,2,1
146867,PR-wGySam,TASELISIB,0.901939,2.716776,2,0


In [108]:
setting1A_split0_cl["val"]["CISPLATIN"].pivot_table(values="auc", index="sample_id", columns="drug_name").reset_index().melt(
                id_vars=["sample_id"],
                var_name="drug_name",
                value_name="response",
            )

Unnamed: 0,sample_id,drug_name,response
0,PR-0qGhSp,CISPLATIN,0.956183
1,PR-14sBTC,CISPLATIN,0.935923
2,PR-1Uo7R0,CISPLATIN,0.941333
3,PR-2TYUx5,CISPLATIN,0.934851
4,PR-2uDYZY,CISPLATIN,0.964225
...,...,...,...
73,PR-voYF6y,CISPLATIN,0.898736
74,PR-w6rzl9,CISPLATIN,0.913975
75,PR-yEZYvr,CISPLATIN,0.837869
76,PR-yY64Z3,CISPLATIN,0.890298


In [109]:
# cell_line_metadata = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/metadata/labelled_ccle_metadata.csv", index_col=0)
# cell_line_metadata

### Save files

In [110]:
import pickle

In [111]:
processed_file_dir = "/data/ajayago/papers_data/systematic_assessment/processed/"
expt1A_file_dir = processed_file_dir + "Experiment1/SettingA/"
expt1B_file_dir = processed_file_dir + "Experiment1/SettingB/"
expt2A_file_dir = processed_file_dir + "Experiment2/SettingA/"
expt2B_file_dir = processed_file_dir + "Experiment2/SettingB/"

In [112]:
# patient files
# Experiment 1A
with open(f"{expt1A_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting1A_split0, f)
with open(f"{expt1A_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting1A_split1, f)
with open(f"{expt1A_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting1A_split2, f)

# Experiment 1B
with open(f"{expt1B_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting1B_split0, f)
with open(f"{expt1B_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting1B_split1, f)
with open(f"{expt1B_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting1B_split2, f)

# Experiment 2A
with open(f"{expt2A_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting2A_split0, f)
with open(f"{expt2A_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting2A_split1, f)
with open(f"{expt2A_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting2A_split2, f)

# Experiment 2B
with open(f"{expt2B_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting2B_split0, f)
with open(f"{expt2B_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting2B_split1, f)
with open(f"{expt2B_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting2B_split2, f)

In [113]:
# cell line files
# Experiment 1A
with open(f"{expt1A_file_dir}/cell_lines_fold0.pkl", "wb") as f:
    pickle.dump(setting1A_split0_cl, f)

# Experiment 1B
with open(f"{expt1B_file_dir}/cell_lines_fold0.pkl", "wb") as f:
    pickle.dump(setting1B_split0_cl, f)

# Experiment 2A
with open(f"{expt2A_file_dir}/cell_lines_fold0.pkl", "wb") as f:
    pickle.dump(setting2A_split0_cl, f)

# Experiment 2B
with open(f"{expt2B_file_dir}/cell_lines_fold0.pkl", "wb") as f:
    pickle.dump(setting2B_split0_cl, f)

In [114]:
setting1A_split0_cl["val"]["CISPLATIN"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
143,PR-WdRLig,CISPLATIN,0.962399,6.581119,1,0
589,PR-eUfoVk,CISPLATIN,0.925949,3.012760,1,1
423,PR-ZgrTvM,CISPLATIN,0.810434,0.820110,1,1
407,PR-eFC9MA,CISPLATIN,0.973798,4.644924,1,0
648,PR-hN4fOZ,CISPLATIN,0.717122,-0.134330,1,1
...,...,...,...,...,...,...
504,PR-LTIJeY,CISPLATIN,0.934312,4.192231,1,1
514,PR-rCh6Mh,CISPLATIN,0.845564,1.426818,1,1
7,PR-a5T2sI,CISPLATIN,0.953856,3.768136,1,0
675,PR-1Uo7R0,CISPLATIN,0.941333,3.107998,1,0


In [115]:
setting1B_split0_cl["val"][("CISPLATIN", "TCGA-CESC", "TCGA")]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
143,PR-WdRLig,CISPLATIN,0.962399,6.581119,1,0
589,PR-eUfoVk,CISPLATIN,0.925949,3.012760,1,1
423,PR-ZgrTvM,CISPLATIN,0.810434,0.820110,1,1
407,PR-eFC9MA,CISPLATIN,0.973798,4.644924,1,0
648,PR-hN4fOZ,CISPLATIN,0.717122,-0.134330,1,1
...,...,...,...,...,...,...
504,PR-LTIJeY,CISPLATIN,0.934312,4.192231,1,1
514,PR-rCh6Mh,CISPLATIN,0.845564,1.426818,1,1
7,PR-a5T2sI,CISPLATIN,0.953856,3.768136,1,0
675,PR-1Uo7R0,CISPLATIN,0.941333,3.107998,1,0


In [116]:
# Each patient/cell line file has 3 keys - train, val, test
# Within each, we can access dataframes using keys `drug_name` in Exp 1A, `drug, cancer type, dataset` in 1B, nothing in 2A, `cancer type` in 2B

### Ensure validation splits have atleast 1 sample from both class labels

* Pearson and Spearman correlation coefficients result in NaNs when there is only an array of the same label in the input.
* Here, if val data split has only samples of a single class label, we obtain one sample of the opposite label from the train data and add it to the set of samples in validation data.
* This is done only for the patient datasets, as cell lines are more in number and mostly rely on AUC rather than binary labels. Further the binary label for cell lines is based on median AUC resulting in a 50-50 split of labels in train and test.

In [117]:
# Patients
# Experiment 1A
with open(f"{expt1A_file_dir}/patients_fold0.pkl", "rb") as f:
    setting1A_split0 = pickle.load(f)
with open(f"{expt1A_file_dir}/patients_fold1.pkl", "rb") as f:
    setting1A_split1 = pickle.load(f)
with open(f"{expt1A_file_dir}/patients_fold2.pkl", "rb") as f:
    setting1A_split2 = pickle.load(f)

# Experiment 1B
with open(f"{expt1B_file_dir}/patients_fold0.pkl", "rb") as f:
    setting1B_split0 = pickle.load(f)
with open(f"{expt1B_file_dir}/patients_fold1.pkl", "rb") as f:
    setting1B_split1 = pickle.load(f)
with open(f"{expt1B_file_dir}/patients_fold2.pkl", "rb") as f:
    setting1B_split2 = pickle.load(f)

# Experiment 2A
with open(f"{expt2A_file_dir}/patients_fold0.pkl", "rb") as f:
    setting2A_split0 = pickle.load(f)
with open(f"{expt2A_file_dir}/patients_fold1.pkl", "rb") as f:
    setting2A_split1 = pickle.load(f)
with open(f"{expt2A_file_dir}/patients_fold2.pkl", "rb") as f:
    setting2A_split2 = pickle.load(f)

# Experiment 2B
with open(f"{expt2B_file_dir}/patients_fold0.pkl", "rb") as f:
    setting2B_split0 = pickle.load(f)
with open(f"{expt2B_file_dir}/patients_fold1.pkl", "rb") as f:
    setting2B_split1 = pickle.load(f)
with open(f"{expt2B_file_dir}/patients_fold2.pkl", "rb") as f:
    setting2B_split2 = pickle.load(f)

In [118]:
# Experiment 1A
i = 0
for setting in [setting1A_split0, setting1A_split1, setting1A_split2]:
    print(f"Experiment 1A split {i}")
    i += 1
    for k in setting["val"].keys():
        print(k)
        num_unique_labels = len(setting["val"][k].recist.value_counts())
        print(f"Before moving: {num_unique_labels} in val data and {len(setting["train"][k].recist.value_counts())} in train")
        if num_unique_labels == 1: # only 1 label available
            label_available = setting["val"][k].recist.value_counts().index[0]
            # take samples of other label from train split
            train_data_df = setting["train"][k][setting["train"][k].recist != label_available]
            # sample a data point from here (last occurring data sample) if atleast two samples are available in train
            if len(train_data_df) >= 2:
                row2move_from_train2val = train_data_df[-1:]
                setting["val"][k] = pd.concat([setting["val"][k], train_data_df[-1:]]) # add to the validation data
                # remove from train data
                setting["train"][k].drop(row2move_from_train2val.index, axis = 0, inplace=True)
    
            # check that both labels are present in train and val splits after moving
            print(f"After moving: {len(setting["val"][k].recist.value_counts())} in val and {len(setting["train"][k].recist.value_counts())} in train")

Experiment 1A split 0
BUPARLISIB
Before moving: 2 in val data and 2 in train
CISPLATIN
Before moving: 2 in val data and 2 in train
FLUOROURACIL
Before moving: 2 in val data and 2 in train
GEMCITABINE
Before moving: 2 in val data and 2 in train
PACLITAXEL
Before moving: 2 in val data and 2 in train
SORAFENIB
Before moving: 2 in val data and 2 in train
TEMOZOLOMIDE
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
Experiment 1A split 1
BUPARLISIB
Before moving: 2 in val data and 1 in train
CISPLATIN
Before moving: 2 in val data and 2 in train
FLUOROURACIL
Before moving: 2 in val data and 2 in train
GEMCITABINE
Before moving: 2 in val data and 2 in train
PACLITAXEL
Before moving: 2 in val data and 2 in train
SORAFENIB
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
TEMOZOLOMIDE
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
Experiment 1A split 2
BUPARLISIB
Before moving: 1 in val data and 

In [119]:
setting1A_split0["val"]["PACLITAXEL"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
27,TCGA-N5-A4RO,PACLITAXEL,1,TCGA-UCS,TCGA
15,TCGA-A7-A5ZX,PACLITAXEL,1,TCGA-BRCA,TCGA
23,TCGA-99-8033,PACLITAXEL,0,TCGA-LUAD,TCGA


In [120]:
# Experiment 1B
i = 0
for setting in [setting1B_split0, setting1B_split1, setting1B_split2]:
    print(f"Experiment 1B split {i}")
    i += 1
    for k in setting["val"].keys():
        print(k)
        num_unique_labels = len(setting["val"][k].recist.value_counts())
        print(f"Before moving: {num_unique_labels} in val data and {len(setting["train"][k].recist.value_counts())} in train")
        if num_unique_labels == 1: # only 1 label available
            label_available = setting["val"][k].recist.value_counts().index[0]
            # take samples of other label from train split
            train_data_df = setting["train"][k][setting["train"][k].recist != label_available]
            # sample a data point from here (last occurring data sample) if atleast two samples are available in train
            if len(train_data_df) >= 2:
                row2move_from_train2val = train_data_df[-1:]
                setting["val"][k] = pd.concat([setting["val"][k], train_data_df[-1:]]) # add to the validation data
                # remove from train data
                setting["train"][k].drop(row2move_from_train2val.index, axis = 0, inplace=True)
    
            # check that both labels are present in train and val splits after moving
            print(f"After moving: {len(setting["val"][k].recist.value_counts())} in val and {len(setting["train"][k].recist.value_counts())} in train")

Experiment 1B split 0
('CISPLATIN', 'TCGA-CESC', 'TCGA')
Before moving: 2 in val data and 2 in train
('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Before moving: 2 in val data and 2 in train
('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')
Before moving: 2 in val data and 2 in train
Experiment 1B split 1
('CISPLATIN', 'TCGA-CESC', 'TCGA')
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Bef

In [121]:
setting1B_split1["val"][('CISPLATIN', 'TCGA-CESC', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
48,TCGA-VS-A9UH,CISPLATIN,1,TCGA-CESC,TCGA
25,TCGA-VS-A9U5,CISPLATIN,1,TCGA-CESC,TCGA
38,TCGA-VS-A8QF,CISPLATIN,1,TCGA-CESC,TCGA
27,TCGA-DS-A5RQ,CISPLATIN,1,TCGA-CESC,TCGA
24,TCGA-VS-AA62,CISPLATIN,0,TCGA-CESC,TCGA


In [122]:
# Experiment 2A - nothing to be done
i = 0
for setting in [setting2A_split0, setting2A_split1, setting2A_split2]:
    print(f"Experiment 2A split {i}")
    i += 1
    num_unique_labels = len(setting["val"].recist.value_counts())
    print(f"Before moving: {num_unique_labels} in val data and {len(setting["train"].recist.value_counts())} in train")
    # if num_unique_labels == 1: # only 1 label available
    #     label_available = setting["val"][k].recist.value_counts().index[0]
    #     # take samples of other label from train split
    #     train_data_df = setting["train"][k][setting["train"][k].recist != label_available]
    #     # sample a data point from here (last occurring data sample) if atleast two samples are available in train
    #     if len(train_data_df) >= 2:
    #         row2move_from_train2val = train_data_df[-1:]
    #         setting["val"][k] = pd.concat([setting["val"][k], train_data_df[-1:]]) # add to the validation data
    #         # remove from train data
    #         setting["train"][k].drop(row2move_from_train2val.index, axis = 0, inplace=True)

    #     # check that both labels are present in train and val splits after moving
    #     print(f"After moving: {len(setting["val"][k].recist.value_counts())} in val and {len(setting["train"][k].recist.value_counts())} in train")

Experiment 2A split 0
Before moving: 2 in val data and 2 in train
Experiment 2A split 1
Before moving: 2 in val data and 2 in train
Experiment 2A split 2
Before moving: 2 in val data and 2 in train


In [123]:
# Experiment 2B
i = 0
for setting in [setting2B_split0, setting2B_split1, setting2B_split2]:
    print(f"Experiment 2B split {i}")
    i += 1
    for k in setting["val"].keys():
        print(k)
        num_unique_labels = len(setting["val"][k].recist.value_counts())
        print(f"Before moving: {num_unique_labels} in val data and {len(setting["train"][k].recist.value_counts())} in train")
        if num_unique_labels == 1: # only 1 label available
            label_available = setting["val"][k].recist.value_counts().index[0]
            # take samples of other label from train split
            train_data_df = setting["train"][k][setting["train"][k].recist != label_available]
            # sample a data point from here (last occurring data sample) if atleast two samples are available in train
            if len(train_data_df) >= 2:
                row2move_from_train2val = train_data_df[-1:]
                setting["val"][k] = pd.concat([setting["val"][k], train_data_df[-1:]]) # add to the validation data
                # remove from train data
                setting["train"][k].drop(row2move_from_train2val.index, axis = 0, inplace=True)
    
            # check that both labels are present in train and val splits after moving
            print(f"After moving: {len(setting["val"][k].recist.value_counts())} in val and {len(setting["train"][k].recist.value_counts())} in train")

Experiment 2B split 0
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 2 in val data and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in train
TCGA-PAAD
Before moving: 2 in val data and 2 in train
TCGA-LGG
Before moving: 2 in val data and 2 in train
Experiment 2B split 1
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in train
TCGA-PAAD
Before moving: 2 in val data and 2 in train
TCGA-LGG
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
Experiment 2B split 2
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 2 in val data and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in tr

In [124]:
setting2B_split2["val"]["TCGA-LGG"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
22,TCGA-FG-7638,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
0,TCGA-S9-A7R7,CARMUSTINE,0,TCGA-LGG,TCGA
49,TCGA-DU-8166,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
4,TCGA-E1-A7YW,ETOPOSIDE,0,TCGA-LGG,TCGA
54,TCGA-DU-A76R,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
18,TCGA-QH-A6XC,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
10,TCGA-DU-A5TS,TEMOZOLOMIDE,0,TCGA-LGG,TCGA
65,TCGA-DB-A4XD,TEMOZOLOMIDE,1,TCGA-LGG,TCGA


In [125]:
# Save updated files
# patient files
# Experiment 1A
with open(f"{expt1A_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting1A_split0, f)
with open(f"{expt1A_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting1A_split1, f)
with open(f"{expt1A_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting1A_split2, f)

# Experiment 1B
with open(f"{expt1B_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting1B_split0, f)
with open(f"{expt1B_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting1B_split1, f)
with open(f"{expt1B_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting1B_split2, f)

# Experiment 2A
with open(f"{expt2A_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting2A_split0, f)
with open(f"{expt2A_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting2A_split1, f)
with open(f"{expt2A_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting2A_split2, f)

# Experiment 2B
with open(f"{expt2B_file_dir}/patients_fold0.pkl", "wb") as f:
    pickle.dump(setting2B_split0, f)
with open(f"{expt2B_file_dir}/patients_fold1.pkl", "wb") as f:
    pickle.dump(setting2B_split1, f)
with open(f"{expt2B_file_dir}/patients_fold2.pkl", "wb") as f:
    pickle.dump(setting2B_split2, f)

### Train-test splits for survival data

PREDICT-AI uses survival information of patients during pre-training. We use GENIE CRC and GENIE NSCLC datasets for this.

#### GENIE CRC

In [126]:
genie_crc_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/genie_crc_pfs.csv")
genie_crc_response_df["dataset_name"] = "GENIE_CRC"
genie_crc_response_df["mappedProject"] = "TCGA-COAD" # Colorectal Cancer
genie_crc_response_df

Unnamed: 0,sample_id,REGIMEN_NUMBER,drug_name,drug_start_date,drug_end_date,pfs_status,pfs_days,category,dataset_name,mappedProject
0,GENIE-DFCI-008889-6970,1.0,BEVACIZUMAB,23.0,23.0,0.0,56.0,1.0,GENIE_CRC,TCGA-COAD
1,GENIE-DFCI-008759-7792,1.0,CAPECITABINE,73.0,272.0,1.0,21.0,1.0,GENIE_CRC,TCGA-COAD
2,GENIE-DFCI-009942-7682,2.0,CARBOPLATIN,570.0,570.0,0.0,56.0,1.0,GENIE_CRC,TCGA-COAD
3,GENIE-DFCI-007011-11659,1.0,FLUOROURACIL,262.0,440.0,0.0,206.0,1.0,GENIE_CRC,TCGA-COAD
4,GENIE-DFCI-011252-11577,1.0,CETUXIMAB,457.0,527.0,0.0,142.0,1.0,GENIE_CRC,TCGA-COAD
...,...,...,...,...,...,...,...,...,...,...
66,GENIE-VICC-205976-unk-1,3.0,BEVACIZUMAB,821.0,821.0,0.0,71.0,1.0,GENIE_CRC,TCGA-COAD
67,GENIE-VICC-390430-unk-1,2.0,PANITUMUMAB,233.0,361.0,0.0,142.0,1.0,GENIE_CRC,TCGA-COAD
68,GENIE-VICC-598717-unk-1,2.0,CAPECITABINE,1267.0,1276.0,0.0,70.0,1.0,GENIE_CRC,TCGA-COAD
69,GENIE-VICC-155993-unk-1,3.0,MITOMYCIN,463.0,463.0,0.0,1532.0,1.0,GENIE_CRC,TCGA-COAD


#### GENIE NSCLC

In [127]:
genie_nsclc_response_df = pd.read_csv("/data/ajayago/papers_data/systematic_assessment/raw/response_files/genie_nsclc_pfs.csv")
genie_nsclc_response_df["dataset_name"] = "GENIE_NSCLC"
genie_nsclc_response_df["mappedProject"] = "TCGA-LUAD" # Lung Cancer
genie_nsclc_response_df

Unnamed: 0,sample_id,REGIMEN_NUMBER,drug_name,drug_start_date,drug_end_date,pfs_status,pfs_days,category,dataset_name,mappedProject
0,GENIE-DFCI-003908-234520,2.0,ALECTINIB,1488.0,2419.0,1.0,864.0,1.0,GENIE_NSCLC,TCGA-LUAD
1,GENIE-DFCI-037921-36048,1.0,CRIZOTINIB,497.0,566.0,1.0,83.0,1.0,GENIE_NSCLC,TCGA-LUAD
2,GENIE-DFCI-033743-88529,3.0,DOCETAXEL,582.0,687.0,1.0,126.0,1.0,GENIE_NSCLC,TCGA-LUAD
3,GENIE-DFCI-090170-266320,1.0,ALECTINIB,35.0,35.0,0.0,942.0,1.0,GENIE_NSCLC,TCGA-LUAD
4,GENIE-DFCI-078051-298963,5.0,CERITINIB,4593.0,4638.0,1.0,2.0,1.0,GENIE_NSCLC,TCGA-LUAD
...,...,...,...,...,...,...,...,...,...,...
66,GENIE-VICC-199259-unk-1,1.0,CRIZOTINIB,43.0,210.0,0.0,167.0,1.0,GENIE_NSCLC,TCGA-LUAD
67,GENIE-VICC-352202-unk-1,5.0,OSIMERTINIB,1569.0,1715.0,1.0,162.0,1.0,GENIE_NSCLC,TCGA-LUAD
68,GENIE-VICC-647513-unk-1,5.0,OSIMERTINIB,3895.0,4573.0,1.0,1000.0,1.0,GENIE_NSCLC,TCGA-LUAD
69,GENIE-VICC-356066-unk-1,1.0,CRIZOTINIB,46.0,362.0,1.0,337.0,1.0,GENIE_NSCLC,TCGA-LUAD


#### Experiment Settings

We are going to use the same set of patients across all 4 experiment settings (1A, 1B, 2A, 2B). 3 folds to be created, with train, val and test data splits.

In [128]:
genie_combined_df = pd.concat([genie_crc_response_df, genie_nsclc_response_df], axis = 0, ignore_index=True)
print(genie_combined_df.shape)
genie_combined_df.head()

(142, 10)


Unnamed: 0,sample_id,REGIMEN_NUMBER,drug_name,drug_start_date,drug_end_date,pfs_status,pfs_days,category,dataset_name,mappedProject
0,GENIE-DFCI-008889-6970,1.0,BEVACIZUMAB,23.0,23.0,0.0,56.0,1.0,GENIE_CRC,TCGA-COAD
1,GENIE-DFCI-008759-7792,1.0,CAPECITABINE,73.0,272.0,1.0,21.0,1.0,GENIE_CRC,TCGA-COAD
2,GENIE-DFCI-009942-7682,2.0,CARBOPLATIN,570.0,570.0,0.0,56.0,1.0,GENIE_CRC,TCGA-COAD
3,GENIE-DFCI-007011-11659,1.0,FLUOROURACIL,262.0,440.0,0.0,206.0,1.0,GENIE_CRC,TCGA-COAD
4,GENIE-DFCI-011252-11577,1.0,CETUXIMAB,457.0,527.0,0.0,142.0,1.0,GENIE_CRC,TCGA-COAD


In [129]:
genie_combined_df.drug_name.value_counts()

drug_name
CAPECITABINE    39
CRIZOTINIB      25
OSIMERTINIB     22
FLUOROURACIL    12
BEVACIZUMAB     11
MITOMYCIN        8
GEFITINIB        6
ALECTINIB        6
DOCETAXEL        5
CARBOPLATIN      2
CETUXIMAB        2
PANITUMUMAB      2
ETOPOSIDE        1
CERITINIB        1
Name: count, dtype: int64

In [130]:
setting_split0 = {"train": {}, "val": {}, "test": {}}
setting_split1 = {"train": {}, "val": {}, "test": {}}
setting_split2 = {"train": {}, "val": {}, "test": {}}

In [131]:
skf_survival = StratifiedKFold(n_splits=3, shuffle=True, random_state = 42)
three_fold_indices_survival = list(skf_survival.split(genie_combined_df["sample_id"], genie_combined_df["pfs_status"]))
three_fold_indices_survival

[(array([  3,   5,   8,   9,  10,  12,  14,  16,  18,  19,  20,  21,  22,
          23,  24,  25,  28,  29,  30,  38,  39,  40,  41,  42,  43,  45,
          47,  48,  49,  50,  51,  52,  53,  56,  57,  58,  59,  61,  63,
          65,  66,  68,  69,  70,  72,  73,  76,  77,  78,  80,  81,  82,
          83,  84,  85,  87,  88,  89,  90,  91,  92,  94,  95,  96,  97,
          99, 100, 101, 102, 103, 104, 106, 108, 110, 111, 112, 113, 115,
         116, 119, 120, 121, 127, 128, 129, 130, 131, 132, 133, 136, 137,
         138, 139, 141]),
  array([  0,   1,   2,   4,   6,   7,  11,  13,  15,  17,  26,  27,  31,
          32,  33,  34,  35,  36,  37,  44,  46,  54,  55,  60,  62,  64,
          67,  71,  74,  75,  79,  86,  93,  98, 105, 107, 109, 114, 117,
         118, 122, 123, 124, 125, 126, 134, 135, 140])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   9,  11,  13,  14,  15,
          16,  17,  19,  21,  22,  24,  25,  26,  27,  31,  32,  33,  34,
          35,  36,  37,  39, 

In [132]:
split0_train_val = genie_combined_df.loc[three_fold_indices_survival[0][0]] # 0th index for split 0, 0th index for train
setting_split0["train"], setting_split0["val"] = train_test_split(split0_train_val, test_size=0.1, random_state=42)
setting_split0["test"] = genie_combined_df.loc[three_fold_indices_survival[0][1]] # 0th index for split 0, 1st index for test
split1_train_val = genie_combined_df.loc[three_fold_indices_survival[1][0]] # 1st index for split 1, 0th index for train
setting_split1["train"], setting_split1["val"] = train_test_split(split1_train_val, test_size=0.1, random_state=42)
setting_split1["test"] = genie_combined_df.loc[three_fold_indices_survival[1][1]] # 1st index for split 1, 1st index for test
split2_train_val = genie_combined_df.loc[three_fold_indices_survival[2][0]] # 2nd index for split 2, 0th index for train
setting_split2["train"], setting_split2["val"] = train_test_split(split2_train_val, test_size=0.1, random_state=42)
setting_split2["test"] = genie_combined_df.loc[three_fold_indices_survival[2][1]] # 2nd index for split 2, 1st index for test

In [133]:
for div in ["train", "val", "test"]:
    print("Fold 0")
    print(f"{div}", end = " -- ")
    print(setting_split0[div].shape)

    print("Fold 1")
    print(f"{div}", end = " -- ")
    print(setting_split1[div].shape)

    print("Fold 2")
    print(f"{div}", end = " -- ")
    print(setting_split2[div].shape)


Fold 0
train -- (84, 10)
Fold 1
train -- (85, 10)
Fold 2
train -- (85, 10)
Fold 0
val -- (10, 10)
Fold 1
val -- (10, 10)
Fold 2
val -- (10, 10)
Fold 0
test -- (48, 10)
Fold 1
test -- (47, 10)
Fold 2
test -- (47, 10)


In [134]:
### Save files
with open(f"{processed_file_dir}/survival_splits/patients_fold0_survival.pkl", "wb") as f:
    pickle.dump(setting_split0, f)
with open(f"{processed_file_dir}/survival_splits/patients_fold1_survival.pkl", "wb") as f:
    pickle.dump(setting_split1, f)
with open(f"{processed_file_dir}/survival_splits/patients_fold2_survival.pkl", "wb") as f:
    pickle.dump(setting_split2, f)