Loads the annotated mutation data needed for all methods (except PREDICT-AI, which uses a variable-length input).

#### Data

*Cell Lines and Patients*

**Inputs**
* Mutations (annotated) in 324 genes
* No drug input needed

**Output labels**
* AUDRC for cell lines
* Binary labels for cell lines (threshold set to median AUDRC per drug)
* RECIST for patients

**Experiments: 1A, 1B, 2A, 2B**

In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
import pickle

In [3]:
# Base directory for the DRUID data.
# NOTE(review): the cells visible here hard-code "/data/druid_data/..." paths
# instead of using this variable - confirm whether it is still needed.
druid_data_dir = "/data/druid_data/"

In [4]:
# Directory holding the annotated mutation feature-matrix CSVs that the
# read_csv cells of this notebook load (cell lines and patient cohorts).
annotated_matrices_dir = "/data//papers_data/systematic_assessment/raw/annotated_mutation_matrices/"

In [5]:
# Directories for the raw mutation files and for the processed data splits.
raw_data_dir = "/data//papers_data/systematic_assessment/raw/mutation_files/"
data_splits_dir = "/data//papers_data/systematic_assessment/processed/"

# One split directory per experiment / setting combination.
expt1A_dir = f"{data_splits_dir}Experiment1/SettingA/"
expt1B_dir = f"{data_splits_dir}Experiment1/SettingB/"
expt2A_dir = f"{data_splits_dir}Experiment2/SettingA/"
expt2B_dir = f"{data_splits_dir}Experiment2/SettingB/"

In [6]:
# Output root for this input type, plus one sub-directory per experiment / setting.
save_dir = "/data//papers_data/systematic_assessment/input_types/annotated_mutations/"
save_dir_expt1A_dir = f"{save_dir}Experiment1/SettingA/"
save_dir_expt1B_dir = f"{save_dir}Experiment1/SettingB/"
save_dir_expt2A_dir = f"{save_dir}Experiment2/SettingA/"
save_dir_expt2B_dir = f"{save_dir}Experiment2/SettingB/"

In [7]:
# Number of entries in the first column of the gene list file.
len(pd.read_csv("/data/druid_data/raw_data/gene2ind.txt", header=None)[0])

324

In [8]:
# Per-gene feature suffixes: six annotation groups (piu, lu, ncu, pathogenic,
# vus, benign), each with four aggregations (max, sum, mean, count).
suffixes = ["_piu_max", "_piu_sum", "_piu_mean", "_piu_count",
            "_lu_max", "_lu_sum", "_lu_mean", "_lu_count",
            "_ncu_max", "_ncu_sum", "_ncu_mean", "_ncu_count",
            "_pathogenic_max", "_pathogenic_sum", "_pathogenic_mean", "_pathogenic_count",
            "_vus_max", "_vus_sum", "_vus_mean", "_vus_count",
            "_benign_max", "_benign_sum", "_benign_mean", "_benign_count",
           ]

# Read the gene list once; the file does not change between suffixes, so
# re-reading it inside the loop only repeats the same disk I/O.
_gene_names = list(pd.read_csv("/data/druid_data/raw_data/gene2ind.txt", header=None)[0])

# Suffix-major order: every gene for the first suffix, then every gene for the
# next suffix, and so on ("<gene><suffix>", e.g. "ABL1_piu_max").
columns2select = [f"{g}{s}" for s in suffixes for g in _gene_names]
len(columns2select)

7776

### Cell lines

In [9]:
def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Fold-0 cell-line splits, one per experiment / setting directory.
exp1A_cl_fold0 = _load_pickle(f"{expt1A_dir}cell_lines_fold0.pkl")
exp1B_cl_fold0 = _load_pickle(f"{expt1B_dir}cell_lines_fold0.pkl")
exp2A_cl_fold0 = _load_pickle(f"{expt2A_dir}cell_lines_fold0.pkl")
exp2B_cl_fold0 = _load_pickle(f"{expt2B_dir}cell_lines_fold0.pkl")

#### Annotated mutation matrices

In [10]:
# Annotated CCLE 23Q4 feature matrix; the first CSV column becomes the row index.
cl_mutations = pd.read_csv(
    annotated_matrices_dir + "/clinvar_gpd_annovar_annotated_CCLE_23Q4_feature_matrix.csv",
    index_col=0,
)
cl_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
depmap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PR-00UtU3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-04VvBz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-09S1KU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0DEQ8n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0EypNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PR-zqnXiv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zrjzDr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zv0mdh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zwU925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Display-only view of the expected feature columns (raises KeyError if any is
# missing). The selection is not assigned back to cl_mutations.
# NOTE(review): confirm that leaving cl_mutations unchanged here is intended.
cl_mutations.loc[:, columns2select]

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
depmap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PR-00UtU3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-04VvBz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-09S1KU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0DEQ8n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0EypNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PR-zqnXiv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zrjzDr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zv0mdh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zwU925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Keep only the rows whose feature values do not sum to zero.
cl_mutations = cl_mutations.loc[cl_mutations.sum(axis=1).ne(0)]
cl_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
depmap_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PR-00UtU3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-04VvBz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-09S1KU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0DEQ8n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-0EypNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PR-zqnXiv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zrjzDr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zv0mdh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PR-zwU925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Experiment 1A
# Join the cell-line mutation features onto every per-drug table of each split.
exp1A_cl_fold0_processed = {}
for split in ("train", "val", "test"):
    per_drug = exp1A_cl_fold0_processed.setdefault(split, {})
    for drug, labels in exp1A_cl_fold0[split].items():
        print(": ".join([split, drug]), end=" -- ")
        joined = pd.merge(
            labels,
            cl_mutations,
            how="inner",
            left_on="sample_id",
            right_on=cl_mutations.index,
        )
        print(joined.shape)
        # The inner join drops samples without a mutation row; report them.
        if len(joined) != len(labels):
            print("Missing some mutations!")
            print(set(labels["sample_id"]) - set(joined["sample_id"]))
        per_drug[drug] = joined

train: BUPARLISIB -- (860, 7782)
train: CISPLATIN -- (692, 7782)
train: FLUOROURACIL -- (868, 7782)
train: GEMCITABINE -- (861, 7782)
train: PACLITAXEL -- (860, 7782)
train: SORAFENIB -- (861, 7782)
train: TEMOZOLOMIDE -- (864, 7782)
val: BUPARLISIB -- (96, 7782)
val: CISPLATIN -- (78, 7782)
val: FLUOROURACIL -- (96, 7782)
val: GEMCITABINE -- (96, 7782)
val: PACLITAXEL -- (96, 7782)
val: SORAFENIB -- (96, 7782)
val: TEMOZOLOMIDE -- (97, 7782)
test: BUPARLISIB -- (119, 7782)
test: CISPLATIN -- (95, 7782)
test: FLUOROURACIL -- (120, 7782)
test: GEMCITABINE -- (118, 7782)
test: PACLITAXEL -- (119, 7782)
test: SORAFENIB -- (119, 7782)
test: TEMOZOLOMIDE -- (120, 7782)


In [14]:
# Experiment 1B
# Join the cell-line mutation features onto every keyed table of each split
# (the printed keys are (drug, project, source) tuples).
exp1B_cl_fold0_processed = {}
for split in ("train", "val", "test"):
    per_key = exp1B_cl_fold0_processed.setdefault(split, {})
    for key, labels in exp1B_cl_fold0[split].items():
        print(f"{split}: {key}", end=" -- ")
        joined = pd.merge(
            labels,
            cl_mutations,
            how="inner",
            left_on="sample_id",
            right_on=cl_mutations.index,
        )
        print(joined.shape)
        # The inner join drops samples without a mutation row; report them.
        if len(joined) != len(labels):
            print("Missing some mutations!")
            print(set(labels["sample_id"]) - set(joined["sample_id"]))
        per_key[key] = joined

train: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (692, 7782)
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (692, 7782)
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (868, 7782)
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (861, 7782)
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (860, 7782)
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (864, 7782)
val: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (78, 7782)
val: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (78, 7782)
val: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (96, 7782)
val: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (96, 7782)
val: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (96, 7782)
val: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (97, 7782)
test: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (95, 7782)
test: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (95, 7782)
test: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (120, 7782)
test: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (118, 7782)
test: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (119, 7782)
test: ('TEMOZOLOMIDE', 'TCGA-LGG',

In [15]:
# Experiment 2A
# Each split is a single table here (no per-drug dictionary), so the merged
# frame is stored directly under the split name.
exp2A_cl_fold0_processed = {}
for div in ["train", "val", "test"]:
    v = exp2A_cl_fold0[div]
    print(div, end=" -- ")
    merged = v.merge(cl_mutations, how="inner", left_on="sample_id", right_on=cl_mutations.index)
    print(merged.shape)
    # The inner join drops samples without a mutation row; report them.
    if merged.shape[0] != v.shape[0]:
        print("Missing some mutations!")
        print(set(v.sample_id) - set(merged.sample_id))
    exp2A_cl_fold0_processed[div] = merged

train -- (156441, 7782)
val -- (17371, 7782)
test -- (21589, 7782)


In [16]:
# Experiment 2B
# Join the cell-line mutation features onto every keyed table of each split
# (the printed keys are TCGA project names such as TCGA-BRCA).
exp2B_cl_fold0_processed = {}
for split in ("train", "val", "test"):
    per_key = exp2B_cl_fold0_processed.setdefault(split, {})
    for key, labels in exp2B_cl_fold0[split].items():
        print(f"{split}: {key}", end=" -- ")
        joined = pd.merge(
            labels,
            cl_mutations,
            how="inner",
            left_on="sample_id",
            right_on=cl_mutations.index,
        )
        print(joined.shape)
        # The inner join drops samples without a mutation row; report them.
        if len(joined) != len(labels):
            print("Missing some mutations!")
            print(set(labels["sample_id"]) - set(joined["sample_id"]))
        per_key[key] = joined

train: TCGA-BRCA -- (156441, 7782)
train: TCGA-CESC -- (156441, 7782)
train: TCGA-HNSC -- (156441, 7782)
train: TCGA-STAD -- (156441, 7782)
train: TCGA-PAAD -- (156441, 7782)
train: TCGA-LGG -- (156441, 7782)
val: TCGA-BRCA -- (17371, 7782)
val: TCGA-CESC -- (17371, 7782)
val: TCGA-HNSC -- (17371, 7782)
val: TCGA-STAD -- (17371, 7782)
val: TCGA-PAAD -- (17371, 7782)
val: TCGA-LGG -- (17371, 7782)
test: TCGA-BRCA -- (21589, 7782)
test: TCGA-CESC -- (21589, 7782)
test: TCGA-HNSC -- (21589, 7782)
test: TCGA-STAD -- (21589, 7782)
test: TCGA-PAAD -- (21589, 7782)
test: TCGA-LGG -- (21589, 7782)


### Patients

In [17]:
# Fold 0, 1, 2
# Load the patient splits of every experiment setting and bind each one to the
# module-level name "<prefix>_patients_fold<i>" (e.g. exp1A_patients_fold0).
# The names are assigned through globals() rather than exec() so that no code
# string is built and evaluated.
# NOTE: pickle.load executes arbitrary code from the file; only use on trusted files.
_patient_split_dirs = {
    "exp1A": expt1A_dir,
    "exp1B": expt1B_dir,
    "exp2A": expt2A_dir,
    "exp2B": expt2B_dir,
}
for i in range(0, 3):
    for _prefix, _split_dir in _patient_split_dirs.items():
        with open(_split_dir + f"patients_fold{i}.pkl", "rb") as f:
            globals()[f"{_prefix}_patients_fold{i}"] = pickle.load(f)

#### Raw mutations

In [18]:
# Annotated TCGA feature matrix; the first CSV column becomes the row index.
tcga_mutations = pd.read_csv(
    annotated_matrices_dir + "/clinvar_gpd_annovar_annotated_Tcga_feature_matrix.csv",
    index_col=0,
)
tcga_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-2466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-ZS-A9CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-ZS-A9CG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-ZU-A8S4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Annotated Moores feature matrix; the first CSV column becomes the row index.
moores_mutations = pd.read_csv(
    annotated_matrices_dir + "/clinvar_gpd_annovar_annotated_Moores_feature_matrix.csv",
    index_col=0,
)
moores_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Annotated cBio HCC feature matrix; the first CSV column becomes the row index.
cbio_hcc_mutations = pd.read_csv(
    annotated_matrices_dir + "/clinvar_gpd_annovar_annotated_cbio_hcc_feature_matrix.csv",
    index_col=0,
)
cbio_hcc_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P-0000037-T02-IM3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0000182-T01-IM3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0000218-T01-IM3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0000228-T03-IM5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0000587-T01-IM3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P-0021001-T01-IM6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0021059-T01-IM6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0021288-T01-IM6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P-0021780-T01-IM6,0.0,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Annotated cBio BRCA feature matrix; the first CSV column becomes the row index.
cbio_brca_mutations = pd.read_csv(
    annotated_matrices_dir + "/clinvar_gpd_annovar_annotated_cbio_brca_feature_matrix.csv",
    index_col=0,
)
cbio_brca_mutations

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s_DS_bkm_001_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_002_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_003_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_004_T,0.000000,0.0,0.0,0.0,0.0,0.352941,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_005_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.647059,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_080_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_081_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_082_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_083_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Stack the four patient cohorts row-wise into one feature matrix
# (original note: only those with RECIST responses).
# NOTE(review): the Moores ids display as integers while the other cohorts use
# string ids - confirm the sample_id dtypes line up in the later merges.
patients_combined = pd.concat(
    [
        tcga_mutations,
        moores_mutations,
        cbio_hcc_mutations,
        cbio_brca_mutations,
    ]
)
patients_combined

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0003,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0033,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0047,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0055,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-2466,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_080_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_081_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_082_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_083_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Display-only view of the expected feature columns (raises KeyError if any is
# missing). The selection is not assigned back to patients_combined.
# NOTE(review): confirm that leaving patients_combined unchanged here is intended.
patients_combined.loc[:, columns2select]

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0003,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0033,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0047,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0055,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-2466,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_080_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_081_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_082_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_083_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Keep only the rows whose feature values do not sum to zero.
patients_combined = patients_combined.loc[patients_combined.sum(axis=1).ne(0)]
patients_combined

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0003,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0033,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0047,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0055,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-2466,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_080_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_081_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_082_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_083_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Raw TCGA table with one (patient_id, gene, alteration) record per row.
tcga_mutations_raw = pd.read_csv(
    "/data//papers_data/systematic_assessment/raw/mutation_files/"
    "labelled_tcga_patient_gene_alteration_mutation.csv"
)
tcga_mutations_raw

Unnamed: 0,patient_id,gene,alteration
0,TCGA-VQ-A928,EPHA2,G75=
1,TCGA-VQ-A928,ELAVL4,G138C
2,TCGA-VQ-A928,SLC35D1,G30=
3,TCGA-VQ-A928,SRSF11,G30=
4,TCGA-VQ-A928,CD5L,S207T
...,...,...,...
138808,TCGA-EE-A3JI,IL13RA1,K212K
138809,TCGA-EE-A3JI,GRIA3,I474I
138810,TCGA-EE-A3JI,SPANXN3,T13T
138811,TCGA-EE-A3JI,F8,G2287G


In [26]:
# Experiment 1A
def _merge_patient_fold(fold_splits, mutations):
    """Join mutation features onto every table of one patient fold.

    Args:
        fold_splits: mapping with "train", "val" and "test" entries, each a
            mapping from a key to a DataFrame that has a ``sample_id`` column
            (and a ``recist`` column, read for the "val" split).
        mutations: DataFrame of mutation features indexed by sample id.

    Returns:
        ``{split: {key: merged DataFrame}}`` with the same keys as the input.

    Prints the shape of every merged table, the sample ids dropped by the
    inner join (if any), and the number of distinct ``recist`` values in each
    validation table.
    """
    processed = {}
    for div in ["train", "val", "test"]:
        processed[div] = {}
        for k, v in fold_splits[div].items():
            print(div + ": " + str(k), end=" -- ")
            merged = v.merge(mutations, how="inner", left_on="sample_id", right_on=mutations.index)
            print(merged.shape)
            # The inner join drops samples without a mutation row; report them.
            if merged.shape[0] != v.shape[0]:
                print("Missing some mutations!")
                print(set(v.sample_id) - set(merged.sample_id))
            processed[div][k] = merged
            if div == "val":
                print("Val classes")
                print(len(merged.recist.value_counts()))
    return processed


# fold 0
print("Fold 0")
exp1A_patient_fold0_processed = _merge_patient_fold(exp1A_patients_fold0, patients_combined)

# fold 1
print("Fold 1")
exp1A_patient_fold1_processed = _merge_patient_fold(exp1A_patients_fold1, patients_combined)

# fold 2
print("Fold 2")
exp1A_patient_fold2_processed = _merge_patient_fold(exp1A_patients_fold2, patients_combined)

Fold 0
train: BUPARLISIB -- (16, 7781)
train: CISPLATIN -- (64, 7781)
train: FLUOROURACIL -- (33, 7781)
train: GEMCITABINE -- (37, 7781)
train: PACLITAXEL -- (26, 7781)
train: SORAFENIB -- (38, 7781)
train: TEMOZOLOMIDE -- (58, 7781)
val: BUPARLISIB -- (2, 7781)
Val classes
2
val: CISPLATIN -- (8, 7781)
Val classes
2
val: FLUOROURACIL -- (3, 7781)
Val classes
2
val: GEMCITABINE -- (4, 7781)
Val classes
2
val: PACLITAXEL -- (3, 7781)
Val classes
2
val: SORAFENIB -- (5, 7781)
Val classes
2
val: TEMOZOLOMIDE -- (8, 7781)
Val classes
2
test: BUPARLISIB -- (9, 7781)
test: CISPLATIN -- (27, 7781)
test: FLUOROURACIL -- (12, 7781)
test: GEMCITABINE -- (14, 7781)
test: PACLITAXEL -- (8, 7781)
test: SORAFENIB -- (15, 7781)
test: TEMOZOLOMIDE -- (30, 7781)
Fold 1
train: BUPARLISIB -- (16, 7781)
train: CISPLATIN -- (61, 7781)
train: FLUOROURACIL -- (34, 7781)
train: GEMCITABINE -- (39, 7781)
train: PACLITAXEL -- (25, 7781)
train: SORAFENIB -- (39, 7781)
train: TEMOZOLOMIDE -- (58, 7781)
val: BUPAR

In [27]:
# Experiment 1B
def _merge_patient_fold(fold_splits, mutations):
    """Join mutation features onto every table of one patient fold.

    Args:
        fold_splits: mapping with "train", "val" and "test" entries, each a
            mapping from a key to a DataFrame that has a ``sample_id`` column
            (and a ``recist`` column, read for the "val" split).
        mutations: DataFrame of mutation features indexed by sample id.

    Returns:
        ``{split: {key: merged DataFrame}}`` with the same keys as the input.

    Prints the shape of every merged table, the sample ids dropped by the
    inner join (if any), and the number of distinct ``recist`` values in each
    validation table.
    """
    processed = {}
    for div in ["train", "val", "test"]:
        processed[div] = {}
        for k, v in fold_splits[div].items():
            print(div + ": " + str(k), end=" -- ")
            merged = v.merge(mutations, how="inner", left_on="sample_id", right_on=mutations.index)
            print(merged.shape)
            # The inner join drops samples without a mutation row; report them.
            if merged.shape[0] != v.shape[0]:
                print("Missing some mutations!")
                print(set(v.sample_id) - set(merged.sample_id))
            processed[div][k] = merged
            if div == "val":
                print("Val classes")
                print(len(merged.recist.value_counts()))
    return processed


# fold 0
print("Fold 0")
exp1B_patient_fold0_processed = _merge_patient_fold(exp1B_patients_fold0, patients_combined)

# fold 1
print("Fold 1")
exp1B_patient_fold1_processed = _merge_patient_fold(exp1B_patients_fold1, patients_combined)

# fold 2
print("Fold 2")
exp1B_patient_fold2_processed = _merge_patient_fold(exp1B_patients_fold2, patients_combined)

Fold 0
train: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (29, 7781)
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (22, 7781)
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (19, 7781)
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (23, 7781)
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (15, 7781)
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (52, 7781)
val: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (4, 7781)
Val classes
2
val: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (3, 7781)
Val classes
2
val: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (3, 7781)
Val classes
2
val: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (3, 7781)
Val classes
2
val: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (3, 7781)
Val classes
2
val: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (6, 7781)
Val classes
2
test: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (15, 7781)
test: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (12, 7781)
test: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (12, 7781)
test: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (14, 7781)
test: ('PACLIT

In [28]:
# Experiment 2A
def _attach_mutations_exp2A(fold_splits, mutation_features):
    """Join mutation features onto every split of one Experiment 2A fold.

    Parameters
    ----------
    fold_splits : dict
        Maps "train"/"val"/"test" directly to a DataFrame. Each DataFrame
        needs a ``sample_id`` column; the "val" frame also needs ``recist``.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the sample/patient ids.

    Returns
    -------
    dict
        ``{"train": df, "val": df, "test": df}`` with each frame inner-joined
        to the feature rows of its samples.
    """
    processed = {}
    for div in ["train", "val", "test"]:
        v = fold_splits[div]
        print(div, end=" -- ")
        merged = v.merge(mutation_features, how="inner", left_on="sample_id", right_on=mutation_features.index)
        print(merged.shape)
        # An inner join silently drops samples that have no feature row,
        # so report any sample ids that went missing.
        if merged.shape[0] != v.shape[0]:
            print("Missing some mutations!")
            print(set(v.sample_id) - set(merged.sample_id))
        processed[div] = merged
        if div == "val":
            # Number of distinct RECIST labels present in the validation split.
            print("Val classes")
            print(len(merged.recist.value_counts()))
    return processed


# fold 0
print("Fold 0")
exp2A_patient_fold0_processed = _attach_mutations_exp2A(exp2A_patients_fold0, patients_combined)

# fold 1
print("Fold 1")
exp2A_patient_fold1_processed = _attach_mutations_exp2A(exp2A_patients_fold1, patients_combined)

# fold 2
print("Fold 2")
exp2A_patient_fold2_processed = _attach_mutations_exp2A(exp2A_patients_fold2, patients_combined)

Fold 0
train -- (488, 7781)
val -- (53, 7781)
Val classes
2
test -- (115, 7781)
Fold 1
train -- (488, 7781)
val -- (54, 7781)
Val classes
2
test -- (114, 7781)
Fold 2
train -- (487, 7781)
val -- (56, 7781)
Val classes
2
test -- (113, 7781)


In [29]:
# Experiment 2B
def _attach_mutations_exp2B(fold_splits, mutation_features):
    """Join mutation features onto every split of one Experiment 2B fold.

    Parameters
    ----------
    fold_splits : dict
        Maps "train"/"val"/"test" to a dict of ``{key: DataFrame}``. Each
        DataFrame needs a ``sample_id`` column; the "val" frames also need
        a ``recist`` column.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the sample/patient ids.

    Returns
    -------
    dict
        Same nesting as ``fold_splits``, with each frame inner-joined to the
        feature rows of its samples.
    """
    processed = {}
    for div in ["train", "val", "test"]:
        processed[div] = {}
        for k, v in fold_splits[div].items():
            print(div + ": " + str(k), end=" -- ")
            merged = v.merge(mutation_features, how="inner", left_on="sample_id", right_on=mutation_features.index)
            print(merged.shape)
            # An inner join silently drops samples that have no feature row,
            # so report any sample ids that went missing.
            if merged.shape[0] != v.shape[0]:
                print("Missing some mutations!")
                print(set(v.sample_id) - set(merged.sample_id))
            processed[div][k] = merged
            if div == "val":
                # Number of distinct RECIST labels present in this validation split.
                print("Val classes")
                print(len(merged.recist.value_counts()))
    return processed


# fold 0
print("Fold 0")
exp2B_patient_fold0_processed = _attach_mutations_exp2B(exp2B_patients_fold0, patients_combined)

# fold 1
print("Fold 1")
exp2B_patient_fold1_processed = _attach_mutations_exp2B(exp2B_patients_fold1, patients_combined)

# fold 2
print("Fold 2")
exp2B_patient_fold2_processed = _attach_mutations_exp2B(exp2B_patients_fold2, patients_combined)

Fold 0
train: TCGA-BRCA -- (74, 7781)
train: TCGA-CESC -- (33, 7781)
train: TCGA-HNSC -- (39, 7781)
train: TCGA-STAD -- (38, 7781)
train: TCGA-PAAD -- (32, 7781)
train: TCGA-LGG -- (60, 7781)
val: TCGA-BRCA -- (8, 7781)
Val classes
2
val: TCGA-CESC -- (4, 7781)
Val classes
2
val: TCGA-HNSC -- (5, 7781)
Val classes
2
val: TCGA-STAD -- (5, 7781)
Val classes
2
val: TCGA-PAAD -- (4, 7781)
Val classes
2
val: TCGA-LGG -- (7, 7781)
Val classes
2
test: TCGA-BRCA -- (17, 7781)
test: TCGA-CESC -- (15, 7781)
test: TCGA-HNSC -- (12, 7781)
test: TCGA-STAD -- (12, 7781)
test: TCGA-PAAD -- (14, 7781)
test: TCGA-LGG -- (30, 7781)
Fold 1
train: TCGA-BRCA -- (72, 7781)
train: TCGA-CESC -- (31, 7781)
train: TCGA-HNSC -- (37, 7781)
train: TCGA-STAD -- (39, 7781)
train: TCGA-PAAD -- (35, 7781)
train: TCGA-LGG -- (59, 7781)
val: TCGA-BRCA -- (8, 7781)
Val classes
2
val: TCGA-CESC -- (4, 7781)
Val classes
2
val: TCGA-HNSC -- (6, 7781)
Val classes
2
val: TCGA-STAD -- (5, 7781)
Val classes
2
val: TCGA-PAAD -- 

In [30]:
# Spot-check one processed frame: Experiment 1A, cell lines, fold 0, train split, CISPLATIN.
exp1A_cl_fold0_processed["train"]["CISPLATIN"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
0,PR-8EE2ka,CISPLATIN,0.776032,0.285054,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PR-9ByeKo,CISPLATIN,0.779539,0.211263,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PR-1ofDIZ,CISPLATIN,0.921701,1.621675,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,PR-kSuyzj,CISPLATIN,0.935353,2.934207,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PR-EvLXSQ,CISPLATIN,0.981732,4.660050,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,PR-UOveiJ,CISPLATIN,0.920806,2.967637,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
688,PR-sQS7tJ,CISPLATIN,0.967626,5.570299,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
689,PR-O1Dni4,CISPLATIN,0.967654,4.333717,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
690,PR-kuCd2G,CISPLATIN,0.891760,2.305190,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Compare the grouping keys of the fold-0 train splits for Experiments 1A, 1B and 2B.
tuple(
    processed["train"].keys()
    for processed in (
        exp1A_patient_fold0_processed,
        exp1B_patient_fold0_processed,
        exp2B_patient_fold0_processed,
    )
)

(dict_keys(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE', 'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE']),
 dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')]),
 dict_keys(['TCGA-BRCA', 'TCGA-CESC', 'TCGA-HNSC', 'TCGA-STAD', 'TCGA-PAAD', 'TCGA-LGG']))

In [32]:
# Spot-check the Experiment 2A fold-0 train frame (a single DataFrame, not keyed by drug or project).
exp2A_patient_fold0_processed["train"]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
0,TCGA-DB-A64P,TEMOZOLOMIDE,0,TCGA-LGG,TCGA,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-S9-A89V,TEMOZOLOMIDE,0,TCGA-LGG,TCGA,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P-0001324-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-S9-A6U8,CARMUSTINE,0,TCGA-LGG,TCGA,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-CN-4731,CETUXIMAB,0,TCGA-HNSC,TCGA,0.0,0.294118,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,s_DS_bkm_008_T,BUPARLISIB,0,TCGA-BRCA,CBIO_brca_mskcc_2019,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484,TCGA-GN-A8LK,CARBOPLATIN,0,TCGA-SKCM,TCGA,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
485,TCGA-VS-A8EJ,CISPLATIN,0,TCGA-CESC,TCGA,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486,P-0002719-T01-IM3,SORAFENIB,0,TCGA-LIHC,CBIO_hcc_mskimpact_2018,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Compare the grouping keys of the fold-1 train splits for Experiments 1A, 1B and 2B.
tuple(
    processed["train"].keys()
    for processed in (
        exp1A_patient_fold1_processed,
        exp1B_patient_fold1_processed,
        exp2B_patient_fold1_processed,
    )
)

(dict_keys(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE', 'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE']),
 dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')]),
 dict_keys(['TCGA-BRCA', 'TCGA-CESC', 'TCGA-HNSC', 'TCGA-STAD', 'TCGA-PAAD', 'TCGA-LGG']))

In [34]:
# Compare the grouping keys of the fold-2 train splits for Experiments 1A, 1B and 2B.
tuple(
    processed["train"].keys()
    for processed in (
        exp1A_patient_fold2_processed,
        exp1B_patient_fold2_processed,
        exp2B_patient_fold2_processed,
    )
)

(dict_keys(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE', 'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE']),
 dict_keys([('CISPLATIN', 'TCGA-CESC', 'TCGA'), ('CISPLATIN', 'TCGA-HNSC', 'TCGA'), ('FLUOROURACIL', 'TCGA-STAD', 'TCGA'), ('GEMCITABINE', 'TCGA-PAAD', 'TCGA'), ('PACLITAXEL', 'TCGA-BRCA', 'TCGA'), ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')]),
 dict_keys(['TCGA-BRCA', 'TCGA-CESC', 'TCGA-HNSC', 'TCGA-STAD', 'TCGA-PAAD', 'TCGA-LGG']))

In [35]:
# Spot-check one Experiment 1B frame: fold 0, train split, keyed by the (drug, project, dataset) tuple.
exp1B_patient_fold0_processed["train"][('PACLITAXEL', 'TCGA-BRCA', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
0,TCGA-GM-A3XG,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-LL-A73Z,PACLITAXEL,0,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-A7-A6VY,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-GM-A2DB,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-GM-A2DH,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,TCGA-GM-A2DN,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.882353,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,TCGA-A7-A5ZX,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,TCGA-LL-A8F5,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,TCGA-GM-A3XL,PACLITAXEL,1,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,TCGA-EW-A2FR,PACLITAXEL,0,TCGA-BRCA,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Spot-check one Experiment 1B frame: fold 2, train split, keyed by the (drug, project, dataset) tuple.
exp1B_patient_fold2_processed["train"][('FLUOROURACIL', 'TCGA-STAD', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
0,TCGA-VQ-AA6G,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-F1-A448,FLUOROURACIL,1,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TCGA-VQ-A8PD,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-VQ-A8PT,FLUOROURACIL,1,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TCGA-D7-A748,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,TCGA-VQ-A8DU,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,TCGA-VQ-AA6F,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,TCGA-VQ-A91E,FLUOROURACIL,1,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,TCGA-VQ-AA6B,FLUOROURACIL,1,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,TCGA-VQ-A922,FLUOROURACIL,0,TCGA-STAD,TCGA,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save files

In [None]:
# Display the Experiment 1A output directory (save_dir + "Experiment1/SettingA/") before writing files.
save_dir_expt1A_dir

In [38]:
# Experiment 1A: pickle the processed cell-line fold and the three patient folds.
for target_path, payload in (
    (f"{save_dir_expt1A_dir}/cell_lines_fold0_processed.pkl", exp1A_cl_fold0_processed),
    (f"{save_dir_expt1A_dir}/patients_fold0_processed.pkl", exp1A_patient_fold0_processed),
    (f"{save_dir_expt1A_dir}/patients_fold1_processed.pkl", exp1A_patient_fold1_processed),
    (f"{save_dir_expt1A_dir}/patients_fold2_processed.pkl", exp1A_patient_fold2_processed),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [39]:
# Experiment 1B: pickle the processed cell-line fold and the three patient folds.
for target_path, payload in (
    (f"{save_dir_expt1B_dir}/cell_lines_fold0_processed.pkl", exp1B_cl_fold0_processed),
    (f"{save_dir_expt1B_dir}/patients_fold0_processed.pkl", exp1B_patient_fold0_processed),
    (f"{save_dir_expt1B_dir}/patients_fold1_processed.pkl", exp1B_patient_fold1_processed),
    (f"{save_dir_expt1B_dir}/patients_fold2_processed.pkl", exp1B_patient_fold2_processed),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [40]:
# Experiment 2A: pickle the processed cell-line fold and the three patient folds.
for target_path, payload in (
    (f"{save_dir_expt2A_dir}/cell_lines_fold0_processed.pkl", exp2A_cl_fold0_processed),
    (f"{save_dir_expt2A_dir}/patients_fold0_processed.pkl", exp2A_patient_fold0_processed),
    (f"{save_dir_expt2A_dir}/patients_fold1_processed.pkl", exp2A_patient_fold1_processed),
    (f"{save_dir_expt2A_dir}/patients_fold2_processed.pkl", exp2A_patient_fold2_processed),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [41]:
# Experiment 2B: pickle the processed cell-line fold and the three patient folds.
for target_path, payload in (
    (f"{save_dir_expt2B_dir}/cell_lines_fold0_processed.pkl", exp2B_cl_fold0_processed),
    (f"{save_dir_expt2B_dir}/patients_fold0_processed.pkl", exp2B_patient_fold0_processed),
    (f"{save_dir_expt2B_dir}/patients_fold1_processed.pkl", exp2B_patient_fold1_processed),
    (f"{save_dir_expt2B_dir}/patients_fold2_processed.pkl", exp2B_patient_fold2_processed),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

### Unlabelled patient data

Used in WISER. For each train-test split, we first remove the patients present in the test split and then add the unlabelled patient samples (only from datasets that have RECIST responses, i.e. those in the `patients_combined` df) to the samples in the train set.

This is equivalent to removing the test patient ids from `patients_combined` and saving the result.

In [42]:
# Show only the patients whose feature values do not sum to zero across the row.
patients_combined.loc[patients_combined.sum(axis=1).ne(0)]

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0003,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0033,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0047,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-0055,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-02-2466,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_080_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_081_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_082_T,0.000000,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s_DS_bkm_083_T,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Experiment 1A
def _unlabelled_pool_exp1A(processed_fold, mutation_features):
    """Build the unlabelled patient pool for one Experiment 1A fold.

    For every key of the fold's test split, keep all rows of
    ``mutation_features`` whose index is NOT among that key's test
    ``sample_id`` values.

    Parameters
    ----------
    processed_fold : dict
        Processed fold; only ``processed_fold["test"]`` is read. It maps
        string keys to DataFrames with a ``sample_id`` column.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the patient ids.

    Returns
    -------
    dict
        ``{"train": {key: DataFrame}}``, one filtered feature matrix per key.
    """
    div = "test"  # only the patients present in the test set are removed
    unlabelled = {"train": {}}
    for k, v in processed_fold[div].items():
        print(div + ": " + k, end=" -- ")
        # remove test patients from the full feature matrix
        unlabelled_df = mutation_features[~mutation_features.index.isin(v.sample_id)]
        print(unlabelled_df.shape)
        unlabelled["train"][k] = unlabelled_df
    return unlabelled


# fold 0
print("Fold 0")
exp1A_patient_fold0_processed_unlabelled = _unlabelled_pool_exp1A(exp1A_patient_fold0_processed, patients_combined)

# fold 1
print("Fold 1")
exp1A_patient_fold1_processed_unlabelled = _unlabelled_pool_exp1A(exp1A_patient_fold1_processed, patients_combined)

# fold 2
print("Fold 2")
exp1A_patient_fold2_processed_unlabelled = _unlabelled_pool_exp1A(exp1A_patient_fold2_processed, patients_combined)

Fold 0
test: BUPARLISIB -- (9437, 7776)
test: CISPLATIN -- (9419, 7776)
test: FLUOROURACIL -- (9434, 7776)
test: GEMCITABINE -- (9432, 7776)
test: PACLITAXEL -- (9438, 7776)
test: SORAFENIB -- (9431, 7776)
test: TEMOZOLOMIDE -- (9416, 7776)
Fold 1
test: BUPARLISIB -- (9437, 7776)
test: CISPLATIN -- (9416, 7776)
test: FLUOROURACIL -- (9435, 7776)
test: GEMCITABINE -- (9435, 7776)
test: PACLITAXEL -- (9436, 7776)
test: SORAFENIB -- (9433, 7776)
test: TEMOZOLOMIDE -- (9416, 7776)
Fold 2
test: BUPARLISIB -- (9437, 7776)
test: CISPLATIN -- (9418, 7776)
test: FLUOROURACIL -- (9435, 7776)
test: GEMCITABINE -- (9431, 7776)
test: PACLITAXEL -- (9438, 7776)
test: SORAFENIB -- (9432, 7776)
test: TEMOZOLOMIDE -- (9418, 7776)


In [44]:
# Experiment 1B
def _unlabelled_pool_exp1B(processed_fold, mutation_features):
    """Build the unlabelled patient pool for one Experiment 1B fold.

    For every key of the fold's test split, keep all rows of
    ``mutation_features`` whose index is NOT among that key's test
    ``sample_id`` values.

    Parameters
    ----------
    processed_fold : dict
        Processed fold; only ``processed_fold["test"]`` is read. It maps
        keys to DataFrames with a ``sample_id`` column.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the patient ids.

    Returns
    -------
    dict
        ``{"train": {key: DataFrame}}``, one filtered feature matrix per key.
    """
    unlabelled = {"train": {}}
    # only the patients present in the test set are removed
    for k, v in processed_fold["test"].items():
        print(k, end=" -- ")
        # remove test patients from the full feature matrix
        unlabelled_df = mutation_features[~mutation_features.index.isin(v.sample_id)]
        print(unlabelled_df.shape)
        unlabelled["train"][k] = unlabelled_df
    return unlabelled


# fold 0
print("Fold 0")
exp1B_patient_fold0_processed_unlabelled = _unlabelled_pool_exp1B(exp1B_patient_fold0_processed, patients_combined)

# fold 1
print("Fold 1")
exp1B_patient_fold1_processed_unlabelled = _unlabelled_pool_exp1B(exp1B_patient_fold1_processed, patients_combined)

# fold 2
print("Fold 2")
exp1B_patient_fold2_processed_unlabelled = _unlabelled_pool_exp1B(exp1B_patient_fold2_processed, patients_combined)

Fold 0
('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (9431, 7776)
('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (9434, 7776)
('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (9434, 7776)
('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (9432, 7776)
('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (9438, 7776)
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (9416, 7776)
Fold 1
('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (9429, 7776)
('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (9433, 7776)
('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (9435, 7776)
('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (9435, 7776)
('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (9436, 7776)
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (9416, 7776)
Fold 2
('CISPLATIN', 'TCGA-CESC', 'TCGA') -- (9430, 7776)
('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- (9434, 7776)
('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- (9435, 7776)
('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- (9431, 7776)
('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- (9438, 7776)
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- (9418, 7776)


In [45]:
# Experiment 2A
def _unlabelled_pool_exp2A(processed_fold, mutation_features):
    """Build the unlabelled patient pool for one Experiment 2A fold.

    Keeps all rows of ``mutation_features`` whose index is NOT among the
    ``sample_id`` values of the fold's test split.

    Parameters
    ----------
    processed_fold : dict
        Processed fold; only ``processed_fold["test"]`` is read. It is a
        single DataFrame with a ``sample_id`` column.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the patient ids.

    Returns
    -------
    dict
        ``{"train": DataFrame}`` holding the filtered feature matrix.
    """
    # only the patients present in the test set are removed
    test_ids = processed_fold["test"].sample_id
    unlabelled_df = mutation_features[~mutation_features.index.isin(test_ids)]
    print(unlabelled_df.shape)
    return {"train": unlabelled_df}


# fold 0
print("Fold 0")
exp2A_patient_fold0_processed_unlabelled = _unlabelled_pool_exp2A(exp2A_patient_fold0_processed, patients_combined)

# fold 1
print("Fold 1")
exp2A_patient_fold1_processed_unlabelled = _unlabelled_pool_exp2A(exp2A_patient_fold1_processed, patients_combined)

# fold 2
print("Fold 2")
exp2A_patient_fold2_processed_unlabelled = _unlabelled_pool_exp2A(exp2A_patient_fold2_processed, patients_combined)

Fold 0
(9331, 7776)
Fold 1
(9332, 7776)
Fold 2
(9333, 7776)


In [46]:
# Experiment 2B
def _unlabelled_pool_exp2B(processed_fold, mutation_features):
    """Build the unlabelled patient pool for one Experiment 2B fold.

    For every key of the fold's test split, keep all rows of
    ``mutation_features`` whose index is NOT among that key's test
    ``sample_id`` values.

    Parameters
    ----------
    processed_fold : dict
        Processed fold; only ``processed_fold["test"]`` is read. It maps
        keys to DataFrames with a ``sample_id`` column.
    mutation_features : pandas.DataFrame
        Feature matrix whose index holds the patient ids.

    Returns
    -------
    dict
        ``{"train": {key: DataFrame}}``, one filtered feature matrix per key.
    """
    unlabelled = {"train": {}}
    # only the patients present in the test set are removed
    for k, v in processed_fold["test"].items():
        print(k, end=" -- ")
        # remove test patients from the full feature matrix
        unlabelled_df = mutation_features[~mutation_features.index.isin(v.sample_id)]
        print(unlabelled_df.shape)
        unlabelled["train"][k] = unlabelled_df
    return unlabelled


# fold 0
print("Fold 0")
exp2B_patient_fold0_processed_unlabelled = _unlabelled_pool_exp2B(exp2B_patient_fold0_processed, patients_combined)

# fold 1
print("Fold 1")
exp2B_patient_fold1_processed_unlabelled = _unlabelled_pool_exp2B(exp2B_patient_fold1_processed, patients_combined)

# fold 2
print("Fold 2")
exp2B_patient_fold2_processed_unlabelled = _unlabelled_pool_exp2B(exp2B_patient_fold2_processed, patients_combined)

Fold 0
TCGA-BRCA -- (9429, 7776)
TCGA-CESC -- (9431, 7776)
TCGA-HNSC -- (9434, 7776)
TCGA-STAD -- (9434, 7776)
TCGA-PAAD -- (9432, 7776)
TCGA-LGG -- (9416, 7776)
Fold 1
TCGA-BRCA -- (9427, 7776)
TCGA-CESC -- (9429, 7776)
TCGA-HNSC -- (9433, 7776)
TCGA-STAD -- (9435, 7776)
TCGA-PAAD -- (9435, 7776)
TCGA-LGG -- (9416, 7776)
Fold 2
TCGA-BRCA -- (9429, 7776)
TCGA-CESC -- (9430, 7776)
TCGA-HNSC -- (9434, 7776)
TCGA-STAD -- (9435, 7776)
TCGA-PAAD -- (9431, 7776)
TCGA-LGG -- (9418, 7776)


#### Save unlabelled patient files

In [None]:
# Display the Experiment 1A output directory (save_dir + "Experiment1/SettingA/") before writing the unlabelled files.
save_dir_expt1A_dir

In [48]:
# Experiment 1A: pickle the unlabelled patient pool of each fold.
for target_path, payload in (
    (f"{save_dir_expt1A_dir}/patients_fold0_processed_unlabelled.pkl", exp1A_patient_fold0_processed_unlabelled),
    (f"{save_dir_expt1A_dir}/patients_fold1_processed_unlabelled.pkl", exp1A_patient_fold1_processed_unlabelled),
    (f"{save_dir_expt1A_dir}/patients_fold2_processed_unlabelled.pkl", exp1A_patient_fold2_processed_unlabelled),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [49]:
# Experiment 1B: pickle the unlabelled patient pool of each fold.
for target_path, payload in (
    (f"{save_dir_expt1B_dir}/patients_fold0_processed_unlabelled.pkl", exp1B_patient_fold0_processed_unlabelled),
    (f"{save_dir_expt1B_dir}/patients_fold1_processed_unlabelled.pkl", exp1B_patient_fold1_processed_unlabelled),
    (f"{save_dir_expt1B_dir}/patients_fold2_processed_unlabelled.pkl", exp1B_patient_fold2_processed_unlabelled),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [50]:
# Experiment 2A: pickle the unlabelled patient pool of each fold.
for target_path, payload in (
    (f"{save_dir_expt2A_dir}/patients_fold0_processed_unlabelled.pkl", exp2A_patient_fold0_processed_unlabelled),
    (f"{save_dir_expt2A_dir}/patients_fold1_processed_unlabelled.pkl", exp2A_patient_fold1_processed_unlabelled),
    (f"{save_dir_expt2A_dir}/patients_fold2_processed_unlabelled.pkl", exp2A_patient_fold2_processed_unlabelled),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [51]:
# Experiment 2B: pickle the unlabelled patient pool of each fold.
for target_path, payload in (
    (f"{save_dir_expt2B_dir}/patients_fold0_processed_unlabelled.pkl", exp2B_patient_fold0_processed_unlabelled),
    (f"{save_dir_expt2B_dir}/patients_fold1_processed_unlabelled.pkl", exp2B_patient_fold1_processed_unlabelled),
    (f"{save_dir_expt2B_dir}/patients_fold2_processed_unlabelled.pkl", exp2B_patient_fold2_processed_unlabelled),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)