This notebook identifies patients and cell lines without mutations in the 324 genes. These are removed from the splits.

### Remove patients and cell lines with no mutations in the 324 genes

The patients and cell lines without mutations in the 324 genes are first identified. These are removed from all the train-test splits across all experiments.

In addition, a further check ensures that every validation split contains at least one sample of each label.

In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
import pickle
import json

In [3]:
druid_data_dir = "/data/druid_data/"

In [4]:
# Roots of the raw mutation files and of the pre-computed train/val/test splits.
raw_data_dir = "/data/ajayago/papers_data/systematic_assessment/raw/mutation_files/"
data_splits_dir = "/data/ajayago/papers_data/systematic_assessment/processed/"

# One sub-directory per experiment/setting, plus the survival splits, all under data_splits_dir.
expt1A_dir = f"{data_splits_dir}Experiment1/SettingA/"
expt1B_dir = f"{data_splits_dir}Experiment1/SettingB/"
expt2A_dir = f"{data_splits_dir}Experiment2/SettingA/"
expt2B_dir = f"{data_splits_dir}Experiment2/SettingB/"
survival_dir = f"{data_splits_dir}survival_splits/"

In [5]:
# Output root for the raw-mutation input type, with one sub-directory per experiment/setting.
save_dir = "/data/ajayago/papers_data/systematic_assessment/input_types/raw_mutations/"
save_dir_expt1A_dir = f"{save_dir}Experiment1/SettingA/"
save_dir_expt1B_dir = f"{save_dir}Experiment1/SettingB/"
save_dir_expt2A_dir = f"{save_dir}Experiment2/SettingA/"
save_dir_expt2B_dir = f"{save_dir}Experiment2/SettingB/"

In [6]:
# The gene panel is stored as a header-less table; column 0 is taken as the gene list.
gene_panel_table = pd.read_csv("/data/druid_data/raw_data/gene2ind.txt", header=None)
genes324 = list(gene_panel_table[0])
len(genes324)

324

In [9]:
# Load the annotation vocabulary from the raw mutation directory configured above
# (same file as before; the path is now built from raw_data_dir rather than repeated).
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
annot_vocab_path = raw_data_dir + "annotation_vocab_ccle_cbio_icgc_moores_tcga_genie_nci60_nuh_union.pickle"
with open(annot_vocab_path, "rb") as f:
    annot_vocab = pickle.load(f)

In [10]:
len(annot_vocab) # set of all mutations in the datasets CCLE, CBIO, ICGC, Moore's, TCGA, GENIE, NCI60 and NUH

2324534

In [11]:
def get_alias_to_canonical_name_map(
    aliases_path="/data/druid_data/raw_data/gene_aliases.json",
    canonical_genes=None,
):
    """Build an upper-cased ``alias -> canonical gene name`` lookup.

    The JSON file maps each canonical gene name to either a single alias or a
    list of aliases. Two kinds of alias are deliberately left out of the map:

    * aliases that are themselves canonical names (mapping them would rename a
      canonical gene into something else);
    * aliases claimed by more than one canonical name (ambiguous, dropped per
      recommendation from clinicians).

    Parameters
    ----------
    aliases_path : str, optional
        Path of the JSON alias file. Defaults to the location the notebook has
        always read from.
    canonical_genes : iterable of str, optional
        Canonical gene names. Defaults to the notebook-level ``genes324``.

    Returns
    -------
    dict
        Upper-cased alias mapped to upper-cased canonical name.
    """
    if canonical_genes is None:
        canonical_genes = genes324
    # Compare case-insensitively, because the keys of the result are upper-cased.
    canonical_upper = {str(gene).upper() for gene in canonical_genes}

    with open(aliases_path, "r") as fp:
        aliases_on_disk = json.load(fp)

    alias_to_canonical_name_map = {}
    # Aliases already found to be ambiguous; they must never be re-inserted.
    ambiguous_aliases = set()
    for canonical_name, aliases in aliases_on_disk.items():
        canonical_key = canonical_name.upper()

        # Some canonical names have only one alias - convert those as list for consistency
        if not isinstance(aliases, list):
            aliases = [aliases]

        for alias in aliases:
            # Convert all aliases to be upper case for consistency
            alias_key = alias.upper()

            # If an alias is one of the canonical names, do not add it to the map
            # Else, we'd be renaming a canonical named column into something else
            if alias_key in canonical_upper:
                print(f"Alias {alias} is a canonical_name, skipping")
                continue

            if alias_key in ambiguous_aliases:
                # Already dropped because of an earlier conflict; keep it dropped.
                continue

            known_canonical = alias_to_canonical_name_map.get(alias_key)
            if known_canonical is None:
                alias_to_canonical_name_map[alias_key] = canonical_key
            elif known_canonical != canonical_key:
                print(
                    f"Found multiple canonical names for alias - {alias} = {[canonical_name, known_canonical]}"
                )
                # Drop aliases with conflicting canonical names per recommendation from clinicians
                del alias_to_canonical_name_map[alias_key]
                ambiguous_aliases.add(alias_key)
            # else: same alias repeated for the same canonical name - nothing to do

    return alias_to_canonical_name_map

In [12]:
alias2canonicalmap = get_alias_to_canonical_name_map()

Alias RAD54L is a canonical_name, skipping
Found multiple canonical names for alias - CDK4I = ['CDKN2B', 'CDKN2A']
Found multiple canonical names for alias - IDH = ['IDH2', 'IDH1']
Found multiple canonical names for alias - IDP = ['IDH2', 'IDH1']
Found multiple canonical names for alias - HDMX = ['MDM4', 'MDM2']
Found multiple canonical names for alias - HNPCC = ['MSH2', 'MLH1']
Found multiple canonical names for alias - MRP1 = ['MSH3', 'MDM4']
Alias KRAS is a canonical_name, skipping
Found multiple canonical names for alias - ADPRTL2 = ['PARP3', 'PARP2']
Found multiple canonical names for alias - ADPRTL3 = ['PARP3', 'PARP2']
Found multiple canonical names for alias - MCAP = ['PIK3CA', 'BRD4']
Found multiple canonical names for alias - PI3K = ['PIK3CB', 'PIK3CA']
Found multiple canonical names for alias - R51H3 = ['RAD51D', 'RAD51C']
Found multiple canonical names for alias - PTC = ['RET', 'PTCH1']
Found multiple canonical names for alias - SDH1 = ['SDHB', 'SDHA']
Found multiple canoni

In [13]:
len(set(alias2canonicalmap.values()))

311

In [14]:
len(set(genes324) | set(alias2canonicalmap.values()))

324

In [15]:
def convert2canonical(gene):
    """Return the canonical panel name for ``gene``, or NaN when there is none.

    A gene that is already in ``genes324`` is returned unchanged. Otherwise it
    is looked up as an alias in ``alias2canonicalmap``. Anything that matches
    neither yields ``np.nan`` so that pandas treats it as missing.

    NOTE(review): both lookups are case-sensitive, while the keys of
    ``alias2canonicalmap`` are built upper-cased - confirm that the incoming
    gene symbols are always upper-case.
    """
    if gene in genes324:  # already a canonical name
        return gene
    # Not canonical: translate via the alias map. ``np.nan`` is used because the
    # ``np.NaN`` spelling was removed in NumPy 2.0.
    return alias2canonicalmap.get(gene, np.nan)
        

### Cell Lines

In [16]:
# Fold-0 cell-line splits of every experiment/setting, read in a fixed order.
# NOTE: pickle.load can execute arbitrary code - these files are assumed to be trusted.
cl_fold0_splits = []
for split_dir in (expt1A_dir, expt1B_dir, expt2A_dir, expt2B_dir):
    with open(split_dir + "cell_lines_fold0.pkl", "rb") as fh:
        cl_fold0_splits.append(pickle.load(fh))

exp1A_cl_fold0, exp1B_cl_fold0, exp2A_cl_fold0, exp2B_cl_fold0 = cl_fold0_splits

In [17]:
cell_line_data = druid_data_dir + "CCLE_23Q4" # to load raw mutations from

In [18]:
cl_mutations = pd.read_csv(cell_line_data + "/patient_gene_alteration(mutation).csv")
cl_mutations

Unnamed: 0,depmap_id,gene,alteration
0,PR-sxFiuq,SAMD11,L76V
1,PR-DNEoiz,SAMD11,P107S
2,PR-2ei6MD,SAMD11,E160K
3,PR-CYz5sB,SAMD11,A218V
4,PR-xcsbEI,SAMD11,N285S
...,...,...,...
885436,PR-MX9ndc,KDM5D,R68H
885437,PR-AiAKPa,EIF1AY,D83Y
885438,PR-MX9ndc,RPS4Y2,T115A
885439,PR-Bs4EcD,RPS4Y2,P152S


In [19]:
# Translate every reported gene symbol to its canonical panel name (NaN when it has none).
cl_mutations["canonical_gene_name"] = cl_mutations["gene"].map(convert2canonical)

In [20]:
cl_mutations[cl_mutations.canonical_gene_name.astype(str) != "nan"]

Unnamed: 0,depmap_id,gene,alteration,canonical_gene_name
449,PR-rsoNmY,TNFRSF14,G68R,TNFRSF14
450,PR-yDgpga,TNFRSF14,G68R,TNFRSF14
451,PR-sgXEkc,TNFRSF14,G89C,TNFRSF14
452,PR-kRqGcx,TNFRSF14,A140S,TNFRSF14
453,PR-ZhEuUF,TNFRSF14,R149M,TNFRSF14
...,...,...,...,...
879891,PR-6Ybf3z,BCORL1,P1755QfsTer20,BCORL1
879892,PR-81oclJ,BCORL1,P1755QfsTer20,BCORL1
879893,PR-Qvs2q6,BCORL1,P1755QfsTer20,BCORL1
879894,PR-EaZDJD,BCORL1,E1767K,BCORL1


In [21]:
len(exp1A_cl_fold0["train"]["CISPLATIN"]["sample_id"].unique())

694

In [22]:
exp1A_cl_fold0["train"]["CISPLATIN"]

Unnamed: 0,sample_id,drug_name,auc,ic50,drug_category,response_label
690,PR-8EE2ka,CISPLATIN,0.776032,0.285054,1,1
747,PR-9ByeKo,CISPLATIN,0.779539,0.211263,1,1
21,PR-1ofDIZ,CISPLATIN,0.921701,1.621675,1,1
38,PR-kSuyzj,CISPLATIN,0.935353,2.934207,1,1
522,PR-EvLXSQ,CISPLATIN,0.981732,4.660050,1,0
...,...,...,...,...,...,...
539,PR-UOveiJ,CISPLATIN,0.920806,2.967637,1,1
755,PR-sQS7tJ,CISPLATIN,0.967626,5.570299,1,0
462,PR-O1Dni4,CISPLATIN,0.967654,4.333717,1,0
17,PR-kuCd2G,CISPLATIN,0.891760,2.305190,1,1


In [23]:
# cell lines without alterations in genes_324 after mapping to canonical mapping

# The cell lines that DO carry a mutation in a panel gene do not depend on the
# split being inspected, so build that set once instead of once per drug and split.
# (convert2canonical yields either a string or NaN, so notna() selects the mapped rows.)
cl_with_panel_mutation = set(
    cl_mutations.loc[cl_mutations["canonical_gene_name"].notna(), "depmap_id"]
)

# (printed header, fold-0 splits, whether each split is a dict of per-key frames)
cl_experiments = [
    ("Experiment 1A", exp1A_cl_fold0, True),
    ("Experiment 1B", exp1B_cl_fold0, True),
    ("Fold 2A", exp2A_cl_fold0, False),
    ("Experiment 2B", exp2B_cl_fold0, True),
]

full_set = set()
for div in ["train", "val", "test"]:
    for header, fold_splits, keyed in cl_experiments:
        print(header)
        # Setting 2A holds a single frame per split; the others hold a dict of frames.
        frames = fold_splits[div].items() if keyed else [(None, fold_splits[div])]
        for k, frame in frames:
            print(div, end= ": ")
            if keyed:
                print(k, end = " -- ")
            # Samples of this split with no mutation in any panel gene.
            missing = set(frame["sample_id"]) - cl_with_panel_mutation
            print(missing)
            full_set |= missing

Experiment 1A
train: BUPARLISIB -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL', 'PR-r3HxM0'}
train: CISPLATIN -- {'PR-pYHfy7', 'PR-r3HxM0'}
train: FLUOROURACIL -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-r3HxM0'}
train: GEMCITABINE -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL'}
train: PACLITAXEL -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL', 'PR-r3HxM0'}
train: SORAFENIB -- {'PR-pYHfy7', 'PR-XcC2fL', 'PR-r3HxM0'}
train: TEMOZOLOMIDE -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL'}
Experiment 1B
train: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- {'PR-pYHfy7', 'PR-r3HxM0'}
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- {'PR-pYHfy7', 'PR-r3HxM0'}
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-r3HxM0'}
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL'}
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL', 'PR-r3HxM0'}
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- {'PR-pYHfy7', 'PR-z1ZBpL', 'PR-XcC2fL'}
Fold 2A
train: {'PR-pYHf

In [24]:
full_set

{'PR-XcC2fL', 'PR-pYHfy7', 'PR-r3HxM0', 'PR-z1ZBpL'}

In [25]:
cl_mutations[cl_mutations.depmap_id.isin(full_set)]

Unnamed: 0,depmap_id,gene,alteration,canonical_gene_name
349,PR-pYHfy7,CFAP74,T583P,
9557,PR-pYHfy7,RFX5,R335P,
11113,PR-pYHfy7,CD5L,K154N,
12865,PR-pYHfy7,ASTN1,Y803C,
31091,PR-pYHfy7,STIP1,E208K,
...,...,...,...,...
860601,PR-XcC2fL,MXRA5,A1655T,
864540,PR-XcC2fL,FAM47B,R4L,
869326,PR-z1ZBpL,FGD1,T961P,
871297,PR-r3HxM0,GDPD2,M128I,


In [26]:
cl_mutations_df = pd.read_csv(cell_line_data + "/patient_raw_mutation(gene_324).csv", index_col = 0)
cl_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PR-sxFiuq,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PR-DNEoiz,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
PR-2ei6MD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PR-CYz5sB,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1
PR-xcsbEI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PR-QnZ5wx,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PR-YWsnzi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PR-CiXXaA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PR-v6GkOV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
cl_mutations_df.loc[list(full_set)].sum(axis = 1)

patient_id
PR-z1ZBpL    0
PR-r3HxM0    0
PR-pYHfy7    0
PR-XcC2fL    0
dtype: int64

### Patients TCGA

In [28]:
# Fold-0 patient splits of every experiment/setting, read in a fixed order.
# NOTE: pickle.load can execute arbitrary code - these files are assumed to be trusted.
patient_fold0_splits = []
for split_dir in (expt1A_dir, expt1B_dir, expt2A_dir, expt2B_dir):
    with open(split_dir + "patients_fold0.pkl", "rb") as fh:
        patient_fold0_splits.append(pickle.load(fh))

exp1A_p_fold0, exp1B_p_fold0, exp2A_p_fold0, exp2B_p_fold0 = patient_fold0_splits

In [29]:
tcga_mutations = pd.read_csv("/data/druid_data/Tcga/patient_gene_alteration(mutation).csv")
tcga_mutations

Unnamed: 0,patient_id,gene,alteration
0,TCGA-50-5931,CAMTA1,V870E
1,TCGA-50-5931,CATSPER4,P365=
2,TCGA-50-5931,KDF1,I243T
3,TCGA-50-5931,CSMD2,T417S
4,TCGA-50-5931,SFPQ,G647C
...,...,...,...
3093849,TCGA-YD-A9TA,CNGA2,G303G
3093850,TCGA-YD-A9TA,MAGEA12,R243R
3093851,TCGA-YD-A9TA,ZNF275,L224L
3093852,TCGA-YD-A9TA,L1CAM,P279P


In [30]:
len(tcga_mutations.patient_id.unique())

10173

In [31]:
moores_mutations = pd.read_csv("/data/druid_data/Moores/patient_gene_alteration(mutation).csv")
moores_mutations

Unnamed: 0,patient_id,gene,alteration
0,1,PTEN,splice site 493-1 G>A
1,2,TP53,P151A
2,3,ESR1,Y537S
3,4,PTEN,I67K
4,4,CTNNB1,T257I
...,...,...,...
220,84,GATA3,G335fs*18
221,85,TP53,H168R
222,85,GATA3,N332fs*21
223,86,MLL2,A4571T


In [32]:
cbio_hcc_mutations = pd.read_csv("/data/druid_data/CBIO/hcc_mskimpact_2018/patient_gene_alteration(mutation).csv")
cbio_hcc_mutations

Unnamed: 0,patient_id,gene,alteration
0,P-0005038-T02-IM6,TNFRSF14,Q242R
1,P-0005038-T02-IM6,JAK1,S729C
2,P-0005038-T02-IM6,MEN1,X224_splice
3,P-0005038-T02-IM6,ALK,E717K
4,P-0015203-T01-IM6,ZRSR2,C172S
...,...,...,...
531,P-0012182-T01-IM5,NEGR1,Q8L
532,P-0012182-T01-IM5,SETD2,S2479A
533,P-0012182-T01-IM5,POLE,V544M
534,P-0012182-T01-IM5,AXIN1,E291*


In [33]:
cbio_brca_mutations = pd.read_csv("/data/druid_data/CBIO/brca_mskcc_2019/patient_gene_alteration(mutation).csv")
cbio_brca_mutations

Unnamed: 0,patient_id,gene,alteration
0,s_DS_bkm_077_T,VTCN1,S192L
1,s_DS_bkm_078_T2,NOTCH2,D1582N
2,s_DS_bkm_078_T1,NOTCH2,D1582N
3,s_DS_bkm_074_T,NOTCH2,T1303P
4,s_DS_bkm_064_T2,NOTCH2,P6Rfs*27
...,...,...,...
653,s_DS_bkm_058_T,NCOR1,A750V
654,s_DS_bkm_058_T,BCOR,N193T
655,s_DS_bkm_059_T,SF3B1,I641V
656,s_DS_bkm_059_T,ESR1,L536R


In [34]:
patients_combined = pd.concat([tcga_mutations, moores_mutations, cbio_brca_mutations, cbio_hcc_mutations], axis = 0, ignore_index=True)
patients_combined

Unnamed: 0,patient_id,gene,alteration
0,TCGA-50-5931,CAMTA1,V870E
1,TCGA-50-5931,CATSPER4,P365=
2,TCGA-50-5931,KDF1,I243T
3,TCGA-50-5931,CSMD2,T417S
4,TCGA-50-5931,SFPQ,G647C
...,...,...,...
3095268,P-0012182-T01-IM5,NEGR1,Q8L
3095269,P-0012182-T01-IM5,SETD2,S2479A
3095270,P-0012182-T01-IM5,POLE,V544M
3095271,P-0012182-T01-IM5,AXIN1,E291*


In [35]:
# Translate every reported gene symbol to its canonical panel name (NaN when it has none).
patients_combined["canonical_gene_name"] = patients_combined["gene"].map(convert2canonical)

In [36]:
patients_combined[patients_combined.canonical_gene_name.astype(str) == "nan"]

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
0,TCGA-50-5931,CAMTA1,V870E,
1,TCGA-50-5931,CATSPER4,P365=,
2,TCGA-50-5931,KDF1,I243T,
3,TCGA-50-5931,CSMD2,T417S,
4,TCGA-50-5931,SFPQ,G647C,
...,...,...,...,...
3095258,P-0010226-T01-IM5,ERCC5,S1067A,
3095260,P-0014167-T01-IM5,ARID2,V1649L,
3095263,P-0014167-T01-IM5,ERCC5,R959K,
3095264,P-0014167-T01-IM5,NCOR1,G1348V,


In [37]:
patients_combined = patients_combined[patients_combined.canonical_gene_name.isin(genes324)]
patients_combined

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
35,TCGA-50-5931,DNMT3A,V258M,DNMT3A
38,TCGA-50-5931,MSH2,I356V,MSH2
86,TCGA-50-5931,RICTOR,L42F,RICTOR
162,TCGA-50-5931,NOTCH1,D297G,NOTCH1
197,TCGA-50-5931,MLL2,E668*,MLL2
...,...,...,...,...
3095267,P-0014167-T01-IM5,KDR,R720W,KDR
3095269,P-0012182-T01-IM5,SETD2,S2479A,SETD2
3095270,P-0012182-T01-IM5,POLE,V544M,POLE
3095271,P-0012182-T01-IM5,AXIN1,E291*,AXIN1


In [38]:
# patients without alterations in genes_324 after mapping to canonical mapping

# patients_combined is the same for every split, so the set of patients present
# in it is built once instead of once per drug and split.
patients_with_panel_mutation = set(patients_combined["patient_id"])

# (printed header, fold-0 splits, whether each split is a dict of per-key frames)
patient_experiments = [
    ("Experiment 1A", exp1A_p_fold0, True),
    ("Experiment 1B", exp1B_p_fold0, True),
    ("Fold 2A", exp2A_p_fold0, False),
    ("Experiment 2B", exp2B_p_fold0, True),
]

full_set_p = set()
for div in ["train", "val", "test"]:
    for header, fold_splits, keyed in patient_experiments:
        print(header)
        # Setting 2A holds a single frame per split; the others hold a dict of frames.
        frames = fold_splits[div].items() if keyed else [(None, fold_splits[div])]
        for k, frame in frames:
            print(div, end= ": ")
            if keyed:
                print(k, end = " -- ")
            # Samples of this split that never appear in patients_combined.
            missing = set(frame["sample_id"]) - patients_with_panel_mutation
            print(missing)
            full_set_p |= missing

Experiment 1A
train: BUPARLISIB -- set()
train: CISPLATIN -- {'TCGA-LK-A4O0', 'TCGA-QF-A5YT'}
train: FLUOROURACIL -- {'TCGA-3A-A9IX'}
train: GEMCITABINE -- {'TCGA-Q3-A5QY', 'TCGA-F2-7273', 'TCGA-F2-6880'}
train: PACLITAXEL -- {'TCGA-A2-A0EP'}
train: SORAFENIB -- {'TCGA-2Y-A9GV', 'P-0013161-T01-IM5'}
train: TEMOZOLOMIDE -- {'TCGA-TM-A84B', 'TCGA-HW-A5KK'}
Experiment 1B
train: ('CISPLATIN', 'TCGA-CESC', 'TCGA') -- set()
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA') -- set()
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA') -- set()
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA') -- {'TCGA-Q3-A5QY', 'TCGA-3A-A9IX', 'TCGA-F2-7273', 'TCGA-F2-6880'}
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') -- {'TCGA-A2-A0EP'}
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA') -- {'TCGA-TM-A84B', 'TCGA-HW-A5KK'}
Fold 2A
train: {'TCGA-Q3-A5QY', 'TCGA-UD-AAC4', 'TCGA-IW-A3M4', 8, 'TCGA-ZG-A8QZ', 'TCGA-F2-7273', 16, 'TCGA-TM-A84B', 'TCGA-XF-A9SH', 'TCGA-F2-6880', 'TCGA-HW-A5KK', 48, 'TCGA-F2-A44H', 'TCGA-HZ-7920', 'TCGA-LK-A4O0',

In [39]:
full_set_p

{16,
 48,
 8,
 'P-0013161-T01-IM5',
 'TCGA-2Y-A9GV',
 'TCGA-3A-A9IX',
 'TCGA-3H-AB3O',
 'TCGA-A2-A0EP',
 'TCGA-A2-A3XW',
 'TCGA-AG-3584',
 'TCGA-CE-A481',
 'TCGA-DU-A5TY',
 'TCGA-F2-6880',
 'TCGA-F2-7273',
 'TCGA-F2-A44H',
 'TCGA-G7-7501',
 'TCGA-GM-A2DA',
 'TCGA-HW-A5KK',
 'TCGA-HZ-7920',
 'TCGA-IR-A3L7',
 'TCGA-IW-A3M4',
 'TCGA-LK-A4O0',
 'TCGA-LL-A7SZ',
 'TCGA-MZ-A6I9',
 'TCGA-Q3-A5QY',
 'TCGA-QF-A5YT',
 'TCGA-QQ-A8VD',
 'TCGA-SX-A7SM',
 'TCGA-TM-A84B',
 'TCGA-UD-AAC4',
 'TCGA-US-A774',
 'TCGA-VS-A9UU',
 'TCGA-XF-A9SH',
 'TCGA-XJ-A9DI',
 'TCGA-YU-AA61',
 'TCGA-ZG-A8QZ'}

In [40]:
tcga_mutations_df = pd.read_csv("/data/druid_data/Tcga/patient_raw_mutation(gene_324).csv", index_col = 0)
tcga_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-50-5931,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-LN-A7HV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-EK-A3GM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-P6-A5OF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-HC-7748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-44-2659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-35-3615,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-97-A4M2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-55-8505,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
moores_mutations_df = pd.read_csv("/data/druid_data/Moores/patient_raw_mutation(gene_324).csv", index_col = 0)
moores_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
cbio_hcc_mutations_df = pd.read_csv("/data/druid_data/CBIO/hcc_mskimpact_2018/patient_raw_mutation(gene_324).csv", index_col = 0)
cbio_hcc_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P-0005038-T02-IM6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0015203-T01-IM6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0015581-T01-IM6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0015687-T01-IM6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0015853-T01-IM6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P-0010119-T01-IM5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0013312-T01-IM5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0010226-T01-IM5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0014167-T01-IM5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
cbio_brca_mutations_df = pd.read_csv("/data/druid_data/CBIO/brca_mskcc_2019/patient_raw_mutation(gene_324).csv", index_col = 0)
cbio_brca_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s_DS_bkm_077_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_078_T2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_078_T1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_074_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_064_T2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_055_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_056_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_057_T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_058_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
patient_mutations_df = pd.concat([tcga_mutations_df, moores_mutations_df, cbio_hcc_mutations_df, cbio_brca_mutations_df])
patient_mutations_df

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-50-5931,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-LN-A7HV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-EK-A3GM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-P6-A5OF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-HC-7748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_055_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_056_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_057_T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_058_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
patient_mutations_df[patient_mutations_df.sum(axis =1) == 0]

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-HC-7748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-VN-A943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-OR-A5JI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-S6-A8JX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-DA-A1IB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0013075-T01-IM5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P-0013161-T01-IM5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
patient_mutations_df[patient_mutations_df.sum(axis = 1)!=0]

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-50-5931,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-LN-A7HV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-EK-A3GM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-P6-A5OF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCGA-B8-5550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s_DS_bkm_055_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_056_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_057_T,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s_DS_bkm_058_T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
patient_mutations_df[patient_mutations_df.index.isin(full_set_p)].sum(axis = 1)

patient_id
TCGA-VS-A9UU         0
TCGA-F2-7273         0
TCGA-HZ-7920         0
TCGA-ZG-A8QZ         0
TCGA-AG-3584         0
TCGA-UD-AAC4         0
TCGA-Q3-A5QY         0
TCGA-CE-A481         0
TCGA-SX-A7SM         0
TCGA-2Y-A9GV         0
TCGA-IW-A3M4         0
TCGA-LL-A7SZ         0
TCGA-US-A774         0
TCGA-GM-A2DA         0
TCGA-HW-A5KK         0
TCGA-MZ-A6I9         0
TCGA-F2-A44H         0
TCGA-DU-A5TY         0
TCGA-QF-A5YT         0
TCGA-QQ-A8VD         0
TCGA-XF-A9SH         0
TCGA-3H-AB3O         0
TCGA-A2-A3XW         0
TCGA-F2-6880         0
TCGA-IR-A3L7         0
TCGA-XJ-A9DI         0
TCGA-YU-AA61         0
TCGA-G7-7501         0
TCGA-LK-A4O0         0
TCGA-3A-A9IX         0
TCGA-TM-A84B         0
TCGA-A2-A0EP         0
8                    0
16                   0
48                   0
P-0013161-T01-IM5    0
dtype: int64

In [48]:
patients_combined[patients_combined.patient_id == "TCGA-AA-A02R"]

Unnamed: 0,patient_id,gene,alteration,canonical_gene_name
1801300,TCGA-AA-A02R,SPEN,I1159Lfs*28,SPEN
1801308,TCGA-AA-A02R,ARID1A,D1850Tfs*33,ARID1A
1801314,TCGA-AA-A02R,CSF3R,S469Afs*22,CSF3R
1801391,TCGA-AA-A02R,PARP1,S463=,PARP1
1801420,TCGA-AA-A02R,MSH6,R1024W,MSH6
...,...,...,...,...
2385087,TCGA-AA-A02R,CD22,S22S,CD22
2385134,TCGA-AA-A02R,RBM10,R82R,RBM10
2385135,TCGA-AA-A02R,RBM10,R83R,RBM10
2385137,TCGA-AA-A02R,KDM5C,A1078A,KDM5C


In [49]:
patient_mutations_df.loc["TCGA-VS-A9V0"]["TP53"]

1

In [50]:
tcga_mutations_df.loc["TCGA-VS-A9V0"].sum()

5

In [51]:
# patients without mutations in the 324 genes
full_set_p

{16,
 48,
 8,
 'P-0013161-T01-IM5',
 'TCGA-2Y-A9GV',
 'TCGA-3A-A9IX',
 'TCGA-3H-AB3O',
 'TCGA-A2-A0EP',
 'TCGA-A2-A3XW',
 'TCGA-AG-3584',
 'TCGA-CE-A481',
 'TCGA-DU-A5TY',
 'TCGA-F2-6880',
 'TCGA-F2-7273',
 'TCGA-F2-A44H',
 'TCGA-G7-7501',
 'TCGA-GM-A2DA',
 'TCGA-HW-A5KK',
 'TCGA-HZ-7920',
 'TCGA-IR-A3L7',
 'TCGA-IW-A3M4',
 'TCGA-LK-A4O0',
 'TCGA-LL-A7SZ',
 'TCGA-MZ-A6I9',
 'TCGA-Q3-A5QY',
 'TCGA-QF-A5YT',
 'TCGA-QQ-A8VD',
 'TCGA-SX-A7SM',
 'TCGA-TM-A84B',
 'TCGA-UD-AAC4',
 'TCGA-US-A774',
 'TCGA-VS-A9UU',
 'TCGA-XF-A9SH',
 'TCGA-XJ-A9DI',
 'TCGA-YU-AA61',
 'TCGA-ZG-A8QZ'}

In [52]:
# cell lines without mutations in the 324 genes
full_set

{'PR-XcC2fL', 'PR-pYHfy7', 'PR-r3HxM0', 'PR-z1ZBpL'}

#### Remove these patients/cell lines from existing splits

In [53]:
# Location of the existing splits that are about to be filtered, one directory per experiment/setting.
processed_file_dir = "/data/ajayago/papers_data/systematic_assessment/processed/"
expt1A_file_dir = f"{processed_file_dir}Experiment1/SettingA/"
expt1B_file_dir = f"{processed_file_dir}Experiment1/SettingB/"
expt2A_file_dir = f"{processed_file_dir}Experiment2/SettingA/"
expt2B_file_dir = f"{processed_file_dir}Experiment2/SettingB/"

In [54]:
# Patients
# Load folds 0-2 of the patient splits for Experiments 1A, 1B, 2A and 2B.
# NOTE: pickle.load can execute arbitrary code - these files are assumed to be trusted.
patient_folds = {}
for tag, fold_dir in (
    ("1A", expt1A_file_dir),
    ("1B", expt1B_file_dir),
    ("2A", expt2A_file_dir),
    ("2B", expt2B_file_dir),
):
    for fold in range(3):
        with open(f"{fold_dir}/patients_fold{fold}.pkl", "rb") as fh:
            patient_folds[(tag, fold)] = pickle.load(fh)

# Experiment 1A
setting1A_split0, setting1A_split1, setting1A_split2 = (patient_folds[("1A", n)] for n in range(3))

# Experiment 1B
setting1B_split0, setting1B_split1, setting1B_split2 = (patient_folds[("1B", n)] for n in range(3))

# Experiment 2A
setting2A_split0, setting2A_split1, setting2A_split2 = (patient_folds[("2A", n)] for n in range(3))

# Experiment 2B
setting2B_split0, setting2B_split1, setting2B_split2 = (patient_folds[("2B", n)] for n in range(3))

In [55]:
# Experiment 1A
# Drop every patient listed in full_set_p from each per-key frame of each split,
# reporting the frame shape before and after, and store the filtered frame back.
for i, setting in enumerate([setting1A_split0, setting1A_split1, setting1A_split2]):
    print(f"Experiment 1A split {i}")
    for div in ("train", "val", "test"):
        for k in list(setting[div]):
            unfiltered = setting[div][k]
            print(f"{div}: {k}")
            print(f"Size before = {unfiltered.shape}")
            keep_mask = ~unfiltered["sample_id"].isin(full_set_p)
            filtered = unfiltered[keep_mask]
            print(f"Size after = {filtered.shape}")
            setting[div][k] = filtered  # replace the stored frame with the filtered one

Experiment 1A split 0
train: BUPARLISIB
Size before = (16, 5)
Size after = (16, 5)
train: CISPLATIN
Size before = (66, 5)
Size after = (64, 5)
train: FLUOROURACIL
Size before = (34, 5)
Size after = (33, 5)
train: GEMCITABINE
Size before = (40, 5)
Size after = (37, 5)
train: PACLITAXEL
Size before = (27, 5)
Size after = (26, 5)
train: SORAFENIB
Size before = (40, 5)
Size after = (38, 5)
train: TEMOZOLOMIDE
Size before = (60, 5)
Size after = (58, 5)
val: BUPARLISIB
Size before = (2, 5)
Size after = (2, 5)
val: CISPLATIN
Size before = (8, 5)
Size after = (8, 5)
val: FLUOROURACIL
Size before = (4, 5)
Size after = (3, 5)
val: GEMCITABINE
Size before = (5, 5)
Size after = (4, 5)
val: PACLITAXEL
Size before = (3, 5)
Size after = (3, 5)
val: SORAFENIB
Size before = (5, 5)
Size after = (5, 5)
val: TEMOZOLOMIDE
Size before = (8, 5)
Size after = (8, 5)
test: BUPARLISIB
Size before = (9, 5)
Size after = (9, 5)
test: CISPLATIN
Size before = (30, 5)
Size after = (27, 5)
test: FLUOROURACIL
Size befor

In [56]:
# Spot-check: shape of the CISPLATIN train frame of Experiment 1A split 0 after the filter above.
setting1A_split0["train"]["CISPLATIN"].shape

(64, 5)

In [57]:
# Experiment 1B
# For every split and every patient frame (keyed by a tuple such as
# ('CISPLATIN', 'TCGA-CESC', 'TCGA')), drop the rows whose `sample_id` is in
# `full_set_p` and print the frame shape before and after.
# NOTE(review): `full_set_p` is defined earlier in the notebook; presumably it
# holds the patient IDs without mutations in the 324 genes - confirm.
for i, setting in enumerate([setting1B_split0, setting1B_split1, setting1B_split2]):
    print(f"Experiment 1B split {i}")
    for div in ["train", "val", "test"]:
        # Only existing keys are reassigned, so updating the dict while
        # iterating over .items() does not change its size.
        for k, v in setting[div].items():
            print(f"{div}: {k}")
            print(f"Size before = {v.shape}")
            # Store an explicit copy: a later cell drops rows in place from the
            # train frames, and doing that on a mask-derived slice may make
            # pandas emit SettingWithCopyWarning.
            df = v[~v.sample_id.isin(full_set_p)].copy()
            print(f"Size after = {df.shape}")
            setting[div][k] = df # reassign df

Experiment 1B split 0
train: ('CISPLATIN', 'TCGA-CESC', 'TCGA')
Size before = (29, 5)
Size after = (29, 5)
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Size before = (22, 5)
Size after = (22, 5)
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Size before = (19, 5)
Size after = (19, 5)
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Size before = (27, 5)
Size after = (23, 5)
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Size before = (16, 5)
Size after = (15, 5)
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')
Size before = (54, 5)
Size after = (52, 5)
val: ('CISPLATIN', 'TCGA-CESC', 'TCGA')
Size before = (4, 5)
Size after = (4, 5)
val: ('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Size before = (3, 5)
Size after = (3, 5)
val: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Size before = (3, 5)
Size after = (3, 5)
val: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Size before = (3, 5)
Size after = (3, 5)
val: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Size before = (3, 5)
Size after = (3, 5)
val: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')
Size be

In [58]:
# Spot-check: shape of the ('CISPLATIN', 'TCGA-CESC', 'TCGA') train frame of Experiment 1B split 0 after the filter above.
setting1B_split0["train"][('CISPLATIN', 'TCGA-CESC', 'TCGA')].shape

(29, 5)

In [59]:
# Experiment 2A
# Each division of a split is a single frame here (no per-key dict): drop the
# rows whose `sample_id` is in `full_set_p` and report the shape change.
for i, setting in enumerate((setting2A_split0, setting2A_split1, setting2A_split2)):
    print(f"Experiment 2A split {i}")
    for div in ("train", "val", "test"):
        print(f"{div}")
        unfiltered = setting[div]
        print(f"Size before = {unfiltered.shape}")
        keep_mask = ~unfiltered.sample_id.isin(full_set_p)
        kept = unfiltered[keep_mask]
        print(f"Size after = {kept.shape}")
        setting[div] = kept  # store the filtered frame back into the split

Experiment 2A split 0
train
Size before = (520, 5)
Size after = (488, 5)
val
Size before = (58, 5)
Size after = (53, 5)
test
Size before = (123, 5)
Size after = (115, 5)
Experiment 2A split 1
train
Size before = (523, 5)
Size after = (488, 5)
val
Size before = (59, 5)
Size after = (54, 5)
test
Size before = (119, 5)
Size after = (114, 5)
Experiment 2A split 2
train
Size before = (526, 5)
Size after = (487, 5)
val
Size before = (59, 5)
Size after = (56, 5)
test
Size before = (116, 5)
Size after = (113, 5)


In [60]:
# Spot-check: shape of the train frame of Experiment 2A split 0 after the filter above.
setting2A_split0["train"].shape

(488, 5)

In [61]:
# Experiment 2B
# Per-key patient frames: drop the rows whose `sample_id` is in `full_set_p`
# and report the shape of each frame before and after.
for i, setting in enumerate((setting2B_split0, setting2B_split1, setting2B_split2)):
    print(f"Experiment 2B split {i}")
    for div in ("train", "val", "test"):
        per_key_frames = setting[div]
        for k in list(per_key_frames):
            unfiltered = per_key_frames[k]
            print(f"{div}: {k}")
            print(f"Size before = {unfiltered.shape}")
            kept = unfiltered[~unfiltered.sample_id.isin(full_set_p)]
            print(f"Size after = {kept.shape}")
            per_key_frames[k] = kept  # store the filtered frame back under the same key

Experiment 2B split 0
train: TCGA-BRCA
Size before = (80, 5)
Size after = (74, 5)
train: TCGA-CESC
Size before = (33, 5)
Size after = (33, 5)
train: TCGA-HNSC
Size before = (39, 5)
Size after = (39, 5)
train: TCGA-STAD
Size before = (38, 5)
Size after = (38, 5)
train: TCGA-PAAD
Size before = (39, 5)
Size after = (32, 5)
train: TCGA-LGG
Size before = (62, 5)
Size after = (60, 5)
val: TCGA-BRCA
Size before = (9, 5)
Size after = (8, 5)
val: TCGA-CESC
Size before = (4, 5)
Size after = (4, 5)
val: TCGA-HNSC
Size before = (5, 5)
Size after = (5, 5)
val: TCGA-STAD
Size before = (5, 5)
Size after = (5, 5)
val: TCGA-PAAD
Size before = (5, 5)
Size after = (4, 5)
val: TCGA-LGG
Size before = (7, 5)
Size after = (7, 5)
test: TCGA-BRCA
Size before = (19, 5)
Size after = (17, 5)
test: TCGA-CESC
Size before = (17, 5)
Size after = (15, 5)
test: TCGA-HNSC
Size before = (13, 5)
Size after = (12, 5)
test: TCGA-STAD
Size before = (12, 5)
Size after = (12, 5)
test: TCGA-PAAD
Size before = (16, 5)
Size after

In [62]:
# Spot-check: shape of the TCGA-BRCA train frame of Experiment 2B split 0 after the filter above.
setting2B_split0["train"]["TCGA-BRCA"].shape

(74, 5)

In [63]:
# Cell lines
# Load the fold-0 cell-line split of each experiment setting, in the order
# 1A, 1B, 2A, 2B, then bind each to its own variable.
_cell_line_splits = []
for split_dir in (expt1A_file_dir, expt1B_file_dir, expt2A_file_dir, expt2B_file_dir):
    with open(f"{split_dir}/cell_lines_fold0.pkl", "rb") as f:
        _cell_line_splits.append(pickle.load(f))

(
    setting1A_split0_cl,  # Experiment 1A
    setting1B_split0_cl,  # Experiment 1B
    setting2A_split0_cl,  # Experiment 2A
    setting2B_split0_cl,  # Experiment 2B
) = _cell_line_splits

In [64]:
# Experiment 1A
# Cell lines: drop the rows whose `sample_id` is in `full_set` from every
# per-drug frame and report the shape before and after.
for i, setting in enumerate([setting1A_split0_cl]):
    print(f"Experiment 1A split {i}")
    for div in ("train", "val", "test"):
        per_key_frames = setting[div]
        for k in list(per_key_frames):
            unfiltered = per_key_frames[k]
            print(f"{div}: {k}")
            print(f"Size before = {unfiltered.shape}")
            kept = unfiltered[~unfiltered.sample_id.isin(full_set)]
            print(f"Size after = {kept.shape}")
            per_key_frames[k] = kept  # store the filtered frame back under the same key

Experiment 1A split 0
train: BUPARLISIB
Size before = (864, 6)
Size after = (860, 6)
train: CISPLATIN
Size before = (694, 6)
Size after = (692, 6)
train: FLUOROURACIL
Size before = (871, 6)
Size after = (868, 6)
train: GEMCITABINE
Size before = (864, 6)
Size after = (861, 6)
train: PACLITAXEL
Size before = (864, 6)
Size after = (860, 6)
train: SORAFENIB
Size before = (864, 6)
Size after = (861, 6)
train: TEMOZOLOMIDE
Size before = (867, 6)
Size after = (864, 6)
val: BUPARLISIB
Size before = (96, 6)
Size after = (96, 6)
val: CISPLATIN
Size before = (78, 6)
Size after = (78, 6)
val: FLUOROURACIL
Size before = (97, 6)
Size after = (96, 6)
val: GEMCITABINE
Size before = (96, 6)
Size after = (96, 6)
val: PACLITAXEL
Size before = (96, 6)
Size after = (96, 6)
val: SORAFENIB
Size before = (96, 6)
Size after = (96, 6)
val: TEMOZOLOMIDE
Size before = (97, 6)
Size after = (97, 6)
test: BUPARLISIB
Size before = (119, 6)
Size after = (119, 6)
test: CISPLATIN
Size before = (96, 6)
Size after = (95, 

In [65]:
# Experiment 1B
# Cell lines: drop the rows whose `sample_id` is in `full_set` from every
# frame (keyed by tuple) and report the shape before and after.
for i, setting in enumerate([setting1B_split0_cl]):
    print(f"Experiment 1B split {i}")
    for div in ("train", "val", "test"):
        per_key_frames = setting[div]
        for k in list(per_key_frames):
            unfiltered = per_key_frames[k]
            print(f"{div}: {k}")
            print(f"Size before = {unfiltered.shape}")
            kept = unfiltered[~unfiltered.sample_id.isin(full_set)]
            print(f"Size after = {kept.shape}")
            per_key_frames[k] = kept  # store the filtered frame back under the same key

Experiment 1B split 0
train: ('CISPLATIN', 'TCGA-CESC', 'TCGA')
Size before = (694, 6)
Size after = (692, 6)
train: ('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Size before = (694, 6)
Size after = (692, 6)
train: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Size before = (871, 6)
Size after = (868, 6)
train: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Size before = (864, 6)
Size after = (861, 6)
train: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Size before = (864, 6)
Size after = (860, 6)
train: ('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')
Size before = (867, 6)
Size after = (864, 6)
val: ('CISPLATIN', 'TCGA-CESC', 'TCGA')
Size before = (78, 6)
Size after = (78, 6)
val: ('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Size before = (78, 6)
Size after = (78, 6)
val: ('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Size before = (97, 6)
Size after = (96, 6)
val: ('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Size before = (96, 6)
Size after = (96, 6)
val: ('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Size before = (96, 6)
Size after = (96, 6)
val: ('TEMOZOLOMIDE', 'TCGA

In [66]:
# Experiment 2A
# Cell lines: each division is a single frame; drop the rows whose `sample_id`
# is in `full_set` and report the shape before and after.
for i, setting in enumerate([setting2A_split0_cl]):
    print(f"Experiment 2A split {i}")
    for div in ("train", "val", "test"):
        print(f"{div}:")
        unfiltered = setting[div]
        print(f"Size before = {unfiltered.shape}")
        keep_mask = ~unfiltered.sample_id.isin(full_set)
        kept = unfiltered[keep_mask]
        print(f"Size after = {kept.shape}")
        setting[div] = kept  # store the filtered frame back into the split

Experiment 2A split 0
train:
Size before = (156959, 6)
Size after = (156441, 6)
val:
Size before = (17440, 6)
Size after = (17371, 6)
test:
Size before = (21655, 6)
Size after = (21589, 6)


In [67]:
# Experiment 2B
# Cell lines: drop the rows whose `sample_id` is in `full_set` from every
# per-key frame and report the shape before and after.
for i, setting in enumerate([setting2B_split0_cl]):
    print(f"Experiment 2B split {i}")
    for div in ("train", "val", "test"):
        per_key_frames = setting[div]
        for k in list(per_key_frames):
            unfiltered = per_key_frames[k]
            print(f"{div}: {k}")
            print(f"Size before = {unfiltered.shape}")
            kept = unfiltered[~unfiltered.sample_id.isin(full_set)]
            print(f"Size after = {kept.shape}")
            per_key_frames[k] = kept  # store the filtered frame back under the same key

Experiment 2B split 0
train: TCGA-BRCA
Size before = (156959, 6)
Size after = (156441, 6)
train: TCGA-CESC
Size before = (156959, 6)
Size after = (156441, 6)
train: TCGA-HNSC
Size before = (156959, 6)
Size after = (156441, 6)
train: TCGA-STAD
Size before = (156959, 6)
Size after = (156441, 6)
train: TCGA-PAAD
Size before = (156959, 6)
Size after = (156441, 6)
train: TCGA-LGG
Size before = (156959, 6)
Size after = (156441, 6)
val: TCGA-BRCA
Size before = (17440, 6)
Size after = (17371, 6)
val: TCGA-CESC
Size before = (17440, 6)
Size after = (17371, 6)
val: TCGA-HNSC
Size before = (17440, 6)
Size after = (17371, 6)
val: TCGA-STAD
Size before = (17440, 6)
Size after = (17371, 6)
val: TCGA-PAAD
Size before = (17440, 6)
Size after = (17371, 6)
val: TCGA-LGG
Size before = (17440, 6)
Size after = (17371, 6)
test: TCGA-BRCA
Size before = (21655, 6)
Size after = (21589, 6)
test: TCGA-CESC
Size before = (21655, 6)
Size after = (21589, 6)
test: TCGA-HNSC
Size before = (21655, 6)
Size after = (215

#### Ensure both labels are present in validation split for patients

* Pearson and Spearman correlation coefficients evaluate to NaN when every label in the input array is identical (i.e. only one class is present).
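* Here, if a validation split contains samples of only a single class label, we take one sample of the opposite label from the corresponding train split and move it to the validation split. The move is made only when the train split has at least two samples of the opposite label, so that train keeps at least one of them.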
* This is done only for the patient datasets: cell lines are far more numerous and are mostly evaluated on AUC rather than on binary labels. Furthermore, the binary label for cell lines is based on the median AUC, which results in a 50-50 split of labels in train and test.

In [68]:
# Experiment 1A
# For every drug, make sure the patient validation frame holds both `recist`
# labels. If it holds only one, the last train row carrying a different label
# is moved from train to val, provided train has at least two such rows (so
# train keeps at least one). Cases where no move is possible are reported
# explicitly instead of being skipped silently.
for i, setting in enumerate([setting1A_split0, setting1A_split1, setting1A_split2]):
    print(f"Experiment 1A split {i}")
    for k in setting["val"].keys():
        print(k)
        val_df = setting["val"][k]
        train_df = setting["train"][k]
        num_unique_labels = val_df.recist.nunique()
        print(f"Before moving: {num_unique_labels} in val data and {train_df.recist.nunique()} in train")
        if num_unique_labels == 0:
            # Empty (or all-null) val frame: there is no reference label to balance against.
            print(f"WARNING: {k}: val split has no labelled samples; nothing was moved")
        elif num_unique_labels == 1: # only 1 label available
            label_available = val_df.recist.dropna().iloc[0]
            # take samples of other label from train split
            train_data_df = train_df[train_df.recist != label_available]
            # move the last such sample only if at least two are available in train
            if len(train_data_df) >= 2:
                row2move_from_train2val = train_data_df.iloc[[-1]]
                setting["val"][k] = pd.concat([val_df, row2move_from_train2val]) # add to the validation data
                # Remove from train by rebinding (no in-place mutation of a mask-derived frame).
                # NOTE(review): drop is label-based - assumes index labels are unique within the train frame.
                setting["train"][k] = train_df.drop(index=row2move_from_train2val.index)
            else:
                print(f"WARNING: {k}: val split still has a single label; train has only {len(train_data_df)} sample(s) of the other label, so none was moved")

            # check that both labels are present in train and val splits after moving
            print(f"After moving: {setting['val'][k].recist.nunique()} in val and {setting['train'][k].recist.nunique()} in train")

Experiment 1A split 0
BUPARLISIB
Before moving: 2 in val data and 2 in train
CISPLATIN
Before moving: 2 in val data and 2 in train
FLUOROURACIL
Before moving: 2 in val data and 2 in train
GEMCITABINE
Before moving: 2 in val data and 2 in train
PACLITAXEL
Before moving: 2 in val data and 2 in train
SORAFENIB
Before moving: 2 in val data and 2 in train
TEMOZOLOMIDE
Before moving: 2 in val data and 2 in train
Experiment 1A split 1
BUPARLISIB
Before moving: 2 in val data and 1 in train
CISPLATIN
Before moving: 2 in val data and 2 in train
FLUOROURACIL
Before moving: 2 in val data and 2 in train
GEMCITABINE
Before moving: 2 in val data and 2 in train
PACLITAXEL
Before moving: 2 in val data and 2 in train
SORAFENIB
Before moving: 2 in val data and 2 in train
TEMOZOLOMIDE
Before moving: 2 in val data and 2 in train
Experiment 1A split 2
BUPARLISIB
Before moving: 1 in val data and 2 in train
After moving: 1 in val and 2 in train
CISPLATIN
Before moving: 2 in val data and 2 in train
FLUOROURACI

In [69]:
# Experiment 1B
# For every key, make sure the patient validation frame holds both `recist`
# labels. If it holds only one, the last train row carrying a different label
# is moved from train to val, provided train has at least two such rows (so
# train keeps at least one). Cases where no move is possible are reported
# explicitly instead of being skipped silently.
for i, setting in enumerate([setting1B_split0, setting1B_split1, setting1B_split2]):
    print(f"Experiment 1B split {i}")
    for k in setting["val"].keys():
        print(k)
        val_df = setting["val"][k]
        train_df = setting["train"][k]
        num_unique_labels = val_df.recist.nunique()
        print(f"Before moving: {num_unique_labels} in val data and {train_df.recist.nunique()} in train")
        if num_unique_labels == 0:
            # Empty (or all-null) val frame: there is no reference label to balance against.
            print(f"WARNING: {k}: val split has no labelled samples; nothing was moved")
        elif num_unique_labels == 1: # only 1 label available
            label_available = val_df.recist.dropna().iloc[0]
            # take samples of other label from train split
            train_data_df = train_df[train_df.recist != label_available]
            # move the last such sample only if at least two are available in train
            if len(train_data_df) >= 2:
                row2move_from_train2val = train_data_df.iloc[[-1]]
                setting["val"][k] = pd.concat([val_df, row2move_from_train2val]) # add to the validation data
                # Remove from train by rebinding (no in-place mutation of a mask-derived frame).
                # NOTE(review): drop is label-based - assumes index labels are unique within the train frame.
                setting["train"][k] = train_df.drop(index=row2move_from_train2val.index)
            else:
                print(f"WARNING: {k}: val split still has a single label; train has only {len(train_data_df)} sample(s) of the other label, so none was moved")

            # check that both labels are present in train and val splits after moving
            print(f"After moving: {setting['val'][k].recist.nunique()} in val and {setting['train'][k].recist.nunique()} in train")

Experiment 1B split 0
('CISPLATIN', 'TCGA-CESC', 'TCGA')
Before moving: 2 in val data and 2 in train
('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Before moving: 2 in val data and 2 in train
('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Before moving: 2 in val data and 2 in train
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA')
Before moving: 2 in val data and 2 in train
Experiment 1B split 1
('CISPLATIN', 'TCGA-CESC', 'TCGA')
Before moving: 2 in val data and 2 in train
('CISPLATIN', 'TCGA-HNSC', 'TCGA')
Before moving: 2 in val data and 2 in train
('FLUOROURACIL', 'TCGA-STAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('GEMCITABINE', 'TCGA-PAAD', 'TCGA')
Before moving: 2 in val data and 2 in train
('PACLITAXEL', 'TCGA-BRCA', 'TCGA')
Before moving: 1 in val data and 2 in train
After moving: 2 in val and 2 in train
('TEMOZOLOMIDE', 'TCGA-LGG', 'TCGA'

In [70]:
# Inspect the ('PACLITAXEL', 'TCGA-BRCA', 'TCGA') validation frame of Experiment 1B split 2 after the label check above.
setting1B_split2["val"][('PACLITAXEL', 'TCGA-BRCA', 'TCGA')]

Unnamed: 0,sample_id,drug_name,recist,mappedProject,dataset_name
0,TCGA-A2-A4S2,PACLITAXEL,1,TCGA-BRCA,TCGA
25,TCGA-GM-A2DB,PACLITAXEL,1,TCGA-BRCA,TCGA
28,TCGA-EW-A2FR,PACLITAXEL,0,TCGA-BRCA,TCGA


In [71]:
# Experiment 2A - nothing to be done
# Report-only check: print how many distinct `recist` labels the val and train
# frames of each split contain. No rows are moved between train and val here.
for i, setting in enumerate((setting2A_split0, setting2A_split1, setting2A_split2)):
    print(f"Experiment 2A split {i}")
    num_unique_labels = len(setting["val"].recist.value_counts())
    num_train_labels = len(setting["train"].recist.value_counts())
    print(f"Before moving: {num_unique_labels} in val data and {num_train_labels} in train")

Experiment 2A split 0
Before moving: 2 in val data and 2 in train
Experiment 2A split 1
Before moving: 2 in val data and 2 in train
Experiment 2A split 2
Before moving: 2 in val data and 2 in train


In [72]:
# Experiment 2B - nothing to be done
# Report-only check: for every key, print how many distinct `recist` labels the
# val and train frames contain. No rows are moved between train and val here.
for i, setting in enumerate((setting2B_split0, setting2B_split1, setting2B_split2)):
    print(f"Experiment 2B split {i}")
    for k in setting["val"].keys():
        print(k)
        num_unique_labels = len(setting["val"][k].recist.value_counts())
        num_train_labels = len(setting["train"][k].recist.value_counts())
        print(f"Before moving: {num_unique_labels} in val data and {num_train_labels} in train")

Experiment 2B split 0
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 2 in val data and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in train
TCGA-PAAD
Before moving: 2 in val data and 2 in train
TCGA-LGG
Before moving: 2 in val data and 2 in train
Experiment 2B split 1
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 2 in val data and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in train
TCGA-PAAD
Before moving: 2 in val data and 2 in train
TCGA-LGG
Before moving: 2 in val data and 2 in train
Experiment 2B split 2
TCGA-BRCA
Before moving: 2 in val data and 2 in train
TCGA-CESC
Before moving: 2 in val data and 2 in train
TCGA-HNSC
Before moving: 2 in val data and 2 in train
TCGA-STAD
Before moving: 2 in val data and 2 in train
TCGA-PAAD
Before moving: 2 in val data and 2 in train
TCGA-LGG
Before mo

In [73]:
# Save updated files after removing patients/cell lines without mutations in the 324 genes and ensuring both labels in val sets
# NOTE(review): the target paths use the same `*_file_dir` variables and file
# names as the load cells, so the loaded pickles appear to be overwritten in
# place - confirm this is intended.

# cell line files (fold 0 only), in the order 1A, 1B, 2A, 2B
cell_line_outputs = [
    (expt1A_file_dir, setting1A_split0_cl),  # Experiment 1A
    (expt1B_file_dir, setting1B_split0_cl),  # Experiment 1B
    (expt2A_file_dir, setting2A_split0_cl),  # Experiment 2A
    (expt2B_file_dir, setting2B_split0_cl),  # Experiment 2B
]
for out_dir, split_data in cell_line_outputs:
    with open(f"{out_dir}/cell_lines_fold0.pkl", "wb") as f:
        pickle.dump(split_data, f)


# patient files (folds 0-2 per experiment), in the order 1A, 1B, 2A, 2B
patient_outputs = [
    (expt1A_file_dir, (setting1A_split0, setting1A_split1, setting1A_split2)),  # Experiment 1A
    (expt1B_file_dir, (setting1B_split0, setting1B_split1, setting1B_split2)),  # Experiment 1B
    (expt2A_file_dir, (setting2A_split0, setting2A_split1, setting2A_split2)),  # Experiment 2A
    (expt2B_file_dir, (setting2B_split0, setting2B_split1, setting2B_split2)),  # Experiment 2B
]
for out_dir, fold_splits in patient_outputs:
    for fold, split_data in enumerate(fold_splits):
        with open(f"{out_dir}/patients_fold{fold}.pkl", "wb") as f:
            pickle.dump(split_data, f)