In [1]:
# Development switch used throughout the notebook: when True, autoreload is
# enabled, the large raw tables are read with nrows=1000, intermediate frames
# are displayed, and outputs go to the "development" folder instead of "processed".
development = True

In [2]:
# Only while developing: load IPython's autoreload extension in mode 2.
if development:
    %load_ext autoreload
    %autoreload 2


In [3]:
import os
import numpy as np
import pandas as pd
import swifter

In [4]:
# Folder layout: raw inputs are read from <data_dir>/<raw_folder_name>; results
# are written to <data_dir>/<processed_folder_name>.
data_dir = "../data"
raw_folder_name = "raw"

# Development runs get their own output folder, separate from "processed".
processed_folder_name = "development" if development else "processed"

# Make sure the output folder exists; an already existing folder is not an error.
os.makedirs(os.path.join(data_dir, processed_folder_name), exist_ok=True)


In [5]:
# Two-character codes "01" .. "09". Sample IDs are matched against this list
# further down to decide which samples/columns are kept.
tumor_sample_ids = [str(code).zfill(2) for code in range(1, 10)]


# Process RPPA Data

In [6]:
rppa_file_name = "TCGA-RPPA-pancan-clean.xena"

# The raw table is keyed by its "SampleID" column and has one column per
# sample. Transpose it so that every row is a sample, and expose the sample
# identifier as a regular "sample_id" column.
rppa_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, rppa_file_name), sep="\t")
    .set_index("SampleID")
    # Clear the index name so that it does not leak into the transposed header.
    .rename_axis(None, axis=0)
    .T
    .reset_index(drop=False)
    .rename(columns={"index": "sample_id"})
    # Keep only the columns that have a value for every sample.
    .dropna(axis=1)
)

if development:
    display(rppa_df)


Unnamed: 0,sample_id,X1433EPSILON,X4EBP1,X4EBP1_pS65,X4EBP1_pT37T46,X53BP1,ACC_pS79,ACC1,AKT,AKT_pS473,...,CHK1_pS296,COG3,DUSP4,ERCC5,IGF1R_pY1135Y1136,IRF1,JAK2,P16INK4A,SHP2_pY542,PDL1
0,TCGA-FI-A2EY-01,-0.013829,-1.127400,-0.423550,-0.827380,0.021656,2.21700,2.37550,-0.278160,-1.20050,...,0.342840,-0.187920,-0.562350,-0.300170,0.153210,-0.165840,-0.212820,-0.667480,0.065243,-0.211030
1,TCGA-DF-A2KS-01,-0.168630,0.165870,-0.505950,0.019504,-0.947890,0.41386,0.50057,-0.635490,0.85884,...,0.216650,-0.086944,-0.231180,0.256720,0.098167,0.236690,0.174500,1.647200,0.063824,0.787150
2,TCGA-A5-A1OH-01,0.038842,-0.382370,0.042306,0.119400,0.374310,0.00000,-0.23451,0.338930,0.30467,...,-0.036242,0.059741,-0.375340,0.056021,-0.011055,-0.399340,-0.039017,0.440300,0.292810,0.033804
3,TCGA-AX-A2H7-01,0.021308,-0.717660,-0.493150,-0.370670,0.195980,0.32478,0.42861,0.116670,0.56017,...,0.082728,0.074237,-0.361160,-0.257720,0.065891,-0.219940,0.116700,-0.887010,0.159430,0.109700
4,TCGA-AX-A2HA-01,0.108640,0.090459,0.339620,-0.017032,0.266380,-1.24100,-1.09810,-1.145600,-0.41474,...,0.202660,-0.108580,0.120520,-0.184920,-0.052694,0.001150,0.483320,-0.757430,-0.359640,-0.070833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7749,TCGA-VQ-A8DU-01,-0.048516,-0.073244,-0.034748,-0.649850,-0.407540,-1.39370,-1.47300,-0.481990,-0.97238,...,0.002732,-0.366760,-0.543580,-1.203700,-0.018443,-0.225050,-0.091311,1.281900,-0.948590,0.349620
7750,TCGA-VQ-A8DT-01,-0.139280,-0.169090,0.125880,-1.216300,-0.657220,-0.61348,-0.78547,-0.200010,-0.80668,...,-0.153640,0.135010,1.317900,-0.339530,-0.100710,-0.024453,-0.151100,-0.255410,-1.088100,-0.070307
7751,TCGA-IN-A7NR-01,-0.232530,-0.509850,0.429300,0.444290,-0.565080,-0.43580,-1.46160,0.009025,0.60736,...,-0.177360,-0.021167,0.007237,-0.013777,0.024895,0.326390,-0.287180,-0.089055,0.639800,-0.197000
7752,TCGA-RD-A8MV-01,-0.019379,0.250550,0.376320,0.926290,-2.084400,-0.63963,-1.04320,-1.877600,-0.46353,...,-0.018805,-0.534090,0.873200,0.758660,0.181290,0.333570,-0.534320,0.246050,-0.127830,0.102580


# Process Thresholded and Unthresholded CNA Data

In [7]:
thresholded_cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes"
unthresholded_cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"

# Lookup table built from the rows of the mapping file (each row is used as a
# key/value pair, so the file is assumed to have exactly two columns).
hgnc_symbol_to_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"
hgnc_symbol_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, hgnc_symbol_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# Only the identifier column ("sample") of the expression file is needed here.
# It is read straight into a frozenset without binding a temporary `gex_df`,
# so re-running this cell never removes a `gex_df` created by a later cell.
gex_file_name = "tcga_gene_expected_count"
gex_ensembl_ids = frozenset(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, gex_file_name),
        sep="\t",
        usecols=["sample"],
    )["sample"].tolist()
)

def process_cna_df(cna_file_name):
    """Load a gene-level CNA table and reshape it to one row per sample.

    Each value of the raw "Sample" column is either ``SYMBOL`` or
    ``SYMBOL|ENSEMBL_ID``. The symbol is translated to an Entrez gene ID with
    ``hgnc_symbol_to_entrezgene_id_mapping``; rows without a mapping are
    dropped. When several rows share an Entrez gene ID exactly one is kept:
    rows whose Ensembl ID is contained in ``gex_ensembl_ids`` come first, and
    among equals the row with the lowest Ensembl version suffix wins (rows
    without an Ensembl ID count as version -1).

    Args:
        cna_file_name: Name of a tab-separated file inside the raw data
            folder. In development mode only the first 1000 rows are read.

    Returns:
        DataFrame with a "sample_id" column followed by one column per Entrez
        gene ID.

    Raises:
        Exception: If a "Sample" value contains more than one "|".
        ValueError: If the text after the last "." of a non-empty Ensembl ID
            is not an integer.
    """
    cna_df = pd.read_csv(
        os.path.join(data_dir, raw_folder_name, cna_file_name),
        sep="\t",
        nrows=1000 if development else None,
    )

    # Split "SYMBOL|ENSEMBL_ID" for all rows at once instead of row by row.
    sample_parts = cna_df["Sample"].str.split("|")
    if (sample_parts.str.len() > 2).any():
        raise Exception("sample_splitted has more than 1 '|'s")

    # Rows that carry only a symbol get an empty Ensembl ID.
    cna_df["ensembl_id"] = sample_parts.str[1].fillna("")
    # Symbols missing from the mapping become NaN and are removed right below.
    cna_df["entrezgene_id"] = sample_parts.str[0].map(hgnc_symbol_to_entrezgene_id_mapping)

    # Explicit copy: new columns are assigned to the filtered frame afterwards.
    cna_df = cna_df[cna_df["entrezgene_id"].notna()].copy()
    cna_df["entrezgene_id"] = cna_df["entrezgene_id"].astype("int64")

    # 0 when the Ensembl ID is also present in the expression data, 1 otherwise,
    # so that an ascending sort puts shared IDs first.
    cna_df["ensembl_id_is_not_in_gex_ensembl_ids"] = (~cna_df["ensembl_id"].isin(gex_ensembl_ids)).astype(int)

    def get_ensembl_version(ensembl_id):
        """Return the integer after the last '.', or -1 for a missing/empty ID."""
        if pd.isnull(ensembl_id) or ensembl_id == "":
            return -1
        return int(ensembl_id.split(".")[-1])

    cna_df["ensembl_version"] = cna_df["ensembl_id"].swifter.apply(lambda ensembl_id: get_ensembl_version(ensembl_id))

    def select_one_row_per_entrezgene_id(x):
        """Pick the first row of a group after sorting by the two helper columns."""
        return x.sort_values(by=["ensembl_id_is_not_in_gex_ensembl_ids", "ensembl_version"], ascending=True).iloc[0, :]

    cna_df = cna_df.swifter.groupby("entrezgene_id").apply(lambda x: select_one_row_per_entrezgene_id(x)).reset_index(drop=True)

    # Drop the helper columns, then flip genes-by-samples into samples-by-genes.
    cna_df = (
        cna_df
        .drop(columns=["Sample", "ensembl_id", "ensembl_id_is_not_in_gex_ensembl_ids", "ensembl_version"])
        .set_index("entrezgene_id")
        .T
        .reset_index(drop=False)
        .rename_axis(None, axis=1)
        .rename(columns={"index": "sample_id"})
    )

    return cna_df

# Run the same processing on the thresholded and the unthresholded table.
thresholded_cna_df = process_cna_df(cna_file_name=thresholded_cna_file_name)
unthresholded_cna_df = process_cna_df(cna_file_name=unthresholded_cna_file_name)

# Preview both results while developing.
if development:
    display(thresholded_cna_df, unthresholded_cna_df)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=959.0), HTML(value='')))




Unnamed: 0,sample_id,34,204,205,249,473,533,576,656,712,...,106481100,106481135,106481157,106481167,106481845,106481849,106481855,107048981,107074708,124906933
0,TCGA-A5-A0GI-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TCGA-S9-A7J2-01,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,TCGA-06-0150-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TCGA-AR-A1AH-01,2.0,-1.0,2.0,-1.0,-1.0,2.0,-1.0,0.0,-1.0,...,-1.0,2.0,2.0,-1.0,2.0,-1.0,-1.0,-1.0,2.0,2.0
4,TCGA-EK-A2RE-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10840,TCGA-IB-7885-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10841,TCGA-95-7947-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10842,TCGA-VQ-AA6F-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10843,TCGA-BR-8588-01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,sample_id,34,204,205,249,473,533,576,656,712,...,106481100,106481135,106481157,106481167,106481845,106481849,106481855,107048981,107074708,124906933
0,TCGA-A5-A0GI-01,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,...,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001
1,TCGA-S9-A7J2-01,-0.923,-0.872,-0.923,-0.872,-0.872,-0.872,-0.872,-0.872,-0.872,...,-0.872,-0.872,-0.923,-0.872,-0.923,-0.872,-0.872,-0.872,-0.872,-0.872
2,TCGA-06-0150-01,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,...,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002
3,TCGA-AR-A1AH-01,1.378,-0.733,1.485,-0.733,-0.733,1.491,-0.733,0.039,-0.733,...,-0.733,1.485,1.378,-0.733,1.485,-0.733,-0.733,-0.733,1.646,1.646
4,TCGA-EK-A2RE-01,0.010,-0.025,0.010,-0.025,-0.025,0.010,-0.025,0.010,-0.025,...,-0.025,0.010,0.010,-0.025,0.010,-0.025,-0.025,-0.025,-0.025,-0.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10840,TCGA-IB-7885-01,0.012,0.012,0.012,0.012,0.012,0.012,0.012,0.012,0.012,...,0.012,0.012,0.012,0.012,0.012,0.012,0.012,0.012,0.012,0.012
10841,TCGA-95-7947-01,0.028,-0.033,-0.033,-0.033,-0.033,-0.033,-0.033,-0.033,-0.033,...,-0.033,-0.033,0.028,-0.033,-0.033,-0.033,-0.033,-0.033,-0.033,-0.033
10842,TCGA-VQ-AA6F-01,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,...,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057,-0.057
10843,TCGA-BR-8588-01,0.151,0.151,0.151,0.151,0.151,0.151,0.151,0.151,0.151,...,0.151,0.151,0.151,0.151,0.151,0.151,0.151,0.151,0.151,0.151


# Process Cancer Type Data

In [8]:
cancer_type_file_name = "TCGA_phenotype_denseDataOnlyDownload.tsv"
cancer_type_full_name_to_abbrreviation_mapping_file_name = "cancer_type_full_name_to_abbrreviation_mapping.tsv"

# Lookup table built from the rows of the mapping file (each row is used as a
# key/value pair, so the file is assumed to have exactly two columns).
cancer_type_full_name_to_abbrreviation_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, cancer_type_full_name_to_abbrreviation_mapping_file_name),
        sep="\t",
    ).values
)

# Keep only the sample identifier and the disease name. The explicit copy
# makes the column assignment below operate on an independent frame.
cancer_type_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_type_file_name), sep="\t")
    .rename(columns={"sample": "sample_id", "_primary_disease": "cancer_type"})
    .loc[:, ["sample_id", "cancer_type"]]
    .copy()
)

# Every disease name must have an abbreviation. Fail with the complete list of
# offenders instead of stopping at the first unknown name.
_unknown_cancer_types = set(cancer_type_df["cancer_type"]) - set(cancer_type_full_name_to_abbrreviation_mapping)
if _unknown_cancer_types:
    raise KeyError("No abbreviation found for cancer type(s): {}".format(_unknown_cancer_types))
cancer_type_df["cancer_type"] = cancer_type_df["cancer_type"].map(cancer_type_full_name_to_abbrreviation_mapping)

# dtype=int pins the indicator columns to 0/1; without it, newer pandas
# versions produce booleans, which would be written to disk as True/False.
cancer_type_one_hot_df = pd.get_dummies(data=cancer_type_df, columns=["cancer_type"], dtype=int)

if development:
    display(cancer_type_df)
    display(cancer_type_one_hot_df)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=12804.0), HTML(value='')))




Unnamed: 0,sample_id,cancer_type
0,TCGA-D3-A1QA-07,skcm
1,TCGA-DE-A4MD-06,thcm
2,TCGA-J8-A3O2-06,thcm
3,TCGA-J8-A3YH-06,thcm
4,TCGA-EM-A2P1-06,thcm
...,...,...
12799,TCGA-17-Z059-01,luad
12800,TCGA-17-Z060-01,luad
12801,TCGA-17-Z061-01,luad
12802,TCGA-17-Z062-01,luad


Unnamed: 0,sample_id,cancer_type_acc,cancer_type_blca,cancer_type_brca,cancer_type_cesc,cancer_type_chol,cancer_type_coad,cancer_type_dlbc,cancer_type_esca,cancer_type_gbm,...,cancer_type_read,cancer_type_sarc,cancer_type_skcm,cancer_type_stad,cancer_type_tgct,cancer_type_thcm,cancer_type_thym,cancer_type_ucec,cancer_type_ucs,cancer_type_uvm
0,TCGA-D3-A1QA-07,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,TCGA-DE-A4MD-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,TCGA-J8-A3O2-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,TCGA-J8-A3YH-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,TCGA-EM-A2P1-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12799,TCGA-17-Z059-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12800,TCGA-17-Z060-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12801,TCGA-17-Z061-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12802,TCGA-17-Z062-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Process Tumor Purity Data

In [9]:
tumor_purity_cpe_file_name = "tumor_purity.csv"
tumor_purity_estimate_file_name = "tumor_purity_ESTIMATE.csv"

tumor_purity_cpe_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_cpe_file_name))
tumor_purity_estimate_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_estimate_file_name))


def _parse_decimal_comma(value):
    """Convert a purity value that may use a decimal comma (e.g. "0,92") to float.

    Going through ``str`` also accepts values that pandas already parsed as
    numbers, which have no ``replace`` method.
    """
    return float(str(value).replace(",", "."))


# First source: per sample, take the first non-missing value in the order
# CPE, ABSOLUTE, ESTIMATE. Samples with none of the three are skipped.
tumor_sample_id_purity_mapping = {}
for _, row in tumor_purity_cpe_df.iterrows():
    for purity_column in ("CPE", "ABSOLUTE", "ESTIMATE"):
        if not pd.isnull(row[purity_column]):
            tumor_sample_id_purity_mapping[row["Sample.ID"]] = _parse_decimal_comma(row[purity_column])
            break

# Second source: values are stored as read and replace any entry that the
# first source already stored under the same identifier.
for _, row in tumor_purity_estimate_df.iterrows():
    if not pd.isnull(row["TumorPurity"]):
        tumor_sample_id_purity_mapping[row["NAME"]] = row["TumorPurity"]

tumor_purity_df = pd.DataFrame({"sample_id": list(tumor_sample_id_purity_mapping.keys()),
                                "purity": list(tumor_sample_id_purity_mapping.values())})

# Among the IDs whose fourth "-"-separated field starts with one of
# `tumor_sample_ids`, keep one full ID per 15-character prefix: the
# lexicographically smallest one.
sample_id_dict = {}
for sample_id in tumor_purity_df["sample_id"].values:
    if sample_id.split("-")[3][:2] not in tumor_sample_ids:
        continue
    sample_id_first_15 = sample_id[:15]
    current_best = sample_id_dict.get(sample_id_first_15)
    if current_best is None or sample_id < current_best:
        sample_id_dict[sample_id_first_15] = sample_id

# Build the lookup set once instead of re-creating a list for every row.
_selected_sample_ids = set(sample_id_dict.values())
tumor_purity_df = tumor_purity_df[tumor_purity_df["sample_id"].isin(_selected_sample_ids)].copy()
tumor_purity_df["sample_id"] = tumor_purity_df["sample_id"].str[:15]

if development:
    display(tumor_purity_df)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10805.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10770.0), HTML(value='')))




Unnamed: 0,sample_id,purity
0,TCGA-OR-A5J1-01,0.924600
1,TCGA-OR-A5J2-01,0.898500
2,TCGA-OR-A5J3-01,0.946600
3,TCGA-OR-A5J4-01,0.866000
4,TCGA-OR-A5J5-01,0.978000
...,...,...
10800,TCGA-XU-A92Z-01,0.804001
10801,TCGA-X7-A8D8-01,0.752956
10802,TCGA-XU-A92O-01,0.808031
10803,TCGA-X7-A8M8-01,0.731812


# Process GEX Data

In [10]:
ensembl_id_to_entrezgene_id_mapping_file_name = "ensembl_id_to_entrezgene_id_mapping.tsv"
gex_file_name = "tcga_gene_expected_count"

tumor_sample_ids = ["0" + str(i) for i in range(1, 10)]
# Lookup table built from the rows of the mapping file (each row is used as a
# key/value pair, so the file is assumed to have exactly two columns).
ensembl_id_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, ensembl_id_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# In development mode only the first 1000 rows are read.
gex_df = pd.read_csv(
    os.path.join(data_dir, raw_folder_name, gex_file_name),
    sep="\t",
    nrows=1000 if development else None,
)

# The "sample" column holds the gene identifiers; keep the part before the
# first "." and look it up in the Ensembl -> Entrez mapping.
_unversioned_ensembl_ids = gex_df["sample"].str.split(".").str[0]
_has_entrezgene_id = _unversioned_ensembl_ids.isin(list(ensembl_id_to_entrezgene_id_mapping.keys()))

# Keep only the columns whose name ends in one of the `tumor_sample_ids` codes.
_tumor_sample_columns = [column for column in gex_df.columns if column.split("-")[-1] in tumor_sample_ids]

# Select rows and columns in one step and label the rows with their Entrez
# gene ID. No column is ever assigned on a filtered slice of this wide frame.
gex_df = gex_df.loc[_has_entrezgene_id, _tumor_sample_columns]
gex_df.index = _unversioned_ensembl_ids[_has_entrezgene_id].map(ensembl_id_to_entrezgene_id_mapping).tolist()

# Flip genes-by-samples into samples-by-genes with a "sample_id" column.
gex_df = gex_df.T.reset_index(drop=False).rename(columns={"index": "sample_id"})

if development:
    display(gex_df)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=449.0), HTML(value='')))




Unnamed: 0,sample_id,100507661,105372097,53916,57103,104472717,100616274,22838,55567,6147,...,388849,80135,8438,26973,6571,7733,8992,4969,118663,9683
0,TCGA-19-1787-01,0.0000,2.0000,10.3835,9.7764,7.1898,0.0,11.1624,3.0000,14.1243,...,4.7004,11.1662,9.9488,9.9829,2.0976,8.8580,12.9202,5.1293,5.5236,11.1046
1,TCGA-S9-A7J2-01,4.6439,2.8074,9.9144,8.7649,5.0875,0.0,11.9333,0.0000,13.7616,...,4.8580,8.9915,5.1707,10.6640,2.6276,8.2095,12.1889,5.4919,1.0000,11.4189
2,TCGA-EK-A2RE-01,0.0000,0.0000,10.0543,10.6375,0.0000,0.0,11.1157,7.1599,14.5443,...,1.5850,11.3537,9.5902,11.4514,5.3462,9.1085,13.2131,1.0000,3.3219,12.7041
3,TCGA-44-6778-01,0.0000,1.5850,9.8319,9.0954,6.8948,0.0,11.3454,4.0875,13.2414,...,4.5236,10.1923,8.5045,10.6015,7.9912,8.7616,12.0133,8.2336,1.0000,11.5493
4,TCGA-F4-6854-01,0.0000,0.0000,9.9701,9.4858,7.5774,0.0,12.7099,7.2479,15.0865,...,3.7004,10.7211,10.2562,10.3437,4.3820,8.6795,12.9280,6.8202,6.6724,11.5250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9798,TCGA-IB-7885-01,0.0000,0.0000,10.4929,9.5755,5.0875,0.0,11.7069,6.7142,14.6213,...,4.4594,10.9801,8.5455,10.5539,6.4276,8.3707,13.7557,9.3837,4.7004,11.9979
9799,TCGA-95-7947-01,0.0000,1.5850,11.4536,9.3287,2.3219,0.0,11.9538,9.0196,14.2452,...,4.0000,11.0147,9.3053,9.9653,6.2407,9.2784,13.1665,6.3923,0.0000,11.3712
9800,TCGA-VQ-AA6F-01,0.0000,0.0000,11.2277,10.5127,6.8455,0.0,13.0694,7.9069,15.2174,...,3.8074,10.7764,9.2406,10.5338,8.3767,9.7313,13.5452,5.9542,7.0000,12.0151
9801,TCGA-BR-8588-01,0.0000,0.0000,9.4450,9.8688,7.5314,0.0,11.3548,6.0444,14.4275,...,2.0000,10.4988,8.4638,10.1523,6.9920,7.9425,12.7432,11.0279,4.0875,11.0881


# Find intersecting sample IDs and columns

In [11]:
# Sample IDs present in each table, then the IDs shared by all five of them.
gex_sample_ids = set(gex_df["sample_id"])
unthresholded_cna_sample_ids = set(unthresholded_cna_df["sample_id"])
thresholded_cna_sample_ids = set(thresholded_cna_df["sample_id"])
tumor_purity_sample_ids = set(tumor_purity_df["sample_id"])
cancer_type_sample_ids = set(cancer_type_df["sample_id"])
intersecting_sample_ids = (
    gex_sample_ids
    & unthresholded_cna_sample_ids
    & thresholded_cna_sample_ids
    & tumor_purity_sample_ids
    & cancer_type_sample_ids
)

# Gene columns (everything except "sample_id") shared by the expression table
# and both CNA tables, in sorted order and with "sample_id" first.
gex_gene_ids = set(gex_df.columns.drop("sample_id"))
unthresholded_cna_gene_ids = set(unthresholded_cna_df.columns.drop("sample_id"))
thresholded_cna_gene_ids = set(thresholded_cna_df.columns.drop("sample_id"))
intersecting_columns = ["sample_id"] + sorted(gex_gene_ids & unthresholded_cna_gene_ids & thresholded_cna_gene_ids)


# Save data

In [12]:
def _filter_sort_save(df, file_name, columns=None):
    """Restrict a table to the shared samples, sort it, and write it as TSV.

    Args:
        df: DataFrame with a "sample_id" column.
        file_name: Name of the output file inside the processed-data folder.
        columns: Optional list of columns to keep (in that order). All
            columns are kept when omitted.

    Returns:
        The filtered frame, sorted by "sample_id", exactly as written to disk.
    """
    kept = df[df["sample_id"].isin(intersecting_sample_ids)]
    if columns is not None:
        kept = kept[columns]
    kept = kept.sort_values(by="sample_id")
    kept.to_csv(os.path.join(data_dir, processed_folder_name, file_name), sep="\t", index=False)
    return kept


# RPPA keeps all of its own columns; it is only restricted to the shared samples.
rppa_df = _filter_sort_save(rppa_df, "rppa.tsv")
print("rppa_df.shape:", rppa_df.shape)

# The CNA and expression tables are additionally restricted to the shared gene columns.
thresholded_cna_df = _filter_sort_save(thresholded_cna_df, "thresholded_cna.tsv", columns=intersecting_columns)
print("thresholded_cna_df.shape:", thresholded_cna_df.shape)

unthresholded_cna_df = _filter_sort_save(unthresholded_cna_df, "unthresholded_cna.tsv", columns=intersecting_columns)
print("unthresholded_cna_df.shape:", unthresholded_cna_df.shape)

tumor_purity_df = _filter_sort_save(tumor_purity_df, "tumor_purity.tsv")

cancer_type_df = _filter_sort_save(cancer_type_df, "cancer_type.tsv")

cancer_type_one_hot_df = _filter_sort_save(cancer_type_one_hot_df, "cancer_type_one_hot.tsv")

gex_df = _filter_sort_save(gex_df, "gex.tsv", columns=intersecting_columns)
print("gex_df.shape:", gex_df.shape)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=7754.0), HTML(value='')))


rppa_df.shape: (6450, 211)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10845.0), HTML(value='')))


thresholded_cna_df.shape: (9168, 24)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10845.0), HTML(value='')))


unthresholded_cna_df.shape: (9168, 24)


HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=10770.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=12804.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=12804.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=9803.0), HTML(value='')))


gex_df.shape: (9168, 24)
