In [7]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import os
import sys
import numpy as np
import pandas as pd
from typing import Dict, List
from pathlib import Path

!{sys.executable} -m pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [9]:
# Folder layout used by every cell below; paths are resolved against the current working directory.
data_dir = "../data"
raw_folder_name, processed_folder_name = "raw", "processed"

# Make sure the output folder exists up front; an already existing folder is not an error.
os.makedirs(os.path.join(data_dir, processed_folder_name), exist_ok=True)


In [10]:
def save_processed_tsv_files(df: pd.DataFrame, data_type: str, cancer_type_sample_ids_mapping: Dict[str, List]) -> None:
    """Write one ``<data_type>.tsv`` file per cancer type.

    For every cancer type in the mapping, the rows of ``df`` whose ``sample_id``
    belongs to that cancer type are written, tab separated and without the index, to
    ``<data_dir>/<processed_folder_name>/<cancer_type>/<data_type>.tsv``.
    The cancer-type folder is created when it does not exist yet.

    Args:
        df: Table with a ``sample_id`` column; it must not contain missing values.
        data_type: Name of the data modality; used as the output file stem.
        cancer_type_sample_ids_mapping: Cancer type -> collection of sample ids
            (the notebook passes both lists and sets).

    Raises:
        AssertionError: If ``df`` contains at least one missing value.
    """
    assert df.shape == df.dropna().shape, f"Dataframe for {data_type} has np.NaN values!"
    for cancer_type, sample_ids in cancer_type_sample_ids_mapping.items():
        folder_path = os.path.join(data_dir, processed_folder_name, cancer_type)
        os.makedirs(folder_path, exist_ok=True)
        # isin() does a hashed membership test, so list inputs are not scanned once per row.
        belongs_to_cancer_type = df["sample_id"].isin(sample_ids)
        df[belongs_to_cancer_type].to_csv(os.path.join(folder_path, data_type + ".tsv"), sep="\t", index=False)


# Process CNA data

In [11]:
# Raw CNA file name for each key accepted by process_cna_df: the plain cancer-type keys point to
# the "all_data_by_genes" files, the "*_thresholded" keys to the "all_thresholded.by_genes" files.
cancer_type_cna_file_name_mapping = dict(
    BLCA="TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    LUSC="TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    OV="TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    BLCA_thresholded="TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes",
    LUSC_thresholded="TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes",
    OV_thresholded="TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes",
)

# Every row of the lookup table becomes one key/value pair, so the file must have exactly two columns.
# Presumably the column order is (HGNC symbol, Entrez gene id) -- TODO confirm against the raw file.
hgnc_symbol_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, "hgnc_to_entrezgene_id_mapping.tsv"),
        sep="\t",
    ).values
)


def select_the_row_with_max_expression_per_gene(x):
    """Return the row of ``x`` whose values add up to the largest total.

    Every column of ``x`` takes part in the row total. When several rows share
    the largest total, the first one (in row order) is returned.

    Args:
        x: DataFrame holding the candidate rows; its values must be summable.

    Returns:
        The selected row as a Series.
    """
    row_totals = np.sum(x.values, axis=1)
    best_position = int(row_totals.ravel().argmax())
    return x.iloc[best_position]


def process_cna_df(cancer_type: str) -> pd.DataFrame:
    """Load one raw CNA file and reshape it to a samples x Entrez-gene-id table.

    Steps:
      1. Read the file registered for ``cancer_type`` in ``cancer_type_cna_file_name_mapping``.
      2. Look up the part of ``Gene Symbol`` before an optional ``|`` in
         ``hgnc_symbol_entrezgene_id_mapping``; rows without an Entrez gene id are dropped.
      3. When several rows share an Entrez gene id, keep the row chosen by
         ``select_the_row_with_max_expression_per_gene``.
      4. Transpose, so that every row is one sample and every gene column is labelled
         with the integer Entrez gene id.

    Args:
        cancer_type: Key of ``cancer_type_cna_file_name_mapping``.

    Returns:
        DataFrame with a ``sample_id`` column followed by one column per Entrez gene id.

    Raises:
        KeyError: If ``cancer_type`` is not a key of the file-name mapping.
        Exception: If a gene symbol contains more than one ``|``.
    """
    cna_file_name = cancer_type_cna_file_name_mapping[cancer_type]
    cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_file_name), sep="\t")

    # Split "<symbol>" / "<symbol>|<ensembl id>" labels for the whole column at once
    # (a single-character pattern is treated as a literal by str.split).
    gene_symbol_parts = cna_df["Gene Symbol"].str.split("|")
    if (gene_symbol_parts.str.len() > 2).any():
        raise Exception("gene_symbol_splitted has more than 1 '|'s")

    # Symbols missing from the lookup map to NaN and are removed two lines below.
    cna_df["entrezgene_id"] = gene_symbol_parts.str[0].map(hgnc_symbol_entrezgene_id_mapping)
    cna_df = cna_df.drop(columns=["Gene Symbol"])
    cna_df = cna_df[~pd.isnull(cna_df["entrezgene_id"])]
    cna_df = cna_df.reset_index(drop=True)

    # One row per Entrez gene id.
    # NOTE(review): the later ``loc["entrezgene_id", :]`` needs the grouping column to be part of
    # what groupby.apply hands to the function; recent pandas releases deprecate that -- confirm
    # against the pandas version this notebook is run with.
    cna_df = cna_df.groupby("entrezgene_id").apply(select_the_row_with_max_expression_per_gene).reset_index(drop=True)

    # Genes become columns (labelled by integer Entrez gene id), samples become rows.
    cna_df = cna_df.T
    cna_df.columns = cna_df.loc["entrezgene_id", :]
    cna_df = cna_df.drop("entrezgene_id", axis=0)
    cna_df.columns = [int(column) for column in cna_df.columns]
    cna_df = cna_df.reset_index(drop=False)
    cna_df = cna_df.rename(columns={"index": "sample_id"})
    return cna_df

# Continuous CNA values: process each cancer type, check that all three tables expose
# the same columns in the same order, then stack them row-wise.
blca_cna_df = process_cna_df(cancer_type="BLCA")
lusc_cna_df = process_cna_df(cancer_type="LUSC")
ov_cna_df = process_cna_df(cancer_type="OV")
assert blca_cna_df.columns.tolist() == lusc_cna_df.columns.tolist(), "Columns of BLCA CNA dataframe should be equal to columns of LUSC CNA dataframe"
assert lusc_cna_df.columns.tolist() == ov_cna_df.columns.tolist(), "Columns of LUSC CNA dataframe should be equal to columns of OV CNA dataframe"
cna_df = pd.concat([blca_cna_df, lusc_cna_df, ov_cna_df], axis=0)

# Thresholded CNA values: same procedure on the "*_thresholded" files.
blca_cna_thresholded_df = process_cna_df(cancer_type="BLCA_thresholded")
lusc_cna_thresholded_df = process_cna_df(cancer_type="LUSC_thresholded")
ov_cna_thresholded_df = process_cna_df(cancer_type="OV_thresholded")
assert blca_cna_thresholded_df.columns.tolist() == lusc_cna_thresholded_df.columns.tolist(), "Columns of BLCA thresholded CNA dataframe should be equal to columns of LUSC thresholded CNA dataframe"
assert lusc_cna_thresholded_df.columns.tolist() == ov_cna_thresholded_df.columns.tolist(), "Columns of LUSC thresholded CNA dataframe should be equal to columns of OV thresholded CNA dataframe"
cna_thresholded_df = pd.concat([blca_cna_thresholded_df, lusc_cna_thresholded_df, ov_cna_thresholded_df], axis=0)

# Cancer type -> sample ids, taken from the continuous CNA tables; the cells below narrow it further.
# NOTE(review): the same mapping is used to split the thresholded table, which presumably
# contains the same samples -- confirm.
cancer_type_sample_ids_mapping = {
    "blca": blca_cna_df.sample_id.tolist(),
    "lusc": lusc_cna_df.sample_id.tolist(),
    "ov": ov_cna_df.sample_id.tolist()
}

save_processed_tsv_files(df=cna_df, data_type="cna", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)
save_processed_tsv_files(df=cna_thresholded_df, data_type="cna_thresholded", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)

cna_df


Unnamed: 0,sample_id,1,2,3,9,10,12,13,14,15,...,124907803,124907805,124907806,124907808,124907837,124907841,124907928,124907948,124908556,124908558
0,TCGA-2F-A9KO-01,-0.020,0.318,0.318,-0.284,-0.284,-0.025,0.128,-0.267,0.095,...,0.033,0.041,0.033,0.041,-0.160,-0.160,-0.284,0.095,-0.031,-0.031
1,TCGA-2F-A9KP-01,-0.120,-0.011,-0.011,-0.858,-0.858,-0.436,0.004,-0.827,-0.093,...,0.405,0.405,0.405,0.405,0.584,0.584,0.320,-0.093,-0.063,-0.063
2,TCGA-2F-A9KQ-01,-0.099,0.000,0.000,-0.007,-0.007,0.007,0.022,-0.874,-0.054,...,-0.038,-0.038,-0.038,-0.038,-0.038,-0.038,-0.054,-0.054,-0.166,-0.166
3,TCGA-2F-A9KR-01,0.005,0.018,0.018,-0.518,-0.518,-0.771,0.030,0.019,1.294,...,0.041,0.041,0.041,0.041,0.041,0.041,-0.884,-0.886,-0.082,-0.082
4,TCGA-2F-A9KT-01,-0.057,0.039,0.039,-0.298,-0.298,-0.648,0.041,-0.592,0.003,...,-0.019,-0.019,-0.019,-0.019,-0.024,-0.024,-0.630,0.003,-0.128,-0.128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574,TCGA-72-4241-01,-0.531,1.475,1.475,-0.970,-0.970,-0.798,0.556,0.480,-0.540,...,-0.185,-0.076,-0.185,-0.076,0.297,0.297,-0.619,-0.540,1.084,0.765
575,TCGA-OY-A56P-01,-0.781,0.732,0.732,0.009,0.009,-0.799,0.944,0.467,0.026,...,-1.025,-1.025,-1.025,-1.025,-0.345,-0.345,-0.359,0.026,-0.058,-0.058
576,TCGA-OY-A56Q-01,-1.147,1.003,1.003,0.232,0.232,-0.793,0.638,0.587,-0.370,...,0.357,0.357,0.357,0.357,0.357,0.357,-0.750,-0.665,-0.031,-0.031
577,TCGA-VG-A8LO-01,0.057,0.061,0.061,-0.978,-0.978,-0.843,0.736,-0.025,0.138,...,-0.848,-0.851,-0.848,-0.851,0.089,0.089,-0.834,0.031,1.190,1.190


# Process GEX data

In [12]:
gex_data_file_name = "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"

# Load the expression table and reduce every "<text>|<number>" gene label to the integer after the "|".
gex_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, gex_data_file_name), sep="\t")
gex_df["gene_id"] = gex_df["gene_id"].apply(lambda gene_label: int(gene_label.split("|")[1]))

# Sample-type codes that are kept ("01" ... "09"); they are compared with the first two
# characters of the fourth "-"-separated barcode field.
tumor_sample_ids = [f"0{code}" for code in range(1, 10)]

# Keep one column per 15-character barcode prefix: the lexicographically smallest full barcode.
column_dict = {}
for column in gex_df.columns:
    if column == "gene_id" or column.split("-")[3][:2] not in tumor_sample_ids:
        continue
    column_first_15 = column[:15]
    if column_first_15 not in column_dict or column < column_dict[column_first_15]:
        column_dict[column_first_15] = column

# Select the surviving columns and label each one with its 15-character prefix.
gex_df = gex_df[["gene_id", *column_dict.values()]]
gex_df.columns = ["gene_id", *column_dict.keys()]

# Samples become rows, genes become columns labelled by integer gene id;
# genes with a missing value in any sample are dropped.
gex_df = gex_df.set_index("gene_id").T
gex_df.columns = [int(gene_id) for gene_id in gex_df.columns]
gex_df = (
    gex_df
    .reset_index(drop=False)
    .rename(columns={"index": "sample_id"})
    .dropna(axis=1)
)

# Mean over all remaining genes, one value per sample.
avg_gex_df = gex_df[["sample_id"]].assign(
    avg_gex=gex_df.drop(columns=["sample_id"]).values.mean(axis=1)
)

gex_sample_ids_set = set(gex_df["sample_id"])

# Narrow every cancer type to the samples that also have expression data.
cancer_type_sample_ids_mapping = {
    cancer_type: gex_sample_ids_set.intersection(cancer_type_sample_ids_mapping[cancer_type])
    for cancer_type in ("blca", "lusc", "ov")
}

save_processed_tsv_files(df=gex_df, data_type="gex", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)
save_processed_tsv_files(df=avg_gex_df, data_type="avg_gex", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)

gex_df


Unnamed: 0,sample_id,100133144,100134869,10357,10431,155060,388795,390284,57714,645851,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
0,TCGA-OR-A5J1-01,3.266100,3.938500,149.135000,2034.100000,274.255000,0.000000,11.527400,282.421000,1.440900,...,183.958000,146.974000,20.653200,351.585000,1050.910000,0.480300,648.415000,1841.020000,1157.540000,596.062000
1,TCGA-OR-A5J2-01,2.681500,8.994800,81.077700,1304.930000,199.302000,0.402600,5.234200,527.848000,2.415800,...,264.125000,438.464000,57.173500,279.023000,2979.870000,31.405200,1166.020000,3059.990000,1895.990000,801.637000
2,TCGA-OR-A5J3-01,1.730100,6.565000,86.487900,1054.660000,348.393000,0.592500,7.702600,162.346000,10.072600,...,90.653200,190.194000,39.697800,495.334000,914.827000,0.592500,806.399000,2655.610000,1482.450000,437.269000
3,TCGA-OR-A5J5-01,0.000000,1.549200,53.911700,2350.890000,439.194000,0.774600,6.196700,52.672300,3.098400,...,260.263000,840.434000,17.815600,206.042000,890.782000,11.618900,553.834000,2367.930000,1140.200000,512.781000
4,TCGA-OR-A5J6-01,0.000000,4.470900,66.906300,1257.990000,149.215000,2.794300,10.618300,143.067000,1.676600,...,64.268500,63.150800,12.853700,117.919000,894.730000,7.824000,795.812000,708.071000,796.371000,475.587000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10318,TCGA-CG-4472-01,6.997533,13.654193,639.238135,1297.152549,57.924566,0.316592,11.628927,426.953051,3.614168,...,292.541116,450.676983,79.402403,509.914317,1224.493338,3.268489,905.046317,3359.241568,1740.926312,1148.217844
10319,TCGA-CG-4474-01,18.311906,7.417109,742.479964,1152.909807,166.813851,0.569025,10.323868,513.013287,10.831553,...,745.532095,1056.917140,39.429966,349.337898,970.055609,17.164493,757.811259,4264.469081,2702.668453,857.336078
10320,TCGA-CG-4475-01,12.057112,11.585177,506.336449,1375.495774,115.805483,3.951853,11.573288,1743.957055,18.766946,...,556.284407,655.851549,59.290240,554.447507,1308.788955,3.756246,927.963540,3103.609391,1370.141309,473.853001
10321,TCGA-CG-4476-01,18.628740,11.482418,712.452165,971.893874,188.696451,0.559003,11.902364,898.943092,8.812796,...,516.997610,608.014394,45.332473,464.399520,1101.931239,0.301440,845.677334,3302.569055,1915.477072,1027.285482


# Process RPPA data

In [13]:
rppa_data_file_name = "TCGA-RPPA-pancan-clean.txt"

# Load the table, discard the tumor-type column and flip it so that samples become columns.
rppa_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, rppa_data_file_name), sep="\t")
    .drop(columns=["TumorType"])
    .T
)
rppa_df.columns = rppa_df.loc["SampleID", :].tolist()
rppa_df = rppa_df.drop("SampleID", axis=0)

# Keep one column per 15-character barcode prefix (the lexicographically smallest full barcode),
# considering only barcodes whose sample-type code is listed in tumor_sample_ids.
column_dict = {}
for column in rppa_df.columns:
    if column.split("-")[3][:2] not in tumor_sample_ids:
        continue
    column_first_15 = column[:15]
    if column_first_15 not in column_dict or column < column_dict[column_first_15]:
        column_dict[column_first_15] = column

# Select the surviving columns and label each one with its 15-character prefix.
rppa_df = rppa_df[list(column_dict.values())]
rppa_df.columns = list(column_dict.keys())

# Back to one row per sample; columns with a missing value in any sample are dropped.
rppa_df = (
    rppa_df.T
    .reset_index(drop=False)
    .rename(columns={"index": "sample_id"})
    .dropna(axis=1)
)

save_processed_tsv_files(df=rppa_df, data_type="rppa", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)

rppa_df


Unnamed: 0,sample_id,X1433EPSILON,X4EBP1,X4EBP1_pS65,X4EBP1_pT37T46,X53BP1,ACC_pS79,ACC1,AKT,AKT_pS473,...,X1433BETA,X1433ZETA,ACVRL1,DIRAS3,ANNEXIN1,PREX1,ERCC1,MSH2,MSH6,SMAC
0,TCGA-OR-A5J2-01,-0.494327,0.592339,0.104604,-0.134915,1.891361,0.036756,1.715016,0.695838,-0.709114,...,-0.330586,0.906543,-0.31872,-2.154075,0.071476,-0.731556,-0.166567,-0.262756,0.280405,1.111355
1,TCGA-PA-A5YG-01,-0.535367,0.182801,-0.105769,-0.040087,1.033167,0.24345,0.945532,0.542829,-0.4244,...,-0.357255,0.151834,-0.471326,-1.880672,0.593315,-0.039804,-0.084995,-0.265735,-0.40721,0.990587
2,TCGA-OR-A5JV-01,-0.10901,0.417659,0.02475,-0.720552,1.664733,0.237807,1.629606,0.436791,-1.201352,...,-0.236244,0.872919,-0.125896,-1.891871,0.318994,0.514128,-0.147235,-0.401774,-0.36715,0.801252
3,TCGA-OR-A5JT-01,-0.412523,0.258615,0.005974,0.563042,1.462891,-0.103568,0.783692,0.533254,-0.490975,...,-0.337845,0.691135,-0.222455,-2.048488,1.63042,-0.133165,-0.343098,-0.61102,-0.565428,1.496277
4,TCGA-OR-A5JR-01,-0.288187,-0.112085,0.019447,-0.002671,2.230052,0.804794,1.864346,1.193249,0.221961,...,-0.439145,0.884969,-0.408125,-2.204558,0.367352,0.038922,-0.066627,-0.470087,-0.233017,1.402321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7785,TCGA-V4-A9EO-01,-0.141735,0.742978,0.350677,1.358215,1.432059,0.419816,1.778006,1.394221,-1.29108,...,-0.380221,0.832543,-0.881674,-2.927522,-1.74354,-1.453579,-0.482385,-0.540578,0.106378,0.871774
7786,TCGA-V4-A9EJ-01,-0.278094,0.805884,0.563067,1.388077,1.492527,-0.392376,0.993545,1.654516,-1.246585,...,-0.564944,0.920832,-0.764127,-2.821593,-1.491739,-0.950304,-0.669655,-0.384842,-0.005848,0.898869
7787,TCGA-VD-A8KF-01,-0.491379,0.301767,0.358113,1.20106,1.771701,0.80813,2.011375,1.18423,-0.523638,...,-0.551941,0.855399,-0.726026,-2.798757,-0.700305,-0.54034,-0.613287,-0.035179,-0.048269,0.851824
7788,TCGA-WC-A88A-01,-0.309337,0.498963,0.43467,0.796926,2.441612,0.278614,1.793356,0.757287,-1.146702,...,-0.564883,1.239581,-0.657106,-2.741425,-0.755726,-0.68864,-0.392158,-0.017519,0.086633,0.951335


# Process Tumor Purity data

In [14]:
tumor_purity_cpe_file_name = "tumor_purity.csv"
tumor_purity_estimate_file_name = "tumor_purity_ESTIMATE.csv"

tumor_purity_cpe_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_cpe_file_name))
tumor_purity_estimate_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_estimate_file_name))

# First table: per sample, take the first non-missing value in the order CPE, ABSOLUTE, ESTIMATE.
# Values are parsed as text that uses "," as the decimal separator.
tumor_sample_id_purity_mapping = {}
for _, row in tumor_purity_cpe_df.iterrows():
    for purity_column in ("CPE", "ABSOLUTE", "ESTIMATE"):
        purity_value = row[purity_column]
        if not pd.isnull(purity_value):
            # str() keeps the parsing working when pandas already read the column as numbers.
            tumor_sample_id_purity_mapping[row["Sample.ID"]] = float(str(purity_value).replace(",", "."))
            break

# Second table: its value is stored as-is and replaces an entry with the same sample id
# coming from the first table.
# NOTE(review): confirm that this precedence over CPE/ABSOLUTE is intended.
for _, row in tumor_purity_estimate_df.iterrows():
    sample_id = row["NAME"]
    purity_estimate = row["TumorPurity"]
    if not pd.isnull(purity_estimate):
        tumor_sample_id_purity_mapping[sample_id] = purity_estimate

tumor_purity_df = pd.DataFrame({"sample_id": list(tumor_sample_id_purity_mapping.keys()),
                                "purity": list(tumor_sample_id_purity_mapping.values())})

# Keep one sample id per 15-character barcode prefix (the lexicographically smallest one),
# considering only ids whose sample-type code is listed in tumor_sample_ids.
sample_id_dict = {}
for sample_id in tumor_purity_df["sample_id"].values:
    if sample_id.split("-")[3][:2] not in tumor_sample_ids:
        continue
    sample_id_first_15 = sample_id[:15]
    if sample_id_first_15 not in sample_id_dict or sample_id < sample_id_dict[sample_id_first_15]:
        sample_id_dict[sample_id_first_15] = sample_id

# Hashed membership test instead of rebuilding and scanning a list for every row;
# .copy() makes the filtered frame independent so the column assignment below is unambiguous.
tumor_purity_df = tumor_purity_df[tumor_purity_df["sample_id"].isin(set(sample_id_dict.values()))].copy()
tumor_purity_df["sample_id"] = tumor_purity_df["sample_id"].str[:15]

save_processed_tsv_files(df=tumor_purity_df, data_type="purity", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)

tumor_purity_df


Unnamed: 0,sample_id,purity
0,TCGA-OR-A5J1-01,0.924600
1,TCGA-OR-A5J2-01,0.898500
2,TCGA-OR-A5J3-01,0.946600
3,TCGA-OR-A5J4-01,0.866000
4,TCGA-OR-A5J5-01,0.978000
...,...,...
10800,TCGA-XU-A92Z-01,0.804001
10801,TCGA-X7-A8D8-01,0.752956
10802,TCGA-XU-A92O-01,0.808031
10803,TCGA-X7-A8M8-01,0.731812


# Process Subtype Data

In [15]:
# blca: keep sample id and diagnosis subtype, treat "[Discrepancy]" as missing,
# drop incomplete rows and lower-case the labels.
blca_subtype_info_file_name = "TCGA.BLCA.sampleMap_BLCA_clinicalMatrix"
blca_subtype_info_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, blca_subtype_info_file_name), sep="\t")
    [["sampleID", "diagnosis_subtype"]]
    .rename(columns={"sampleID": "sample_id", "diagnosis_subtype": "subtype"})
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed)
    .assign(subtype=lambda frame: frame["subtype"].replace(to_replace="[Discrepancy]", value=np.nan))
    .dropna(axis=0)
    .assign(subtype=lambda frame: frame["subtype"].apply(lambda subtype: subtype.lower()))
)

# lusc: rebuild a "-"-separated sample id from fixed character positions of "code".
lusc_subtype_info_file_name = "lusc_subtype_info.xlsx"
lusc_subtype_info_df = pd.read_excel(os.path.join(data_dir, raw_folder_name, lusc_subtype_info_file_name), sheet_name="subtypes_LUSC")
lusc_subtype_info_df["sample_id"] = lusc_subtype_info_df["code"].apply(lambda code: "-".join((code[:4], code[4:6], code[6:10], code[10:13])))
lusc_subtype_info_df = lusc_subtype_info_df[["sample_id", "subtype"]]

# Keep one sample id per 15-character prefix (the lexicographically smallest one),
# considering only ids whose sample-type code is listed in tumor_sample_ids.
sample_id_dict = {}
for sample_id in lusc_subtype_info_df["sample_id"].values:
    if sample_id.split("-")[3][:2] not in tumor_sample_ids:
        continue
    sample_id_first_15 = sample_id[:15]
    if sample_id_first_15 not in sample_id_dict or sample_id < sample_id_dict[sample_id_first_15]:
        sample_id_dict[sample_id_first_15] = sample_id

# Hashed membership test instead of rebuilding and scanning a list for every row;
# .copy() makes the filtered frame independent so the column assignment below is unambiguous.
lusc_subtype_info_df = lusc_subtype_info_df[lusc_subtype_info_df["sample_id"].isin(set(sample_id_dict.values()))].copy()
lusc_subtype_info_df["sample_id"] = lusc_subtype_info_df["sample_id"].str[:15]

# ov: keep id and subtype, lower-case the labels.
ov_subtype_info_file_name = "ov_subtype_info.tsv"
ov_subtype_info_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, ov_subtype_info_file_name), sep="\t")
    [["ID", "SUBTYPE"]]
    .rename(columns={"ID": "sample_id", "SUBTYPE": "subtype"})
    .assign(subtype=lambda frame: frame["subtype"].apply(lambda subtype: subtype.lower()))
)

# blca, lusc, ov
assert blca_subtype_info_df.columns.tolist() == lusc_subtype_info_df.columns.tolist(), "Columns of BLCA Subtype dataframe should be equal to columns of LUSC Subtype dataframe"
assert lusc_subtype_info_df.columns.tolist() == ov_subtype_info_df.columns.tolist(), "Columns of LUSC Subtype dataframe should be equal to columns of OV Subtype dataframe"

subtype_info_df = pd.concat([blca_subtype_info_df, lusc_subtype_info_df, ov_subtype_info_df], axis=0)
# dtype=int pins the indicator columns to 0/1; pandas >= 2.0 would otherwise produce booleans,
# which are written to the TSV as True/False.
subtype_info_df = pd.get_dummies(subtype_info_df, columns=["subtype"], dtype=int)
save_processed_tsv_files(df=subtype_info_df, data_type="subtype", cancer_type_sample_ids_mapping=cancer_type_sample_ids_mapping)

subtype_info_df


Unnamed: 0,sample_id,subtype_basal,subtype_classical,subtype_differentiated,subtype_immunoreactive,subtype_mesenchymal,subtype_non-papillary,subtype_papillary,subtype_primitive,subtype_proliferative,subtype_secretory
0,TCGA-2F-A9KO-01,0,0,0,0,0,1,0,0,0,0
1,TCGA-2F-A9KP-01,0,0,0,0,0,1,0,0,0,0
2,TCGA-2F-A9KQ-01,0,0,0,0,0,1,0,0,0,0
3,TCGA-2F-A9KR-01,0,0,0,0,0,1,0,0,0,0
4,TCGA-2F-A9KT-01,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
484,TCGA-61-2009-01,0,0,0,0,1,0,0,0,0,0
485,TCGA-13-0724-01,0,0,1,0,0,0,0,0,0,0
486,TCGA-24-1928-01,0,0,0,0,1,0,0,0,0,0
487,TCGA-31-1953-01,0,0,0,1,0,0,0,0,0,0
