In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs (edits to project code apply without a kernel restart).
%load_ext autoreload
%autoreload 2


In [2]:
import os
import numpy as np
import pandas as pd


In [3]:
# Folder layout used by every cell below: inputs are read from
# <data_dir>/<raw_folder_name>, outputs are written to <data_dir>/<processed_folder_name>.
data_dir = "../data"
raw_folder_name = "raw"
processed_folder_name = "processed"

# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.join(data_dir, processed_folder_name), exist_ok=True)


In [4]:
# Two-character codes "01" .. "09". Later cells compare them against the sample-type
# part of a sample ID (presumably the TCGA tumor sample-type codes - TODO confirm).
tumor_sample_ids = [str(code).zfill(2) for code in range(1, 10)]


# Process Cancer Type Data

In [5]:
cancer_type_file_name = "TCGA_phenotype_denseDataOnlyDownload.tsv"
cancer_type_full_name_to_abbrreviation_mapping_file_name = "cancer_type_full_name_to_abbrreviation_mapping.tsv"

# Two-column TSV turned into a {full disease name: abbreviation} dict.
cancer_type_full_name_to_abbrreviation_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, cancer_type_full_name_to_abbrreviation_mapping_file_name),
        sep="\t",
    ).values
)

# Keep only the sample ID and the primary disease. The explicit .copy() makes the
# column assignment below operate on an independent frame (no SettingWithCopyWarning).
cancer_type_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_type_file_name), sep="\t")
    .rename(columns={"sample": "sample_id", "_primary_disease": "cancer_type"})
    .loc[:, ["sample_id", "cancer_type"]]
    .copy()
)

# Fail with the full list of offending names instead of an opaque KeyError raised
# from inside a lambda on the first unmapped (or missing) disease name.
unmapped_cancer_types = set(cancer_type_df["cancer_type"]) - set(cancer_type_full_name_to_abbrreviation_mapping)
if unmapped_cancer_types:
    raise KeyError(
        "No abbreviation found for cancer type(s): {}".format(sorted(map(str, unmapped_cancer_types)))
    )
cancer_type_df["cancer_type"] = cancer_type_df["cancer_type"].map(cancer_type_full_name_to_abbrreviation_mapping)

# One indicator column per abbreviation. dtype is pinned so the exported table keeps
# 0/1 values on every pandas version (the default became bool in pandas 2.0).
cancer_type_one_hot_df = pd.get_dummies(data=cancer_type_df, columns=["cancer_type"], dtype="uint8")

display(cancer_type_df)
display(cancer_type_one_hot_df)


Unnamed: 0,sample_id,cancer_type
0,TCGA-D3-A1QA-07,skcm
1,TCGA-DE-A4MD-06,thcm
2,TCGA-J8-A3O2-06,thcm
3,TCGA-J8-A3YH-06,thcm
4,TCGA-EM-A2P1-06,thcm
...,...,...
12799,TCGA-17-Z059-01,luad
12800,TCGA-17-Z060-01,luad
12801,TCGA-17-Z061-01,luad
12802,TCGA-17-Z062-01,luad


Unnamed: 0,sample_id,cancer_type_acc,cancer_type_blca,cancer_type_brca,cancer_type_cesc,cancer_type_chol,cancer_type_coad,cancer_type_dlbc,cancer_type_esca,cancer_type_gbm,...,cancer_type_read,cancer_type_sarc,cancer_type_skcm,cancer_type_stad,cancer_type_tgct,cancer_type_thcm,cancer_type_thym,cancer_type_ucec,cancer_type_ucs,cancer_type_uvm
0,TCGA-D3-A1QA-07,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,TCGA-DE-A4MD-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,TCGA-J8-A3O2-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,TCGA-J8-A3YH-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,TCGA-EM-A2P1-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12799,TCGA-17-Z059-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12800,TCGA-17-Z060-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12801,TCGA-17-Z061-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12802,TCGA-17-Z062-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Process Tumor Purity Data

In [6]:
tumor_purity_cpe_file_name = "tumor_purity.csv"
tumor_purity_estimate_file_name = "tumor_purity_ESTIMATE.csv"

tumor_purity_cpe_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_cpe_file_name))
tumor_purity_estimate_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_estimate_file_name))


def _parse_decimal_comma(value):
    """Convert a purity value written with a decimal comma (e.g. "0,92") to float.

    Going through str() also accepts values pandas already parsed as numbers.
    """
    return float(str(value).replace(",", "."))


# First table: take the first non-missing estimate per sample, in the order
# CPE, then ABSOLUTE, then ESTIMATE.
tumor_sample_id_purity_mapping = {}
for _, row in tumor_purity_cpe_df.iterrows():
    for purity_column in ("CPE", "ABSOLUTE", "ESTIMATE"):
        purity = row[purity_column]
        if not pd.isnull(purity):
            tumor_sample_id_purity_mapping[row["Sample.ID"]] = _parse_decimal_comma(purity)
            break

# Second table: every non-missing TumorPurity is stored under its NAME.
# NOTE(review): an ID present in both tables ends up with the value from this second
# table, overriding the CPE/ABSOLUTE/ESTIMATE choice above - confirm this is intended.
for sample_id, purity_estimate in zip(tumor_purity_estimate_df["NAME"], tumor_purity_estimate_df["TumorPurity"]):
    if not pd.isnull(purity_estimate):
        tumor_sample_id_purity_mapping[sample_id] = purity_estimate

tumor_purity_df = pd.DataFrame({"sample_id": list(tumor_sample_id_purity_mapping.keys()),
                                "purity": list(tumor_sample_id_purity_mapping.values())})

# Keep one full ID per 15-character prefix: only IDs whose 4th dash-separated field
# starts with one of tumor_sample_ids are considered, and among those sharing a prefix
# the lexicographically smallest full ID wins.
sample_id_dict = {}
for sample_id in tumor_purity_df["sample_id"]:
    if sample_id.split("-")[3][:2] not in tumor_sample_ids:
        continue
    sample_id_first_15 = sample_id[:15]
    if sample_id_first_15 not in sample_id_dict or sample_id < sample_id_dict[sample_id_first_15]:
        sample_id_dict[sample_id_first_15] = sample_id

# Set-based membership replaces rebuilding list(sample_id_dict.values()) for every row;
# .copy() makes the column assignment below act on an independent frame.
selected_sample_ids = set(sample_id_dict.values())
tumor_purity_df = tumor_purity_df[tumor_purity_df["sample_id"].isin(selected_sample_ids)].copy()
tumor_purity_df["sample_id"] = tumor_purity_df["sample_id"].str[:15]

tumor_purity_df


Unnamed: 0,sample_id,purity
0,TCGA-OR-A5J1-01,0.924600
1,TCGA-OR-A5J2-01,0.898500
2,TCGA-OR-A5J3-01,0.946600
3,TCGA-OR-A5J4-01,0.866000
4,TCGA-OR-A5J5-01,0.978000
...,...,...
10800,TCGA-XU-A92Z-01,0.804001
10801,TCGA-X7-A8D8-01,0.752956
10802,TCGA-XU-A92O-01,0.808031
10803,TCGA-X7-A8M8-01,0.731812


# Process CNA Data

In [7]:
cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
hgnc_symbol_to_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"
# Two-column TSV turned into a {first column: second column} dict; it is looked up
# with the gene-symbol part of the CNA "Sample" label below.
hgnc_symbol_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, hgnc_symbol_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# Only the gene-identifier column of the expression matrix is needed here, so load that
# single column and release the frame right away.
gex_file_name = "tcga_gene_expected_count"
gex_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, gex_file_name), sep="\t", usecols=["sample"])
gex_ensembl_ids = frozenset(gex_df["sample"].tolist())
del gex_df

cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_file_name), sep="\t")

# "Sample" holds either "SYMBOL" or "SYMBOL|ENSEMBL_ID". Split once for the whole column
# instead of writing cell by cell inside an iterrows loop.
sample_parts = cna_df["Sample"].str.split("|")
if (sample_parts.str.len() > 2).any():
    raise Exception("sample_splitted has more than 1 '|'s")
cna_df["ensembl_id"] = sample_parts.str[1]  # NaN when the label has no "|ENSEMBL_ID" part
cna_df["entrezgene_id"] = sample_parts.str[0].map(hgnc_symbol_to_entrezgene_id_mapping)  # NaN when unmapped

# Drop rows without an Entrez ID; .copy() so the assignments below do not target a view.
cna_df = cna_df[cna_df["entrezgene_id"].notna()].copy()
cna_df["entrezgene_id"] = cna_df["entrezgene_id"].astype(int)

# 1 when the row's Ensembl ID is absent from the expression matrix (missing IDs count as absent).
cna_df["ensemble_id_is_not_in_gex_ensemble_ids"] = (~cna_df["ensembl_id"].isin(gex_ensembl_ids)).astype(int)

def get_ensembl_version(ensembl_id):
    """Return the integer that follows the last "." of ``ensembl_id``.

    Missing IDs (``None`` / NaN) yield -1. An ID whose trailing part is not an
    integer raises ``ValueError`` from ``int``.
    """
    if pd.isnull(ensembl_id):
        return -1
    _, _, version = ensembl_id.rpartition(".")
    return int(version)

# Numeric version suffix of every Ensembl ID (-1 where the ID is missing).
cna_df["ensembl_version"] = cna_df["ensembl_id"].apply(get_ensembl_version)

def select_one_row_per_entrezgene_id(x):
    """Return the first row of ``x`` after ordering it by preference.

    Rows are sorted ascending by ``ensemble_id_is_not_in_gex_ensemble_ids`` and then by
    ``ensembl_version``, so the smallest flag value wins and, among equal flags, the
    smallest version wins.
    """
    ranked = x.sort_values(
        by=["ensemble_id_is_not_in_gex_ensemble_ids", "ensembl_version"],
        ascending=True,
    )
    return ranked.iloc[0]

# Collapse to a single row per Entrez gene ID.
cna_df = cna_df.groupby("entrezgene_id").apply(select_one_row_per_entrezgene_id)
# The selected rows normally still carry their own "entrezgene_id" column, in which case
# re-inserting the group index raises "cannot insert entrezgene_id, already exists".
# Drop the index when the column is present; keep it only if pandas excluded the
# grouping column from the frames handed to the function.
cna_df = cna_df.reset_index(drop="entrezgene_id" in cna_df.columns)

cna_df


In [10]:
def select_one_row_per_entrezgene_id(x):
    """Pick the preferred row of one Entrez-gene group.

    Preference order:
      1. rows whose Ensembl ID is in the expression matrix - highest version wins;
      2. otherwise rows with a missing Ensembl ID - the first one;
      3. otherwise the remaining rows - highest version wins.

    Membership is read from the boolean column ``ensemble_id_is_in_gex_ensemble_ids``
    when it exists; otherwise it is derived from the 0/1 column
    ``ensemble_id_is_not_in_gex_ensemble_ids``, which is the column this notebook
    actually creates (reading only the former raised ``KeyError``).

    NOTE(review): this redefines a function of the same name declared in an earlier
    cell that prefers the *lowest* version - confirm which rule is intended.
    """

    def _row_with_highest_version(rows):
        # The version is the integer after the last "." of the Ensembl ID.
        versions = [int(ensembl_id.split(".")[-1]) for ensembl_id in rows["ensembl_id"]]
        return rows.iloc[int(np.argmax(versions)), :]

    if "ensemble_id_is_in_gex_ensemble_ids" in x.columns:
        x_in = x[x["ensemble_id_is_in_gex_ensemble_ids"] == True]
        x_out = x[x["ensemble_id_is_in_gex_ensemble_ids"] == False]
    else:
        x_in = x[x["ensemble_id_is_not_in_gex_ensemble_ids"] == 0]
        x_out = x[x["ensemble_id_is_not_in_gex_ensemble_ids"] != 0]

    if x_in.shape[0] > 0:
        return _row_with_highest_version(x_in)

    x_out_nan = x_out[pd.isnull(x_out["ensembl_id"])]
    if x_out_nan.shape[0] > 0:
        return x_out_nan.iloc[0, :]

    return _row_with_highest_version(x_out[~pd.isnull(x_out["ensembl_id"])])

# Preview only: the per-gene selection is displayed, cna_df itself is not reassigned.
cna_df.groupby("entrezgene_id").apply(select_one_row_per_entrezgene_id)



# Process GEX Data

In [None]:
gex_file_name = "tcga_gene_expected_count"
ensembl_id_to_entrezgene_id_mapping_file_name = "ensembl_id_to_entrezgene_id_mapping.tsv"

# Two-column TSV turned into an {Ensembl ID: Entrez gene ID} dict.
ensembl_id_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, ensembl_id_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# Load the expression matrix in this cell: no frame named gex_df survives the CNA cell
# (it ends with `del gex_df`), so the cell would otherwise fail on a fresh kernel. The
# gene-identifier column is called "sample" in the file; rename it to the name used below.
gex_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, gex_file_name), sep="\t")
gex_df = gex_df.rename(columns={"sample": "ensembl_id"})

# Keep the gene column plus the columns whose last dash-separated field is in tumor_sample_ids.
tumor_sample_columns = [column for column in gex_df.columns if column.split("-")[-1] in tumor_sample_ids]
gex_df = gex_df[["ensembl_id"] + tumor_sample_columns].copy()

# Strip the ".<version>" suffix, keep only mappable genes and switch to Entrez gene IDs.
gex_df["ensembl_id"] = gex_df["ensembl_id"].str.split(".").str[0]
gex_df = gex_df[gex_df["ensembl_id"].isin(list(ensembl_id_to_entrezgene_id_mapping))].copy()
gex_df["entrezgene_id"] = gex_df["ensembl_id"].map(ensembl_id_to_entrezgene_id_mapping)
# NOTE(review): if several Ensembl IDs map to one Entrez ID the transposed frame gets
# duplicate column labels - verify against the mapping file.

# Transpose to one row per sample / one column per gene, and expose the sample ID as a column.
gex_df = gex_df.drop(columns=["ensembl_id"]).set_index("entrezgene_id").T
gex_df["sample_id"] = gex_df.index.tolist()
gex_df.index = np.arange(gex_df.shape[0])
gex_df = gex_df.rename_axis(None, axis=1)


# Process CNA Data

In [5]:
cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
hgnc_symbol_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"

# Two-column mapping table -> {first column: second column}.
hgnc_symbol_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, hgnc_symbol_entrezgene_id_mapping_file_name),
        sep="\t",
    ).to_numpy()
)
# Raw copy-number table; its gene labels live in the "Sample" column.
cna_df = pd.read_csv(
    os.path.join(data_dir, raw_folder_name, cna_file_name),
    sep="\t",
)

def select_the_row_with_max_expression_per_gene(x):
    """Return the row of ``x`` whose values add up to the largest total.

    Every column takes part in the sum; when several rows tie, the first one is returned.
    """
    row_totals = x.to_numpy().sum(axis=1)
    best_position = np.argmax(row_totals)
    return x.iloc[best_position]

# "Sample" holds either "SYMBOL" or "SYMBOL|ENSEMBL_ID". Resolve the symbol part to an
# Entrez gene ID for the whole column at once; symbols missing from the mapping become NaN.
# (The Ensembl part is not needed in this cell, so it is no longer stored and re-dropped.)
gene_symbol_parts = cna_df["Sample"].str.split("|")
if (gene_symbol_parts.str.len() > 2).any():
    raise Exception("gene_symbol_splitted has more than 1 '|'s")
cna_df["entrezgene_id"] = gene_symbol_parts.str[0].map(hgnc_symbol_entrezgene_id_mapping)

cna_df = cna_df.drop(columns=["Sample"])
cna_df = cna_df[cna_df["entrezgene_id"].notna()]
cna_df = cna_df.reset_index(drop=True)
# Several rows can resolve to the same Entrez ID: keep the one with the largest row total.
cna_df = cna_df.groupby("entrezgene_id").apply(select_the_row_with_max_expression_per_gene).reset_index(drop=True)

# Transpose to one row per sample, with integer Entrez gene IDs as column labels.
cna_df = cna_df.T
cna_df.columns = cna_df.loc["entrezgene_id", :]
cna_df = cna_df.drop("entrezgene_id", axis=0)
cna_df.columns = [int(column) for column in cna_df.columns]
cna_df = cna_df.reset_index(drop=False)
cna_df = cna_df.rename(columns={"index": "sample_id"})
cna_df


Unnamed: 0,sample_id,1,2,3,9,10,12,13,14,15,...,124907803,124907805,124907806,124907808,124907837,124907841,124907928,124907948,124908556,124908558
0,TCGA-A5-A0GI-01,-0.001,0.002,0.002,-0.001,-0.001,0.004,0.000,0.000,0.004,...,0.006,0.006,0.006,0.006,0.006,0.006,0.004,0.004,0.030,0.030
1,TCGA-S9-A7J2-01,-0.849,0.007,0.007,-0.011,-0.011,0.014,-0.002,-0.007,0.068,...,0.032,0.032,0.032,0.032,0.032,0.032,0.068,0.068,-0.062,-0.062
2,TCGA-06-0150-01,0.601,-0.010,-0.010,-0.010,-0.010,-0.008,0.023,0.006,0.015,...,-0.009,-0.009,-0.009,-0.009,-0.009,-0.009,0.015,0.015,-0.069,-0.069
3,TCGA-AR-A1AH-01,-0.727,0.037,0.037,0.222,0.222,-0.755,-0.751,0.018,0.071,...,-0.745,-0.742,-0.745,-0.742,-0.186,-0.186,-0.761,0.071,-0.233,-0.233
4,TCGA-EK-A2RE-01,-0.031,-0.007,-0.007,-0.001,-0.001,-0.012,0.031,0.000,-0.010,...,-0.014,-0.014,-0.014,-0.014,-0.014,-0.014,-0.024,-0.010,-0.205,-0.205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10840,TCGA-IB-7885-01,-0.126,-0.069,-0.069,0.000,0.000,0.009,-0.054,0.010,-0.039,...,0.007,0.007,0.007,0.007,0.007,0.007,-0.039,-0.039,-0.038,-0.038
10841,TCGA-95-7947-01,-0.064,-0.604,-0.604,-0.645,-0.645,0.005,0.613,0.006,-0.042,...,-0.630,-0.630,-0.630,-0.630,0.454,0.454,-0.050,-0.042,-0.037,-0.037
10842,TCGA-VQ-AA6F-01,-0.034,0.016,0.016,-0.156,-0.156,-0.027,0.449,0.020,0.204,...,-0.199,-0.199,-0.199,-0.199,0.338,0.338,-0.215,0.204,0.006,0.006
10843,TCGA-BR-8588-01,0.002,-0.001,-0.001,0.000,0.000,-0.002,0.105,0.001,0.004,...,0.000,0.000,0.000,0.000,0.000,0.000,0.004,0.004,0.049,0.049


# Find intersecting sample IDs and columns

In [8]:
# Sample IDs available in each processed table.
gex_sample_ids = set(gex_df["sample_id"])
cna_sample_ids = set(cna_df["sample_id"])
tumor_purity_sample_ids = set(tumor_purity_df["sample_id"])
cancer_type_sample_ids = set(cancer_type_df["sample_id"])
# Samples present in all four tables.
intersecting_sample_ids = gex_sample_ids & cna_sample_ids & tumor_purity_sample_ids & cancer_type_sample_ids

# Gene columns shared by the expression and copy-number tables, in sorted order,
# with "sample_id" kept as the leading column.
gex_gene_ids = set(gex_df.columns.drop("sample_id"))
cna_gene_ids = set(cna_df.columns.drop("sample_id"))
intersecting_columns = ["sample_id"] + sorted(gex_gene_ids & cna_gene_ids)


# Save and delete processed data so far

In [11]:
# Each table is restricted to the shared samples, ordered by sample ID, written to the
# processed folder as TSV and then deleted from the namespace.
# NOTE(review): because of the `del` statements this cell can only run once per kernel
# session; the GEX export below is currently disabled.
# gex_df = gex_df[gex_df["sample_id"].apply(lambda x: x in intersecting_sample_ids)][intersecting_columns]
# gex_df = gex_df.sort_values(by="sample_id")
# gex_df.to_csv(os.path.join(data_dir, processed_folder_name, "gex.tsv"), sep="\t", index=False)
# print("gex_df.shape:", gex_df.shape)
# del gex_df

# Copy number: shared samples and shared gene columns only.
cna_df = (
    cna_df.loc[cna_df["sample_id"].isin(intersecting_sample_ids), intersecting_columns]
    .sort_values(by="sample_id")
)
cna_df.to_csv(os.path.join(data_dir, processed_folder_name, "cna.tsv"), sep="\t", index=False)
print("cna_df.shape:", cna_df.shape)
del cna_df

# Tumor purity.
tumor_purity_df = (
    tumor_purity_df[tumor_purity_df["sample_id"].isin(intersecting_sample_ids)]
    .sort_values(by="sample_id")
)
tumor_purity_df.to_csv(os.path.join(data_dir, processed_folder_name, "tumor_purity.tsv"), sep="\t", index=False)
del tumor_purity_df

# Cancer type labels.
cancer_type_df = (
    cancer_type_df[cancer_type_df["sample_id"].isin(intersecting_sample_ids)]
    .sort_values(by="sample_id")
)
cancer_type_df.to_csv(os.path.join(data_dir, processed_folder_name, "cancer_type.tsv"), sep="\t", index=False)
del cancer_type_df

# One-hot encoded cancer type labels.
cancer_type_one_hot_df = (
    cancer_type_one_hot_df[cancer_type_one_hot_df["sample_id"].isin(intersecting_sample_ids)]
    .sort_values(by="sample_id")
)
cancer_type_one_hot_df.to_csv(os.path.join(data_dir, processed_folder_name, "cancer_type_one_hot.tsv"), sep="\t", index=False)
del cancer_type_one_hot_df


NameError: name 'cna_df' is not defined

# Process Thresholded CNA Data

In [10]:
thresholded_cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes"

hgnc_symbol_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"

# Two-column mapping table -> {first column: second column}.
hgnc_symbol_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, hgnc_symbol_entrezgene_id_mapping_file_name),
        sep="\t",
    ).to_numpy()
)
# Raw thresholded copy-number table; its gene labels live in the "Sample" column.
thresholded_cna_df = pd.read_csv(
    os.path.join(data_dir, raw_folder_name, thresholded_cna_file_name),
    sep="\t",
)

def select_the_row_with_max_expression_per_gene(x):
    """Return the row of ``x`` whose values add up to the largest total.

    Every column takes part in the sum; when several rows tie, the first one is returned.
    """
    row_totals = x.to_numpy().sum(axis=1)
    best_position = np.argmax(row_totals)
    return x.iloc[best_position]

# "Sample" holds either "SYMBOL" or "SYMBOL|ENSEMBL_ID". Resolve the symbol part to an
# Entrez gene ID for the whole column at once (the former per-row loop was slow; the
# recorded run of this cell ended in KeyboardInterrupt). Unmapped symbols become NaN.
gene_symbol_parts = thresholded_cna_df["Sample"].str.split("|")
if (gene_symbol_parts.str.len() > 2).any():
    raise Exception("gene_symbol_splitted has more than 1 '|'s")
thresholded_cna_df["entrezgene_id"] = gene_symbol_parts.str[0].map(hgnc_symbol_entrezgene_id_mapping)

thresholded_cna_df = thresholded_cna_df.drop(columns=["Sample"])
thresholded_cna_df = thresholded_cna_df[thresholded_cna_df["entrezgene_id"].notna()]
thresholded_cna_df = thresholded_cna_df.reset_index(drop=True)
# Several rows can resolve to the same Entrez ID: keep the one with the largest row total.
thresholded_cna_df = thresholded_cna_df.groupby("entrezgene_id").apply(select_the_row_with_max_expression_per_gene).reset_index(drop=True)

# Transpose to one row per sample, with integer Entrez gene IDs as column labels.
thresholded_cna_df = thresholded_cna_df.T
thresholded_cna_df.columns = thresholded_cna_df.loc["entrezgene_id", :]
thresholded_cna_df = thresholded_cna_df.drop("entrezgene_id", axis=0)
thresholded_cna_df.columns = [int(column) for column in thresholded_cna_df.columns]
thresholded_cna_df = thresholded_cna_df.reset_index(drop=False)
thresholded_cna_df = thresholded_cna_df.rename(columns={"index": "sample_id"})

# Restrict to the shared samples / gene columns, order by sample ID and export once
# (the table used to be written to the same path twice).
thresholded_cna_df = thresholded_cna_df.loc[
    thresholded_cna_df["sample_id"].isin(intersecting_sample_ids), intersecting_columns
]
thresholded_cna_df = thresholded_cna_df.sort_values(by="sample_id")
thresholded_cna_df.to_csv(os.path.join(data_dir, processed_folder_name, "thresholded_cna.tsv"), sep="\t", index=False)
print("thresholded_cna_df.shape:", thresholded_cna_df.shape)

del thresholded_cna_df



KeyboardInterrupt: 

# Process RPPA Data

In [None]:
rppa_file_name = "TCGA-RPPA-pancan-clean.xena"

# Read the table, use the "SampleID" column as (unnamed) row labels and transpose, so the
# former column headers become a "sample_id" column. Columns containing any missing
# value are dropped afterwards.
rppa_df = (
    pd.read_csv(os.path.join(data_dir, "raw", rppa_file_name), sep="\t")
    .set_index("SampleID")
    .rename_axis(None)
    .T
    .reset_index(drop=False)
    .rename(columns={"index": "sample_id"})
    .dropna(axis=1)
)

# Keep the samples shared with the other tables, ordered by sample ID, and export.
rppa_df = rppa_df[rppa_df["sample_id"].isin(intersecting_sample_ids)].sort_values(by="sample_id")
rppa_df.to_csv(os.path.join(data_dir, processed_folder_name, "rppa.tsv"), sep="\t", index=False)

display(rppa_df)

del rppa_df


Unnamed: 0,sample_id,X1433EPSILON,X4EBP1,X4EBP1_pS65,X4EBP1_pT37T46,X53BP1,ACC_pS79,ACC1,AKT,AKT_pS473,...,CHK1_pS296,COG3,DUSP4,ERCC5,IGF1R_pY1135Y1136,IRF1,JAK2,P16INK4A,SHP2_pY542,PDL1
1621,TCGA-02-2485-01,0.273230,-0.52810,0.199410,1.811200,-0.302440,0.265890,0.068594,1.198000,2.925900,...,-0.156340,-0.240520,0.15881,-0.120690,0.125260,-0.437870,-0.45857,1.47190,1.282800,0.458870
708,TCGA-04-1338-01,0.188070,-0.34841,-0.320030,-0.856150,-0.289130,-1.270400,-1.186400,-0.176120,-0.191820,...,0.060738,-0.398270,-0.32067,-0.829740,0.159060,0.035438,-0.38887,-0.24471,-0.075832,0.036075
718,TCGA-04-1341-01,0.056274,-0.94781,-0.297190,-0.069594,-0.191210,0.078739,-0.036084,-0.203400,0.057105,...,0.012515,0.040846,-0.87482,0.402570,0.117780,0.417980,-0.15509,1.56690,0.138620,-0.030532
659,TCGA-04-1343-01,-0.130960,-0.18465,-0.326880,0.448450,-0.350830,-0.132710,-0.109820,-0.597660,0.445890,...,0.058369,-0.312680,-0.14074,-0.361830,0.112070,-0.060811,-0.15761,1.49700,0.000342,0.050266
669,TCGA-04-1348-01,-0.026592,0.17374,-0.133330,0.081989,0.343650,-0.280570,-0.375430,0.032848,-1.202900,...,0.072442,-0.128150,-0.44281,0.256900,-0.009775,0.193460,-0.10340,1.84010,-0.394400,0.097303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3324,TCGA-ZJ-AB0I-01,-0.148030,0.42838,0.647190,1.223700,-0.345400,0.571170,1.000400,-0.507920,-0.775280,...,0.145890,-0.258880,-0.17974,0.448300,0.053071,0.355560,0.10849,1.41600,0.413530,0.275810
3702,TCGA-ZL-A9V6-01,0.038334,-0.43030,-0.306760,0.268230,0.072804,-0.849420,-0.917870,-0.521210,0.040304,...,-0.110140,0.023475,-0.42487,0.479810,0.252280,0.348400,0.48670,-0.74474,0.576300,0.510680
3591,TCGA-ZN-A9VQ-01,0.061736,0.18937,0.511320,0.117930,0.463680,-0.383600,-0.134740,-0.228820,-1.327800,...,0.180530,0.139770,-0.68690,0.029791,0.295170,0.220100,0.23418,-1.73570,-0.402560,0.189090
3592,TCGA-ZN-A9VV-01,0.030460,-0.13645,-0.044024,0.048419,1.200000,-0.043118,-0.432840,-0.011013,-0.913920,...,0.092824,0.335980,-0.59654,0.095181,0.275380,0.235230,0.38030,0.60712,0.218670,0.124220
