In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Quick-iteration switch. When True, outputs are written to the "development"
# folder instead of "processed", the CNA and GEX tables are read with nrows=1000,
# and intermediate frames are printed after each processing stage.
development = False


# In[3]:


import os
import numpy as np
import pandas as pd
import swifter


# In[4]:


# Input/output locations, relative to the working directory of the notebook.
data_dir = "../data"
raw_folder_name = "raw"
# Development runs get their own output folder, separate from full runs.
processed_folder_name = "development" if development else "processed"

os.makedirs(os.path.join(data_dir, processed_folder_name), exist_ok=True)


# In[5]:


# Two-character codes "01" .. "09"; sample IDs / column names whose trailing code
# is in this list are kept as tumor samples further down.
# NOTE(review): presumably these are the TCGA tumor sample-type codes — confirm.
tumor_sample_ids = ["0%d" % code for code in range(1, 10)]


# # Process RPPA Data

# In[6]:

print("Processing RPPA data...")
rppa_file_name = "TCGA-RPPA-pancan-clean.xena"

# The raw table is keyed by its "SampleID" column. It is transposed so that the
# former column headers become rows identified by a "sample_id" column.
# The path uses raw_folder_name (rather than a literal "raw") so that every
# loader in this notebook honours the same configuration value.
rppa_df = (
    pd.read_csv(os.path.join(data_dir, raw_folder_name, rppa_file_name), sep="\t")
    .set_index("SampleID")
    # An unnamed index makes reset_index() below emit a column called "index".
    .rename_axis(None)
    .T
    .reset_index(drop=False)
    .rename(columns={"index": "sample_id"})
    # Keep only the columns that have a value in every row.
    .dropna(axis=1)
)

if development:
    print(rppa_df)

print("Processed RPPA data.")

# # Process Thresholded and Unthresholded CNA Data

# In[7]:

print("Processing CNA data...")
thresholded_cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes"
unthresholded_cna_file_name = "TCGA.PANCAN.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"

# HGNC symbol -> Entrez gene ID lookup.
# NOTE(review): dict(<frame>.values) assumes the TSV has exactly two columns
# (key, value) — confirm against the raw file.
hgnc_symbol_to_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"
hgnc_symbol_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, hgnc_symbol_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# Only the identifier column ("sample") of the GEX file is read here; the
# resulting set is used by process_cna_df to prefer CNA rows whose Ensembl ID
# also occurs in the GEX table. The path goes through raw_folder_name so it
# stays consistent with the other loaders.
gex_file_name = "tcga_gene_expected_count"
gex_ensembl_ids = frozenset(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, gex_file_name),
        sep="\t",
        usecols=["sample"],
    )["sample"].tolist()
)

def process_cna_df(cna_file_name):
    """Load a gene-level CNA table and reshape it to one row per sample.

    The raw, tab-separated file is expected to have a "Sample" column whose
    entries are either "<HGNC symbol>" or "<HGNC symbol>|<Ensembl ID>"; every
    other column is kept as data and becomes a row of the result.

    Processing steps:
      1. Map each HGNC symbol to an Entrez gene ID and drop unmapped rows.
      2. When several rows map to the same Entrez gene ID, keep a single row,
         preferring rows whose Ensembl ID occurs in ``gex_ensembl_ids`` and,
         among those, the lowest Ensembl version suffix.
      3. Transpose so that Entrez gene IDs become columns and the former
         column headers are exposed in a "sample_id" column.

    Reads the module-level names ``development``, ``data_dir``,
    ``raw_folder_name``, ``hgnc_symbol_to_entrezgene_id_mapping`` and
    ``gex_ensembl_ids``.

    Args:
        cna_file_name: File name of the CNA table inside the raw data folder.

    Returns:
        pandas.DataFrame with a "sample_id" column followed by one column per
        Entrez gene ID (ascending).

    Raises:
        Exception: if a "Sample" entry contains more than one "|".
    """
    cna_path = os.path.join(data_dir, raw_folder_name, cna_file_name)
    # Development mode only reads the first 1000 rows; nrows=None reads everything.
    cna_df = pd.read_csv(cna_path, sep="\t", nrows=1000 if development else None)

    # Parse the "Sample" column once, column-wise. Iterating the column directly
    # avoids iterrows() + per-cell .at writes, which materialise a full row
    # Series (one entry per data column) for every gene.
    ensembl_ids = []
    entrezgene_ids = []
    for sample in cna_df["Sample"]:
        sample_splitted = sample.split("|")
        if len(sample_splitted) > 2:
            raise Exception("sample_splitted has more than 1 '|'s")
        ensembl_ids.append(sample_splitted[1] if len(sample_splitted) == 2 else "")
        # np.nan (lower case): the np.NaN alias no longer exists in NumPy 2.0.
        entrezgene_ids.append(hgnc_symbol_to_entrezgene_id_mapping.get(sample_splitted[0], np.nan))
    cna_df["ensembl_id"] = ensembl_ids
    cna_df["entrezgene_id"] = entrezgene_ids

    # .copy() so the column assignments below act on an independent frame and
    # not on a view of the unfiltered one (avoids SettingWithCopyWarning).
    cna_df = cna_df[cna_df["entrezgene_id"].notna()].copy()
    cna_df["entrezgene_id"] = cna_df["entrezgene_id"].astype(int)

    # 0 when the Ensembl ID is present in the GEX table, 1 otherwise, so that an
    # ascending sort puts GEX-matching rows first.
    cna_df["ensembl_id_is_not_in_gex_ensembl_ids"] = (~cna_df["ensembl_id"].isin(gex_ensembl_ids)).astype(int)

    def get_ensembl_version(ensembl_id):
        # Rows without an Ensembl ID get the sentinel version -1.
        if pd.isnull(ensembl_id) or ensembl_id == "":
            return -1
        return int(ensembl_id.split(".")[-1])

    cna_df["ensembl_version"] = cna_df["ensembl_id"].map(get_ensembl_version)

    # One row per Entrez gene ID: sort by the gene ID and then by the two
    # preference keys, and keep the first row of each gene. This selects the
    # same row as sorting inside each group and taking its first element, but
    # without a Python callback per group.
    cna_df = (
        cna_df
        .sort_values(by=["entrezgene_id", "ensembl_id_is_not_in_gex_ensembl_ids", "ensembl_version"], ascending=True)
        .drop_duplicates(subset="entrezgene_id", keep="first")
        .reset_index(drop=True)
    )

    cna_df = (
        cna_df
        .drop(columns=["Sample", "ensembl_id", "ensembl_id_is_not_in_gex_ensembl_ids", "ensembl_version"])
        .set_index("entrezgene_id")
        .T
        .reset_index(drop=False)
        # Clear the "entrezgene_id" label that set_index left on the column axis.
        .rename_axis(None, axis=1)
        .rename(columns={"index": "sample_id"})
    )

    return cna_df

# Both CNA tables go through the identical pipeline; thresholded first.
thresholded_cna_df = process_cna_df(cna_file_name=thresholded_cna_file_name)
unthresholded_cna_df = process_cna_df(cna_file_name=unthresholded_cna_file_name)

if development:
    for cna_preview_df in (thresholded_cna_df, unthresholded_cna_df):
        print(cna_preview_df)

print("Processed CNA data.")

# # Process Cancer Type Data

# In[8]:

print("Processing Cancer Type data...")

cancer_type_file_name = "TCGA_phenotype_denseDataOnlyDownload.tsv"
cancer_type_full_name_to_abbreviation_mapping_file_name = "cancer_type_full_name_to_abbrreviation_mapping.tsv"

# Full disease name -> abbreviation lookup.
# NOTE(review): dict(<frame>.values) assumes the TSV has exactly two columns — confirm.
cancer_type_full_name_to_abbreviation_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, cancer_type_full_name_to_abbreviation_mapping_file_name),
        sep="\t",
    ).values
)

cancer_type_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_type_file_name), sep="\t")
cancer_type_df = cancer_type_df.rename(columns={"sample": "sample_id", "_primary_disease": "cancer_type"})
# .copy() so the column assignment below targets an independent frame
# (avoids SettingWithCopyWarning on the column subset).
cancer_type_df = cancer_type_df[["sample_id", "cancer_type"]].copy()
# Direct dict indexing on purpose: a disease name missing from the mapping
# raises KeyError instead of silently turning into NaN.
cancer_type_df["cancer_type"] = [
    cancer_type_full_name_to_abbreviation_mapping[full_name]
    for full_name in cancer_type_df["cancer_type"]
]

# dtype=int pins the indicator columns to 0/1. Without it, pandas 2.x returns
# boolean columns, which would be written to the TSV as True/False.
cancer_type_one_hot_df = pd.get_dummies(data=cancer_type_df, columns=["cancer_type"], dtype=int)

if development:
    print(cancer_type_df)
    print(cancer_type_one_hot_df)

print("Processed Cancer Type data...")

# # Process Tumor Purity Data

# In[9]:

print("Processing Tumor Purity data...")

tumor_purity_cpe_file_name = "tumor_purity.csv"
tumor_purity_estimate_file_name = "tumor_purity_ESTIMATE.csv"

tumor_purity_cpe_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_cpe_file_name))
tumor_purity_estimate_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, tumor_purity_estimate_file_name))


def _parse_purity(raw_value):
    """Convert a purity cell to float, accepting a decimal comma.

    Going through str() keeps this working when pandas already parsed the
    column as numeric (calling .replace on a float would raise AttributeError).
    """
    return float(str(raw_value).replace(",", "."))


# First source: one purity per sample, taking the first non-missing value in
# the order CPE, ABSOLUTE, ESTIMATE. Samples with none of the three are skipped.
tumor_sample_id_purity_mapping = {}
for sample_id, purity_cpe, purity_absolute, purity_estimate in zip(
    tumor_purity_cpe_df["Sample.ID"],
    tumor_purity_cpe_df["CPE"],
    tumor_purity_cpe_df["ABSOLUTE"],
    tumor_purity_cpe_df["ESTIMATE"],
):
    for purity_candidate in (purity_cpe, purity_absolute, purity_estimate):
        if not pd.isnull(purity_candidate):
            tumor_sample_id_purity_mapping[sample_id] = _parse_purity(purity_candidate)
            break

# Second source: non-missing "TumorPurity" values are stored as-is and
# overwrite any entry that has exactly the same sample ID.
for sample_id, purity_estimate in zip(tumor_purity_estimate_df["NAME"], tumor_purity_estimate_df["TumorPurity"]):
    if not pd.isnull(purity_estimate):
        tumor_sample_id_purity_mapping[sample_id] = purity_estimate

# Materialise the dict views as lists before handing them to the constructor.
tumor_purity_df = pd.DataFrame({
    "sample_id": list(tumor_sample_id_purity_mapping.keys()),
    "purity": list(tumor_sample_id_purity_mapping.values()),
})

# Keep one full sample ID per 15-character prefix: only IDs whose fourth
# "-"-separated field starts with one of tumor_sample_ids are considered, and
# among IDs sharing a prefix the lexicographically smallest one wins.
sample_id_dict = {}
for sample_id in tumor_purity_df["sample_id"].values:
    if sample_id.split("-")[3][:2] not in tumor_sample_ids:
        continue
    sample_id_first_15 = sample_id[:15]
    current_best = sample_id_dict.get(sample_id_first_15)
    if current_best is None or sample_id < current_best:
        sample_id_dict[sample_id_first_15] = sample_id

# Build the set of winners once; the previous per-row
# `x in list(sample_id_dict.values())` rebuilt and scanned a list for every row.
selected_sample_ids = set(sample_id_dict.values())
# .copy() so the assignment below does not write into a slice of the old frame.
tumor_purity_df = tumor_purity_df[tumor_purity_df["sample_id"].isin(selected_sample_ids)].copy()
tumor_purity_df["sample_id"] = tumor_purity_df["sample_id"].str[:15]

if development:
    print(tumor_purity_df)

print("Processed Tumor Purity data.")

# # Process GEX Data

# In[10]:
print("Processing GEX data...")

ensembl_id_to_entrezgene_id_mapping_file_name = "ensembl_id_to_entrezgene_id_mapping.tsv"
gex_file_name = "tcga_gene_expected_count"

tumor_sample_ids = ["0" + str(i) for i in range(1, 10)]
# Ensembl ID (without version suffix) -> Entrez gene ID lookup.
# NOTE(review): dict(<frame>.values) assumes the TSV has exactly two columns — confirm.
ensembl_id_to_entrezgene_id_mapping = dict(
    pd.read_csv(
        os.path.join(data_dir, raw_folder_name, ensembl_id_to_entrezgene_id_mapping_file_name),
        sep="\t",
    ).values
)

# Development mode only reads the first 1000 rows; nrows=None reads everything.
# The path goes through raw_folder_name to stay consistent with the other loaders.
gex_df = pd.read_csv(
    os.path.join(data_dir, raw_folder_name, gex_file_name),
    sep="\t",
    nrows=1000 if development else None,
)
gex_df = gex_df.rename(columns={"sample": "ensembl_id"})

# Sample columns are kept when the text after their last "-" is a tumor code.
tumor_sample_columns = [column for column in gex_df.columns if column.split("-")[-1] in tumor_sample_ids]

# Strip the ".<version>" suffix and keep only rows whose Ensembl ID is a key of
# the mapping (key presence, not value, decides — same rule as before).
unversioned_ensembl_ids = [ensembl_id.split(".")[0] for ensembl_id in gex_df["ensembl_id"]]
has_entrezgene_id = [ensembl_id in ensembl_id_to_entrezgene_id_mapping for ensembl_id in unversioned_ensembl_ids]

# Select rows and columns in one step and attach the Entrez IDs with assign(),
# which returns a new frame instead of writing into a filtered slice
# (avoids SettingWithCopyWarning). Resulting column order: samples, entrezgene_id.
gex_df = gex_df.loc[has_entrezgene_id, tumor_sample_columns].assign(
    entrezgene_id=[
        ensembl_id_to_entrezgene_id_mapping[ensembl_id]
        for ensembl_id, keep in zip(unversioned_ensembl_ids, has_entrezgene_id)
        if keep
    ]
)

def get_indices_with_max_expression_per_gene(x):
    """Return the "index" value of the row with the largest "expression_sum".

    Args:
        x: DataFrame (one group) with at least the columns "index" and
            "expression_sum".

    Returns:
        The "index" entry of the first row attaining the maximum
        "expression_sum" (np.argmax resolves ties to the earliest position).
    """
    best_position = np.argmax(x["expression_sum"].values)
    best_row = x.iloc[best_position, :]
    return best_row["index"]

# Positional row labels 0..n-1, so idxmax() below can identify rows directly.
gex_df = gex_df.reset_index(drop=True)

# Total expression per row over the sample columns ONLY. Previously a helper
# "index" column was added before this sum was taken, so every row's own row
# number was added to its total and skewed which row won for a gene.
expression_sum = gex_df.drop(columns=["entrezgene_id"]).sum(axis=1)

# For each Entrez gene ID keep the row with the highest total expression
# (idxmax returns the first such row on ties). Rows whose Entrez ID is missing
# form no group and are therefore dropped.
# NOTE(review): DataFrame.sum skips NaN cells; assumes the expression values
# contain no NaN — confirm.
selected_indices = expression_sum.groupby(gex_df["entrezgene_id"]).idxmax()
# Vectorised membership test; keeps the surviving rows in their original order.
gex_df = gex_df[gex_df.index.isin(selected_indices)]

# Entrez IDs become the row labels, then the columns after the transpose.
entrezgene_ids = gex_df["entrezgene_id"].astype(int).tolist()
gex_df = gex_df.drop(columns=["entrezgene_id"])
gex_df.index = entrezgene_ids

gex_df = gex_df.T.reset_index(drop=False).rename(columns={"index": "sample_id"})

if development:
    print(gex_df)

print("Processed GEX data.")


# # Find intersecting sample IDs and columns

# In[11]:

# Sample IDs present in each table. Note that the RPPA table does not take part
# in this intersection.
gex_sample_ids = set(gex_df["sample_id"].tolist())
unthresholded_cna_sample_ids = set(unthresholded_cna_df["sample_id"].tolist())
thresholded_cna_sample_ids = set(thresholded_cna_df["sample_id"].tolist())
tumor_purity_sample_ids = set(tumor_purity_df["sample_id"].tolist())
cancer_type_sample_ids = set(cancer_type_df["sample_id"].tolist())
intersecting_sample_ids = (
    gex_sample_ids
    & unthresholded_cna_sample_ids
    & thresholded_cna_sample_ids
    & tumor_purity_sample_ids
    & cancer_type_sample_ids
)

# Gene columns (everything except "sample_id") shared by GEX and both CNA tables.
gex_gene_ids = set(gex_df.columns.drop("sample_id"))
unthresholded_cna_gene_ids = set(unthresholded_cna_df.columns.drop("sample_id"))
thresholded_cna_gene_ids = set(thresholded_cna_df.columns.drop("sample_id"))
shared_gene_ids = gex_gene_ids & unthresholded_cna_gene_ids & thresholded_cna_gene_ids
intersecting_columns = ["sample_id"] + sorted(shared_gene_ids)


# # Save data

# In[12]:
print("Saving data...")


def _filter_sort_save(df, file_name, shape_label, columns=None):
    """Restrict a table to the shared samples, sort it, write it, report its shape.

    Args:
        df: DataFrame with a "sample_id" column.
        file_name: Output file name inside the processed-data folder.
        shape_label: Text printed in front of the resulting shape.
        columns: Optional list of columns to keep (in this order) after the
            row filter; None keeps every column.

    Returns:
        The filtered and sorted DataFrame that was written to disk.
    """
    # Vectorised membership test against the set of shared sample IDs.
    df = df[df["sample_id"].isin(intersecting_sample_ids)]
    if columns is not None:
        df = df[columns]
    df = df.sort_values(by="sample_id")
    df.to_csv(os.path.join(data_dir, processed_folder_name, file_name), sep="\t", index=False)
    print(shape_label, df.shape)
    return df


# RPPA is filtered by the shared sample IDs but was not part of the
# intersection, so it can end up with fewer rows than the other tables.
rppa_df = _filter_sort_save(rppa_df, "rppa.tsv", "rppa_df.shape:")

thresholded_cna_df = _filter_sort_save(
    thresholded_cna_df, "thresholded_cna.tsv", "thresholded_cna_df.shape:", columns=intersecting_columns
)

unthresholded_cna_df = _filter_sort_save(
    unthresholded_cna_df, "unthresholded_cna.tsv", "unthresholded_cna_df.shape:", columns=intersecting_columns
)

tumor_purity_df = _filter_sort_save(tumor_purity_df, "tumor_purity.tsv", "tumor_purity_df.shape:")

cancer_type_df = _filter_sort_save(cancer_type_df, "cancer_type.tsv", "cancer_type_df.shape:")

cancer_type_one_hot_df = _filter_sort_save(
    cancer_type_one_hot_df, "cancer_type_one_hot.tsv", "cancer_type_one_hot_df.shape"
)

gex_df = _filter_sort_save(gex_df, "gex.tsv", "gex_df.shape:", columns=intersecting_columns)

print("Saved data.")



Processing RPPA data...
            sample_id  X1433EPSILON    X4EBP1  X4EBP1_pS65  X4EBP1_pT37T46  \
0     TCGA-FI-A2EY-01     -0.013829 -1.127400    -0.423550       -0.827380   
1     TCGA-DF-A2KS-01     -0.168630  0.165870    -0.505950        0.019504   
2     TCGA-A5-A1OH-01      0.038842 -0.382370     0.042306        0.119400   
3     TCGA-AX-A2H7-01      0.021308 -0.717660    -0.493150       -0.370670   
4     TCGA-AX-A2HA-01      0.108640  0.090459     0.339620       -0.017032   
...               ...           ...       ...          ...             ...   
7749  TCGA-VQ-A8DU-01     -0.048516 -0.073244    -0.034748       -0.649850   
7750  TCGA-VQ-A8DT-01     -0.139280 -0.169090     0.125880       -1.216300   
7751  TCGA-IN-A7NR-01     -0.232530 -0.509850     0.429300        0.444290   
7752  TCGA-RD-A8MV-01     -0.019379  0.250550     0.376320        0.926290   
7753  TCGA-KB-A93G-01     -0.064041 -0.247550    -0.042583       -0.014209   

        X53BP1  ACC_pS79     ACC1      

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/959 [00:00<?, ?it/s]

             sample_id   34  204  205  249  473  533  576  656  712  ...  \
0      TCGA-A5-A0GI-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
1      TCGA-S9-A7J2-01 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0  ...   
2      TCGA-06-0150-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
3      TCGA-AR-A1AH-01  2.0 -1.0  2.0 -1.0 -1.0  2.0 -1.0  0.0 -1.0  ...   
4      TCGA-EK-A2RE-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
...                ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
10840  TCGA-IB-7885-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
10841  TCGA-95-7947-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
10842  TCGA-VQ-AA6F-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
10843  TCGA-BR-8588-01  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...   
10844  TCGA-DD-A115-01  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   

       106481100  106481135  106481157  106481167  106481845  106481849  \
0           

Pandas Apply:   0%|          | 0/12804 [00:00<?, ?it/s]

             sample_id cancer_type
0      TCGA-D3-A1QA-07        skcm
1      TCGA-DE-A4MD-06        thcm
2      TCGA-J8-A3O2-06        thcm
3      TCGA-J8-A3YH-06        thcm
4      TCGA-EM-A2P1-06        thcm
...                ...         ...
12799  TCGA-17-Z059-01        luad
12800  TCGA-17-Z060-01        luad
12801  TCGA-17-Z061-01        luad
12802  TCGA-17-Z062-01        luad
12803  TCGA-02-0002-01         gbm

[12804 rows x 2 columns]
             sample_id  cancer_type_acc  cancer_type_blca  cancer_type_brca  \
0      TCGA-D3-A1QA-07                0                 0                 0   
1      TCGA-DE-A4MD-06                0                 0                 0   
2      TCGA-J8-A3O2-06                0                 0                 0   
3      TCGA-J8-A3YH-06                0                 0                 0   
4      TCGA-EM-A2P1-06                0                 0                 0   
...                ...              ...               ...               ...   
1

Pandas Apply:   0%|          | 0/10805 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/10770 [00:00<?, ?it/s]

             sample_id    purity
0      TCGA-OR-A5J1-01  0.924600
1      TCGA-OR-A5J2-01  0.898500
2      TCGA-OR-A5J3-01  0.946600
3      TCGA-OR-A5J4-01  0.866000
4      TCGA-OR-A5J5-01  0.978000
...                ...       ...
10800  TCGA-XU-A92Z-01  0.804001
10801  TCGA-X7-A8D8-01  0.752956
10802  TCGA-XU-A92O-01  0.808031
10803  TCGA-X7-A8M8-01  0.731812
10804  TCGA-3G-AB14-01  0.752233

[10770 rows x 2 columns]
Processed Tumor Purity data.
Processing GEX data...


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/449 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/449 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/449 [00:00<?, ?it/s]

            sample_id  100507661  105372097    53916    57103  104472717  \
0     TCGA-19-1787-01     0.0000     2.0000  10.3835   9.7764     7.1898   
1     TCGA-S9-A7J2-01     4.6439     2.8074   9.9144   8.7649     5.0875   
2     TCGA-EK-A2RE-01     0.0000     0.0000  10.0543  10.6375     0.0000   
3     TCGA-44-6778-01     0.0000     1.5850   9.8319   9.0954     6.8948   
4     TCGA-F4-6854-01     0.0000     0.0000   9.9701   9.4858     7.5774   
...               ...        ...        ...      ...      ...        ...   
9798  TCGA-IB-7885-01     0.0000     0.0000  10.4929   9.5755     5.0875   
9799  TCGA-95-7947-01     0.0000     1.5850  11.4536   9.3287     2.3219   
9800  TCGA-VQ-AA6F-01     0.0000     0.0000  11.2277  10.5127     6.8455   
9801  TCGA-BR-8588-01     0.0000     0.0000   9.4450   9.8688     7.5314   
9802  TCGA-DD-A115-01     0.0000     0.0000   9.0019   7.9484     4.0000   

      100616274    22838   55567     6147  ...  388849    80135     8438  \
0          

Pandas Apply:   0%|          | 0/7754 [00:00<?, ?it/s]

rppa_df.shape: (6450, 211)


Pandas Apply:   0%|          | 0/10845 [00:00<?, ?it/s]

thresholded_cna_df.shape: (9168, 24)


Pandas Apply:   0%|          | 0/10845 [00:00<?, ?it/s]

unthresholded_cna_df.shape: (9168, 24)


Pandas Apply:   0%|          | 0/10770 [00:00<?, ?it/s]

tumor_purity_df.shape: (9168, 2)


Pandas Apply:   0%|          | 0/12804 [00:00<?, ?it/s]

cancer_type_df.shape: (9168, 2)


Pandas Apply:   0%|          | 0/12804 [00:00<?, ?it/s]

cancer_type_one_hot_df.shape (9168, 34)


Pandas Apply:   0%|          | 0/9803 [00:00<?, ?it/s]

gex_df.shape: (9168, 24)
Saved data.
