# Load data

In [1]:
import cptac.pancan as pc
import cptac.utils as cput
import numpy as np
import os
import pandas as pd
import pcprutils as ut

### Get pancancerProteinMRNA repository path

This analysis will build off of data stored in the [pancancerProteinMRNA repo](https://github.com/PayneLab/pancancerProteinMRNA), which is publicly accessible. To access that data, clone the repository, then store the path to it in a text file with no quotes named `pancancerProteinMRNA_repo_path.txt` in the same directory as this notebook. The notebook will then read the path from that file and be able to access the data.

In [2]:
with open("pancancerProteinMRNA_repo_path.txt", "r") as pcp_path_file:
    pcp_path = pcp_path_file.read()

print(pcp_path)

/home/caleb/GitHub/PayneLab/pancancerProteinMRNA


In [3]:
cancer_types = [
    "ccrcc",
    "endometrial",
    "hnscc",
    "lscc",
    "luad",
]

### Load delta correlations

In [4]:
delta_corr_path = os.path.join(pcp_path, "notebook_steps_Spearman", "data", "delta_correlation_df.csv")
delta_corr = pd.read_csv(delta_corr_path)
delta_corr = delta_corr.assign(Cancer=delta_corr["Cancer"].str.lower())

delta_corr

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,ccrcc
1,A1CF,0.192038,1.063340e-04,6.401858e-04,ccrcc
2,A2M,-0.191619,1.277644e-01,2.439276e-01,ccrcc
3,AAAS,0.019654,8.963138e-01,9.409267e-01,ccrcc
4,AACS,-0.169937,6.007042e-02,1.375402e-01,ccrcc
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,ccrcc
6,AAED1,0.190506,2.882440e-01,4.385777e-01,ccrcc
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,ccrcc
8,AAK1,0.356932,6.347464e-14,1.385608e-12,ccrcc
9,AAMP,0.281272,2.536223e-02,6.920503e-02,ccrcc


### Load gene-based residuals data

In [5]:
residuals = {}
residuals_dir_path = os.path.join(pcp_path, "notebook_steps_Spearman", "clinical_associations")

for cancer_type in cancer_types:
    file_name = f"{cancer_type}_residuals.tsv.gz"
    res = pd.read_csv(os.path.join(residuals_dir_path, file_name), sep="\t")
    res = res.assign(Patient_ID=res["Patient_ID"].str.split("\.N", expand=True)[0]) # Make paired Patient_IDs same
    residuals[cancer_type] = res

residuals["ccrcc"]

Unnamed: 0,Patient_ID,Gene,Proteomics,Tissue,Transcriptomics,m,b,orth_resid,intersect_x,intersect_y,above_reg_line
0,C3L-00004,A1CF,0.641447,Tumor,16.677828,0.082623,-0.847022,0.110114,16.686895,0.531707,True
1,C3L-00010,A1CF,0.194620,Tumor,16.682712,0.082623,-0.847022,0.335598,16.655078,0.529078,False
2,C3L-00011,A1CF,-0.780455,Tumor,0.245606,0.082623,-0.847022,0.046116,0.249403,-0.826415,True
3,C3L-00026,A1CF,0.404286,Tumor,16.347532,0.082623,-0.847022,0.099045,16.339377,0.502994,False
4,C3L-00079,A1CF,-0.677773,Tumor,4.858958,0.082623,-0.847022,0.231427,4.839902,-0.447132,False
5,C3L-00088,A1CF,0.310249,Tumor,13.654469,0.082623,-0.847022,0.028993,13.656856,0.281355,True
6,C3L-00096,A1CF,-0.128732,Tumor,8.107277,0.082623,-0.847022,0.048274,8.111252,-0.176842,True
7,C3L-00097,A1CF,-0.513243,Tumor,4.541293,0.082623,-0.847022,0.041298,4.537892,-0.472085,False
8,C3L-00103,A1CF,-1.135859,Tumor,1.853419,0.082623,-0.847022,0.440472,1.817149,-0.696883,False
9,C3L-00183,A1CF,-0.128068,Tumor,6.293220,0.082623,-0.847022,0.198311,6.309550,-0.325705,True


#### For information's sake, use the residuals tables ot figure out how many paired tumor-normal samples there are in each cancer type

In [6]:
def get_paired_sample_count(cancer_type, residuals_map):
    res = residuals_map[cancer_type][["Patient_ID", "Tissue"]].drop_duplicates(keep="first")
    return res.\
    assign(Patient_ID=res["Patient_ID"].str.split("\.N", expand=True)[0]).\
    pivot(
        index="Patient_ID",
        columns="Tissue",
        values="Tissue"
    ).\
    dropna(axis=0, how="any").\
    shape[0]

print("Number of paired tumor-normal samples in each cancer type:")
for cancer_type in cancer_types:
    print(f"{cancer_type: >11} - {get_paired_sample_count(cancer_type, residuals): >3}")

Number of paired tumor-normal samples in each cancer type:
      ccrcc -  75
endometrial -  14
      hnscc -  42
       lscc -  94
       luad - 101


#### Calculate patient-wise tumor-normal residuals differences for genes with greatest change in correlation between tumor and normal (highest absolute value of delta correlation)

In [7]:
n = 25 # How many highest genes to select

def get_highest_delta_corr(df):
    df = df.assign(abs_delta_corr=df["Delta_Correlation"].abs())
    df = df.sort_values(by="abs_delta_corr")
    return df["Gene"].iloc[-n:].tolist()

highest_delta_genes = get_highest_delta_corr(delta_corr)

highest_delta_genes

['SLC1A5',
 'ECH1',
 'IMPDH1',
 'PPM1L',
 'ACSS1',
 'THADA',
 'MFSD4A',
 'CARD9',
 'IMPDH2',
 'MBNL2',
 'CDKN2A',
 'TALDO1',
 'DYNC1LI1',
 'NELL1',
 'ECT2',
 'GPCPD1',
 'FAM57A',
 'RIPK3',
 'HPS3',
 'IFT43',
 'TXLNG',
 'CFI',
 'RNASET2',
 'C1orf116',
 'USP7']

In [8]:
top_genes_residuals_diff = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]
    
    top_res = res[res["Gene"].isin(highest_delta_genes)].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="orth_resid",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    dropna(axis=0, how="any").\
    reset_index(drop=False)
    
    top_res = top_res.\
    assign(tumor_normal_residual_diff=top_res["Tumor"] - top_res["Normal"]).\
    pivot(
        index="Patient_ID",
        columns="Gene",
        values="tumor_normal_residual_diff",
    ).\
    add_prefix("tumor_normal_residual_diff_")
    
    top_res.columns.name = None
    
    top_genes_residuals_diff[cancer_type] = top_res

top_genes_residuals_diff["ccrcc"]

Unnamed: 0_level_0,tumor_normal_residual_diff_ACSS1,tumor_normal_residual_diff_C1orf116,tumor_normal_residual_diff_CARD9,tumor_normal_residual_diff_ECH1,tumor_normal_residual_diff_ECT2,tumor_normal_residual_diff_FAM57A,tumor_normal_residual_diff_IMPDH1,tumor_normal_residual_diff_MFSD4A,tumor_normal_residual_diff_RNASET2,tumor_normal_residual_diff_TALDO1,tumor_normal_residual_diff_USP7
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C3L-00004,-0.097351,0.236227,,-0.335819,,,0.057013,0.510559,-0.002235,-0.014105,-0.044224
C3L-00010,0.133579,-0.135144,0.071047,0.360411,,,-0.061065,0.120648,-0.321574,-0.013660,-0.092398
C3L-00011,-0.236049,-0.004635,-0.135547,-0.113364,-0.029735,,0.037074,-0.161446,0.344812,0.073765,-0.031312
C3L-00026,0.641068,-0.033196,0.063180,0.196011,0.047859,,-0.204715,0.481989,0.201297,0.194513,-0.119729
C3L-00079,-0.603942,0.166894,,-0.361304,-0.201955,,-0.048765,-0.984103,-0.781335,0.016353,-0.056516
C3L-00088,-0.084110,0.318678,,-0.160148,,,0.190302,-0.275206,0.052973,-0.057773,-0.111372
C3L-00096,-0.121850,-0.148592,,0.280326,,,0.438087,-0.017267,-0.214168,0.072704,-0.098803
C3L-00097,0.243694,0.318929,,0.587981,,,0.033112,0.121004,0.051454,0.025781,0.114518
C3L-00103,0.185379,0.322994,-0.014286,-0.001786,-0.172011,0.162971,0.096544,,-0.019132,-0.160472,0.091324
C3L-00360,-0.140671,-0.586553,,0.085915,0.239064,1.129445,0.205068,-0.234431,-0.372114,0.054108,-0.065667


#### Select tumor and normal above_reg_line values for top genes

In [9]:
above_reg_line_top_genes = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]
    
    top_res = res[res["Gene"].isin(highest_delta_genes)].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="above_reg_line",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    dropna(axis=0, how="any").\
    reset_index(drop=False)
    
    top_res = top_res.\
    pivot(
        index="Patient_ID",
        columns="Gene",
        values=["Normal", "Tumor"],
    ).\
    swaplevel(0, 1, axis=1).\
    sort_index(axis=1)
    
    top_res = cput.reduce_multiindex(top_res, flatten=True).\
    add_prefix("above_reg_line_")
    
    top_res.columns.name = None
    
    above_reg_line_top_genes[cancer_type] = top_res.astype(bool)

above_reg_line_top_genes["ccrcc"]

Unnamed: 0_level_0,above_reg_line_ACSS1_Normal,above_reg_line_ACSS1_Tumor,above_reg_line_C1orf116_Normal,above_reg_line_C1orf116_Tumor,above_reg_line_CARD9_Normal,above_reg_line_CARD9_Tumor,above_reg_line_ECH1_Normal,above_reg_line_ECH1_Tumor,above_reg_line_ECT2_Normal,above_reg_line_ECT2_Tumor,...,above_reg_line_IMPDH1_Normal,above_reg_line_IMPDH1_Tumor,above_reg_line_MFSD4A_Normal,above_reg_line_MFSD4A_Tumor,above_reg_line_RNASET2_Normal,above_reg_line_RNASET2_Tumor,above_reg_line_TALDO1_Normal,above_reg_line_TALDO1_Tumor,above_reg_line_USP7_Normal,above_reg_line_USP7_Tumor
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,True,False,True,False,True,True,False,True,True,True,...,False,True,True,True,True,False,True,False,True,False
C3L-00010,False,False,True,False,False,True,False,False,True,True,...,False,False,False,False,False,True,True,False,True,False
C3L-00011,False,True,False,True,False,False,True,True,True,False,...,False,False,True,True,True,True,False,False,False,False
C3L-00026,False,True,True,True,False,True,False,True,True,False,...,True,True,True,True,False,False,True,False,True,False
C3L-00079,False,True,False,True,True,True,False,False,True,False,...,True,True,False,False,False,False,False,False,True,True
C3L-00088,False,False,True,False,True,True,True,True,True,True,...,True,True,False,True,True,False,False,False,False,True
C3L-00096,True,False,False,True,True,True,False,False,True,True,...,False,True,False,True,True,True,False,True,False,False
C3L-00097,False,True,False,False,True,True,True,True,True,True,...,False,False,False,False,False,False,False,True,False,False
C3L-00103,False,False,True,False,False,False,True,False,False,True,...,False,True,True,True,True,False,True,True,True,False
C3L-00360,True,False,True,False,True,True,False,False,False,False,...,False,True,True,False,False,False,True,True,True,False


### Generate, for each patient, correlation between tumor and normal residuals

This is the correlation value from a graph for each patient where the x axis is normal residuals for a gene in that patient and the y axis is tumor residuals for a gene in that patient.

In [10]:
def get_patient_residuals_corr(res):
    """For a particular cancer type, find the correlation coefficient of tumor
    residual to normal residual for each patient in the cancer type.
    
    Parameters:
    res (pandas.DataFrame): The residuals table for the cancer type
    
    Returns:
    pandas.DataFrame: A table containing the correlation coefficient of tumor 
        residual to normal residual for each patient in the cancer type
    """
    
    # Just get the columns we need
    res = res[["Patient_ID", "Gene", "Tissue", "above_reg_line"]]
    
    # Make tumor and normal residuals separate columns
    res = res.pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="above_reg_line",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).reset_index(drop=False)
    
    res.columns.name = None
    
    # Define function to get correlation for a particular patient
    def get_corr(df):
        return df.corr(
            method="spearman",
            min_periods=10 if cancer_type == "endometrial" else 15,
        ).iloc[0][1]
    
    # Get correlation for each patient
    corr = res.\
    groupby("Patient_ID").\
    apply(get_corr).\
    rename("tumor_normal_residuals_corr").\
    to_frame().\
    dropna(axis=0, how="any")
    
    return corr

patient_residuals_corr = {}
for cancer_type in cancer_types:
    patient_residuals_corr[cancer_type] = get_patient_residuals_corr(residuals[cancer_type])

patient_residuals_corr["ccrcc"]

Unnamed: 0_level_0,tumor_normal_residuals_corr
Patient_ID,Unnamed: 1_level_1
C3L-00004,-0.072648
C3L-00010,-0.024319
C3L-00011,-0.065059
C3L-00026,-0.221979
C3L-00079,0.184029
C3L-00088,-0.195487
C3L-00096,0.081507
C3L-00097,-0.171591
C3L-00103,-0.072623
C3L-00360,-0.139999


Now remember--we can only generate this data for paired tumor-normal samples. Earlier we calculated how many paired samples we have for each data type; below we print the numbers of patients we have tumor-normal residuals correlation values for, which should be the same as the number of paired samples for each cancer type.

In [11]:
print("Number of patients per cancer type for which we were able to calculate tumor-normal residuals correlation:")
for cancer_type in cancer_types:
    print(f"{cancer_type: >11} - {patient_residuals_corr[cancer_type].shape[0]: >3}")

Number of patients per cancer type for which we were able to calculate tumor-normal residuals correlation:
      ccrcc -  75
endometrial -  14
      hnscc -  42
       lscc -  94
       luad - 101


### Generate, for each patient, correlations of RNA tumor-normal ratio to protein tumor-normal ratio

This is the correlation value from a graph for each patient where the x axis is the ratio of tumor over normal transcriptomics value for a gene in that patient, and the y axis is the ratio of tumor over normal proteomics value for a gene in that patient.

In [12]:
prot_RNA_tumor_normal_ratios_corr = {}
for cancer_type in cancer_types:
    ratios = residuals[cancer_type][["Patient_ID", "Tissue", "Gene", "Proteomics", "Transcriptomics"]].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values=["Proteomics", "Transcriptomics"],
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    reset_index(drop=False)
    ratios = cput.reduce_multiindex(ratios, flatten=True)
    
    def make_cols_ratio(df, col1, col2, ratio_col_name):
        return df.\
        assign(**{ratio_col_name: df[col1] / df[col2]}).\
        drop(columns=[col1, col2])
    
    ratios = make_cols_ratio(
        ratios, 
        "Proteomics_Tumor", 
        "Proteomics_Normal", 
        "Prot_Tumor_Normal_Ratio"
    )
    ratios = make_cols_ratio(
        ratios, 
        "Transcriptomics_Tumor", 
        "Transcriptomics_Normal", 
        "RNA_Tumor_Normal_Ratio"
    )
    
    # Define function to get correlation for a particular patient
    def get_corr(df):
        return df.corr(
            method="spearman",
            min_periods=10 if cancer_type == "endometrial" else 15,
        ).iloc[0][1]
    
    corr = ratios.\
    groupby("Patient_ID").\
    apply(get_corr).\
    rename("prot_RNA_tumor_normal_ratios_corr").\
    to_frame().\
    dropna(axis=0, how="any")
    
    prot_RNA_tumor_normal_ratios_corr[cancer_type] = corr
    
prot_RNA_tumor_normal_ratios_corr["ccrcc"]

Unnamed: 0_level_0,prot_RNA_tumor_normal_ratios_corr
Patient_ID,Unnamed: 1_level_1
C3L-00004,0.005287
C3L-00010,0.025093
C3L-00011,0.059309
C3L-00026,0.048963
C3L-00079,-0.141564
C3L-00088,-0.002509
C3L-00096,0.009792
C3L-00097,0.075757
C3L-00103,0.085842
C3L-00360,0.028530


### Load pancan clinical data

In [13]:
dss = {
    "ccrcc": pc.PancanCcrcc,
    "endometrial": pc.PancanUcec,
    "hnscc": pc.PancanHnscc,
    "lscc": pc.PancanLscc,
    "luad": pc.PancanLuad,
}

In [14]:
def get_clinical(dss, cancer_type):
    
    # Instantiate the dataset
    ds = dss[cancer_type]()
    
    # We use get_clinical instead of get_followup because get_followup just returns
    # a subset of the clinical table, and we need other columns too
    clin_vars = ds.get_clinical()[[
        "Recurrence status (1, yes; 0, no)",
        "Survial status (1, dead; 0, alive)",
        "baseline/histologic_type",
        "baseline/pathologic_staging_primary_tumor",
        "cptac_path/histologic_grade",
        "follow-up/measure_of_success_of_outcome_at_last_available_follow-up",
    ]]

    # Get rid of categorizing prefixes in column names, where they exist
    cols = clin_vars.columns.str.split("/", expand=True).to_frame()
    cols.columns = ["a", "b"]
    cols = cols.assign(b=cols["b"].fillna(cols["a"])) # If there was no prefix, no change
    clin_vars.columns = cols["b"].rename(None)
    
    # Fix values in measure of success column--sometimes deceased patients are listed
    # as "Persistent Disease" in this column, instead of "Patient Deceased"
    clin_vars = clin_vars.assign(
        **{"measure_of_success_of_outcome_at_last_available_follow-up": np.where(
            clin_vars["Survial status (1, dead; 0, alive)"] == 1,
            "Patient Deceased",
            clin_vars["measure_of_success_of_outcome_at_last_available_follow-up"]
        )}
    )
    
    # Make paired Patient_IDs same
    clin_vars.index = clin_vars.index.str.split("\.N", expand=True).to_frame()[0]
    
    # Drop any completely NaN rows
    clin_vars = clin_vars.dropna(axis=0, how="all")

    return clin_vars

clin = {}
for cancer_type in cancer_types:
    clin[cancer_type] = get_clinical(dss, cancer_type)

Loading broadccrcc v1.0...                     



  features=features)


  features=features)


                                                 

### Combine data into one table

In [32]:
all_data = pd.DataFrame()
for cancer_type in cancer_types:
    dfs = [
        top_genes_residuals_diff[cancer_type],
        above_reg_line_top_genes[cancer_type],
        patient_residuals_corr[cancer_type],
        prot_RNA_tumor_normal_ratios_corr[cancer_type],
        clin[cancer_type],
    ]
    
    cancer_type_all = dfs[0]
    for df in dfs[1:]:
        cancer_type_all = cancer_type_all.join(df, how="outer")
    cancer_type_all = cancer_type_all.assign(cancer_type=cancer_type)
    all_data = all_data.append(cancer_type_all)

In [33]:
# Define input and target columns
input_cols = [  

    # Cancer type
    "cancer_type",
    
    # Our calculated correlations
    "tumor_normal_residuals_corr",
    "prot_RNA_tumor_normal_ratios_corr",
    
    # Gene-based regression data
    "above_reg_line_ACSS1_Normal",
    "above_reg_line_ACSS1_Tumor",
    "above_reg_line_C1orf116_Normal",
    "above_reg_line_C1orf116_Tumor",
    "above_reg_line_CARD9_Normal",
    "above_reg_line_CARD9_Tumor",
    "above_reg_line_CDKN2A_Normal",
    "above_reg_line_CDKN2A_Tumor",
    "above_reg_line_CFI_Normal",
    "above_reg_line_CFI_Tumor",
    "above_reg_line_DYNC1LI1_Normal",
    "above_reg_line_DYNC1LI1_Tumor",
    "above_reg_line_ECH1_Normal",
    "above_reg_line_ECH1_Tumor",
    "above_reg_line_ECT2_Normal",
    "above_reg_line_ECT2_Tumor",
    "above_reg_line_FAM57A_Normal",
    "above_reg_line_FAM57A_Tumor",
    "above_reg_line_GPCPD1_Normal",
    "above_reg_line_GPCPD1_Tumor",
    "above_reg_line_HPS3_Normal",
    "above_reg_line_HPS3_Tumor",
    "above_reg_line_IFT43_Normal",
    "above_reg_line_IFT43_Tumor",
    "above_reg_line_IMPDH1_Normal",
    "above_reg_line_IMPDH1_Tumor",
    "above_reg_line_IMPDH2_Normal",
    "above_reg_line_IMPDH2_Tumor",
    "above_reg_line_MBNL2_Normal",
    "above_reg_line_MBNL2_Tumor",
    "above_reg_line_MFSD4A_Normal",
    "above_reg_line_MFSD4A_Tumor",
    "above_reg_line_NELL1_Normal",
    "above_reg_line_NELL1_Tumor",
    "above_reg_line_PPM1L_Normal",
    "above_reg_line_PPM1L_Tumor",
    "above_reg_line_RIPK3_Normal",
    "above_reg_line_RIPK3_Tumor",
    "above_reg_line_RNASET2_Normal",
    "above_reg_line_RNASET2_Tumor",
    "above_reg_line_SLC1A5_Normal",
    "above_reg_line_SLC1A5_Tumor",
    "above_reg_line_TALDO1_Normal",
    "above_reg_line_TALDO1_Tumor",
    "above_reg_line_THADA_Normal",
    "above_reg_line_THADA_Tumor",
    "above_reg_line_TXLNG_Normal",
    "above_reg_line_TXLNG_Tumor",
    "above_reg_line_USP7_Normal",
    "above_reg_line_USP7_Tumor",
    "tumor_normal_residual_diff_ACSS1",
    "tumor_normal_residual_diff_C1orf116",
    "tumor_normal_residual_diff_CARD9",
    "tumor_normal_residual_diff_CDKN2A",
    "tumor_normal_residual_diff_CFI",
    "tumor_normal_residual_diff_DYNC1LI1",
    "tumor_normal_residual_diff_ECH1",
    "tumor_normal_residual_diff_ECT2",
    "tumor_normal_residual_diff_FAM57A",
    "tumor_normal_residual_diff_GPCPD1",
    "tumor_normal_residual_diff_HPS3",
    "tumor_normal_residual_diff_IFT43",
    "tumor_normal_residual_diff_IMPDH1",
    "tumor_normal_residual_diff_IMPDH2",
    "tumor_normal_residual_diff_MBNL2",
    "tumor_normal_residual_diff_MFSD4A",
    "tumor_normal_residual_diff_NELL1",
    "tumor_normal_residual_diff_PPM1L",
    "tumor_normal_residual_diff_RIPK3",
    "tumor_normal_residual_diff_RNASET2",
    "tumor_normal_residual_diff_SLC1A5",
    "tumor_normal_residual_diff_TALDO1",
    "tumor_normal_residual_diff_THADA",
    "tumor_normal_residual_diff_TXLNG",
    "tumor_normal_residual_diff_USP7",
]

target_cols = [
    "Recurrence status (1, yes; 0, no)",
    "Survial status (1, dead; 0, alive)",
    "histologic_grade",
    "histologic_type",
    "measure_of_success_of_outcome_at_last_available_follow-up",
    "pathologic_staging_primary_tumor",
]

In [34]:
# Drop rows where input cols are all NaN
nan_subset = input_cols.copy()
nan_subset.remove("cancer_type")
all_data = all_data.dropna(axis=0, how="all", subset=nan_subset)

In [35]:
# Clean up and simplify the tumor stage column
all_data = all_data.assign(pathologic_staging_primary_tumor=all_data["pathologic_staging_primary_tumor"].\
    str.split(" ", expand=True)[0].\
    str.lower().\
    str.strip().\
    str.replace(":", "", regex=False).\
    str.replace("^p", "", regex=True).\
    str.replace("^(\d)", "t\1", regex=True).\
    str.replace("\x01", "", regex=False).\
    str.replace("[abc]", "", regex=True).\
    replace(to_replace="t", value=np.nan)
)

In [36]:
# Reorder the columns nicely, and split into inputs and targets
inputs = all_data[input_cols]
targets = all_data[target_cols]

In [37]:
# Save the tables
inputs.to_csv("inputs.tsv", sep="\t")
targets.to_csv("targets.tsv", sep="\t")