# Load data

In [1]:
import cptac.pancan as pc
import cptac.utils as cput
import numpy as np
import os
import pandas as pd
import pcprutils as ut
import altair as alt

### Get pancancerProteinMRNA repository path

This analysis will build off of data stored in the [pancancerProteinMRNA repo](https://github.com/PayneLab/pancancerProteinMRNA), which is publicly accessible. To access that data, clone the repository, then store the path to it in a text file with no quotes named `pancancerProteinMRNA_repo_path.txt` in the same directory as this notebook. The notebook will then read the path from that file and be able to access the data.

In [2]:
with open("pancancerProteinMRNA_repo_path.txt", "r") as pcp_path_file:
    pcp_path = pcp_path_file.read()

print(pcp_path)

/Users/Bryn/PayneLab/pancancer/pancancerProteinMRNA


In [3]:
cancer_types = [
    "ccrcc",
    "endometrial",
    "hnscc",
    "lscc",
    "luad",
]

### Load delta correlations

In [4]:
delta_corr_path = os.path.join(pcp_path, "notebook_steps_Spearman", "data", "delta_correlation_df.csv")
delta_corr = pd.read_csv(delta_corr_path)
delta_corr = delta_corr.assign(Cancer=delta_corr["Cancer"].str.lower())

delta_corr

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.198013,2.451044e-01,4.045115e-01,hnscc
1,A2M,-0.118384,4.480278e-01,6.091130e-01,hnscc
2,A2ML1,-0.023469,2.918125e-01,4.561968e-01,hnscc
3,AAAS,0.275905,1.051756e-01,2.209072e-01,hnscc
4,AACS,-0.136836,1.800586e-01,3.266475e-01,hnscc
...,...,...,...,...,...
50684,ZWINT,1.219024,2.267627e-09,1.049863e-07,endometrial
50685,ZXDC,-0.346532,2.983295e-01,5.386144e-01,endometrial
50686,ZYG11B,0.768196,5.463938e-04,5.319699e-03,endometrial
50687,ZYX,0.253630,2.456049e-01,4.795301e-01,endometrial


### Load gene-based residuals data

In [5]:
residuals = {}
residuals_dir_path = os.path.join(pcp_path, "notebook_steps_Spearman", "clinical_associations")

for cancer_type in cancer_types:
    file_name = f"{cancer_type}_residuals.tsv.gz"
    res = pd.read_csv(os.path.join(residuals_dir_path, file_name), sep="\t")
    res = res.assign(Patient_ID=res["Patient_ID"].str.split("\.N", expand=True)[0]) # Make paired Patient_IDs same
    residuals[cancer_type] = res

residuals["ccrcc"]

Unnamed: 0,Patient_ID,Gene,Proteomics,Tissue,Transcriptomics,m,b,orth_resid,intersect_x,intersect_y,above_reg_line
0,C3L-00004,A1CF,0.641447,Tumor,16.677828,0.082623,-0.847022,0.110114,16.686895,0.531707,True
1,C3L-00010,A1CF,0.194620,Tumor,16.682712,0.082623,-0.847022,0.335598,16.655078,0.529078,False
2,C3L-00011,A1CF,-0.780455,Tumor,0.245606,0.082623,-0.847022,0.046116,0.249403,-0.826415,True
3,C3L-00026,A1CF,0.404286,Tumor,16.347532,0.082623,-0.847022,0.099045,16.339377,0.502994,False
4,C3L-00079,A1CF,-0.677773,Tumor,4.858958,0.082623,-0.847022,0.231427,4.839902,-0.447132,False
...,...,...,...,...,...,...,...,...,...,...,...
527860,C3N-01646,ZSCAN18,-0.018171,Normal,13.238732,0.045205,-0.511652,0.104866,13.233996,0.086587,False
527861,C3N-01648,ZSCAN18,0.155586,Normal,10.655611,0.045205,-0.511652,0.185364,10.663982,-0.029590,True
527862,C3N-01649,ZSCAN18,0.076949,Normal,15.453511,0.045205,-0.511652,0.109859,15.448550,0.186696,False
527863,C3N-01651,ZSCAN18,0.091213,Normal,10.846205,0.045205,-0.511652,0.112450,10.851283,-0.021123,True


## For information's sake, use the residuals tables to figure out how many paired tumor-normal samples there are in each cancer type

In [6]:
def get_paired_sample_count(cancer_type, residuals_map):
    res = residuals_map[cancer_type][["Patient_ID", "Tissue"]].drop_duplicates(keep="first")
    return res.\
    assign(Patient_ID=res["Patient_ID"].str.split("\.N", expand=True)[0]).\
    pivot(
        index="Patient_ID",
        columns="Tissue",
        values="Tissue"
    ).\
    dropna(axis=0, how="any").\
    shape[0]

print("Number of paired tumor-normal samples in each cancer type:")
for cancer_type in cancer_types:
    print(f"{cancer_type: >11} - {get_paired_sample_count(cancer_type, residuals): >3}")

Number of paired tumor-normal samples in each cancer type:
      ccrcc -  75
endometrial -  14
      hnscc -  42
       lscc -  94
       luad - 101


#### Calculate patient-wise tumor-normal residuals differences for genes with greatest change in correlation between tumor and normal (highest absolute value of delta correlation)

In [7]:
def get_highest_delta_corr(df):
    df = df.assign(abs_delta_corr=df["Delta_Correlation"].abs())
    df = df.sort_values(by="abs_delta_corr")
    df = df.loc[(df['FDR'] < 0.05) & ((df['Delta_Correlation'] > 0.7) | (df['Delta_Correlation'] < -0.7))]
    return df["Gene"].tolist()

highest_delta_genes = {}
for cancer in cancer_types:
    cancer_df = delta_corr.loc[delta_corr['Cancer'] == cancer]
    highest_delta_genes[cancer] = get_highest_delta_corr(cancer_df)

highest_delta_genes

{'ccrcc': ['AP4S1',
  'GDAP1',
  'UNC5B',
  'ZDHHC2',
  'USP47',
  'NEK3',
  'PTTG1IP',
  'STRA6',
  'AZI2',
  'WNK1',
  'RALGAPA1',
  'FAF2',
  'TSPYL4',
  'SCGN',
  'LSR',
  'MUC15',
  'SLC35A5',
  'CLINT1',
  'GMFB',
  'RHAG',
  'MARK3',
  'EFHD1',
  'CR2',
  'CNTLN',
  'PEG10',
  'PTPRG',
  'NACC1',
  'C9orf78',
  'IGF2BP3',
  'LY6E',
  'MUC13',
  'YARS2',
  'CERCAM',
  'PGK1',
  'LPCAT1',
  'AP3M1',
  'NCOA7',
  'DNAAF2',
  'NPEPPS',
  'METAP1D',
  'PGBD5',
  'SCAP',
  'CCP110',
  'GOLPH3L',
  'ALDH18A1',
  'SLC36A2',
  'PDCD7',
  'PPM1M',
  'GCDH',
  'RBP2',
  'POMT2',
  'NOVA2',
  'DLG5',
  'APLP2',
  'PGD',
  'COLEC12',
  'MCC',
  'PIK3R4',
  'PDZRN3',
  'AHSA1',
  'KIAA1522',
  'CYP4F2',
  'NOL3',
  'SLC44A1',
  'DCTD',
  'ZNF358',
  'ENPP2',
  'PPID',
  'PLS1',
  'TTC36',
  'NGLY1',
  'EHD4',
  'MAPK10',
  'SMAP1',
  'CCNY',
  'PNCK',
  'TREM2',
  'PDP1',
  'SMPD2',
  'CALCRL',
  'CLCC1',
  'PPP1R3G',
  'SLC22A8',
  'NCAPD2',
  'MIOS',
  'PISD',
  'BTN3A1',
  'COL5A1',
  'DUS

In [8]:
top_genes_residuals_diff = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]
    
    top_res = res[res["Gene"].isin(highest_delta_genes[cancer_type])].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="orth_resid",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    dropna(axis=0, how="any").\
    reset_index(drop=False)
    
    top_res = top_res.\
    assign(tumor_normal_residual_diff=top_res["Tumor"] - top_res["Normal"]).\
    pivot(
        index="Patient_ID",
        columns="Gene",
        values="tumor_normal_residual_diff",
    ).\
    add_prefix("tumor_normal_residual_diff_")
    
    top_res.columns.name = None
    
    top_genes_residuals_diff[cancer_type] = top_res

top_genes_residuals_diff["ccrcc"]

Unnamed: 0_level_0,tumor_normal_residual_diff_ADCY3,tumor_normal_residual_diff_AGK,tumor_normal_residual_diff_AGXT,tumor_normal_residual_diff_AHSA1,tumor_normal_residual_diff_ALDH18A1,tumor_normal_residual_diff_ANKZF1,tumor_normal_residual_diff_AP3M1,tumor_normal_residual_diff_AP4S1,tumor_normal_residual_diff_APLP2,tumor_normal_residual_diff_APPL1,...,tumor_normal_residual_diff_USP47,tumor_normal_residual_diff_USP6NL,tumor_normal_residual_diff_VPS25,tumor_normal_residual_diff_WNK1,tumor_normal_residual_diff_XPNPEP1,tumor_normal_residual_diff_YARS2,tumor_normal_residual_diff_ZDHHC2,tumor_normal_residual_diff_ZEB1,tumor_normal_residual_diff_ZNF358,tumor_normal_residual_diff_ZNF397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,,-0.017927,-0.427347,-0.027146,-0.274487,0.121017,-0.024181,-0.184498,0.221461,0.019471,...,-0.060553,-0.166229,0.102887,0.027144,-0.022660,0.032056,,,0.221142,
C3L-00010,,0.030770,0.389726,0.291561,0.200216,0.018235,-0.092127,-0.047637,0.213010,-0.005618,...,-0.063582,-0.105923,-0.065229,0.017284,-0.183118,0.516323,,,,
C3L-00011,,-0.158858,-0.048885,-0.005581,0.078417,0.085755,-0.021808,-0.061461,0.162054,0.024129,...,0.202844,0.070314,0.106287,0.057198,0.117379,0.289518,-0.678727,,-0.050268,
C3L-00026,,0.293615,0.243402,-0.068528,0.503235,0.066225,-0.173053,-0.002121,-0.154429,0.161409,...,-0.094514,0.042727,-0.031146,-0.169713,-0.176085,0.657336,,0.011026,,
C3L-00079,,-0.159576,0.520756,-0.042411,-0.057107,0.175662,-0.077721,-0.171132,0.052900,0.321521,...,0.104721,0.246074,-0.113529,-0.180434,-0.219944,-0.405774,,-0.173976,0.033889,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,,-0.150593,-0.677954,0.037328,-0.238651,-0.029590,0.054482,-0.041253,0.014083,0.057598,...,0.128537,0.118924,-0.012401,-0.028849,-0.052458,-0.001251,0.534525,,-0.128325,
C3N-01648,,-0.018247,-0.166399,-0.055132,0.287773,0.040302,-0.004579,0.040053,0.054391,0.017791,...,-0.153775,0.236419,-0.102720,0.178380,0.076352,0.167799,,,0.017676,
C3N-01649,0.131998,0.047608,0.058750,0.049832,0.138271,0.248659,0.007734,0.122265,-0.013364,-0.054940,...,-0.065001,0.046015,0.012371,-0.035182,0.076530,-0.432277,,0.312212,0.182501,
C3N-01651,,-0.076749,-0.418871,0.129164,0.253536,-0.091163,-0.113438,0.230977,0.096755,0.104274,...,-0.113729,0.040112,0.118889,0.052720,-0.093561,0.391485,,-0.000415,,-0.079001


#### Select tumor and normal above_reg_line values for top genes

In [9]:
above_reg_line_top_genes = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]
    
    top_res = res[res["Gene"].isin(highest_delta_genes[cancer_type])].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="above_reg_line",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    dropna(axis=0, how="any").\
    reset_index(drop=False)
    
    top_res = top_res.\
    pivot(
        index="Patient_ID",
        columns="Gene",
        values=["Normal", "Tumor"],
    ).\
    swaplevel(0, 1, axis=1).\
    sort_index(axis=1)
    
    top_res = cput.reduce_multiindex(top_res, flatten=True).\
    add_prefix("above_reg_line_")
    
    top_res.columns.name = None
    
    above_reg_line_top_genes[cancer_type] = top_res.astype(bool)

above_reg_line_top_genes["ccrcc"]

Unnamed: 0_level_0,above_reg_line_ADCY3_Normal,above_reg_line_ADCY3_Tumor,above_reg_line_AGK_Normal,above_reg_line_AGK_Tumor,above_reg_line_AGXT_Normal,above_reg_line_AGXT_Tumor,above_reg_line_AHSA1_Normal,above_reg_line_AHSA1_Tumor,above_reg_line_ALDH18A1_Normal,above_reg_line_ALDH18A1_Tumor,...,above_reg_line_YARS2_Normal,above_reg_line_YARS2_Tumor,above_reg_line_ZDHHC2_Normal,above_reg_line_ZDHHC2_Tumor,above_reg_line_ZEB1_Normal,above_reg_line_ZEB1_Tumor,above_reg_line_ZNF358_Normal,above_reg_line_ZNF358_Tumor,above_reg_line_ZNF397_Normal,above_reg_line_ZNF397_Tumor
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,True,True,True,False,True,True,False,True,False,True,...,False,True,True,True,True,True,False,False,True,True
C3L-00010,True,True,False,False,True,False,True,False,False,False,...,False,False,True,True,True,True,True,True,True,True
C3L-00011,True,True,True,False,True,True,True,True,True,False,...,True,False,True,False,True,True,False,False,True,True
C3L-00026,True,True,True,True,True,True,False,False,True,True,...,False,True,True,True,True,True,True,True,True,True
C3L-00079,True,True,False,True,False,False,False,True,True,True,...,False,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,True,True,True,True,False,False,False,False,True,True,...,True,False,False,True,True,True,False,True,True,True
C3N-01648,True,True,True,False,True,True,True,True,True,True,...,True,True,True,True,True,True,False,True,True,True
C3N-01649,False,False,False,False,False,False,True,False,True,False,...,True,True,True,True,True,False,True,True,True,True
C3N-01651,True,True,True,True,True,False,True,True,True,True,...,True,True,True,True,False,True,True,True,False,False


### Generate, for each patient, correlation between tumor and normal residuals

This is the correlation value from a graph for each patient where the x axis is normal residuals for a gene in that patient and the y axis is tumor residuals for a gene in that patient.

In [10]:
# FINDS THE TUMOR NORMAL CORR FOR THE HIGHEST DELTA GENES

def get_patient_residuals_corr(res):
    """For a particular cancer type, find the correlation coefficient of tumor
    residual to normal residual for each patient in the cancer type.
    
    Parameters:
    res (pandas.DataFrame): The residuals table for the cancer type
    
    Returns:
    pandas.DataFrame: A table containing the correlation coefficient of tumor 
        residual to normal residual for each patient in the cancer type
    """
    
    # Just get the columns we need
    res = res[["Patient_ID", "Gene", "Tissue", "orth_resid"]]
    
    # Make tumor and normal residuals separate columns
    res = res.pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="orth_resid",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).reset_index(drop=False)
    
    res.columns.name = None
    
    # Define function to get correlation for a particular patient
    def get_corr(df):
        return df.corr(
            method="spearman",
            min_periods=10 if cancer_type == "endometrial" else 15,
        ).iloc[0][1]
    
    # Get correlation for each patient
    corr = res.\
    groupby("Patient_ID").\
    apply(get_corr).\
    rename("tumor_normal_residuals_corr").\
    to_frame().\
    dropna(axis=0, how="any")
    
    return corr

patient_residuals_corr = {}
for cancer_type in cancer_types:
    df = residuals[cancer_type]
    patient_residuals_corr[cancer_type] = get_patient_residuals_corr(df[df['Gene'].isin(highest_delta_genes[cancer_type])])

patient_residuals_corr["luad"]

Unnamed: 0_level_0,tumor_normal_residuals_corr
Patient_ID,Unnamed: 1_level_1
C3L-00001,0.250850
C3L-00009,0.221196
C3L-00080,0.091227
C3L-00083,0.057682
C3L-00093,0.070900
...,...
C3N-02582,0.192453
C3N-02586,0.219598
C3N-02587,0.203052
C3N-02588,0.147810


In [11]:
def patient_residuals_corr_plot(res):
    """For a particular cancer type, find the correlation coefficient of tumor
    residual to normal residual for each patient in the cancer type.
    
    Parameters:
    res (pandas.DataFrame): The residuals table for the cancer type
    
    Returns:
    pandas.DataFrame: A table containing the correlation coefficient of tumor 
        residual to normal residual for each patient in the cancer type
    """
    
    # Just get the columns we need
    res = res[["Patient_ID", "Gene", "Tissue", "orth_resid"]]

    
    # Make tumor and normal residuals separate columns
    res = res.pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values="orth_resid",
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).reset_index(drop=False)
    
    res.columns.name = None
    res = res[res["Patient_ID"] == "C3N-01648"]
    
    
    scatter = alt.Chart(res).mark_point().encode(
        x=alt.X(
            "Normal",
            title="Normal residual"
        ),
        y=alt.Y(
            "Tumor",
            title="Tumor residual"
        ),
    )
    
    chart = (scatter + scatter.transform_regression('Normal', 'Tumor').mark_line(color="#FF7F0E")).properties(
        title=["Tumor-normal residuals correlation for each protein", "Patient C3N-01648"]
    )
    
    return chart

df = residuals['ccrcc']
#patient_residuals_corr_plot(residuals["ccrcc"])
patient_residuals_corr_plot(df[df['Gene'].isin(highest_delta_genes[cancer_type])])

Now remember--we can only generate this data for paired tumor-normal samples. Earlier we calculated how many paired samples we have for each data type; below we print the numbers of patients we have tumor-normal residuals correlation values for, which should be the same as the number of paired samples for each cancer type.

In [12]:
print("Number of patients per cancer type for which we were able to calculate tumor-normal residuals correlation:")
for cancer_type in cancer_types:
    print(f"{cancer_type: >11} - {patient_residuals_corr[cancer_type].shape[0]: >3}")

Number of patients per cancer type for which we were able to calculate tumor-normal residuals correlation:
      ccrcc -  75
endometrial -  14
      hnscc -  42
       lscc -  94
       luad - 101


### Generate, for each patient, correlations of RNA tumor-normal ratio to protein tumor-normal ratio

This is the correlation value from a graph for each patient where the x axis is the ratio of tumor over normal transcriptomics value for a gene in that patient, and the y axis is the ratio of tumor over normal proteomics value for a gene in that patient.

In [13]:
prot_RNA_tumor_normal_ratios_corr = {}
for cancer_type in cancer_types:
    df = residuals[cancer_type]
    df = df[df['Gene'].isin(highest_delta_genes[cancer_type])]
    ratios = df[["Patient_ID", "Tissue", "Gene", "Proteomics", "Transcriptomics"]].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values=["Proteomics", "Transcriptomics"],
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    reset_index(drop=False)
    ratios = cput.reduce_multiindex(ratios, flatten=True)
    
    def make_cols_ratio(df, col1, col2, ratio_col_name):
        return df.\
        assign(**{ratio_col_name: df[col1] / df[col2]}).\
        drop(columns=[col1, col2])
    
    ratios = make_cols_ratio(
        ratios, 
        "Proteomics_Tumor", 
        "Proteomics_Normal", 
        "Prot_Tumor_Normal_Ratio"
    )
    ratios = make_cols_ratio(
        ratios, 
        "Transcriptomics_Tumor", 
        "Transcriptomics_Normal", 
        "RNA_Tumor_Normal_Ratio"
    )
    
    # Define function to get correlation for a particular patient
    def get_corr(df):
        return df.corr(
            method="spearman",
            min_periods=10 if cancer_type == "endometrial" else 15,
        ).iloc[0][1]
    
    corr = ratios.\
    groupby("Patient_ID").\
    apply(get_corr).\
    rename("prot_RNA_tumor_normal_ratios_corr").\
    to_frame().\
    dropna(axis=0, how="any")
    
    prot_RNA_tumor_normal_ratios_corr[cancer_type] = corr
    
prot_RNA_tumor_normal_ratios_corr["ccrcc"]

Unnamed: 0_level_0,prot_RNA_tumor_normal_ratios_corr
Patient_ID,Unnamed: 1_level_1
C3L-00004,0.055306
C3L-00010,0.010438
C3L-00011,0.152253
C3L-00026,0.018879
C3L-00079,-0.298308
...,...
C3N-01646,0.205542
C3N-01648,0.174059
C3N-01649,0.135709
C3N-01651,0.082687


In [14]:
for cancer_type in ["hnscc"]:
    df = residuals[cancer_type]
    df = df[df['Gene'].isin(highest_delta_genes[cancer_type])]
    ratios = df[["Patient_ID", "Tissue", "Gene", "Proteomics", "Transcriptomics"]].\
    pivot_table(
        index=["Patient_ID", "Gene"],
        columns="Tissue",
        values=["Proteomics", "Transcriptomics"],
        aggfunc=np.mean, # To handle duplicates--temp until we get Database_ID
    ).\
    reset_index(drop=False)
    ratios = cput.reduce_multiindex(ratios, flatten=True)
    
    def make_cols_ratio(df, col1, col2, ratio_col_name):
        return df.\
        assign(**{ratio_col_name: df[col1] / df[col2]}).\
        drop(columns=[col1, col2])
    
    ratios = make_cols_ratio(
        ratios, 
        "Proteomics_Tumor", 
        "Proteomics_Normal", 
        "Prot_Tumor_Normal_Ratio"
    )
    ratios = make_cols_ratio(
        ratios, 
        "Transcriptomics_Tumor", 
        "Transcriptomics_Normal", 
        "RNA_Tumor_Normal_Ratio"
    )
    
ratios = ratios[ratios["Patient_ID"] == "C3N-03888"]

scatter = alt.Chart(ratios).mark_point().encode(
    x=alt.X(
        "RNA_Tumor_Normal_Ratio",
        title="RNA tumor-normal ratio"
    ),
    y=alt.Y(
        "Prot_Tumor_Normal_Ratio",
        title="Protein tumor-normal ratio"
    ),
    tooltip="Gene"
)

chart = (scatter + scatter.transform_regression("RNA_Tumor_Normal_Ratio", "Prot_Tumor_Normal_Ratio").mark_line(color="#FF7F0E")).properties(
    title=["RNA to protein tumor-normal ratio correlation for each protein", "Patient C3N-03888"]
)
chart

### Load pancan clinical data

In [15]:
dss = {
    "ccrcc": pc.PancanCcrcc,
    "endometrial": pc.PancanUcec,
    "hnscc": pc.PancanHnscc,
    "lscc": pc.PancanLscc,
    "luad": pc.PancanLuad,
}

In [16]:
def get_clinical(dss, cancer_type):
    
    # Download
    #pc.download("pancan" + cancer_type) if cancer_type != "endometrial" else pc.download("pancanucec")
    
    # Instantiate the dataset
    ds = dss[cancer_type]()
    
    # We use get_clinical instead of get_followup because get_followup just returns
    # a subset of the clinical table, and we need other columns too
    clin_vars = ds.get_clinical()[[
        "Recurrence status (1, yes; 0, no)",
        "Survial status (1, dead; 0, alive)",
        "baseline/histologic_type",
        "baseline/pathologic_staging_primary_tumor",
        "cptac_path/histologic_grade",
        "follow-up/measure_of_success_of_outcome_at_last_available_follow-up",
    ]]

    # Get rid of categorizing prefixes in column names, where they exist
    cols = clin_vars.columns.str.split("/", expand=True).to_frame()
    cols.columns = ["a", "b"]
    cols = cols.assign(b=cols["b"].fillna(cols["a"])) # If there was no prefix, no change
    clin_vars.columns = cols["b"].rename(None)
    
    # Fix values in measure of success column--sometimes deceased patients are listed
    # as "Persistent Disease" in this column, instead of "Patient Deceased"
    clin_vars = clin_vars.assign(
        **{"measure_of_success_of_outcome_at_last_available_follow-up": np.where(
            clin_vars["Survial status (1, dead; 0, alive)"] == 1,
            "Patient Deceased",
            clin_vars["measure_of_success_of_outcome_at_last_available_follow-up"]
        )}
    )
    
    # Make paired Patient_IDs same
    clin_vars.index = clin_vars.index.str.split("\.N", expand=True).to_frame()[0]
    
    # Drop any completely NaN rows
    clin_vars = clin_vars.dropna(axis=0, how="all")

    return clin_vars

clin = {}
for cancer_type in cancer_types:
    clin[cancer_type] = get_clinical(dss, cancer_type)

                                                 

### Combine data into one table

In [22]:
all_data = {}
for cancer_type in cancer_types:
    dfs = [
        top_genes_residuals_diff[cancer_type],
        above_reg_line_top_genes[cancer_type],
        patient_residuals_corr[cancer_type],
        prot_RNA_tumor_normal_ratios_corr[cancer_type],
        clin[cancer_type],
    ]
    
    cancer_type_all = dfs[0]
    for df in dfs[1:]:
        cancer_type_all = cancer_type_all.join(df, how="outer")
    cancer_type_all = cancer_type_all.assign(cancer_type=cancer_type)
    all_data[cancer_type] = cancer_type_all

In [23]:
all_data

Unnamed: 0,tumor_normal_residual_diff_AAGAB,tumor_normal_residual_diff_AAR2,tumor_normal_residual_diff_ABAT,tumor_normal_residual_diff_ABCC1,tumor_normal_residual_diff_ABHD11,tumor_normal_residual_diff_ABHD15,tumor_normal_residual_diff_ABHD3,tumor_normal_residual_diff_ABI2,tumor_normal_residual_diff_ABO,tumor_normal_residual_diff_ABRACL,...,above_reg_line_ZNF532_Normal,above_reg_line_ZNF532_Tumor,above_reg_line_ZNF544_Normal,above_reg_line_ZNF544_Tumor,above_reg_line_ZNF609_Normal,above_reg_line_ZNF609_Tumor,above_reg_line_ZNF668_Normal,above_reg_line_ZNF668_Tumor,above_reg_line_ZNF672_Normal,above_reg_line_ZNF672_Tumor
C3L-00004,-0.114371,0.058212,-0.126338,0.279651,0.191433,-0.806119,0.286192,-0.024497,,-0.258826,...,,,,,,,,,,
C3L-00010,0.125057,-0.010755,0.322410,-0.112348,-0.030529,0.174814,-0.010978,-0.092135,0.067995,-0.141112,...,,,,,,,,,,
C3L-00011,-0.000058,-0.015141,0.504048,0.073510,-0.028614,0.577749,0.022975,-0.162236,-0.014068,0.012607,...,,,,,,,,,,
C3L-00026,0.056581,-0.065460,0.076625,-0.471484,0.533219,-1.328916,0.007143,-0.034911,0.553142,-0.525805,...,,,,,,,,,,
C3L-00079,0.173266,-0.040997,-0.469949,-0.156338,-0.052869,,-0.014427,-0.061460,,0.014097,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02582,0.538398,,,,0.133200,,,,,,...,False,True,False,True,False,False,False,False,True,False
C3N-02586,0.510025,,,,0.115293,,,,,,...,False,False,False,True,False,False,False,False,True,False
C3N-02587,-0.080176,,,,0.292935,,,,,,...,True,False,True,True,False,False,False,True,False,True
C3N-02588,0.118428,,,,-0.078499,,,,,,...,False,False,False,False,False,True,False,False,True,False


In [58]:
# Define input and target columns
# input_cols = [  

#     # Cancer type
#     "cancer_type",
    
#     # Our calculated correlations
#     "tumor_normal_residuals_corr",
#     "prot_RNA_tumor_normal_ratios_corr",
    
#     # Gene-based regression data
#     "above_reg_line_ACSS1_Normal",
#     "above_reg_line_ACSS1_Tumor",
#     "above_reg_line_C1orf116_Normal",
#     "above_reg_line_C1orf116_Tumor",
#     "above_reg_line_CARD9_Normal",
#     "above_reg_line_CARD9_Tumor",
#     "above_reg_line_CDKN2A_Normal",
#     "above_reg_line_CDKN2A_Tumor",
#     "above_reg_line_CFI_Normal",
#     "above_reg_line_CFI_Tumor",
#     "above_reg_line_DYNC1LI1_Normal",
#     "above_reg_line_DYNC1LI1_Tumor",
#     "above_reg_line_ECH1_Normal",
#     "above_reg_line_ECH1_Tumor",
#     "above_reg_line_ECT2_Normal",
#     "above_reg_line_ECT2_Tumor",
#     "above_reg_line_FAM57A_Normal",
#     "above_reg_line_FAM57A_Tumor",
#     "above_reg_line_GPCPD1_Normal",
#     "above_reg_line_GPCPD1_Tumor",
#     "above_reg_line_HPS3_Normal",
#     "above_reg_line_HPS3_Tumor",
#     "above_reg_line_IFT43_Normal",
#     "above_reg_line_IFT43_Tumor",
#     "above_reg_line_IMPDH1_Normal",
#     "above_reg_line_IMPDH1_Tumor",
#     "above_reg_line_IMPDH2_Normal",
#     "above_reg_line_IMPDH2_Tumor",
#     "above_reg_line_MBNL2_Normal",
#     "above_reg_line_MBNL2_Tumor",
#     "above_reg_line_MFSD4A_Normal",
#     "above_reg_line_MFSD4A_Tumor",
#     "above_reg_line_NELL1_Normal",
#     "above_reg_line_NELL1_Tumor",
#     "above_reg_line_PPM1L_Normal",
#     "above_reg_line_PPM1L_Tumor",
#     "above_reg_line_RIPK3_Normal",
#     "above_reg_line_RIPK3_Tumor",
#     "above_reg_line_RNASET2_Normal",
#     "above_reg_line_RNASET2_Tumor",
#     "above_reg_line_SLC1A5_Normal",
#     "above_reg_line_SLC1A5_Tumor",
#     "above_reg_line_TALDO1_Normal",
#     "above_reg_line_TALDO1_Tumor",
#     "above_reg_line_THADA_Normal",
#     "above_reg_line_THADA_Tumor",
#     "above_reg_line_TXLNG_Normal",
#     "above_reg_line_TXLNG_Tumor",
#     "above_reg_line_USP7_Normal",
#     "above_reg_line_USP7_Tumor",
#     "tumor_normal_residual_diff_ACSS1",
#     "tumor_normal_residual_diff_C1orf116",
#     "tumor_normal_residual_diff_CARD9",
#     "tumor_normal_residual_diff_CDKN2A",
#     "tumor_normal_residual_diff_CFI",
#     "tumor_normal_residual_diff_DYNC1LI1",
#     "tumor_normal_residual_diff_ECH1",
#     "tumor_normal_residual_diff_ECT2",
#     "tumor_normal_residual_diff_FAM57A",
#     "tumor_normal_residual_diff_GPCPD1",
#     "tumor_normal_residual_diff_HPS3",
#     "tumor_normal_residual_diff_IFT43",
#     "tumor_normal_residual_diff_IMPDH1",
#     "tumor_normal_residual_diff_IMPDH2",
#     "tumor_normal_residual_diff_MBNL2",
#     "tumor_normal_residual_diff_MFSD4A",
#     "tumor_normal_residual_diff_NELL1",
#     "tumor_normal_residual_diff_PPM1L",
#     "tumor_normal_residual_diff_RIPK3",
#     "tumor_normal_residual_diff_RNASET2",
#     "tumor_normal_residual_diff_SLC1A5",
#     "tumor_normal_residual_diff_TALDO1",
#     "tumor_normal_residual_diff_THADA",
#     "tumor_normal_residual_diff_TXLNG",
#     "tumor_normal_residual_diff_USP7",
# ]

input_cols = [
    #Cancer type
     "cancer_type",
    
     # Our calculated correlations
     "tumor_normal_residuals_corr",
     "prot_RNA_tumor_normal_ratios_corr",
    
    ] + 

target_cols = [
    "Recurrence status (1, yes; 0, no)",
    "Survial status (1, dead; 0, alive)",
    "histologic_grade",
    "histologic_type",
    "measure_of_success_of_outcome_at_last_available_follow-up",
    "pathologic_staging_primary_tumor",
]

In [59]:
# Drop rows where input cols are all NaN
nan_subset = input_cols.copy()
nan_subset.remove("cancer_type")
all_data = all_data.dropna(axis=0, how="all", subset=nan_subset)

In [60]:
# Clean up and simplify the tumor stage column
all_data = all_data.assign(pathologic_staging_primary_tumor=all_data["pathologic_staging_primary_tumor"].\
    str.split(" ", expand=True)[0].\
    str.lower().\
    str.strip().\
    str.replace(":", "", regex=False).\
    str.replace("^p", "", regex=True).\
    str.replace("^(\d)", "t\1", regex=True).\
    str.replace("\x01", "", regex=False).\
    str.replace("[abc]", "", regex=True).\
    replace(to_replace="t", value=np.nan)
)

In [61]:
# Reorder the columns nicely, and split into inputs and targets
inputs = all_data[input_cols]
targets = all_data[target_cols]

In [62]:
# Save the tables
inputs.to_csv("inputs.tsv", sep="\t")
targets.to_csv("targets.tsv", sep="\t")