# Load data

In [1]:
import pandas as pd
import os

### Get pancancerProteinMRNA repository path

This analysis builds on data stored in the [pancancerProteinMRNA repo](https://github.com/PayneLab/pancancerProteinMRNA), which is publicly accessible. To access that data, clone the repository, then save the path to your local clone, written without quotes, in a text file named `pancancerProteinMRNA_repo_path.txt` in the same directory as this notebook. The notebook reads the path from that file and uses it to locate the data.

In [2]:
# Read the location of the local pancancerProteinMRNA clone from the sidecar text file.
# Text files usually end with a newline (editors and `echo` add one); strip surrounding
# whitespace so it does not become part of the paths later built from pcp_path.
with open("pancancerProteinMRNA_repo_path.txt", "r") as pcp_path_file:
    pcp_path = pcp_path_file.read().strip()

In [3]:
# Display the path that was read, as a quick check before it is used to build file paths.
pcp_path

'/home/caleb/GitHub/PayneLab/pancancerProteinMRNA'

In [4]:
# Cancer types covered by this notebook; each string is used as the prefix of
# that cancer type's data file names.
cancer_types = ["ccrcc", "endometrial", "hnscc", "lscc", "luad"]

In [8]:
# Directory inside the repository clone that holds the per-cancer-type residual tables.
residuals_dir_path = os.path.join(pcp_path, "notebook_steps_Spearman", "clinical_associations")

# Maps cancer type -> DataFrame read from "<cancer_type>_residuals.tsv.gz".
residuals = dict()

for cancer_type in cancer_types:
    file_name = f"{cancer_type}_residuals.tsv.gz"
    residuals_path = os.path.join(residuals_dir_path, file_name)
    # Tab-separated file; no compression argument is passed, so pandas' default handling applies.
    residuals[cancer_type] = pd.read_csv(residuals_path, sep="\t")

In [9]:
# Display the residuals table loaded for the "ccrcc" cancer type.
residuals["ccrcc"]

Unnamed: 0,Patient_ID,Gene,Proteomics,Tissue,Transcriptomics,m,b,orth_resid,intersect_x,intersect_y,above_reg_line
0,C3L-00004,A1CF,0.641447,Tumor,16.677828,0.082623,-0.847022,0.110114,16.686895,0.531707,True
1,C3L-00010,A1CF,0.194620,Tumor,16.682712,0.082623,-0.847022,0.335598,16.655078,0.529078,False
2,C3L-00011,A1CF,-0.780455,Tumor,0.245606,0.082623,-0.847022,0.046116,0.249403,-0.826415,True
3,C3L-00026,A1CF,0.404286,Tumor,16.347532,0.082623,-0.847022,0.099045,16.339377,0.502994,False
4,C3L-00079,A1CF,-0.677773,Tumor,4.858958,0.082623,-0.847022,0.231427,4.839902,-0.447132,False
5,C3L-00088,A1CF,0.310249,Tumor,13.654469,0.082623,-0.847022,0.028993,13.656856,0.281355,True
6,C3L-00096,A1CF,-0.128732,Tumor,8.107277,0.082623,-0.847022,0.048274,8.111252,-0.176842,True
7,C3L-00097,A1CF,-0.513243,Tumor,4.541293,0.082623,-0.847022,0.041298,4.537892,-0.472085,False
8,C3L-00103,A1CF,-1.135859,Tumor,1.853419,0.082623,-0.847022,0.440472,1.817149,-0.696883,False
9,C3L-00183,A1CF,-0.128068,Tumor,6.293220,0.082623,-0.847022,0.198311,6.309550,-0.325705,True


In [11]:
# Directory inside the repository clone that holds the per-cancer-type regression tables.
regression_dir_path = os.path.join(pcp_path, "notebook_steps_Spearman", "clinical_associations")

# Maps cancer type -> DataFrame read from "<cancer_type>_regression.tsv".
regressions = dict()

for cancer_type in cancer_types:
    file_name = f"{cancer_type}_regression.tsv"
    regression_path = os.path.join(regression_dir_path, file_name)
    # Tab-separated, uncompressed file.
    regressions[cancer_type] = pd.read_csv(regression_path, sep="\t")

In [12]:
# Display the regression table loaded for the "ccrcc" cancer type.
regressions["ccrcc"]

Unnamed: 0,Tissue,Gene,m,b
0,Normal,A1CF,0.073530,-0.566914
1,Normal,AADAT,0.667910,-1.559035
2,Normal,AAGAB,0.012379,-0.154849
3,Normal,AAK1,0.033276,-0.012194
4,Normal,AARS,0.007246,-0.414622
5,Normal,AARSD1,0.014896,-0.252408
6,Normal,AASS,0.026897,-0.359696
7,Normal,ABAT,0.017792,-0.552797
8,Normal,ABCA6,0.096910,-0.601968
9,Normal,ABCB1,0.031563,-0.920134
