In [1]:
import cptac.pancan as pc
import cptac.utils as cput
import numpy as np
import os
import pandas as pd
import pcprutils as ut
import altair as alt
import math
from math import dist

### Get pancancerProteinMRNA repository path
This analysis builds on data stored in the publicly accessible pancancerProteinMRNA repository. To access that data, clone the repository, then save the path to your local clone (plain text, without quotes) in a file named `pancancerProteinMRNA_repo_path.txt` in the same directory as this notebook. The notebook reads the path from that file to locate the data.

In [2]:
# Read the repository path from the user-supplied text file. Surrounding whitespace is
# stripped so that a trailing newline added by a text editor cannot end up in the middle
# of the paths that later cells build with os.path.join(pcp_path, ...).
with open("pancancerProteinMRNA_repo_path.txt", "r") as pcp_path_file:
    pcp_path = pcp_path_file.read().strip()

print(pcp_path)

/Users/Bryn/PayneLab/pancancer/pancancerProteinMRNA


In [3]:
# Cancer types analyzed in this notebook. The strings are used as dictionary keys and as
# file-name prefixes by the cells below.
cancer_types = ["ccrcc", "endometrial", "hnscc", "lscc", "luad"]

### Load delta correlations

In [4]:
# Read the delta-correlation table from the cloned repository and lowercase the cancer
# labels so they match the keys in `cancer_types`.
delta_corr_path = os.path.join(
    pcp_path,
    "notebook_steps_Spearman",
    "data",
    "delta_correlation_df.csv",
)
delta_corr = pd.read_csv(delta_corr_path)
delta_corr["Cancer"] = delta_corr["Cancer"].str.lower()

delta_corr

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.198013,2.451044e-01,4.045115e-01,hnscc
1,A2M,-0.118384,4.480278e-01,6.091130e-01,hnscc
2,A2ML1,-0.023469,2.918125e-01,4.561968e-01,hnscc
3,AAAS,0.275905,1.051756e-01,2.209072e-01,hnscc
4,AACS,-0.136836,1.800586e-01,3.266475e-01,hnscc
...,...,...,...,...,...
50684,ZWINT,1.219024,2.267627e-09,1.049863e-07,endometrial
50685,ZXDC,-0.346532,2.983295e-01,5.386144e-01,endometrial
50686,ZYG11B,0.768196,5.463938e-04,5.319699e-03,endometrial
50687,ZYX,0.253630,2.456049e-01,4.795301e-01,endometrial


### Load gene-based residuals data

In [5]:
# Load one residuals table per cancer type, keyed by cancer type.
residuals = {}
residuals_dir_path = os.path.join(pcp_path, "notebook_steps_Spearman", "clinical_associations")

for cancer_type in cancer_types:
    file_name = f"{cancer_type}_residuals.tsv.gz"
    res = pd.read_csv(os.path.join(residuals_dir_path, file_name), sep="\t")
    # Normal samples are named "<patient>.N"; keep only the part before ".N" so that the
    # tumor and normal rows of one patient share a Patient_ID.
    # The pattern is a raw string: "\." in a normal string literal is an invalid escape
    # sequence, which Python warns about and will eventually reject.
    res = res.assign(Patient_ID=res["Name"].str.split(r"\.N", expand=True)[0])
    residuals[cancer_type] = res

residuals["ccrcc"]

Unnamed: 0,Name,Gene,Proteomics,Tissue,Transcriptomics,m,b,orth_resid,intersect_x,intersect_y,above_reg_line,Patient_ID
0,C3L-00004,A1CF,0.569688,Tumor,15.287382,0.131457,-1.330471,0.108536,15.273236,0.677298,False,C3L-00004
1,C3L-00010,A1CF,0.289879,Tumor,15.952434,0.131457,-1.330471,0.472637,15.890833,0.758485,False,C3L-00010
2,C3L-00011,A1CF,-2.555520,Tumor,0.016077,0.131457,-1.330471,1.216695,-0.142501,-1.349203,False,C3L-00011
3,C3L-00026,A1CF,0.392756,Tumor,15.005262,0.131457,-1.330471,0.247188,14.973045,0.637836,False,C3L-00026
4,C3L-00079,A1CF,-0.884605,Tumor,4.334532,0.131457,-1.330471,0.122880,4.318517,-0.762773,False,C3L-00079
...,...,...,...,...,...,...,...,...,...,...,...,...
556633,C3N-01646.N,ZSCAN18,0.336257,Normal,8.516963,0.031765,-0.094214,0.159848,8.522038,0.176490,True,C3N-01646
556634,C3N-01648.N,ZSCAN18,0.181620,Normal,7.488197,0.031765,-0.094214,0.037951,7.489402,0.143688,True,C3N-01648
556635,C3N-01649.N,ZSCAN18,0.122117,Normal,10.325149,0.031765,-0.094214,0.111592,10.321607,0.233653,False,C3N-01649
556636,C3N-01651.N,ZSCAN18,0.136398,Normal,6.818568,0.031765,-0.094214,0.014012,6.819013,0.122393,True,C3N-01651


### Get the genes with a large, significant change in correlation for each cancer type (|delta correlation| > 0.7 and FDR < 0.05)

In [6]:
def get_highest_delta_corr(df, fdr_cutoff=0.05, min_abs_delta=0.7):
    """Return the genes whose delta correlation is both large and significant.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "Gene", "Delta_Correlation" and "FDR" columns.
    fdr_cutoff : float, optional
        A gene is kept only if its FDR is strictly below this value (default 0.05).
    min_abs_delta : float, optional
        A gene is kept only if the absolute value of its delta correlation is strictly
        above this value (default 0.7).

    Returns
    -------
    list
        Values of the "Gene" column for the selected rows, ordered by increasing
        absolute delta correlation.
    """
    ranked = df.assign(abs_delta_corr=df["Delta_Correlation"].abs())
    ranked = ranked.sort_values(by="abs_delta_corr")
    # |x| > t is the same test as (x > t) | (x < -t), and NaN fails both forms.
    keep = (ranked["FDR"] < fdr_cutoff) & (ranked["abs_delta_corr"] > min_abs_delta)
    return ranked.loc[keep, "Gene"].tolist()

# Select the genes of interest separately within each cancer type.
highest_delta_genes = dict.fromkeys(cancer_types)
for cancer in cancer_types:
    is_this_cancer = delta_corr["Cancer"].eq(cancer)
    cancer_df = delta_corr[is_this_cancer]
    highest_delta_genes[cancer] = get_highest_delta_corr(cancer_df)

highest_delta_genes

{'ccrcc': ['AP4S1',
  'GDAP1',
  'UNC5B',
  'ZDHHC2',
  'USP47',
  'NEK3',
  'PTTG1IP',
  'STRA6',
  'AZI2',
  'WNK1',
  'RALGAPA1',
  'FAF2',
  'TSPYL4',
  'SCGN',
  'LSR',
  'MUC15',
  'SLC35A5',
  'CLINT1',
  'GMFB',
  'RHAG',
  'MARK3',
  'EFHD1',
  'CR2',
  'CNTLN',
  'PEG10',
  'PTPRG',
  'NACC1',
  'C9orf78',
  'IGF2BP3',
  'LY6E',
  'MUC13',
  'YARS2',
  'CERCAM',
  'PGK1',
  'LPCAT1',
  'AP3M1',
  'NCOA7',
  'DNAAF2',
  'NPEPPS',
  'METAP1D',
  'PGBD5',
  'SCAP',
  'CCP110',
  'GOLPH3L',
  'ALDH18A1',
  'SLC36A2',
  'PDCD7',
  'PPM1M',
  'GCDH',
  'RBP2',
  'POMT2',
  'NOVA2',
  'DLG5',
  'APLP2',
  'PGD',
  'COLEC12',
  'MCC',
  'PIK3R4',
  'PDZRN3',
  'AHSA1',
  'KIAA1522',
  'CYP4F2',
  'NOL3',
  'SLC44A1',
  'DCTD',
  'ZNF358',
  'ENPP2',
  'PPID',
  'PLS1',
  'TTC36',
  'NGLY1',
  'EHD4',
  'MAPK10',
  'SMAP1',
  'CCNY',
  'PNCK',
  'TREM2',
  'PDP1',
  'SMPD2',
  'CALCRL',
  'CLCC1',
  'PPP1R3G',
  'SLC22A8',
  'NCAPD2',
  'MIOS',
  'PISD',
  'BTN3A1',
  'COL5A1',
  'DUS

### Get the distance between the tumor and normal residual points (`intersect_x`, `intersect_y`) for each patient and each selected gene.

In [7]:
# For each cancer type, build a patient x gene table holding the Euclidean distance
# between the tumor and the normal (intersect_x, intersect_y) points of each selected gene.
top_genes_residuals_dist = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]

    # One row per (patient, gene), with the tumor and normal coordinates side by side.
    # Rows missing either tissue are dropped because no distance can be computed.
    paired = (
        res[res["Gene"].isin(highest_delta_genes[cancer_type])]
        .pivot_table(
            index=["Patient_ID", "Gene"],
            columns="Tissue",
            values=["intersect_x", "intersect_y"],
            # The string form avoids the pandas deprecation of passing np.mean.
            aggfunc="mean", # To handle duplicates--temp until we get Database_ID
        )
        .dropna(axis=0, how="any")
    )

    # Vectorized distance: hypot(dx, dy) for every (patient, gene) row at once.
    dx = paired[("intersect_x", "Tumor")] - paired[("intersect_x", "Normal")]
    dy = paired[("intersect_y", "Tumor")] - paired[("intersect_y", "Normal")]
    dist = pd.Series(np.hypot(dx.to_numpy(), dy.to_numpy()), index=paired.index)

    # Move the Gene index level into the columns: rows are patients, columns are genes.
    top_res = dist.unstack("Gene").add_prefix("tumor_normal_residual_dist_")

    top_res.columns.name = None

    top_genes_residuals_dist[cancer_type] = top_res

top_genes_residuals_dist["ccrcc"]

Unnamed: 0_level_0,tumor_normal_residual_dist_ADCY3,tumor_normal_residual_dist_AGK,tumor_normal_residual_dist_AGXT,tumor_normal_residual_dist_AHSA1,tumor_normal_residual_dist_ALDH18A1,tumor_normal_residual_dist_ANKZF1,tumor_normal_residual_dist_AP3M1,tumor_normal_residual_dist_AP4S1,tumor_normal_residual_dist_APLP2,tumor_normal_residual_dist_APPL1,...,tumor_normal_residual_dist_USP47,tumor_normal_residual_dist_USP6NL,tumor_normal_residual_dist_VPS25,tumor_normal_residual_dist_WNK1,tumor_normal_residual_dist_XPNPEP1,tumor_normal_residual_dist_YARS2,tumor_normal_residual_dist_ZDHHC2,tumor_normal_residual_dist_ZEB1,tumor_normal_residual_dist_ZNF358,tumor_normal_residual_dist_ZNF397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,,2.368666,13.190856,8.481922,7.053608,2.121463,1.227607,0.648589,90.507645,1.901647,...,0.690147,0.334414,2.890216,22.063776,6.150969,1.036304,,,1.803353,
C3L-00010,,0.160878,11.688750,11.935946,4.414980,2.268968,2.087734,1.128539,36.071283,5.221817,...,1.146994,0.258698,1.391139,27.201389,4.916786,0.891255,,,,
C3L-00011,,0.914093,3.568109,6.287237,34.092835,7.746025,5.285246,1.872484,5.217191,2.252784,...,8.855820,0.549924,2.219539,37.144099,11.330073,4.226288,11.778113,,2.735838,
C3L-00026,,1.699979,3.824380,3.670214,18.820047,5.686363,1.816493,0.364995,73.821719,4.882907,...,1.284213,0.515147,4.089902,29.820201,5.085747,0.748156,,0.145020,,
C3L-00079,,0.600205,2.337069,1.420444,9.337463,3.583359,0.263818,1.039790,53.952419,7.534761,...,0.111235,1.697944,6.768456,13.759756,2.167726,1.526541,,4.290248,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,,0.780648,4.981830,7.273306,20.638650,5.568947,1.877280,1.558076,9.978579,5.104328,...,2.485320,0.622399,3.926320,19.072103,9.838101,2.040016,7.116299,,,0.227134
C3N-01648,,1.179941,5.613505,0.705244,5.357198,0.381726,2.456724,0.426310,34.261414,4.428208,...,0.456511,0.030175,2.158922,23.660798,2.559214,0.734890,,,0.993375,
C3N-01649,3.768251,0.663157,5.136188,3.093425,1.741500,2.624052,0.315908,0.386628,42.591365,0.622113,...,0.759353,1.172342,1.509458,25.003360,5.858864,0.976327,,22.031193,,
C3N-01651,2.546564,1.107103,10.179527,3.290355,23.025980,14.078832,1.001933,0.323635,71.024819,5.637702,...,3.961973,0.378206,3.196001,31.660524,1.705531,1.383558,,5.654181,,0.567331


### Get the signed orthogonal residual of the tumor sample for each patient and each selected gene. The sign shows the side of the regression line: negative values lie below the line and positive values lie above it.

In [8]:
def get_signed_res(row):
    """Return a row's orthogonal residual, negated when the point is not above the line.

    `row` must support lookup of 'orth_resid' and 'above_reg_line' by key.
    """
    magnitude = row['orth_resid']
    return magnitude if row['above_reg_line'] else magnitude * -1
    
# For each cancer type, build a patient x gene table of the signed orthogonal residual of
# the tumor sample, restricted to patients that also have a normal sample for that gene.
above_reg_line_top_genes = {}
for cancer_type in cancer_types:
    res = residuals[cancer_type]

    # Vectorized sign flip: keep orth_resid where the point is above the regression line
    # and negate it otherwise. This replaces a row-by-row DataFrame.apply, which is very
    # slow on tables of this size. The column is still written onto the frame stored in
    # `residuals`, as before.
    res['signed_orth_res'] = res['orth_resid'].where(
        res['above_reg_line'].astype(bool),
        -res['orth_resid'],
    )

    # One row per (patient, gene) with Normal and Tumor columns. Rows missing either
    # tissue are dropped, so only paired samples remain.
    paired = (
        res[res["Gene"].isin(highest_delta_genes[cancer_type])]
        .pivot_table(
            index=["Patient_ID", "Gene"],
            columns="Tissue",
            values="signed_orth_res",
            # The string form avoids the pandas deprecation of passing np.mean.
            aggfunc="mean", # To handle duplicates--temp until we get Database_ID
        )
        .dropna(axis=0, how="any")
    )

    # Keep the tumor values only and move genes into the columns.
    top_res = (
        paired["Tumor"]
        .unstack("Gene")
        .sort_index(axis=1)
        .add_prefix("signed_orth_res_")
    )

    top_res.columns.name = None

    above_reg_line_top_genes[cancer_type] = top_res

above_reg_line_top_genes["ccrcc"]

Unnamed: 0_level_0,signed_orth_res_ADCY3,signed_orth_res_AGK,signed_orth_res_AGXT,signed_orth_res_AHSA1,signed_orth_res_ALDH18A1,signed_orth_res_ANKZF1,signed_orth_res_AP3M1,signed_orth_res_AP4S1,signed_orth_res_APLP2,signed_orth_res_APPL1,...,signed_orth_res_USP47,signed_orth_res_USP6NL,signed_orth_res_VPS25,signed_orth_res_WNK1,signed_orth_res_XPNPEP1,signed_orth_res_YARS2,signed_orth_res_ZDHHC2,signed_orth_res_ZEB1,signed_orth_res_ZNF358,signed_orth_res_ZNF397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,,0.087608,0.173471,-0.035851,-0.148725,0.380128,-0.050607,0.051859,-0.525722,-0.013978,...,0.010893,0.152850,0.181947,0.149241,0.422377,0.061650,,,-0.069231,
C3L-00010,,-0.297483,0.064843,-0.240694,-0.565231,-0.117507,0.080662,0.173044,-0.286951,-0.008435,...,-0.075189,0.055203,0.050951,-0.086946,0.142795,-0.338331,,,,
C3L-00011,,-0.326863,0.183270,0.333279,-0.438071,0.032485,0.093737,0.008956,0.610745,-0.228480,...,-0.261528,-0.312138,-0.777187,0.605387,-0.179915,-0.343415,0.018595,,0.113805,
C3L-00026,,0.105183,-1.214213,-0.246163,0.611293,0.342243,-0.017364,0.117009,-0.153906,-0.232801,...,0.088498,0.155157,-0.068772,-0.096996,0.189214,0.779191,,0.288435,,
C3L-00079,,0.167635,0.090650,0.126676,0.310022,0.359718,-0.083191,0.188223,0.421607,-0.631637,...,-0.097382,-0.313189,-0.276598,0.065601,0.052522,0.252166,,0.195492,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,,0.176065,-0.509809,-0.121372,-0.455689,-0.016855,0.239441,0.119615,0.031246,0.030285,...,0.057308,-0.252448,-0.027133,0.098906,-0.123295,-0.373050,0.864411,,,-0.09257
C3N-01648,,-0.004708,0.204864,0.148220,0.456692,0.225285,0.035138,0.315726,-0.311116,0.301077,...,0.078408,-0.588844,0.005994,0.420178,-0.162881,0.089283,,,0.615288,
C3N-01649,0.314160,0.035091,0.166634,-0.277172,-0.178384,-0.373279,-0.162847,-0.322632,0.014751,-0.121734,...,-0.187441,0.058879,-0.050599,-0.024938,-0.208821,0.035469,,-0.710171,,
C3N-01651,-0.263315,0.418284,-0.090940,0.025406,0.347838,0.258287,-0.001249,-0.155597,-0.108598,0.016128,...,0.021833,-0.144407,-0.415184,0.089418,0.032847,0.417992,,0.201946,,-0.07813


### Combine the dataframes for the residual distances and orthogonal residual to create the input dataframe

In [9]:
# Join the two feature tables of each cancer type on Patient_ID. The outer join keeps
# patients present in either table; rows with no values at all are then removed.
all_data = {}
for cancer_type in cancer_types:
    dist_features = top_genes_residuals_dist[cancer_type]
    signed_features = above_reg_line_top_genes[cancer_type]
    cancer_type_all = dist_features.join(signed_features, how="outer").dropna(axis=0, how="all")
    all_data[cancer_type] = cancer_type_all

all_data['ccrcc']

Unnamed: 0_level_0,tumor_normal_residual_dist_ADCY3,tumor_normal_residual_dist_AGK,tumor_normal_residual_dist_AGXT,tumor_normal_residual_dist_AHSA1,tumor_normal_residual_dist_ALDH18A1,tumor_normal_residual_dist_ANKZF1,tumor_normal_residual_dist_AP3M1,tumor_normal_residual_dist_AP4S1,tumor_normal_residual_dist_APLP2,tumor_normal_residual_dist_APPL1,...,signed_orth_res_USP47,signed_orth_res_USP6NL,signed_orth_res_VPS25,signed_orth_res_WNK1,signed_orth_res_XPNPEP1,signed_orth_res_YARS2,signed_orth_res_ZDHHC2,signed_orth_res_ZEB1,signed_orth_res_ZNF358,signed_orth_res_ZNF397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,,2.368666,13.190856,8.481922,7.053608,2.121463,1.227607,0.648589,90.507645,1.901647,...,0.010893,0.152850,0.181947,0.149241,0.422377,0.061650,,,-0.069231,
C3L-00010,,0.160878,11.688750,11.935946,4.414980,2.268968,2.087734,1.128539,36.071283,5.221817,...,-0.075189,0.055203,0.050951,-0.086946,0.142795,-0.338331,,,,
C3L-00011,,0.914093,3.568109,6.287237,34.092835,7.746025,5.285246,1.872484,5.217191,2.252784,...,-0.261528,-0.312138,-0.777187,0.605387,-0.179915,-0.343415,0.018595,,0.113805,
C3L-00026,,1.699979,3.824380,3.670214,18.820047,5.686363,1.816493,0.364995,73.821719,4.882907,...,0.088498,0.155157,-0.068772,-0.096996,0.189214,0.779191,,0.288435,,
C3L-00079,,0.600205,2.337069,1.420444,9.337463,3.583359,0.263818,1.039790,53.952419,7.534761,...,-0.097382,-0.313189,-0.276598,0.065601,0.052522,0.252166,,0.195492,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,,0.780648,4.981830,7.273306,20.638650,5.568947,1.877280,1.558076,9.978579,5.104328,...,0.057308,-0.252448,-0.027133,0.098906,-0.123295,-0.373050,0.864411,,,-0.09257
C3N-01648,,1.179941,5.613505,0.705244,5.357198,0.381726,2.456724,0.426310,34.261414,4.428208,...,0.078408,-0.588844,0.005994,0.420178,-0.162881,0.089283,,,0.615288,
C3N-01649,3.768251,0.663157,5.136188,3.093425,1.741500,2.624052,0.315908,0.386628,42.591365,0.622113,...,-0.187441,0.058879,-0.050599,-0.024938,-0.208821,0.035469,,-0.710171,,
C3N-01651,2.546564,1.107103,10.179527,3.290355,23.025980,14.078832,1.001933,0.323635,71.024819,5.637702,...,0.021833,-0.144407,-0.415184,0.089418,0.032847,0.417992,,0.201946,,-0.07813


### Get the clinical data for each cancer that will be the targets for the model

In [10]:
# Dataset class from cptac.pancan for each cancer type; instantiated in get_clinical.
dss = dict(
    ccrcc=pc.PancanCcrcc,
    endometrial=pc.PancanUcec,
    hnscc=pc.PancanHnscc,
    lscc=pc.PancanLscc,
    luad=pc.PancanLuad,
)

In [11]:
def get_clinical(dss, cancer_type):
    """Return the clinical target columns for one cancer type.

    Parameters
    ----------
    dss : dict
        Maps cancer type names to zero-argument callables. Calling one must return an
        object with a ``get_clinical()`` method that returns a DataFrame.
    cancer_type : str
        Key into ``dss``.

    Returns
    -------
    pandas.DataFrame
        The six selected clinical columns, with any "prefix/" removed from the column
        names, deceased patients relabelled in the measure-of-success column, ".N"
        removed from the index labels, and rows that are entirely NaN dropped.
        The index name becomes ``0`` as a side effect of how the index is rebuilt.
    """
    # NOTE(review): the original cell had the download step commented out
    # (pc.download("pancan" + cancer_type), or "pancanucec" for endometrial), so this
    # presumably relies on the data already being available locally -- confirm.

    # Instantiate the dataset
    ds = dss[cancer_type]()

    # We use get_clinical instead of get_followup because get_followup just returns
    # a subset of the clinical table, and we need other columns too
    wanted_columns = [
        "Recurrence status (1, yes; 0, no)",
        "Survial status (1, dead; 0, alive)",
        "baseline/histologic_type",
        "baseline/pathologic_staging_primary_tumor",
        "cptac_path/histologic_grade",
        "follow-up/measure_of_success_of_outcome_at_last_available_follow-up",
    ]
    clin_vars = ds.get_clinical()[wanted_columns]

    # Get rid of categorizing prefixes in column names, where they exist. Taking the text
    # after the last "/" leaves unprefixed names unchanged.
    clin_vars.columns = [name.split("/")[-1] for name in clin_vars.columns]

    # Fix values in measure of success column--sometimes deceased patients are listed
    # as "Persistent Disease" in this column, instead of "Patient Deceased"
    clin_vars = clin_vars.assign(
        **{"measure_of_success_of_outcome_at_last_available_follow-up": np.where(
            clin_vars["Survial status (1, dead; 0, alive)"] == 1,
            "Patient Deceased",
            clin_vars["measure_of_success_of_outcome_at_last_available_follow-up"]
        )}
    )

    # Make paired Patient_IDs same: keep the part of each label before ".N".
    # The pattern is a raw string because "\." is an invalid escape sequence in a normal
    # string literal.
    # NOTE(review): assigning this Series sets the index name to 0; kept as is so the
    # exported files do not change, but consider naming the index "Patient_ID".
    clin_vars.index = clin_vars.index.str.split(r"\.N", expand=True).to_frame()[0]

    # Drop any completely NaN rows
    clin_vars = clin_vars.dropna(axis=0, how="all")

    return clin_vars

# Clinical target table for every cancer type, keyed by cancer type.
clin = {
    cancer_type: get_clinical(dss, cancer_type)
    for cancer_type in cancer_types
}

                                                 

In [12]:
# Clean up and simplify the tumor stage column
def _simplify_tumor_stage(stage):
    """Normalize a Series of primary-tumor stage strings.

    Keeps the first space-separated token, lowercases it, removes ":" and a leading "p",
    prefixes a leading digit with "t", removes the letters a, b and c, and turns a bare
    "t" into NaN.
    """
    return (
        stage
        .str.split(" ", expand=True)[0]
        .str.lower()
        .str.strip()
        .str.replace(":", "", regex=False)
        .str.replace(r"^p", "", regex=True)
        # r"t\1" is a regex backreference to the captured digit. The original "t\1" was
        # the control character "\x01", which dropped the digit and turned digit-leading
        # stages into a bare "t" (then NaN). The "\x01" cleanup step is no longer needed.
        .str.replace(r"^(\d)", r"t\1", regex=True)
        .str.replace(r"[abc]", "", regex=True)
        .replace(to_replace="t", value=np.nan)
    )

for cancer_type in cancer_types:
    clin[cancer_type] = clin[cancer_type].assign(
        pathologic_staging_primary_tumor=_simplify_tumor_stage(
            clin[cancer_type]["pathologic_staging_primary_tumor"]
        )
    )

In [13]:
# Drop rows in which every column is NaN, for both the inputs and the targets.
# dropna(how="all") already considers every column, so no explicit subset is passed.
for cancer_type in cancer_types:
    input_table = all_data[cancer_type]
    all_data[cancer_type] = input_table.dropna(axis=0, how="all")

    target_table = clin[cancer_type]
    clin[cancer_type] = target_table.dropna(axis=0, how="all")

In [None]:
# Export input and target dataframes
# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs("uncleaned_data", exist_ok=True)
for cancer_type in cancer_types:
    all_data[cancer_type].to_csv(f'uncleaned_data/{cancer_type}_inputs.tsv', sep="\t")
    clin[cancer_type].to_csv(f'uncleaned_data/{cancer_type}_targets.tsv', sep="\t")

In [15]:
# Preview the cleaned clinical targets for ccrcc.
clin["ccrcc"]

Unnamed: 0_level_0,"Recurrence status (1, yes; 0, no)","Survial status (1, dead; 0, alive)",histologic_type,pathologic_staging_primary_tumor,histologic_grade,measure_of_success_of_outcome_at_last_available_follow-up
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C3L-00004,0.0,0.0,Clear cell renal cell carcinoma,t3,"G3: Nuclei very irregular, approximately 20µm;...",Unknown
C3L-00010,0.0,0.0,Clear cell renal cell carcinoma,t1,"G3: Nuclei very irregular, approximately 20µm;...",Complete Remission
C3L-00011,0.0,1.0,Clear cell renal cell carcinoma,t3,"G4: Nuclei bizarre and multilobulated, 20µm or...",Patient Deceased
C3L-00026,0.0,0.0,Clear cell renal cell carcinoma,t1,"G3: Nuclei very irregular, approximately 20µm;...",Complete Remission
C3L-00079,0.0,1.0,Clear cell renal cell carcinoma,t3,"G3: Nuclei very irregular, approximately 20µm;...",Patient Deceased
...,...,...,...,...,...,...
C3N-01646,0.0,0.0,Clear cell renal cell carcinoma,t3,"G3: Nuclei very irregular, approximately 20µm;...",Complete Remission
C3N-01648,0.0,0.0,Clear cell renal cell carcinoma,t2,"G2: Nuclei slightly irregular, approximately 1...",Complete Remission
C3N-01649,0.0,0.0,Clear cell renal cell carcinoma,t3,"G2: Nuclei slightly irregular, approximately 1...",Complete Remission
C3N-01651,1.0,0.0,Clear cell renal cell carcinoma,t2,"G3: Nuclei very irregular, approximately 20µm;...",Persistent Disease


In [16]:
# Preview the cleaned clinical targets for lscc.
clin["lscc"]

Unnamed: 0_level_0,"Recurrence status (1, yes; 0, no)","Survial status (1, dead; 0, alive)",histologic_type,pathologic_staging_primary_tumor,histologic_grade,measure_of_success_of_outcome_at_last_available_follow-up
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C3L-00081,1.0,1.0,Squamous cell carcinoma,t2,G2 Moderately differentiated,Patient Deceased
C3L-00415,1.0,0.0,Squamous cell carcinoma,t2,G3 Poorly differentiated,Persistent Disease
C3L-00445,0.0,1.0,Keratinizing squamous cell carcinoma,t2,G2 Moderately differentiated,Patient Deceased
C3L-00568,0.0,0.0,Squamous cell carcinoma,t2,G2 Moderately differentiated,Not Applicable
C3L-00603,0.0,0.0,Squamous cell carcinoma,t1,G3 Poorly differentiated,
...,...,...,...,...,...,...
C3N-03886,0.0,0.0,Basaloid squamous cell carcinoma,t2,GX Grade cannot be assessed,Complete Remission
C3N-04124,0.0,0.0,Squamous cell carcinoma,t2,G3 Poorly differentiated,Not Applicable
C3N-04127,0.0,0.0,Squamous cell carcinoma,t2,G2 Moderately differentiated,Complete Remission
C3N-04155,0.0,0.0,Non-keratinizing squamous cell carcinoma,t1,G2 Moderately differentiated,Complete Remission
