In [25]:
## INPUT DESIRED STUDY NUMBER
## Set `cancertype` to one of the study numbers listed under STUDY INFO below.
## The number is used as a key into the `studies` dict, so it has to match one
## of the listed values exactly (note that 3.1 is a float).
cancertype = 1

##------------------------------------------------- STUDY INFO -------------------------------------------------------------- ##
## Studies 3.1 and 8 (the "Cell 2015" studies) are the ones listed in
## `nonpancancer`; all other numbers are handled as PanCancer studies.
##
## 1) Colorectal Adenocarcinoma
## 2) Uterine Corpus Endometrial Carcinoma
## 3) Invasive Breast Carcinoma (PanCancer)
## 3.1) Invasive Breast Carcinoma (Cell 2015)
## 4) Kidney Renal Clear Cell Carcinoma
## 5) Kidney Renal Papillary Cell Carcinoma
## 6) Liver Hepatocellular Carcinoma
## 7) Lung Adenocarcinoma
## 8) Prostate Adenocarcinoma (Cell 2015)
## 9) Stomach Adenocarcinoma
## 10) Thyroid Carcinoma

In [26]:
## Study numbers that are NOT handled as PanCancer studies
nonpancancer = [3.1, 8]

## Study number -> study name (the name is also used in exported file names)
studies = {
    1: "colorectaladenocarcinoma",
    2: "endometrialcarcinoma",
    3: "invasivebreastcarcinoma_pc",
    3.1: "invasivebreastcarcinoma_cell",
    4: "kidneyrenalclearcellcarcinoma",
    5: "kidneyrenalpapillarycellcarcinoma",
    6: "liverhepatocellularcarcinoma",
    7: "lungadenocarcinoma",
    8: "prostateadenocarcinoma",
    9: "stomachadenocarcinoma",
    10: "thyroidcarcinoma",
}

## Study name -> label used to select the study's rows from the 'TumorType'
## grouping of the protein data. Only PanCancer studies have an entry.
typedict = {
    "colorectaladenocarcinoma": "CORE",
    "endometrialcarcinoma": "UCEC",
    "invasivebreastcarcinoma_pc": "BRCA",
    "kidneyrenalclearcellcarcinoma": "KIRC",
    "kidneyrenalpapillarycellcarcinoma": "KIRP",
    "liverhepatocellularcarcinoma": "LIHC",
    "lungadenocarcinoma": "LUAD",
    "stomachadenocarcinoma": "STAD",
    "thyroidcarcinoma": "THCA",
}

## Importing pandas
import pandas as pd 
## Raise the display limits so large frames are shown rather than truncated
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100000)

## Locations of the tab-separated source files
protein = "data/tcga_rppa.txt"
clinical = "data/tcga_clinical.tsv"

## Loading both files as dataframes; the clinical df is indexed by patient barcode
proteindf = pd.read_csv(protein, sep='\t')
clinicaldf = pd.read_csv(
    clinical,
    sep='\t',
    index_col='bcr_patient_barcode',
    engine='python',
)

## Trimming every sample ID to its first 12 characters and using it as the index,
## which is what the clinical data is later aligned on
proteindf['SampleID'] = proteindf['SampleID'].map(lambda barcode: barcode[:12])
proteindf = proteindf.set_index('SampleID')

## Race labels that are kept as-is; every other value ends up as NaN after the mapping below
race_labels = (
    'AMERICAN INDIAN OR ALASKA NATIVE',
    'ASIAN',
    'BLACK OR AFRICAN AMERICAN',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
    'WHITE',
)

## Copying the race column of the clinical df into the protein df (aligned on the
## index) and moving it to position 1
proteindf['Race'] = clinicaldf['race']
race_column = proteindf.pop('Race')
proteindf.insert(1, 'Race', race_column)
proteindf['Race'] = proteindf['Race'].map({label: label for label in race_labels})

## Ordering the protein df by sample ID
proteindf = proteindf.sort_index()

## Groupings by cancer type and by race, plus race counts within each cancer type
typegroup = proteindf.groupby('TumorType')
racegroup = proteindf.groupby('Race')
racebytype = typegroup['Race'].value_counts()

## Fail early, with a readable message, when the study number is not a known study
if cancertype not in studies:
    raise ValueError(
        "Unknown study number {!r}; expected one of {}".format(cancertype, sorted(studies))
    )

## If the data is PanCancer:
if cancertype not in nonpancancer:

    #Creating a dataframe including only samples from the specified cancer type
    typespecific = typegroup.get_group(typedict[studies[cancertype]])

    #Keeping one row per sample ID: keep='last' retains the last row of each
    #duplicated ID and drops the earlier ones.
    #NOTE(review): the previous comment read "Deleting second samples", which
    #suggests keeping the first row instead -- confirm which sample is wanted.
    typespecific = typespecific[~typespecific.index.duplicated(keep='last')]

    #Creating a dataframe sorted by race to make it easier to annotate in Reactome
    racesort = typespecific.sort_values(by = 'Race')

    #Per-race subsets of the selected cancer type
    typespecific_black = typespecific.loc[typespecific['Race'] == "BLACK OR AFRICAN AMERICAN"]
    typespecific_white = typespecific.loc[typespecific['Race'] == "WHITE"]
    typespecific_asian = typespecific.loc[typespecific['Race'] == "ASIAN"]

    #Transposing racesort (rows become the former columns) and dropping the
    #two annotation rows so only protein rows remain
    typespecific_export = racesort.transpose().drop(['TumorType', 'Race'], axis = 0)

    #Removing and renaming selected protein rows for the Reactome export
    typespecific_export.drop(['ADAR1', 'ALPHACATENIN', 'TTF1', 'CASPASE3', 'CASPASE9', 'PARP1', 'JAB1'], inplace = True)
    typespecific_export.rename({'CD20':'MS4A1', 'DIRAS3': 'O95661', 'ERALPHA': 'P03372', 'RBM15': 'Q96T37', 'PKCALPHA' : 'P17252', 'P21': 'P38936'}, axis = "index", inplace = True)

    #Dropping every row whose name contains '_p' or 'CLEAVED'; both are plain
    #substrings, so regex matching is switched off
    has_p_suffix = typespecific_export.index.str.contains('_p', regex = False)
    has_cleaved = typespecific_export.index.str.contains('CLEAVED', regex = False)
    typespecific_export = typespecific_export[~(has_p_suffix | has_cleaved)]

    typespecific_export.columns.name = '#SampleID'
    #Tag read by export() to build the output file name
    typespecific_export.profile = "protein"

    #Export function
    def export(df, profile = None):
        """Write df to 'data/tcga_<profile>_<study>.csv'.

        df: dataframe to write; it is saved with DataFrame.to_csv.
        profile: label placed in the file name. When omitted, the `profile`
            attribute set on df is used.

        <study> is the name of the currently selected study
        (studies[cancertype]).

        Raises AttributeError when no profile is passed and df has no
        `profile` attribute.
        """
        if profile is None:
            #A plain attribute lives only on the frame it was set on, so frames
            #derived from it (slices, transposes, copies) need the explicit argument
            profile = getattr(df, 'profile', None)
        if profile is None:
            raise AttributeError(
                "dataframe has no 'profile' attribute; pass profile=... to export()"
            )
        df.to_csv('data/tcga_{}_{}.csv'.format(profile, studies[cancertype]))

else:
    #Placeholders so the names exist for non-PanCancer studies as well
    typespecific = "Not PanCancer"
    typespecific_export = "Not PanCancer"
    typespecific_black = "Not PanCancer"
    typespecific_white = "Not PanCancer"
    typespecific_asian = "Not PanCancer"
##--------------------------------------------------- GLOSSARY -------------------------------------------------------------- ##
## proteindf: Entire protein microarray dataframe of all samples across all cancer types with protein data. Sorted by SampleID
## racebytype: Breakdown of race for each cancer type. NOTE: For some reason the TCGA protein data contains slightly more samples than what is displayed in cBioPortal.

## FOR PANCANCER STUDIES ONLY:

## typespecific: Protein microarray dataframe for only the cancer type specified. Sorted by SampleID
## typespecific_export: typespecific sorted by race, transposed (proteins as rows, samples as columns) and cleaned up for export to Reactome.

## typespecific_black: typespecific filtered for Black or African American.
## typespecific_white: typespecific filtered for White.
## typespecific_asian: typespecific filtered for Asian.

## export(df): Function that converts a dataframe ready for export to Reactome into a csv.