In [10]:
def read(n):
    """Load one study's tables and publish the results as module-level globals.

    Reads ``data/mrna_<study>.tsv`` and ``data/protein_<study>.tsv`` (plus
    ``data/meth_<study>.tsv`` for the studies listed in ``hasmethylationdata``),
    flags every row with ``q-Value <= 0.05`` as significant, copies each
    table's q-value / direction / significance columns onto the other tables
    (aligned on the ``Gene`` index), and restricts the result to the genes
    named in ``genelists/!genes.txt``.

    Parameters
    ----------
    n : int or float
        Study number; must be a key of the ``studies`` dict below
        (1, 2, 3, 3.1, 4 ... 10).

    Returns
    -------
    None
        Everything is exposed through the names in the ``global`` statement.
        For studies without methylation data ``all_meth``, ``short_meth`` and
        ``meth_qvalue`` are the string ``"Not available"``.

    Raises
    ------
    KeyError
        If ``n`` is not a known study number.
    """
    global pd, np, genelist, studies, abbreviations, hasmethylationdata, asian, correctedtextfile_df, short, short_mrna, short_protein, short_combined, short_meth, all_mrna, mrna_qvalue, all_protein, protein_qvalue, all_meth, meth_qvalue, textfile_df

    ## Importing pandas (declared global above, so the import binds
    ## module-level names that code outside this function can use too)
    import numpy as np
    import pandas as pd
    pd.set_option('display.max_rows', 10000)
    pd.set_option('display.max_columns', 100)

    cancertype = n

    ## Study catalogue
    studies = {
        1:   "colorectaladenocarcinoma",
        2:   "endometrialcarcinoma",
        3:   "invasivebreastcarcinoma_pc",
        3.1: "invasivebreastcarcinoma_cell",
        4:   "kidneyrenalclearcellcarcinoma",
        5:   "kidneyrenalpapillarycellcarcinoma",
        6:   "liverhepatocellularcarcinoma",
        7:   "lungadenocarcinoma",
        8:   "prostateadenocarcinoma",
        9:   "stomachadenocarcinoma",
        10:  "thyroidcarcinoma"
    }
    abbreviations = {1:"CORE", 2:"UCEC", 3:"BRCA-PC", 3.1:"BRCA-C", 4:"KIRC", 5:"KIRP", 6:"LIHC", 7:"LUAD", 8:"PRAD", 9:"STAD", 10:"THCA"}

    ## List of studies with methylation data
    hasmethylationdata = [3.1, 8]
    ## Studies whose comparison group is Asian rather than Black
    asian = [6, 9, 10]

    if cancertype not in studies:
        raise KeyError("Unknown study number {!r}; expected one of {}".format(cancertype, list(studies)))
    has_meth = cancertype in hasmethylationdata

    ## New column name -> column it is copied from in the source table
    mrna_cols = {'mRNA q-Value': 'q-Value', 'Higher mRNA expression in': 'Higher expression in', 'mRNA Significance': 'Significance'}
    protein_cols = {'Protein q-Value': 'q-Value', 'Higher protein expression in': 'Higher expression in', 'Protein Significance': 'Significance'}
    meth_cols = {'Methylation q-Value': 'q-Value', 'Higher methylation in': 'Higher methylation in', 'Methylation Significance': 'Significance'}

    ## Reading one table and flagging statistical significance
    def load(kind):
        frame = pd.read_csv("data/{}_{}.tsv".format(kind, studies[cancertype]), sep = '\t', index_col = 'Gene')
        ## A missing q-Value compares False, i.e. counts as not significant
        frame['Significance'] = frame['q-Value'] <= 0.05
        return frame

    ## Copying columns from one table onto another; pandas aligns on the
    ## Gene index, so genes absent from `source` get NaN
    def attach(target, source, columns):
        for new_name, source_name in columns.items():
            target[new_name] = source[source_name]

    ## Creating a list of genes based on the text file
    def createlist(file):
        with open (file, 'r') as f:
            ## split() breaks on any whitespace run (spaces, tabs, newlines)
            return f.read().split()

    ## Reading both mRNA and protein files and cross-appending their data
    all_mrna = load('mrna')
    all_protein = load('protein')
    attach(all_mrna, all_protein, protein_cols)
    attach(all_protein, all_mrna, mrna_cols)

    ## Adding methylation data if available
    if has_meth:
        all_meth = load('meth')
        attach(all_meth, all_mrna, mrna_cols)
        attach(all_meth, all_protein, protein_cols)
        attach(all_mrna, all_meth, meth_cols)
        attach(all_protein, all_meth, meth_cols)
    else:
        all_meth = "Not available"
        meth_qvalue = "Not available"

    ## De-duplicated but kept in file order, so that textfile_df really
    ## follows the text file (a set would reorder it arbitrarily)
    genelist = list(dict.fromkeys(createlist('genelists/!genes.txt')))

    ## Filtering out the whole mRNA dataframe to only include the desired genes
    genesofinterest = all_mrna.loc[all_mrna.index.isin(genelist)]

    ## Creating final sorted dataframes based on genesofinterest dataframe
    textfile_df = genesofinterest.reindex(genelist)
    mrna_qvalue = genesofinterest.sort_values(by = 'q-Value', ascending = True)
    protein_qvalue = genesofinterest.sort_values(by = 'Protein q-Value', ascending = True)
    if has_meth:
        meth_qvalue = genesofinterest.sort_values(by = 'Methylation q-Value', ascending = True)

    ## Creating summary dataframe: only the columns below are kept, so the
    ## study-specific race/cytoband columns need no separate drop
    summary_columns = ['Higher mRNA expression in', 'mRNA q-Value', 'mRNA Significance',
                       'Higher protein expression in', 'Protein q-Value', 'Protein Significance']
    if has_meth:
        summary_columns += ['Higher methylation in', 'Methylation q-Value', 'Methylation Significance']

    ## Group labels exactly as spelled in the source files -> short labels
    race_labels = {
        '(A) Black or African American': 'Black',
        '(A) BLACK OR AFRICAN AMERICAN': 'Black',
        '(A) Asian': 'Asian',
        '(B) White': 'White',
        '(B) WHITE': 'White',
    }

    renamed = textfile_df.rename(columns = {source_name: new_name for new_name, source_name in mrna_cols.items()})
    correctedtextfile_df = renamed[summary_columns].replace(race_labels).sort_index()

    ## Short version: non-significant directions become 'NS' and the
    ## significance flags themselves are dropped
    flags = [('mRNA Significance', 'Higher mRNA expression in'),
             ('Protein Significance', 'Higher protein expression in')]
    if has_meth:
        flags.append(('Methylation Significance', 'Higher methylation in'))

    short = correctedtextfile_df.copy()
    for flag_column, label_column in flags:
        ## eq(False) rather than ~flag: genes missing from a table carry NaN
        ## in the flag column and must be left untouched
        short.loc[short[flag_column].eq(False), label_column] = 'NS'
    short = short.drop(columns = [flag_column for flag_column, _ in flags])

    if has_meth:
        short_meth = short[['Higher methylation in', 'Methylation q-Value']]
    else:
        short_meth = "Not available"

    short_mrna = short[['Higher mRNA expression in', 'mRNA q-Value']]
    short_protein = short[['Higher protein expression in', 'Protein q-Value']]
    short_combined = short[['Higher mRNA expression in', 'Higher protein expression in']]
    
# Create master list function
def table(mp):
    """Build one wide table that lines up every study for a given data type.

    For each study the matching ``short_*`` frame produced by ``read()`` is
    taken, its column names get the study abbreviation appended
    (e.g. ``'mRNA q-Value CORE'``), and the frames are merged on the gene
    index. Genes that are not significant in any study (every direction
    column is ``'NS'`` or missing) are removed from the result.

    Parameters
    ----------
    mp : str
        ``"mrna"``, ``"protein"``, ``"combined"`` (mRNA and protein direction
        columns only) or ``"meth"`` (only the studies listed in
        ``hasmethylationdata``).

    Returns
    -------
    pandas.DataFrame
        One row per remaining gene, one column group per study.

    Raises
    ------
    ValueError
        If ``mp`` is not one of the four supported values.

    Notes
    -----
    ``read()`` is called once per study used, so the module-level globals it
    publishes are overwritten and end up describing the last study loaded.
    """
    if mp not in ("mrna", "protein", "combined", "meth"):
        raise ValueError('mp must be "mrna", "protein", "combined" or "meth", got {!r}'.format(mp))

    # read() is what publishes studies / abbreviations / hasmethylationdata
    # / pd as globals, so one study has to be loaded before they are used.
    read(1)
    loaded = 1

    if mp == "meth":
        wanted = [key for key in studies if key in hasmethylationdata]
    else:
        wanted = list(studies)

    masterdf = None
    for key in wanted:
        if key != loaded:
            read(key)
            loaded = key
        current = {
            "mrna": short_mrna,
            "protein": short_protein,
            "combined": short_combined,
            "meth": short_meth,
        }[mp]
        # add_suffix returns a new frame, leaving the global short_* frame
        # with its original column names.
        current = current.add_suffix(' {}'.format(abbreviations[key]))
        if masterdf is None:
            masterdf = current
        else:
            masterdf = pd.merge(masterdf, current, right_index = True, left_index = True)

    masterdf = masterdf.dropna(axis = 'rows', how = 'all')

    # Direction columns: every column in "combined", otherwise every other
    # column (each study contributes a direction column then a q-value column).
    directions = masterdf if mp == "combined" else masterdf.iloc[:, ::2]
    # Keep a gene only if at least one direction is neither missing nor 'NS'.
    informative = (directions.notna() & directions.ne('NS')).any(axis = 1)
    return masterdf.loc[informative]

# Export function
def export(df, name):
    """Write ``df`` as CSV to ``data/<name>.csv`` (relative to the working directory)."""
    destination = 'data/{}.csv'.format(name)
    df.to_csv(destination)
    
def excel(df, name):
    """Write ``df`` as an Excel workbook to ``data/<name>.xlsx`` (relative to the working directory)."""
    destination = 'data/{}.xlsx'.format(name)
    df.to_excel(destination)
    
##--------------------------------------------------- GLOSSARY -------------------------------------------------------------- ##
## genelist: List of genes specified in the gene list.

## correctedtextfile_df: Shows mRNA, protein, and methylation (if available) for all genes on the text file.
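## short: Short version of above: non-significant directions are replaced by 'NS' and the significance columns are dropped. (Also: short_mrna, short_protein, short_combined, short_meth)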
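## all_mrna: Entire mRNA dataframe for all available genes, kept in source-file row order (read() does not sort it; the file is presumably exported sorted by q-Value). Protein data appended.
## all_protein: Entire protein dataframe for all available genes, kept in source-file row order (read() does not sort it; the file is presumably exported sorted by q-Value). mRNA data appended.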
## textfile_df: Dataframe containing both mRNA and protein data for the genes in the text file. Sorted by order in text file.
## mrna_qvalue: Same as textfile_df, sorted by mRNA q-Values.
## protein_qvalue: Same as textfile_df, sorted by protein q-Values.

## IF METHYLATION AVAILABLE:
## all_meth: Entire methylation dataframe for all available genes, sorted by q-Value.
## meth_qvalue: Dataframe for the genes in the text file, sorted by methylation q-Values.

In [11]:
## INPUT DESIRED STUDY NUMBER
## Must be one of the keys of the `studies` dict inside read(): 1, 2, 3, 3.1, 4, 5, 6, 7, 8, 9 or 10.
n = 1
## Loads that study; the resulting dataframes are published as globals (see the glossary).
read(n)

##------------------------------------------------- STUDY INFO -------------------------------------------------------------- ##
## 1) Colorectal Adenocarcinoma: 277 White, 61 Black
## 2) Uterine Corpus Endometrial Carcinoma: 344 White, 101 Black
## 3) Invasive Breast Carcinoma (PanCancer): 726 White, 177 Black
## 3.1) Invasive Breast Carcinoma (Cell 2015): 594 White, 90 Black
## 4) Kidney Renal Clear Cell Carcinoma: 330 White, 53 Black
## 5) Kidney Renal Papillary Cell Carcinoma: 193 White, 60 Black (No significant protein data)
## 6) Liver Hepatocellular Carcinoma: 174 White, 155 Asian
## 7) Lung Adenocarcinoma: 382 White, 52 Black
## 8) Prostate Adenocarcinoma (Cell 2015): 270 White, 43 Black
## 9) Stomach Adenocarcinoma: 258 White, 86 Asian
## 10) Thyroid Carcinoma: 320 White, 50 Asian (No significant protein data)