## Write summary table of Kolmogorov-Smirnov GO results

In this notebook, we go through the GO enrichment results of all conditions and summarize them in one overview file (one Excel sheet per ontology).

In [1]:
import os
# Switch the working directory to the GO analysis folder; the relative paths
# used further down ("." and "../results_tables/...") resolve against it.
# NOTE: the path is relative, so re-running this cell in the same kernel moves
# the working directory again (or fails) - restart the kernel before re-running.
os.chdir("../../data/GO_analyses")
# Ontologies for which a summary sheet is written (used as sub-folder names
# and as sheet-name suffixes by GOsummaryWriter).
ontologies_of_interest = ["cellular_component", "biological_process"]

# Directory whose sub-folders are scanned for per-condition results
# ("." = the directory selected by os.chdir above).
base_dir = "."

In [2]:
import pandas as pd

# pandas display option: render repeated outer-level MultiIndex labels only once
# ("sparsified" display). Affects how DataFrames are shown, not their contents.
pd.options.display.multi_sparse = True

class GOsummaryWriter():
    """Summarize the GO results of all conditions for one ontology and write them to Excel.

    Every non-hidden sub-folder of ``base_dir`` is treated as one condition. For each
    condition the file ``<base_dir>/<condition>/<ontology>/signed_ksfull.tsv`` is loaded,
    the per-condition tables are combined, and the combined table is written as sheet
    ``GO_<ontology>`` to ``<base_dir>/../results_tables/GO_summary_table.xlsx``.

    All work (loading, merging, writing) happens in the constructor.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-folder per condition.
    ontology_of_interest : str
        Name of the ontology sub-folder to read; also used in the sheet name.

    Attributes
    ----------
    summary_df : pandas.DataFrame
        The combined table that was written to the workbook.
    """

    def __init__(self, base_dir, ontology_of_interest):
        self._base_dir = base_dir
        self._ksresult_file = "signed_ksfull.tsv"
        self._ontology_of_interest = ontology_of_interest
        self._list_of_condition_go_results = []

        self.summary_df = None

        self._define_summary_df()
        self._write_summary_df()

    def _define_summary_df(self):
        """Load every condition's result and store the merged table in ``summary_df``."""
        self._define_list_of_condition_go_results()
        self.summary_df = MergerOfSingleConditionGOResults(self._list_of_condition_go_results).merged_df

    def _write_summary_df(self):
        """Write ``summary_df`` as sheet ``GO_<ontology>`` of the summary workbook.

        An existing workbook is opened in append mode so that sheets written for other
        ontologies are kept; otherwise a new workbook is created.
        """
        excel_path = f'{self._base_dir}/../results_tables/GO_summary_table.xlsx'
        # Only the mode differs between "append to existing" and "create new".
        writer_mode = 'a' if os.path.exists(excel_path) else 'w'
        with pd.ExcelWriter(excel_path, engine="openpyxl", mode=writer_mode) as writer:
            self.summary_df.to_excel(writer, sheet_name=f"GO_{self._ontology_of_interest}")

    def _define_list_of_condition_go_results(self):
        """Create one ``SingleConditionGOResult`` per condition folder."""
        for folder_name in self._iterate_over_folders_in_base_dir():
            goresult_filepath = self._get_goresult_filepath(folder_name)
            self._list_of_condition_go_results.append(SingleConditionGOResult(goresult_filepath, condition_name=folder_name))

    def _iterate_over_folders_in_base_dir(self):
        """Yield the names of all non-hidden sub-directories of ``base_dir`` in sorted order."""
        # os.listdir returns entries in arbitrary order; sorting makes the order in
        # which conditions are loaded (and therefore merged) reproducible.
        for folder in sorted(os.listdir(self._base_dir)):
            if folder.startswith("."):
                continue
            if os.path.isdir(os.path.join(self._base_dir, folder)):
                yield folder

    def _get_goresult_filepath(self, folder):
        """Return the path of the result file of condition ``folder`` for this ontology."""
        return f"{self._base_dir}/{folder}/{self._ontology_of_interest}/{self._ksresult_file}"

class MergerOfSingleConditionGOResults():
    """Combine several single-condition GO result tables into one wide table.

    Every element of ``list_of_single_condition_go_results`` has to provide a
    ``reduced_goresults_df`` attribute. The tables are placed side by side, the
    columns are brought into a fixed order, rows are sorted by the
    ('Median Score', 'Survival Score') column in descending order and rows with
    missing values are removed. The result is available as ``merged_df`` and is
    also shown via ``display``.
    """

    def __init__(self, list_of_single_condition_go_results):
        self._list_of_single_condition_go_results = list_of_single_condition_go_results

        self.merged_df = None

        self._define_merged_df()

    def _define_merged_df(self):
        """Build ``merged_df`` from the per-condition tables and display it."""
        per_condition_frames = [
            single_result.reduced_goresults_df
            for single_result in self._list_of_single_condition_go_results
        ]
        # Column-wise concatenation lines the tables up on their row index;
        # reset_index afterwards turns the index levels into regular columns.
        side_by_side = pd.concat(per_condition_frames, axis=1).reset_index()
        in_fixed_order = side_by_side[self._define_sorted_multiindex()]
        ranked = in_fixed_order.sort_values(by=[('Median Score', 'Survival Score')], ascending=False)
        self.merged_df = ranked.dropna()
        display(self.merged_df)

    def _define_sorted_multiindex(self):
        """Return the fixed two-level column order of the merged table."""
        conditions = ['Survival Score', 'C', 'A', 'R', 'L', 'S']
        column_order = [('term', ''), ('go-name', '')]
        # First all FDR columns, then all Median Score columns, each in condition order.
        for measure in ('FDR', 'Median Score'):
            column_order.extend((measure, condition) for condition in conditions)
        return pd.MultiIndex.from_tuples(column_order)

class SingleConditionGOResult():
    """GO result table of one condition, reduced to FDR and median score per term.

    The tab-separated file at ``goresult_filepath`` is read on construction.

    Attributes
    ----------
    reduced_goresults_df : pandas.DataFrame
        Indexed by ("term", "go-name"); columns are the two-level labels
        ("FDR", condition_name) and ("Median Score", condition_name).
    goterm2goname : dict
        Maps each value of the "term" column to the "go-name" value of the same row.
    """

    def __init__(self, goresult_filepath, condition_name):
        self._goresult_filepath = goresult_filepath
        self._condition_name = condition_name
        self._goresults_df = None

        self.reduced_goresults_df = None
        self.goterm2goname = {}

        self._define_reduced_goresults_df()
        self._define_goterm2goname()

    def _define_reduced_goresults_df(self):
        """Load the file and derive ``reduced_goresults_df`` from it."""
        self._load_goresults_df()
        subset = self._subset_goresults_df()
        self.reduced_goresults_df = subset.set_index(["term", 'go-name'])
        self._rename_reduced_goresults_df()
        self._define_column_index()

    def _load_goresults_df(self):
        """Read the tab-separated result file into ``_goresults_df``."""
        self._goresults_df = pd.read_csv(self._goresult_filepath, sep="\t")

    def _subset_goresults_df(self):
        """Return only the columns needed for the summary, in a fixed order."""
        wanted_columns = ["term", "fdr", "median_z_target", "go-name"]
        return self._goresults_df[wanted_columns]

    def _rename_reduced_goresults_df(self):
        """Give the value columns their display names."""
        # "term" is already part of the index when this runs, so its entry does
        # not match any column.
        display_names = {"median_z_target" : "Median Score", "fdr" : "FDR", "term" : "GO Term"}
        self.reduced_goresults_df = self.reduced_goresults_df.rename(columns=display_names)

    def _define_column_index(self):
        """Replace the columns by two-level labels (measure, condition name)."""
        # Labels are assigned by position: FDR first, Median Score second,
        # matching the column order produced by _subset_goresults_df.
        condition = self._condition_name
        two_level_labels = [(measure, condition) for measure in ("FDR", "Median Score")]
        self.reduced_goresults_df.columns = pd.MultiIndex.from_tuples(two_level_labels)

    def _define_goterm2goname(self):
        """Fill ``goterm2goname`` from the "term" and "go-name" columns."""
        terms = self._goresults_df["term"]
        names = self._goresults_df["go-name"]
        self.goterm2goname = {term: name for term, name in zip(terms, names)}

In [3]:
import os

# Start from a clean workbook: GOsummaryWriter appends a sheet when the file
# already exists, so a leftover workbook from an earlier run is removed first.
# The path is derived from base_dir the same way GOsummaryWriter derives it, so
# the file that is deleted is always the file that is written.
summary_table_path = f"{base_dir}/../results_tables/GO_summary_table.xlsx"
if os.path.exists(summary_table_path):
    os.remove(summary_table_path)

# One sheet per ontology; each constructor call loads, merges and writes.
for ontology in ontologies_of_interest:
    GOsummaryWriter(base_dir, ontology)
    print(f"{ontology} done")

Unnamed: 0_level_0,term,go-name,FDR,FDR,FDR,FDR,FDR,FDR,Median Score,Median Score,Median Score,Median Score,Median Score,Median Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Survival Score,C,A,R,L,S,Survival Score,C,A,R,L,S
13,GO:0030288,outer membrane-bounded periplasmic space,9.062e-09,0.0001036,0.279,0.47,0.0001261,4.503e-05,4.047,2.475,0.767,0.564,2.482,2.931
22,GO:0042597,periplasmic space,1.321e-08,0.0002,0.057,0.47,0.0001261,1.247e-05,4.047,2.474,0.861,0.564,2.482,2.968
18,GO:0031975,envelope,6.713e-09,5.304e-05,0.101,0.872,5.054e-05,3.925e-09,3.819,2.056,0.331,0.224,2.261,3.201
15,GO:0030313,cell envelope,6.713e-09,5.304e-05,0.101,0.872,5.054e-05,3.925e-09,3.819,2.056,0.331,0.224,2.261,3.201
7,GO:0019867,outer membrane,0.004,0.292,0.476,0.174,0.046,3.756e-06,3.125,1.485,0.174,-0.791,1.93,3.949
33,GO:0044462,external encapsulating structure part,0.016,0.466,0.476,0.174,0.095,1.779e-05,2.39,1.345,0.174,-0.791,1.712,3.859
1,GO:0009279,cell outer membrane,0.016,0.466,0.476,0.174,0.095,1.779e-05,2.39,1.345,0.174,-0.791,1.712,3.859
14,GO:0030312,external encapsulating structure,0.008,0.306,0.563,0.174,0.107,1.02e-05,2.39,1.485,0.174,-0.791,1.704,3.393
17,GO:0031230,intrinsic component of cell outer membrane,0.026,0.461,0.819,0.768,0.361,0.004,2.279,1.485,-0.227,0.936,2.201,3.207
21,GO:0032993,protein-DNA complex,0.397,0.074,0.563,0.385,0.61,0.992,2.125,2.205,0.419,-1.03,0.834,0.603


cellular_component done


Unnamed: 0_level_0,term,go-name,FDR,FDR,FDR,FDR,FDR,FDR,Median Score,Median Score,Median Score,Median Score,Median Score,Median Score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Survival Score,C,A,R,L,S,Survival Score,C,A,R,L,S
7,GO:0000302,response to reactive oxygen species,7.000000e-03,9.900000e-02,5.960000e-01,7.780000e-01,8.200000e-02,1.500000e-02,6.553,3.271,0.912,-0.792,2.079,6.551
248,GO:0034599,cellular response to oxidative stress,3.165000e-05,3.600000e-02,2.910000e-01,1.580000e-01,1.400000e-02,1.800000e-02,5.667,4.423,1.313,4.183,2.538,6.344
452,GO:1901700,response to oxygen-containing compound,3.840000e-04,3.000000e-03,5.100000e-02,1.400000e-01,4.400000e-02,1.700000e-02,5.211,3.386,1.589,2.323,2.079,3.379
230,GO:0031669,cellular response to nutrient levels,1.570000e-01,1.880000e-01,1.340000e-01,9.700000e-01,6.310000e-01,3.220000e-01,4.352,2.981,2.542,-1.567,1.557,2.776
453,GO:1901701,cellular response to oxygen-containing compound,1.070000e-01,5.100000e-02,7.070000e-01,2.000000e-02,2.360000e-01,3.900000e-02,4.263,3.386,1.124,4.622,2.168,6.459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,GO:0042273,ribosomal large subunit biogenesis,5.516000e-11,6.597000e-09,1.869000e-05,6.832000e-09,4.246000e-05,1.000000e-03,-7.349,-5.760,-2.448,-4.002,-3.097,-3.143
250,GO:0034622,cellular macromolecular complex assembly,0.000000e+00,2.999000e-12,8.093000e-10,3.148000e-13,2.441000e-08,8.670000e-07,-7.349,-5.323,-3.000,-3.775,-2.825,-3.933
211,GO:0022618,ribonucleoprotein complex assembly,0.000000e+00,0.000000e+00,5.067000e-10,0.000000e+00,2.166000e-08,9.966000e-07,-7.349,-6.125,-3.519,-4.258,-3.051,-3.933
266,GO:0042274,ribosomal small subunit biogenesis,8.079000e-08,1.232000e-05,2.137000e-04,8.147000e-06,2.000000e-03,1.000000e-03,-7.349,-5.559,-3.548,-4.258,-2.773,-4.531


biological_process done


In [4]:
def merge_dataframes(list_of_dataframes):
    """Merge several pandas DataFrames on their shared "term" column.

    ``pd.merge`` combines exactly two frames, so the frames are merged pairwise
    from left to right using pandas' default (inner) join.

    Parameters
    ----------
    list_of_dataframes : iterable of pandas.DataFrame
        Frames to merge; each must contain a "term" column.

    Returns
    -------
    pandas.DataFrame
        The merged frame. A single input frame is returned unchanged.

    Raises
    ------
    ValueError
        If no dataframe is given.
    """
    from functools import reduce  # local import keeps this cell self-contained

    frames = list(list_of_dataframes)
    if not frames:
        raise ValueError("merge_dataframes needs at least one dataframe")
    return reduce(lambda left, right: pd.merge(left, right, on="term"), frames)