# Write CARLS table

In this notebook, we go through all z values in all of the perturbation datasets we obtained and summarize them to one table

### Define paths to datasets

In [1]:
import os
os.chdir("../../data")

relative_locpaths = ["dataset_hui_et_al/sorted_differential_analysis_files/A", 'dataset_hui_et_al/sorted_differential_analysis_files/R','dataset_hui_et_al/sorted_differential_analysis_files/C_glucose_uptake_titration', 
'dataset_houser_et_al/sorted_differential_analysis_files/S_houser', 'dataset_schmidt_et_al/sorted_differential_analysis_files/L','dataset_schmidt_et_al/sorted_differential_analysis_files/C_chemostat',
'dataset_schmidt_et_al/sorted_differential_analysis_files/C_carbon_sources', 'dataset_schmidt_et_al/sorted_differential_analysis_files/S_schmidt']


tables_w_merged_zvals = [f"./{x}_mergedzs.tsv" for x in relative_locpaths]

gene_name_mapping = "gene_name_infos.tsv"

### Define classes for merging

In [27]:
from os import stat
import numpy as np
import pandas as pd

class ZScoreTableMerger():
    def __init__(self):
        self._carls2columns = {"C": ["C_glucose_uptake_titration", "C_chemostat", "C_carbon_sources"], "S": ["S_houser", "S_schmidt"], "Survival Score" : ["C", "A", "R", "L","S"]}
        self._relevant_columns = None

    def add_remaining_carls_columns_to_zscore_df(self, zscore_df):
        for carls_val in self._carls2columns:
            self._add_merged_zval(zscore_df, carls_val)
        return zscore_df

    def _add_merged_zval(self, zscore_df, carls_val):
        subset_df = zscore_df[self._carls2columns[carls_val]]
        zscore_df[carls_val] = [self._merge_zscore(x) for x in subset_df.values]
    
    def _merge_zscore(self, zscore_row):
        z_sum = np.nansum(zscore_row)
        n_elems = sum(~np.isnan(zscore_row))

        z_scaled = z_sum/np.sqrt(n_elems)
        z_scaled_cut = self._cut_z(z_scaled)
        return z_scaled_cut

    @staticmethod
    def _cut_z(summed_z,max_z = 7.3487):
        return np.sign(summed_z) * min(abs(summed_z), abs(max_z))

class ZScoreTableAnnotator():
    def __init__(self, gene_info_table):
        self._gene_info_table = gene_info_table
        self._gene2genename = {}
        self._define_gene2genename()
    
    def annotate_table(self, zscore_df):
        zscore_df = self._add_gene_infos(zscore_df)
        zscore_df = self._add_counts_of_numcarls(zscore_df)
        zscore_df = self._add_indication_if_included(zscore_df)
        zscore_df = self._adapt_headers(zscore_df)
        zscore_df = self._filter_unmatched_genes(zscore_df)
        zscore_df = self._sort_by_survival_score_and_no_measured(zscore_df)
        return zscore_df

    def _add_gene_infos(self, zscore_df):
        zscore_df["Gene Name"] = [self._gene2genename.get(x) for x in zscore_df["protein"]]
        return zscore_df

    def _add_counts_of_numcarls(self, zscore_df):
        zscore_df["No.Measured"] = [self._count_numvals(x) for x in zscore_df[["C", "A", "R", "L", "S"]].values]
        return zscore_df
    
    @staticmethod
    def _count_numvals(zscore_row):
        return sum(~np.isnan(zscore_row))
    
    def _add_indication_if_included(self, zscore_df):
        zscore_df["Used for Enrichment"] = [x>2 for x in zscore_df["No.Measured"]]
        return zscore_df

    def _define_gene2genename(self):
        gene_info_df = self._load_gene_info_df()
        self._gene2genename = dict(zip(gene_info_df["From"], gene_info_df["Gene Name"]))
    
    def _load_gene_info_df(self):
        return pd.read_csv(self._gene_info_table, sep = "\t")

    @staticmethod
    def _adapt_headers(zscore_df):
        zscore_df.rename(columns={"protein": "Gene"}, inplace=True)
        ordered_headers = ["Gene", "Gene Name", "Survival Score", "Used for Enrichment","No.Measured","C", "A", "R", "L", "S"]
        zscore_df = zscore_df[ordered_headers]
        return zscore_df
    
    def _filter_unmatched_genes(self, zscore_df):
        zscore_df = zscore_df[zscore_df["Gene"].notna()]
        zscore_df = zscore_df[[";" not in x for x in zscore_df["Gene"]]]
        return zscore_df
    
    def _sort_by_survival_score_and_no_measured(self, zscore_df):
        display(zscore_df)
        zscore_df = zscore_df.sort_values(by=["Used for Enrichment","Survival Score", "No.Measured"], ascending=False)
        return zscore_df

    
class DataSetZScoreCollector():
    def __init__(self, filepaths_of_z_values):
        self._filepaths_of_z_values = filepaths_of_z_values
        self._dataframes = []
        self.zscore_df = None
        self._define_zscore_df()

    def _define_zscore_df(self):
        self._define_dataframes()
        self._merge_dataframes()
        
    def _define_dataframes(self):
        for file in self._filepaths_of_z_values:
            dataset_name = self._get_datasetname(file)
            dataframe = pd.read_csv(file, sep = "\t")
            dataframe = dataframe.rename(columns={'z-value': dataset_name})
            self._dataframes.append(dataframe)

    #function that merges multiple dataframes on one column with the same name
    def _merge_dataframes(self):
        merged_dataframe = self._dataframes[0]
        for dataframe in self._dataframes[1:]:
            merged_dataframe = pd.merge(merged_dataframe, dataframe, on='protein', how='outer')
        self.zscore_df = merged_dataframe

    @staticmethod
    def _get_datasetname(file):
        file_wo_path = file.split("/")[-1]
        datasetname = file_wo_path.replace("_mergedzs.tsv", "")
        return datasetname

 

### Apply classes to datasets

In [31]:
zscore_df = DataSetZScoreCollector(tables_w_merged_zvals).zscore_df

zscore_df = ZScoreTableMerger().add_remaining_carls_columns_to_zscore_df(zscore_df)
zscore_df = ZScoreTableAnnotator(gene_name_mapping).annotate_table(zscore_df)

display(zscore_df)
zscore_df.to_excel("carls_table.xlsx", index = False)


  z_scaled = z_sum/np.sqrt(n_elems)
  z_scaled = z_sum/np.sqrt(n_elems)


Unnamed: 0,Gene,Gene Name,Survival Score,Used for Enrichment,No.Measured,C,A,R,L,S
0,sspA,Stringent starvation protein A,0.697086,True,5,1.739936,0.266570,-2.162785,-0.345325,2.060335
1,yfgD,Uncharacterized protein yfgD,1.324726,True,5,-0.167891,-1.237309,2.278502,1.699945,0.388929
2,fadE,Acyl-coenzyme A dehydrogenase,6.482253,True,4,6.544162,1.861784,,-2.495979,7.054539
3,rplK,50S ribosomal protein L11,-7.348700,True,5,-7.348700,-5.163955,-7.348700,-3.106491,-1.361082
4,ecnB,Entericidin B,5.728478,True,3,3.457100,5.529042,0.935873,,
...,...,...,...,...,...,...,...,...,...,...
2157,bluR,,2.053749,False,1,2.053749,,,,
2158,ybhP,Uncharacterized protein ybhP,5.496351,False,2,2.064187,,,,5.708828
2159,mepH,,1.774382,False,1,1.774382,,,,
2160,ydiH,Uncharacterized protein ydiH,2.108358,False,1,2.108358,,,,


Unnamed: 0,Gene,Gene Name,Survival Score,Used for Enrichment,No.Measured,C,A,R,L,S
69,metQ,D-methionine-binding lipoprotein metQ,7.348700,True,5,3.276345,-1.531813,7.348700,7.054539,4.566042
99,uspA,Universal stress protein A,7.348700,True,5,7.348700,2.233505,4.343749,4.858280,5.162996
105,luxS,S-ribosylhomocysteine lyase,7.348700,True,5,1.717947,4.108525,3.702122,6.553435,6.656184
136,lrp,Leucine-responsive regulatory protein,7.348700,True,5,4.253167,2.773323,2.322611,7.007819,0.947185
139,dcp,Peptidyl-dipeptidase dcp,7.348700,True,5,5.872755,0.839620,2.365618,4.531396,3.313756
...,...,...,...,...,...,...,...,...,...,...
1200,kpsD,,-6.510802,False,1,,,,,-6.510802
1789,sdaC,Serine transporter,-6.796026,False,2,-2.646520,,,-6.964513,
1256,rlhA,,-6.930649,False,1,,,,,-6.930649
2137,glnK,Nitrogen regulatory protein P-II 2,-7.348700,False,2,-5.741918,,,,-6.109410
