In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests

In [None]:
def get_top_systems(drug_name, nest_map, rlipp_df):
    """Rank subsystems by ``P_rho`` and attach their annotation names.

    Parameters
    ----------
    drug_name : str
        Not used by the ranking itself; kept so existing callers keep working.
    nest_map : mapping
        Maps every value found in the ``Term`` column to its display name.
    rlipp_df : pandas.DataFrame
        Must provide the columns ``Term``, ``P_rho``, ``P_pval``, ``C_rho``,
        ``C_pval`` and ``RLIPP``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``P_rho`` in descending order, re-indexed from 0,
        with the columns ``Rank``, ``Term``, ``Name``, ``P_rho``, ``P_pval``,
        ``C_rho``, ``C_pval`` and ``RLIPP``. ``Rank`` starts at 1.

    Raises
    ------
    KeyError
        If a term is missing from ``nest_map`` or a required column is absent.
    """
    ranked = rlipp_df.sort_values(by='P_rho', ascending=False, ignore_index=True)

    # Look the names up in one pass instead of writing cell by cell through
    # iterrows()/.at; an unknown term still raises KeyError.
    names = pd.Series(
        [nest_map[term] for term in ranked['Term']],
        index=ranked.index,
        dtype=object,
    )

    # After the re-index the row position is the rank minus one.
    ranked = ranked.assign(Rank=range(1, len(ranked) + 1), Name=names)

    return ranked[['Rank', 'Term', 'Name', 'P_rho', 'P_pval', 'C_rho', 'C_pval', 'RLIPP']]

In [None]:
def bh(p_vals):
    """Return the largest Benjamini-Hochberg adjusted p-value of ``p_vals``.

    ``p_vals`` is corrected with ``multipletests(..., method='fdr_bh')`` and
    the maximum of the corrected values (element 1 of the result) is returned.
    """
    adjusted = multipletests(p_vals, alpha=0.05, method='fdr_bh')[1]
    return adjusted.max()


# groupby(...).agg labels the output column with the callable's __name__,
# so this is what turns the aggregated column into 'P_pval_BH'.
bh.__name__ = 'BH'

In [None]:
def bonferroni(p_vals):
    """Return the largest Bonferroni adjusted p-value of ``p_vals``.

    ``p_vals`` is corrected with ``multipletests(..., method='bonferroni')``
    and the maximum of the corrected values (element 1 of the result) is
    returned.
    """
    adjusted = multipletests(p_vals, alpha=0.05, method='bonferroni')[1]
    return adjusted.max()


# groupby(...).agg labels the output column with the callable's __name__,
# so this is what turns the aggregated column into 'P_pval_Bonferroni'.
bonferroni.__name__ = 'Bonferroni'

In [None]:
# Load the NeST node table and build a lookup from node name to annotation.
nest_df = pd.read_csv('../data/NeST/NeST_node.csv', sep=',')

# Dots in the node names are replaced by dashes to form the lookup keys
# (presumably so they match the Term identifiers in rlipp.out — TODO confirm).
nest_map = {
    name.replace('.', '-'): annotation
    for name, annotation in zip(nest_df['name'], nest_df['Annotation'])
}

In [None]:
# Run configuration: these values are the components of each model directory name.
ont = 'ctg'
dataset = 'av'
zscore_method = 'auc'
folds = 5

# The drug list is read from disk and then immediately replaced by a single
# drug, so only Palbociclib is processed below.
drugs = pd.read_csv('../data/training_files_av/drugname_av.txt', header=None, names=['D'])['D'].tolist()
drugs = ['Palbociclib']

# For every drug and fold (folds are numbered from 1): read the fold's
# rlipp.out, rank its subsystems and write the ranking next to it.
for drug in drugs:
    for i in range(1, folds+1):
        modeldir = f'../models/Test/model_{ont}_{dataset}_{drug}_{zscore_method}_{i}'
        rlipp_df = pd.read_csv(f'{modeldir}/rlipp.out', sep='\t')
        subsys_df = get_top_systems(drug, nest_map, rlipp_df)
        subsys_df.to_csv(f'{modeldir}/subsystem_ranks.txt', sep='\t', index=False)

In [None]:
# Combine the per-fold subsystem rankings of each drug into one table:
# mean P_rho plus several summaries of P_pval across folds.
for drug in drugs:
    agg_terms = []
    for i in range(1, folds+1):
        modeldir = f'../models/Test/model_{ont}_{dataset}_{drug}_{zscore_method}_{i}'
        subsys_df = pd.read_csv(f'{modeldir}/subsystem_ranks.txt', sep='\t')[['Term', 'Name', 'P_rho', 'P_pval']]
        agg_terms.append(subsys_df)

    agg_df = pd.concat(agg_terms, ignore_index=True)

    # An empty Name field in the TSV is read back as NaN, and groupby drops
    # NaN keys by default; restore '' so such terms are not silently lost.
    agg_df['Name'] = agg_df['Name'].fillna('')

    # String aliases instead of np.max / np.prod: the computation is the same,
    # but the column label no longer depends on the NumPy version
    # ('amax' vs 'max') and pandas' deprecation warning for NumPy callables
    # in agg is avoided.
    agg_rlipp_df = agg_df.groupby(['Term', 'Name']).agg({
        'P_rho': 'mean',
        'P_pval': [bh, bonferroni, 'max', 'prod'],
    }).reset_index()

    # Flatten the two-level header, e.g. ('P_pval', 'BH') -> 'P_pval_BH' and
    # ('Term', '') -> 'Term'.
    agg_rlipp_df.columns = ['_'.join(col).strip('_') for col in agg_rlipp_df.columns.values]
    agg_rlipp_df = agg_rlipp_df.sort_values(by='P_rho_mean', ascending=False)

    # NOTE(review): '%.4f' writes any value below 0.00005 as 0.0000, which
    # affects small p-values and their product — confirm this precision is
    # intended.
    agg_rlipp_df.to_csv(f'../models/rlipp/{drug}_all_pval.txt', sep='\t', float_format='%.4f', index=False)