# Analysis of results

In [None]:
import os
import zipfile
import tempfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# Diverging colour normalisation used by the heatmaps in this notebook:
# values are scaled over [-1, 1] with the colormap midpoint pinned at 0.
normalize = mcolors.TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)

In [None]:
# Input folder of the analysis and the folder that receives all outputs.
pth = os.path.join('preprocessR','brafi')
path = os.path.join(pth, 'out')
# Later cells write CSV/PDF files into `path`; create it up front so a missing
# directory cannot abort the analysis after the results have been loaded.
os.makedirs(path, exist_ok=True)
all_results = []        # one nodes.csv DataFrame per loaded run
all_results_edges = []  # one edges.csv DataFrame per loaded run
ntop = 50               # number of top-ranked rows kept by later cells
max_samples = 50        # stop loading once this many nodes.csv tables are read

with tempfile.TemporaryDirectory() as tmpdir:
    print('Decompressing results to', tmpdir)
    with zipfile.ZipFile(f'{pth}/results.zip', 'r') as zr:
        zr.extractall(tmpdir)
    for subdir, dirs, files in os.walk(tmpdir):
        # Sorting in place fixes the traversal order, so the set of runs kept
        # when the cap is reached does not depend on the file system.
        dirs.sort()
        if subdir == tmpdir:
            # Files sitting directly in the extraction root are not results;
            # only sub-directories are inspected.
            continue
        # The cap is checked once per directory (not once per file) so that a
        # run never contributes its nodes.csv without its edges.csv.
        if len(all_results) >= max_samples:
            break
        if 'nodes.csv' in files:
            # Read the contents of the nodes.csv file into a DataFrame
            df = pd.read_csv(os.path.join(subdir, 'nodes.csv'), index_col=0)
            all_results.append(df)
        if 'edges.csv' in files:
            df_e = pd.read_csv(os.path.join(subdir, 'edges.csv'), index_col=0)
            all_results_edges.append(df_e)
len(all_results)

In [None]:
# Load the long-format score table and reshape it to source x condition.
tf_long = pd.read_csv(f"{pth}/tfs.tsv", sep='\t')
tf_wide = tf_long.pivot(index='source', columns='condition', values='score')
# Replace the condition labels with numeric ones.
# NOTE(review): assumes the pivoted columns come out in exactly this order
# and that there are six of them -- confirm against tfs.tsv.
df_tfs = pd.DataFrame(
    tf_wide.values,
    index=tf_wide.index,
    columns=[0.5, 1, 2, 3, 4, 8],
)
df_tfs.head(5)

In [None]:
# Last ten rows after an ascending sort on the column labelled 2.0.
tfs_by_2 = df_tfs.sort_values(by=2.0)
tfs_by_2.tail(10)

In [None]:
# Distinct values of the `feature` column of data.tsv.
data_long = pd.read_csv(f'{pth}/data.tsv', sep='\t')
measured = data_long['feature'].unique()
measured

In [None]:
# Element-wise mean of the node tables over all loaded runs.
n_runs = len(all_results)
df_avg = all_results[0].copy()  # copy so the first run's table is not modified
idx = df_avg.index

# Add the remaining runs, selecting their rows in the order of the first run.
for run_nodes in all_results[1:]:
    df_avg += run_nodes.loc[idx]
df_avg /= n_runs
df_avg.to_csv(f'{path}/avg.csv')
df_avg

In [None]:
# Mean of the signed edge values and mean of their absolute values over all
# loaded runs.
n_edge_runs = len(all_results_edges)
first_edges = all_results_edges[0]
df_avg_e = first_edges.copy()
df_avg_e_abs = first_edges.abs()  # abs() already returns a new DataFrame
idx = df_avg_e.index
cols = df_avg_e.columns

for run_edges in all_results_edges[1:]:
    # Select rows and columns in the layout of the first run before summing.
    aligned = run_edges.loc[idx, cols]
    df_avg_e += aligned
    df_avg_e_abs += aligned.abs()
df_avg_e /= n_edge_runs
df_avg_e_abs /= n_edge_runs
df_avg_e.to_csv(f'{path}/avg_edges.csv')
df_avg_e_abs.to_csv(f'{path}/avg_edges_abs.csv')
df_avg_e

In [None]:
# Across all networks, the most common interaction with E2F2 is through RB1;
# the same holds for E2F1.
# Show the averaged edge rows whose label ends with 'E2F2'.
ends_with_e2f2 = df_avg_e.index.str.endswith('E2F2')
df_avg_e.loc[ends_with_e2f2]

In [None]:
# Averaged edge rows whose label ends with 'E2F1': the five with the smallest
# value in the 't0.5_t0' column.
ends_with_e2f1 = df_avg_e.index.str.endswith('E2F1')
e2f1_edges = df_avg_e.loc[ends_with_e2f1]
e2f1_edges.sort_values(by='t0.5_t0').head(5)

In [None]:
# Labels of the averaged node table that are neither listed in `measured`
# nor start with an underscore.
underscore_nodes = df_avg.index[df_avg.index.str.startswith('_')]
not_measured = df_avg.index.difference(measured)
idx_only_pred = not_measured.difference(underscore_nodes).tolist()
len(idx_only_pred)

In [None]:
# Keep only rows whose absolute values sum to more than zero, then compute the
# pairwise correlation between the remaining rows (rows become columns via the
# transpose because DataFrame.corr works column-wise).
has_signal = df_avg.abs().sum(axis=1) > 0
df_avg_nonempty = df_avg.loc[has_signal]
df_avg_corr = df_avg_nonempty.transpose().corr()

In [None]:
def plot_corr_gene(gene, only_pred=True, only_measured=False, figsize=(20,8), threshold=0.25):
    """Bar-plot the entries of ``df_avg_corr`` that correlate with ``gene``.

    Reads the module-level ``df_avg_corr`` matrix and, depending on the flags,
    the module-level ``idx_only_pred`` and ``measured`` collections.

    Args:
        gene: Row label of ``df_avg_corr`` whose correlations are shown.
        only_pred: If true, keep only labels contained in ``idx_only_pred``.
        only_measured: If true, keep only labels contained in ``measured``.
            Both filters are applied when both flags are true.
        figsize: Figure size forwarded to ``plt.subplots``.
        threshold: Minimum absolute correlation for a label to be kept.

    Returns:
        Tuple ``(index, fig, ax)``: the labels that passed all filters, sorted
        by decreasing correlation, plus the figure and axes that were created.
        The axes are left empty when no label passes the filters.

    Raises:
        KeyError: If ``gene`` is not a row label of ``df_avg_corr``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    corrs = df_avg_corr.loc[gene].dropna().sort_values(ascending=False)
    corrs = corrs[corrs.abs() >= threshold]
    # Boolean masks keep the descending order established above.
    if only_pred:
        corrs = corrs[corrs.index.isin(idx_only_pred)]
    if only_measured:
        corrs = corrs[corrs.index.isin(measured)]
    # Nothing may survive the filters (e.g. a high threshold); skip the bar
    # plot in that case instead of asking pandas to draw an empty Series.
    if not corrs.empty:
        corrs.plot.bar(ax=ax)
    return corrs.index, fig, ax
    
#plot_corr_gene('SOS1', threshold=0.60, only_pred=False, only_measured=False);

In [None]:
# Clustered heatmap of the averaged values for a hand-picked list of nodes
# (rows are clustered, columns keep their original order), followed by the
# subset of those nodes that also appears in `measured`.
egf_init_nodes = [
    'EGR1', 'EGFR', 'MAPK1', 'MAPK3', 'ELK1', 'MAPK8', 'MAP2K1',
    'MAP2K2', 'AKT1', 'AKT2', 'AKT3', 'E2F1', 'E2F2', 'E2F4',
]
df_egf_init_pathways = df_avg.loc[egf_init_nodes]
sns.clustermap(
    df_egf_init_pathways,
    col_cluster=False,
    cmap=cm.RdBu_r,
    norm=normalize,
)
df_egf_init_pathways.index.intersection(measured)

In [None]:
# Rank the labels in `idx_only_pred` by the standard deviation of their
# averaged values across columns and keep the `ntop` most variable ones.
pred_std = df_avg.loc[idx_only_pred, :].std(axis=1)
idx_top = pred_std.sort_values(ascending=False).head(ntop).index.tolist()

# Copy the selected rows and give the table presentation-friendly labels.
condition_labels = ["t(0.5)-t(0)", "t(1)-t(0.5)", "t(2)-t(1)", "t(3)-t(2)", "t(4)-t(3)", "t(8)-t(4)"]
df_avg_only_pred = df_avg.loc[idx_top, :].copy()
df_avg_only_pred.columns = condition_labels
df_avg_only_pred.index.name = "Average predicted activity for unobserved signaling proteins"
df_avg_only_pred.columns.name = "Conditions"

# Export the top-N table and the full averaged table.
df_avg_only_pred.to_csv(f'{path}/pred_nodes_mean_top{ntop}_nsamples_{len(all_results)}.csv')
df_avg.to_csv(f'{path}/pred_nodes_mean_nsamples_{len(all_results)}.csv')

# Heatmap of the top-N table (rows clustered, columns in original order); the
# figure created by clustermap is the current figure when savefig runs.
sns.clustermap(
    df_avg_only_pred,
    cmap=cm.RdBu_r,
    norm=normalize,
    yticklabels=True,
    col_cluster=False,
)
plt.savefig(f'{path}/heatmap_activity_nodes_only_pred_top{ntop}_averaged_runs.pdf', format='pdf')

In [None]:
# Pairwise correlation between the rows selected in `idx_top`, drawn as a
# clustered heatmap with every row and column label shown, then saved as PDF.
top_corr = df_avg.loc[idx_top].transpose().corr()
sns.clustermap(
    top_corr,
    cmap=cm.RdBu_r,
    norm=normalize,
    yticklabels=True,
    xticklabels=True,
)
plt.savefig(f'{path}/heatmap_correlation_activity_nodes_only_pred_top{ntop}_averaged_runs.pdf', format='pdf')

In [None]:
# Line plot of the averaged values of a few selected nodes across conditions.
fig, ax = plt.subplots()
selected_nodes = ['E2F1', 'E2F2', 'TP53', 'RB1', 'MAPK1', 'MAPK3', 'BRAF']
df_avg_subset = df_avg.loc[selected_nodes].copy()
df_avg_subset.columns = ['t(0.5)-t(0)', 't(1)-t(0.5)', 't(2)-t(1)', 't(3)-t(2)', 't(4)-t(3)', 't(8)-t(4)']
# Transpose so that each node is a column and conditions run along the x axis.
df_avg_subset = df_avg_subset.T

# The first three series use solid lines, the rest dashed, all with star
# markers; zip stops at the shorter sequence, so surplus styles are unused.
line_styles = ['-*'] * 3 + ['--*'] * 5
for node, line_style in zip(df_avg_subset.columns, line_styles):
    df_avg_subset[node].plot(style=line_style, ax=ax)

# Place the legend to the right of the axes and label both axes.
ax.legend(loc='center left', bbox_to_anchor=(0.96, 0.5))
ax.set_xlabel("Conditions (diff. timepoints)")
ax.set_ylabel("Average predicted activity")
plt.savefig(f"{path}/avg_activity_e2f1_e2f2.pdf", format="pdf", bbox_inches='tight')