In [None]:
import time
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import corneto as cnt
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# Diverging colour normalisation shared by the heatmaps below: centred on 0
# and spanning the range [-1, 1].
normalize = mcolors.TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
# Show which CORNETO version this notebook was run with.
cnt.__version__

In [None]:
# Differential time-points: each label is '<later>_<earlier>' for a pair of
# consecutive sampling times.
tps = [
    f'{later}_{earlier}'
    for earlier, later in zip(
        ['t0', 't0.5', 't1', 't2', 't3', 't4'],
        ['t0.5', 't1', 't2', 't3', 't4', 't8'],
    )
]

In [None]:
# Input folder for the dataset analysed in this notebook.
folder = 'preprocessR/brafi/'
# Metadata table read from the working directory (not from `folder`).
df_all_meta = pd.read_csv('experimental_metadata.tsv', delimiter='\t')
# Timestamp-based identifier; later cells use it to name the run's output directory.
runid = f'{int(time.time())}'

In [None]:
# Load the metadata table stored alongside the dataset and preview its first rows.
meta_path = f'{folder}/metadata.tsv'
df_meta = pd.read_csv(meta_path, sep='\t')
df_meta.head(n=10)

In [None]:
# Sanity check: the EGR1 changes obtained from the DE analysis of the time
# points should match those reported in the Gerosa et al. paper.
egr_genes = ['EGR1', 'EGR2', 'EGR3']
df_tvals = pd.read_csv(f'{folder}/t_matrix_de.tsv', sep='\t')
df_tvals = df_tvals.set_index('rowname')
# One line per gene, plotted across the columns of the t-value matrix.
df_tvals.loc[egr_genes].T.plot();

In [None]:
# Import the TF activity estimates derived from the t-values of the time-point
# DE analysis (see the preprocess.R script) and reshape them from long format
# to a source x condition matrix of scores.
tfs_long = pd.read_csv(f'{folder}/tfs.tsv', sep='\t')
df_tfs = tfs_long.pivot(index='source', columns='condition', values='score')
df_tfs0 = df_tfs.copy()
# Clustered heatmap of the pairwise correlation between sources.
tf_corr = df_tfs.T.corr()
sns.clustermap(tf_corr, cmap=cm.RdBu_r, norm=normalize);

In [None]:
# Show the 50 TFs whose scores vary the most (largest standard deviation)
# across time points.
tf_spread = df_tfs.std(axis=1).sort_values()
top_tf_idx = tf_spread.tail(50).index
df_top_tfs = df_tfs.loc[top_tf_idx]
# Colour scale limits taken from the selected TFs, with 0 as the centre.
tf_min = df_top_tfs.min().min()
tf_max = df_top_tfs.max().max()
top_tf_norm = mcolors.TwoSlopeNorm(vmin=tf_min, vcenter=0, vmax=tf_max)
sns.clustermap(df_top_tfs, cmap=cm.RdBu_r, norm=top_tf_norm, yticklabels=True);
plt.savefig(f'{folder}/heatmap_top_50_tfs.pdf', format='pdf');

## Data preparation

In [None]:
# Load the long-format dataset and inspect the distribution of its scores.
dataset_path = f'{folder}/data.tsv'
df_dataset = pd.read_csv(dataset_path, sep='\t')
df_dataset['score'].hist()

In [None]:
# Thresholds used to decide which TF scores count as measurements.
tf_threshold = 2            # minimum absolute score to keep a TF
pval_tf_threshold = 0.05    # scores with a larger p-value are zeroed out

# NOTE: `df_tfs0` is re-bound here to the long-format table read from disk,
# replacing the wide copy created in an earlier cell.
df_tfs0 = pd.read_csv(f'{folder}/tfs.tsv', sep='\t')
# Zero out the scores that are not significant at the chosen p-value threshold.
df_tfs0.loc[df_tfs0.p_value > pval_tf_threshold, 'score'] = 0

# Keep only scores above `tf_threshold` (previously a hard-coded 2 that ignored
# the variable) and build the result in one chain so that no column is assigned
# on a filtered slice (which could raise SettingWithCopyWarning).
df_measurements = (
    df_tfs0
    .loc[df_tfs0.score.abs() > tf_threshold, ['condition', 'source', 'score']]
    .rename(columns={'source': 'feature'})
    .assign(type='measurement')
    .loc[:, ['condition', 'type', 'feature', 'score']]
)
df_measurements

In [None]:
# Replace the dataset contents with the new measurements, keeping only its
# first 6 rows (presumably the non-measurement entries -- TODO confirm).
kept_rows = df_dataset.iloc[:6]
df_dataset = pd.concat([kept_rows, df_measurements])
df_dataset['score'].describe()

In [None]:
# Reshape to a feature x condition matrix of scores; combinations that are
# absent from the long table are filled with 0.
df_wide = (
    df_dataset
    .pivot(index='feature', columns='condition', values='score')
    .fillna(0)
)
df_wide

In [None]:
# Plot the 30 features with the largest standard deviation across conditions.
fig, ax = plt.subplots(figsize=(8, 6))
feature_spread = df_wide.std(axis=1).sort_values(ascending=False)
top_changing_tfs = feature_spread.head(30).index
df_wide.loc[top_changing_tfs].T.plot(ax=ax);
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# Tabular view of the network file that is imported with CORNETO further below.
df_pkn = pd.read_csv('pkn.tsv', delimiter='\t')

In [None]:
# Build, for every differential time point, a mapping
#   feature name -> (kind, score)
# where kind is 'P' for EGFR and 'M' for any other feature whose absolute
# score is at least 1.0. Other features are left out of that time point.
d = dict()
# Same labels as the `tps` list defined near the top of the notebook; repeated
# so that this cell can be re-run on its own.
tps = ['t0.5_t0', 't1_t0.5', 't2_t1', 't3_t2', 't4_t3', 't8_t4']

for t in tps:
    # Keys drop the dots from the label, e.g. 't0.5_t0' -> 'tp_t05_t0'.
    k = 'tp_'+ str(t).replace('.','')
    d_t = dict()
    d[k] = d_t
    # Series.iteritems() was removed in pandas 2.0; .items() is the supported
    # equivalent and yields the same (index, value) pairs.
    for tf_name, score in df_wide.loc[:,t].items():
        if tf_name == 'EGFR':
            # EGFR is always included, whatever its score.
            d_t[tf_name] = ('P', score)
        else:
            if abs(score) >= 1.0:
                d_t[tf_name] = ('M', score)
len(d)

In [None]:
# Union of every feature name that appears in at least one time point of `d`.
measured = set().union(*(per_tp.keys() for per_tp in d.values()))
len(measured)

## Sequential fitting of signaling networks

In [None]:
from corneto import (
    Graph,
    signflow,
    signflow_constraints,
    default_sign_loss,
    hamming_loss,
    create_flow_graph,
)

# Read the tab-separated network file (with a header row) and turn it into a
# CORNETO Graph.
pkn = cnt.import_sif("pkn.tsv", has_header=True, delimiter='\t')
network = Graph.import_network(pkn)

In [None]:
# One single-condition dictionary per time point, in chronological order. Each
# one exposes that time point's data under the same condition name, 'CTP'.
timepoints = [
    {'CTP': d[tp_key]}
    for tp_key in (
        'tp_t05_t0',
        'tp_t1_t05',
        'tp_t2_t1',
        'tp_t3_t2',
        'tp_t4_t3',
        'tp_t8_t4',
    )
]
t0, t1, t2, t3, t4, t5 = timepoints
# conditions_as_timepoints=True tells the method not to create separate
# variables for each node per condition, because the time points are solved
# sequentially.
g = create_flow_graph(network, d, conditions_as_timepoints=True)

In [None]:
%%capture cap --no-stderr

# Settings for the sequential fit. Each value is forwarded unchanged to the
# call named in its comment.
reg_penalty=0.01  # l0_penalty_reaction argument of default_sign_loss
dist_penalty=0.1  # penalty argument of hamming_loss (link to the previous solution)
max_time=650  # max_seconds argument of p.solve
gap=0.02  # MIPGap argument of p.solve
norel=600  # NoRelHeurTime argument of p.solve
use_last_diff = False # keep False: the author flagged the True branch (reusing the last diff as reference) as wrong
# Per-time-point results, appended in the order of `timepoints`.
sol_edges, sol_nodes = [], []
flow_values = []
# Reference solution from the previous time point; None until the first solve.
prev_sol = None

# This is an example of how to use a custom optimization loop.
# The main problem, based on network flow and signal propagation, is created with signflow_constraints.
# This creates the main set of constraints that define the problem, without any objective function to optimize.
base_problem = signflow_constraints(g)


# Record the configuration in the captured cell output.
print(folder)
print("Reg:", reg_penalty, "Dist:", dist_penalty, "Gap:", gap, "NoRelHeurTime:", norel, "MaxTime:", max_time)

for t in timepoints:
    # We add the objective function to the base problem
    p = base_problem + default_sign_loss(g, t, base_problem, l0_penalty_reaction=reg_penalty)
    # Sum of the activation and inhibition reaction symbols for condition 'CTP'
    # (presumably a per-reaction "is used" indicator -- TODO confirm).
    curr_sol = p.symbols['reaction_sends_activation_CTP'] + p.symbols['reaction_sends_inhibition_CTP']
    # If we have the solution for t-1, we use the solution to add a penalty on hamming distance
    if prev_sol is not None:
        p += hamming_loss(prev_sol, curr_sol, penalty=dist_penalty)
    # NOTE(review): the solver status is not checked after this call; the
    # .value reads below assume a solution was found.
    p.solve(solver='GUROBI', verbosity=1, max_seconds=max_time, MIPGap=gap, NoRelHeurTime=norel);
    if prev_sol is not None:
        # Element-wise absolute difference against the reference solution.
        diff = abs(curr_sol.value - prev_sol)
        print("hamming distance:", sum(diff))
    for o in p.objectives:
        print("obj:", o.value)
    # Choose the reference for the next time point. `diff` only exists when
    # prev_sol was set, which the condition below guarantees.
    if use_last_diff and prev_sol is not None:
        prev_sol = np.array(diff)
    else:
        prev_sol = np.abs(np.array(curr_sol.value))
    # Store solution (edge values and node values)
    # Signed values: activation minus inhibition, for reactions and for species.
    edge_vals = p.symbols['reaction_sends_activation_CTP'].value - p.symbols['reaction_sends_inhibition_CTP'].value
    node_vals = p.symbols['species_activated_CTP'].value - p.symbols['species_inhibited_CTP'].value
    flow_values.append(pd.DataFrame(p.symbols['_flow_rxn_ipos'].value, index=g.reactions))
    sol_edges.append(pd.DataFrame(edge_vals, index=g.reactions))
    sol_nodes.append(pd.DataFrame(node_vals, index=g.species))

In [None]:
# Create this run's output directory (if needed) and save the text captured
# from the optimisation cell to it.
os.makedirs(f'{folder}/runs/{runid}', exist_ok=True)
captured_text = str(cap)
with open(f'{folder}/runs/{runid}/output.txt', 'w') as log_file:
    log_file.write(captured_text)

In [None]:
# Combine the per-time-point edge solutions side by side: one column per time point.
df_edges = pd.concat(sol_edges, axis='columns')

In [None]:
# Number of most-variable edges/nodes shown in the tables and heatmaps below;
# also embedded in the names of the saved heatmap files.
ntop = 100

In [None]:
# Edges with the largest standard deviation across time points, most variable first.
edge_spread = df_edges.std(axis=1).sort_values(ascending=False)
df_edges.loc[edge_spread.head(ntop).index]

In [None]:
# Combine the per-time-point node solutions side by side and keep the `ntop`
# nodes with the largest standard deviation across time points.
df_nodes = pd.concat(sol_nodes, axis=1)
node_spread = df_nodes.std(axis=1).sort_values(ascending=False)
df_top_nodes = df_nodes.loc[node_spread.head(ntop).index]
# Rebuild the frame so that its columns carry the time-point labels.
top_nodes_labelled = pd.DataFrame(df_top_nodes.values, columns=tps, index=df_top_nodes.index)
sns.clustermap(top_nodes_labelled, cmap=cm.RdBu_r, norm=normalize)
plt.savefig(f'{folder}/runs/{runid}/heatmap_activity_nodes_top{ntop}.pdf', format='pdf')

In [None]:
# Export the full edge and node solutions, with the time-point labels as
# column names, to this run's output directory.
edges_out = pd.DataFrame(df_edges.values, index=df_edges.index, columns=tps)
nodes_out = pd.DataFrame(df_nodes.values, index=df_nodes.index, columns=tps)
edges_out.to_csv(f'{folder}/runs/{runid}/edges.csv')
nodes_out.to_csv(f'{folder}/runs/{runid}/nodes.csv')

In [None]:
# Keep only the nodes that are neither measured nor filtered out by name:
# names starting with '_' and names longer than 20 characters are dropped
# (presumably auxiliary or complex nodes -- TODO confirm).
node_names = df_nodes.index
underscore_nodes = set(node_names[node_names.str.startswith('_')].tolist())
long_name_nodes = set(node_names[node_names.str.len()>20].tolist())
exclude_nodes = underscore_nodes | long_name_nodes | measured
df_nodes_pred = df_nodes.loc[node_names.difference(exclude_nodes)]
nodes_pred_out = pd.DataFrame(df_nodes_pred.values, index=df_nodes_pred.index, columns=tps)
nodes_pred_out.to_csv(f'{folder}/runs/{runid}/nodes_pred.csv')
# Among the remaining nodes, select the `ntop` with the largest standard
# deviation and label the columns with the time points.
pred_spread = df_nodes_pred.std(axis=1).sort_values(ascending=False)
df_top_nodes = df_nodes_pred.loc[pred_spread.head(ntop).index]
df_top_nodes = pd.DataFrame(df_top_nodes.values, index=df_top_nodes.index, columns=tps)

In [None]:
# Clustered heatmap of the selected nodes (measured and name-filtered nodes
# excluded), saved to this run's output directory.
sns.clustermap(df_top_nodes, norm=normalize, cmap=cm.RdBu_r)
plt.savefig(f'{folder}/runs/{runid}/heatmap_activity_nodes_pred_top{ntop}.pdf', format='pdf')