### Imports

In [1]:
# Imports

# standard stuff
import pandas as pd
import numpy as np
from collections import Counter
import os
# Binn stuff
from binn import Network
from binn import BINN
from binn import BINNExplainer
# from the github of the binn
from util_for_examples import fit_data_matrix_to_network_input, generate_data
# our custom functions for training and testing
from cust_functions import training as trg
# torch stuff
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
# Sklearn stuff
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
# Fixed seed for the whole notebook, handed to the project helper `trg.set_seed`.
# NOTE(review): presumably this seeds the RNGs used during training/explaining
# (numpy / torch / random) - confirm in cust_functions.training.
SEED = 42
trg.set_seed(SEED)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


### Data

In [3]:
# COVID cohort: protein tables for the train / test split
# (the first CSV column is used as the row index)
covid_train = pd.read_csv('covid_data/covid_train_qm.csv', index_col=0)
covid_test = pd.read_csv('covid_data/covid_test_qm.csv', index_col=0)

# Matching design matrices; the "group" label is shifted up by one right after loading.
# NOTE(review): presumably this aligns the labels with the coding expected by
# generate_data - confirm against the AKI design matrix.
covid_train_design = (
    pd.read_csv('covid_data/covid_train_design_qm.csv', index_col=0)
    .assign(group=lambda design: design["group"] + 1)
)
covid_test_design = (
    pd.read_csv('covid_data/covid_test_design_qm.csv', index_col=0)
    .assign(group=lambda design: design["group"] + 1)
)

# Protein -> pathway translation and pathway hierarchy, read from the AKI data folder
translation = pd.read_csv("aki_data/translation.tsv", sep="\t", index_col=0)
pathways = pd.read_csv("aki_data/pathways.tsv", sep="\t")

In [4]:
#### Import aki data

# Read the raw files; the data has to be transformed (split) before it can be used
input_data = pd.read_csv("aki_data/test_data.tsv", sep="\t")
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
translation = pd.read_csv("aki_data/translation.tsv", sep="\t", index_col=0)
pathways = pd.read_csv("aki_data/pathways.tsv", sep="\t")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# Train / test split according to the paper: samples are told apart by their name prefix
tm_p19_cols = [name for name in input_data_qm.columns if name.startswith("TM_P19")]
tm_m2012_cols = [name for name in input_data_qm.columns if name.startswith("TM_M2012")]

# The first column of the table is carried over to both splits as their leading column
first_column = input_data_qm.iloc[:, 0]

test_input_data = input_data_qm[tm_p19_cols]
test_input_data.insert(0, first_column.name, first_column)

train_input_data = input_data_qm[tm_m2012_cols]
train_input_data.insert(0, first_column.name, first_column)

# Split the design matrix with boolean masks built from the same sample prefixes
tm_m2012 = design_matrix['sample'].str.startswith("TM_M2012")
tm_p19 = design_matrix['sample'].str.startswith("TM_P19")
train_design_matrix = design_matrix.loc[tm_m2012]
test_design_matrix = design_matrix.loc[tm_p19]

### AKI

In [5]:
# Interpret the cross-validation models (one checkpoint per fold) with the BINN explainer.
# The network/BINN built here must have the same parameter names and shapes as the
# saved checkpoints, because load_state_dict is strict by default.
network = Network(
    input_data=train_input_data,
    pathways=pathways,
    mapping=translation,
    input_data_column="Protein",
    source_column="child",
    target_column="parent",
    subset_pathways=True,
)
binn = BINN(
    network=network,
    n_layers=4,
    dropout=0.2,
    validate=False,
    residual=False,
    device="cpu",
    learning_rate=0.001,
)

protein_matrix = fit_data_matrix_to_network_input(train_input_data, features=network.inputs)
X, y = generate_data(protein_matrix, design_matrix=train_design_matrix)

num_folds = 5
model_save_dir = "trained_models/AKI_BINN"
explanations = []

# The same training matrix serves as the data to explain and as the background set
test_data = torch.tensor(X, dtype=torch.float32)
background_data = torch.tensor(X, dtype=torch.float32)

for fold in range(1, num_folds + 1):
    model_path = f"{model_save_dir}/model_fold_{fold}.pth"
    # One BINN instance is reused; each fold's weights overwrite the previous ones
    model = binn
    # map_location="cpu" matches the device chosen above and keeps checkpoints that
    # were written from a GPU run loadable on a CPU-only machine
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()

    explainer = BINNExplainer(model)
    importance_df = explainer.explain(test_data, background_data)
    explanations.append(importance_df)



BINN is on the device: cpu
dasdf


In [6]:
### Done in a similar way as in the binn documentation example, so the data has the same format

# One "value{i}" column per explained model; the count follows `explanations`
# instead of being fixed to five.
value_cols = [f"value{i}" for i in range(len(explanations))]

# Values of every model except the last, aligned on the row index in a single step
concatenated_values = pd.DataFrame(
    {value_cols[i]: fold_df["value"] for i, fold_df in enumerate(explanations[:-1])}
)

# Copy the last explanation so `explanations` stays untouched and this cell can be
# re-run without failing on columns that were already inserted.
df = explanations[-1].copy()

# Place the earlier models' columns directly in front of the last model's "value"
for column in concatenated_values.columns:
    df.insert(loc=df.columns.get_loc("value"), column=column, value=concatenated_values[column])

# The last model's own column gets the highest index
df = df.rename(columns={"value": value_cols[-1]})

df["value_mean"] = df[value_cols].mean(axis=1)
df["values_std"] = df[value_cols].std(axis=1)

# "value" now holds the mean importance across models
df["value"] = df["value_mean"]

# Drop rows where source == target, i.e. the pathway/protein explains itself
df["copy"] = df["source name"] == df["target name"]
df = df[~df["copy"]]

In [None]:
"""df_sorted = df.sort_values(by='value', ascending=False)
df_grouped = importance_df_copy = df.groupby(["source name", "source layer", "target layer"], as_index=False).mean(numeric_only=True)
df_grouped.sort_values(by='value', ascending=False).head(20)"""

In [13]:
# Builds plot_df with "mean" = mean normalised rank per source (lower is better).
# Adapted from the binn documentation.
importance_df_copy = df.groupby(["source name", "source layer", "target layer"], as_index=False).mean(numeric_only=True)

# Per-model importance columns ("value0", "value1", ...) are read off the frame in
# numeric order, so the ranking follows however many models were merged into `df`
# instead of assuming exactly five.
value_prefix = "value"
fold_value_cols = sorted(
    (c for c in importance_df_copy.columns if c.startswith(value_prefix) and c[len(value_prefix):].isdigit()),
    key=lambda c: int(c[len(value_prefix):]),
)
if not fold_value_cols:
    raise ValueError("no per-model 'value<i>' columns found in df")

mean_ranks = []
std_ranks = []
source_layer = []
sources = []
for layer in range(binn.n_layers):
    layer_df = importance_df_copy[importance_df_copy["source layer"] == layer].copy()
    n_sources = len(layer_df.index)
    rank_cols = []
    for value_col in fold_value_cols:
        # Rank 0 = largest importance for this model within the layer
        rank_col = f"rank_{value_col[len(value_prefix):]}"
        layer_df = layer_df.sort_values(value_col, ascending=False)
        layer_df[rank_col] = range(n_sources)
        rank_cols.append(rank_col)
    # Ranks are divided by the layer size so layers of different width are comparable
    mean_ranks += (layer_df[rank_cols].mean(axis=1) / n_sources).tolist()
    sources += layer_df["source name"].tolist()
    std_ranks += (layer_df[rank_cols].std(axis=1) / n_sources).tolist()
    source_layer += layer_df["source layer"].tolist()
plot_df = pd.DataFrame({"mean": mean_ranks, "std": std_ranks, "source layer": source_layer, "source": sources})
plot_df.sort_values("mean", ascending=True).head(20)

Unnamed: 0,mean,std,source layer,source
449,0.001303,0.001784,1,R-HSA-975634
2,0.001782,0.001863,0,P04114
4,0.003563,0.004046,0,P04908
756,0.005405,0.005856,2,R-HSA-975634
1,0.006236,0.00366,0,P02647
0,0.008018,0.010751,0,Q96A08
8,0.018263,0.008393,0,P68871
1020,0.025862,0.010558,3,R-HSA-2262752
451,0.02671,0.014086,1,R-HSA-446388
1017,0.027586,0.033052,3,R-HSA-168249


In [14]:
# Proteins only: discard every source whose identifier contains 'R-HSA'
filtered_df = plot_df.loc[~plot_df['source'].str.contains('R-HSA')]
# Keep the 30 best-ranked entries (smallest mean rank first)
top_30_df = filtered_df.sort_values('mean').iloc[:30]
# Write the table to disk, creating the output folder if needed.
# NOTE(review): this reuses the name `model_save_dir`, which the model-loading
# cells also use for the checkpoint directory.
model_save_dir = "BINN_Explanations"
os.makedirs(model_save_dir, exist_ok=True)
csv_file_path = os.path.join(model_save_dir, 'top_30_proteins_binn_aki.csv')
top_30_df.to_csv(csv_file_path, index=False)

In [19]:
# Pathways only: keep the sources whose identifier contains 'R-HSA'
filtered_df = plot_df.loc[plot_df['source'].str.contains('R-HSA')]
# Keep the 30 best-ranked entries (smallest mean rank first)
top_30_df = filtered_df.sort_values('mean').iloc[:30]
# Write the table to disk, creating the output folder if needed.
# NOTE(review): this reuses the name `model_save_dir`, which the model-loading
# cells also use for the checkpoint directory.
model_save_dir = "BINN_Explanations"
os.makedirs(model_save_dir, exist_ok=True)
csv_file_path = os.path.join(model_save_dir, 'top_30_pathways_binn_aki.csv')
top_30_df.to_csv(csv_file_path, index=False)

### COVID

In [22]:
# Interpret the COVID cross-validation models (one checkpoint per fold) with the BINN explainer.
# The network/BINN built here must have the same parameter names and shapes as the
# saved checkpoints, because load_state_dict is strict by default.
network = Network(
    input_data=covid_train,
    pathways=pathways,
    mapping=translation,
    input_data_column="Protein",
    source_column="child",
    target_column="parent",
    subset_pathways=True,
)
binn = BINN(
    network=network,
    n_layers=4,
    dropout=0.2,
    validate=False,
    residual=False,
    device="cpu",
    learning_rate=0.001,
)

protein_matrix = fit_data_matrix_to_network_input(covid_train, features=network.inputs)
X, y = generate_data(protein_matrix, design_matrix=covid_train_design)

num_folds = 5
model_save_dir = "trained_models/Covid_BINN"
explanations = []

# The same training matrix serves as the data to explain and as the background set
test_data = torch.tensor(X, dtype=torch.float32)
background_data = torch.tensor(X, dtype=torch.float32)

for fold in range(1, num_folds + 1):
    model_path = f"{model_save_dir}/model_fold_{fold}.pth"
    # One BINN instance is reused; each fold's weights overwrite the previous ones
    model = binn
    # map_location="cpu" matches the device chosen above and keeps checkpoints that
    # were written from a GPU run loadable on a CPU-only machine
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()

    explainer = BINNExplainer(model)
    importance_df = explainer.explain(test_data, background_data)
    explanations.append(importance_df)


BINN is on the device: cpu
dasdf


In [23]:
# Merge the per-model explanation frames into one frame

# One "value{i}" column per explained model; the count follows `explanations`
# instead of being fixed to five.
value_cols = [f"value{i}" for i in range(len(explanations))]

# Values of every model except the last, aligned on the row index in a single step
concatenated_values = pd.DataFrame(
    {value_cols[i]: fold_df["value"] for i, fold_df in enumerate(explanations[:-1])}
)

# Copy the last explanation so `explanations` stays untouched and this cell can be
# re-run without failing on columns that were already inserted.
df = explanations[-1].copy()

# Place the earlier models' columns directly in front of the last model's "value"
for column in concatenated_values.columns:
    df.insert(loc=df.columns.get_loc("value"), column=column, value=concatenated_values[column])

# The last model's own column gets the highest index
df = df.rename(columns={"value": value_cols[-1]})

df["value_mean"] = df[value_cols].mean(axis=1)
df["values_std"] = df[value_cols].std(axis=1)

# "value" now holds the mean importance across models
df["value"] = df["value_mean"]

# Drop rows where source == target, i.e. the pathway/protein explains itself
df["copy"] = df["source name"] == df["target name"]
df = df[~df["copy"]]


In [24]:
# Builds plot_df with "mean" = mean normalised rank per source (lower is better).
importance_df_copy = df.groupby(["source name", "source layer", "target layer"], as_index=False).mean(numeric_only=True)

# Per-model importance columns ("value0", "value1", ...) are read off the frame in
# numeric order, so the ranking follows however many models were merged into `df`
# instead of assuming exactly five.
value_prefix = "value"
fold_value_cols = sorted(
    (c for c in importance_df_copy.columns if c.startswith(value_prefix) and c[len(value_prefix):].isdigit()),
    key=lambda c: int(c[len(value_prefix):]),
)
if not fold_value_cols:
    raise ValueError("no per-model 'value<i>' columns found in df")

mean_ranks = []
std_ranks = []
source_layer = []
sources = []
for layer in range(binn.n_layers):
    layer_df = importance_df_copy[importance_df_copy["source layer"] == layer].copy()
    n_sources = len(layer_df.index)
    rank_cols = []
    for value_col in fold_value_cols:
        # Rank 0 = largest importance for this model within the layer
        rank_col = f"rank_{value_col[len(value_prefix):]}"
        layer_df = layer_df.sort_values(value_col, ascending=False)
        layer_df[rank_col] = range(n_sources)
        rank_cols.append(rank_col)
    # Ranks are divided by the layer size so layers of different width are comparable
    mean_ranks += (layer_df[rank_cols].mean(axis=1) / n_sources).tolist()
    sources += layer_df["source name"].tolist()
    std_ranks += (layer_df[rank_cols].std(axis=1) / n_sources).tolist()
    source_layer += layer_df["source layer"].tolist()
plot_df = pd.DataFrame({"mean": mean_ranks, "std": std_ranks, "source layer": source_layer, "source": sources})


In [25]:
# Show the 20 sources with the smallest mean rank
plot_df.sort_values(by="mean", ascending=True).head(20)

Unnamed: 0,mean,std,source layer,source
293,0.0,0.0,3,R-HSA-168249
0,0.0096,0.010431,0,P02671
2,0.0128,0.009121,0,P02675
1,0.0176,0.022199,0,P06396
3,0.0288,0.010733,0,P02649
130,0.042697,0.04452,1,R-HSA-114608
216,0.04557,0.039627,2,R-HSA-166658
226,0.053165,0.063544,2,R-HSA-212436
4,0.0608,0.021615,0,P05109
303,0.073333,0.060782,3,R-HSA-2173782


In [29]:
# Proteins only: discard every source whose identifier contains 'R-HSA'
filtered_df = plot_df.loc[~plot_df['source'].str.contains('R-HSA')]
# Keep the 30 best-ranked entries (smallest mean rank first)
top_30_df = filtered_df.sort_values('mean').iloc[:30]
# Write the table to disk, creating the output folder if needed.
# NOTE(review): this reuses the name `model_save_dir`, which the model-loading
# cells also use for the checkpoint directory.
model_save_dir = "BINN_Explanations"
os.makedirs(model_save_dir, exist_ok=True)
csv_file_path = os.path.join(model_save_dir, 'top_30_proteins_binn_covid.csv')
top_30_df.to_csv(csv_file_path, index=False)
top_30_df

Unnamed: 0,mean,std,source layer,source
0,0.0096,0.010431,0,P02671
2,0.0128,0.009121,0,P02675
1,0.0176,0.022199,0,P06396
3,0.0288,0.010733,0,P02649
4,0.0608,0.021615,0,P05109
15,0.0736,0.030146,0,P25311
8,0.0736,0.049767,0,P07996
7,0.0752,0.040239,0,P00740
6,0.088,0.039192,0,P23083
24,0.104,0.061968,0,P07998


In [30]:
# Pathways only: keep the sources whose identifier contains 'R-HSA'
filtered_df = plot_df.loc[plot_df['source'].str.contains('R-HSA')]
# Keep the 30 best-ranked entries (smallest mean rank first)
top_30_df = filtered_df.sort_values('mean').iloc[:30]
# Write the table to disk, creating the output folder if needed.
# NOTE(review): this reuses the name `model_save_dir`, which the model-loading
# cells also use for the checkpoint directory.
model_save_dir = "BINN_Explanations"
os.makedirs(model_save_dir, exist_ok=True)
csv_file_path = os.path.join(model_save_dir, 'top_30_pathways_binn_covid.csv')
top_30_df.to_csv(csv_file_path, index=False)
top_30_df

Unnamed: 0,mean,std,source layer,source
293,0.0,0.0,3,R-HSA-168249
130,0.042697,0.04452,1,R-HSA-114608
216,0.04557,0.039627,2,R-HSA-166658
226,0.053165,0.063544,2,R-HSA-212436
303,0.073333,0.060782,3,R-HSA-2173782
154,0.074157,0.141144,1,R-HSA-8878171
237,0.075949,0.12042,2,R-HSA-2168880
305,0.09,0.064118,3,R-HSA-5223345
223,0.106329,0.048697,2,R-HSA-6798695
220,0.106329,0.026249,2,R-HSA-9651496
