In [74]:
# Imports
import pandas as pd

from binn import Network
from binn import BINN
from binn import BINNClassifier
from binn import BINNExplainer
from binn import ImportanceNetwork

# utils is  a file from the BINN github repository
from util_for_examples import fit_data_matrix_to_network_input, generate_data
import torch
from lightning.pytorch import Trainer

import torch.nn.functional as F
import seaborn as sns

from lightning.pytorch import Trainer

import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


In [211]:
#### Import all the data and test train split

# Input Data
input_data = pd.read_csv("aki_data/test_data.tsv", sep="\t", )
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
translation = pd.read_csv("aki_data/translation.tsv", sep="\t", index_col=0)
pathways = pd.read_csv("aki_data/pathways.tsv", sep="\t")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# split into test and train
tm_p19_cols = [col for col in input_data_qm.columns if col.startswith("TM_P19")]
tm_m2012_cols = [col for col in input_data_qm.columns if col.startswith("TM_M2012")]

# Create DataFrames based on these columns
test_input_data = input_data_qm[tm_p19_cols]
train_input_data = input_data_qm[tm_m2012_cols]
first_column = input_data_qm.iloc[:, 0]
train_input_data.insert(0, first_column.name, first_column)
test_input_data.insert(0, first_column.name, first_column)


#design_matrix
mask_tm_m2012 = design_matrix['sample'].str.startswith("TM_M2012")
mask_tm_p19 = design_matrix['sample'].str.startswith("TM_P19")
# Use the mask to create the separate DataFrames
test_design_matrix = design_matrix[mask_tm_p19]
train_design_matrix = design_matrix[mask_tm_m2012]

# check if design matrix has the right amount of subphenotype 1&2 according to paper
train_design_matrix
group_counts = train_design_matrix['group'].value_counts()
count_group_1 = group_counts.get(1, 0)
count_group_2 = group_counts.get(2, 0)
print(count_group_1,count_group_2)
design_matrix

60 81


Unnamed: 0,sample,group
0,TM_P1911_190,2
1,TM_P1911_191,2
2,TM_P1911_192,2
3,TM_P1911_193,2
4,TM_P1911_194,2
...,...,...
192,TM_M2012_198,2
193,TM_M2012_199,2
194,TM_M2012_200,2
195,TM_M2012_202,2


In [235]:
# Permute the translasions and the pathways

shuffled_translation = np.random.permutation(translation['translation'].values)
translation["translation"] = shuffled_translation


# Permute pathways (doesnt always work) or import the pathway_copy.csc which contains the shuffled child column
shuffled_pathways = np.random.permutation(pathways['child'].values)
pathways["child"] = shuffled_pathways

In [271]:
# all proteins to one pathway
translation["translation"] = "R-HSA-8941858"
translation
# pathways
data = {
    'parent': ['R-HSA-8941858','R-HSA-8941858','R-HSA-8941858','R-HSA-8941858','R-HSA-8941858','R-HSA-3108214','R-HSA-3108214'],#'R-HSA-8941858', 'R-HSA-9672393', 'R-HSA-1299361', 'R-HSA-2160916'],
    'child': ['R-HSA-3108214','R-HSA-3108215','R-HSA-3108216','R-HSA-3108217','R-HSA-3108218','R-HSA-3108215','R-HSA-3108216']#, 'R-HSA-9672393', 'R-HSA-1299361', 'R-HSA-2160916', 'R-HSA-9682706']
}

pathways_simple = pd.DataFrame(data)

In [272]:
accuracies = []
counter = 0
for run in range(5):
    
    counter +=1 
    network = Network(
        input_data    =   input_data_qm, # use the preprocessed data
        pathways      =   pathways_simple,
        mapping       =   translation,
        input_data_column = "Protein", # This is the default value
        source_column = "parent", # defined by our pathways-file
        target_column = "child",
        subset_pathways = True # This is the default value
    )

    binn = BINN(
        network=network,
        n_layers=4,
        dropout=0.2,
        validate=False,
        residual=False,
        device="cpu",
        learning_rate=0.001
        # defaults
        # activation = "tanh"
        # weight = torch.tensor([1, 1])
        # scheduler = "plateau"
        # optimizer = "adam"
        # n_outputs = 2
    )

    ### Training with all train data no CV

    protein_matrix = fit_data_matrix_to_network_input(train_input_data, features=network.inputs)
    X, y = generate_data(protein_matrix, design_matrix=train_design_matrix)

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(X, dtype=torch.float32, device=binn.device),
        torch.tensor(y, dtype=torch.torch.long, device=binn.device),
    )
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)


    # You can also train with a standard PyTorch train loop 

    optimizer = binn.configure_optimizers()[0][0]

    num_epochs = 50

    for epoch in range(num_epochs):
        binn.train() 
        total_loss = 0.0
        total_accuracy = 0

        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.to(binn.device)
            targets = targets.to(binn.device).type(torch.LongTensor)
            optimizer.zero_grad()
            outputs = binn(inputs).to(binn.device)
            loss = F.cross_entropy(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_accuracy += torch.sum(torch.argmax(outputs, axis=1) == targets) / len(targets)

        avg_loss = total_loss / len(dataloader)
        avg_accuracy = total_accuracy / len(dataloader)
        print(f'Epoch {epoch}, Average Accuracy {avg_accuracy}, Average Loss: {avg_loss}')


    # Using the test manually

    # and test_loader is your DataLoader for the test data
    protein_matrix_test = fit_data_matrix_to_network_input(test_input_data, features=network.inputs)
    # Assuming generate_data() returns the entire dataset
    X_test, y_test = generate_data(protein_matrix_test, design_matrix=test_design_matrix)

    # Convert to PyTorch tensors
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)  # tryout long instead of int16

    # Create a DataLoader for your test data
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=8)  # Adjust batch size as needed
    # Set the model to evaluation mode
    binn.eval()
    
    # Initialize variables to track accuracy
    correct = 0
    total = 0

    # No need to track gradients for evaluation
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Move data to the same device as your model
            inputs = inputs.to(binn.device)
            labels = labels.to(binn.device)

            # Forward pass
            outputs = binn(inputs)

            # Get predictions from the maximum value
            _, predicted = torch.max(outputs.data, 1)

            # Total number of labels
            total += labels.size(0)

            # Total correct predictions
            correct += (predicted == labels).sum().item()

    # Calculate the accuracy
    accuracy = 100 * correct / total
    accuracies.append(accuracy)
    print(f'Accuracy of the model on the test data: {accuracy:.2f}%')




BINN is on the device: cpu
dasdf
Epoch 0, Average Accuracy 0.5541666746139526, Average Loss: 0.705829938252767
Epoch 1, Average Accuracy 0.5222222208976746, Average Loss: 0.73586439092954
Epoch 2, Average Accuracy 0.6486111283302307, Average Loss: 0.6653531226846907
Epoch 3, Average Accuracy 0.5694444179534912, Average Loss: 0.6711628337701162
Epoch 4, Average Accuracy 0.5569444298744202, Average Loss: 0.6845298045211368
Epoch 5, Average Accuracy 0.49861112236976624, Average Loss: 0.7170505854818556
Epoch 6, Average Accuracy 0.6166666746139526, Average Loss: 0.6582944558726417
Epoch 7, Average Accuracy 0.6347222328186035, Average Loss: 0.6145958238177829
Epoch 8, Average Accuracy 0.5736110806465149, Average Loss: 0.6413041899601618
Epoch 9, Average Accuracy 0.6819444298744202, Average Loss: 0.5791554699341456
Epoch 10, Average Accuracy 0.6166666746139526, Average Loss: 0.6517116145955192
Epoch 11, Average Accuracy 0.7166666388511658, Average Loss: 0.586608683069547
Epoch 12, Average A

In [274]:
# 30 samples on 50 epochs
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(mean_accuracy,std_accuracy)

81.22448979591836 5.9709138115623634


#### Test on synthetic data

In [1]:
import synthetic_data as sync_data
from sklearn.model_selection import train_test_split

# Imports
import pandas as pd

from binn import Network
from binn import BINN
from binn import BINNClassifier
from binn import BINNExplainer
from binn import ImportanceNetwork

# utils is  a file from the BINN github repository
from util_for_examples import fit_data_matrix_to_network_input, generate_data
import torch
from lightning.pytorch import Trainer

import torch.nn.functional as F
import seaborn as sns

from lightning.pytorch import Trainer

import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
qm_matrix_synthetic,design_matrix_synthetic,translations_synthetic,parent_child_df,key_genes,key_pathways = sync_data.create_synthetic_data()

total_columns = len(qm_matrix_synthetic.columns) - 1
num_test_columns = int(total_columns * 0.2)

column_indices = list(range(1, len(qm_matrix_synthetic.columns)))

# Randomly select column indices for the test set
np.random.seed(42)  # For reproducibility
test_column_indices = np.random.choice(column_indices, size=num_test_columns, replace=False)

# Initialize train and test DataFrames
train_input_data = qm_matrix_synthetic.copy()
test_input_data = qm_matrix_synthetic.iloc[:, test_column_indices]

# Drop the test columns from the train DataFrame
train_input_data.drop(qm_matrix_synthetic.columns[test_column_indices], axis=1, inplace=True)
first_column = train_input_data.iloc[:, 0]
test_input_data.insert(0, 'Protein', first_column)

train_patient_names = train_input_data.columns[1:]  # Exclude the first column 'Genes'
test_patient_names = test_input_data.columns[1:]   # Exclude the first column 'Genes'

# Subset design_matrix_synthetic for train and test
train_design_matrix = design_matrix_synthetic[design_matrix_synthetic['sample'].isin(train_patient_names)]
test_design_matrix = design_matrix_synthetic[design_matrix_synthetic['sample'].isin(test_patient_names)]

0.9000392397938404
-1
1.0144569349470727
111
0.9823503670950573
-1
-0.4151301923577571
-1
1.5318529604652769
111
-1.0584952847804987
111
-0.6756569446117376
-1
-7.767671961112741
111
-5.22431228045784
111
0.5469597735236577
-1
-6.793826514714235
111
1.6266329943757771
111
0.35513989659731543
-1
0.27298750150784457
-1
0.8137070201567544
-1
0.9023657786163095
-1
-4.26535229620022
111
1.5643468194856944
111
-0.0052686709239992404
-1
-2.9362751133206575
111
-7.808176251905342
111
-0.7670432522226862
-1
-0.9530603095973521
-1
0.9314869251920663
-1
0.3173277700674413
-1
-2.1654752871568537
111
0.9069854581080896
-1
27.347042141486344
111
1.3869582983922628
111
1.4735364728780969
111
1.4551671387637228
111
1.1270711953099717
111
0.9118368441458862
-1
1.0124724589593672
111
0.7429625029070933
-1
1.6278538756616105
111
-4.885465687096948
111
0.9031996912351726
-1
0.8930880716641261
-1
-6.34596240344116
111
1.4743752549772433
111
-4.920845978426838
111
1.4280326445514298
111
0.8897587777234441
-

In [3]:
test_design_matrix.value_counts("group")

group
2    62
1    38
Name: count, dtype: int64

In [4]:
# the names o fhte data frames: qm_matrix_synthetic, design_matrix_synthetic, translations_synthetic, parent_child_df 



# Train a binn model on the normal data

network = Network(
        input_data    =   train_input_data,#qm_matrix_synthetic, # use the preprocessed data
        pathways      =   parent_child_df,
        mapping       =   translations_synthetic,
        input_data_column = "Protein", # This is the default value
        source_column = "child", # defined by our pathways-file
        target_column = "parent",
        subset_pathways = True # This is the default value
    )

binn = BINN(
    network=network,
    n_layers=4,
    dropout=0.2,
    validate=False,
    residual=False,
    device="cpu",
    learning_rate=0.001
    # defaults
    # activation = "tanh"
    # weight = torch.tensor([1, 1])
    # scheduler = "plateau"
    # optimizer = "adam"
    # n_outputs = 2
)

### Training with all train data no CV

protein_matrix = fit_data_matrix_to_network_input(train_input_data, features=network.inputs)
X, y = generate_data(protein_matrix, design_matrix=train_design_matrix)

dataset = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.float32, device=binn.device),
    torch.tensor(y, dtype=torch.torch.long, device=binn.device),
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)


# You can also train with a standard PyTorch train loop 

optimizer = binn.configure_optimizers()[0][0]

num_epochs = 50
# Training loop
for epoch in range(num_epochs):
    binn.train() 
    total_loss = 0.0
    total_accuracy = 0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(binn.device), targets.to(binn.device)
        optimizer.zero_grad()
        outputs = binn(inputs)
        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_accuracy += (torch.argmax(outputs, axis=1) == targets).sum().item() / len(targets)

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

"""or epoch in range(num_epochs):
    binn.train() 
    total_loss = 0.0
    total_accuracy = 0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(binn.device)
        targets = targets.to(binn.device).type(torch.LongTensor)
        optimizer.zero_grad()
        outputs = binn(inputs).to(binn.device)
        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_accuracy += torch.sum(torch.argmax(outputs, axis=1) == targets) / len(targets)

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)"""


BINN is on the device: cpu
dasdf
Epoch 1/50, Loss: 0.6874, Accuracy: 0.6350
Epoch 2/50, Loss: 0.5136, Accuracy: 0.7500
Epoch 3/50, Loss: 0.4689, Accuracy: 0.7625
Epoch 4/50, Loss: 0.4644, Accuracy: 0.7975
Epoch 5/50, Loss: 0.4612, Accuracy: 0.7825
Epoch 6/50, Loss: 0.4489, Accuracy: 0.7925
Epoch 7/50, Loss: 0.4789, Accuracy: 0.7925
Epoch 8/50, Loss: 0.4073, Accuracy: 0.8200
Epoch 9/50, Loss: 0.4743, Accuracy: 0.7850
Epoch 10/50, Loss: 0.4041, Accuracy: 0.8300
Epoch 11/50, Loss: 0.4660, Accuracy: 0.7875
Epoch 12/50, Loss: 0.4305, Accuracy: 0.7925
Epoch 13/50, Loss: 0.4770, Accuracy: 0.7700
Epoch 14/50, Loss: 0.4446, Accuracy: 0.7750
Epoch 15/50, Loss: 0.4046, Accuracy: 0.8200
Epoch 16/50, Loss: 0.4154, Accuracy: 0.8325
Epoch 17/50, Loss: 0.4215, Accuracy: 0.8050
Epoch 18/50, Loss: 0.4366, Accuracy: 0.8000
Epoch 19/50, Loss: 0.4074, Accuracy: 0.8050
Epoch 20/50, Loss: 0.4528, Accuracy: 0.7875
Epoch 21/50, Loss: 0.4112, Accuracy: 0.8125
Epoch 22/50, Loss: 0.3777, Accuracy: 0.8325
Epoch 2

'or epoch in range(num_epochs):\n    binn.train() \n    total_loss = 0.0\n    total_accuracy = 0\n\n    for batch_idx, (inputs, targets) in enumerate(dataloader):\n        inputs = inputs.to(binn.device)\n        targets = targets.to(binn.device).type(torch.LongTensor)\n        optimizer.zero_grad()\n        outputs = binn(inputs).to(binn.device)\n        loss = F.cross_entropy(outputs, targets)\n        loss.backward()\n        optimizer.step()\n        total_loss += loss.item()\n        total_accuracy += torch.sum(torch.argmax(outputs, axis=1) == targets) / len(targets)\n\n    avg_loss = total_loss / len(dataloader)\n    avg_accuracy = total_accuracy / len(dataloader)'

In [5]:
# Test the Binn model on the 

# and test_loader is your DataLoader for the test data
protein_matrix_test = fit_data_matrix_to_network_input(test_input_data, features=network.inputs)
# Assuming generate_data() returns the entire dataset
X_test, y_test = generate_data(protein_matrix_test, design_matrix=test_design_matrix)

# Convert to PyTorch tensors
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)  # tryout long instead of int16

# Create a DataLoader for your test data
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=8)  # Adjust batch size as needed
# Set the model to evaluation mode
binn.eval()

# Initialize variables to track accuracy
correct = 0
total = 0

# No need to track gradients for evaluation
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move data to the same device as your model
        inputs = inputs.to(binn.device)
        labels = labels.to(binn.device)

        # Forward pass
        outputs = binn(inputs)

        # Get predictions from the maximum value
        _, predicted = torch.max(outputs.data, 1)

        # Total number of labels
        total += labels.size(0)

        # Total correct predictions
        correct += (predicted == labels).sum().item()

# Calculate the accuracy
accuracy = 100 * correct / total

dasdf


In [6]:
accuracy

73.0

In [7]:
### Logistic regression

from sklearn.linear_model import LogisticRegression
import pandas as pd

# Assuming train_input_data and train_design_matrix are your training data

# Reshape train_input_data: drop 'Protein' column and transpose it
X_train = train_input_data.drop('Protein', axis=1).T

# Map 'sample' in train_design_matrix to match index of X_train
y_train = train_design_matrix.set_index('sample').reindex(X_train.index)['group']

# Initialize logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Fit the model
logreg.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming test_input_data and test_design_matrix are your testing data

# Reshape test_input_data: drop 'Protein' column and transpose it
X_test = test_input_data.drop('Protein', axis=1).T

# Map 'sample' in test_design_matrix to match index of X_test
y_test = test_design_matrix.set_index('sample').reindex(X_test.index)['group']

# Make predictions using the trained model
y_pred = logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.69
Classification Report:
               precision    recall  f1-score   support

           1       0.59      0.61      0.60        38
           2       0.75      0.74      0.75        62

    accuracy                           0.69       100
   macro avg       0.67      0.67      0.67       100
weighted avg       0.69      0.69      0.69       100



In [9]:
background_protein_matrix = fit_data_matrix_to_network_input(train_input_data, features=network.inputs)
background_X, _ = generate_data(background_protein_matrix, design_matrix=train_design_matrix)

background_data = torch.tensor(background_X, dtype=torch.float32, device=binn.device)
# Use the same data for test data
test_data = background_data.clone()  # Cloning to keep test data separate
# Initialize the explainer with your trained model
explainer = BINNExplainer(binn)

# Generate explanations
explanation_df = explainer.explain(test_data, background_data)

# Analyze the explanation_df dataframe



dasdf


In [13]:
#explanation_df.sort_values(by = "value", ascending = False).head(20)

explanation_df_sorted_filtered = explanation_df[explanation_df['source name'].str.contains("Gene")]
explanation_df_sorted_filtered.sort_values(by = "value", ascending = False).head(40)

Unnamed: 0,source,target,source name,target name,value,type,source layer,target layer
340,106,196,Gene_288,Level_3Pathway2,0.272471,0,0,1
342,106,200,Gene_288,Level_3Pathway6,0.272471,0,0,1
172,56,255,Gene_208,Level_1Pathway13,0.169622,0,0,1
178,56,200,Gene_208,Level_3Pathway6,0.169622,0,0,1
176,56,196,Gene_208,Level_3Pathway2,0.169622,0,0,1
174,56,297,Gene_208,Level_1Pathway7,0.169622,0,0,1
277,89,242,Gene_262,Level_2Pathway2,0.159064,1,0,1
275,89,268,Gene_262,Level_1Pathway25,0.159064,1,0,1
12,6,297,Gene_108,Level_1Pathway7,0.15292,0,0,1
216,71,298,Gene_237,Level_1Pathway8,0.151966,0,0,1


In [12]:
key_genes.sort()
print(f"the key pathways are {key_pathways}")
print(f"the key genes are {key_genes}") 

the key pathways are {'Level_1Pathway8', 'Level_0Pathway19', 'Level_1Pathway19', 'Level_2Pathway2', 'Level_0Pathway14', 'Level_3Pathway6', 'Level_3Pathway7', 'Level_1Pathway0', 'Level_0Pathway18', 'Level_2Pathway8', 'Level_0Pathway15', 'Level_3Pathway0', 'Level_2Pathway18', 'Level_0Pathway10', 'Level_2Pathway5', 'Level_3Pathway3', 'Level_2Pathway11', 'Level_2Pathway12', 'Level_0Pathway9', 'Level_1Pathway5', 'Level_0Pathway0', 'Level_1Pathway11', 'Level_1Pathway3', 'Level_1Pathway14', 'Level_1Pathway1', 'Level_0Pathway6', 'Level_1Pathway17', 'Level_1Pathway16', 'Level_0Pathway3', 'Level_0Pathway1', 'Level_3Pathway1', 'Level_2Pathway6', 'Level_1Pathway7', 'Level_2Pathway3', 'Level_1Pathway10', 'Level_1Pathway9', 'Level_0Pathway5', 'Level_3Pathway4', 'Level_1Pathway6', 'Level_0Pathway8', 'Level_1Pathway12', 'Level_2Pathway1', 'Level_2Pathway13', 'Level_3Pathway8', 'Level_0Pathway13', 'Level_2Pathway19', 'Level_2Pathway16', 'Level_2Pathway17', 'Level_3Pathway9', 'Level_0Pathway7', 'Level_2

In [21]:
# First, compute the absolute values of the SHAP values


# Then, group by 'source name' and calculate the mean of the absolute values
mean_abs_shap_by_feature = explanation_df.groupby('source name')['value'].mean().sort_values(ascending=False)
"""mean_abs_shap_by_feature_filtered = mean_abs_shap_by_feature[mean_abs_shap_by_feature['source name'].str.contains("Gene")]
mean_abs_shap_by_feature_filtered.sort_values(by = "value", ascending = False).head(40)"""


# # Select the top N features
# num_top_features = 10  # for example
# top_features = mean_abs_shap_by_feature.head(num_top_features).index.tolist()

# # Extract SHAP values for the top features
# top_features_shap_values = explanation_df[explanation_df['source name'].isin(top_features)]
mean_abs_shap_by_feature.head(30)

source name
Level_2Pathway16     0.210095
Level_3Pathway2      0.174580
Level_1Pathway9      0.157260
Level_1Pathway7      0.148311
Gene_288             0.144112
Level_2Pathway2      0.135746
Level_0Pathway130    0.124500
Level_2Pathway5      0.117403
Level_2Pathway14     0.113414
Gene_192             0.112957
Gene_108             0.111557
Level_3Pathway6      0.110981
Level_1Pathway45     0.109585
Level_0Pathway88     0.101156
Level_0Pathway8      0.100119
Gene_208             0.099973
Level_1Pathway48     0.099973
Level_0Pathway37     0.099930
Level_1Pathway37     0.099861
Level_0Pathway63     0.099059
Gene_237             0.096050
Level_0Pathway67     0.094924
Level_3Pathway4      0.094675
Gene_261             0.094129
Level_1Pathway42     0.093785
Level_0Pathway31     0.093721
Level_0Pathway144    0.092887
Level_1Pathway17     0.091747
Gene_210             0.090880
Gene_262             0.090285
Name: value, dtype: float64

In [286]:
explanation_df_sorted = explanation_df.sort_values(by = "abs_value", ascending = True)
#[  0  15  25  26  44  59  61  63  97 117 124 134 138 156 157 158 176 177 188 199 212 213 215 233 238 241 273 276 278 293]
explanation_df_sorted_filtered = explanation_df_sorted[explanation_df_sorted['source name'].str.contains("Gene")]
explanation_df_sorted_filtered.head(40)

KeyError: 'abs_value'

In [10]:
# First, compute the absolute values of the SHAP values
explanation_df['abs_value'] = explanation_df['value'].abs()

# Then, group by 'source name' and calculate the mean of the absolute values
mean_abs_shap_by_feature = explanation_df.groupby('source name')['abs_value'].mean().sort_values(ascending=False)

# Select the top N features
num_top_features = 10  # for example
top_features = mean_abs_shap_by_feature.head(num_top_features).index.tolist()

# Extract SHAP values for the top features
top_features_shap_values = explanation_df[explanation_df['source name'].isin(top_features)]


In [62]:
import pandas as pd

# Assuming explanation_df is your dataframe and binn is your trained BINN model
n_layers = binn.n_layers  # Number of layers in the BINN model

# Initialize an empty dataframe for the final results
plot_df = pd.DataFrame()

# Iterate through each layer
for layer in range(n_layers):
    # Filter the dataframe for the current layer
    layer_df = explanation_df[explanation_df['source layer'] == layer]

    # Group by 'source name', then calculate the mean SHAP value
    grouped_df = layer_df.groupby('source name', as_index=False)['abs_value'].mean()

    # Sort the grouped dataframe based on mean SHAP values
    grouped_df.sort_values(by='abs_value', ascending=False, inplace=True)

    # Rank the features based on their mean SHAP values
    grouped_df['rank'] = grouped_df['abs_value'].rank(ascending=False)

    # Add a column for the layer number
    grouped_df['source layer'] = layer

    # Append to the final dataframe
    plot_df = pd.concat([plot_df, grouped_df], ignore_index=True)

# Display the first few rows of the dataframe
print(plot_df.head())


  source name  abs_value  rank  source layer
0    Gene_171   0.163694   1.0             0
1    Gene_269   0.111017   2.0             0
2     Gene_66   0.110108   3.0             0
3    Gene_217   0.102805   4.0             0
4     Gene_11   0.096489   5.0             0


In [298]:
print(key_genes)
print(key_pathways)

[233 119  58 106 177 270  77  10 257  17 203   2 130 134  84 101  13 225
  31  23  30 181 212 195   8 224  28 213  26 299]
{'Level_1Pathway11', 'Level_0Pathway2', 'Level_3Pathway3', 'Level_3Pathway4', 'Level_3Pathway7', 'Level_1Pathway9', 'Level_2Pathway12', 'Level_0Pathway13', 'Level_2Pathway2', 'Level_0Pathway12', 'Level_0Pathway4', 'Level_0Pathway1', 'Level_1Pathway2', 'Level_0Pathway14', 'Level_3Pathway9', 'Level_1Pathway10', 'Level_1Pathway4', 'Level_2Pathway4', 'Level_0Pathway5', 'Level_0Pathway9', 'Level_0Pathway8', 'Level_2Pathway13', 'Level_1Pathway12', 'Level_3Pathway1', 'Level_2Pathway10', 'Level_1Pathway0', 'Level_3Pathway6', 'Level_2Pathway9', 'Level_1Pathway8', 'Level_3Pathway0', 'Level_0Pathway11', 'Level_0Pathway3', 'Level_1Pathway13', 'Level_2Pathway5', 'Level_0Pathway6', 'Level_3Pathway2', 'Level_1Pathway1', 'Level_1Pathway6', 'Level_0Pathway7', 'Level_3Pathway5', 'Level_2Pathway0', 'Level_3Pathway8', 'Level_0Pathway0', 'Level_2Pathway7', 'Level_1Pathway5', 'Level_2Pa

In [39]:

key_genes_sorted = key_genes
key_genes_gene = [f"Gene_{i}" for i in key_genes_sorted]
key_genes_gene

['Gene_0',
 'Gene_15',
 'Gene_25',
 'Gene_26',
 'Gene_44',
 'Gene_59',
 'Gene_61',
 'Gene_63',
 'Gene_97',
 'Gene_117',
 'Gene_124',
 'Gene_134',
 'Gene_138',
 'Gene_156',
 'Gene_157',
 'Gene_158',
 'Gene_176',
 'Gene_177',
 'Gene_188',
 'Gene_199',
 'Gene_212',
 'Gene_213',
 'Gene_215',
 'Gene_233',
 'Gene_238',
 'Gene_241',
 'Gene_273',
 'Gene_276',
 'Gene_278',
 'Gene_293']

In [57]:
print("Level_1Pathway0" in key_pathways)

True


In [15]:
# Select the top 50 features from mean_abs_shap_filtered
top_50_features = mean_abs_shap_filtered.head(50).index.tolist()

# Check how many key genes are in the top 50 features
matches = sum(gene in top_50_features for gene in key_genes_gene)

print(f"Number of key genes in the top 50 features: {matches}")

NameError: name 'mean_abs_shap_filtered' is not defined

In [14]:
mean_abs_shap_by_feature

source name
Gene_171             0.163694
Level_1Pathway11     0.153358
Level_2Pathway19     0.152273
Level_0Pathway38     0.148235
Level_0Pathway34     0.146577
                       ...   
Gene_179             0.000027
Gene_78              0.000027
Gene_116             0.000004
Level_0Pathway134    0.000003
Gene_274             0.000001
Name: abs_value, Length: 286, dtype: float64