<a href="https://colab.research.google.com/github/ahmedhesham47/Bayesian-Network-for-Predicting-ICB-Response/blob/main/Bayes_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pgmpy

# **Importing Packages**

In [128]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
import random
import matplotlib.pyplot as plt
import networkx as nx

# **Loading files**

In [153]:
# Every input is a tab-separated file addressed relative to the working directory.
TSV_SEPARATOR = '\t'

# Table holding a 'Sample Identifier' column plus one column per gene.
mrna_cont = pd.read_csv('mrna_cont.tsv', sep=TSV_SEPARATOR)
# Gene lists; the expression list is read through its 'Gene_Name' column below.
best_e_genes = pd.read_csv('Best Expression Genes.tsv', sep=TSV_SEPARATOR)
best_c_genes = pd.read_csv('Best CNA Genes.tsv', sep=TSV_SEPARATOR)
# Per-sample labels, joined later on 'Sample Identifier'.
labels_df = pd.read_csv('labels_df.tsv', sep=TSV_SEPARATOR)

# **Data pre-processing**

In [199]:
# Index the expression table by sample identifier (the column itself is kept).
samples = mrna_cont['Sample Identifier']
mrna_cont = mrna_cont.set_index(samples)

# Keep only the selected expression genes, then turn the sample index back
# into a regular 'Sample Identifier' column so it can serve as the merge key.
best_e_genes_list = best_e_genes['Gene_Name']
reduced_df = mrna_cont[best_e_genes_list].reset_index()

# Attach the labels to every sample that appears in both tables (inner join).
reduced_df_with_labels = reduced_df.merge(labels_df, on='Sample Identifier')
reduced_df_with_labels

Unnamed: 0,Sample Identifier,ST6GAL1,RAD54L,TIMM17A,NOP2,RARB,KIAA0391,WNK4,ATP5F1,MRPL20,...,KIAA0319L,UBXN7,NPAS3,EIF4A2,WWC1,ZNF888,CTPS1,CERS4,XRCC3,ICB Response
0,Sample100,68.957692,19.192052,49.554546,32.121619,10.326063,16.131174,11.469494,82.432588,60.425937,...,59.176958,57.945571,3.553432,185.464528,5.013506,7.634602,53.442211,15.744167,51.384035,1.0
1,Sample106,17.175347,13.611997,80.294152,34.588250,14.110866,17.602949,10.856340,198.050990,42.831466,...,39.933275,49.055451,7.221723,168.332651,2.684390,4.133486,118.279463,41.548660,30.312230,1.0
2,Sample107,0.772705,0.000000,0.000000,0.000000,0.000000,0.000000,107.701415,0.000000,77.361388,...,65.907175,12.340550,0.000000,135.132436,0.000000,0.000000,67.657124,0.000000,0.000000,1.0
3,Sample108,1.742372,1.590862,27.877958,47.953118,2.651436,8.484596,78.558267,60.376990,5.227117,...,28.787022,58.180086,25.075011,619.223988,0.000000,0.000000,21.059979,28.938532,42.271469,0.0
4,Sample10,49.567013,54.004152,115.877589,86.732792,20.953156,32.102889,3.887237,252.803144,76.531165,...,37.981150,45.869397,4.588836,335.117758,46.059019,51.785582,211.636355,86.315625,74.691838,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,Sample196,20.242258,23.213953,77.920150,39.809131,0.328044,30.064288,22.133336,204.680296,65.261503,...,60.572400,51.001228,6.252137,334.759477,21.978963,48.261094,91.234886,28.829298,34.097302,0.0
115,Sample197,58.131817,72.419637,54.609889,31.337151,3.201752,19.530690,14.147743,211.675854,79.883722,...,51.968443,50.107425,0.320175,302.045315,4.282344,4.802629,146.320084,61.173481,61.153470,1.0
116,Sample204,19.759614,22.340575,75.846944,40.351796,0.777063,21.036218,24.588508,124.413411,50.564630,...,38.575650,27.197221,13.126822,159.075992,6.244260,12.571777,136.818674,32.719922,44.819910,1.0
117,Sample205,29.037639,25.201994,89.146582,33.027741,38.202001,34.675266,10.554461,110.101049,36.605960,...,41.651508,30.015858,5.508914,143.257503,7.671291,47.057451,153.811964,29.990115,58.667358,0.0


# **Discretizing the dataframe**

## **Using mean and standard deviation**

In [178]:
# Per-column mean and standard deviation over every column except the first
# and the last one of the labelled frame (the gene columns in the table shown above).
gene_values = reduced_df_with_labels.iloc[:, 1:-1]
means = gene_values.mean()
std_devs = gene_values.std()

# Three-level discretisation of a single value relative to its gene's distribution
def categorize_value(x, gene):
    """Map a value to -1, 0 or 1 relative to one standard deviation around the mean.

    Reads the module-level ``means`` and ``std_devs`` lookups for ``gene``.

    Args:
        x: The value to categorise.
        gene: Key used to look up the mean and standard deviation.

    Returns:
        -1 if ``x`` is below ``mean - std``, 1 if it is above ``mean + std``,
        and 0 otherwise (including values exactly on either threshold).
    """
    center, spread = means[gene], std_devs[gene]
    if x < center - spread:
        return -1
    if x > center + spread:
        return 1
    return 0

# Categorise every cell of the gene columns (all columns except the first and last).
gene_columns = reduced_df_with_labels.iloc[:, 1:-1]
std_categorized_df = gene_columns.apply(
    lambda col: col.apply(lambda value: categorize_value(value, col.name))
)

# Label the rows with the identifiers of the very frame that was categorised.
# set_index with a Series is positional, so taking the identifiers from a
# different table (e.g. the raw expression table) would fail or mislabel rows
# whenever the earlier merge dropped or reordered samples.
std_categorized_df = std_categorized_df.set_index(reduced_df_with_labels['Sample Identifier'])

# Re-attach the labels; the index level named 'Sample Identifier' is used as the key.
std_categorized_df_with_labels = pd.merge(std_categorized_df, labels_df, on='Sample Identifier')

## **Using Min-Max scaling followed by grouping into 10 equal-width categories**

In [192]:
# Min-Max scaling: rescale every column to the [0, 1] interval
df = reduced_df_with_labels.iloc[:, 1:-1]
col_min = df.min()
col_range = df.max() - col_min
# A constant column has a range of zero; dividing by it would yield NaN (0/0),
# which cannot be converted to an integer group afterwards. Dividing by 1
# instead makes such a column scale to 0 everywhere.
col_range = col_range.where(col_range != 0, 1)
scaled_df = (df - col_min) / col_range

# Bin a min-max scaled value into one of ten groups
def classify_into_groups(value):
    """Return the group number (1-10) of a value scaled to [0, 1].

    Groups are tenths of the unit interval computed with floor division by 0.1;
    a value equal to 1 is placed in group 10 rather than opening an 11th group.

    Args:
        value: A number, expected to lie in [0, 1].

    Returns:
        The group number as an ``int``.

    Raises:
        ValueError: If ``value`` is NaN, because NaN cannot be converted to int.
    """
    return 10 if value == 1 else int(value // 0.1) + 1

# Apply the classification to every cell. Mapping column by column with
# Series.map avoids DataFrame.applymap, which is deprecated in recent pandas.
min_max_categorized_df = scaled_df.apply(lambda col: col.map(classify_into_groups))
min_max_categorized_df = min_max_categorized_df.apply(lambda col: col.astype('category'))

# Label the rows with the identifiers of the frame the scaled values came from.
# set_index with a Series is positional, so identifiers taken from a different
# table would fail or mislabel rows if the earlier merge dropped or reordered samples.
min_max_categorized_df = min_max_categorized_df.set_index(reduced_df_with_labels['Sample Identifier'])

# Re-attach the labels; the index level named 'Sample Identifier' is used as the key.
min_max_categorized_df_with_labels = pd.merge(min_max_categorized_df, labels_df, on='Sample Identifier')

# **Bayesian Network**

In [200]:
# Data used for both structure and parameter learning: every column except the
# first one (presumably the 'Sample Identifier' key column — verify).
training_data = std_categorized_df_with_labels.iloc[:, 1:]

# BIC score and hill-climbing structure search over the same data
bic_score = BicScore(training_data)
hc = HillClimbSearch(training_data)

# Estimate the best structure using the BIC scoring method
best_model = hc.estimate(scoring_method=bic_score, max_iter=1000)

# Learning CPDs (Conditional Probability Distributions).
# A network built from the edge list alone only contains variables that take
# part in at least one edge. Adding the learned structure's nodes as well keeps
# edge-less variables in the model, so they also receive a CPD and can be
# used in queries instead of raising for an unknown variable.
model = BayesianNetwork(best_model.edges())
model.add_nodes_from(best_model.nodes())
model.fit(training_data, estimator=BayesianEstimator, prior_type="BDeu")  # Choose appropriate priors

  0%|          | 0/1000 [00:00<?, ?it/s]

In [203]:
# Exact inference on the fitted network via variable elimination
inference = VariableElimination(model)

# Example query: distribution of 'ICB Response' given that NOP2 is observed in category 1
target_variables = ['ICB Response']
observed_evidence = {'NOP2': 1}
query_result = inference.query(variables=target_variables, evidence=observed_evidence)
print(query_result)

+-------------------+---------------------+
| ICB Response      |   phi(ICB Response) |
| ICB Response(0.0) |              0.5198 |
+-------------------+---------------------+
| ICB Response(1.0) |              0.4802 |
+-------------------+---------------------+


In [182]:
# Print the conditional probability table of every node in the fitted model
for node_cpd in model.cpds:
    print(node_cpd)

+-----------+----------------------+---------------------+--------------------+
| PRPF38A   | PRPF38A(-1)          | PRPF38A(0)          | PRPF38A(1)         |
+-----------+----------------------+---------------------+--------------------+
| RAD54L(0) | 0.95                 | 0.9349442379182157  | 0.4433962264150943 |
+-----------+----------------------+---------------------+--------------------+
| RAD54L(1) | 0.049999999999999996 | 0.06505576208178439 | 0.5566037735849056 |
+-----------+----------------------+---------------------+--------------------+
+------------+----------------------+----------------------+
| RAD54L     | RAD54L(0)            | RAD54L(1)            |
+------------+----------------------+----------------------+
| MRPS15(-1) | 0.044961240310077526 | 0.050505050505050504 |
+------------+----------------------+----------------------+
| MRPS15(0)  | 0.9007751937984497   | 0.47474747474747475  |
+------------+----------------------+----------------------+
| MRPS15(1)  

# **Network Visualization**

In [None]:
# Fixed seed so the force-directed layout (and the saved image) is the same on every run
LAYOUT_SEED = 42

# Create a NetworkX graph from the learned edges
G = nx.DiGraph()
G.add_edges_from(best_model.edges())

# Explicit figure/axes instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(12, 8))  # Adjust the figure size as needed
pos = nx.spring_layout(G, seed=LAYOUT_SEED)  # Layout for the nodes

# Draw nodes and edges
nx.draw(
    G,
    pos,
    ax=ax,
    with_labels=True,
    node_size=3000,
    node_color="lightblue",
    font_size=10,
    font_weight="bold",
    edge_color="gray",
)

# Save first, then display the graph
ax.set_title("Bayesian Network Visualization")
fig.savefig('Bayes_network.png')
plt.show()

In [141]:
# Write the learned directed edges as a two-column, tab-separated table
edges_list = list(best_model.edges())
edges_df = pd.DataFrame(edges_list, columns=['Gene 1', 'Gene 2'])
edges_df.to_csv('gene_interactions.tsv', sep='\t', index=False)

# **Quantifying the strength of edges**

In [144]:
# Collect one (parent, child, strength) record per learned edge
edge_data = []

for parent, child in best_model.edges:
    # Conditional probability table of the child node
    cpt = model.get_cpds(node=child)

    # Influence measure: spread (max - min) of the probabilities in the child's table.
    # Written out explicitly because the ndarray.ptp() method is not available
    # in NumPy 2.x.
    # NOTE(review): the value depends only on `child`, so all edges pointing to
    # the same child receive the same strength — confirm this is intended.
    probabilities = cpt.values
    influence_measure = probabilities.max() - probabilities.min()

    edge_data.append((parent, child, influence_measure))

# Convert to DataFrame and export as a tab-separated file
edges_df = pd.DataFrame(edge_data, columns=['Gene 1', 'Gene 2', 'Association Strength'])

edges_df.to_csv('gene_interactions_strength_quantified.tsv', sep='\t', index=False)