In [2]:
from collections import defaultdict
from itertools import combinations

# Define the file path
file = '../dataset/training_validation_annotation.txt'

# Function to read file content and return it as a list of lines
def read_file(file):
    with open(file, 'r') as f:
        return f.readlines()

# Read content from the file
data = read_file(file)

# Initialize dictionaries to count occurrences
location_count = defaultdict(int)
pair_count = defaultdict(int)

# Process each line in the data
for line in data:
    locations = line.strip().split('|')
    # Count occurrences of each individual location
    for location in locations:
        location_count[location] += 1
    # Count occurrences of each pair of locations
    for pair in combinations(locations, 2):
        pair_count[tuple(sorted(pair))] += 1

# Find all unique locations
all_locations = set(location_count.keys())

# Calculate total co-occurrence count for each pair of locations
total_pair_count = defaultdict(int)
for loc1 in all_locations:
    for loc2 in all_locations:
        if loc1 != loc2:
            pair = tuple(sorted((loc1, loc2)))
            total_pair_count[pair] += pair_count[pair]

# Output the results
print("Counts of individual locations:")
for location, count in location_count.items():
    print(f"{location}: {count}")

print("\nCounts of co-occurrence for each pair of locations:")
for pair, count in total_pair_count.items():
    print(f"{pair}: {count}")


Counts of individual locations:
Exosome: 28304
Nucleus: 19301
Nucleoplasm: 12807
Chromatin: 12893
Cytosol: 14686
Ribosome: 7796
Cytoplasm: 3597
Nucleolus: 10000
Membrane: 6047

Counts of co-occurrence for each pair of locations:
('Exosome', 'Ribosome'): 15564
('Exosome', 'Membrane'): 12068
('Exosome', 'Nucleolus'): 19974
('Exosome', 'Nucleus'): 32336
('Cytosol', 'Exosome'): 29204
('Cytoplasm', 'Exosome'): 3594
('Exosome', 'Nucleoplasm'): 25548
('Chromatin', 'Exosome'): 25712
('Membrane', 'Ribosome'): 4328
('Nucleolus', 'Ribosome'): 8282
('Nucleus', 'Ribosome'): 11752
('Cytosol', 'Ribosome'): 11440
('Cytoplasm', 'Ribosome'): 1082
('Nucleoplasm', 'Ribosome'): 10210
('Chromatin', 'Ribosome'): 10188
('Membrane', 'Nucleolus'): 7410
('Membrane', 'Nucleus'): 10742
('Cytosol', 'Membrane'): 10616
('Cytoplasm', 'Membrane'): 974
('Membrane', 'Nucleoplasm'): 9422
('Chromatin', 'Membrane'): 9304
('Nucleolus', 'Nucleus'): 17016
('Cytosol', 'Nucleolus'): 15674
('Cytoplasm', 'Nucleolus'): 1386
('Nucle

In [4]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# node name
nodes = ["Exosome", "Nucleus", "Nucleoplasm", "Chromatin", "Cytoplasm", 
         "Nucleolus", "Cytosol", "Membrane", "Ribosome"]


cooccurrence_matrix = np.array([
    [28304, 32336, 25548, 25712, 3594, 19974, 29204, 12068, 15564],
    [32336, 19301, 21504, 21588, 3614, 17016, 23188, 10742, 11752],
    [25548, 21504, 12807, 20666, 1834, 16294, 19816, 9422,  10210],
    [25712, 21588, 20666, 12893, 1814, 16736, 19790, 9304,  10188],
    [3594,  3614,  1834,  1814,  14686, 1836, 2106,  974,   1082],
    [19974, 17016, 16294, 16736, 1836,  7796, 15674, 7410,  8282],
    [29204, 23188, 19816, 19790, 2106, 15674, 3597,  10616, 11440],
    [12068, 10742, 9422,  9304,  974,  7410,  10616, 10000,  4328],
    [15564, 11752, 10210, 10188, 1082, 8282,  11440, 4328,  6047]
])


# set the threshold
threshold = 0

# Create an undirected graph
G = nx.Graph()

# Add nodes
G.add_nodes_from(nodes)


edge_labels = {}
for i in range(len(nodes)):
    for j in range(i + 1, len(nodes)):
        if cooccurrence_matrix[i, j] > threshold:
            G.add_edge(nodes[i], nodes[j])
            edge_labels[(nodes[i], nodes[j])] = f"{cooccurrence_matrix[i, j]:.4f}"

# Convert an undirected graph to a directed graph to build a Bayesian network
bn = BayesianNetwork()

for edge in G.edges():
    bn.add_edge(edge[0], edge[1])



# load the training data
training_validation_annotation_encoded_path = 'D:/RNA/DRpred/dataset/training_validation_annotation_encoded.csv'
training_data = pd.read_csv(training_validation_annotation_encoded_path)

# Train a Bayesian network using maximum likelihood estimation
bn.fit(training_data, estimator=MaximumLikelihoodEstimator)

# print trained CPD (Conditional Probability distribution)
print("Conditional Probability Distributions:")
for cpd in bn.get_cpds():
    print(cpd)

# Inference with variable elimination
infer = VariableElimination(bn)

# compute marginal probabilities
marginal_probabilities = {}
for node in nodes:
    marginal_prob = infer.query([node], show_progress=False)
    marginal_probabilities[node] = marginal_prob.values[1]  # 获取P(node=1)
# Normalized marginal probability
total_marginal_probability = sum(marginal_probabilities.values())
for node in nodes:
    marginal_probabilities[node] /= total_marginal_probability

for node, prob in marginal_probabilities.items():
    print(f"{node}: {prob:.3f}")


for i in range(len(nodes)):
    for j in range(i + 1, len(nodes)):
        joint_prob = infer.query([nodes[i], nodes[j]], show_progress=False)
        cooccurrence_probabilities[(nodes[i], nodes[j])] = joint_prob.values[1, 1]


for edge, prob in cooccurrence_probabilities.items():
    print(f"{edge}: {prob:.4f}")

def extract_mean_prior_information(sequence_labels, cooccurrence_probabilities, nodes):
    prior_information = []
    for index, row in sequence_labels.iterrows():
        
        label_indices = [i for i, label in enumerate(row) if label == 1]
        
        # Compute the mean co-occurrence probability of the remaining irrelevant labels
        remaining_indices = np.setdiff1d(range(len(nodes)), label_indices)
        remaining_cooccurrence_probs = [cooccurrence_probabilities[(nodes[i], nodes[j])] for i in remaining_indices for j in remaining_indices if i < j]
        if len(remaining_cooccurrence_probs) > 0:
            mean_remaining_prob = np.mean(remaining_cooccurrence_probs)
        else:
            mean_remaining_prob = 0.0
        
        # Use the mean as the prior information value
        prior_information.append(mean_remaining_prob)
    
    # Check if the number of lines of the prior matches the input file
    if len(prior_information) != len(sequence_labels):
        raise ValueError("Prior information rows do not match the number of rows in the input CSV file.")
    
    return np.array(prior_information).reshape(-1, 1)




# The label of each mRNA sequence was extracted
sequence_labels = training_data  

# calculate and print the prior
prior_information = extract_mean_prior_information(sequence_labels, cooccurrence_probabilities, nodes)
print(prior_information)

# Convert the prior into a DataFrame
prior_df = pd.DataFrame(prior_information, columns=["Prior Probability"])

#Save as a CSV file
prior_csv_path = "../dataset/train_prior_information.csv"
prior_df.to_csv(prior_csv_path, index=False)



In [3]:
def plot_bayesian_network(bn, edge_labels, cooccurrence_probabilities, filename='bayesian_network.png'):
    G = nx.DiGraph()
    for edge in bn.edges():
        G.add_edge(edge[0], edge[1])  # Add one-way edges
        G.add_edge(edge[1], edge[0])  # add reverse edges

    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G)  
    nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_weight='bold', arrowsize=20, arrowstyle='-|>')
    edge_labels_formatted = {(u, v): f"{prob:.3f}" for (u, v), prob in cooccurrence_probabilities.items()}
    nx.draw_networkx_edge_labels(pos, edge_labels=edge_labels_formatted, font_color='red')
    plt.title("Bayesian Network Structure with Co-occurrence Probabilities")
    plt.savefig(filename, dpi=300)  
    plt.show()



indepence

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Step 1: Load the feature vectors and prior information for the training and test sets
test_features_path = "../dataset/independent.csv"
train_prior_info_path = "../dataset/train_prior_information.csv"

train_features = pd.read_csv(train_features_path)
test_features = pd.read_csv(test_features_path)
train_prior_info = pd.read_csv(train_prior_info_path)

# Step 2: Train the Nearest neighbor model
nbrs = NearestNeighbors(n_neighbors=9, algorithm='auto').fit(train_features)

# Step 3: For each sample in the test set, find the index of the most similar training set sample
distances, indices = nbrs.kneighbors(test_features)
# Step 4: Extract the prior information on the most similar training set samples
test_prior_info = [train_prior_info.iloc[idx[0]].values.tolist() for idx in indices]
# Save the test set priors to a file
test_prior_info_df = pd.DataFrame(test_prior_info)
test_prior_info_df.to_csv("../dataset/test_prior_information.csv", index=False)
