In [1]:
import pandas as pd
import numpy as np
import os
# Read the two input tables, in this order: the clinical/genomic CSV first,
# then the CSV that is joined to it on 'case_id' in the next step.
clinical_df, corresponding_images_df = (
    pd.read_csv(csv_name)
    for csv_name in ('clinical+genomic_split.csv', 'WSI_patientfiles.csv')
)

In [2]:
# Left join: every row of clinical_df is kept; columns from
# corresponding_images_df are attached where 'case_id' matches.
clinical_image_df = clinical_df.merge(
    corresponding_images_df,
    how='left',
    on='case_id',
)

In [3]:
# Build one flattened WSI feature vector per row of clinical_image_df.
#
# Rows without a usable feature file are recorded as None while scanning and are
# replaced by all-NaN vectors afterwards, once the feature length is known.
# This avoids depending on whichever archive happened to be loaded last, and it
# no longer fails with a NameError when the very first row has no file.
image_features_list = []

for image_filename in clinical_image_df['chosen_exam']:
    # A missing filename (e.g. a row the left merge could not match -- TODO confirm
    # where 'chosen_exam' comes from) means there is no file to look up.
    if pd.isna(image_filename):
        image_features_list.append(None)
        continue

    image_path = f"WSI_Features/{image_filename}"  # .npz archives are expected under 'WSI_Features'
    if not os.path.exists(image_path):
        image_features_list.append(None)
        continue

    # np.load keeps the .npz archive open until it is closed; the context manager
    # releases it. flatten() returns a copy, so the vector stays valid afterwards.
    with np.load(image_path) as archive:
        image_features_list.append(archive['arr_0'].flatten())

# Width of the feature block: the longest vector that was actually loaded
# (0 when no file could be loaded at all).
n_features = max(
    (len(vec) for vec in image_features_list if vec is not None),
    default=0,
)

# Replace the None placeholders with NaN vectors of the full flattened length.
image_features_list = [
    np.full(n_features, np.nan) if vec is None else vec
    for vec in image_features_list
]

# Reuse the clinical frame's index so the column-wise concat below pairs each
# feature vector with the row it was loaded for.
if n_features:
    image_features_df = pd.DataFrame(image_features_list, index=clinical_image_df.index)
else:
    # Nothing could be loaded: keep every row, add no feature columns.
    image_features_df = pd.DataFrame(index=clinical_image_df.index)

assert len(image_features_df) == len(clinical_image_df), "expected one feature row per clinical row"

# Combine the image features with the original clinical data (column-wise).
clinical_image_df_combined = pd.concat([clinical_image_df, image_features_df], axis=1)

In [4]:
# Treat a missing 'cancer_history' as 0, then drop the rows that have no 'age_diag'.
clinical_image_df_combined = (
    clinical_image_df_combined
    .assign(cancer_history=lambda frame: frame['cancer_history'].fillna(0))
    .dropna(subset=['age_diag'])
)

In [5]:
# Number of missing values remaining in each column.
clinical_image_df_combined.isna().sum()

case_id           0
gender            0
age_diag          0
grade             0
cancer_history    0
                 ..
1019              0
1020              0
1021              0
1022              0
1023              0
Length: 1045, dtype: int64

In [30]:
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Build a network in which 'vital_status_12' is the single parent of every other
# column (the edges point from 'vital_status_12' to each feature).
#
# NOTE(review): every CPD below is a hard-coded placeholder, not estimated from
# clinical_image_df_combined, and every column is declared with exactly two
# states regardless of the values it actually holds. Query results therefore
# reflect these constants only -- replace with data-based CPDs before relying on them.

# Every column except the target itself and the 'case_id' identifier is a feature.
features = clinical_image_df_combined.columns.tolist()
for excluded_column in ('vital_status_12', 'case_id'):
    features.remove(excluded_column)

# One directed edge per feature: vital_status_12 -> feature.
edges = [('vital_status_12', column) for column in features]
model = DiscreteBayesianNetwork(edges)

# Prior of the root node: P(vital_status_12 = 0) = 0.6, P(vital_status_12 = 1) = 0.4.
cpd_vital_status_12 = TabularCPD(
    variable='vital_status_12',
    variable_card=2,
    values=[[0.6], [0.4]],
)

# The same 2x2 table for every feature: rows are the feature's states (0, 1),
# columns are the parent's states (vital_status_12 = 0, vital_status_12 = 1).
cpds = [cpd_vital_status_12] + [
    TabularCPD(
        variable=column,
        variable_card=2,
        values=[[0.7, 0.8],
                [0.3, 0.2]],
        evidence=['vital_status_12'],
        evidence_card=[2],
    )
    for column in features
]

# Attach the CPDs and validate the model.
model.add_cpds(*cpds)
model.check_model()

# Inference engine used by the query cell(s) below.
inference = VariableElimination(model)

+--------------------+------------------------+
| vital_status_12    |   phi(vital_status_12) |
| vital_status_12(0) |                 0.6923 |
+--------------------+------------------------+
| vital_status_12(1) |                 0.3077 |
+--------------------+------------------------+


In [32]:
# Distribution over 'vital_status_12' given the evidence gender == 1.
result = inference.query(
    variables=['vital_status_12'],
    evidence={'gender': 1},
)

# print() renders the factor as a text table.
print(result)

+--------------------+------------------------+
| vital_status_12    |   phi(vital_status_12) |
| vital_status_12(0) |                 0.6923 |
+--------------------+------------------------+
| vital_status_12(1) |                 0.3077 |
+--------------------+------------------------+
