In [None]:
# Homework 2 Part 2 (due 7/07/2024)
# Abby Irish

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review 
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurrence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease. 

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.

### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6: Create a minimal interface
Create a minimal interface in which your bot asks a user for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you used in last week's homework.)

In [None]:
import pgmpy as pg
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
import itertools

In [None]:
# Step 2: Load in data

# Each supplementary file is tab-separated; read it and keep only the raw
# NumPy array (one row per record) under the same name the later cells use.
symptoms = pd.read_csv('supp_data_2.txt', sep='\t').to_numpy()
diseases = pd.read_csv('supp_data_1.txt', sep='\t').to_numpy()
cooccurrences = pd.read_csv('supp_data_3.txt', sep='\t').to_numpy()

# Transposing turns columns into rows, so row 0 holds the names and row 1
# holds the occurrence counts of each table.
symptom_name, symptom_occ = symptoms.T[0], symptoms.T[1]
disease_name, disease_occ = diseases.T[0], diseases.T[1]

In [None]:
# Step 3: Create Bayesian Network

# Suffix appended to every disease label (presumably so that a disease and a
# symptom sharing the same term end up as two distinct nodes -- confirm).
DISEASE_SUFFIX = " (disease)"
# Only disease/symptom pairs that co-occur more often than this become edges.
MIN_CO_OCCURRENCE = 500

# Create model with the BayesianNetwork class from the pgmpy library
model = BayesianNetwork()

# Derive the labels from the raw `diseases` table instead of from
# `disease_name` itself, so re-running this cell never appends the suffix twice.
disease_name = [name + DISEASE_SUFFIX for name in diseases.T[0]]

# Index the co-occurrences. Dictionary = symptom : [[disease, occurrence], ...]
TOTAL_CO_OCC = 0
co_occur_dict = {}

for row in cooccurrences:
    # Columns of the co-occurrence table: symptom, disease, occurrence count.
    symptom = row[0]
    disease = row[1] + DISEASE_SUFFIX
    occurrence = row[2]

    # The total counts every pair, including those below the edge threshold.
    TOTAL_CO_OCC += occurrence

    if occurrence > MIN_CO_OCCURRENCE:
        co_occur_dict.setdefault(symptom, []).append([disease, occurrence])
        model.add_edge(disease, symptom)

# Dictionary = symptom : occurrence, restricted to symptoms that got an edge.
# Read the counts from the raw `symptoms` table so this cell does not depend
# on a module-level name that a later cell may rebind.
TOTAL_SYMPTOM_OCC = sum(symptoms.T[1])
symptom_dict = {
    name: count
    for name, count in zip(symptoms.T[0], symptoms.T[1])
    if name in co_occur_dict
}

# Summaries only: dumping the full dictionaries floods the notebook output.
print("Total symptom occurrences:", TOTAL_SYMPTOM_OCC)
print("Total co-occurrences:", TOTAL_CO_OCC)
print("Symptoms with at least one edge:", len(co_occur_dict))
print("Edges in the network:", sum(len(links) for links in co_occur_dict.values()))
print(model)

In [None]:
# Step 4: initialize prior probabilities

# Total number of disease occurrences, used to normalise the priors.
TOTAL_DISEASE_OCC = sum(disease_occ)

# Dictionary = disease : prior probability
prior_prob_dict = {
    name: count / TOTAL_DISEASE_OCC
    for name, count in zip(disease_name, disease_occ)
}

# Every edge in the model points from a disease to a symptom, so the nodes
# without predecessors are exactly the disease nodes that need a prior.
disease_nodes = [node for node in model.nodes if not list(model.predecessors(node))]

# Fail here, with the offending names, rather than later in check_model().
diseases_without_prior = [node for node in disease_nodes if node not in prior_prob_dict]
if diseases_without_prior:
    raise ValueError(
        f"No occurrence data for disease node(s): {diseases_without_prior[:10]}"
    )

# Binary CPD per disease: state 0 = absent, state 1 = present.
disease_cpds = [
    TabularCPD(variable=disease, variable_card=2,
               values=[[1 - prior_prob_dict[disease]], [prior_prob_dict[disease]]])
    for disease in disease_nodes
]
model.add_cpds(*disease_cpds)

# Summaries only: the full prior dictionary is too large to print usefully.
print("Diseases in the occurrence table:", len(disease_name))
print("Disease nodes given a prior:", len(disease_cpds))
print("Total disease occurrences:", TOTAL_DISEASE_OCC)

In [None]:
# Step 5: Calculate conditional probability tables

CPTs_symptoms = []

for symptom, disease_links in co_occur_dict.items():
    parents = list(model.predecessors(symptom))

    # Use cell-local names here: rebinding `symptom_occ` or `disease_name`
    # would clobber the arrays/lists that the earlier cells rely on.
    symptom_total = symptom_dict[symptom]

    # parent disease -> (P(symptom | disease absent), P(symptom | disease present))
    # Keyed by name so the lookup below cannot drift out of step with `parents`.
    cond_by_parent = {}
    for parent_disease, co_occur in disease_links:
        prior = prior_prob_dict.get(parent_disease)
        if not prior:
            # Unknown or zero prior: no basis for a conditional estimate.
            p_given_present = 0.0
        else:
            # P(symptom | disease) = P(symptom, disease) / P(disease). The two
            # estimates use different normalisers, so the ratio can exceed 1
            # and has to be clamped to remain a probability.
            p_joint = co_occur / TOTAL_CO_OCC
            p_given_present = float(np.clip(p_joint / prior, 0.0, 1.0))

        # Share of symptom occurrences not explained by this disease.
        p_given_absent = float(
            np.clip((symptom_total - co_occur) / TOTAL_SYMPTOM_OCC, 0.0, 1.0)
        )
        cond_by_parent[parent_disease] = (p_given_absent, p_given_present)

    # Row of probabilities where symptom == True, one column per combination
    # of parent states. itertools.product varies the last parent fastest,
    # which is the column order TabularCPD expects for `evidence`.
    # NOTE(review): per-parent terms are combined by a plain product, as in the
    # original cell; a noisy-OR may be the intended model -- confirm.
    rowT = [
        float(np.prod([cond_by_parent[parent][state]
                       for parent, state in zip(parents, states)]))
        for states in itertools.product([0, 1], repeat=len(parents))
    ]
    # Row of probabilities where symptom == False
    rowF = [1 - value for value in rowT]

    CPTs_symptoms.append(
        TabularCPD(variable=symptom, variable_card=2,
                   values=[rowF, rowT], evidence=parents,
                   evidence_card=[2 for _ in parents])
    )

# Register every symptom CPD exactly once, after all of them have been built.
model.add_cpds(*CPTs_symptoms)

print("Symptom CPDs added:", len(CPTs_symptoms))

# check_model() raises on an inconsistent network and returns True otherwise.
assert model.check_model()

In [None]:
# Step 6: Create minimal interface to infer most likely disease based on symptoms

# Build a pgmpy VariableElimination object over `model` and keep it in the
# module-level name `inference`, which infer_disease() uses to run queries.
inference = VariableElimination(model)

def infer_disease(input_symptoms, inference_engine=None, network=None):
    """Return the disease that is most probable given the observed symptoms.

    Parameters
    ----------
    input_symptoms : iterable of str
        Names of the observed symptoms. Each one must be a symptom node of
        the network (a node that has at least one parent).
    inference_engine : optional
        Object exposing a pgmpy-style ``query(variables=..., evidence=...,
        joint=..., show_progress=...)`` method. Defaults to the module-level
        ``inference`` object.
    network : optional
        Graph exposing ``nodes`` and ``predecessors(node)``. Defaults to the
        module-level ``model``.

    Returns
    -------
    str
        Node label of the disease whose posterior probability of being
        present (state 1) is highest given the evidence.

    Raises
    ------
    ValueError
        If no symptom is given, if a symptom is not a symptom node of the
        network, or if the network contains no disease node.
    """
    engine = inference if inference_engine is None else inference_engine
    graph = model if network is None else network

    # Edges run disease -> symptom, so diseases are the nodes without parents
    # and every remaining node is a symptom.
    disease_nodes = [node for node in graph.nodes
                     if not list(graph.predecessors(node))]
    symptom_nodes = set(graph.nodes) - set(disease_nodes)

    # Drop repeated entries while keeping the order the caller supplied.
    observed = list(dict.fromkeys(input_symptoms))
    if not observed:
        raise ValueError("At least one symptom is required.")

    unknown = [symptom for symptom in observed if symptom not in symptom_nodes]
    if unknown:
        raise ValueError(f"Unknown symptom(s): {', '.join(map(str, unknown))}")
    if not disease_nodes:
        raise ValueError("The network contains no disease nodes.")

    # State 1 means "present" for every binary node in the network.
    evidence = {symptom: 1 for symptom in observed}

    # joint=False yields one marginal factor per disease instead of a single
    # joint table over all diseases, whose size grows exponentially.
    posteriors = engine.query(variables=disease_nodes, evidence=evidence,
                              joint=False, show_progress=False)

    return max(disease_nodes, key=lambda disease: posteriors[disease].values[1])

def run_interface():
    """Ask for comma-separated symptoms and print the most likely disease.

    One line is read from standard input and split on commas; surrounding
    whitespace is stripped and blank entries (empty input, doubled or
    trailing commas) are discarded. If nothing usable was entered, a notice
    is printed and no inference is attempted. Otherwise the symptoms are
    passed to ``infer_disease`` and its result is printed.

    Returns
    -------
    None
    """
    print("Welcome to the Disease Diagnosis Bot!")
    print("Please enter your observed symptoms separated by commas:")

    # Discard blanks so an empty string is never handed on as a symptom.
    entries = (part.strip() for part in input().split(','))
    input_symptoms = [entry for entry in entries if entry]

    if not input_symptoms:
        print("No symptoms were entered, so no diagnosis can be suggested.")
        return

    # Make inference and output the result
    predicted_disease = infer_disease(input_symptoms)
    print(f"The most likely disease based on the observed symptoms is: {predicted_disease}")

# Run the interface
# The prompt starts only when this code executes as the main module, i.e.
# when __name__ == "__main__" (presumably also the case inside a notebook
# kernel -- confirm), and not when the code is imported from elsewhere.
if __name__ == "__main__":
    run_interface()