In [None]:
# Homework 2 Part 2 (due 7/07/2024)

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review 
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease. 

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.

### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6:
Create a minimal interface in which your bot asks a users for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you have used in last week's homework.)

In [None]:
import pgmpy as pg
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

In [70]:
# Step 3: Create Bayesian Network
diseases = pd.read_csv("supp_data_1.txt", sep = "\t")
symptoms = pd.read_csv("supp_data_2.txt", sep = "\t") 
cooccurrences = pd.read_csv("supp_data_3.txt", sep = "\t")

diseases = diseases.to_numpy()
symptoms = symptoms.to_numpy()
cooccurrences = cooccurrences.to_numpy()

disease_name = diseases.T[0]
disease_occ = diseases.T[1]
symptom_name = symptoms.T[0]
symptom_occ = symptoms.T[1]

model = BayesianNetwork()

print(symptom_occ)

network_diseases = []
for i in range(len(disease_occ)):
    if i > 500:
        model.add_node("Disease: " + disease_name[i])
        network_diseases.append(disease_name[i])

for i in range(len(symptom_occ)): 
    model.add_node("Symptom :" + symptom_name[i])
    for d in range(len(network_diseases)):
        model.add_edge(d, symptom_name[i])

model.add_node("idiopathic disease")

print(network_diseases)
print(model)


[147857 103168 100301 47351 43883 36959 34850 29473 29446 29364 27734
 22793 21470 20212 19403 18689 18661 18604 17571 16827 16162 15975 15477
 15199 13385 12973 12913 12844 12076 11643 11489 11450 11361 11096 10577
 10544 10308 10187 10094 9677 9513 9207 9201 8659 8445 8320 8193 8094 7916
 7774 7771 7764 7754 7742 7698 7693 7494 7345 7236 7078 6702 6660 6658
 6617 6461 6418 6300 6081 5975 5973 5941 5870 5835 5800 5749 5710 5639
 5549 5523 5455 5395 5240 5135 5119 5096 5038 4918 4711 4648 4576 4573
 4516 4496 4424 4410 4304 4301 4252 4190 4124 4102 4043 3958 3869 3857
 3800 3752 3748 3616 3469 3387 3375 3369 3368 3364 3297 3286 3274 3274
 3268 3264 3242 3235 3213 3169 3150 2970 2841 2815 2806 2784 2780 2771
 2747 2723 2639 2616 2574 2556 2476 2338 2331 2232 2228 2206 2200 2148
 2137 2136 2119 2089 2077 2065 2052 2022 1953 1840 1816 1792 1788 1784
 1779 1763 1753 1745 1739 1679 1661 1649 1591 1586 1539 1511 1502 1490
 1485 1458 1448 1440 1409 1370 1331 1329 1302 1298 1291 1278 1264 1241

In [75]:
# Step 4: initialize prior probabilities

total = sum(disease_occ)

prior_prob_list = []

for i in range(len(disease_name)): 
    if disease_name[i] in model:
        prior_prob = disease_occ[i] / total
        prior_prob_list.append(prior_prob)

        cpd = TabularCPD(variable = disease_name[i], 
                        variable_card = 2, 
                        values = [[1 - prior_prob], [prior_prob]])
        
        model.add_cpds(cpd)

print(len(disease_name))
print(prior_prob_list)
# assert model.check_model()



4442
[0.004925566634835935, 0.0048398910299983664, 0.001789429660035687, 0.0016434493583990099, 0.0014357006537302358, 0.0009346252199706517, 0.0009040825499089673, 0.0009005696549497958, 0.0008508036430282013, 0.0006786717900288038, 0.0006688161680600174, 0.0006585702244291009, 0.0006523250778350184, 0.0006262711068878307, 0.0005990461709542526, 0.0005946550522552883, 0.0005926058635291051, 0.0005732849412536624, 0.0005483043548773326, 0.0005127850836234888, 0.00045072393934479437, 0.00044516185565943964, 0.0004378433244944993, 0.0004373554224168366, 0.00043696510075470643, 0.0004331594645489375, 0.00043208607997807953, 0.00042984173042083117, 0.00042789012211018036, 0.0004120820947939092, 0.00039900631911254903, 0.0003884676342350349, 0.00037500153689154465, 0.0003708055790236455, 0.0003687563902974622, 0.00036709752323340906, 0.0003657313974159535, 0.0003598765724840012, 0.0003394822656377007, 0.0003303097065776421, 0.0003285532590980564, 0.0003284556786825239, 0.0003251379445544175

In [79]:
# Step 5: Calculate conditional probability tables

total_prob = sum(prior_prob_list)
idiopathic_disease_CPD = 1 - total_prob

# create dictionary to index co-occurrences
co_occur_dict = {} # dictionary = symptom : [[Diseases, Occurrence]]
total_co_occurrences = 0

# Fill the dictionary
for i in range(len(cooccurrences)):
    disease = cooccurrences[i][1]
    symptom = cooccurrences[i][0]
    occurrence = cooccurrences[i][2]
    total_co_occurrences += occurrence
    if occurrence > 500:
        if not symptom in co_occur_dict.keys():
            co_occur_dict[symptom] = []
        co_occur_dict.get(symptom).append([disease, occurrence])

# print(co_occur_dict)

# Calculate CPTs
CPTs = {}
for symptom, disease_list in co_occur_dict.items():
    total_occurrences_for_symptom = sum([item[1] for item in disease_list])
    CPT = {}
    for disease, occurrence in disease_list:
        CPT[disease] = occurrence / total_occurrences_for_symptom

    # assign remaining probabilities to idiopathic diseases
    CPT["idiopathic disease"] = idiopathic_disease_CPD * (total_occurrences_for_symptom / total_co_occurrences)
    CPTs[symptom] = CPT

# Print CPTs
for symptom, CPT in CPTs.items():
    print(f"Symptom: {symptom}")
    for disease, prob in CPT.items():
        cpd = TabularCPD(variable = disease, 
                         variable_card = 2,
                         values = [[1 - prob], [prob]])
        model.add_cpds(cpd)
        print(f"Disease: {disease}, Probability: {prob:.4f}")




# assert model.check_model()

Symptom: Fever


ValueError: ('CPD defined on variable not in the model', <TabularCPD representing P(Bacterial Infections:2) at 0x1f63c7290>)

In [57]:
# Step 6: Create a minimal interface

def most_likely_disease_fxn(symptoms, CPTs):

    disease_probabilities = {}

    for symptom in symptoms:
        if symptom in CPTs:
            for disease, prob in CPTs[symptom].items():
                if disease not in disease_probabilities:
                    disease_probabilities[disease] = 1
                disease_probabilities[disease] *= prob

        else:
            print(f"Symptom '{symptom}' is not recognized")

    total_prob = sum(disease_probabilities.values())
    for disease in disease_probabilities:
        disease_probabilities[disease] /= total_prob

    most_likely_disease = max(disease_probabilities, key = disease_probabilities.get)

    return most_likely_disease


def main():

    user_input = input("Please enter your observed symptoms separated by commas: ")
    symptoms = [symptom.strip() for symptom in user_input.split(",")]

    most_likely_disease = most_likely_disease_fxn(symptoms, CPTs)

    print(f"Based on your symptoms, the most likely disease is: {most_likely_disease}")

if __name__ == "__main__":
    main()

Based on your symptoms, the most likely disease is: Obesity
