In [None]:
# Homework 2 Part 2 (due 7/07/2024)

# Health-care assistance via probabilistic graphical modeling

### Objective
In this project, you will create a health-care assistance bot that can suggest diagnoses for a set of symptoms based on a probabilistic graphical model.

### Step 1: Review 
Review the code from the Bayesian networks exercise.

### Step 2: Acquire data
View this [research article](https://www.nature.com/articles/ncomms5212) and download its supplementary data sets 1, 2 and 3. These data sets include the occurrences of diseases, symptoms, and their co-occurrences in the scientific literature. (For the purpose of this exercise, we are going to assume that the frequency of co-occurrences of diseases and symptoms in scientific papers is proportional to the co-occurence frequencies of actual disease cases and symptoms.)

### Step 3: Create a Bayesian network
Using commands from the `pgmpy` library, create a Bayesian network in which the probability of exhibiting a symptom is conditional on the probability of having an associated disease. 

### Step 4: Initialize priors
Use the disease occurrence data to assign prior probabilities for diseases.


### Step 5: Calculate conditional probability tables
Use the co-occurrence data to define CPTs for each connected pair of disease and symptoms. (Hint: You may need to assign some occurrences of symptoms to an "idiopathic disease" to create valid CPTs.)

### Step 6:
Create a minimal interface in which your bot asks a users for a list of observed symptoms and then returns the name of the disease that is the most likely match to the symptoms. (Hint: Review the input/output commands that you have used in last week's homework.)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from pgmpy.models import BayesianNetwork, BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
import itertools


Step 2: download data 

In [None]:

symptoms_data = pd.read_csv('symptom_data.txt', sep='\t')
diseases_data = pd.read_csv('disease_data.txt', sep='\t')
symptoms_and_diseases = pd.read_csv('diseaseXsymptom.txt', sep='\t')

symptoms_data.columns = ["symptom", "occurrence"]
diseases_data.columns = ["disease", "occurrence"]
symptoms_and_diseases.columns = ["symptom", "disease", "occurrence", "score"]


s_data = symptoms_data.to_numpy()
d_data = diseases_data.to_numpy()
sd_data = symptoms_and_diseases.to_numpy()

            

Step 3: create the bayesian network 

In [None]:

# first, create an array with an ordered pair of each disease and symptom combination
# then load that combination into the network 
network = BayesianNetwork()

# for loading the symptom data
symptoms = s_data.T[0]
symptom_occurrences = s_data.T[1]

diseases = d_data.T[0]
disease_occurrences = d_data.T[1]
TOTAL_DISEASE_OCCURRENCES = sum(disease_occurrences)
TOTAL_CO_OCCURRENCES = 0


diseases = [disease + "disease" for disease in diseases]

# creating a dictionary to more efficiently index co-occurrences 
co_occurrences_dict = {}
 # dictionary = symptom : [[Disease, occurrence]]
for i in range(len(sd_data)): 
    disease = (sd_data[i][1] + "disease")
    symptom = sd_data[i][0]
    occurrence = sd_data[i][2]
    TOTAL_CO_OCCURRENCES += occurrence
    if occurrence > 400:
        if not symptom in co_occurrences_dict.keys():
            co_occurrences_dict[symptom] = []
        co_occurrences_dict.get(symptom).append([disease, occurrence])
        
        network.add_edge(disease, symptom)


TOTAL_SYMPTOM_OCCURRENCES = 0
# make dictionary — symptom: occurrence
symptom_dict = {}


i = 0 
# go through each symptom
while i < len(symptoms):
    symp = symptoms[i]
    occ = symptom_occurrences[i]
    TOTAL_SYMPTOM_OCCURRENCES += occ
    
    #if it is in the co-occ dictionary then add it to the network
    if symp in co_occurrences_dict.keys():
        symptom_dict[symp] = symptom_occurrences[i]

    i += 1
 
 
 
 

STEP 4

In [None]:
# STEP 4
# for each disease, create a probability distribution 
# note that the first state is having the disease and the second state is not having the disease 
total_prob = 0
i = 0

disease_prob_dict = {}
disease_CPDs = []
while i < len(diseases):

    prob = disease_occurrences[i] / TOTAL_DISEASE_OCCURRENCES
    disease_prob_dict[diseases[i]] = prob
    if diseases[i] in network.nodes: 
        cpd = TabularCPD(diseases[i], variable_card=2, values = [[1-prob], [prob]])
        network.add_cpds(cpd)
        
    i +=1
    
symptom_CPDs = []

for i0, symptom in enumerate(list(set(co_occurrences_dict.keys()))): 
    
    parents = list(network.predecessors(symptom))
    disease_cpts = []
    symptom_occurrence = symptom_dict.get(symptom)
    p_symptom = symptom_occurrence / TOTAL_SYMPTOM_OCCURRENCES
    
    
    for d_list in co_occurrences_dict.get(symptom): 
        disease_cpt = []
        disease = d_list[0]
        co_occurrence = d_list[1]
        
        # disease = T, symptom = T 
        p_disease = disease_prob_dict.get(disease)
        p_symptom_and_disease = co_occurrence / TOTAL_CO_OCCURRENCES
        p_symptom_given_disease = (p_symptom_and_disease / p_disease if p_symptom_and_disease>0 else 0.0)
        
        
        # disease = T, symptom = F 
        p_symptom_not_disease = 1 - p_symptom_given_disease
    
    
        # disease = F , symptom = T
        p_disease_not_symptom = (symptom_occurrence - co_occurrence) / TOTAL_SYMPTOM_OCCURRENCES
        
        # disease = F, symptom = F 
        p_not_both = 1 - p_disease_not_symptom
        
        disease_cpt += [p_not_both]
        disease_cpt += [p_disease_not_symptom]
        disease_cpt += [p_symptom_not_disease]
        disease_cpt += [p_symptom_given_disease]
        disease_cpts.append(disease_cpt)
        
        
        
    rowT = [] # the row where symptom = T
    for bool_combo in itertools.product([0,1], repeat=len(parents)):
        cond_probs = [disease_cpts[i][2+b] for i, b in enumerate(bool_combo)]
        rowT += [np.prod(cond_probs)]
    
    rowF = [1 - val for val in rowT]
    
    cpd = TabularCPD(variable=symptom, variable_card=2, 
                     values=[rowF, rowT], evidence=parents, 
                     evidence_card=[2 for _ in parents])
    
    
    symptom_CPDs += [cpd]
    network.add_cpds(cpd)
    
network.check_model()
    

Step 6: create bot interface

In [None]:
def run_bot(network):
    
    # initialize symptom list as empty 
    user_symptoms = []
    
    print("Welcome to DoctorBot!")
    input_symptom = input("do you want to input a symptom? y/n").lower() == 'y'
    
    possible_diseases = []


    while input_symptom:
        
        symptom = input("Please list your symptom: ")
        user_symptoms += [symptom]
        input_symptom = input("do you want to input another symptom? y/n").lower() == 'y'
        
    
    # now go through each disease
    inference = VariableElimination(network)
    possible_diseases = []
    for symptom in user_symptoms:
        ps = list(network.predecessors(symptom))
        possible_diseases += [parent for parent in ps]
        
        
    
    # here we build a dictionary of evidence 
    evidence = {}
    for symptom in user_symptoms:
        evidence[symptom] = 1
        
    map_query_result = inference.map_query(variables=possible_diseases, evidence=evidence)
    
    
    disease_probabilities = {}
    for disease in map_query_result.keys():
        if disease in network.nodes():
            disease_probabilities[disease] = inference.query(variables=[disease], evidence=evidence).values

    most_probable_disease = max(disease_probabilities, key=lambda disease: disease_probabilities[disease][1])
    
    most_probable_disease = most_probable_disease.strip("disease")
    
    
    
    print("The most probable disease is you have is: " +  most_probable_disease)
    



In [15]:
run_bot(network)

Welcome to DoctorBot!
['Asthmadisease', 'Coughdisease', 'Migraine Disordersdisease', 'Paindisease', 'Headachedisease']


0it [00:00, ?it/s]

0it [00:00, ?it/s]

{'Coughdisease': 0, 'Asthmadisease': 0, 'Paindisease': 0, 'Migraine Disordersdisease': 0, 'Headachedisease': 0}
The most probable disease is you have is: Migraine Disorder
