In [2]:
!pip install pymc3

Collecting pymc3
  Downloading pymc3-3.11.6-py3-none-any.whl.metadata (15 kB)
Collecting arviz>=0.11.0 (from pymc3)
  Downloading arviz-0.18.0-py3-none-any.whl.metadata (8.7 kB)
Collecting deprecat (from pymc3)
  Downloading deprecat-2.1.1-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting fastprogress>=0.2.0 (from pymc3)
  Downloading fastprogress-1.0.3-py3-none-any.whl.metadata (5.6 kB)
Collecting numpy<1.22.2,>=1.15.0 (from pymc3)
  Downloading numpy-1.22.1.zip (11.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
INFO: pip is looking at multiple versions of pymc3 to determine which version is compatible with other requirements. This could take a while.
Collecting pymc3
  Downloading pymc3-3.11.5-py3-none-any.whl.metadata (

In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('medical.csv')

# Encode categorical variables
label_encoders = {}
for column in ['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Gender', 'Blood Pressure', 'Cholesterol Level', 'Outcome Variable']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

data.head()


Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,56,1,0,1,1,19,0,1,2,1
1,24,0,1,1,0,25,0,2,2,0
2,37,0,1,1,0,25,0,2,2,0
3,6,1,1,0,1,25,1,2,2,1
4,6,1,1,0,1,25,1,2,2,1


In [26]:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Define the structure of the Bayesian Network
model = BayesianNetwork([
    ('Fever', 'Disease'),
    ('Cough', 'Disease'),
    ('Fatigue', 'Disease'),
    ('Difficulty Breathing', 'Disease'),
    ('Age', 'Disease'),
    ('Gender', 'Disease'),
    ('Blood Pressure', 'Disease'),
    ('Cholesterol Level', 'Disease'),
    ('Disease', 'Outcome Variable')
])

# Parameterize the network
cpd_fever = TabularCPD(variable='Fever', variable_card=2, values=[[0.5], [0.5]])
cpd_cough = TabularCPD(variable='Cough', variable_card=2, values=[[0.5], [0.5]])
cpd_fatigue = TabularCPD(variable='Fatigue', variable_card=2, values=[[0.5], [0.5]])
cpd_difficulty_breathing = TabularCPD(variable='Difficulty Breathing', variable_card=2, values=[[0.5], [0.5]])
cpd_age = TabularCPD(variable='Age', variable_card=11, values=[[1/11], [1/11], [1/11], [1/11], [1/11], [1/11], [1/11], [1/11], [1/11], [1/11], [1/11]])
cpd_gender = TabularCPD(variable='Gender', variable_card=2, values=[[0.5], [0.5]])
cpd_blood_pressure = TabularCPD(variable='Blood Pressure', variable_card=3, values=[[1/3], [1/3], [1/3]])
cpd_cholesterol_level = TabularCPD(variable='Cholesterol Level', variable_card=3, values=[[1/3], [1/3], [1/3]])


# Define the CPD for Disease based on all parent nodes
# This should be learned from the data. For simplicity, assume equal probabilities here
# Define the CPD for Disease based on all parent nodes
# Define the CPD for Disease based on all parent nodes
cpd_disease = TabularCPD(
    variable='Disease', variable_card=10,
    values=[[1/10]*3168] * 10,
    evidence=['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level'],
    evidence_card=[2, 2, 2, 2, 11, 2, 3, 3]  # Adjust with actual cardinalities
)

# Define the CPD for Outcome Variable based on Disease
cpd_outcome = TabularCPD(
    variable='Outcome Variable', variable_card=2,
    values=[[0.5]*10, [0.5]*10],
    evidence=['Disease'],
    evidence_card=[10]  # Adjust with the actual cardinality of 'Disease'
)


# Add CPDs to the model
model.add_cpds(cpd_fever, cpd_cough, cpd_fatigue, cpd_difficulty_breathing, cpd_age, cpd_gender, cpd_blood_pressure, cpd_cholesterol_level, cpd_disease, cpd_outcome)

# Check if the model is valid
model.check_model()

# Perform exact inference
infer = VariableElimination(model)
posterior_prob = infer.query(['Outcome Variable'], evidence={'Fever': 1, 'Cough': 1})
print(posterior_prob)


+---------------------+-------------------------+
| Outcome Variable    |   phi(Outcome Variable) |
| Outcome Variable(0) |                  0.5000 |
+---------------------+-------------------------+
| Outcome Variable(1) |                  0.5000 |
+---------------------+-------------------------+


In [29]:
import matplotlib.pyplot as plt

# Perform exact inference
infer = VariableElimination(model)

# Predict diseases for each instance in the dataset
predicted_diseases = []
for index, instance in data.iterrows():
    evidence = instance.to_dict()
    del evidence['Outcome Variable']  # Remove Outcome Variable from evidence if it exists in your dataset
    
    # Perform inference to get posterior probabilities of 'Disease'
    posterior_prob = infer.query(['Disease'], evidence=evidence)
    
    # Get the predicted disease (highest probability)
    predicted_disease = posterior_prob.values.argmax()
    predicted_diseases.append(predicted_disease)

# Add predicted diseases back to the dataframe for visualization
data['Predicted Disease'] = predicted_diseases

# Visualize the predicted diseases
plt.figure(figsize=(10, 6))
data['Disease'].value_counts().plot(kind='bar', color='blue', alpha=0.5, label='Actual Diseases')
data['Predicted Disease'].value_counts().plot(kind='bar', color='red', alpha=0.5, label='Predicted Diseases')
plt.title('Actual vs Predicted Diseases')
plt.xlabel('Disease Index')
plt.ylabel('Count')
plt.legend()
plt.show()


ValueError: Can't have the same variables in both `variables` and `evidence`. Found in both: {'Disease'}