# Bayesian network in Python using pgmpy
- https://www.vtupulse.com/machine-learning/bayesian-network-in-python-using-pgmpy/ (요 파일로 확인하기)
- https://pgmpy.org/index.html
<img src='https://images.unsplash.com/photo-1619615392944-7540c434f401?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxzZWFyY2h8NXx8ZWFzeXxlbnwwfHwwfHw%3D&auto=format&fit=crop&w=500&q=60'>

In [1]:
# Numeric, tabular and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell # show the output of every statement
# 'all' makes each top-level expression in a cell render its value, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
import warnings
# NOTE(review): this silences every warning, including library deprecation notices.
warnings.filterwarnings('ignore')

### Obtain data 
https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset

In [2]:
# Load the heart-disease CSV, then preview its first rows and its (rows, columns) shape.
heart_csv = '../data/heart.csv'
heartDisease = pd.read_csv(filepath_or_buffer=heart_csv)
heartDisease.head(n=5)
heartDisease.shape

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


(1025, 14)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
heartDisease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [4]:
# Turn the literal '?' placeholder into NaN wherever it occurs in the frame.
heartDisease = heartDisease.replace(to_replace='?', value=np.nan)

In [5]:
# Requires the third-party `pgmpy` package (install once with: pip install pgmpy).
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# `BayesianModel` is the historical class name; later pgmpy releases renamed it
# (first to `BayesianNetwork`, then to `DiscreteBayesianNetwork`).  Try the
# original name first so older installs behave exactly as before, and fall back
# to the newer names while keeping the `BayesianModel` alias used by the rest
# of the notebook.
try:
    from pgmpy.models import BayesianModel
except ImportError:
    try:
        from pgmpy.models import BayesianNetwork as BayesianModel
    except ImportError:
        from pgmpy.models import DiscreteBayesianNetwork as BayesianModel

In [6]:
# Give two columns more descriptive names; every other column keeps its name.
# Rebinding the result (instead of inplace=True) keeps the cell free of
# in-place mutation of a frame that an earlier cell already displayed.
column_aliases = {'sex': 'gender', 'target': 'heartdisease'}
heartDisease = heartDisease.rename(columns=column_aliases)
heartDisease.head()

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heartdisease
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [7]:
# Network structure: four observed factors point into 'heartdisease',
# and 'heartdisease' in turn points to two further measurements.
parents_of_target = ['age', 'gender', 'exang', 'cp']
children_of_target = ['restecg', 'chol']
edges = [(parent, 'heartdisease') for parent in parents_of_target]
edges += [('heartdisease', child) for child in children_of_target]
model = BayesianModel(edges)

In [8]:
# Estimate every node's conditional probability table from the data frame.
fit_banner = '\nLearning CPD using Maximum likelihood estimators'
print(fit_banner)
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)


Learning CPD using Maximum likelihood estimators


In [9]:
# Wrap the fitted network in a variable-elimination inference engine.
infer_banner = '\n Inferencing with Bayesian Network:'
print(infer_banner)
HeartDiseasetest_infer = VariableElimination(model)


 Inferencing with Bayesian Network:


In [10]:
# Query 1: distribution of 'heartdisease' after observing restecg == 1.
print('\n 1. Probability of HeartDisease given evidence= restecg')
restecg_evidence = {'restecg': 1}
q1 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence=restecg_evidence)
print(q1)

# Query 2: distribution of 'heartdisease' after observing cp == 2.
print('\n 2. Probability of HeartDisease given evidence= cp ')
cp_evidence = {'cp': 2}
q2 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence=cp_evidence)
print(q2)


 1. Probability of HeartDisease given evidence= restecg
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.4354 |
+-----------------+---------------------+
| heartdisease(1) |              0.5646 |
+-----------------+---------------------+

 2. Probability of HeartDisease given evidence= cp 
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.3832 |
+-----------------+---------------------+
| heartdisease(1) |              0.6168 |
+-----------------+---------------------+


## 다른 데이터로 Inference
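- 데이터 출처는 찾지 못했음
- 첫 번째 데이터와 달리 `heartdisease`가 0~4의 다중 클래스이며, `ca`·`thal` 열은 object 타입으로 읽힘 (아래 `info()` 및 추론 결과 참고)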

In [14]:
# Load the second CSV, then preview its first rows and its (rows, columns) shape.
# Note: this rebinds `heartDisease`, replacing the frame used in the cells above.
second_csv = '../data/7-dataset.csv'
heartDisease = pd.read_csv(filepath_or_buffer=second_csv)
heartDisease.head(n=5)
heartDisease.shape

Unnamed: 0,age,gender,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heartdisease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


(303, 14)

In [15]:
# Print the column names, non-null counts and dtypes of the second frame.
heartDisease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   gender        303 non-null    int64  
 2   cp            303 non-null    int64  
 3   trestbps      303 non-null    int64  
 4   chol          303 non-null    int64  
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalach       303 non-null    int64  
 8   exang         303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slope         303 non-null    int64  
 11  ca            303 non-null    object 
 12  thal          303 non-null    object 
 13  heartdisease  303 non-null    int64  
dtypes: float64(1), int64(11), object(2)
memory usage: 33.3+ KB


In [16]:
# Turn the literal '?' placeholder into NaN wherever it occurs in the frame.
heartDisease = heartDisease.replace(to_replace='?', value=np.nan)

In [17]:
# Same network structure as before: four observed factors point into
# 'heartdisease', which in turn points to two further measurements.
parents_of_target = ['age', 'gender', 'exang', 'cp']
children_of_target = ['restecg', 'chol']
edges = [(parent, 'heartdisease') for parent in parents_of_target]
edges += [('heartdisease', child) for child in children_of_target]
model = BayesianModel(edges)

# Learn the CPDs with the maximum likelihood estimator.
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

# Build the inference engine on the freshly fitted network.
HeartDiseasetest_infer = VariableElimination(model)

In [18]:
# Query 1: distribution of 'heartdisease' after observing restecg == 1.
print('\n 1. Probability of HeartDisease given evidence= restecg')
restecg_evidence = {'restecg': 1}
q1 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence=restecg_evidence)
print(q1)

# Query 2: distribution of 'heartdisease' after observing cp == 2.
print('\n 2. Probability of HeartDisease given evidence= cp ')
cp_evidence = {'cp': 2}
q2 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence=cp_evidence)
print(q2)


 1. Probability of HeartDisease given evidence= restecg
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.1012 |
+-----------------+---------------------+
| heartdisease(1) |              0.0000 |
+-----------------+---------------------+
| heartdisease(2) |              0.2392 |
+-----------------+---------------------+
| heartdisease(3) |              0.2015 |
+-----------------+---------------------+
| heartdisease(4) |              0.4581 |
+-----------------+---------------------+

 2. Probability of HeartDisease given evidence= cp 
+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.3610 |
+-----------------+---------------------+
| heartdisease(1) |              0.2159 |
+-----------------+---------------------+
| heartdisease(2) |              0.1373 |
+-----------------+---------------------+
| heartdisease(3) |              0.1537 |
+-------

### simple example

In [18]:
# Toy two-node network X -> Z fitted on eight hand-written samples.
data = {
    'X': [0, 0, 0, 0, 1, 1, 1, 1],
    'Z': [0, 0, 0, 1, 0, 1, 1, 1],
}
data_xz = pd.DataFrame(data)
data_xz  # rendered mid-cell because ast_node_interactivity is 'all'

edge_xz = ('X', 'Z')
model_xz = BayesianModel([edge_xz])
model_xz.fit(data_xz, estimator=MaximumLikelihoodEstimator)

# Show the learned conditional table of Z given X.
cpd_z = model_xz.get_cpds('Z')
print(cpd_z)

Unnamed: 0,X,Z
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0
5,1,1
6,1,1
7,1,1


+------+------+------+
| X    | X(0) | X(1) |
+------+------+------+
| Z(0) | 0.75 | 0.25 |
+------+------+------+
| Z(1) | 0.25 | 0.75 |
+------+------+------+


# End