In [7]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import HillClimbSearch, BayesianEstimator
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import VariableElimination

In [9]:
df = pd.read_csv('../data/heart_disease.csv')

In [11]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [13]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [15]:
target = 'target'

In [17]:
# Encode any categorical variables (if needed)
df_encoded = df.copy()
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df_encoded[col] = LabelEncoder().fit_transform(df[col].astype(str))

In [19]:
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
df_encoded[continuous_cols] = discretizer.fit_transform(df_encoded[continuous_cols])
df_encoded = df_encoded.astype(int)
df_encoded.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,1,1,0,1,0,0,1,2,0,1,2,2,3,0
1,1,1,0,2,0,1,0,1,1,2,0,0,3,0
2,2,1,0,2,0,0,1,0,1,2,0,0,3,0
3,2,1,0,2,0,0,1,1,0,0,2,1,3,0
4,2,0,0,2,2,1,1,0,0,2,1,3,2,0


In [27]:
hc = HillClimbSearch(df_encoded)
best_model = hc.estimate()
model = DiscreteBayesianNetwork(best_model.edges())
model.fit(df_encoded, estimator=BayesianEstimator, prior_type='BDeu')

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'sex': 'N', 'cp': 'N', 'trestbps': 'N', 'chol': 'N', 'fbs': 'N', 'restecg': 'N', 'thalach': 'N', 'exang': 'N', 'oldpeak': 'N', 'slope': 'N', 'ca': 'N', 'thal': 'N', 'target': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'sex': 'N', 'cp': 'N', 'trestbps': 'N', 'chol': 'N', 'fbs': 'N', 'restecg': 'N', 'thalach': 'N', 'exang': 'N', 'oldpeak': 'N', 'slope': 'N', 'ca': 'N', 'thal': 'N', 'target': 'N'}
INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'sex': 'N', 'cp': 'N', 'trestbps': 'N', 'chol': 'N', 'fbs': 'N', 'restecg': 'N', 'thalach': 'N', 'exang': 'N', 'oldpeak': 'N', 'slope': 'N', 'ca': 'N', 'thal': 'N', 'target': 'N'}


  0%|          | 0/1000000 [00:00<?, ?it/s]

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'age': 'N', 'sex': 'N', 'cp': 'N', 'trestbps': 'N', 'chol': 'N', 'fbs': 'N', 'restecg': 'N', 'thalach': 'N', 'exang': 'N', 'oldpeak': 'N', 'slope': 'N', 'ca': 'N', 'thal': 'N', 'target': 'N'}


<pgmpy.models.DiscreteBayesianNetwork.DiscreteBayesianNetwork at 0x1eaf43370e0>

In [23]:
sampler = BayesianModelSampling(model)
samples = sampler.forward_sample(size=1000)
samples.head()

  0%|          | 0/14 [00:00<?, ?it/s]



Unnamed: 0,age,trestbps,fbs,sex,ca,cp,exang,chol,thalach,target,restecg,oldpeak,slope,thal
0,0,0,0,1,0,2,0,2,2,1,1,0,2,3
1,1,0,0,0,0,2,0,0,0,1,1,0,2,2
2,2,1,0,1,0,0,1,2,0,0,0,2,0,3
3,2,1,0,0,0,1,0,1,2,1,1,1,1,2
4,1,2,0,1,1,0,0,1,0,0,1,2,1,3


In [25]:
query_result = sampler.rejection_sample(
    evidence={'chol': 2, 'oldpeak': 2}, size=5000
)
# Calculate probability of target = 1 in sampled data
p_target_1 = (query_result['target'] == 1).mean()
print(f"Estimated P(target=1 | chol=2, oldpeak=2): {p_target_1:.4f}")

  0%|          | 0/5000 [00:00<?, ?it/s]



ValueError: too many values to unpack (expected 2)