In [2]:
import pandas as pd
import seaborn as sns
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination




# Load Data

In [3]:
# Load the Titanic dataset into the notebook-global `data` frame
data = sns.load_dataset('titanic')

# Preview the first rows, then a per-column summary
print(data.head())
print("\n ############ \n")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
data.info()


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

 ############ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   surviv

# Preprocessing the data

In [4]:

# Select relevant columns and drop rows with missing values.
# The explicit .copy() makes the result an independent frame, so the
# column assignments below cannot trigger SettingWithCopyWarning.
# NOTE: this cell overwrites `data`; re-run the load cell before
# re-running it, otherwise the maps below see already-encoded values
# and produce NaN.
data = data[['survived', 'pclass', 'sex', 'age', 'fare', 'embarked']].dropna().copy()

# Encode categorical variables as integer codes
# (values missing from a mapping would become NaN)
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
data['embarked'] = data['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Discretize 'age' and 'fare' into bins.
# An integer `bins` makes pd.cut build equal-width intervals over the
# observed range of the column.
# NOTE(review): for a right-skewed column (fare looks like one) equal-width
# bins put most rows in bin 0; pd.qcut would give equal-frequency bins.
# Confirm which is intended.
data['age'] = pd.cut(data['age'], bins=3, labels=[0, 1, 2])  # Age groups: young, middle, older
data['fare'] = pd.cut(data['fare'], bins=3, labels=[0, 1, 2])  # Fare groups: low, medium, high




# Structure learning

BIC (Bayesian Information Criterion):

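The BIC score evaluates how well a Bayesian Network structure fits the data while penalizing more complex structures (i.e., those with more edges and parameters) to prevent overfitting. Note that pgmpy's `BicScore` is computed as the log-likelihood minus the complexity penalty, so higher (less negative) scores indicate a better model. This is the opposite sign convention from the textbook BIC, where lower values are better.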

Hill climbing tries different possible structures, adjusting edges to maximize the BIC score iteratively until it converges on a local maximum (i.e., the best structure it can find).

Output: best_model will store the estimated structure, which includes nodes (variables) and edges (dependencies between variables) that form the Bayesian Network.

In [7]:
# Structure learning: hill-climbing search over network structures,
# scored with BIC computed on the preprocessed data.
bic_scorer = BicScore(data)
hc = HillClimbSearch(data)
best_model = hc.estimate(scoring_method=bic_scorer)

# Show the directed edges of the structure the search settled on
print("Learned structure (edges):")
learned_edges = best_model.edges()
print(learned_edges)




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000000.0), HTML(value='')))


Learned structure (edges):
[('survived', 'sex'), ('pclass', 'survived'), ('pclass', 'embarked'), ('pclass', 'age'), ('pclass', 'sex'), ('fare', 'pclass')]


In [11]:
# Parameter learning: build a network from the learned edges and fit
# its conditional probability tables by maximum likelihood.
model = BayesianNetwork(best_model.edges())
model.fit(
    data,
    estimator=MaximumLikelihoodEstimator,
)

# Print the learned CPDs, one table per node
print("\nLearned CPDs:")
fitted_cpds = model.get_cpds()
for cpd in fitted_cpds:
    print(cpd)



Learned CPDs:
+-------------+---------------------+--------------------+---------------------+
| pclass      | pclass(1)           | pclass(2)          | pclass(3)           |
+-------------+---------------------+--------------------+---------------------+
| survived(0) | 0.34782608695652173 | 0.5202312138728323 | 0.7605633802816901  |
+-------------+---------------------+--------------------+---------------------+
| survived(1) | 0.6521739130434783  | 0.4797687861271676 | 0.23943661971830985 |
+-------------+---------------------+--------------------+---------------------+
+----------+-------------+-----+--------------------+
| pclass   | pclass(1)   | ... | pclass(3)          |
+----------+-------------+-----+--------------------+
| survived | survived(0) | ... | survived(1)        |
+----------+-------------+-----+--------------------+
| sex(0)   | 0.953125    | ... | 0.4470588235294118 |
+----------+-------------+-----+--------------------+
| sex(1)   | 0.046875    | ... | 0.55294

In [26]:
# Perform inference: query P(survived | evidence) for two passengers.
inference = VariableElimination(model)

# Each scenario pairs the evidence passed to the query with the label
# printed above its result.
scenarios = (
    ({'pclass': 1, 'sex': 1}, "\nProbability of survival given pclass=1 and sex=female:"),  # Rose
    ({'pclass': 3, 'sex': 0}, "\nProbability of survival given pclass=3 and sex=male:"),  # Jack
)

for evidence, label in scenarios:
    # Query first, then print, matching the original output order
    query_result = inference.query(variables=['survived'], evidence=evidence)
    print(label)
    print(query_result)


Probability of survival given pclass=1 and sex=female:
+-------------+-----------------+
| survived    |   phi(survived) |
| survived(0) |          0.0361 |
+-------------+-----------------+
| survived(1) |          0.9639 |
+-------------+-----------------+

Probability of survival given pclass=3 and sex=male:
+-------------+-----------------+
| survived    |   phi(survived) |
| survived(0) |          0.8498 |
+-------------+-----------------+
| survived(1) |          0.1502 |
+-------------+-----------------+


In [30]:
# Perform inference on two further scenarios.
# Encodings come from the preprocessing cell: sex 0 = male, 1 = female;
# age 0 = the lowest of the three age bins.
inference = VariableElimination(model)

# Evidence: lowest age bin and male. The printed label mirrors the
# evidence actually passed to the query.
query_result = inference.query(variables=['survived'], evidence={'age': 0, 'sex': 0})
print("\nProbability of survival given age=0 (youngest bin) and sex=male:")
print(query_result)

# Evidence: female only (no class or age supplied)
query_result = inference.query(variables=['survived'], evidence={'sex': 1})
print("\nProbability of survival given sex=female:")
print(query_result)


Probability of survival given pclass=1 and sex=female:
+-------------+-----------------+
| survived    |   phi(survived) |
| survived(0) |          0.8214 |
+-------------+-----------------+
| survived(1) |          0.1786 |
+-------------+-----------------+

Probability of survival given pclass=3 and sex=male:
+-------------+-----------------+
| survived    |   phi(survived) |
| survived(0) |          0.2471 |
+-------------+-----------------+
| survived(1) |          0.7529 |
+-------------+-----------------+
