In [1]:
from pgmpy.models.BayesianModel import BayesianModel
from pgmpy.inference import VariableElimination
from pgmpy.factors.discrete import TabularCPD

In [2]:
restaurant=BayesianModel([('location','cost'),('quality','cost'),('cost','no_of_people'),('location','no_of_people')])

In [3]:
cpd_location=TabularCPD('location',2,[[0.6,0.4]])

In [4]:
cpd_quality=TabularCPD('quality',3,[[0.3,0.5,0.2]])

In [5]:
cpd_cost=TabularCPD('cost',2,[[0.8, 0.6, 0.1, 0.6,0.6, 0.05],[0.2, 0.4, 0.9, 0.4, 0.4, 0.95]],['location','quality'],[2,3])

In [6]:
cpd_no_of_people=TabularCPD('no_of_people',2,[[0.6, 0.8, 0.1, 0.6],[0.4, 0.2, 0.9, 0.4]],['cost','location'],[2,2])

In [7]:
restaurant.add_cpds(cpd_location, cpd_quality, cpd_cost, cpd_no_of_people)

In [8]:
restaurant_inference=VariableElimination(restaurant)

In [9]:
restaurant_inference.query(variables=['location'])

  phi.values = phi.values[slice_]


{'location': <DiscreteFactor representing phi(location:2) at 0x1e123302388>}

In [10]:
restaurant_inference.query(variables=['location','no_of_people'])

  phi1.values = phi1.values[slice_]


{'location': <DiscreteFactor representing phi(location:2) at 0x1e1232f1508>,
 'no_of_people': <DiscreteFactor representing phi(no_of_people:2) at 0x1e1232ecac8>}

In [11]:
restaurant_inference.query(variables=['no_of_people'],elimination_order=['location','cost','quality'])

{'no_of_people': <DiscreteFactor representing phi(no_of_people:2) at 0x1e1232e1a48>}

In [12]:
restaurant_inference.query(variables=['no_of_people'], evidence={'location':1})

{'no_of_people': <DiscreteFactor representing phi(no_of_people:2) at 0x1e1232d99c8>}

In [13]:
restaurant_inference.query(variables=['no_of_people'],evidence={'location':1 , 'quality':1})

{'no_of_people': <DiscreteFactor representing phi(no_of_people:2) at 0x1e12331cd48>}

In [14]:
induced_graph=restaurant_inference.induced_graph(['cost','location','no_of_people','quality'])

In [15]:
induced_graph.nodes()

['location', 'quality', 'no_of_people', 'cost']

In [16]:
induced_graph.edges()

[('location', 'quality'),
 ('location', 'no_of_people'),
 ('location', 'cost'),
 ('quality', 'no_of_people'),
 ('quality', 'cost'),
 ('no_of_people', 'cost')]

## belief propagation

In [17]:
from pgmpy.models import JunctionTree

In [18]:
junction_tree=JunctionTree()

In [19]:
junction_tree.add_nodes_from([('A','B','C'),('C','D')])

In [20]:
junction_tree.add_edge(('A','B','C'),('C','D'))

In [21]:
junction_tree.add_edge(('A','B','C'),('D','E','F'))
#원래 여기 에러남

ValueError: No sepset found between these two edges.

## constructing a clique tree

In [22]:
from pgmpy.models import BayesianModel, MarkovModel

In [23]:
from pgmpy.factors.discrete import TabularCPD, DiscreteFactor

In [24]:
model = BayesianModel([('rain','traffic_jam'),('accident','traffic_jam'),('traffic_jam','long_queues'),('traffic_jam','late_for_school'),('getting_up_late','late_for_school')])

In [25]:
cpd_rain = TabularCPD('rain', 2, [[0.4], [0.6]])

In [26]:
cpd_accident = TabularCPD('accident', 2, [[0.2], [0.8]])

In [27]:
cpd_traffic_jam = TabularCPD(
'traffic_jam', 2,
[[0.9, 0.6, 0.7, 0.1],
[0.1, 0.4, 0.3, 0.9]],
evidence=['rain', 'accident'],
evidence_card=[2, 2])

In [28]:
cpd_getting_up_late = TabularCPD('getting_up_late', 2,
[[0.6], [0.4]])

In [29]:
cpd_late_for_school = TabularCPD(
'late_for_school', 2,
[[0.9, 0.45, 0.8, 0.1],
[0.1, 0.55, 0.2, 0.9]],
evidence=['getting_up_late',
'traffic_jam'],
evidence_card=[2, 2])

In [30]:
cpd_long_queues = TabularCPD('long_queues', 2,
[[0.9, 0.2],
[0.1, 0.8]],
evidence=['traffic_jam'],
evidence_card=[2])

In [31]:
model.add_cpds(cpd_rain, cpd_accident,
cpd_traffic_jam, cpd_getting_up_late,
cpd_late_for_school, cpd_long_queues)

In [32]:
junction_tree_bm = model.to_junction_tree()

In [33]:
type(junction_tree_bm)

pgmpy.models.JunctionTree.JunctionTree

In [34]:
junction_tree_bm.nodes()

[('traffic_jam', 'accident', 'rain'),
 ('traffic_jam', 'late_for_school', 'getting_up_late'),
 ('traffic_jam', 'long_queues')]

In [35]:
junction_tree_bm.edges()

[(('traffic_jam', 'accident', 'rain'),
  ('traffic_jam', 'late_for_school', 'getting_up_late')),
 (('traffic_jam', 'accident', 'rain'), ('traffic_jam', 'long_queues'))]

## factor division

In [36]:
from pgmpy.factors.discrete import DiscreteFactor

In [37]:
phi1 = DiscreteFactor(['a', 'b'], [2, 3], range(6))

In [38]:
phi2 = DiscreteFactor(['b'], [3], range(3))

In [39]:
psi = phi1 / phi2

  phi1.values = phi1.values[slice_]
  phi.values = phi.values / phi1.values
  phi.values = phi.values / phi1.values


In [40]:
print(psi)

+-----+-----+------------+
| a   | b   |   phi(a,b) |
| a_0 | b_0 |     0.0000 |
+-----+-----+------------+
| a_0 | b_1 |     1.0000 |
+-----+-----+------------+
| a_0 | b_2 |     1.0000 |
+-----+-----+------------+
| a_1 | b_0 |   inf      |
+-----+-----+------------+
| a_1 | b_1 |     4.0000 |
+-----+-----+------------+
| a_1 | b_2 |     2.5000 |
+-----+-----+------------+


As this algorithm updates the belief of a cluster using the beliefs of its neighbors,  
we call it the belief update message passing algorithm. It is also known as the  
Lauritzen-Spiegelhalter algorithm.  

In pgmpy, this can be implemented as follows:  

In [41]:
from pgmpy.models import BayesianModel

In [42]:
from pgmpy.factors.discrete import TabularCPD, DiscreteFactor

In [43]:
from pgmpy.inference import BeliefPropagation

In [44]:
model = BayesianModel([('rain', 'traffic_jam'),('accident', 'traffic_jam'),('traffic_jam', 'long_queues'),('traffic_jam', 'late_for_school'),('getting_up_late', 'late_for_school')])

In [45]:
cpd_rain = TabularCPD('rain', 2, [[0.4], [0.6]])

In [46]:
cpd_accident = TabularCPD('accident', 2, [[0.2], [0.8]])

In [47]:
cpd_traffic_jam = TabularCPD('traffic_jam', 2, [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]], evidence=['rain', 'accident'],evidence_card=[2, 2])

In [48]:
cpd_getting_up_late = TabularCPD('getting_up_late', 2,[[0.6], [0.4]])

In [49]:
cpd_late_for_school = TabularCPD( 'late_for_school', 2,[[0.9, 0.45, 0.8, 0.1],[0.1, 0.55, 0.2, 0.9]],evidence=['getting_up_late','traffic_jam'],evidence_card=[2, 2])

In [50]:
cpd_long_queues = TabularCPD('long_queues', 2,
[[0.9, 0.2],
[0.1, 0.8]],
evidence=['traffic_jam'],
evidence_card=[2])

In [51]:
model.add_cpds(cpd_rain, cpd_accident,
cpd_traffic_jam, cpd_getting_up_late,
cpd_late_for_school, cpd_long_queues)

In [52]:
belief_propagation = BeliefPropagation(model)

In [53]:
# To calibrate the clique tree, use calibrate() method
belief_propagation.calibrate()

In [54]:
belief_propagation.get_clique_beliefs()

{('traffic_jam',
  'accident',
  'rain'): <DiscreteFactor representing phi(traffic_jam:2, accident:2, rain:2) at 0x1e12341be48>,
 ('traffic_jam',
  'late_for_school',
  'getting_up_late'): <DiscreteFactor representing phi(traffic_jam:2, late_for_school:2, getting_up_late:2) at 0x1e12343da48>,
 ('traffic_jam',
  'long_queues'): <DiscreteFactor representing phi(traffic_jam:2, long_queues:2) at 0x1e12343d208>}

In [55]:
 belief_propagation.get_sepset_beliefs()

{frozenset({('traffic_jam', 'accident', 'rain'),
            ('traffic_jam',
             'late_for_school',
             'getting_up_late')}): <DiscreteFactor representing phi(traffic_jam:2) at 0x1e123439088>,
 frozenset({('traffic_jam', 'accident', 'rain'),
            ('traffic_jam',
             'long_queues')}): <DiscreteFactor representing phi(traffic_jam:2) at 0x1e1233231c8>}

In [56]:
belief_propagation.query(variables=['no_of_people'], evidence={'location': 1, 'quality': 1})
#why?

IndexError: tuple index out of range

## Factor maximization

In [57]:
from pgmpy.models import BayesianModel

In [58]:
from pgmpy.factors.discrete import TabularCPD

In [59]:
from pgmpy.inference import VariableElimination

In [60]:
model = BayesianModel([('rain', 'traffic_jam'),('accident', 'traffic_jam'),('traffic_jam', 'long_queues'),('traffic_jam', 'late_for_school'),('getting_up_late', 'late_for_school')])

In [61]:
cpd_rain = TabularCPD('rain', 2, [[0.4], [0.6]])

In [62]:
cpd_accident = TabularCPD('accident', 2, [[0.2], [0.8]])

In [63]:
cpd_traffic_jam = TabularCPD('traffic_jam', 2,[[0.9, 0.6, 0.7, 0.1],[0.1, 0.4, 0.3, 0.9]],evidence=['rain','accident'],evidence_card=[2, 2])

In [64]:
cpd_getting_up_late = TabularCPD('getting_up_late', 2,[[0.6], [0.4]])

In [65]:
cpd_late_for_school = TabularCPD('late_for_school', 2,[[0.9, 0.45, 0.8, 0.1],[0.1, 0.55, 0.2, 0.9]],evidence=['getting_up_late', 'traffic_jam'],evidence_card=[2, 2])

In [66]:
cpd_long_queues = TabularCPD('long_queues', 2,[[0.9, 0.2],[0.1, 0.8]],evidence=['traffic_jam'],evidence_card=[2])

In [67]:
model.add_cpds(cpd_rain, cpd_accident,cpd_traffic_jam, cpd_getting_up_late,cpd_late_for_school, cpd_long_queues)

In [68]:
# Calculating max marginals
model_inference = VariableElimination(model)

In [69]:
model_inference.max_marginal(variables=['late_for_school'])

0.5714285714285715

In [70]:
model_inference.max_marginal(variables=['late_for_school', 'traffic_jam'])

0.3850158442734269

In [71]:
model_inference.max_marginal(variables=['late_for_school'],evidence={'traffic_jam': 1})

0.5714285714285714

In [72]:
model_inference.max_marginal(variables=['late_for_school'],evidence={'traffic_jam': 1,'getting_up_late': 0})
# 책이랑 결과가 다른데?

0.55

In [73]:
model_inference.max_marginal(variables=['late_for_school','long_queues'],evidence={'traffic_jam': 1,'getting_up_late': 0})

0.44000000000000006

In [74]:
model_inference.m_marginal(variables=['late_for_school'],elimination_order=['traffic_jam','getting_up_late', 'rain','long_queues', 'accident'])

AttributeError: 'VariableElimination' object has no attribute 'm_marginal'

In [75]:
model_inference.max_marginal(
variables=['late_for_school'],
evidence={'accident': 1},
elimination_order=['traffic_jam', 'getting_up_late', 'rain', 'long_queues'])

0.5714285714285714

## Finding the most probable assignment

In [76]:
model_inference.map_query(variables=['late_for_school'])
#왜 책이랑 다르지?

{'late_for_school': 0}

In [77]:
model_inference.map_query(variables=['late_for_school','accident'])

{'late_for_school': 0, 'accident': 1}

In [78]:
model_inference.map_query(variables=['late_for_school'],
evidence={'accident': 1})

{'late_for_school': 1}

In [79]:
model_inference.map_query(variables=['late_for_school'],
evidence={'accident': 1,
'rain': 1})

{'late_for_school': 1}

In [80]:
model_inference.map_query(
variables=['late_for_school'],
elimination_order=['accident', 'rain',
'traffic_jam',
'getting_up_late',
'long_queues'])

{'late_for_school': 0}

In [81]:
model_inference.map_query(
variables=['late_for_school'],
evidence={'accident': 1},
elimination_order=['rain',
'traffic_jam',
'getting_up_late',
'long_queues'])

{'late_for_school': 1}

In [82]:
belief_propagation.map_query(['late_for_school'],
evidence={'accident': 1})

{'late_for_school': 1}

## Predictions from the model using pgmpy

In [83]:
import numpy as np

In [84]:
 from pgmpy.models import BayesianModel

In [85]:
data = np.random.randint(low=0, high=2, size=(1000, 4))

In [86]:
data

array([[1, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 1],
       ...,
       [1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 0, 1, 1]])

In [87]:
import pandas as pd

In [88]:
data = pd.DataFrame(data, columns=['cost', 'quality',
'location',
'no_of_people'])

In [89]:
data

Unnamed: 0,cost,quality,location,no_of_people
0,1,0,1,0
1,0,0,0,1
2,0,0,1,1
3,1,1,0,0
4,1,1,0,0
...,...,...,...,...
995,1,1,0,0
996,1,1,0,1
997,1,0,1,0
998,1,1,0,0


In [90]:
train = data[:750]

In [91]:
test = data[750:].drop('no_of_people', axis=1)

In [92]:
test

Unnamed: 0,cost,quality,location
750,0,0,0
751,0,0,1
752,1,1,1
753,1,0,1
754,0,0,1
...,...,...,...
995,1,1,0
996,1,1,0
997,1,0,1
998,1,1,0


In [93]:
restaurant_model = BayesianModel(
[('location', 'cost'),
('quality', 'cost'),
('location', 'no_of_people'),
('cost', 'no_of_people')])

In [94]:
restaurant_model.fit(train)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  states = sorted(list(self.data.ix[:, variable].dropna().unique()))
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  state_counts.ix[:, (state_counts == 0).all()] = 1
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  state_count_data = data.ix[:, variable].value_counts()


In [95]:
restaurant_model.get_cpds()

[<TabularCPD representing P(cost:2 | location:2, quality:2) at 0x1e12344fdc8>,
 <TabularCPD representing P(location:2) at 0x1e1234eaf48>,
 <TabularCPD representing P(no_of_people:2 | cost:2, location:2) at 0x1e121ccf188>,
 <TabularCPD representing P(quality:2) at 0x1e1234fa308>]

In [96]:
restaurant_model.predict(test).values.ravel()

array([0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)