PolicyIteration.java
package aima.core.probability.mdp.search;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.probability.mdp.Policy;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.impl.LookupPolicy;
import aima.core.util.Util;
/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 657.<br>
 * <br>
 *
 * <pre>
 * function POLICY-ITERATION(mdp) returns a policy
 *   inputs: mdp, an MDP with states S, actions A(s), transition model P(s'|s,a)
 *   local variables: U, a vector of utilities for states in S, initially zero
 *                    π, a policy vector indexed by state, initially random
 *
 *   repeat
 *      U <- POLICY-EVALUATION(π, U, mdp)
 *      unchanged? <- true
 *      for each state s in S do
 *          if max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s'] > Σ<sub>s'</sub>P(s'|s,π[s])U[s'] then do
 *             π[s] <- argmax<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
 *             unchanged? <- false
 *   until unchanged?
 *   return π
 * </pre>
 *
 * Figure 17.7 The policy iteration algorithm for calculating an optimal policy.
 *
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 *
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PolicyIteration<S, A extends Action> {
    private PolicyEvaluation<S, A> policyEvaluation = null;

    /**
     * Constructor.
     *
     * @param policyEvaluation
     *            the policy evaluation function to use.
     */
    public PolicyIteration(PolicyEvaluation<S, A> policyEvaluation) {
        this.policyEvaluation = policyEvaluation;
    }
    // function POLICY-ITERATION(mdp) returns a policy
    /**
     * The policy iteration algorithm for calculating an optimal policy.
     *
     * @param mdp
     *            an MDP with states S, actions A(s), transition model P(s'|s,a)
     * @return an optimal policy
     */
    public Policy<S, A> policyIteration(MarkovDecisionProcess<S, A> mdp) {
        // local variables: U, a vector of utilities for states in S,
        // initially zero
        Map<S, Double> U = Util.create(mdp.states(), 0.0);
        // π, a policy vector indexed by state, initially random
        Map<S, A> pi = initialPolicyVector(mdp);
        boolean unchanged;
        // repeat
        do {
            // U <- POLICY-EVALUATION(π, U, mdp)
            U = policyEvaluation.evaluate(pi, U, mdp);
            // unchanged? <- true
            unchanged = true;
            // for each state s in S do
            for (S s : mdp.states()) {
                // calculate:
                // max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
                double aMax = Double.NEGATIVE_INFINITY, piVal = 0;
                A aArgmax = pi.get(s);
                for (A a : mdp.actions(s)) {
                    double aSum = 0;
                    for (S sDelta : mdp.states()) {
                        aSum += mdp.transitionProbability(sDelta, s, a)
                                * U.get(sDelta);
                    }
                    if (aSum > aMax) {
                        aMax = aSum;
                        aArgmax = a;
                    }
                    // track:
                    // Σ<sub>s'</sub>P(s'|s,π[s])U[s']
                    if (a.equals(pi.get(s))) {
                        piVal = aSum;
                    }
                }
                // if max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
                //    > Σ<sub>s'</sub>P(s'|s,π[s])U[s'] then do
                if (aMax > piVal) {
                    // π[s] <- argmax<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s'|s,a)U[s']
                    pi.put(s, aArgmax);
                    // unchanged? <- false
                    unchanged = false;
                }
            }
            // until unchanged?
        } while (!unchanged);
        // return π
        return new LookupPolicy<S, A>(pi);
    }
    /**
     * Create a policy vector indexed by state, initially random.
     *
     * @param mdp
     *            an MDP with states S, actions A(s), transition model P(s'|s,a)
     * @return a policy vector indexed by state, initially random.
     */
    public static <S, A extends Action> Map<S, A> initialPolicyVector(
            MarkovDecisionProcess<S, A> mdp) {
        Map<S, A> pi = new LinkedHashMap<S, A>();
        List<A> actions = new ArrayList<A>();
        for (S s : mdp.states()) {
            actions.clear();
            actions.addAll(mdp.actions(s));
            // Handle terminal states (i.e. no actions).
            if (actions.size() > 0) {
                pi.put(s, Util.selectRandomlyFromList(actions));
            }
        }
        return pi;
    }
}
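
/*
 * Usage sketch (not part of the original file): one way to run the
 * policy iteration algorithm above on the 4x3 grid world of AIMA
 * Figure 17.1. The helper classes referenced here (CellWorldFactory,
 * MDPFactory, ModifiedPolicyEvaluation) are assumed to be the ones
 * shipped with aima-core; adapt the setup to your own
 * MarkovDecisionProcess implementation if they differ.
 *
 * CellWorld<Double> cw = CellWorldFactory.createCellWorldForFig17_1();
 * MarkovDecisionProcess<Cell<Double>, CellWorldAction> mdp =
 *         MDPFactory.createMDPForFigure17_3(cw);
 * // ModifiedPolicyEvaluation(k, gamma): performs k simplified Bellman
 * // update sweeps per evaluation step, with discount factor gamma.
 * PolicyIteration<Cell<Double>, CellWorldAction> pi =
 *         new PolicyIteration<Cell<Double>, CellWorldAction>(
 *                 new ModifiedPolicyEvaluation<Cell<Double>, CellWorldAction>(50, 1.0));
 * Policy<Cell<Double>, CellWorldAction> policy = pi.policyIteration(mdp);
 */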