package aima.core.probability.mdp.search;

import java.util.Map;
import java.util.Set;

import aima.core.agent.Action;
import aima.core.probability.mdp.MarkovDecisionProcess;
import aima.core.util.Util;
/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 653.<br>
 * <br>
 * 
 * <pre>
 * function VALUE-ITERATION(mdp, ε) returns a utility function
 *   inputs: mdp, an MDP with states S, actions A(s), transition model P(s' | s, a),
 *             rewards R(s), discount γ
 *           ε, the maximum error allowed in the utility of any state
 *   local variables: U, U', vectors of utilities for states in S, initially zero
 *                    δ, the maximum change in the utility of any state in an iteration
 * 
 *   repeat
 *       U <- U'; δ <- 0
 *       for each state s in S do
 *           U'[s] <- R(s) + γ max<sub>a ∈ A(s)</sub> Σ<sub>s'</sub>P(s' | s, a) U[s']
 *           if |U'[s] - U[s]| > δ then δ <- |U'[s] - U[s]|
 *   until δ < ε(1 - γ)/γ
 *   return U
 * </pre>
 * 
 * Figure 17.4 The value iteration algorithm for calculating utilities of
 * states. The termination condition is from Equation (17.8):<br>
 * 
 * <pre>
 * if ||U<sub>i+1</sub> - U<sub>i</sub>|| < ε(1 - γ)/γ then ||U<sub>i+1</sub> - U|| < ε
 * </pre>
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class ValueIteration<S, A extends Action> {
    // discount γ to be used.
    private double gamma = 0;

    /**
     * Constructor.
     * 
     * @param gamma
     *            the discount γ to be used; must satisfy 0 &lt; γ &lt;= 1.
     */
    public ValueIteration(double gamma) {
        if (gamma > 1.0 || gamma <= 0.0) {
            throw new IllegalArgumentException("Gamma must be > 0 and <= 1.0");
        }
        this.gamma = gamma;
    }
    // function VALUE-ITERATION(mdp, ε) returns a utility function
    /**
     * The value iteration algorithm for calculating the utility of states.
     * 
     * @param mdp
     *            an MDP with states S, actions A(s), <br>
     *            transition model P(s' | s, a), rewards R(s)
     * @param epsilon
     *            the maximum error allowed in the utility of any state
     * @return a vector of utilities for states in S
     */
    public Map<S, Double> valueIteration(MarkovDecisionProcess<S, A> mdp,
            double epsilon) {
        //
        // local variables: U, U', vectors of utilities for states in S,
        // initially zero
        Map<S, Double> U = Util.create(mdp.states(), 0.0);
        Map<S, Double> Udelta = Util.create(mdp.states(), 0.0);
        // δ the maximum change in the utility of any state in an iteration
        double delta = 0;
        // Note: Just calculate this once for efficiency purposes:
        // ε(1 - γ)/γ
        double minDelta = epsilon * (1 - gamma) / gamma;
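        // Note: if gamma == 1.0 (which the constructor permits), minDelta is
        // 0, so the loop below only terminates once the utilities stop
        // changing entirely between iterations.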
        // repeat
        do {
            // U <- U'; δ <- 0
            U.putAll(Udelta);
            delta = 0;
            // for each state s in S do
            for (S s : mdp.states()) {
                // max<sub>a ∈ A(s)</sub>
                Set<A> actions = mdp.actions(s);
                // Handle terminal states (i.e. no actions).
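                // With no actions, aMax stays 0 and γ·aMax contributes
                // nothing, so a terminal state's utility is simply its
                // reward R(s).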
                double aMax = 0;
                if (actions.size() > 0) {
                    aMax = Double.NEGATIVE_INFINITY;
                }
                for (A a : actions) {
                    // Σ<sub>s'</sub>P(s' | s, a) U[s']
                    double aSum = 0;
                    for (S sDelta : mdp.states()) {
                        aSum += mdp.transitionProbability(sDelta, s, a)
                                * U.get(sDelta);
                    }
                    if (aSum > aMax) {
                        aMax = aSum;
                    }
                }
                // U'[s] <- R(s) + γ max<sub>a ∈ A(s)</sub> ...
                Udelta.put(s, mdp.reward(s) + gamma * aMax);
                // if |U'[s] - U[s]| > δ then δ <- |U'[s] - U[s]|
                double aDiff = Math.abs(Udelta.get(s) - U.get(s));
                if (aDiff > delta) {
                    delta = aDiff;
                }
            }
            // until δ < ε(1 - γ)/γ
        } while (delta > minDelta);
        // return U
        return U;
    }
}
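
/*
 * Usage sketch. The types MyState and MyAction below are hypothetical
 * placeholders (they are not part of this library); substitute any concrete
 * MarkovDecisionProcess implementation and its state/action types.
 *
 *   MarkovDecisionProcess<MyState, MyAction> mdp = ...; // your model
 *   ValueIteration<MyState, MyAction> vi = new ValueIteration<>(0.9);
 *   Map<MyState, Double> utilities = vi.valueIteration(mdp, 1e-4);
 *
 * The returned map holds one utility estimate per state; by the termination
 * bound from Equation (17.8), each estimate is within epsilon of the true
 * utility.
 */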