Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
  • 3 commits
  • 3 files changed
  • 0 comments
  • 1 contributor
13  README
@@ -39,9 +39,20 @@ To use it:
39 39
 ------------------------------------------------------------------------------------
40 40
 chapter 4
41 41
 
42  
-module policy_eva.erl is an example for iterative policy evaluation.
  42
+module policy_eva.erl is an example of iterative policy evaluation.
43 43
 >policy_eva:start().
44 44
 >policy_eva:run(equi_prob).  %To evaluate the policy of equiprobably selecting actions
45 45
 >policy_eva:pause(). %To pause the ongoing evaluation
46 46
 >policy_eva:run(optimal). %To evaluate the optimal policy
47 47
 >policy_eva:stop(). %To stop the process.
  48
+
  49
+module car_rental.erl is an example of Policy Iteration
  50
+>car_rental:start().
  51
+>car_rental:run().  %To start the policy iteration process
  52
+>car_rental:print().  % To see the final policy and value functions
  53
+>car_rental:stop(). %To stop the state machine
  54
+
  55
+module gambler.erl is an example of Value Iteration
  56
+>gambler:start(). % to start the Value Iteration process
  57
+>gambler:print(). % to print out the final policy and value function
  58
+>gambler:stop(). %to stop the state machine
236  src/car_rental.erl
... ...
@@ -0,0 +1,236 @@
  1
+%% @author Barco You <barcojie@gmail.com>
  2
+%% This source is NOT limited by any license.
  3
+%%
  4
+%% This module simulates Example 4.2 and Figure 4.4
  5
+%% in the book: <Reinforcement Learning: An Introduction>
  6
+-module(car_rental).
  7
+-author('barcojie@gmail.com').
  8
+
  9
+-define(CREDIT, 10).
  10
+-define(COST, -2).
  11
+-define(LAMBDA1_OUT, 3).
  12
+-define(LAMBDA2_OUT, 4).
  13
+-define(LAMBDA1_IN, 3).
  14
+-define(LAMBDA2_IN, 2).
  15
+-define(LIMIT, 20).
  16
+-define(MAXMOV, 5).
  17
+-define(GAMA, 0.9).
  18
+-define(DELTA, 0.000001).
  19
+-define(MINPROB, 0.01).
  20
+
  21
+-behaviour(gen_fsm).
  22
+
  23
+-export([start/0,
  24
+		run/0,
  25
+		print/0,
  26
+		stop/0]).
  27
+-export([init/1,
  28
+		policy_eval/2,
  29
+		policy_impr/2,
  30
+		paused/2,
  31
+		handle_event/3,
  32
+		handle_sync_event/4,
  33
+		code_change/4,
  34
+		terminate/3,
  35
+		handle_info/3]).
  36
+-compile([export_all]).
  37
+-record(state, {values, policy}).
  38
+
  39
+
  40
+%%%
  41
+%%% API
  42
+%%% 
  43
%% @doc Start the car-rental state machine, registered locally under the
%% module name and linked to the calling process.
start() ->
    gen_fsm:start_link({local, ?MODULE}, ?MODULE, [], []).

%% @doc Send the asynchronous `start' event that begins policy iteration.
run() ->
    dispatch(start).

%% @doc Send the asynchronous `print' event (handled by the `paused' state).
print() ->
    dispatch(print).

%% @doc Send the asynchronous `stop' event (handled by the `paused' state).
stop() ->
    dispatch(stop).

%% Forward Event to the locally registered state machine.
dispatch(Event) ->
    gen_fsm:send_event(?MODULE, Event).
  54
+
  55
+%%% 
  56
+%%% Callbacks
  57
+%%% 
  58
%% gen_fsm callback: start in `policy_eval' with one zero value per state,
%% i.e. (?LIMIT + 1) * (?LIMIT + 1) entries; no policy is set yet.
init([]) ->
    StateCount = (?LIMIT + 1) * (?LIMIT + 1),
    {ok, policy_eval, #state{values = lists:duplicate(StateCount, 0)}}.
  60
+
  61
%% State `policy_eval': iterative evaluation of the current policy.
%%
%% `start'   - install the initial policy (action 0 in every state), run one
%%             backup sweep and schedule the next sweep through a 0 ms timeout.
%% `timeout' - run one more sweep; when the summed absolute change in the
%%             values drops below ?DELTA move on to `policy_impr' (after
%%             1000 ms), otherwise keep sweeping immediately.
policy_eval(start, Data) ->
    InitialPolicy = fun(_State) -> 0 end,
    Values = full_backup(InitialPolicy, Data#state.values),
    io:format("Values:~n~p", [Values]),
    {next_state, policy_eval, Data#state{values = Values, policy = InitialPolicy}, 0};
policy_eval(timeout, Data) ->
    Values = full_backup(Data#state.policy, Data#state.values),
    io:format("Evaluating Policy: -------------------------------------~n", []),
    Axis = lists:seq(0, ?LIMIT),
    lists:foreach(fun(S) -> print_policy(S, Data#state.policy) end,
                  [{X, Y} || X <- Axis, Y <- Axis]),
    io:format("Values:~n~p", [Values]),
    Updated = Data#state{values = Values},
    case delta(Values, Data#state.values) of
        Change when Change < ?DELTA ->
            {next_state, policy_impr, Updated, 1000};
        _ ->
            {next_state, policy_eval, Updated, 0}
    end.
  81
+
  82
%% State `policy_impr': greedy policy improvement.
%%
%% Builds a lookup table with the best action for every state under the
%% current values, wraps it in a fun, prints it, and compares it with the
%% previous policy. If nothing changed the machine parks in `paused';
%% otherwise it runs one backup under the new policy and returns to
%% `policy_eval'.
policy_impr(timeout, Data) ->
    Axis = lists:seq(0, ?LIMIT),
    States = [{X, Y} || X <- Axis, Y <- Axis],
    Best = [{S, max_action(S, Data#state.values)} || S <- States],
    Policy =
        fun(S) ->
            {_, Action} = lists:keyfind(S, 1, Best),
            Action
        end,
    io:format("New Policy: ^^^^^^^^^^^^^^^^^~n", []),
    lists:foreach(fun(S) -> print_policy(S, Policy) end, States),
    case is_policy_same(Policy, Data#state.policy) of
        false ->
            Values = full_backup(Policy, Data#state.values),
            {next_state, policy_eval, Data#state{values = Values, policy = Policy}, 0};
        true ->
            io:format("!!!!! Policy Stable !!!!!~n", []),
            {next_state, paused, Data}
    end.
  100
+
  101
%% State `paused': the machine is idle.
%% `print' dumps the stored policy and values; `stop' terminates normally.
%% No other event has a clause here.
paused(stop, Data) ->
    {stop, normal, Data};
paused(print, Data) ->
    io:format("Policy:~n", []),
    Axis = lists:seq(0, ?LIMIT),
    lists:foreach(fun(S) -> print_policy(S, Data#state.policy) end,
                  [{X, Y} || X <- Axis, Y <- Axis]),
    io:format("Values:~n~p", [Data#state.values]),
    {next_state, paused, Data}.
  108
+
  109
+
  110
+%%% 
  111
+%%% Internal Function
  112
%% One full sweep: recompute the value of every state {X, Y} under Policy,
%% always reading from the previous value list Old. The result is ordered
%% with X as the outer and Y as the inner index.
full_backup(Policy, Old) ->
    Axis = lists:seq(0, ?LIMIT),
    [q_pi({X, Y}, Policy, Old) || X <- Axis, Y <- Axis].
  114
+
  115
%% Expected return of State when following Policy for one step and then
%% using Values for the successor states.
q_pi(State, Policy, Values) ->
    reward(State, Policy(State), Values).
  118
+
  119
%% Expected return of taking Action in state {A, B}: the sum of
%% reward_event/7 over every combination of request counts (Ra, Rb) and
%% drop-off counts (Da, Db) produced by req_a/0, req_b/0, drop_a/0 and
%% drop_b/0.
%%
%% The four count lists do not depend on the loop variables, so they are
%% computed once up front instead of being re-evaluated inside the nested
%% generators. Element order, and therefore the floating-point sum, is
%% unchanged.
reward({A, B}, Action, Values) ->
    ReqA = req_a(),
    ReqB = req_b(),
    DropA = drop_a(),
    DropB = drop_b(),
    lists:sum([reward_event({A, B}, Action, Ra, Rb, Da, Db, Values) ||
                  Ra <- ReqA, Rb <- ReqB, Da <- DropA, Db <- DropB]).
  122
+
  123
%% Greedy action for State: score every action from action_set/1 with
%% reward/3 and return the first action whose score equals the maximum.
max_action(State, Values) ->
    Candidates = action_set(State),
    Scores = [reward(State, Act, Values) || Act <- Candidates],
    max_action(lists:max(Scores), Scores, Candidates).
  128
+
  129
%% Walk the score list and the action list in lockstep and return the action
%% paired with the first score that compares equal (==) to Max.
%% There is deliberately no clause for empty lists: Max must occur in the
%% score list.
max_action(Max, [Score | _], [Action | _]) when Max == Score ->
    Action;
max_action(Max, [_ | Scores], [_ | Actions]) ->
    max_action(Max, Scores, Actions).
  136
+
  137
%% Actions available in state {A, B}: an integer number of cars, bounded by
%% ?MAXMOV and by the number of cars on the site they are taken from.
%% transition/6 subtracts the action from A and adds it to B, so the lower
%% bound is limited by B and the upper bound by A.
%% The bound now uses ?MAXMOV rather than a hard-coded 5, so changing the
%% macro actually changes the action set.
action_set({A, B}) ->
    lists:seq(-min(?MAXMOV, B), min(?MAXMOV, A)).
  139
+
  140
%% Successor state after applying Action to {A, B}, serving up to Ra / Rb
%% requests and receiving Da / Db drop-offs. Each site is capped at ?LIMIT.
transition({A, B}, Action, Ra, Rb, Da, Db) ->
    AvailA = A - Action,
    AvailB = B + Action,
    NextA = min(?LIMIT, Da + AvailA - min(AvailA, Ra)),
    NextB = min(?LIMIT, Db + AvailB - min(AvailB, Rb)),
    {NextA, NextB}.
  142
+
  143
%% Event counts considered for each Poisson-distributed quantity, as produced
%% by max_prob_list/1 for the corresponding rate.

%% Request counts at site A (rate ?LAMBDA1_OUT).
req_a() -> max_prob_list(?LAMBDA1_OUT).

%% Request counts at site B (rate ?LAMBDA2_OUT).
req_b() -> max_prob_list(?LAMBDA2_OUT).

%% Drop-off counts at site A (rate ?LAMBDA1_IN).
drop_a() -> max_prob_list(?LAMBDA1_IN).

%% Drop-off counts at site B (rate ?LAMBDA2_IN).
drop_b() -> max_prob_list(?LAMBDA2_IN).
  151
+
  152
%% List the event counts N whose Poisson(Lambda) probability is at least
%% ?MINPROB. The list is built by prepending, so it is in descending order.
max_prob_list(Lambda) ->
    max_prob_list(Lambda, [], 0).

%% Scan N upwards. The Poisson pmf rises until N reaches Lambda and falls
%% afterwards, so a low probability while N < Lambda only means the left
%% tail has not yet reached the threshold: skip that N and keep scanning.
%% Previously the scan stopped at the first low probability, which returned
%% [] whenever P(0) < ?MINPROB. Results for the rates used in this module
%% are unchanged.
max_prob_list(Lambda, Acc, N) ->
    case poisson(Lambda, N) >= ?MINPROB of
        true ->
            max_prob_list(Lambda, [N | Acc], N + 1);
        false when N < Lambda ->
            max_prob_list(Lambda, Acc, N + 1);
        false ->
            Acc
    end.
  161
%% Joint probability of Ra requests at site A, Rb requests at site B,
%% Da drop-offs at site A and Db drop-offs at site B: the product of the
%% four individual Poisson probabilities.
prob_event(Ra, Rb, Da, Db) ->
    PReqA = poisson(?LAMBDA1_OUT, Ra),
    PReqB = poisson(?LAMBDA2_OUT, Rb),
    PDropA = poisson(?LAMBDA1_IN, Da),
    PDropB = poisson(?LAMBDA2_IN, Db),
    PReqA * PReqB * PDropA * PDropB.
  165
+
  166
%% Probability-weighted return of one concrete event (Ra, Rb, Da, Db) when
%% taking Action in state {A, B}:
%%   ?CREDIT per car rented out (limited by cars on site after the move),
%%   ?COST per car moved,
%%   plus the ?GAMA-discounted value of the successor state.
reward_event({A, B}, Action, Ra, Rb, Da, Db, Values) ->
    Probability = prob_event(Ra, Rb, Da, Db),
    Rented = min(A - Action, Ra) + min(B + Action, Rb),
    NextValue = get_state_var(transition({A, B}, Action, Ra, Rb, Da, Db), Values),
    Probability * (?CREDIT * Rented + ?COST * abs(Action) + ?GAMA * NextValue).
  169
+
  170
%% Sum of the absolute element-wise differences of two equally long lists.
delta(L1, L2) ->
    delta(L1, L2, 0).

%% Same as delta/2, but starts accumulating from Sum.
%% Lists of different length are not supported and raise an error.
delta(L1, L2, Sum) ->
    lists:foldl(fun({V1, V2}, Acc) -> Acc + abs(V1 - V2) end,
                Sum,
                lists:zip(L1, L2)).
  176
+
  177
%% Look up the entry for state {A, B} in the flat list VarL, which stores
%% ?LIMIT + 1 entries per value of A (lists:nth/2 is 1-based).
get_state_var({A, B}, VarL) ->
    RowWidth = ?LIMIT + 1,
    Index = RowWidth * A + B + 1,
    lists:nth(Index, VarL).
  179
+
  180
%% Poisson probability mass: Lambda^N * e^(-Lambda) / N!
poisson(Lambda, N) ->
    Numerator = math:pow(Lambda, N) * math:exp(-Lambda),
    Numerator / factorial(N).
  182
+
  183
%% N! for a non-negative integer N.
%% The guards make negative or non-integer input fail with function_clause
%% instead of recursing forever past zero.
factorial(N) when is_integer(N), N >= 0 ->
    factorial(N, 1).

%% Accumulator form: returns N! * V.
factorial(0, V) ->
    V;
factorial(N, V) when is_integer(N), N > 0 ->
    factorial(N - 1, N * V).
  189
+
  190
%% Print one line showing the action Policy selects in State.
print_policy(State, Policy) ->
    Action = Policy(State),
    io:format("~p --> ~w~n", [State, Action]).
  192
+
  193
%% true when policies P1 and P2 choose equal (==) actions in every state
%% {X, Y} with X and Y in 0..?LIMIT, false otherwise.
%% The state list is built once and the comparison result is returned
%% directly rather than through an `if' that maps true/false to true/false.
is_policy_same(P1, P2) ->
    Axis = lists:seq(0, ?LIMIT),
    States = [{X, Y} || X <- Axis, Y <- Axis],
    [P1(S) || S <- States] == [P2(S) || S <- States].
  202
+
  203
+%%
  204
+%% Not important callbacks
  205
%% gen_fsm callback for events sent to all states.
%% `cancel' stops the machine with reason `cancelled'; anything else is
%% logged and the machine stays in its current state.
%% NOTE(review): the fallback clause returns no timeout, so an event arriving
%% while the timeout-driven iteration is running presumably stops that
%% iteration - confirm against the gen_fsm timeout semantics.
handle_event(cancel, Current, Data) ->
    notice("CANCEL", Current),
    {stop, cancelled, Data};
handle_event(Other, Current, Data) ->
    unexpected(Other, Current),
    {next_state, Current, Data}.
  211
+
  212
%% gen_fsm callback for synchronous events sent to all states.
%% `cancel' replies `ok' and stops the machine with reason `cancelled'.
%% Any other event is logged and answered with {error, unexpected_event};
%% the previous `next_state' return never replied, which left the caller
%% blocked until its call timeout expired.
handle_sync_event(cancel, _From, StateName, StateData) ->
    notice("CANCEL", StateName),
    {stop, cancelled, ok, StateData};
handle_sync_event(Event, _From, StateName, StateData) ->
    unexpected(Event, StateName),
    {reply, {error, unexpected_event}, StateName, StateData}.
  218
+
  219
%% gen_fsm callback for hot code upgrade: state name and data are kept as is.
code_change(_OldVsn, Current, StateData, _Extra) ->
    {ok, Current, StateData}.
  221
+
  222
%% gen_fsm callback invoked when the machine is about to stop.
%% There is nothing to clean up. gen_fsm ignores the return value, so the
%% former `{stop, StateName, StateData}' result for reason `normal' had no
%% effect and is replaced by a plain `ok' for every reason.
terminate(_Reason, _StateName, _StateData) ->
    ok.
  226
+
  227
%% gen_fsm callback for plain messages: log the message and stay in the
%% current state with unchanged data.
handle_info(Message, Current, Data) ->
    unexpected(Message, Current),
    {next_state, Current, Data}.
  230
+
  231
%% Log that the message Msg (a string) was received in state StateName.
%% Msg is passed as a ~s argument instead of being concatenated into the
%% format string, so a `~' inside Msg cannot be misread as a control
%% sequence. A trailing newline keeps consecutive log lines apart.
notice(Msg, StateName) ->
    io:format("Received ~s at state: ~w~n", [Msg, StateName]).
  233
+
  234
+
  235
%% Log an event or message that no clause handles explicitly.
%% The trailing newline keeps consecutive log lines from running together.
unexpected(Event, StateName) ->
    io:format("Received unexpected event: ~p, at state: ~w~n", [Event, StateName]).
158  src/gambler.erl
... ...
@@ -0,0 +1,158 @@
  1
+%% @author Barco You <barcojie@gmail.com>
  2
+%% This source is NOT limited by any license.
  3
+%%
  4
+%% This module simulates Example 4.3 and Figure 4.6
  5
+%% in the book: <Reinforcement Learning: An Introduction>
  6
+-module(gambler).
  7
+-author('barcojie@gmail.com').
  8
+
  9
+-define(GOAL, 100).
  10
+-define(FLIP, 0.4).
  11
+-define(MAXCAP, 99).
  12
+-define(DELTA, 0.000001).
  13
+
  14
+-behaviour(gen_fsm).
  15
+
  16
+-export([start/0,
  17
+		print/0,
  18
+		stop/0]).
  19
+-export([init/1,
  20
+		sweep/2,
  21
+		paused/2,
  22
+		handle_event/3,
  23
+		handle_sync_event/4,
  24
+		code_change/4,
  25
+		terminate/3,
  26
+		handle_info/3]).
  27
+-compile([export_all]).
  28
+-record(state, {values, policy, count}).
  29
+
  30
+
  31
+%%%
  32
+%%% API
  33
+%%% 
  34
%% @doc Start the gambler state machine, registered locally under the module
%% name and linked to the calling process. init/1 returns a 0 ms timeout, so
%% the first sweep is triggered right away.
start() ->
    gen_fsm:start_link({local, ?MODULE}, ?MODULE, [], []).

%% @doc Send the asynchronous `print' event (handled by the `paused' state).
print() ->
    dispatch(print).

%% @doc Send the asynchronous `stop' event (handled by the `paused' state).
stop() ->
    dispatch(stop).

%% Forward Event to the locally registered state machine.
dispatch(Event) ->
    gen_fsm:send_event(?MODULE, Event).
  42
+
  43
+%%% 
  44
+%%% Callbacks
  45
+%%% 
  46
%% gen_fsm callback: start in `sweep' with ?MAXCAP zero values, the sweep
%% counter at 1, and a 0 ms timeout that triggers the first sweep.
init([]) ->
    Initial = #state{values = lists:duplicate(?MAXCAP, 0), count = 1},
    {ok, sweep, Initial, 0}.
  48
+
  49
%% State `sweep': one value-iteration sweep per timeout.
%%
%% Picks the greedy action for every state 1..?MAXCAP under the current
%% values, backs the values up under that greedy policy and then either
%%   - parks in `paused' when the summed absolute change is below ?DELTA, or
%%   - prints the policy and values, bumps the sweep counter and schedules
%%     the next sweep through a 0 ms timeout.
sweep(timeout, Data) ->
    io:format("Sweep: ~w~n", [Data#state.count]),
    States = lists:seq(1, ?MAXCAP),
    Best = [{S, max_action(S, Data#state.values)} || S <- States],
    Policy =
        fun(S) ->
            {_, Action} = lists:keyfind(S, 1, Best),
            Action
        end,
    Values = full_backup(Policy, Data#state.values),
    case delta(Values, Data#state.values) of
        Change when Change < ?DELTA ->
            {next_state, paused, Data#state{values = Values, policy = Policy}};
        _ ->
            lists:foreach(fun(S) -> print_policy(S, Policy) end, States),
            io:format("Values:~n~p~n", [Values]),
            Next = Data#state{values = Values,
                              policy = Policy,
                              count = Data#state.count + 1},
            {next_state, sweep, Next, 0}
    end.
  66
+
  67
%% State `paused': the machine is idle.
%% `print' dumps the sweep count, the stored policy and the values;
%% `stop' terminates normally. No other event has a clause here.
paused(stop, Data) ->
    {stop, normal, Data};
paused(print, Data) ->
    io:format("Final sweep: ~w~n", [Data#state.count]),
    io:format("Policy:~n", []),
    lists:foreach(fun(S) -> print_policy(S, Data#state.policy) end,
                  lists:seq(1, ?MAXCAP)),
    io:format("Values:~n~p", [Data#state.values]),
    {next_state, paused, Data}.
  75
+
  76
+
  77
+%%% 
  78
+%%% Internal Function
  79
%% One full sweep: recompute the value of every state 1..?MAXCAP under
%% Policy, always reading from the previous value list Old.
full_backup(Policy, Old) ->
    States = lists:seq(1, ?MAXCAP),
    [q_pi(S, Policy, Old) || S <- States].
  81
+
  82
%% Expected return of State when following Policy for one step and then
%% using Values for the successor states.
q_pi(State, Policy, Values) ->
    reward(State, Policy(State), Values).
  85
+
  86
%% Expected value of staking Action in State: with weight ?FLIP the state
%% moves to State + Action (capped at ?GOAL), with weight 1 - ?FLIP it moves
%% to State - Action.
reward(State, Action, Values) ->
    UpValue = get_state_var(min(?GOAL, State + Action), Values),
    DownValue = get_state_var(State - Action, Values),
    ?FLIP * UpValue + (1 - ?FLIP) * DownValue.
  88
+
  89
%% Greedy stake for State: score every stake from action_set/1 with reward/3
%% and return the first stake whose score equals the maximum.
max_action(State, Values) ->
    Candidates = action_set(State),
    Scores = [reward(State, Act, Values) || Act <- Candidates],
    max_action(lists:max(Scores), Scores, Candidates).
  94
+
  95
%% Walk the score list and the action list in lockstep and return the action
%% paired with the first score that compares equal (==) to Max.
%% There is deliberately no clause for empty lists: Max must occur in the
%% score list.
max_action(Max, [Score | _], [Action | _]) when Max == Score ->
    Action;
max_action(Max, [_ | Scores], [_ | Actions]) ->
    max_action(Max, Scores, Actions).
  102
+
  103
%% Stakes available in State: 0 up to the smaller of State itself and the
%% distance from State to ?GOAL.
action_set(State) ->
    MaxStake = min(State, ?GOAL - State),
    lists:seq(0, MaxStake).
  105
+
  106
%% Sum of the absolute element-wise differences of two equally long lists.
delta(L1, L2) ->
    delta(L1, L2, 0).

%% Same as delta/2, but starts accumulating from Sum.
%% Lists of different length are not supported and raise an error.
delta(L1, L2, Sum) ->
    lists:foldl(fun({V1, V2}, Acc) -> Acc + abs(V1 - V2) end,
                Sum,
                lists:zip(L1, L2)).
  112
+
  113
%% Value of state S. The two terminal states are fixed (?GOAL is worth 1,
%% 0 is worth 0); every other state is read from position S of VarL.
get_state_var(?GOAL, _VarL) ->
    1;
get_state_var(0, _VarL) ->
    0;
get_state_var(S, VarL) ->
    lists:nth(S, VarL).
  122
+
  123
%% Print one line showing the action Policy selects in State.
print_policy(State, Policy) ->
    Action = Policy(State),
    io:format("~p --> ~w~n", [State, Action]).
  125
+%%
  126
+%% Not important callbacks
  127
%% gen_fsm callback for events sent to all states.
%% `cancel' stops the machine with reason `cancelled'; anything else is
%% logged and the machine stays in its current state.
%% NOTE(review): the fallback clause returns no timeout, so an event arriving
%% while the timeout-driven sweeps are running presumably stops them -
%% confirm against the gen_fsm timeout semantics.
handle_event(cancel, Current, Data) ->
    notice("CANCEL", Current),
    {stop, cancelled, Data};
handle_event(Other, Current, Data) ->
    unexpected(Other, Current),
    {next_state, Current, Data}.
  133
+
  134
%% gen_fsm callback for synchronous events sent to all states.
%% `cancel' replies `ok' and stops the machine with reason `cancelled'.
%% Any other event is logged and answered with {error, unexpected_event};
%% the previous `next_state' return never replied, which left the caller
%% blocked until its call timeout expired.
handle_sync_event(cancel, _From, StateName, StateData) ->
    notice("CANCEL", StateName),
    {stop, cancelled, ok, StateData};
handle_sync_event(Event, _From, StateName, StateData) ->
    unexpected(Event, StateName),
    {reply, {error, unexpected_event}, StateName, StateData}.
  140
+
  141
%% gen_fsm callback for hot code upgrade: state name and data are kept as is.
code_change(_OldVsn, Current, StateData, _Extra) ->
    {ok, Current, StateData}.
  143
+
  144
%% gen_fsm callback invoked when the machine is about to stop.
%% There is nothing to clean up. gen_fsm ignores the return value, so the
%% former `{stop, StateName, StateData}' result for reason `normal' had no
%% effect and is replaced by a plain `ok' for every reason.
terminate(_Reason, _StateName, _StateData) ->
    ok.
  148
+
  149
%% gen_fsm callback for plain messages: log the message and stay in the
%% current state with unchanged data.
handle_info(Message, Current, Data) ->
    unexpected(Message, Current),
    {next_state, Current, Data}.
  152
+
  153
%% Log that the message Msg (a string) was received in state StateName.
%% Msg is passed as a ~s argument instead of being concatenated into the
%% format string, so a `~' inside Msg cannot be misread as a control
%% sequence. A trailing newline keeps consecutive log lines apart.
notice(Msg, StateName) ->
    io:format("Received ~s at state: ~w~n", [Msg, StateName]).
  155
+
  156
+
  157
%% Log an event or message that no clause handles explicitly.
%% The trailing newline keeps consecutive log lines from running together.
unexpected(Event, StateName) ->
    io:format("Received unexpected event: ~p, at state: ~w~n", [Event, StateName]).

No commit comments for this range

Something went wrong with that request. Please try again.