# DK_from_scratch.py
# forked from siyamak45/CS188.1x-Project3
import numpy as np

import gridworld
# hyper-parameters:
grid_width = 8
prob_random_action = 0.1    # epsilon for epsilon-greedy exploration
prob_random_reset = 0.001   # per-step chance of resetting to the initial state
query_cost = .01
gamma = .9                  # discount factor
prob_zero_reward = .9       # probability that a state yields zero reward
learning_rate = .1
# states, numbered in row-major order
states = range(grid_width**2)
# reward probabilities: with probability prob_zero_reward a state yields no
# reward; otherwise its Bernoulli reward parameter is drawn uniformly from [0, 1]
reward_probabilities = np.random.binomial(1, 1 - prob_zero_reward, len(states)) * np.random.uniform(0, 1, len(states))
#reward_probabilities = [0,0,0,0,0,0,0,0,1]  # fixed 9-entry alternative (3x3 grid)
##################################
def row_and_column(state):
    return state // grid_width, state % grid_width

def state_from_row_and_column(row, column):
    state = row * grid_width + column
    assert 0 <= state < len(states)
    return state
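# Sanity check (added sketch, not in the original script): the two
# coordinate helpers should be inverses of each other.
for s in states:
    r, c = row_and_column(s)
    assert state_from_row_and_column(r, c) == s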
actions = range(5)  # 0: stay, 1: north, 2: east, 3: south, 4: west

def next_state(state, action):
    row, column = row_and_column(state)
    if action == 1 and row > 0:
        return state - grid_width
    if action == 2 and column < grid_width - 1:
        return state + 1
    if action == 3 and row < grid_width - 1:
        return state + grid_width
    if action == 4 and column > 0:
        return state - 1
    return state  # "stay", or a move that would leave the grid
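# Illustrative boundary check (added sketch, not in the original script):
# from the top-left corner, moves off the grid leave the state unchanged.
assert next_state(0, 1) == 0            # north blocked at row 0
assert next_state(0, 2) == 1            # east: one column right
assert next_state(0, 3) == grid_width   # south: one row down
assert next_state(0, 4) == 0            # west blocked at column 0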
#################################################################
# Query functions. Several alternatives are defined below; since each
# definition rebinds query_fn, only the last one is active.

# query every time
def query_fn(s, a, step):
    return True

# query only for the first N steps overall
max_num_queries = 10000
def query_fn(s, a, step):
    return step < max_num_queries

# query only the first N times state s is visited
# (nqueries is defined below, before the main loop runs)
max_num_queries = 10000 / len(states)
def query_fn(s, a, step):
    return sum(nqueries[s]) < max_num_queries

# query only the first N times action a is taken in state s
max_num_queries = 10000 / len(states) / len(actions)
def query_fn(s, a, step):
    return nqueries[s][a] < max_num_queries
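# Added note (not in the original): a less error-prone pattern would give
# each variant its own name and pick one explicitly, e.g. (hypothetical names):
#   query_fn = query_always  # or query_first_n_steps, query_first_n_in_state, ...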
#################################################################
# TODO: policy should just be action probabilities...
# policy should probably be a function (taking Q_values)
policy = [[.2, .2, .2, .2, .2] for state in states]
Q_values = [[0, 0, 0, 0, 0] for state in states]
total_r_observed = [[0, 0, 0, 0, 0] for state in states]  # summed observed reward per (s, a)
nqueries = [[0, 0, 0, 0, 0] for state in states]          # query counts per (s, a)
#policy = {state: [.2, .2, .2, .2, .2] for state in states}
#nqueries = {state: [0,0,0,0,0] for state in states}
nsteps = 100000
current_state = 0
total_reward = 0
total_observed_reward = 0
# TODO: discounting
# TODO: more policies
def expected_reward(state, action):
    # smoothed estimate of the mean reward for (state, action)
    return (total_r_observed[state][action] + .5) / (nqueries[state][action] + 10)
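# Added note (not in the original): this is an ad hoc smoothing, roughly a
# prior of 0.5 total reward over 10 pseudo-observations; with no queries it
# returns .5/10 = 0.05 and approaches the empirical mean as
# nqueries[state][action] grows.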
# map the display's action names onto this file's action encoding
# (1: north, 2: east, 3: south, 4: west); note the display's y axis may be
# flipped relative to the row index used here
action_names = ['north', 'east', 'south', 'west']

class QDummy(object):
    """Adapter exposing Q_values through the agent interface the display expects."""
    def getQValue(self, state, action):
        s = state_from_row_and_column(*state)
        a = action_names.index(action) + 1
        return Q_values[s][a]

dummy_agent = QDummy()
dummy_grid = gridworld.Gridworld([[' '] * grid_width for _ in range(grid_width)])

def reverse(t):
    # swap a coordinate pair, e.g. (row, column) <-> (x, y)
    return (t[1], t[0])

import graphicsGridworldDisplay
display = graphicsGridworldDisplay.GraphicsGridworldDisplay(dummy_grid, 75, 10.0)
display.start()
def update_q(state0, action, state1, reward, query):
    # if the reward was not observed (no query), substitute its current estimate
    if not query:
        reward = expected_reward(state0, action)
    # standard Q-learning update:
    # Q(s,a) <- (1 - lr)*Q(s,a) + lr*(r + gamma*max_a' Q(s',a'))
    old = Q_values[state0][action]
    new = reward + gamma * np.max(Q_values[state1])
    Q_values[state0][action] = (1 - learning_rate) * old + learning_rate * new
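# Added note (not in the original): a worked single step, assuming
# learning_rate = .1 and gamma = .9: starting from Q(s,a) = 0, observing
# reward 1 with max_a' Q(s',a') = 0 gives (1 - .1)*0 + .1*(1 + .9*0) = 0.1.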
for step in range(nsteps):
    # alternative policies (rebind `action` to try one):
    # uniform random policy:
    #action = np.argmax(np.random.multinomial(1, [.2, .2, .2, .2, .2]))
    # "thompson-sampling" greedy (proportional agent):
    #action = np.argmax(np.random.multinomial(1, policy[current_state]))
    # greedy policy:
    action = np.argmax(Q_values[current_state])
    if np.random.binomial(1, prob_random_action):  # take a random action
        action = np.argmax(np.random.multinomial(1, [.2, .2, .2, .2, .2]))

    query = query_fn(current_state, action, step)
    # +1 thing??
    reward = np.random.binomial(1, reward_probabilities[current_state])
    total_reward += reward
    if query:
        nqueries[current_state][action] += 1
        total_r_observed[current_state][action] += reward
        total_observed_reward += reward

    old_state = current_state
    current_state = next_state(current_state, action)
    if np.random.uniform(0, 1) < prob_random_reset:  # reset to the initial state
        current_state = 0

    # TODO: learning
    # simple q-learner
    update_q(old_state, action, current_state, reward, query)
for i in range(grid_width):
    print reward_probabilities[i * grid_width:(i + 1) * grid_width]
display.displayQValues(dummy_agent, row_and_column(current_state), "q vals")
print total_r_observed
display.pause()
total_nqueries = sum(sum(nqueries_s) for nqueries_s in nqueries)
total_query_cost = query_cost * total_nqueries
# TODO: more complicated stuff, e.g.:
#total_query_cost = query_cost(query_history)
performance = total_reward - total_query_cost
print "total_reward =", total_reward
print "total_nqueries =", total_nqueries
print "performance =", performance