# scraps_for_ipynb.py
from pylab import *
import numpy as np
import scipy
import scipy.stats
from collections import OrderedDict
import time
# Basic environment dynamics: states are numbered row-major on the grid,
# and running into a wall leaves you where you are.
def get_next_state(state, current_action):
    row, column = state // grid_width, state % grid_width
    if np.random.binomial(1, prob_random_move):  # current_action is replaced at random
        current_action = np.argmax(np.random.multinomial(1, [.2, .2, .2, .2, .2]))
    if current_action == 1 and row > 0:  # up
        next_state = state - grid_width
    elif current_action == 2 and column < grid_width - 1:  # right
        next_state = state + 1
    elif current_action == 3 and row < grid_width - 1:  # down
        next_state = state + grid_width
    elif current_action == 4 and column > 0:  # left
        next_state = state - 1
    else:  # action 0, or bumping into a wall: stay put
        next_state = state
    return next_state
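# A minimal worked example of the row-major indexing above (hypothetical
# 3x3 grid; this helper is illustrative only and is never called).
def _demo_grid_moves(demo_grid_width=3):
    state = 4  # centre cell: row 1, column 1
    return {1: state - demo_grid_width,  # up    -> state 1
            2: state + 1,                # right -> state 5
            3: state + demo_grid_width,  # down  -> state 7
            4: state - 1}                # left  -> state 3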
#################################################################
# generally useful functions
def updated_running_average(avg, new_value, update_n):
    return (avg * (update_n - 1) + new_value) / update_n
def multinomial_entropy(probs):
    # drop zero-probability entries so 0 * log(0) doesn't produce nan
    probs = np.asarray(probs, dtype=float)
    probs = probs[probs > 0]
    return -np.sum(probs * np.log(probs))
def binomial_entropy(p):
    return multinomial_entropy([p, 1 - p])
def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
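# Quick sanity check for the helpers above (illustrative only, never
# called): equal scores give the uniform policy, whose entropy is log(5).
def _demo_entropy_sanity():
    uniform = softmax(np.zeros(5))       # -> [.2, .2, .2, .2, .2]
    return multinomial_entropy(uniform)  # == np.log(5), about 1.609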
#################################################################
# Bayesian stuff
# TODO: precompute entropies?
# learn a distribution over states (Dirichlet-multinomial)
def state_entropy():
    return scipy.stats.dirichlet([sum(nvisit) + .5 for nvisit in nvisits]).entropy()
def state_probability(state):
    return (sum(nvisits[state]) + .5) / (step + .5 * len(states))
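# A worked example of the posterior above (hypothetical counts, never
# called): with 25 states, a .5 pseudo-count per state, 10 visits to this
# state and 100 steps total, P(s) = (10 + .5) / (100 + .5 * 25) ~= 0.093.
def _demo_state_probability():
    return (10 + .5) / (100 + .5 * 25)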
# learn a distribution over rewards (Beta-Bernoulli)
# we use a lookup table to speed up computations;
# the list version was too large, so a dictionary is used instead
#beta_entropy_lookup = np.inf * np.ones((num_steps, num_steps))
beta_entropy_lookup = {}
def ireward_entropy(current_state, current_action):
    # empirical counts
    alpha_ = total_r_observed[current_state][current_action]
    beta_ = nqueries[current_state][current_action] - alpha_
    if (alpha_, beta_) in beta_entropy_lookup:
        entry = beta_entropy_lookup[(alpha_, beta_)]
    else:
        entry = scipy.stats.beta(alpha_ + .5, beta_ + .5).entropy()
        beta_entropy_lookup[(alpha_, beta_)] = entry
    return entry
def reward_entropy():
    # same computation, for the global current (state, action) pair
    return ireward_entropy(current_state, current_action)
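# Illustrative only (never called): the differential entropy of the Beta
# posterior shrinks as query counts grow, which is what the
# entropy-threshold query rules below rely on.
def _demo_reward_entropy_shrinks():
    prior = scipy.stats.beta(.5, .5).entropy()          # no data yet
    posterior = scipy.stats.beta(10.5, 10.5).entropy()  # after 20 queries
    return prior, posterior  # posterior entropy is lower (more negative)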
def expected_reward(state, current_action):
    # posterior mean under a Jeffreys Beta(.5, .5) prior;
    # equivalent to (total_r_observed + .5) / (nqueries + 1)
    alpha = total_r_observed[state][current_action] + .5
    beta = (nqueries[state][current_action] + 1) - alpha
    return alpha / (alpha + beta)
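# A worked example of the posterior mean above (hypothetical counts, never
# called): with the Jeffreys Beta(.5, .5) prior, 3 unit rewards in 4
# queries give (3 + .5) / (4 + 1) = 0.7.
def _demo_expected_reward():
    alpha = 3 + .5
    beta = (4 + 1) - alpha  # == 1.5
    return alpha / (alpha + beta)  # == 0.7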
# TODO:
# learn a distribution over Q-values (Gaussian, even though it's probably the wrong thing...)
# initialize means based on the prior over rewards (E[r] = .5) and the discount rate,
# initialize std = mean
mean_Q_values = [[.5 / (1 - gamma)] * len(current_actions) for state in states]
std_Q_values = [[.5 / (1 - gamma)] * len(current_actions) for state in states]
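# Sanity check for the prior mean above (illustrative, never called):
# with E[r] = .5 per step, the discounted return is .5 / (1 - gamma),
# e.g. 5 when gamma = .9.
def _demo_prior_q(demo_gamma=.9):
    return .5 / (1 - demo_gamma)  # == 5.0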
def softmax_entropy():
    return multinomial_entropy(softmax(Q_values[current_state]))
#################################################################
# Query functions
query_fns = OrderedDict()
# simple baselines: the cash budget is fixed, so the number of
# affordable queries scales inversely with the per-query cost
if budget == 'high':
    max_num_queries = 100000. / query_cost
    #temperature = 1 #TODO
    thresh = -3
elif budget == 'med':
    max_num_queries = 30000. / query_cost
    thresh = -2
elif budget == 'low':
    max_num_queries = 10000. / query_cost
    thresh = -1
elif budget == 'very_low':
    max_num_queries = 1000. / query_cost
    thresh = -.5
#query_fns['first N steps'] = lambda : step < max_num_queries
#query_fns['first N steps (based on query cost)'] = lambda : step < max_num_queries * (.1 / query_cost)
#query_fns['first N state visits'] = lambda : sum(nvisits[current_state]) < (max_num_queries * (.1 / query_cost) / len(states))
if not our_hack:
    query_fns['first N (state,current_action) visits'] = lambda : nvisits[current_state][current_action] < (max_num_queries * (.1 / query_cost) / (len(states) * len(current_actions)))
#query_probability_decay = 1 - 1. / max_num_queries
#query_fns['decaying probability'] = lambda : np.random.binomial(1, query_probability_decay**step)
#query_fns['every time'] = lambda : True
# query based on entropy of softmax policy:
# FIXME: querying too much... also, we should be looking MORE at those with lower entropy, I think...
#query_fns['softmax entropy threshold'] = lambda : softmax_entropy() > 1
#query_fns['stochastic softmax entropy'] = lambda : np.random.binomial(1, softmax_entropy() / multinomial_entropy([.2,.2,.2,.2,.2]))
# query based on entropy of P(r|s,a):
#query_fns['r entropy threshold'] = lambda : reward_entropy() > thresh
#query_fns['stochastic r entropy'] = lambda : np.random.binomial(1, np.exp(reward_entropy()))
# query based on number of state visits:
#query_fns['prob = proportion of state visits'] = lambda : np.random.binomial(1, (sum(nvisits[current_state]) + 50) / (sum(nvisits) + 50. * len(states)))
#query_fns['prob = proportion of state-current_action visits'] = lambda : np.random.binomial(1, (sum(nvisits[current_state][current_action]) + 10) / (sum(nvisits) + 10. * len(states) * len(current_actions)))
# query based on expected reward
#query_fns['geq than average expected reward'] = lambda : expected_reward(current_state, current_action) >= mean([expected_reward(s,a) for s in states for a in current_actions])
# this explores more where there's more to explore!
if our_hack:
    # (a worked sketch of this rule's query probability follows the
    # commented-out alternatives below)
    query_fns['proportion of expected reward'] = lambda : np.random.binomial(1, expected_reward(current_state, current_action) / sum(expected_reward(s, a) for s in states for a in current_actions))
# query based on Q-values
#query_fns['geq than average Q'] = lambda : Q_values[current_state][current_action] >= mean(Q_values)
#query_fns['proportion of Q'] = lambda : np.random.binomial(1, (Q_values[current_state][current_action] + 1) / (sum(Q_values) + 1))
# query based on entropy of P(s):
# TODO: we want this to actually depend on the state we're in, otherwise, it seems to mostly just scale with sqrt(nvisits)
#query_fns['s entropy threshold'] = lambda : state_entropy() > -240
#query_fns['stochastic s entropy'] = lambda : np.random.binomial(1, np.exp(state_entropy())) # TODO: scaling...
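# A minimal sketch of the 'proportion of expected reward' rule above
# (hypothetical numbers, never called): the query probability is this
# (state, action) pair's posterior mean reward as a fraction of the total
# over all pairs, so promising pairs are queried more often.
def _demo_query_probability():
    this_pair = .7   # E[r | s, a] for the current pair
    total = 50.      # sum of E[r | s, a] over all (state, action) pairs
    return np.random.binomial(1, this_pair / total)  # queries ~1.4% of steps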
#################################################################
# LEARNING
def update_q(state0, current_action, state1, reward, query):
    if not query:
        # the reward was not queried and is therefore unobserved;
        # substitute the Bayesian posterior mean
        reward = expected_reward(state0, current_action)
    old = Q_values[state0][current_action]
    new = reward + gamma * np.max(Q_values[state1])
    Q_values[state0][current_action] = (1 - learning_rate) * old + learning_rate * new
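# A worked example of the update above (hypothetical numbers, never
# called): with learning_rate = .1, gamma = .9, Q(s0, a) = 0,
# max_a' Q(s1, a') = 1 and an observed reward of 1, the new value is
# .9 * 0 + .1 * (1 + .9 * 1) = .19.
def _demo_update_q(lr=.1, demo_gamma=.9):
    old, reward, max_next = 0., 1., 1.
    return (1 - lr) * old + lr * (reward + demo_gamma * max_next)  # == .19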
#query_fn = query_fns['every time']
t1 = time.time()
all_results = {}
for name, query_fn in query_fns.items():
    print "\n query_cost=", query_cost, '\n'
    print "name=", name
    all_results[name] = {}
    performances = []
    for nex in range(num_experiments):
        # reward probabilities (rewards are stochastic Bernoulli)
        if fixed_mdp:
            reward_probabilities = np.load('/u/kruegerd/CS188.1x-Project3/fixed_mdp0.npy')
        elif fixed_battery:
            reward_probabilities = np.load('/u/kruegerd/CS188.1x-Project3/200mdps.npy')[nex*len(states): (nex+1)*len(states)]
        else:
            reward_probabilities = np.random.binomial(1, 1 - prob_zero_reward, len(states)) * np.random.uniform(0, 1, len(states))
        for n in range(grid_width):
            print reward_probabilities[n*grid_width: (n+1)*grid_width]
        # reset for a new experiment
        Q_values = [[0, 0, 0, 0, 0] for state in states]
        total_r_observed = [[0, 0, 0, 0, 0] for state in states]
        nvisits = [[0, 0, 0, 0, 0] for state in states]
        nqueries = [[0, 0, 0, 0, 0] for state in states]
        current_state = 0
        total_reward = 0
        #import itertools
        #print [ireward_entropy(s,a) for (s,a) in itertools.product(states, current_actions)]
        for step in range(num_steps):
            current_action = np.argmax(Q_values[current_state])
            if np.random.binomial(1, epsilon):  # take a random action
                current_action = np.argmax(np.random.multinomial(1, [.2, .2, .2, .2, .2]))
            nvisits[current_state][current_action] += 1
            reward = np.random.binomial(1, reward_probabilities[current_state])
            total_reward += reward
            query = query_fn()
            if query:
                nqueries[current_state][current_action] += 1
                total_r_observed[current_state][current_action] += reward
            old_state = current_state
            current_state = get_next_state(old_state, current_action)  # environment transition
            if np.random.binomial(1, prob_random_reset):  # reset to initial state
                current_state = 0
            # simple q-learner
            update_q(old_state, current_action, current_state, reward, query)
        total_observed_rewards = sum([sum(r_observed) for r_observed in total_r_observed])
        total_nqueries = sum([sum(nqueries_s) for nqueries_s in nqueries])
        total_query_cost = query_cost * total_nqueries
        performance = total_reward - total_query_cost
        print "total_reward =", total_reward
        print "total_observed_reward =", total_observed_rewards
        print "total_nqueries =", total_nqueries
        print "performance =", performance
        print ''
        performances.append(performance)
    print time.time() - t1
    save_str = '/u/kruegerd/CS188.1x-Project3/results/'
    save_str += 'total_nqueries_' + name + '__query_cost=' + str(query_cost) + '__budget=' + str(budget)
    np.save(save_str, total_nqueries)
    save_str = '/u/kruegerd/CS188.1x-Project3/results/'
    save_str += 'performances_' + name + '__query_cost=' + str(query_cost) + '__budget=' + str(budget)
    np.save(save_str, performances)
#hist(performances, 50)