<a href="https://colab.research.google.com/github/arkB/baby-steps-of-rl-ja/blob/master/note/RF_Day2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
def V(s, gamma=0.99):
  """Return the value of state ``s`` via the recursive Bellman equation.

  The value is the immediate reward of ``s`` plus the discounted best
  expected value over the states reachable from ``s``.

  Args:
    s: State string, e.g. ``"state_up_down"``, ``"happy_end"``, ``"bad_end"``.
    gamma: Discount factor. It is applied at every level of the recursion,
      not only at the outermost call.

  Returns:
    The value of ``s`` as a number.
  """
  return R(s) + gamma * max_V_on_next_state(s, gamma)

def R(s):
  """Return the reward of state ``s``: 1 for "happy_end", -1 for "bad_end", else 0."""
  if s == "happy_end":
    return 1
  if s == "bad_end":
    return -1
  return 0

def max_V_on_next_state(s, gamma=0.99):
  """Return the largest expected next-state value over the actions "up"/"down".

  Args:
    s: Current state string.
    gamma: Discount factor forwarded to ``V`` when valuing successor states,
      so that the whole recursion uses a single discount factor.

  Returns:
    0 when ``s`` is a terminal state; otherwise the maximum over actions of
    the probability-weighted sum of ``V(next_state, gamma)``.
  """
  # When the game has ended there is no next state, so the expected value is 0.
  if s in ("happy_end", "bad_end"):
    return 0

  action_values = []
  for action in ("up", "down"):
    transition_probs = transit_func(s, action)
    expected = 0
    # Explicit accumulation (rather than sum()) keeps the floating-point
    # addition order, and therefore the results, stable.
    for next_state in transition_probs:
      expected += transition_probs[next_state] * V(next_state, gamma)
    action_values.append(expected)
  return max(action_values)

In [0]:
def transit_func(s, a):
  """
  Return a dict mapping each possible next state to its probability.

  A state is the word 'state' followed by the actions taken so far, joined
  with underscores; the next state appends one more action.
  ex:(s = 'state', a = 'up') => 'state_up'
     (s = 'state_up', a = 'down') => 'state_up_down'

  While fewer than five actions have been recorded, the requested action is
  appended with probability 0.9 and the opposite action with the remaining
  probability. Once exactly five actions are recorded, the only transition
  (probability 1.0) is to 'happy_end' if at least four of them were 'up',
  otherwise to 'bad_end'.
  """
  GAME_LENGTH = 5
  UPS_TO_WIN = 4
  P_INTENDED = 0.9

  # Everything after the leading 'state' token is the action history.
  history = s.split("_")[1:]

  if len(history) != GAME_LENGTH:
    # Game still running: the move may slip to the opposite direction.
    if a == "down":
      other = "up"
    else:
      other = "down"
    intended_state = "_".join((s, a))
    slipped_state = "_".join((s, other))
    return {intended_state: P_INTENDED, slipped_state: 1 - P_INTENDED}

  # Game over: the outcome depends only on how many moves were 'up'.
  n_up = history.count("up")
  outcome = "bad_end" if n_up < UPS_TO_WIN else "happy_end"
  return {outcome: 1.0}

In [0]:
if __name__ == "__main__":
  # Print the value of a few sample states, one per line.
  for sample_state in ("state", "state_up_up", "state_down_down", "state_up_up_up"):
    print(V(sample_state))

0.7880942034605892
0.9068026334400001
-0.96059601
0.9508930200000001


In [0]:
class Planner():
  """Base class for planners that operate on an environment model.

  The environment object passed in is expected to provide ``reset()``,
  ``transit_func(state, action)``, ``reward_func(state)`` and the grid
  dimensions used by ``dict_to_grid``. Subclasses must override ``plan``.
  """

  def __init__(self, env):
    """Store the environment and start with an empty log."""
    self.env = env
    self.log = []

  def initialize(self):
    """Reset the environment and clear the log."""
    self.env.reset()
    self.log = []

  def plan(self, gamma=0.9, threshold=0.0001):
    """Run the planning algorithm; must be overridden by subclasses.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError("Planner have to implements plan method.")

  def transitions_at(self, state, action):
    """Yield ``(prob, next_state, reward)`` for each transition from ``state``.

    ``env.transit_func(state, action)`` supplies the next states and their
    probabilities; the reward is the first element of the pair returned by
    ``env.reward_func(next_state)`` (the second element is ignored).
    """
    transition_probs = self.env.transit_func(state, action)
    for next_state in transition_probs:
      prob = transition_probs[next_state]
      reward, _ = self.env.reward_func(next_state)
      yield prob, next_state, reward

  def dict_to_grid(self, state_reward_dict):
    """Convert a ``{state: value}`` mapping into a 2D list indexed [row][column].

    Cells without an entry in ``state_reward_dict`` are 0. Each key must have
    ``row`` and ``column`` attributes.
    """
    # One list per grid row, each as wide as the number of columns.
    # NOTE(review): assumes env exposes ``row_length`` alongside
    # ``column_length`` - confirm against the environment class.
    grid = [
        [0] * self.env.column_length
        for _ in range(self.env.row_length)
    ]
    for s in state_reward_dict:
      grid[s.row][s.column] = state_reward_dict[s]

    return grid