<a href="https://colab.research.google.com/github/alerotta/DRL/blob/main/02%20-%20Q%20Learning/Tabular_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium[toy-text] --quiet
!pip install torch --quiet

In [4]:
import typing as tt
import gymnasium as gym
from collections import defaultdict
from torch.utils.tensorboard.writer import SummaryWriter

In [9]:

# Gymnasium environment id used for both the training and the test environment.
ENV_NAME = "FrozenLake-v1"
# Discount factor applied to the bootstrapped value of the next state.
GAMMA = 0.9
# Learning rate: weight of the new estimate when blending it into the old Q-value.
ALPHA =0.2
# Number of greedy episodes averaged for every evaluation round.
TEST_EPISODES = 20

# Type aliases: states and actions are represented as plain integers.
state = int
action = int
# Key of the Q-table: a (state, action) pair.
ValueKey = tt.Tuple[state,action]

class Agent:
  """Tabular Q-learning agent.

  The agent owns a training environment that it explores with uniformly
  random actions, and a Q-table mapping (state, action) pairs to value
  estimates. Entries that were never written read as 0.0.

  Args:
    env_name: Gymnasium environment id; defaults to the module-level ENV_NAME.
    gamma: discount factor; defaults to the module-level GAMMA.
    alpha: learning rate in [0, 1]; defaults to the module-level ALPHA.
  """

  def __init__(self, env_name=None, gamma=None, alpha=None):
    # Module-level defaults are resolved here (not in the signature) so that
    # calling Agent() with no arguments behaves exactly as before.
    self.env = gym.make(ENV_NAME if env_name is None else env_name)
    self.gamma = GAMMA if gamma is None else gamma
    self.alpha = ALPHA if alpha is None else alpha
    self.state, _ = self.env.reset()
    # The Q-table: a dictionary keyed by (state, action) pairs.
    self.values: tt.Dict[ValueKey, float] = defaultdict(float)

  def sample_env(self) -> tt.Tuple[int, int, float, int]:
    """Take one uniformly random step in the training environment.

    Returns:
      The transition (s, a, r, s'). When the step ends the episode the
      environment is reset for the next call, but s' is still the state
      that was actually reached by this step.
    """
    action = self.env.action_space.sample()  # random action
    old_state = self.state
    new_state, reward, is_done, is_trunc, _ = self.env.step(action)
    if is_done or is_trunc:
      self.state, _ = self.env.reset()
    else:
      self.state = new_state
    return old_state, action, float(reward), new_state

  def best_value_and_action(self, state):
    """Greedy lookup: return (max Q-value, argmax action) for `state`.

    Ties are resolved in favour of the lowest action index. Returns
    (None, None) if the action space is empty.
    """
    best_value, best_action = None, None
    for action in range(self.env.action_space.n):
      action_value = self.values[(state, action)]
      # Strict comparison keeps the first action among equal values.
      if best_value is None or best_value < action_value:
        best_value = action_value
        best_action = action
    return best_value, best_action

  def value_update(self, state, action, reward, next_state):
    """Blend the one-step Q-learning target into Q(state, action).

    The target is reward + gamma * max_a' Q(next_state, a'). It is mixed
    with the old estimate using the learning rate alpha, which avoids
    large jumps from a single noisy sample.
    """
    best_val, _ = self.best_value_and_action(next_state)
    new_val = reward + self.gamma * best_val
    key = (state, action)
    old_val = self.values[key]
    self.values[key] = old_val * (1 - self.alpha) + new_val * self.alpha

  def play_episode(self, env):
    """Play one full episode on `env` with the greedy policy.

    The Q-table is not updated by this method. `env` is a separate
    environment instance, so the training environment's current episode
    is not disturbed.

    Returns:
      The sum of rewards collected during the episode.
    """
    total_reward = 0.0
    state, _ = env.reset()
    while True:
      _, action = self.best_value_and_action(state)
      new_state, reward, is_done, is_trunc, _ = env.step(action)
      total_reward += reward
      if is_done or is_trunc:
        break
      state = new_state
    return total_reward

def run(reward_threshold=0.8):
  """Train the tabular Q-learning agent until the greedy policy is good enough.

  Each iteration performs one random environment step and one Q-table
  update, then evaluates the greedy policy over TEST_EPISODES episodes on
  a separate test environment. The mean test reward is logged to
  TensorBoard under the tag "reward".

  Args:
    reward_threshold: training stops as soon as the mean test reward
      is strictly greater than this value.
  """
  test_env = gym.make(ENV_NAME)
  agent = Agent()
  writer = SummaryWriter(comment="-q-learning")

  iter_no = 0
  best_reward = 0.0
  try:
    while True:
      iter_no += 1
      state, action, reward, next_state = agent.sample_env()
      agent.value_update(state, action, reward, next_state)

      # Mean reward of the current greedy policy.
      test_reward = 0.0
      for _ in range(TEST_EPISODES):
        test_reward += agent.play_episode(test_env)
      test_reward /= TEST_EPISODES
      writer.add_scalar("reward", test_reward, iter_no)

      if test_reward > best_reward:
        print("%d: Best test reward updated %.3f -> %.3f" % (iter_no, best_reward, test_reward))
        best_reward = test_reward
      if test_reward > reward_threshold:
        print("Solved in %d iterations!" % iter_no)
        break
  finally:
    # The loop is unbounded, so it is commonly stopped by an interrupt;
    # release the writer and both environments on every exit path.
    writer.close()
    test_env.close()
    agent.env.close()

# Start training; run() returns once the mean greedy test reward exceeds 0.8.
run()






433: Best test reward updated 0.000 -> 0.300
436: Best test reward updated 0.300 -> 0.350
439: Best test reward updated 0.350 -> 0.450
1604: Best test reward updated 0.450 -> 0.500
2614: Best test reward updated 0.500 -> 0.600
3060: Best test reward updated 0.600 -> 0.700
3281: Best test reward updated 0.700 -> 0.850
Solved in 3281 iterations!
