# Reinforcement Learning in Finite MDPs

In [0]:
# Fetch the course helper repository: later cells import modules from
# ./mvarl_hands_on/utils and read ./mvarl_hands_on/data/Q_opts.json.
# Both stdout and stderr are redirected to /dev/null, so a failed clone is
# silent here and only surfaces later as an import or file-not-found error.
!git clone https://github.com/rlgammazero/mvarl_hands_on.git > /dev/null 2>&1

## MDPs

In [0]:
import sys
sys.path.insert(0, './mvarl_hands_on/utils')
import numpy as np
from scipy.special import softmax # for SARSA
import matplotlib.pyplot as plt
import json
import math
from cliffwalk import CliffWalk
from test_env import ToyEnv1

Setting up the environment

In [0]:
# Main environment used throughout the notebook.
env = CliffWalk(proba_succ=0.98)

# ----------------------------------------------------------------------------
# Tip: debug your algorithms on a smaller environment first, e.g.
# env = ToyEnv1(gamma=0.99)
# ----------------------------------------------------------------------------

# --- Useful attributes ------------------------------------------------------
print("Set of states:", env.states)
print("Set of actions:", env.actions)
print("Number of states: ", env.Ns)
print("Number of actions: ", env.Na)
# Indexed as env.P[s, a, s'] (presumably the probability of s -> s' under a).
print("P has shape: ", env.P.shape)
print("discount factor: ", env.gamma)
print("")

# --- Useful methods ---------------------------------------------------------
# reset() hands back the initial state.
state = env.reset()
print("initial state: ", state)
print("reward at (s=1, a=3,s'=2): ", env.reward_func(1, 3, 2))
print("")

# --- A random deterministic policy: one action index per state --------------
policy = np.random.randint(env.Na, size=(env.Ns,))
print("random policy = ", policy)

# --- Interacting with the environment ---------------------------------------
# Follow the random policy for at most four transitions, stopping early if
# the environment signals termination.
print("(s, a, s', r):")
for step_idx in range(4):
    action = policy[state]
    next_state, reward, done, info = env.step(action)
    print(state, action, next_state, reward)
    if done:
        break
    state = next_state
print("")
print(env.R.shape)

## Question 1: Value iteration
1. Write a function applying the optimal Bellman operator on a provided Q function: $Q_1 = LQ_0, \; Q_0\in \mathbb{R}^{S\times A}$
2. Write a function implementing Value Iteration (VI) with $\infty$-norm stopping condition (reuse function implemented in 1)
3. Evaluate the convergence of your estimate, i.e., plot the value $\|Q_n - Q^\star\|_{\infty} = \max_{s,a} |Q_n(s,a) - Q^\star(s,a)|$

In [0]:
# --------------
# Point 1
# --------------
def bellman_operator(Q0, Ns, Na, R, P, gamma):
  """Apply the optimal Bellman operator once: Q1 = L Q0.

  Args:
    Q0: current Q-function, array of shape (S, A).
    Ns: number of states (unused; kept for interface compatibility).
    Na: number of actions (unused; kept for interface compatibility).
    R: rewards indexed as R[s, a, s'] (multiplied elementwise with P).
    P: transition array indexed as P[s, a, s'].
    gamma: discount factor.

  Returns:
    (Q1, greedy_policy): the updated Q-function of shape (S, A) and, for each
    state, the index of the action maximising Q1.
  """
  # Expected immediate reward of each (s, a): sum over s' of P * R.
  expected_reward = (P * R).sum(axis=2)
  # Value of each state under a greedy choice with respect to Q0.
  state_value = np.max(Q0, axis=1)
  # Scale P before the product, matching the left-to-right `gamma * P @ V`.
  discounted_future = (gamma * P) @ state_value
  Q1 = expected_reward + discounted_future
  return Q1, Q1.argmax(axis=1)

In [0]:
# --------------
# Point 2
# --------------
def value_iteration(Q0, env, epsilon=1e-5, max_iter=None):
  """Run value iteration on Q-functions until successive iterates agree.

  Repeatedly applies `bellman_operator` starting from `Q0` and stops as soon
  as max_{s,a} |Q_{n+1}(s,a) - Q_n(s,a)| <= epsilon. Note that this bounds the
  distance between successive iterates, not the distance to Q* directly.

  Args:
    Q0: initial Q-function, array of shape (env.Ns, env.Na). Not modified.
    env: environment exposing Ns, Na, R, P and gamma.
    epsilon: sup-norm tolerance of the stopping condition.
    max_iter: optional cap on the number of Bellman updates. None (default)
      means no cap, in which case termination relies on the operator being a
      contraction (gamma < 1).

  Returns:
    (Q, greedy_policy, Q_history): the last iterate, the policy greedy with
    respect to it, and the list of all iterates starting with a copy of Q0.
  """
  Q = Q0.copy()
  # Record the private copy rather than the caller's array, so later in-place
  # edits of Q0 cannot alter the history. Q is only ever rebound below, never
  # mutated, so sharing it with the history is safe.
  Q_history = [Q]
  # Defined up front so the return value is valid even when max_iter == 0.
  greedy_policy = np.argmax(Q, axis=1)
  n_iter = 0
  while max_iter is None or n_iter < max_iter:
    previous_Q = Q
    Q, greedy_policy = bellman_operator(Q, env.Ns, env.Na, env.R, env.P, env.gamma)
    Q_history.append(Q)
    n_iter += 1
    if np.max(np.abs(Q - previous_Q)) <= epsilon:
      break
  return Q, greedy_policy, Q_history

In [0]:
# --------------
# Point 3
# --------------
# Reference optimal Q-functions shipped with the course repository, keyed by
# "<environment class name>_<gamma>".
with open("./mvarl_hands_on/data/Q_opts.json", "r") as fp:
    Qopts = json.load(fp)
# json yields nested lists; convert once so the arithmetic below is explicit.
Qstar = np.asarray(Qopts["{}_{}".format(type(env).__name__, env.gamma)])

Q0 = np.zeros((env.Ns, env.Na))
Q, greedy_policy, Q_history = value_iteration(Q0, env)

# Sup-norm distance ||Q_n - Q*||_inf of every iterate to the reference.
norm_values = [np.max(np.abs(Qh - Qstar)) for Qh in Q_history]

plt.figure()
plt.plot(norm_values)
plt.xlabel('Iteration')
plt.ylabel('Error')
# This curve comes from value iteration, not Q-learning.
plt.title("Value iteration: Convergence of Q")
plt.show()

In [0]:
# Roll out the greedy policy obtained from value iteration, drawing the
# environment before the first move and after every transition.
n_render_steps = 50
state = env.reset()
env.render()
for i in range(n_render_steps):
    action = greedy_policy[state]
    transition = env.step(action)
    # Only the next state drives the rollout; reward/done are not used here.
    state, reward, done, _ = transition
    env.render()

## Question 2: Q learning
Q learning is a model-free algorithm for estimating the optimal Q-function online.
It is an off-policy algorithm since the samples are collected with a policy that is (potentially) not the one associated to the estimated Q-function.

1. Implement Q learning with $\epsilon$-greedy exploration.
  - Plot the error in Q-functions over iterations
  - Plot the sum of rewards as a function of iteration


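$\epsilon$-greedy policy (note that, in this notebook, $\epsilon$ is the probability of taking the *greedy* action, which is the reverse of the more common convention where $\epsilon$ is the exploration probability):
$$
\pi(s) = \begin{cases}
\arg\max_a Q(s,a) & \text{w.p. } \epsilon\\
\text{random action} & \text{w.p. } 1 - \epsilon
\end{cases}
$$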

In [0]:
# ---------------------------
# Q-Learning
# ---------------------------
# suggested interface
# you can change it!
class QLearning:
    """
    Tabular Q-learning agent.

    Convention used here: `epsilon` is the probability of taking the GREEDY
    action; with probability 1 - epsilon a uniformly random action is drawn.
    This is the reverse of the more common epsilon-greedy convention, so a
    tiny epsilon makes the agent behave almost uniformly at random.

    Attributes:
        env: environment; Ns, Na and gamma are read from it.
        epsilon: probability of acting greedily in `sample_action`.
        Q: (Ns, Na) Q-table, initialised uniformly at random in [0, 1).
        count: (Ns, Na) number of updates performed on each (state, action).
        alpha: (Ns, Na) last learning rate used for each (state, action).
    """

    def __init__(self, env, epsilon=1e-5):
        self.env = env
        self.epsilon = epsilon
        table_shape = (env.Ns, env.Na)
        self.Q = np.random.random(table_shape)
        self.count = np.zeros(table_shape)
        self.alpha = np.zeros(table_shape)

    def sample_action(self, state):
        """Return the greedy action w.p. epsilon, else a uniform random one."""
        act_greedily = np.random.random() < self.epsilon
        if not act_greedily:
            return np.random.randint(self.env.Na)
        return np.argmax(self.Q[state, :])

    def update(self, state, action, next_state, reward):
        """Apply one Q-learning update for (state, action, next_state, reward)."""
        self.count[state, action] += 1
        n_visits = self.count[state, action]

        # Per-pair learning rate 1 / log(1 + log(10 + n)): below 1 from the
        # first visit on, and decaying very slowly with the visit count.
        step_size = 1 / np.log(1 + np.log(10 + n_visits))
        self.alpha[state, action] = step_size

        # Off-policy target: bootstrap from the best action in next_state.
        td_target = reward + self.env.gamma * np.max(self.Q[next_state, :])
        td_error = td_target - self.Q[state, action]
        self.Q[state, action] += step_size * td_error

In [0]:
# --------------
# Point 1
# --------------
# Number of Q learning steps
max_steps = int(1e5)

# Use the previous code to verify the correctness of q learning
# `epsilon` is only the stopping tolerance of value iteration.
epsilon = 1e-8
Q_opt, pi_opt, _ = value_iteration(Q0, env, epsilon=epsilon)

# Action-selection parameter of the agent, kept separate from the tolerance
# above. QLearning.sample_action takes the greedy action with probability
# `greedy_prob` and a uniformly random action otherwise, so this value gives
# an (almost) uniformly random behaviour policy. Q-learning is off-policy and
# can still estimate Q* from such data, but collected rewards will be poor.
greedy_prob = 1e-8

# main algorithmic loop
ql = QLearning(env, greedy_prob)
norm_values = []
cum_rewards = [0]
t = 0
# Synchronise with the environment: env.step() advances the environment's own
# internal state, which earlier cells left at an arbitrary position.
state = env.reset()
while t < max_steps:
    action = ql.sample_action(state)
    # `done` is intentionally not acted upon: the run is one continuing
    # stream of transitions, matching the model used by value iteration.
    next_state, reward, done, info = env.step(action)

    # Mean absolute error of the current estimate (measured before the update).
    norm_values.append(np.abs(ql.Q - Q_opt).mean())
    # Running (undiscounted) sum of collected rewards.
    cum_rewards.append(cum_rewards[-1] + reward)

    ql.update(state, action, next_state, reward)
    state = next_state

    t += 1

env.render()
print("optimal policy: ", pi_opt)
greedy_policy = np.argmax(ql.Q, axis=1)
print("est policy:", greedy_policy)


plt.figure()
plt.plot(norm_values)
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.title("Q-learning: Convergence of Q")
plt.show()

plt.figure()
plt.plot(cum_rewards)
plt.xlabel('Iteration')
plt.ylabel('Cumulative reward')
plt.title('Q-learning: Cumulative rewards')
plt.show()

# how confident are you in the performance of the algorithm? maybe a single run is not enough