# Reinforcement Learning Workshop
---

## Imports & Installs

In [None]:
!apt install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install box2d-py
!pip install gym==0.17.2 > /dev/null 2>&1
!pip install pyvirtualdisplay==1.3.2 > /dev/null 2>&1

In [None]:
%matplotlib inline

In [None]:
import gym
import glob
import base64
import io
import os
import sys
import tqdm
import numpy as np
import copy
import random
import torch
import time
import pandas as pd
import seaborn as sns
import pyvirtualdisplay
from gym.wrappers import Monitor
import matplotlib.pyplot as plt
from IPython.display import HTML
from collections import defaultdict
from pyvirtualdisplay import Display
from collections import namedtuple, deque
from torch.distributions import Categorical
from IPython import display as ipythondisplay

In [None]:
# Setup of xvfb display server wrapper.
display = Display(visible=0, size=(400, 300))
display.start()
os.environ.get("DISPLAY")

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video(path_prefix='/video/'):
  """Embed a recorded ``.mp4`` from ``path_prefix`` into the notebook output.

  Params:
    path_prefix: Directory searched for ``*.mp4`` files; a trailing '/' is
      appended when missing.

  The first path returned by ``glob`` is read, base64-encoded and inlined into
  an HTML ``<video>`` element. When the directory holds no ``.mp4`` file,
  "Could not find video" is printed instead.
  """
  if not path_prefix.endswith('/'):
    path_prefix += '/'
  mp4list = glob.glob(f'{path_prefix}*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the handle is closed
    # deterministically and the recording is never opened for writing.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    fmt = 'ascii'
    ipythondisplay.display(HTML(
        data=f'<video alt="recording" autoplay loop controls style="height: 400px;">\
               <source src="data:video/mp4;base64,{encoded.decode(fmt)}" type="video/mp4"/>\
               </video>'))
  else:
    print("Could not find video")
    

def wrap_env(env, path_prefix='/video/'):
  """Wrap ``env`` in a gym ``Monitor`` that records into ``path_prefix``.

  Params:
    env: Environment to wrap.
    path_prefix: Output directory; a trailing '/' is appended when missing.
  Return: The ``Monitor``-wrapped environment (created with ``force=True``).
  """
  directory = path_prefix if path_prefix.endswith('/') else path_prefix + '/'
  return Monitor(env, directory, force=True)

In [None]:
# Mounting gdrive..

from google.colab import drive
drive.mount('/content/drive')
!mkdir -p  '/content/drive/My Drive/ml_college_data/rl_workshop/'
rl_workshop_path = '/content/drive/My Drive/ml_college_data/rl_workshop/'
!ls '/content/drive/My Drive/ml_college_data'

## OpenAI Gym
 - introduction of OpenAI gym environment
 - exploring simple and more complex environments
 - how to visualize game play

All environments have their own leaderboards and defined success scores: https://github.com/openai/gym/wiki/Leaderboard#

In [None]:
np.random.choice(list(gym.envs.registry.all()), 5)

#### Simple text based environment exploration

Text environment with fully observable state space. More details can be found at https://gym.openai.com/envs/FrozenLake-v0/



In [None]:
# Creating and resetting of the gym environment..
game = "FrozenLake8x8-v0"
env = gym.make(game)
state = env.reset()
screen = env.render(mode='ansi')
print(screen)

In [None]:
env.observation_space.n

In [None]:
np.arange(64).reshape((8, 8))

In [None]:
env.action_space.n

In [None]:
env.action_space.sample()

In [None]:
# Semantic of operations.
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

In [None]:
state = 1
action = RIGHT
env.P[state][action]

In [None]:
# Exploration of non-deterministic behaviour of environment.
next_state, reward, done, info = env.step(action)
screen = env.render(mode='ansi')
print(screen)
print('Action:', action)
print('Next state:', next_state)
print('Reward:', reward)
print('Done:', done)
print('Info:', info)

In [None]:
env.close()

In [None]:
#############
# Game play #
#############


# Setup of environment.
game = "FrozenLake8x8-v0"
env = gym.make(game)
state = env.reset()

for iteration in range(100):
  # Sampling random action.
  action = env.action_space.sample()
  # Applying action in the envrinment.
  next_state, reward, done, info = env.step(action)
  # Clearing up the screen.
  ipythondisplay.clear_output(wait=True)
  screen = env.render('ansi')
  print(screen)
  print()
  print('Action', action)
  print('Obs', next_state)
  print('Reward', reward)
  print('Done', done)
  print('Info', info)
  time.sleep(0.3)
  if done:
    print(f"Agent end up in {iteration} iterations.")
    break
env.close()

#### Complex Atari environment
A partially observable Pac-Man environment (https://gym.openai.com/envs/MsPacman-v0/) with a richer action space and a continuous state space.

In [None]:
env = gym.make("MsPacman-v0")

In [None]:
env.observation_space

In [None]:
env.action_space.n

In [None]:
env.get_action_meanings()

In [None]:
state = env.reset()
state.shape

In [None]:
plt.figure(figsize=(7, 7))
f = plt.imshow(env.render('rgb_array'))


In [None]:
#############
# Game play #
#############


# Prefix where to save video saved from gameplay
prefix = '/video/pacman/play_1/'
env = wrap_env(gym.make("MsPacman-v0"), prefix)

state = env.reset()
while True:
    # 'human' type of rendenring is suitable when we do the recording.
    env.render('human')
    action = env.action_space.sample()      
    next_state, reward, done, info = env.step(action) 
    if done: 
      break;   
env.close()

show_video(prefix)

#### Questions and experiment suggestions
 - Explore other environments, eg. https://gym.openai.com/envs/CartPole-v1/. There is always a link to actual code of environment to check details about behaviour etc..

## Model Based Methods
 - revisit of frozen lake environment
 - policy evaluation in the fully observable environment
 - policy improvement and iteration
 - visualization of the V and policy

#### Frozen Lake revisited

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="4x4", is_slippery=True)

In [None]:
env.action_space.n

In [None]:
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

policy_mapping = {
    0: "LEFT",
    1: "DOWN",
    2: "RIGHT",
    3: "UP"
}

In [None]:
env.observation_space.n

In [None]:
state = 10
print(env.P[state][LEFT])
print()
prob, next_state, reward, done = env.P[state][LEFT][0]
prob, next_state, reward, done

In [None]:
env.close()

In [None]:
# Comparison of non deterministic and deterministic environment.
env = gym.make(game, map_name="4x4", is_slippery=False)
state = 10
env.P[state][LEFT]

In [None]:
env.close()

#### Policy evaluation

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="4x4", is_slippery=False)

In [None]:
policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
policy

In [None]:
def policy_evaluation(policy, env, gamma=0.9, improvement=1e-8):
  """Computes the state-value function of a policy from the known MDP (``env.P``).

      Values are swept in place, state by state, until the largest change made
      during one full sweep drops below ``improvement``.

      Params:
        policy: np.array([states, actions]) with action probabilities per state.
        env: OpenAI gym environment exposing ``observation_space.n`` and ``P``.
        gamma: Discount of future rewards.
        improvement: Minimal per-sweep change of V required to continue evaluation.
      Return: Value function as np.array([states]).
  """
  n_states = env.observation_space.n
  V = np.zeros(n_states)

  converged = False
  while not converged:
    # Largest single-state change seen in this sweep.
    largest_change = 0
    for state in range(n_states):
      expected_value = 0
      # Expectation over the policy's actions and the environment's responses.
      for action, transitions in env.P[state].items():
        for prob, next_state, reward, done in transitions:
          expected_value += policy[state][action] * prob * (reward + gamma * V[next_state])
      largest_change = max(largest_change, np.abs(V[state] - expected_value))
      # In-place update: later states in the same sweep already see this value.
      V[state] = expected_value
    converged = largest_change < improvement
  return V

In [None]:
# Test your policy evaluation method
# game = "FrozenLake-v0"
# env = gym.make(game, map_name="4x4", is_slippery=False)
policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n

V_expected = np.array([0.0045, 0.0042, 0.0101, 0.0041, 0.0067, 0., 0.0263, 0., 0.0187, 0.0576, 0.107, 0., 0., 0.1304, 0.3915, 0.])
np.testing.assert_array_equal(V_expected, policy_evaluation(policy, env, gamma=0.9, improvement=1e-8).round(4))
# env.close()

In [None]:
V = policy_evaluation(policy, env, gamma=0.9, improvement=1e-8)

In [None]:
def visualize_V(V, shape, figsize=(10, 10)):
  """Draws the value function as an annotated heatmap.

  Params:
    V: Array of state values; reshaped to ``shape`` for plotting.
    shape: Grid shape of the environment, e.g. (4, 4).
    figsize: Size of the matplotlib figure.
  """
  value_grid = pd.DataFrame(V.reshape(shape))
  plt.figure(figsize=figsize)
  sns.heatmap(value_grid, annot=True, linewidths=.5, cmap='Blues', xticklabels=False, yticklabels=False)

In [None]:
visualize_V(V, (4,4))

In [None]:
env.render()

#### Policy improvement

In [None]:
def get_Q(V, env, gamma=0.9):
    """Builds the action-value table Q(s, a) from V and the MDP in ``env.P``.

    Params:
      V: State values indexed by state.
      env: Environment exposing ``observation_space.n``, ``action_space.n`` and ``P``.
      gamma: Discount of future rewards.
    Return: np.array([states, actions]) of expected one-step returns.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))
    for state in range(n_states):
      for action in range(n_actions):
        # Expected value over every possible outcome of taking `action` in `state`.
        Q[state, action] = sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward, done in env.P[state][action]
        )
    return Q

In [None]:
# Test your get_Q fucntion
# game = "FrozenLake-v0"
# env = gym.make(game, map_name="4x4", is_slippery=False)

V_test = np.array([0.0045, 0.0042, 0.0101, 0.0041, 0.0067, 0., 0.0263, 0., 0.0187, 0.0576, 0.107, 0., 0., 0.1304, 0.3915, 0.])
Q_expected = np.array([
       [0.004, 0.006, 0.004, 0.004],
       [0.004, 0.   , 0.009, 0.004],
       [0.004, 0.024, 0.004, 0.009],
       [0.009, 0.   , 0.004, 0.004],
       [0.006, 0.017, 0.   , 0.004],
       [0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.096, 0.   , 0.009],
       [0.   , 0.   , 0.   , 0.   ],
       [0.017, 0.   , 0.052, 0.006],
       [0.017, 0.117, 0.096, 0.   ],
       [0.052, 0.352, 0.   , 0.024],
       [0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.117, 0.352, 0.052],
       [0.117, 0.352, 1.   , 0.096],
       [0.   , 0.   , 0.   , 0.   ]])


np.testing.assert_array_equal(Q_expected, get_Q(V_test, env, gamma=0.9).round(3))
# env.close()

In [None]:
Q = get_Q(V, env)
Q

In [None]:
# Check state-action values.
state = 0
Q[state][DOWN]

In [None]:
def policy_improvement(V, env, gamma=0.9):
    """Generate a new greedy policy from the Q values implied by ``V``.

    Params:
      V: State values indexed by state.
      env: Environment exposing ``observation_space.n``, ``action_space.n`` and ``P``.
      gamma: Discount of future rewards, forwarded to ``get_Q``.
    Return: np.array([states, actions]); in each row the probability mass is
      split evenly between all actions that attain the maximal Q value.
    """
    # Use the generic gym attribute (as every other helper here does) instead of
    # the toy-text specific `env.nA`.
    n_actions = env.action_space.n
    policy = np.zeros([env.observation_space.n, n_actions])
    Q = get_Q(V, env, gamma)
    for state in range(env.observation_space.n):
        # There can be multiple maximal options in Q[state].
        best_actions = np.argwhere(Q[state] == np.max(Q[state])).flatten()
        # We distribute probability of action evenly amongst maximum values.
        policy[state, best_actions] = 1.0 / len(best_actions)
    return policy

In [None]:
policy = policy_improvement(V, env)
policy

In [None]:
def visualize_policy(policy, shape):
  """Renders a policy as a grid of action initials.

  Params:
    policy: np.array([states, actions]) of action probabilities.
    shape: Grid shape used to lay out the states.
  Return: pd.DataFrame where each cell holds the first letters of all actions
    with maximal probability in that state (e.g. "LD" for LEFT or DOWN).
  """
  action_names = {
    0: "LEFT",
    1: "DOWN",
    2: "RIGHT",
    3: "UP"
  }
  cells = []
  for action_probs in policy:
    # Several actions may share the maximal probability; show all of them.
    best_moves = np.argwhere(action_probs == np.max(action_probs)).flatten()
    cells.append("".join(action_names[move][0] for move in best_moves))
  return pd.DataFrame(np.array(cells).reshape(shape))

visualize_policy(policy, (4, 4))

In [None]:
env.close()

#### Policy iteration

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)

In [None]:
env.render()

In [None]:
np.arange(64).reshape((8,8))

In [None]:
# Policy iteration algorithm: alternate evaluation and greedy improvement,
# starting from the uniform random policy.
policy = np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n
improvement = 1e-3
gamma = 0.9

for iteration in tqdm.tqdm(range(1000), total=1000):
  # Calculate V function based on current policy.
  V = policy_evaluation(policy, env, improvement=improvement, gamma=gamma)
  # Improve policy based on new value function. The same discount must be used
  # in both steps, so pass `gamma` explicitly instead of relying on the default.
  new_policy = policy_improvement(V, env, gamma=gamma)
  # Stop once the policy is stable (a few warm-up iterations are always run).
  if np.all(policy == new_policy) and iteration > 5:
    break
  policy = new_policy.copy()

In [None]:
visualize_V(V, (8, 8))

In [None]:
visualize_policy(policy, shape=(8,8))

In [None]:
policy_mapping

In [None]:
env.P[0]

In [None]:
np.arange(64).reshape((8,8))

In [None]:
def get_action(state, policy, action_space_size):
  """Samples one action index for ``state`` from the policy's action probabilities.

  Params:
    state: Key/index of the current state in ``policy``.
    policy: Mapping or array giving action probabilities per state.
    action_space_size: Number of available actions.
  Return: Sampled action index.
  """
  action_probabilities = policy[state]
  drawn = np.random.choice(action_space_size, size=1, p=action_probabilities)
  return drawn[0]

get_action(6, policy, env.action_space.n)

In [None]:
env.close()

In [None]:
#############
# Game play #
#############


game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)
state = env.reset()

for iteration in range(100):
  # Sampling random action from policy.
  action = get_action(state, policy, env.action_space.n)
  # Applying action in the envrinment.
  next_state, reward, done, info = env.step(action)

  ipythondisplay.clear_output(wait=True)
  screen = env.render('ansi')
  print(screen)
  print()
  print('Action', action)
  print('Obs', next_state)
  print('Reward', reward)
  print('Done', done)
  print('Info', info)
  time.sleep(0.3)
  if done:
    print(f"We finished after {iteration} iterations")
    break
  state = next_state  
env.close()

In [None]:
# Evaluate the current `policy` on the slippery 8x8 lake: play 1000 episodes of
# at most 100 steps each and add up the reward of each episode's last step.
# NOTE(review): `policy` is presumably the one produced by the policy-iteration
# cell above (computed on the non-slippery map) - confirm the run order.
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=True)

cumulative_reward = 0
for _ in tqdm.tqdm(range(1000), total=1000):
  state = env.reset()
  for iteration in range(100):
    # Sampling an action from the policy's distribution for this state.
    action = get_action(state, policy, env.action_space.n)
    # Applying the action in the environment.
    next_state, reward, done, info = env.step(action)
    if done:
      break
    state = next_state
  # `reward` is the reward of the last executed step of the episode.
  cumulative_reward += reward  
env.close()

print()
print('Cumulative reward is', cumulative_reward)

#### Questions and experiment suggestions
 - Change environment to `is_slippery=True` and explore final policy, use `env.P` to understand strange moves agent does.
 - Try policy trained in slippery env in non-slippery one and vice versa
 - Write function which calculates cumulative reward of policy in slippery env over multiple runs. 

## Model Free Value Based Methods - Intro
Methods exploring environment without knowledge of `env.P` or state size.
 - monte carlo policy evaluation
 - monte carlo control
 - temporal difference learning
 - q learning

#### Monte Carlo policy evaluation
We don't know how many states the environment has, nor do we know `env.P`.

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)

# Defalut policy for unknow state is random.
policy = defaultdict(lambda: np.ones(env.action_space.n) / env.action_space.n)

In [None]:
def sample_episode_with_policy(env, policy):
  """Plays one full episode in ``env``, choosing actions with ``policy``.

     Params:
       env: Gym environment; it is reset at the start of the episode.
       policy: Mapping state -> action probabilities, consumed by ``get_action``.
     Return: List of tuples in form (state, action, reward), one per step,
       where ``state`` is the state the action was taken in.
  """
  state = env.reset()
  trajectory = []
  done = False
  while not done:
    # Draw an action from the policy's distribution for the current state.
    action = get_action(state, policy, env.action_space.n)
    next_state, reward, done, info = env.step(action)
    trajectory.append((state, action, reward))
    state = next_state
  return trajectory

In [None]:
episode = sample_episode_with_policy(env, policy)

In [None]:
states, actions, rewards = zip(*episode)
states[:10]

In [None]:
def monte_carlo_policy_evaluation(env, policy, episodes=100000, gamma=0.9):
    """Estimates the state-value function of ``policy`` by every-visit Monte Carlo.

    Params:
      env: Gym environment to sample episodes from.
      policy: Mapping state -> action probabilities, as consumed by
        ``sample_episode_with_policy``.
      episodes: Number of episodes to sample.
      gamma: Discount of future rewards.
    Return: defaultdict mapping each visited state to the mean discounted
      return observed from that state (unvisited states read as 0).
    """
    V_sum = defaultdict(lambda: 0)
    V = defaultdict(lambda: 0)
    # Visit counts must start at 0: the count is incremented before dividing,
    # so a default of 1 would turn the estimate into sum / (visits + 1) and
    # bias every value towards zero.
    N = defaultdict(lambda: 0)

    for _ in tqdm.tqdm(range(episodes), total=episodes):
        episode = sample_episode_with_policy(env, policy)
        states, actions, rewards = zip(*episode)
        # discounts[k] == gamma**k; one extra entry keeps the negative slice below valid.
        discounts = np.array([gamma**i for i in range(len(rewards)+1)])
        # For each sampled state we just take the (discounted) future rewards.
        for i, state in enumerate(states):
            V_sum[state] += sum(rewards[i:]*discounts[:-(1+i)])
            N[state] += 1
            V[state] = V_sum[state] / N[state]
    return V

In [None]:
V_88_fully_observed = np.array([
       0.25418658, 0.28242954, 0.3138106 , 0.34867844, 0.38742049,
       0.43046721, 0.4782969 , 0.531441  , 0.28242954, 0.3138106 ,
       0.34867844, 0.38742049, 0.43046721, 0.4782969 , 0.531441  ,
       0.59049   , 0.3138106 , 0.34867844, 0.38742049, 0.        ,
       0.4782969 , 0.531441  , 0.59049   , 0.6561    , 0.34867844,
       0.38742049, 0.43046721, 0.4782969 , 0.531441  , 0.        ,
       0.6561    , 0.729     , 0.3138106 , 0.34867844, 0.38742049,
       0.        , 0.59049   , 0.6561    , 0.729     , 0.81      ,
       0.28242954, 0.        , 0.        , 0.59049   , 0.6561    ,
       0.729     , 0.        , 0.9       , 0.3138106 , 0.        ,
       0.4782969 , 0.531441  , 0.        , 0.81      , 0.        ,
       1.        , 0.34867844, 0.38742049, 0.43046721, 0.        ,
       0.81      , 0.9       , 1.        , 0.        ])


mses = []
for ep in [100, 1000, 10000]:
  V = monte_carlo_policy_evaluation(env, policy, episodes=ep, gamma=0.9)
  V_88_mc = np.zeros(64)
  for state, value in V.items():
    V_88_mc[state] = value
  mse = np.mean(np.power(V_88_fully_observed - V_88_mc, 2))
  mses.append(mse)

print()
print(mses)

In [None]:
visualize_V(V_88_mc, shape=(8,8))

In [None]:
visualize_V(V_88_fully_observed, shape=(8,8))

In [None]:
env.close()

#### Monte Carlo Control  

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)

In [None]:
def get_policy_Qs(Qs, action_space_size, eps=0.99):
  """Builds the eps-greedy action distribution for one state from its Q values.

  Params:
    Qs: Array of action values of a single state.
    action_space_size: Number of available actions.
    eps: Probability mass spread uniformly over all actions (exploration).
  Return: np.array of action probabilities; the remaining ``1 - eps`` is split
    evenly between all actions that attain the maximal Q value.
  """
  exploration_share = eps / action_space_size
  policy = np.ones(action_space_size) * exploration_share
  greedy_actions = np.argwhere(Qs == np.max(Qs)).flatten()
  policy[greedy_actions] = (1 - eps) / len(greedy_actions) + exploration_share
  return policy

def get_action_Q(state, Q, action_space_size, eps=0.99):
  """Samples an action for ``state`` from the eps-greedy policy derived from Q.

  States that have no entry in ``Q`` yet get a uniformly random action; the
  membership test does not create an entry for them.
  """
  candidates = np.arange(action_space_size)
  if state in Q:
    return np.random.choice(candidates, p=get_policy_Qs(Q[state], action_space_size, eps))
  return np.random.choice(candidates)
  

In [None]:
get_policy_Qs(np.array([1,2,3,2,3]), action_space_size=5, eps=0.5)

In [None]:
def sample_episode_with_Q(env, Q, eps=0.99):
  """Plays one full episode, acting eps-greedily with respect to ``Q``.

  Params:
    env: Gym environment; it is reset at the start of the episode.
    Q: Mapping state -> array of action values.
    eps: Exploration rate passed to ``get_action_Q``.
  Return: List of (state, action, reward) tuples, one per step.
  """
  state = env.reset()
  trajectory = []
  done = False
  while not done:
    action = get_action_Q(state, Q, env.action_space.n, eps)
    next_state, reward, done, info = env.step(action)
    trajectory.append((state, action, reward))
    state = next_state
  return trajectory

In [None]:
Q = defaultdict(lambda: np.zeros(env.action_space.n))
episode = sample_episode_with_Q(env, Q, eps=0.99)
episode[:4]

In [None]:
def monte_carlo_control(env, episodes, alpha=0.01, gamma=0.9, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    """Every-visit Monte Carlo control with an eps-greedy policy.

       Params:
          env: Gym environment with a discrete action space.
          episodes: Number of episodes to sample.
          alpha: Constant step size, used in place of the 1/N averaging term.
          gamma: Future reward discount.
          eps_start: Exploration rate used for the first episode.
          eps_decay: Factor eps is multiplied by after every episode.
          eps_min: Lower bound of eps.
      Return: Estimated Q function (defaultdict state -> array of action values)
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    eps = eps_start

    for _ in tqdm.tqdm(range(episodes), total=episodes):
        # Sample a whole episode with the current eps-greedy policy.
        trajectory = sample_episode_with_Q(env, Q, eps)
        states, actions, rewards = zip(*trajectory)
        # discounts[k] == gamma**k; one extra entry keeps the negative slice below valid.
        discounts = np.array([gamma**k for k in range(len(rewards)+1)])
        for step, (state, action) in enumerate(zip(states, actions)):
            # Discounted return observed from this step to the end of the episode.
            observed_return = sum(rewards[step:]*discounts[:-(1+step)])
            # Move the estimate a fraction alpha towards the observed return.
            Q[state][action] = Q[state][action] + alpha * (observed_return - Q[state][action])
        eps = max(eps*eps_decay, eps_min)
    return Q

In [None]:
Q = monte_carlo_control(env, episodes=25000, eps_start=0.5, alpha=0.01)

In [None]:
# For visualization we cheat with prior about size of states.
policy_q = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
for state, Qs in Q.items():
  policy_q[state] = get_policy_Qs(Qs, env.action_space.n, eps=0)
visualize_policy(policy_q, shape=(8, 8))

In [None]:
env.close()

In [None]:
#############
# Game play #
#############

# Replays one episode (at most 100 steps) on the non-slippery 8x8 lake using
# the Q table learned above, printing the board after every step.

game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)
state = env.reset()

for iteration in range(100):
  # Epsilon-greedy action with respect to Q (eps=0.1 keeps a little exploration).
  action = get_action_Q(state, Q, env.action_space.n, eps=0.1)
  # Applying the action in the environment.
  next_state, reward, done, info = env.step(action)

  # Redraw the board in place instead of appending to the cell output.
  ipythondisplay.clear_output(wait=True)
  screen = env.render('ansi')
  print(screen)
  print()
  print('Action', action)
  print('Obs', next_state)
  print('Reward', reward)
  print('Done', done)
  print('Info', info)
  time.sleep(0.05)
  if done:
    # `iteration` is zero-based, so the number of steps taken is iteration + 1.
    print(f"We finished after {iteration} iterations")
    break
  state = next_state  
env.close()

#### Questions and experiment suggestions
 - Change environment to `is_slippery=True` and find sample count which produce reasonable policy
 - Write function calculating cumulative reward of multiple mc-control runs  with given policy and different *eps*

In [None]:
# Evaluate the learned Q on the non-slippery 8x8 lake: play 1000 episodes of at
# most 100 steps each and add up the reward of each episode's last step.
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=False)

cumulative_reward = 0
# Exploration rate used while evaluating.
eps = 0.2
for _ in tqdm.tqdm(range(1000), total=1000):
  state = env.reset()
  for iteration in range(100):
    # Epsilon-greedy action with respect to Q.
    action = get_action_Q(state, Q, env.action_space.n, eps=eps)
    # Applying the action in the environment.
    next_state, reward, done, info = env.step(action)
    if done:
      break
    state = next_state
  # `reward` is the reward of the last executed step of the episode.
  cumulative_reward += reward  
env.close()

print()
print('Cumulative reward is', cumulative_reward)

#### Temporal Difference

In [None]:
game = "FrozenLake-v0"
env = gym.make(game, map_name="8x8", is_slippery=True)

In [None]:
def td_sarsa(env, episodes, alpha=0.01, gamma=0.9, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    """On-policy temporal-difference control (SARSA) with an eps-greedy policy.

       Params:
          env: Gym environment with a discrete action space.
          episodes: Number of episodes to run.
          alpha: Step size of every Q update.
          gamma: Future reward discount.
          eps_start: Initial exploration rate.
          eps_decay: Factor eps is multiplied by at the start of every episode.
          eps_min: Lower bound of eps.
      Return: Estimated Q function (defaultdict state -> array of action values)
    """
    action_space_size = env.action_space.n
    Q = defaultdict(lambda: np.zeros(action_space_size))
    eps = eps_start

    for _ in tqdm.tqdm(range(episodes), total=episodes):
        eps = max(eps*eps_decay, eps_min)
        state = env.reset()
        action = get_action_Q(state, Q, action_space_size, eps)
        # Q is updated after every single step, not at the end of the episode.
        while True:
            next_state, reward, done, info = env.step(action)
            if done:
                # Terminal transition: there is no next action to bootstrap from.
                Q[state][action] += alpha * (reward - Q[state][action])
                break
            next_action = get_action_Q(next_state, Q, action_space_size, eps)
            # Bootstrap from the action the policy will actually take next.
            td_target = reward + gamma * Q[next_state][next_action]
            Q[state][action] += alpha * (td_target - Q[state][action])
            state, action = next_state, next_action
    return Q

In [None]:
Q = td_sarsa(env, episodes=75000, alpha=0.01, eps_start=1)

In [None]:
policy_q = np.ones([64, env.action_space.n]) / env.action_space.n
for state, Qs in Q.items():
  policy_q[state] = get_policy_Qs(Qs, env.action_space.n, eps=0)
visualize_policy(policy_q, shape=(8, 8))

#### Questions and experiment suggestions
 - Use function for calculating cumulative reward and compare MC, SARSA and Q-learning (next part)

In [None]:
# Evaluate the Q table learned by SARSA: play 1000 episodes of at most 100
# steps each and add up the reward of each episode's last step.
cumulative_reward = 0
for _ in tqdm.tqdm(range(1000), total=1000):
  state = env.reset()
  for iteration in range(100):
    # Greedy action with respect to Q (eps=0); ties between maximal values and
    # states missing from Q are still resolved randomly by get_action_Q.
    action = get_action_Q(state, Q, env.action_space.n, eps=0)
    # Applying the action in the environment.
    next_state, reward, done, info = env.step(action)
    if done:
      break
    state = next_state
  # `reward` is the reward of the last executed step of the episode.
  cumulative_reward += reward
# The environment is intentionally not closed here: calling env.close() inside
# the episode loop closed it while it was still being reset and stepped, and the
# same `env` is used again by the Q-learning cells that follow.

print('Cumulative reward is', cumulative_reward)

#### Q-learning control (off-policy)
Learning about the optimal policy while following an exploratory policy.

In [None]:
def Q_learning(env, episodes, alpha=0.01, gamma=0.9, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    """Off-policy temporal-difference control (Q-learning) with eps-greedy exploration.

       Params:
          env: Gym environment with a discrete action space.
          episodes: Number of episodes to run.
          alpha: Step size of every Q update.
          gamma: Future reward discount.
          eps_start: Initial exploration rate.
          eps_decay: Factor eps is multiplied by at the start of every episode.
          eps_min: Lower bound of eps.
      Return: Estimated Q function (defaultdict state -> array of action values)
    """
    action_space_size = env.action_space.n
    Q = defaultdict(lambda: np.zeros(action_space_size))
    eps = eps_start

    for _ in tqdm.tqdm(range(episodes), total=episodes):
        eps = max(eps*eps_decay, eps_min)
        state = env.reset()
        # Q is updated after every single step, not at the end of the episode.
        while True:
            action = get_action_Q(state, Q, action_space_size, eps)
            next_state, reward, done, info = env.step(action)
            if done:
                # Terminal transition: nothing to bootstrap from.
                Q[state][action] += alpha * (reward - Q[state][action])
                break
            # Bootstrap from the best next action, regardless of what is played next.
            td_target = reward + gamma * max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])
            state = next_state
    return Q

In [None]:
Q = Q_learning(env, episodes=25000, alpha=0.01, eps_start=1)

In [None]:
policy_q = np.ones([64, env.action_space.n]) / env.action_space.n
for state, Qs in Q.items():
  policy_q[state] = get_policy_Qs(Qs, env.action_space.n, eps=0)
visualize_policy(policy_q, shape=(8, 8))

In [None]:
# Evaluate the Q table learned by Q-learning: play 1000 episodes of at most 100
# steps each and add up the reward of each episode's last step.
cumulative_reward = 0
for _ in tqdm.tqdm(range(1000), total=1000):
  state = env.reset()
  for iteration in range(100):
    # Greedy action with respect to Q (eps=0); ties between maximal values and
    # states missing from Q are still resolved randomly by get_action_Q.
    action = get_action_Q(state, Q, env.action_space.n, eps=0)
    # Applying the action in the environment.
    next_state, reward, done, info = env.step(action)
    if done:
      break
    state = next_state
  # `reward` is the reward of the last executed step of the episode.
  cumulative_reward += reward
# Close the environment once, after all episodes; it used to be closed inside
# the episode loop, i.e. while it was still being reset and stepped.
env.close()

print('Cumulative reward is', cumulative_reward)

## Model Free Value Based Methods - Deep Q-Learning
 - lunar environment
 - replay buffer
 - qnetwork architecture
 - QAgent 
 - deep qlearning algorithm


#### Environment exploration
More details at https://gym.openai.com/envs/LunarLander-v2/ and in to charts.

In [None]:
game = "LunarLander-v2"
env = gym.make(game)

In [None]:
env.observation_space.shape[0]

In [None]:
env.action_space.n

In [None]:
lunar_actions = {
    'none': 0,
    'left': 1,
    'up': 2,
    'right': 3
}

In [None]:
state = env.reset()
state

In [None]:
plt.figure(figsize=(7,7))
next_state, reward, done, info = env.step(lunar_actions['left'])
plt.imshow(env.render('rgb_array'))

In [None]:
env.close()

In [None]:
logging_path = '/video/lunar/test_1/'
env = wrap_env(gym.make("LunarLander-v2"), logging_path)
state = env.reset()

while True:
    # 'human' type of rendenring is suitable when we do the recording
    env.render('human')
    action = env.action_space.sample()      
    state, reward, done, info = env.step(action) 
    if done: 
      break;   
env.close()
show_video(path_prefix=logging_path)

#### Replay buffer

In [None]:
class QReplayBuffer:
    """Fixed-size buffer to store experience tuples."""
    def __init__(self, buffer_size=int(1e5), batch_size=64, seed=42, device='cpu'):
        """Initialize a ReplayBuffer object.
        Params:
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            device (str): device where tensors are processed
        """

        # Bounded deque: once full, the oldest experience is dropped on append.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.device = device
        # random.seed() returns None, so store the seed value itself and seed
        # the module-level generator (used by sample()) as a separate step.
        self.seed = seed
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self, batch_size=None):
        """Randomly sample a batch of experiences from memory.

        Params:
            batch_size (int): number of experiences to draw; defaults to the
                batch size given at construction.
        Return: tuple (states, actions, rewards, next_states, dones) of tensors
            on ``self.device``; each has one row per sampled experience.
            ``actions`` is int64, all others are float32.
        Raises: ValueError (from ``random.sample``) when more experiences are
            requested than are stored.
        """
        batch_size = batch_size if batch_size is not None else self.batch_size
        batch = [b for b in random.sample(self.memory, k=batch_size) if b is not None]

        def stack(field):
            # One row per experience for the given namedtuple field.
            return np.vstack([getattr(b, field) for b in batch])

        states = torch.from_numpy(stack("state")).float().to(self.device)
        actions = torch.from_numpy(stack("action")).long().to(self.device)
        rewards = torch.from_numpy(stack("reward")).float().to(self.device)
        next_states = torch.from_numpy(stack("next_state")).float().to(self.device)
        # Booleans are converted to 0/1 before becoming floats.
        dones = torch.from_numpy(stack("done").astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def is_ready_to_sample(self):
        """Return True once the buffer holds more experiences than one default batch."""
        return len(self) > self.batch_size

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def set_device(self, device):
        """Change the device that sampled tensors are moved to."""
        self.device = device

In [None]:
##########################################################################################
# Run through the env multiple times and fill in the buffer. Then sample several batches #
##########################################################################################
game = "LunarLander-v2"
env = gym.make(game)
replay_buffer = QReplayBuffer(batch_size=30)

for _ in tqdm.tqdm(range(100)):
  state = env.reset()
  while True:
      action = env.action_space.sample()      
      next_state, reward, done, info = env.step(action)
      replay_buffer.add(state, action, reward, next_state, done)
      state = next_state
      if done: 
        break;   
env.close()

In [None]:
len(replay_buffer)

In [None]:
states, actions, rewards, next_states, dones = replay_buffer.sample(2)
states

#### Q network

In [None]:
class DuelingQNetwork(torch.nn.Module):
    """Dueling Q network: a shared trunk followed by separate value and advantage heads."""

    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        xavier = torch.nn.init.xavier_normal_

        # Batch-norm layers are created (and therefore registered as
        # parameters) but are not applied in forward().
        self.bn0 = torch.nn.BatchNorm1d(state_size)

        # Shared trunk.
        self.fc_layer1 = torch.nn.Linear(state_size, 64)
        xavier(self.fc_layer1.weight)
        self.fc_bn1 = torch.nn.BatchNorm1d(64)

        self.fc_layer2 = torch.nn.Linear(64, 64)
        xavier(self.fc_layer2.weight)
        self.fc_bn2 = torch.nn.BatchNorm1d(64)

        # State-value head: one scalar per state.
        self.value_layer1 = torch.nn.Linear(64, 32)
        xavier(self.value_layer1.weight)
        self.v_bn1 = torch.nn.BatchNorm1d(32)
        self.value_layer2 = torch.nn.Linear(32, 1)

        # Advantage head: one output per action.
        self.action_layer1 = torch.nn.Linear(64, 32)
        xavier(self.action_layer1.weight)
        self.a_bn1 = torch.nn.BatchNorm1d(32)
        self.action_layer2 = torch.nn.Linear(32, action_size)

    def forward(self, state):
        """Build a network that maps state -> action values.

        Expects a 2-D input (the mean over actions is taken along dim 1) and
        returns one value per action for every row.
        """
        activation = torch.nn.functional.leaky_relu

        # Shared part of network.
        hidden = activation(self.fc_layer1(state))
        hidden = activation(self.fc_layer2(hidden))

        # Value part of network.
        value = self.value_layer2(activation(self.value_layer1(hidden)))

        # Advantage part of network.
        advantage = self.action_layer2(activation(self.action_layer1(hidden)))

        # Centre the advantages so that the mean over actions equals the state value.
        return value + (advantage - advantage.mean(dim=1, keepdim=True))

In [None]:
qnetwork = DuelingQNetwork(state_size=8, action_size=4, seed=42)

In [None]:
qnetwork.eval()
##################################################################
# Use replay buffer and let neural net predict over the batches. #
##################################################################
result = qnetwork(states)
states, result

#### Agent

In [None]:
class QAgent():
    """Interacts with and learns from the environment.

    Holds two DuelingQNetwork instances: `qnetwork_local`, which is optimised
    with Adam, and `qnetwork_target`, which supplies the bootstrap targets and
    tracks the local network through soft updates.
    """
    def __init__(self, state_size, action_size, tau=1e-3, eps=0., gamma=0.99, lr=5e-4, seed=0, device='cpu'):
        """Initialize an Agent object.
        Params:
            state_size (int): Dimension of input state.
            action_size (int): Dimension of actions.
            seed (int): Random seed for reproducibility.
            lr (float): Learning rate.
            gamma (float): Reward discount.
            tau (float): For soft update of target network parameters (tau is weight for local network).
            eps (float): For epsilon-greedy action selection, higher eps means more exploration.
            device (str or torch.device): Device the networks are moved to.
        """
        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.eps = eps
        self.state_size = state_size
        self.action_size = action_size

        # Both networks are built with the same seed.
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        # Only the local network is optimised; the target is updated in soft_update().
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=lr)

    def __repr__(self):
        return f'QAgent(state_size={self.state_size}, action_size={self.action_size}, device="{self.device}")'

    def act(self, state, eps=None):
        """Return an epsilon-greedy action for the given state.
        Params:
            state (numpy.array): Current state (a single, un-batched observation).
            eps (float): Overrides the agent's default eps when given.
        Returns:
            Index of the chosen action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if eps is None:
            eps = self.eps

        # Evaluate in eval mode without gradients, then restore the previous mode.
        is_training = self.qnetwork_local.training
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train(is_training)

        # Epsilon-greedy action selection.
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def train(self, replay_buffer, gamma=None, tau=None, batch_size=None):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing (and returns None) while the buffer reports that it is
        not ready to sample.

        Params:
            replay_buffer: Buffer with records from history. Must provide
                `is_ready_to_sample()` and `sample(batch_size)`.
            gamma (float): Discount factor; defaults to the agent's gamma.
            tau (float): For soft update of target network parameters;
                defaults to the agent's tau.
            batch_size (int): Forwarded to `replay_buffer.sample`.
        """
        if gamma is None:
            gamma = self.gamma
        if not replay_buffer.is_ready_to_sample():
            return None
        # NOTE(review): assumes sample() yields column tensors with `actions` as
        # int64 indices and `dones` as 0/1 floats - confirm against QReplayBuffer.
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

        self.qnetwork_local.train()
        # The TD target is a constant for this step. Building it under no_grad
        # avoids back-propagating through the target network, whose parameters
        # the optimiser never updates and whose .grad would otherwise grow
        # with every call.
        with torch.no_grad():
            next_q = self.qnetwork_target(next_states).max(dim=1)[0].unsqueeze(1)
            target = rewards + gamma * next_q * (1 - dones)
        prediction = self.qnetwork_local(states).gather(dim=1, index=actions)
        loss = torch.nn.functional.mse_loss(prediction, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    def soft_update(self, local_model, target_model, tau=None):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): For soft update of target network parameters
        """
        if tau is None:
            tau = self.tau

        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def set_device(self, device):
        """Move both networks to `device` and remember it."""
        self.device = device
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = self.qnetwork_target.to(device)

    def save(self, path):
        """Save the local network's weights to `path`, creating parent folders if needed."""
        folder = os.path.dirname(path)
        # A bare filename has no folder component; nothing to create then.
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(self.qnetwork_local.state_dict(), path)

    def load(self, path):
        """Load weights from `path` into both the local and the target network."""
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        state_dict = torch.load(path, map_location=self.device)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)

In [None]:
agent = QAgent(state_size=8, action_size=4, seed=0)
agent.act(state=np.array([1, 2, 3, 4, 5, 6, 7, 8]))


#### Q learning algorithm

In [None]:
#######################
# Setup of parameters #
#######################

episodes = 2000                           # Number of episodes played.
steps_per_episodes = 1000                 # Maximal amount of steps in one episode.
batch_size = 64                           # Size of batches sampled during training from replay buffer.
train_rate = 4                            # The agent is trained on every train_rate-th step of an episode.
eps = 1.0                                 # Initial eps controlling exploration / exploitation.
eps_decay = 0.995                         # Multiplicative decay applied to eps after every episode.
eps_min = 0.01                            # Lower bound for eps.
gamma = 0.99                              # Reward discounting.
stop_reward = 240                         # Average reward over the rewards window which stops the algorithm.
rewards_window = deque(maxlen=100)        # Buffer for the last 100 episode rewards.
rewards = []                              # Log of all episode rewards.
model_name = 'luna_q'                     # Identifier of model saved params.
game = "LunarLander-v2"
seed=42
env = gym.make(game)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): the agent is seeded with 0 while the replay buffer uses `seed` (42) - confirm this is intended.
agent = QAgent(state_size=8, action_size=4, seed=0, device=device)
replay_buffer = QReplayBuffer(buffer_size=int(1e5),
                             batch_size=batch_size,
                             seed=seed, device=device)


########################
# Q learning algorithm #
########################

for episode_id in range(episodes):
    episode_reward = 0
    # At the start of episode, we restart the environment.
    state = env.reset()
    # Here starts episode.
    for t_step in range(steps_per_episodes):
        # Agent selects action based on current policy.
        action = agent.act(state, eps)
        next_state, reward, done, info = env.step(action)
        # Save experience into replay buffer.
        replay_buffer.add(state, action, reward, next_state, done)
        # Train on every train_rate-th step (agent.train is a no-op until the buffer is ready).
        if t_step % train_rate == 0:
            agent.train(replay_buffer, gamma=gamma, batch_size=batch_size)
        episode_reward += reward
        if done:
            break
        state = next_state
    # After each episode we exploit the current policy more.
    eps = max(eps*eps_decay, eps_min)

    # Reporting.
    rewards_window.append(episode_reward)
    rewards.append(episode_reward)
    print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}', end="")
    # Every 100 episodes keep the progress line and save a checkpoint.
    if episode_id % 100 == 0:
        print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
    # Stop once the windowed average reaches stop_reward.
    # NOTE(review): the window may hold fewer than 100 episodes here, in which
    # case the reported `episode_id-100` is negative.
    if np.mean(rewards_window)>=stop_reward:
        print(f'\nSolved! Took {episode_id-100} episodes\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
        break
env.close()

In [None]:
env.close()

In [None]:
# Per-episode reward collected during training.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(np.arange(len(rewards)), rewards)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
plt.show()

In [None]:
!ls '/content/drive/My Drive/ml_college_data/rl_workshop/models/'

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = QAgent(state_size=8, action_size=4, seed=0, device=device)

In [None]:
# Checkpoint to replay; the file name must match one saved in the models folder.
model_path = os.path.join(rl_workshop_path, 'models/luna-657-240.3-274.9.pth')
video_path = '/video/lunar/'
game = "LunarLander-v2"

agent.load(path=model_path)
env = wrap_env(gym.make(game), video_path)

# Play one greedy episode (eps=0) while the Monitor wrapper records it.
state = env.reset()
done = False
while not done:
    env.render('human')
    action = agent.act(state, eps=0)
    state, reward, done, info = env.step(action)
env.close()
show_video(video_path)

 #### Questions and experiment suggestions:
 - Try out the benefit of a weighted replay buffer (weighting samples by their current error)
 - Simplified vs batch normed vs dueling network 
 - Different eps values during training and evaluation - what are the differences?
 - Try different environments

## Policy Based Methods - Reinforce
 - cart pole environment 
 - simple policy network with categorical sampling
 - building of reinforce loss
 - reinforce agent
 - reinforce algorithm

#### CartPole environment

In [None]:
game = "CartPole-v0"
env = gym.make(game)

###########################
# Explore the environment #
###########################


env.close()

#### Policy network

In [None]:
class ReinforcePolicyNetwork(torch.nn.Module):
    """Two-hidden-layer policy network that outputs action logits and probabilities."""
    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # Layers are created and initialised one after another, in this order,
        # so the seeded generator is consumed in a fixed sequence.
        self.fc_layer1 = torch.nn.Linear(state_size, 64)
        torch.nn.init.xavier_normal_(self.fc_layer1.weight)
        self.fc_layer2 = torch.nn.Linear(64, 32)
        torch.nn.init.xavier_normal_(self.fc_layer2.weight)
        self.action_layer = torch.nn.Linear(32, action_size)

    def forward(self, state):
        """Map a batch of states to (logits, action probabilities)."""
        activation = torch.nn.functional.leaky_relu
        hidden = activation(self.fc_layer1(state))
        hidden = activation(self.fc_layer2(hidden))

        logits = self.action_layer(hidden)
        # Both the raw scores and the softmax over the action dimension are returned.
        return logits, torch.nn.functional.softmax(logits, dim=1)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy = ReinforcePolicyNetwork(state_size=8, action_size=4, seed=42)
policy = policy.to(device)
policy

In [None]:
# A single example state (batch of one). Named `example_state` rather than
# `input` so the Python builtin `input` is not shadowed for the rest of the notebook.
example_state = torch.tensor([[1, 2, 3, 4, 5, 61, 71, 81]], device=device, dtype=torch.float32)
logits, probs = policy(example_state)
probs

In [None]:
##################################################################
# Use Categorial to sample actions from probability distribution #
#                        given by probs.                         #
##################################################################

dist = Categorical(probs)
actions = dist.sample()
#actions = torch.argmax(probs, dim=1)
actions, actions.cpu().item()

#### Construct Reinforce Loss Function

In [None]:
def get_online_reinforce_loss(action_probs, rewards, gamma=1):
    """REINFORCE loss for one episode, using the discounted reward-to-go.

    For step t the return is G_t = sum_k gamma**k * rewards[t + k] and the loss
    is sum_t -log(action_probs[t]) * G_t. The sign is flipped because the
    optimiser minimises while the policy-gradient objective is maximised.

    Params:
        action_probs (iterable of torch.Tensor): Probability of the action
            taken at each step; gradients flow through these.
        rewards (sequence of float): Reward received at each step.
        gamma (float): Discount factor.
    Returns:
        torch.Tensor: Scalar loss.
    """
    rewards = np.array(rewards)
    discounts = np.array([gamma**k for k in range(len(rewards))])

    terms = []
    for step, prob in enumerate(action_probs):
        remaining = rewards[step:]
        # Discounting restarts at gamma**0 for the reward earned at `step`.
        future_return = sum(remaining * discounts[:len(remaining)])
        terms.append(-torch.log(prob) * future_return)
    return torch.stack(terms).sum()

In [None]:
action_probs = torch.tensor([0.3, 0.1, 0.5], requires_grad=True)
rewards = [1,-1 , 1]

loss = get_online_reinforce_loss(action_probs, rewards)
print(loss)
assert loss.requires_grad
# Two solutions are accepted: reward-to-go weighting (~1.8971) and weighting
# every step by the whole-episode return (~4.1997). Compare with a tolerance,
# because exact float equality is fragile across library versions and devices.
assert np.isclose(loss.cpu().item(), [1.8971199989318848, 4.199705123901367], atol=1e-5).any()

#### Reinforce Agent

In [None]:
class ReinforceAgent():
    """Interacts with and learns from the environment using the REINFORCE loss."""
    def __init__(self, state_size, action_size, gamma=1, lr=1e-3, seed=42, device='cpu'):
        """Initialize an Agent object.
        Params:
            state_size (int): Dimension of input state.
            action_size (int): Dimension of actions.
            gamma (float): Reward discount.
            lr (float): Learning rate of the Adam optimizer.
            seed (int): Random seed for reproducibility.
            device (str or torch.device): Device the policy is moved to.
        """
        self.device = device
        self.gamma = gamma
        self.state_size = state_size
        self.action_size = action_size

        self.policy = ReinforcePolicyNetwork(state_size, action_size, seed).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    def __repr__(self):
        return f'ReinforceAgent(state_size={self.state_size}, action_size={self.action_size}, device="{self.device}")'

    def act(self, state, training_process=True, deterministic=False):
        """Return action and its probability for given state and current policy.
        Params:
            state (numpy.array): Current state (a single, un-batched observation).
            training_process (bool): If True the forward pass keeps gradient
                information; otherwise it runs in eval mode under no_grad.
            deterministic (bool): If False, we sample action from distribution, otherwise we take maximum.
        Returns:
            (int, torch.Tensor): Action index and the scalar probability of
            that action (with requires_grad when training_process is True).
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Remember the module mode so it can be restored after the forward pass.
        was_training = self.policy.training
        if training_process:
            self.policy.train()
            logits, probs = self.policy(state)
        else:
            self.policy.eval()
            with torch.no_grad():
                logits, probs = self.policy(state)
        self.policy.train(was_training)

        if deterministic:
            action = torch.argmax(probs, dim=1)
        else:
            # The distribution is only needed when an action is sampled.
            action = Categorical(probs).sample()
        action = action.cpu().item()
        return action, probs[0, action]

    def train(self, action_probs, rewards, gamma=None):
        """Update policy parameters using action probs and rewards from given episode.
        Params:
            action_probs (list of torch.Tensor): Probabilities of sampled actions.
            rewards (list): List of rewards from the whole episode.
            gamma (float): Discount factor; defaults to the agent's gamma.
        """
        if gamma is None:
            gamma = self.gamma
        loss = get_online_reinforce_loss(action_probs=action_probs, rewards=rewards, gamma=gamma)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def set_device(self, device):
        """Move the policy to `device` and remember it."""
        self.device = device
        self.policy = self.policy.to(device)

    def save(self, path):
        """Save the policy weights to `path`, creating parent folders if needed."""
        folder = os.path.dirname(path)
        # A bare filename has no folder component; nothing to create then.
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(self.policy.state_dict(), path)

    def load(self, path):
        """Load policy weights from `path` onto the agent's device."""
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        self.policy.load_state_dict(torch.load(path, map_location=self.device))


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

reinforce_agent = ReinforceAgent(4, 2, device=device)
reinforce_agent

In [None]:
reinforce_agent.act(state=np.array([1, 2, 3, 4]))

#### Reinforce learning

In [None]:
#######################
# Setup of parameters #
#######################

episodes = 20000                      # Maximal number of episodes played.
steps_per_episodes = 1000             # Upper bound on steps taken in one episode.
stop_reward = 200                     # Windowed average reward which stops the training.
rewards_window = deque(maxlen=100)    # Rewards of the last 100 episodes.
rewards = []                          # Log of all episode rewards.
gamma = 1                             # Reward discount (1 means no discounting).
lr = 0.001                            # Learning rate.
model_name='pole'                     # Prefix of saved checkpoint files.
game = "CartPole-v0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make(game)
agent = ReinforceAgent(state_size=4, action_size=2, seed=0, gamma=gamma, lr=lr, device=device)

#######################
# Reinforce algorithm #
#######################

for episode_id in range(episodes):
    # List of rewards collected during one episode.
    episode_rewards = []
    # Action probs collected during one episode.
    episode_action_probs = []
    state = env.reset()
    for _ in range(steps_per_episodes):
        # Keep information about gradient flow.
        action, action_prob = agent.act(state, training_process=True, deterministic=False)
        next_state, reward, done, info = env.step(action)
        # Simplified buffer fill in.
        episode_rewards.append(reward)
        episode_action_probs.append(action_prob)
        if done:
            break
        state = next_state
    # Agent is trained after each episode; no gamma is passed, so the agent's own gamma is used.
    agent.train(action_probs=episode_action_probs, rewards=episode_rewards)

    # Reporting.
    episode_reward = sum(episode_rewards)
    rewards_window.append(episode_reward)
    rewards.append(episode_reward)
    print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}', end="")
    # Every 100 episodes keep the progress line and save a checkpoint.
    if episode_id % 100 == 0:
        print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
    # Stop once the windowed average reaches stop_reward.
    # NOTE(review): the window may hold fewer than 100 episodes here, in which
    # case the reported `episode_id-100` is negative.
    if np.mean(rewards_window)>=stop_reward:
        print(f'\nSolved! Took {episode_id-100} episodes\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
        break

env.close()

In [None]:
env.close()

In [None]:
# Per-episode reward collected during training.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(np.arange(len(rewards)), rewards)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
plt.show()

In [None]:
!ls '/content/drive/My Drive/ml_college_data/rl_workshop/models/'

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = ReinforceAgent(4, 2, device=device)
agent

In [None]:
# Checkpoint to replay; the file name must match one saved in the models folder.
model_path = os.path.join(rl_workshop_path, 'models/pole-815-200.0-200.0.pth')
video_path = '/video/pole_exp1/'
game = "CartPole-v0"

agent.load(path=model_path)
env = wrap_env(gym.make(game), video_path)

# Play one episode with the arg-max action and no gradient tracking.
state = env.reset()
done = False
while not done:
    env.render('human')
    action, action_prob = agent.act(state, training_process=False, deterministic=True)
    state, reward, done, info = env.step(action)
env.close()
show_video(video_path)

#### Questions and experiment suggestions
 - Use only the future reward (reward-to-go) for each action
 - What would happen if we normalized the rewards?
 - Train other environments, e.g. `LunarLander` again..

## Policy based methods - PPO
 - pong environment
 - preparing input data for agent
 - computing PPO loss function

#### Observe Pong Env

In [None]:
game = "Pong-v0"
env = gym.make(game)

In [None]:
env.observation_space

In [None]:
env.action_space

In [None]:
env.get_action_meanings()

In [None]:
state = env.reset()
state.shape

In [None]:
plt.figure(figsize=(7, 7))
plt.imshow(state)

In [None]:
next_state, reward, done, info = env.step(1)
reward, done, info

In [None]:
env.close()

#### State space preprocessing

In [None]:
def state_preprocess(state):
    """Preprocess 210x160x3 uint8 frame into 6000 (75x80) 1D float vector.

    The frame is cropped, downsampled by two in both directions, reduced to
    its first colour channel and binarised (background 0, everything else 1).
    The caller's array is left untouched.

    Params:
        state (numpy.array or None): Raw frame, or None when there is no frame
            (for example no previous frame at the start of an episode).
    Returns:
        numpy.array: float32 vector of length 6000; all zeros for None.
    """
    if state is None:
        # Same type and dtype as the regular return value.
        return np.zeros(6000, dtype=np.float32)

    # Keep rows 35..184 (drops 35 rows at the top and 25 at the bottom), take
    # every second pixel and only channel 0. Basic slicing yields a view, so
    # copy before writing, otherwise the caller's frame would be modified.
    frame = state[35:185:2, ::2, 0].copy()
    frame[frame == 144] = 0   # Erase background (background type 1).
    frame[frame == 109] = 0   # Erase background (background type 2).
    frame[frame != 0] = 1     # Everything else (paddles, ball) is set to 1.
    return frame.astype(np.float32).ravel()

In [None]:
def get_final_state(state, prev_state):
    """Build the agent input from two consecutive frames.

    Both frames are preprocessed and concatenated, current frame first.
    An alternative worth experimenting with is the difference of the two
    preprocessed frames.
    """
    current = state_preprocess(state)
    previous = state_preprocess(prev_state)
    return np.concatenate((current, previous))

In [None]:
state_preprocess(None)

In [None]:
state.shape

In [None]:
plt.figure(figsize=(7,7))
plt.imshow(state_preprocess(state).reshape((75, 80)))

In [None]:
get_final_state(state, None).shape

#### Replay Buffer

In [None]:
def get_discounted_rewards(rewards, gamma):
    """Return normalised discounted returns for a sequence of rewards.

    The running return is reset whenever a non-zero reward is met while
    walking backwards, so discounting never crosses a scored point. The
    result is standardised to zero mean and unit standard deviation (the
    standard deviation is replaced by 1 when it is 0).

    Params:
        rewards (sequence of float): Reward per step, in time order.
        gamma (float): Discount factor.
    Returns:
        numpy.array: Normalised discounted return per step.
    """
    reversed_returns = []
    running = 0
    for r in rewards[::-1]:
        # Specific for pong - a non-zero reward marks the end of a rally.
        if r != 0:
            running = 0
        running = r + gamma * running
        # Appending and reversing once is O(n); inserting at the front is O(n^2).
        reversed_returns.append(running)

    if not reversed_returns:
        # Nothing to normalise; avoids mean/std of an empty array.
        return np.array([])

    discounted_rewards = np.array(reversed_returns[::-1])
    std = discounted_rewards.std()
    std = 1 if std==0 else std
    return (discounted_rewards - discounted_rewards.mean()) / std

In [None]:
get_discounted_rewards(np.array([0,0,0,0,1,0,0,0,0,-1]), gamma=0.9)

In [None]:
class PPOReplayBuffer:
    """Growing buffer of (state, action, action probability, reward) records.

    Records are appended until `empty_buffer` is called. Discounted, normalised
    rewards are recomputed lazily on the first `sample` after new data arrived.
    """
    def __init__(self, batch_size=64, gamma=0.99, seed=42, device='cpu'):
        """Initialize a ReplayBuffer object.
        Params:
            batch_size (int): default size of each training batch
            gamma (float): discount used for the discounted rewards
            seed (int): random seed
            device (str): device where tensors are processed
        """
        self.batch_size = batch_size
        self.gamma = gamma
        self.is_reward_recalculated = False
        self.device = device
        # random.seed() seeds the global `random` module and returns None.
        self.seed = random.seed(seed)

        # Those are values we want to collect.
        self.states = []
        self.actions = []
        self.action_probs = []
        self.rewards = []
        self.discounted_rewards = []

    def add(self, state, action, action_prob, reward):
        """Add a new experience to memory."""
        # New data invalidates the cached discounted rewards.
        self.is_reward_recalculated = False
        self.states.append(state)
        self.actions.append(action)
        self.action_probs.append(action_prob)
        self.rewards.append(reward)

    def sample(self, batch_size=None):
        """Randomly sample a batch of experiences from memory.
        Params:
            batch_size (int): number of records; defaults to the buffer's batch_size.
        Returns:
            Tuple of tensors (states, actions, action_probs, discounted rewards)
            on the buffer's device.
        """
        if batch_size is None:
            batch_size = self.batch_size
        idxs = random.sample(range(len(self.actions)), batch_size)

        # We need to recalculate because of normalization.
        if not self.is_reward_recalculated:
            self.update_discounted_rewards()

        def column(values):
            # vstack of scalars gives an (n, 1) array.
            return torch.from_numpy(np.vstack([values[idx] for idx in idxs]))

        states_batch = column(self.states).float().to(self.device)
        # squeeze(1) drops only the column axis, so a batch of one stays 1-D.
        actions_batch = column(self.actions).long().to(self.device).squeeze(1)
        action_probs_batch = column(self.action_probs).float().to(self.device).squeeze(1)
        rewards_batch = column(self.discounted_rewards).float().to(self.device).squeeze(1)
        return states_batch, actions_batch, action_probs_batch, rewards_batch

    def update_discounted_rewards(self, gamma=None):
        """Recompute the normalised discounted rewards over the whole buffer."""
        gamma = self.gamma if gamma is None else gamma
        self.discounted_rewards = get_discounted_rewards(self.rewards, gamma)
        self.is_reward_recalculated = True

    def is_ready_to_sample(self):
        """Return True when the buffer holds more records than one default batch."""
        return len(self) > self.batch_size

    def empty_buffer(self):
        """Drop all stored records."""
        # Because of big variance, we clean up buffer frequently.
        self.states = []
        self.actions = []
        self.action_probs = []
        self.rewards = []
        self.discounted_rewards = []
        self.is_reward_recalculated = False

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.actions)

    def set_device(self, device):
        """Set the device sampled tensors are moved to."""
        self.device = device

In [None]:
buffer = PPOReplayBuffer()
for i in range(10):
  buffer.add([1, 2, 3, 4], 1, 0.74, 1 if i%4==0 and i>0 else 0)

In [None]:
state, action, action_prob, reward = buffer.sample(4)
reward, action_prob

#### Policy network

In [None]:
class PPOPolicyNetwork(torch.nn.Module):
    """Single-hidden-layer policy network that outputs action logits and probabilities."""
    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        hidden_units = 512
        # One hidden layer only; a second hidden layer is an experiment left to the reader.
        self.fc_layer1 = torch.nn.Linear(state_size, hidden_units)
        torch.nn.init.xavier_normal_(self.fc_layer1.weight)
        self.action_layer = torch.nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Map a batch of states to (logits, action probabilities)."""
        hidden = torch.nn.functional.leaky_relu(self.fc_layer1(state))
        logits = self.action_layer(hidden)
        # Both the raw scores and the softmax over the action dimension are returned.
        return logits, torch.nn.functional.softmax(logits, dim=1)


In [None]:
states_batch, actions_batch, action_probs_batch, rewards_batch = buffer.sample(4)
states_batch, actions_batch, action_probs_batch, rewards_batch 

In [None]:
policy = PPOPolicyNetwork(state_size=4, action_size=2, seed=42)
policy(states_batch)

#### Constructing PPO loss functions

In [None]:
states_batch = torch.tensor([[1., 2., 3., 4.],
                            [1., 2., 3., 4.],
                            [1., 2., 3., 4.],
                            [1., 2., 3., 4.]])

actions_batch = torch.tensor([1, 1, 1, 1])
action_probs_batch = torch.tensor([0.7400, 0.7400, 0.7400, 0.7400])
rewards_batch = torch.tensor([0.3778, 0.2776, 0.3106, 0.3778])

In [None]:
def get_new_action_probs(policy, states, actions, device='cpu'):
    """Return the current policy's probability of each given action.

    Works for any number of actions: the probability of the taken action is
    picked from each row of the policy output.

    Params:
        policy (callable): Returns (logits, probs) for a batch of states.
        states (torch.Tensor): Batch of states passed to the policy.
        actions (torch.Tensor): Action index per sample, shape (n,) or (n, 1).
        device: Unused; kept so existing callers keep working.
    Returns:
        torch.Tensor: Shape (n,), differentiable with respect to the policy.
    """
    ###################################
    # Analyze what code actually does #
    ###################################
    logits, probs = policy(states)
    # One column of indices, on the same device as the probabilities.
    index = actions.to(probs.device).long().reshape(-1, 1)
    # gather picks probs[i, index[i]] for every row i.
    return probs.gather(dim=1, index=index).squeeze(1)

In [None]:
get_new_action_probs(policy, states_batch, actions_batch)

In [None]:
def get_ppo_loss(policy, states, actions, action_probs, rewards, eps_clip=0.1, device='cpu'):
    """Clipped PPO surrogate loss.

    The ratio between the current and the recorded action probabilities is
    weighted by the rewards, once as is and once clamped to
    [1 - eps_clip, 1 + eps_clip]. The element-wise minimum of the two is
    negated and averaged.
    """
    current_probs = get_new_action_probs(policy, states, actions, device=device)

    ratio = current_probs / action_probs
    unclipped = ratio * rewards
    clipped = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * rewards
    return torch.mean(-torch.min(unclipped, clipped))

In [None]:
# Compare with a tolerance: exact float equality on a network output is fragile.
assert np.isclose(get_ppo_loss(policy, states_batch, actions_batch, action_probs_batch, rewards_batch, eps_clip=0.1).item(), -0.22267144918441772, atol=1e-6)

In [None]:
def get_reinforce_loss(policy, states, actions, action_probs, rewards):
    """Reward-weighted cross-entropy loss.

    The per-sample cross-entropy between the policy logits and the taken
    actions is multiplied by the rewards and averaged. `action_probs` is
    accepted for signature parity with get_ppo_loss and is not used.
    """
    logits, _ = policy(states)
    per_sample = torch.nn.functional.cross_entropy(logits, actions, reduction='none')
    return torch.mean(per_sample * rewards)

In [None]:
# Compare with a tolerance: exact float equality on a network output is fragile.
assert np.isclose(get_reinforce_loss(policy, states_batch, actions_batch, action_probs_batch, rewards_batch).item(), 0.23932071030139923, atol=1e-6)

#### PPO Agent

In [None]:
class PPOAgent():
    """Interacts with and learns from the environment using the clipped PPO loss."""
    def __init__(self, state_size, action_size, lr=1e-3, seed=42, device='cpu'):
        """Initialize an Agent object.
        Params:
            state_size (int): Dimension of input state.
            action_size (int): Dimension of actions.
            lr (float): Learning rate of the Adam optimizer.
            seed (int): Random seed for reproducibility.
            device (str or torch.device): Device the policy is moved to.
        """
        self.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.policy = PPOPolicyNetwork(state_size, action_size, seed).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)

    def __repr__(self):
        return f'PPOAgent(state_size={self.state_size}, action_size={self.action_size}, device="{self.device}")'

    def act(self, state, deterministic=False):
        """Return an action and its probability for given state and current policy.
        Params:
            state (array_like): Current state (a single, un-batched observation).
            deterministic (bool): If False, we sample action from distribution, otherwise we take maximum.
        Returns:
            (int, float): Action index and the probability of that action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Evaluate in eval mode without gradients, then restore the previous mode.
        was_training = self.policy.training
        self.policy.eval()
        with torch.no_grad():
            logits, probs = self.policy(state)
        self.policy.train(was_training)

        if deterministic:
            action = torch.argmax(probs, dim=1).cpu().item()
        else:
            # The distribution is only needed when an action is sampled.
            action = Categorical(probs).sample().cpu().item()
        return action, probs[0, action].cpu().item()

    def train(self, buffer, training_iterations, batch_size=64, eps_clip=0.1):
        """Run several PPO optimisation steps on batches sampled from `buffer`.
        Params:
            buffer: Replay buffer providing `sample(batch_size)`.
            training_iterations (int): Number of sampled batches / optimizer steps.
            batch_size (int): Size of every sampled batch.
            eps_clip (float): Clipping range of the probability ratio.
        """
        for _ in range(training_iterations):
            states_batch, actions_batch, action_probs_batch, rewards_batch = buffer.sample(batch_size)
            self.optimizer.zero_grad()
            loss = get_ppo_loss(self.policy, states=states_batch,
                                actions=actions_batch,
                                action_probs=action_probs_batch,
                                rewards=rewards_batch,
                                eps_clip=eps_clip,
                                device=self.device)

            loss.backward()
            self.optimizer.step()

    def set_device(self, device):
        """Move the policy to `device` and remember it."""
        self.device = device
        self.policy = self.policy.to(device)

    def save(self, path):
        """Save the policy weights to `path`, creating parent folders if needed."""
        folder = os.path.dirname(path)
        # A bare filename has no folder component; nothing to create then.
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(self.policy.state_dict(), path)

    def load(self, path):
        """Load policy weights from `path` onto the agent's device."""
        self.policy.load_state_dict(torch.load(path, map_location=self.device))

In [None]:
agent = PPOAgent(action_size=2, state_size=4)

In [None]:
agent.act(np.array([1,2,3,4]))

#### PPO training

In [None]:
#######################
# Setup of parameters #
#######################

game = "Pong-v0"
episodes = 20000                      # Maximal number of episodes played.
steps_per_episodes = 100000           # Upper bound on steps taken in one episode.
# NOTE(review): 200 looks copied from the CartPole cell; Pong episode scores are
# presumably far below that, so the stop condition may never trigger - confirm.
stop_reward = 200
gamma = 0.99                          # Reward discount.
eps_clip = 0.1                        # Clipping range of the PPO probability ratio.
train_every = 10                      # Number of collected episodes per training round.
model_name='pong'                     # Prefix of saved checkpoint files.
env = gym.make(game)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = PPOAgent(state_size=12000, action_size=2, device=device)
replay_buffer = PPOReplayBuffer(gamma=gamma, device=device)
rewards_window = deque(maxlen=100)    # Rewards of the last 100 episodes.
rewards = []                          # Log of all episode rewards.


###################
#  PPO algorithm  #
###################

for episode_id in range(episodes):
    episode_reward = 0
    # We need to keep 2 state sequence to get dynamics.
    state, prev_state = env.reset(), None
    for _ in range(steps_per_episodes):
        # Preprocess states.
        final_state = get_final_state(state, prev_state)
        # Run the agent and get action + prob.
        action, action_prob = agent.act(final_state)
        # We increase action +2 to keep real semantic.
        next_state, reward, done, info = env.step(action+2)
        # Save data to replay buffer.
        replay_buffer.add(final_state, action, action_prob, reward)
        episode_reward += reward
        if done:
            break
        prev_state = state
        state = next_state
    # Agent is trained after a few episodes & buffer is emptied.
    # The previous condition `episode_id % 10` was true for nine out of ten
    # episodes, i.e. it trained after almost every episode.
    if (episode_id + 1) % train_every == 0:
        agent.train(replay_buffer, training_iterations=10, batch_size=512, eps_clip=eps_clip)
        replay_buffer.empty_buffer()

    # Reporting.
    rewards_window.append(episode_reward)
    rewards.append(episode_reward)
    print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}', end="")
    # Every 50 episodes keep the progress line and save a checkpoint.
    if episode_id % 50 == 0:
        print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
    if np.mean(rewards_window)>=stop_reward:
        print(f'\nSolved! Took {episode_id-100} episodes\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
        break
env.close()

In [None]:
env.close()

In [None]:
!ls '/content/drive/My Drive/ml_college_data/rl_workshop/models/'

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = PPOAgent(state_size=12000, action_size=2, device=device)

In [None]:
# Checkpoint to replay; the file name must match one saved in the models folder.
model_path = os.path.join(rl_workshop_path, 'models/pong-2650--8.1-3.0.pth')
agent.load(path=model_path)

video_path = '/video/pong_reinforce/'
game = "Pong-v0"
env = wrap_env(gym.make(game), video_path)

# Play one episode with the arg-max action; there is no previous frame at the start.
prev_state = None
state = env.reset()
for _ in range(100000):
    env.render('human')
    # The agent sees the current and the previous preprocessed frame.
    final_state = get_final_state(state, prev_state)
    action, action_prob = agent.act(final_state, deterministic=True)
    # The agent's action ids 0/1 are shifted by 2 to get the environment action ids.
    next_state, reward, done, info = env.step(action+2)
    if done:
        break
    prev_state, state = state, next_state
env.close()
show_video(video_path)


#### Questions and experiment suggestions
 - Experiment with policy network complexity - adding/removing layers can make big difference.
 - Compare the progression with different hyperparam setup.
 - Try to use crossentropy loss instead of PPO reweight clipped loss. 
 - Implement PPO for lunar module - think through discount mechanism.

## Actor Critic Methods
 - explore motion based environment
 - gaussian process for randomizing actions
 - actor/critic network
 - AC agent
 - AC training  

#### Environment

In [None]:
env = gym.make('BipedalWalker-v3')

In [None]:
env.action_space.sample()

In [None]:
env.observation_space

In [None]:
env.close()

In [None]:
logging_path = '/video/walker/test/'
env = wrap_env(gym.make('BipedalWalker-v3'), logging_path)

# Play one episode with uniformly sampled actions.
state = env.reset()
done = False
while not done:
    # 'human' rendering is suitable when we do the recording.
    env.render('human')
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
env.close()
show_video(path_prefix=logging_path)

#### Exploration of continuous space

In [None]:
class OUNoise:
    """Ornstein-Uhlenbeck process producing temporally correlated noise.

    Each call to `sample` moves the internal state by
    `theta * (mu - state) + sigma * N(0, 1)`, i.e. the state is pulled back
    towards `mu` while being perturbed by Gaussian noise.

    Params:
        size (int): Number of independent noise dimensions.
        seed (int): Seed of the private random generator used for the noise.
        mu (float): Mean the process reverts to.
        theta (float): Strength of the pull towards `mu`.
        sigma (float): Initial scale of the Gaussian perturbation.
        sigma_min (float): Lower bound for `sigma` after decay.
        sigma_decay (float): Factor applied to `sigma` on every `reset`.
    """

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2, sigma_min=0.05, sigma_decay=.975):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.sigma_min = sigma_min
        self.sigma_decay = sigma_decay
        # Kept for backward compatibility: constructing the noise has always
        # seeded Python's global `random` module as a side effect.
        random.seed(seed)
        self.seed = seed
        # The noise itself is drawn from numpy, so it needs its own seeded
        # generator; seeding `random` alone left the samples irreproducible.
        self.rng = np.random.RandomState(seed)
        self.size = size
        # Note: reset() also applies one decay step to sigma.
        self.reset()

    def reset(self):
        """Reset the state to `mu` and decay `sigma` (never below `sigma_min`)."""
        self.state = copy.copy(self.mu)
        self.sigma = max(self.sigma_min, self.sigma * self.sigma_decay)

    def sample(self):
        """Advance the process by one step and return the new state (array of `size`)."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.size)
        self.state = x + dx
        return self.state

In [None]:
# Compare the distribution of the noise for the initial sigma and for a small sigma.
plt.figure(figsize=(7,7))
noise = OUNoise(size=1, seed=42)
# `shade` is a boolean switch (it does not take a colour); label each curve instead.
sns.kdeplot([noise.sample()[0] for _ in range(10000)], shade=True, label=f'sigma={noise.sigma:.3f}')
noise.sigma = 0.05
sns.kdeplot([noise.sample()[0] for _ in range(10000)], shade=True, label=f'sigma={noise.sigma:.3f}')
plt.xlabel('Noise value')
plt.legend();

In [None]:
# Plot consecutive noise samples over time; `noise` still has sigma=0.05 from the previous cell.
plt.figure(figsize=(7,7))
plt.plot([noise.sample()[0] for _ in range(10000)])

#### Replay Buffer

In [None]:
class ACReplayBuffer:
    """Fixed-size buffer to store experience tuples for actor-critic training.

    Experiences are kept in a bounded deque (oldest entries are dropped first)
    and returned by `sample` as float tensors, one row per experience.
    """
    def __init__(self, buffer_size=int(1e5), batch_size=64, seed=42, device='cpu'):
        """Initialize a ReplayBuffer object.
        Params:
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            device (str): device where tensors are processed
        """

        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.device = device
        # Sampling uses Python's global `random` module, which is seeded here.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def _to_float_tensor(self, values):
        """Stack per-experience values row-wise into a float32 tensor on `self.device`."""
        stacked = np.vstack(values).astype(np.float32)
        return torch.from_numpy(stacked).to(self.device)

    def sample(self, batch_size=None):
        """Randomly sample a batch of experiences from memory.

        Params:
            batch_size (int): number of experiences; defaults to `self.batch_size`.
        Returns:
            Tuple (states, actions, rewards, next_states, dones) of float tensors.
        Raises:
            ValueError: if more experiences are requested than are stored.
        """
        batch_size = batch_size if batch_size is not None else self.batch_size
        batch = [b for b in random.sample(self.memory, k=batch_size) if b is not None]

        states = self._to_float_tensor([b.state for b in batch])
        # Actions are continuous, so they must stay floating point: casting
        # them to integers truncated every value inside (-1, 1) to 0.
        actions = self._to_float_tensor([b.action for b in batch])
        rewards = self._to_float_tensor([b.reward for b in batch])
        next_states = self._to_float_tensor([b.next_state for b in batch])
        # Done flags become 0.0 / 1.0 so they can mask the bootstrapped target.
        dones = self._to_float_tensor([b.done for b in batch])
        return (states, actions, rewards, next_states, dones)

    def is_ready_to_sample(self):
        """Return True once more than `batch_size` experiences are stored."""
        return len(self) > self.batch_size

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def set_device(self, device):
        """Change the device on which sampled tensors are created."""
        self.device = device

In [None]:
# Try to fill in data into buffer and sample batches from it.
# Smoke test: play 100 random episodes and store every transition in a buffer.
game = "BipedalWalker-v3"
env = gym.make(game)
replay_buffer = ACReplayBuffer(batch_size=30)

for _ in tqdm.tqdm(range(100)):
    state, done = env.reset(), False
    while not done:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
env.close()

In [None]:
# Sample 10 transitions (overriding the buffer's default batch_size of 30) and show the tensors.
replay_buffer.sample(10)

#### Actor Critic Networks

In [None]:
def hidden_init(layer):
    """Return the uniform initialisation range for a layer's weights.

    Params:
        layer: Layer with a 2-D `weight` of shape (out_features, in_features),
            such as `torch.nn.Linear`.
    Returns:
        Tuple (-lim, lim) with lim = 1 / sqrt(fan_in).
    """
    # torch.nn.Linear stores its weight as (out_features, in_features), so the
    # number of inputs (fan-in) is dimension 1; dimension 0 is the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)


class Actor(torch.nn.Module):
    """Actor (Policy) Model: maps a state to an action vector with entries in [-1, 1]."""

    def __init__(self, state_size, action_size, seed):
        """Build the two-layer policy network.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """

        super().__init__()
        # Seed torch before any layer is created so the initial weights are reproducible.
        self.seed = torch.manual_seed(seed)

        # Hidden layer, initialised uniformly in the range given by hidden_init.
        self.fc1 = torch.nn.Linear(state_size, 256)
        low, high = hidden_init(self.fc1)
        self.fc1.weight.data.uniform_(low, high)

        # Output layer, initialised with small weights in [-3e-3, 3e-3].
        self.fc2 = torch.nn.Linear(256, action_size)
        self.fc2.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the action for `state`; tanh keeps every component in [-1, 1]."""
        hidden = torch.nn.functional.leaky_relu(self.fc1(state))
        raw_action = self.fc2(hidden)
        return torch.tanh(raw_action)


class Critic(torch.nn.Module):
    """Critic (Value) Model: scores a (state, action) pair with a single Q-value."""

    def __init__(self, state_size, action_size, seed):
        """Build the four-layer Q-network.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """

        super().__init__()
        # Seed torch before any layer is created so the initial weights are reproducible.
        self.seed = torch.manual_seed(seed)

        # The state is encoded on its own first.
        self.fc1 = torch.nn.Linear(state_size, 256)
        low, high = hidden_init(self.fc1)
        self.fc1.weight.data.uniform_(low, high)

        # The action joins in at the second layer, hence the wider input.
        self.fc2 = torch.nn.Linear(256 + action_size, 256)
        low, high = hidden_init(self.fc2)
        self.fc2.weight.data.uniform_(low, high)

        self.fc3 = torch.nn.Linear(256, 128)
        low, high = hidden_init(self.fc3)
        self.fc3.weight.data.uniform_(low, high)

        # Output layer, initialised with small weights in [-3e-3, 3e-3].
        self.fc4 = torch.nn.Linear(128, 1)
        self.fc4.weight.data.uniform_(-3e-3, 3e-3)


    def forward(self, state, action):
        """Map a batch of (state, action) pairs to Q-values of shape (batch, 1)."""
        activation = torch.nn.functional.leaky_relu
        state_features = activation(self.fc1(state))
        # Concatenate along the feature axis; the action is cast to float first.
        joint = torch.cat((state_features, action.float()), dim=1)
        hidden = activation(self.fc2(joint))
        hidden = activation(self.fc3(hidden))
        return self.fc4(hidden)

In [None]:
# Take a tiny batch of 2 transitions to try the networks on real data.
states, actions, rewards, next_states, dones = replay_buffer.sample(2)

In [None]:
# Actor for a 24-dimensional state and a 4-dimensional action.
actor = Actor(24, 4, seed=43)

In [None]:
# Actions proposed by the untrained actor for the sampled states.
actor(states)

In [None]:
# Critic for the same state/action dimensions as the actor above.
critic = Critic(24, 4, seed=42)

In [None]:
# Q-values the untrained critic assigns to the sampled (state, action) pairs.
critic(states, actions)

#### Agent

In [None]:
class ACAgent():
    """Interacts with and learns from the environment.

    DDPG-style actor-critic agent: a deterministic actor proposes continuous
    actions, a critic scores (state, action) pairs, and slowly updated target
    copies of both networks provide the bootstrapped learning target.
    Exploration comes from Ornstein-Uhlenbeck noise added to the actions.
    """
    def __init__(self, state_size, action_size, tau=1e-3, weight_decay=0.0001, gamma=0.99, lr_actor=1e-4, lr_critic=3e-4, seed=42, device='cpu'):
        """Initialize an Agent object.
        Params:
            state_size (int): Dimension of state.
            action_size (int): Dimension of action.
            lr_actor (float): Learning rate for actor optimization.
            lr_critic (float): Learning rate for critic optimization.
            gamma (float): Reward discount,
            tau (float): For soft update of target network parameters.
            weight_decay (float): l2 loss during adam optimization.
            seed (int): Seed passed to the networks and the noise process.
            device: Device on which the networks live.
        """
        self.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = gamma
        self.tau = tau
        self.weight_decay = weight_decay
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        # Actor Network (local and target share the seed, so they start identical).
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (local and target share the seed, so they start identical).
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay)

        self.noise = OUNoise(action_size, seed)

    def __repr__(self):
        # Only attributes that __init__ actually sets may be referenced here.
        return f'ACAgent(state_size={self.state_size}, action_size={self.action_size}, device="{self.device}")'


    def act(self, state, add_noise=True):
        """Return actions for given state.
        Params:
            state (array_like): Current state.
            add_noise (bool): Add OU noise to actions.
        Returns:
            Numpy array of actions clipped to [-1, 1].
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Run the actor in eval mode and restore its previous mode afterwards.
        is_training = self.actor_local.training
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()[0]
        self.actor_local.train(is_training)

        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the exploration noise (this also decays its sigma)."""
        self.noise.reset()

    def train(self, replay_buffer, gamma=None, tau=None, batch_size=None):
        """Update value parameters using sampled batches from replay buffer.

        Does nothing (returns None) while the buffer is not ready to sample.

        Params:
            replay_buffer: Buffer with records from history.
            gamma (float): Discount factor; defaults to `self.gamma`.
            tau (float): For soft update of target network parameters;
                defaults to `self.tau`.
            batch_size (int): Size of the sampled batch; None uses the
                buffer's own default.
        """

        if tau is None:
            tau = self.tau

        if gamma is None:
            gamma = self.gamma

        if not replay_buffer.is_ready_to_sample():
            return None
        batch = replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = batch

        # The target is a fixed regression label: target networks are only ever
        # changed by soft_update, so no autograd graph is needed through them.
        with torch.no_grad():
            # Use target actor to predict next continuous action.
            next_actions = self.actor_target(next_states)
            # Compute Q targets for current states; (1 - dones) removes the
            # bootstrap term for terminal transitions.
            target = rewards + (gamma * self.critic_target(next_states, next_actions) * (1 - dones))
        prediction = self.critic_local(states, actions)
        # Compute critic loss.
        critic_loss = torch.nn.functional.mse_loss(prediction, target)
        # Minimize the loss.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss: the actor is pushed towards actions the critic rates highly.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)


    def soft_update(self, local_model, target_model, tau=None):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): For soft update of target network parameters
        """
        if tau is None:
            tau = self.tau

        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save(self, path):
        """Save the local actor and critic weights.

        The extension of `path` is stripped and the weights are written to
        `<prefix>_actor.pth` and `<prefix>_critic.pth`; the parent folder is
        created when it does not exist yet.
        """
        prefix, _ = os.path.splitext(path)
        folder = os.path.dirname(path)
        # A bare file name has no folder part and nothing needs to be created.
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(self.actor_local.state_dict(), prefix + '_actor.pth')
        torch.save(self.critic_local.state_dict(), prefix + '_critic.pth')

    def load(self, path):
        """Load weights written by `save` into both local and target networks."""
        prefix, _ = os.path.splitext(path)
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        actor_state = torch.load(prefix + '_actor.pth', map_location=self.device)
        critic_state = torch.load(prefix + '_critic.pth', map_location=self.device)
        self.actor_local.load_state_dict(actor_state)
        self.actor_target.load_state_dict(actor_state)
        self.critic_local.load_state_dict(critic_state)
        self.critic_target.load_state_dict(critic_state)

    def set_device(self, device):
        """Move all four networks to `device` and remember it."""
        self.device = device
        self.actor_local = self.actor_local.to(device)
        self.actor_target = self.actor_target.to(device)
        self.critic_local = self.critic_local.to(device)
        self.critic_target = self.critic_target.to(device)


In [None]:
# Agent for a 24-dimensional state and a 4-dimensional action, all other settings at their defaults.
agent = ACAgent(24, 4)

In [None]:
# Ask the untrained agent for an action on a random 24-dimensional state (noise is added by default).
agent.act(np.random.randn(24))

#### Actor Critic Algorithm

In [None]:
#######################
# Setup of parameters #
#######################

episodes = 2000                           # Number of episodes played.
steps_per_episodes = 700                  # Maximal amount of steps in one episode.
batch_size = 128                          # Size of batches sampled during training from replay buffer.
gamma = 0.99                              # Reward discounting.
stop_reward = 300                         # Average reward over the last (up to) 100 episodes at which training stops.
rewards_window = deque(maxlen=100)        # Buffer for 100 consecutive run rewards.        
rewards = []                              # Log of all episode rewards.
best_reward = -1000                       # Best single-episode reward seen so far.
model_name = 'bipedal'                    # Identifier of model saved params.
game = "BipedalWalker-v3"
seed = 42                                 # Seed passed to the agent and the replay buffer.


env = gym.make(game)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
agent = ACAgent(state_size=24, action_size=4, gamma=gamma, lr_actor=1e-4, lr_critic=3e-4, seed=seed, device=device)
replay_buffer = ACReplayBuffer(buffer_size=int(1e5), batch_size=batch_size, seed=seed, device=device)


###################################
# Actor-Critic learning algorithm #
###################################

for episode_id in range(episodes):
    episode_reward = 0
    # At the start of episode, we restart the environment.
    state = env.reset()
    # Here starts episode.
    for t_step in range(steps_per_episodes):
        # Agent selects action (with exploration noise).
        action = agent.act(state, add_noise=True)
        next_state, reward, done, info = env.step(action)
        # Save experience into replay buffer.
        replay_buffer.add(state, action, reward, next_state, done)
        # One training update per environment step (a no-op until the buffer is ready to sample).
        agent.train(replay_buffer, gamma=gamma, batch_size=batch_size)          
        episode_reward += reward
        if done:
            break
        state = next_state
    ###########################################################################
    # Experiment with lowering the noise level after a few hundred episodes:  #
    # agent.reset() resets the noise process and decays its sigma.            #
    ###########################################################################
    #agent.reset()


    # Reporting.
    rewards_window.append(episode_reward)
    rewards.append(episode_reward)
    print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}\t Current Score: {np.round(episode_reward, 1)}', end="")
    # Checkpoint whenever an episode matches or beats the best reward so far.
    if best_reward <= episode_reward:
      best_reward = episode_reward
      agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-best-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
    # Periodic checkpoint and a persistent progress line every 100 episodes.
    if episode_id % 100 == 0:
        print(f'\rEpisode {episode_id}\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
    # Stop once the windowed average reaches the target reward.
    if np.mean(rewards_window)>=stop_reward:
        print(f'\nSolved! Took {episode_id-100} episodes\tAverage Score: {np.round(np.mean(rewards_window), 1)}')
        agent.save(path=os.path.join(rl_workshop_path, f'models/{model_name}-{episode_id}-{np.round(np.mean(rewards_window), 1)}-{np.round(episode_reward, 1)}.pth'))
        break
env.close()

In [None]:
# Close the training environment explicitly, e.g. after the training cell was interrupted before its own env.close().
env.close()

In [None]:
# Learning curve: total reward collected in each training episode.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(np.arange(len(rewards)), rewards)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
plt.show()

In [None]:
# List the saved checkpoints to pick a model file for the evaluation below.
!ls '/content/drive/My Drive/ml_college_data/rl_workshop/models/'

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Fresh agent with the same state/action sizes as in training; its weights are loaded in the next cell.
agent = ACAgent(state_size=24, action_size=4, gamma=0.99, lr_actor=0.0001, lr_critic=0.0001, seed=42, device=device)

In [None]:
# Load a saved BipedalWalker agent, play one episode with it and display the recording.
model_path = os.path.join(rl_workshop_path, 'models/bipedal-best--80.5-2.2.pth')
video_path = '/video/bipedal/'
game = "BipedalWalker-v3"

agent.load(path=model_path)
env = wrap_env(gym.make(game), video_path)
state = env.reset()
while True:
    env.render('human')
    # Evaluate the learned policy itself: exploration noise is only useful
    # during training and would perturb the actions being shown here.
    action = agent.act(state, add_noise=False)
    state, reward, done, info = env.step(action)
    if done:
        break
env.close()
show_video(video_path)

 #### Questions and experiment suggestions
 - Try a different Gauss-Markov process for the randomization of actions.
 - Go hardcore: https://gym.openai.com/envs/BipedalWalkerHardcore-v2/
 - Go back to the REINFORCE algorithm and apply a critic at each step instead of the return R at the end of the episode.
 - Use a two-headed policy-value network.
