In [0]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import  clear_output
%matplotlib inline

In [0]:
# solution for Google Colab rendering from here: https://star-ai.github.io/Rendering-OpenAi-Gym-in-Colaboratory/
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils
clear_output()

In [0]:

def show_video():
  """Embed the first recording found under ``video/`` in the cell output.

  Searches ``video/*.mp4``; when at least one file matches, the first match is
  base64-encoded and rendered as an inline, looping HTML ``<video>`` element.
  When nothing matches, prints ``Could not find video`` instead.

  Returns:
    None. The result is displayed (or printed), not returned.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager: the recording is never
  # modified here, and the handle is released as soon as the bytes are read.
  with io.open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that writes into ``./video``.

  `force=True` is forwarded to `Monitor` (presumably so an existing
  ``./video`` directory from an earlier run does not block recording;
  confirm against the gym version in use).
  """
  return Monitor(env, './video', force=True)

In [4]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from IPython.core.debugger import set_trace
import torch
import torch.nn as nn
from torch.optim import Adam
import time
from collections import defaultdict
from numpy import random
import copy
from tqdm import tqdm_notebook

from pyvirtualdisplay import Display
# Start an invisible (visible=0) 1400x900 virtual display via pyvirtualdisplay;
# this is the headless-rendering setup from the Colab recipe linked in the
# install cell.
display = Display(visible=0, size=(1400, 900))
display.start()

# Create the MountainCar environment and wrap it with `wrap_env`, which puts a
# gym `Monitor` writing to ./video around it.
env = gym.make("MountainCar-v0")
env = wrap_env(env)

# Show the observation/action spaces; the model sizes used later (2 inputs,
# 3 outputs) correspond to the values printed here.
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(2,)
Action space: Discrete(3)


# Deep Q-Learning

In [0]:
class MLP(nn.Module):
    def __init__(self, input_dim,
                     out_dim,
                     n_hidden_layers=3,
                     hidden_dim=64):
        super(MLP, self).__init__()

        self.layers = nn.Sequential( 
                      nn.Linear(input_dim, hidden_dim),
                      nn.ReLU()
                      )
        for i in range(n_hidden_layers):
            self.layers.add_module(f'linear_{i}',nn.Linear(hidden_dim, hidden_dim))
            self.layers.add_module(f'relu_{i}',nn.ReLU())

        self.layers.add_module(f'linear_out',nn.Linear(hidden_dim, out_dim))

    def forward(self,x):
        x = self.layers(x)
        return x

# Online network approximating the Q-function: state -> one Q-value per action.
model = MLP(2,3).cuda() # s->q(a)
# Target network that evaluates next states when bootstrap targets are built.
target_model = MLP(2,3).cuda() # s1->q(a)
# Start the target network from the online network's weights; otherwise the
# targets used before the first periodic sync come from an unrelated random
# initialisation.
target_model.load_state_dict(model.state_dict())
# Only the online network's parameters are handed to the optimizer.
optimizer = Adam(model.parameters(), lr=0.00003)

In [0]:
# inspired by https://github.com/transedward/pytorch-dqn/blob/master/dqn_learn.py
def prepare_batch(batch, device='cuda'):
    """Turn a list of replay transitions into batched tensors.

    Args:
        batch: non-empty sequence of ``(state, action, reward, next_state,
            done)`` transitions. States must be array-likes of equal shape.
        device: device the tensors are created on. Defaults to ``'cuda'``,
            matching where the models in this notebook are placed.

    Returns:
        Tuple ``(state, action, reward, next_state, done)`` where ``state`` and
        ``next_state`` are float tensors stacked along a new leading axis,
        ``reward`` is a float tensor, ``action`` is an int64 tensor (usable as
        a ``gather`` index) and ``done`` is a bool tensor (usable as a mask).

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("prepare_batch needs at least one transition")

    # Transpose the list of transitions into one tuple per field. This avoids
    # the structured-array round trip and the removed `np.bool` alias.
    states, actions, rewards, next_states, dones = zip(*batch)

    state = torch.tensor(np.stack(states, 0), dtype=torch.float, device=device)
    next_state = torch.tensor(np.stack(next_states, 0), dtype=torch.float, device=device)
    # Rewards pass through float32, as before.
    reward = torch.tensor(np.asarray(rewards, dtype=np.float32),
                          dtype=torch.float, device=device)
    # Explicit dtypes: `gather` needs int64 indices and masking needs bool,
    # regardless of the platform's default integer width.
    action = torch.tensor(np.asarray(actions), dtype=torch.long, device=device)
    done = torch.tensor(np.asarray(dones, dtype=bool), dtype=torch.bool, device=device)
    return state, action, reward, next_state, done

def get_action(state, epsilon, model, n_actions=3):
    """Choose an action epsilon-greedily.

    Args:
        state: current observation (array-like of floats).
        epsilon: probability of taking a uniformly random action.
        model: ``nn.Module`` mapping a batch of states to per-action Q-values.
            It is only used on the greedy branch and must then have at least
            one parameter, whose device is used for the input tensor.
        n_actions: number of discrete actions to sample from when exploring.

    Returns:
        The chosen action index as a Python ``int``.
    """
    if np.random.random() < epsilon:
        # numpy's randint excludes the upper bound, so pass the action count
        # itself; `randint(0, 2)` could only ever return 0 or 1.
        return int(np.random.randint(n_actions))

    # Put the input on whatever device the model lives on.
    device = next(model.parameters()).device
    # Action selection needs no gradients.
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float, device=device).unsqueeze(0)
        return int(model(state_tensor)[0].argmax().item())

In [0]:
# inspired by https://habr.com/ru/company/hsespb/blog/444428/
# `tqdm_notebook` is deprecated and warns "Please use `tqdm.notebook.tqdm`";
# import the replacement it names.
from tqdm.notebook import tqdm

device = 'cuda:0'
# `.env` takes the inner environment of what `gym.make` returns (presumably to
# drop gym's per-episode step limit during training; confirm).
env = gym.make("MountainCar-v0").env
target_update = 1000      # steps between target-network syncs / evaluations
batch_size = 200
max_steps = 150000
max_epsilon = 0.5
min_epsilon = 0.1
gamma = 0.99
HISTORY_LENGTH = 10000    # replay buffer capacity
# Cap for the greedy evaluation rollout. Without it the rollout only stops when
# `env.step` reports `done`, which a poor policy may never reach.
# NOTE(review): 200 is meant to match MountainCar-v0's usual episode limit; confirm.
MAX_EVAL_STEPS = 200
history = []
rewards_by_target_updates = []
state = env.reset()

for step in tqdm(range(1, max_steps + 1)):

    # Linear decay from max_epsilon towards min_epsilon over the whole run.
    epsilon = max_epsilon - (max_epsilon - min_epsilon) * step / max_steps

    action = get_action(state, epsilon, model)

    new_state, reward, done, _ = env.step(action)

    # Shaped reward: env reward plus a term based on the change in |state[1]|.
    modified_reward = reward + 300 * (gamma * abs(new_state[1]) - abs(state[1]))
    event = [state, action, modified_reward, new_state, done]

    # Fill the replay buffer; once full, overwrite slots in ring-buffer order.
    if len(history) < HISTORY_LENGTH:
        history.append(event)
    else:
        history[step % HISTORY_LENGTH] = event

    # Advance the state, starting a new episode after a terminal step.
    state = env.reset() if done else new_state

    # Optimization step
    if len(history) > batch_size:
        # Sample from the whole buffer, including the transition just stored.
        batch_indexes = np.random.choice(len(history), batch_size, replace=False)
        batch = [tuple(history[i]) for i in batch_indexes]
        (state_tensor,
         action_tensor,
         reward_tensor,
         next_state_tensor,
         done_tensor) = prepare_batch(batch)

        # Bootstrap targets are constants for the loss, so build them without
        # recording an autograd graph through the target network.
        with torch.no_grad():
            target_q = target_model(next_state_tensor).max(1)[0].view(-1)
            target_q[done_tensor] = 0  # no bootstrapping past terminal states
            target_q = reward_tensor + target_q * gamma

        q = model(state_tensor).gather(1, action_tensor.unsqueeze(1))

        loss = nn.MSELoss()(q, target_q.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if step % target_update == 0:
        target_model = copy.deepcopy(model)
        # Exploitation: one greedy (epsilon = 0) rollout with the fresh target
        # network, scored with the unshaped environment reward.
        state = env.reset()
        total_reward = 0
        for eval_step in range(MAX_EVAL_STEPS):
            action = get_action(state, 0, target_model)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break

        # Training resumes from a fresh episode after the evaluation rollout.
        state = env.reset()
        rewards_by_target_updates.append(total_reward)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=150000), HTML(value='')))