In [143]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
import gym
from tqdm import tqdm_notebook

class DeepQNet(nn.Module):
  def __init__(self, in_dim, hidden_dim, out_dim, batch_size, device):
    super(DeepQNet, self).__init__()
    self.embedding = nn.Embedding(in_dim, hidden_dim)
    self.model = nn.Sequential(
      nn.Linear(hidden_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, out_dim),
    )
    
  def forward(self, x):
    return self.model(self.embedding(x))

class TaxiDeepQ():
  def __init__(self, learning_rate=.001, gamma=.99, tau=.001, batch_size=128, max_steps=2500, memory_capacity=100000):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.max_steps=max_steps
    self.memory_position = 0
    self.memory_capacity = memory_capacity
    self.memory_buffer = []
    self.update_step = 8
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.action_network = self.spawn_network().to(self.device)
    self.target_network = self.spawn_network().to(self.device)
    self.optimizer = optim.Adam(self.action_network.parameters(), learning_rate)
    
  def push_memory(self, memory):
    '''memory = (observation, next_observation, action, reward, done)'''
    if len(self.memory_buffer) < self.memory_capacity:
      self.memory_buffer.append(None)
    self.memory_buffer[self.memory_position] = memory
    self.memory_position = (self.memory_position + 1) % self.memory_capacity
    
  def get_memory_batch(self):
    obs, next_obs, action, reward, done = zip(*random.sample(self.memory_buffer, k=self.batch_size))
    return {
      'observation':torch.tensor(np.vstack(obs), dtype=torch.long),
      'next_observation':torch.tensor(np.vstack(next_obs), dtype=torch.long),
      'action':torch.tensor(np.vstack(action), dtype=torch.long),
      'reward':torch.tensor(np.vstack(reward), dtype=torch.float32),
      'done':torch.tensor(np.vstack(done).astype(np.float32), dtype=torch.float32),
    }
    
  
  def spawn_network(self):
    return DeepQNet(in_dim=500, hidden_dim=128, out_dim=6, batch_size=self.batch_size, device=self.device)

  def try_network_update(self):
    if len(self.memory_buffer) > self.batch_size:
      obs, next_obs, action, reward, done = map(lambda tensor: tensor.to(self.device), self.get_memory_batch().values())
      q_targets_next = self.target_network(next_obs).detach().max(1)[0].unsqueeze(1)
      q_targets = reward + (self.gamma * q_targets_next * (1 - done))
      q_expected = self.action_network(obs)
      print(q_expected.shape)
      print(action.shape)
      q_expected = q_expected.gather(1, action)

      loss = F.mse_loss(q_expected, q_targets)
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

      self.update_target_network()  
  
  def update_target_network(self):
    for target_param, action_param in zip(self.target_network.parameters(), self.action_network.parameters()):
      target_param.data.copy_(self.tau*action_param.data + (1.0-self.tau)*target_param.data)
  
  def get_action(self, observation, eps):
    if random.random() > eps:
      with torch.no_grad(): 
        state_input = torch.tensor(observation, dtype=torch.long).to(self.device)
        return np.argmax(self.action_network(state_input).squeeze().cpu().numpy())
    else:
      return np.random.randint(0, 5)
      
  def spawn_env(self):
    return gym.make('Taxi-v2')
  
  def run_episodes(self, episodes=1, eps_range=(1, 0), seed=None, render=False):
    env = self.spawn_env()
    for e in tqdm_notebook(range(episodes), desc='Episodes'):
      lerp = e / max(1, (episodes-1))
      eps = (1 - lerp) * eps_range[0] + lerp * eps_range[1]
      self.run_episode(env, eps, seed, render)
    env.close()
    
  def run_episode(self, env, eps, seed=None, render=False):
    env.seed(seed)
    obs = env.reset()
    for step in range(self.max_steps):
      if render: env.render()
      action = self.get_action(obs, eps)
      new_obs, reward, done, _ = env.step(action)
      self.push_memory((obs, new_obs, action, reward, done))
      self.try_network_update()
      self.update_step += 1
      if done: break

In [144]:
agent = TaxiDeepQ(tau=.01, max_steps=500)

In [145]:
agent.run_episodes(250, eps_range=(1, .05))

HBox(children=(IntProgress(value=0, description='Episodes', max=250), HTML(value='')))

torch.Size([128, 1, 6])
torch.Size([128, 1])


RuntimeError: invalid argument 4: Index tensor must have same dimensions as input tensor at c:\programdata\miniconda3\conda-bld\pytorch_1524549877902\work\aten\src\thc\generic/THCTensorScatterGather.cu:16